RSS

Category Archives: Python

Python Email Crawler Class

Email Crawler

Email Crawler

import os
import re

class EmailCrawler:
    """Collect e-mail addresses from every file below a directory.

    Typical use is ``EmailCrawler().run()``, which prompts for a directory,
    scans every file under it with ``emailregex`` and writes the unique
    matches to ``emails.txt`` in the current working directory.
    """

    # Class-level default kept for code that reads ``EmailCrawler.email_list``;
    # every instance gets its own list in __init__ so results are not shared.
    email_list = []
    emailregex = re.compile(r'\w+[@][a-zA-Z_\.]+\.[a-zA-Z]{2,6}')
    output_path = os.getcwd()

    def __init__(self):
        # Per-instance result list: a mutable class attribute would be
        # appended to by every crawler object in the process.
        self.email_list = []

    def setBaseURL(self, url):
        """Store ``url`` as the single entry of the ``tocrawl`` set."""
        self.tocrawl = set([url])

    def run(self):
        """Prompt for a directory and crawl it if it exists."""
        directory = raw_input('Enter the directory path for crawled files\n')
        # Stop here for a missing directory instead of crawling it anyway.
        if self.verifyDir(directory):
            self.crawl(directory)

    def output(self):
        """Write every collected address to ``emails.txt`` and print it."""
        with open("emails.txt", "w") as a:
            for email in self.email_list:
                # The file is opened in text mode, which already translates
                # "\n" to the platform line ending; os.linesep would be
                # translated a second time on Windows.
                a.write(str(email) + "\n")
                print(email)

    def verifyDir(self, path):
        """Return True if ``path`` exists; otherwise print a notice and
        return False."""
        if os.path.exists(path):
            return True
        print("This directory does not exist")
        return False

    def crawl(self, dir_path):
        """Scan every file under ``dir_path`` and record unique addresses.

        New addresses are appended to ``self.email_list`` in the order they
        are found; ``output()`` is called once the walk has finished.
        """
        print("Crawling Email links in " + dir_path + "....\n\n")
        # Set used for O(1) duplicate checks; the list keeps discovery order.
        seen = set(self.email_list)
        for path, subdirs, files in os.walk(self.get_raw_string(dir_path)):
            for filename in files:
                file_path = os.path.join(path, filename)
                # ``with`` closes the handle even when read() raises.
                with open(file_path, 'r') as handle:
                    html = handle.read()
                for email in self.emailregex.findall(html):
                    if email not in seen:
                        seen.add(email)
                        self.email_list.append(email)
        self.output()

    def get_raw_string(self, text):
        """Return ``text`` with control characters and quotes replaced by
        their backslash-escaped spelling (for example a tab becomes the two
        characters ``\\t``)."""
        escape_dict = {'\a': r'\a',
                       '\b': r'\b',
                       '\f': r'\f',
                       '\n': r'\n',
                       '\r': r'\r',
                       '\t': r'\t',
                       '\v': r'\v',
                       '\'': r'\'',
                       '\"': r'\"'}
        # Characters without an entry are copied through unchanged.
        return ''.join(escape_dict.get(char, char) for char in text)

How to run this email crawler?
Save the class above as email_crawler.py and load it in the Python shell (in IDLE: Run >> Run Module), then enter the following commands:

import email_crawler
e = email_crawler.EmailCrawler()
e.run()
Advertisements
 
Leave a comment

Posted by on March 7, 2013 in Python

 

Tags:

Python Web Crawler Class

Web Crawler

web crawler

import sys
import re
import urllib2
import urlparse
import datetime
import os

class WebCrawler:
    """Crawl pages reachable from a start URL and save their source.

    Each fetched page is written to a ``.txt`` file in the chosen directory,
    its URL is appended to ``hyperlinks.txt`` in the current working
    directory, and every ``<a href=...>`` target found by ``linkregex`` is
    queued for crawling.
    """

    # Class-level defaults kept for code that reads them from the class;
    # every instance gets its own sets in __init__ so state is not shared.
    tocrawl = set([])
    crawled = set([])
    linkregex = re.compile(r'<a\s*href=[\'"](.*?)[\'"].*?>')

    def __init__(self):
        self.tocrawl = set()
        self.crawled = set()

    def setBaseURL(self, url):
        """Reset the queue so that it contains only ``url``."""
        self.tocrawl = set([url])

    def run(self):
        """Prompt for a start URL and an output directory, then crawl."""
        url = raw_input('Enter an URL to crawl\n')
        self.setBaseURL(url)
        directory = raw_input('Where should I put crawled HTML source files?\n')
        self.crawl(directory)

    def getTitle(self, html):
        """Return the text between ``<title>`` and ``</title>``, or None
        when either tag is missing."""
        startPos = html.find('<title>')
        if startPos == -1:
            return None
        endPos = html.find('</title>', startPos + 7)
        if endPos == -1:
            return None
        return html[startPos + 7:endPos]

    def writeToFile(self, url):
        """Append ``url`` as one line to ``hyperlinks.txt``."""
        with open('hyperlinks.txt', 'a') as log:
            log.write(url + '\n')

    def writeHTML(self, fileName, html, dirPath):
        """Write ``html`` to ``<dirPath>/<sanitised fileName>.txt``.

        ``dirPath`` is created when missing. The characters
        ``\\ / : " * ? < > |`` are removed from ``fileName`` first.
        """
        self.verifyDir(dirPath)
        # Raw pattern so the backslash itself is part of the character class.
        fileName = re.sub(r'[\\/:"*?<>|]', "", fileName)
        # os.path.join picks the separator of the running platform.
        with open(os.path.join(dirPath, fileName + '.txt'), 'w') as target:
            target.write(html)

    def verifyDir(self, path):
        """Create ``path`` (including parents) if it does not exist."""
        if not os.path.exists(path):
            os.makedirs(path)

    def _resolve_link(self, base_url, link):
        """Return the absolute http(s) URL for ``link`` found on
        ``base_url``, or None when the link should not be crawled."""
        link = link.strip()
        if not link or link.startswith('mailto'):
            return None
        # urljoin keeps the page's scheme and resolves relative paths against
        # the page location; the fragment is dropped because it names a
        # position inside the same document.
        absolute = urlparse.urldefrag(urlparse.urljoin(base_url, link))[0]
        if not absolute.lower().startswith(('http://', 'https://')):
            return None
        return absolute

    def crawl(self, dir_path):
        """Fetch queued URLs until the queue is empty.

        Pages that cannot be fetched are reported and skipped.
        """
        while self.tocrawl:
            crawling = self.tocrawl.pop()
            if crawling in self.crawled:
                continue
            # Marked before fetching so a failing URL is not queued again.
            self.crawled.add(crawling)
            print('\n\nStart Crawling - ' + crawling + '\n')
            try:
                response = urllib2.urlopen(crawling)
                try:
                    msg = response.read()
                finally:
                    response.close()
            except Exception as err:
                # Best effort: report and move on. Unlike a bare ``except``
                # this lets Ctrl-C stop the crawl.
                print('Skipping ' + crawling + ' (' + str(err) + ')')
                continue
            self.writeHTML(crawling, msg, dir_path)

            # Display page title
            print(self.getTitle(msg))

            self.writeToFile(crawling)
            for link in self.linkregex.findall(msg):
                target = self._resolve_link(crawling, link)
                if target is not None and target not in self.crawled:
                    print('----' + target)
                    self.tocrawl.add(target)

How to run this crawler?

Save the class above as web_crawler.py, then run the following commands in the Python shell.

import web_crawler
w = web_crawler.WebCrawler()
w.run()
 
6 Comments

Posted by on March 7, 2013 in Python

 

Tags:

Crawl Site For Keywords

import re
import urllib2

# Page whose <meta name="keywords"> tag is read when run as a script.
KEYWORDS_URL = "http://fun-lanka.site50.net/"

# Captures the content attribute of <meta name="keywords" content="...">.
# Matches regardless of letter case and accepts both ">" and "/>" endings.
keywordregex = re.compile(
    r'<meta\s+name=["\']keywords["\']\s+content=["\'](.*?)["\']\s*/?>',
    re.IGNORECASE)


def extract_keywords(html):
    """Return the keywords of the first keywords meta tag in ``html``.

    The content attribute is split on commas and each entry is stripped of
    surrounding whitespace. An empty list is returned when no tag matches.
    """
    found = keywordregex.findall(html)
    if not found:
        return []
    return [word.strip() for word in found[0].split(",") if word.strip()]


def main():
    """Fetch KEYWORDS_URL and print its keyword list."""
    try:
        response = urllib2.urlopen(KEYWORDS_URL)
    except Exception as e:
        print(e.args)
        # Nothing was fetched, so there is no response to read.
        return
    try:
        msg = response.read()
    finally:
        response.close()
    print(extract_keywords(msg))


if __name__ == "__main__":
    main()





 
Leave a comment

Posted by on January 28, 2013 in Python

 

Tags: ,