RSS

Python Email Crawler Class

07 Mar
Email Crawler

Email Crawler

import os
import re

class EmailCrawler:
    """Collect e-mail addresses from every file under a directory tree.

    Typical use is ``EmailCrawler().run()``, which prompts for a directory,
    scans each file below it with ``emailregex`` and writes the unique
    addresses, one per line, to ``emails.txt`` inside ``output_path``.

    The code runs unchanged on Python 2 and Python 3.
    """

    # Class-level default kept so ``EmailCrawler.email_list`` still resolves;
    # ``__init__`` gives each instance its own list so that two crawlers do
    # not accumulate into the same shared object.
    email_list = []

    # Raw string so the backslashes reach the regex engine untouched.
    # Local part: a word character followed by word characters, '.', '+' or
    # '-'.  Domain: one or more dot-separated labels (letters, digits, '_',
    # '-') ending in an alphabetic TLD of at least two letters.
    emailregex = re.compile(r'\w[\w.+-]*@[\w-]+(?:\.[\w-]+)*\.[a-zA-Z]{2,}')

    # Directory that receives ``emails.txt``.  Defaults to the working
    # directory at class-definition time; override per instance if needed.
    output_path = os.getcwd()

    def __init__(self):
        """Create a crawler with its own, empty result list."""
        self.email_list = []

    def setBaseURL(self, url):
        """Store ``url`` as the single entry of the ``tocrawl`` set."""
        self.tocrawl = {url}

    def run(self):
        """Prompt for a directory, validate it, then crawl it."""
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3
        target_dir = read_line('Enter the directory path for crawled files\n')
        self.verifyDir(target_dir)
        self.crawl(target_dir)

    def output(self):
        """Write every collected address to ``emails.txt`` and echo it.

        The file is created (or truncated) inside ``output_path``.
        """
        target = os.path.join(self.output_path, "emails.txt")
        with open(target, "w") as sink:
            for email in self.email_list:
                # Text mode already translates "\n" to the platform line
                # ending; writing os.linesep would yield "\r\r\n" on Windows.
                sink.write(str(email) + "\n")
                print(email)

    def verifyDir(self, path):
        """Terminate the program unless ``path`` is an existing directory.

        Raises:
            SystemExit: with status 1 when ``path`` is not a directory.
        """
        if not os.path.isdir(path):
            print("This directory does not exist")
            # The original bare ``exit`` was a no-op expression; actually
            # stop here so the crawl does not run on a bad path.
            raise SystemExit(1)

    def crawl(self, dir_path):
        """Scan every file below ``dir_path`` and record new addresses.

        Addresses are appended to ``email_list`` in first-seen order without
        duplicates, and ``output()`` is called once the walk is finished.
        Files that cannot be opened are reported and skipped.
        """
        print("Crawling Email links in " + dir_path + "....\n\n")

        # Use the path exactly as given when it is valid.  Only fall back to
        # the escape-recovering form (for paths written as non-raw literals
        # such as "C:\temp") when the given path is not a directory, because
        # that transformation also rewrites quotes and would corrupt a valid
        # path containing them.
        root = dir_path
        if not os.path.isdir(root):
            root = self.get_raw_string(dir_path)

        seen = set(self.email_list)  # O(1) duplicate checks
        for path, _subdirs, files in os.walk(root):
            for filename in files:
                file_path = os.path.join(path, filename)
                try:
                    # Read bytes and decode leniently so binary or oddly
                    # encoded files cannot abort the whole crawl.
                    with open(file_path, 'rb') as handle:
                        text = handle.read().decode('utf-8', 'ignore')
                except (IOError, OSError):
                    print("Skipping unreadable file " + file_path)
                    continue
                for email in self.emailregex.findall(text):
                    if email not in seen:
                        seen.add(email)
                        self.email_list.append(email)
        self.output()

    def get_raw_string(self, text):
        """Return ``text`` with control characters and quotes spelled out.

        Each BEL, backspace, form feed, newline, carriage return, tab and
        vertical tab character, and each single or double quote, is replaced
        by its two-character backslash escape; all other characters are
        copied unchanged.
        """
        escape_dict = {
            '\a': r'\a',
            '\b': r'\b',
            '\f': r'\f',
            '\n': r'\n',
            '\r': r'\r',
            '\t': r'\t',
            '\v': r'\v',
            '\'': r'\'',
            '\"': r'\"',
        }
        return ''.join(escape_dict.get(char, char) for char in text)

How to run this email crawler?
Save the class above as email_crawler.py, then load and run it in the Python shell (in IDLE: Run >> Run Module) and enter the following:

# Build a crawler and start the interactive crawl; run() prompts for the
# directory to scan.
from email_crawler import EmailCrawler

crawler = EmailCrawler()
crawler.run()
Advertisements
 
Leave a comment

Posted by on March 7, 2013 in Python

 

Tags:

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

 
%d bloggers like this: