Module email_extractor

Source Code for Module email_extractor

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
    Web Data Extractor: extract e-mail addresses by crawling a site.
    Copyright (C) 2011 KATHURIA Pulkit
    Contact: pulkit@jaist.ac.jp

    Contributors:
        Open Source Sitemap Generator sitemap_gen by Vladimir Toncar
        http://toncar.cz/opensource/sitemap_gen.html

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""
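# Illustrative command-line usage (the script name and URL below are placeholders;
# the flags come from the argparse configuration at the bottom of this module):
#
#     python email_extractor.py -u http://example.com -l 100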
# Standard-library imports (Python 2 module names).
import re
import argparse
import urllib2
import urlparse
import robotparser
from urllib import urlopen
from collections import defaultdict
from HTMLParser import HTMLParser, HTMLParseError

def getPage(url):
    """Fetch url; return (page, last-modified date, final URL), or (None, (0, 0, 0), "") on failure."""
    try:
        f = urllib2.urlopen(url)
        page = f.read()
        # Reduce the Last-Modified header to a (year, month, day) tuple.
        date = f.info().getdate('Last-Modified')
        if date is None:
            date = (0, 0, 0)
        else:
            date = date[:3]
        f.close()
        return (page, date, f.url)
    except urllib2.URLError:
        return (None, (0, 0, 0), "")

def joinUrls(baseUrl, newUrl):
    helpUrl, fragment = urlparse.urldefrag(newUrl)
    return urlparse.urljoin(baseUrl, helpUrl)

def getRobotParser(startUrl):
    """Fetch and parse robots.txt for the start URL; return None if unavailable."""
    rp = robotparser.RobotFileParser()
    robotUrl = urlparse.urljoin(startUrl, "/robots.txt")
    page, date, url = getPage(robotUrl)
    if page is None:
        return None
    # RobotFileParser.parse() expects a sequence of lines.
    rp.parse(page.splitlines())
    return rp

class MyHTMLParser(HTMLParser):
    """Collect links to same-server pages from <a href> tags into pageMap."""

    def __init__(self, pageMap, redirects, baseUrl, maxUrls, blockExtensions, robotParser):
        HTMLParser.__init__(self)
        self.pageMap = pageMap
        self.redirects = redirects
        self.baseUrl = baseUrl
        self.server = urlparse.urlsplit(baseUrl)[1]  # netloc
        self.maxUrls = maxUrls
        self.blockExtensions = blockExtensions
        self.robotParser = robotParser

    def hasBlockedExtension(self, url):
        path = urlparse.urlparse(url)[2].upper()  # path component
        for extension in self.blockExtensions:
            if path.endswith(extension):
                return 1
        return 0

    def handle_starttag(self, tag, attrs):
        if len(self.pageMap) >= self.maxUrls:
            return
        if tag.upper() == "BASE":
            if attrs and attrs[0][0].upper() == "HREF":
                self.baseUrl = joinUrls(self.baseUrl, attrs[0][1])
        if tag.upper() == "A":
            url = ""
            for attr in attrs:
                if attr[0].upper() == "REL" and attr[1].upper().find('NOFOLLOW') != -1:
                    return
                elif attr[0].upper() == "HREF" and attr[1].upper().find('MAILTO:') == -1:
                    url = joinUrls(self.baseUrl, attr[1])
            if url == "":
                return
            # Stay on the same server; skip blocked extensions, known redirects,
            # and URLs disallowed by robots.txt.
            if urlparse.urlsplit(url)[1] != self.server:
                return
            if self.hasBlockedExtension(url) or self.redirects.count(url) > 0:
                return
            if self.robotParser is not None and not self.robotParser.can_fetch("*", url):
                return
            if url not in self.pageMap:
                self.pageMap[url] = ()

def getUrlToProcess(pageMap):
    """Return the next URL that has not been fetched yet, or None when done."""
    for i in pageMap.keys():
        if pageMap[i] == ():
            return i
    return None

def parsePages(startUrl, maxUrls, blockExtensions):
    """Crawl from startUrl up to maxUrls pages; return a dict of url -> last-modified date."""
    pageMap = {startUrl: ()}
    redirects = []
    robotParser = getRobotParser(startUrl)
    while True:
        url = getUrlToProcess(pageMap)
        if url is None:
            break
        print " ", url
        page, date, newUrl = getPage(url)
        if page is None:
            del pageMap[url]
        elif url != newUrl:
            # The request was redirected; crawl the new URL instead.
            print newUrl
            del pageMap[url]
            pageMap[newUrl] = ()
            redirects.append(url)
        else:
            pageMap[url] = date
            parser = MyHTMLParser(pageMap, redirects, url, maxUrls, blockExtensions, robotParser)
            try:
                parser.feed(page)
                parser.close()
            except (HTMLParseError, UnicodeDecodeError):
                pass
    return pageMap

def grab_email(text):
    """Return the unique e-mail addresses found in an iterable of text lines."""
    found = []
    mailsrch = re.compile(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}')
    for line in text:
        found.extend(mailsrch.findall(line))
    return list(set(found))
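# Illustrative example with hypothetical input:
#   grab_email(["Contact: info@example.com or sales@example.com"])
#   returns ['info@example.com', 'sales@example.com'] (in no particular order).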

def urltext(url):
    return urlopen(url).readlines()

def crawl_site(url, limit):
    # No file extensions are blocked (empty tuple).
    return parsePages(url, limit, ())

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Web Email Extractor')
    parser.add_argument('-l', '--limit', action="store", default=100, dest="limit", type=int,
                        help='-l numUrlsToCrawl')
    parser.add_argument('-u', '--url', action="store", dest="url", help='-u http://sitename.com')
    myarguments = parser.parse_args()
    emails = defaultdict(int)
    for url in crawl_site(myarguments.url, myarguments.limit):
        for email in grab_email(urltext(url)):
            # Print each address the first time it is seen, then count occurrences.
            if email not in emails:
                print email
            emails[email] += 1
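The same functions can also be called from another script. A minimal sketch, assuming the listing above is saved as an importable module named email_extractor and using a placeholder start URL and page limit:

from email_extractor import crawl_site, grab_email, urltext

start = "http://example.com"               # placeholder start URL
seen = set()
for page_url in crawl_site(start, 25):     # crawl at most 25 pages
    for address in grab_email(urltext(page_url)):
        if address not in seen:            # report each address once
            seen.add(address)
            print address                  # Python 2 print statement, matching the module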