Module email_extractor

Source Code for Module email_extractor

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
    Web Data Extractor: extract e-mail addresses by crawling a site.
    Copyright (C) 2011 KATHURIA Pulkit
    Contact: pulkit@jaist.ac.jp

    Contributors:
        Open Source Sitemap Generator sitemap_gen by Vladimir Toncar
        http://toncar.cz/opensource/sitemap_gen.html

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""
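# Illustrative command-line usage (the script name and URL below are placeholders;
# the flags come from the argparse configuration at the bottom of this module):
#
#     python email_extractor.py -u http://example.com -l 100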
# Standard-library imports (Python 2 module names).
import re
import argparse
import urllib2
import urlparse
import robotparser
from urllib import urlopen
from collections import defaultdict
from HTMLParser import HTMLParser, HTMLParseError

def getPage(url):
    """Fetch url; return (page, last-modified date, final URL), or (None, (0, 0, 0), "") on failure."""
    try:
        f = urllib2.urlopen(url)
        page = f.read()
        # Reduce the Last-Modified header to a (year, month, day) tuple.
        date = f.info().getdate('Last-Modified')
        if date is None:
            date = (0, 0, 0)
        else:
            date = date[:3]
        f.close()
        return (page, date, f.url)
    except urllib2.URLError:
        return (None, (0, 0, 0), "")

def joinUrls(baseUrl, newUrl):
    helpUrl, fragment = urlparse.urldefrag(newUrl)
    return urlparse.urljoin(baseUrl, helpUrl)

def getRobotParser(startUrl):
    """Fetch and parse robots.txt for the start URL; return None if unavailable."""
    rp = robotparser.RobotFileParser()
    robotUrl = urlparse.urljoin(startUrl, "/robots.txt")
    page, date, url = getPage(robotUrl)
    if page is None:
        return None
    # RobotFileParser.parse() expects a sequence of lines.
    rp.parse(page.splitlines())
    return rp

class MyHTMLParser(HTMLParser):
    """Collect links to same-server pages from <a href> tags into pageMap."""

    def __init__(self, pageMap, redirects, baseUrl, maxUrls, blockExtensions, robotParser):
        HTMLParser.__init__(self)
        self.pageMap = pageMap
        self.redirects = redirects
        self.baseUrl = baseUrl
        self.server = urlparse.urlsplit(baseUrl)[1]  # netloc
        self.maxUrls = maxUrls
        self.blockExtensions = blockExtensions
        self.robotParser = robotParser

    def hasBlockedExtension(self, url):
        path = urlparse.urlparse(url)[2].upper()  # path component
        for extension in self.blockExtensions:
            if path.endswith(extension):
                return 1
        return 0

    def handle_starttag(self, tag, attrs):
        if len(self.pageMap) >= self.maxUrls:
            return
        if tag.upper() == "BASE":
            if attrs and attrs[0][0].upper() == "HREF":
                self.baseUrl = joinUrls(self.baseUrl, attrs[0][1])
        if tag.upper() == "A":
            url = ""
            for attr in attrs:
                if attr[0].upper() == "REL" and attr[1].upper().find('NOFOLLOW') != -1:
                    return
                elif attr[0].upper() == "HREF" and attr[1].upper().find('MAILTO:') == -1:
                    url = joinUrls(self.baseUrl, attr[1])
            if url == "":
                return
            # Stay on the same server; skip blocked extensions, known redirects,
            # and URLs disallowed by robots.txt.
            if urlparse.urlsplit(url)[1] != self.server:
                return
            if self.hasBlockedExtension(url) or self.redirects.count(url) > 0:
                return
            if self.robotParser is not None and not self.robotParser.can_fetch("*", url):
                return
            if url not in self.pageMap:
                self.pageMap[url] = ()

def getUrlToProcess(pageMap):
    """Return the next URL that has not been fetched yet, or None when done."""
    for i in pageMap.keys():
        if pageMap[i] == ():
            return i
    return None

def parsePages(startUrl, maxUrls, blockExtensions):
    """Crawl from startUrl up to maxUrls pages; return a dict of url -> last-modified date."""
    pageMap = {startUrl: ()}
    redirects = []
    robotParser = getRobotParser(startUrl)
    while True:
        url = getUrlToProcess(pageMap)
        if url is None:
            break
        print " ", url
        page, date, newUrl = getPage(url)
        if page is None:
            del pageMap[url]
        elif url != newUrl:
            # The request was redirected; crawl the new URL instead.
            print newUrl
            del pageMap[url]
            pageMap[newUrl] = ()
            redirects.append(url)
        else:
            pageMap[url] = date
            parser = MyHTMLParser(pageMap, redirects, url, maxUrls, blockExtensions, robotParser)
            try:
                parser.feed(page)
                parser.close()
            except (HTMLParseError, UnicodeDecodeError):
                pass
    return pageMap

def grab_email(text):
    """Return the unique e-mail addresses found in an iterable of text lines."""
    found = []
    mailsrch = re.compile(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}')
    for line in text:
        found.extend(mailsrch.findall(line))
    return list(set(found))
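# Illustrative example with hypothetical input:
#   grab_email(["Contact: info@example.com or sales@example.com"])
#   returns ['info@example.com', 'sales@example.com'] (in no particular order).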

def urltext(url):
    return urlopen(url).readlines()

def crawl_site(url, limit):
    # No file extensions are blocked (empty tuple).
    return parsePages(url, limit, ())

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Web Email Extractor')
    parser.add_argument('-l', '--limit', action="store", default=100, dest="limit", type=int,
                        help='-l numUrlsToCrawl')
    parser.add_argument('-u', '--url', action="store", dest="url", help='-u http://sitename.com')
    myarguments = parser.parse_args()
    emails = defaultdict(int)
    for url in crawl_site(myarguments.url, myarguments.limit):
        for email in grab_email(urltext(url)):
            # Print each address the first time it is seen, then count occurrences.
            if email not in emails:
                print email
            emails[email] += 1
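The same functions can also be called from another script. A minimal sketch, assuming the listing above is saved as an importable module named email_extractor and using a placeholder start URL and page limit:

from email_extractor import crawl_site, grab_email, urltext

start = "http://example.com"               # placeholder start URL
seen = set()
for page_url in crawl_site(start, 25):     # crawl at most 25 pages
    for address in grab_email(urltext(page_url)):
        if address not in seen:            # report each address once
            seen.add(address)
            print address                  # Python 2 print statement, matching the module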