4 """
5 Web Data Extractor, extract emails by sitecrawl
6 Copyright (C) 2011 KATHURIA Pulkit
7 Contact: pulkit@jaist.ac.jp
8
9 Contributors:
10 Open Source Sitemap Generator sitemap_gen by Vladimir Toncar
11 http://toncar.cz/opensource/sitemap_gen.html
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 3 of the License, or
16 (at your option) any later version.
17
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with this program. If not, see <http://www.gnu.org/licenses/>.
25 """
import re
import argparse
import urllib2
import urlparse
import robotparser
from urllib import urlopen
from collections import defaultdict
from HTMLParser import HTMLParser, HTMLParseError

def getPage(url):
    # Fetch a URL; return (page, last-modified date, final URL after redirects),
    # or (None, (0, 0, 0), "") on failure.
    try:
        f = urllib2.urlopen(url)
        page = f.read()
        date = f.info().getdate('Last-Modified')
        if date is None:
            date = (0, 0, 0)
        else:
            date = date[:3]
        f.close()
        return (page, date, f.url)
    except urllib2.URLError:
        return (None, (0, 0, 0), "")

def joinUrls(baseUrl, newUrl):
    # Resolve newUrl against baseUrl and drop any #fragment.
    helpUrl, fragment = urlparse.urldefrag(newUrl)
    return urlparse.urljoin(baseUrl, helpUrl)

def getRobotParser(startUrl):
    # Fetch and parse the site's robots.txt; return None if it is unavailable.
    rp = robotparser.RobotFileParser()
    robotUrl = urlparse.urljoin(startUrl, "/robots.txt")
    page, date, url = getPage(robotUrl)
    if page is None:
        return None
    rp.parse(page.splitlines())
    return rp

class MyHTMLParser(HTMLParser):
    # Collects same-server links from <a href> tags into pageMap.

    def __init__(self, pageMap, redirects, baseUrl, maxUrls, blockExtensions, robotParser):
        HTMLParser.__init__(self)
        self.pageMap = pageMap
        self.redirects = redirects
        self.baseUrl = baseUrl
        self.server = urlparse.urlsplit(baseUrl)[1]
        self.maxUrls = maxUrls
        self.blockExtensions = blockExtensions
        self.robotParser = robotParser

    def hasBlockedExtension(self, url):
        # Return 1 if the URL path ends with one of the blocked extensions.
        p = urlparse.urlparse(url)
        path = p[2].upper()
        for i in self.blockExtensions:
            if path.endswith(i):
                return 1
        return 0

    def handle_starttag(self, tag, attrs):
        if len(self.pageMap) >= self.maxUrls:
            return
        if tag.upper() == "BASE":
            if attrs and attrs[0][0].upper() == "HREF":
                self.baseUrl = joinUrls(self.baseUrl, attrs[0][1])
        if tag.upper() == "A":
            url = ""
            for attr in attrs:
                if (attr[0].upper() == "REL") and (attr[1].upper().find('NOFOLLOW') != -1):
                    return
                elif (attr[0].upper() == "HREF") and (attr[1].upper().find('MAILTO:') == -1):
                    url = joinUrls(self.baseUrl, attr[1])
            if url == "":
                return
            # Keep only same-server URLs that are not blocked by extension,
            # already-seen redirects, or robots.txt rules.
            if urlparse.urlsplit(url)[1] != self.server:
                return
            if self.hasBlockedExtension(url) or url in self.redirects:
                return
            if (self.robotParser is not None) and not self.robotParser.can_fetch("*", url):
                return
            if url not in self.pageMap:
                self.pageMap[url] = ()

def getUrlToProcess(pageMap):
    # Return a URL that has not been fetched yet (value still ()), or None.
    for i in pageMap.keys():
        if pageMap[i] == ():
            return i
    return None

def parsePages(startUrl, maxUrls, blockExtensions):
    # Crawl from startUrl, mapping each visited URL to its Last-Modified
    # date and following redirects, until maxUrls pages have been seen.
    pageMap = {}
    pageMap[startUrl] = ()
    redirects = []
    robotParser = getRobotParser(startUrl)
    while True:
        url = getUrlToProcess(pageMap)
        if url is None:
            break
        print " ", url
        page, date, newUrl = getPage(url)
        if page is None:
            del pageMap[url]
        elif url != newUrl:
            # The request was redirected: queue the target, remember the source.
            print newUrl
            del pageMap[url]
            pageMap[newUrl] = ()
            redirects.append(url)
        else:
            pageMap[url] = date
            parser = MyHTMLParser(pageMap, redirects, url, maxUrls, blockExtensions, robotParser)
            try:
                parser.feed(page)
                parser.close()
            except (HTMLParseError, UnicodeDecodeError):
                pass
    return pageMap
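# For illustration (hypothetical values): parsePages('http://example.com/', 10, ['.PNG'])
# could return {'http://example.com/': (2011, 5, 3)}, mapping each crawled URL to
# the (year, month, day) of its Last-Modified header, or (0, 0, 0) when absent.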

def grab_email(text):
    # Scan lines of text for email addresses; return them deduplicated.
    found = []
    mailsrch = re.compile(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}')
    for line in text:
        found.extend(mailsrch.findall(line))
    u = {}
    for item in found:
        u[item] = 1
    return u.keys()
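# For illustration (hypothetical input, not from the source):
#   grab_email(['<a href="mailto:john.doe@example.com">mail</a>'])
# returns ['john.doe@example.com'], since the pattern picks addresses
# out of surrounding markup or text.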

def urltext(url):
    # Return the page source as a list of lines for grab_email().
    viewsource = urlopen(url).readlines()
    return viewsource

def crawl_site(startUrl, maxUrls):
    # Crawl the site and return the list of URLs found.
    # NOTE: this definition was missing from the source; the blocked-extension
    # list below is an assumption (uppercase to match hasBlockedExtension).
    blockExtensions = ['.JPG', '.JPEG', '.GIF', '.PNG', '.PDF', '.ZIP']
    return parsePages(startUrl, maxUrls, blockExtensions).keys()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Web Email Extractor')
    parser.add_argument('-l', '--limit', action="store", default=100, dest="limit", type=int, help='-l numUrlsToCrawl')
    parser.add_argument('-u', '--url', action="store", dest="url", help='-u http://sitename.com')
    myarguments = parser.parse_args()
    emails = defaultdict(int)
    # Print each address the first time it is seen, and count every occurrence.
    for url in crawl_site(myarguments.url, myarguments.limit):
        for email in grab_email(urltext(url)):
            if email not in emails:
                print email
            emails[email] += 1