__all__ = ['search']

import os
import sys
import time

if sys.version_info[0] > 2:
    from http.cookiejar import LWPCookieJar
    from urllib.request import Request, urlopen
    from urllib.parse import quote_plus, urlparse, parse_qs
else:
    from cookielib import LWPCookieJar
    from urllib import quote_plus
    from urllib2 import Request, urlopen
    from urlparse import urlparse, parse_qs


try:
    from bs4 import BeautifulSoup
except ImportError:
    from BeautifulSoup import BeautifulSoup

# URL templates to make Google searches.
url_home = "http://www.google.%(tld)s/"
url_search = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&btnG=Google+Search"
url_next_page = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&start=%(start)d"
url_search_num = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&btnG=Google+Search"
url_next_page_num = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"
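
# For illustration only: the %(name)s placeholders above are filled from a
# dict of local variables (the code below uses vars() for this). With
# tld='com', lang='en' and query='hello+world', the search template expands
# like so (values here are just an example):
#
#   url_search % {'tld': 'com', 'lang': 'en', 'query': 'hello+world'}
#   # -> 'http://www.google.com/search?hl=en&q=hello+world&btnG=Google+Search'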


# Cookie jar. Stored at the user's home folder.
home_folder = os.getenv('HOME')
if not home_folder:
    home_folder = os.getenv('USERHOME')
    if not home_folder:
        home_folder = '.'   # Use the current folder on error.
cookie_jar = LWPCookieJar(os.path.join(home_folder, '.google-cookie'))
try:
    cookie_jar.load()
except Exception:
    pass


def get_page(url):
    """
    Request the given URL and return the response page, using the cookie jar.

    @type url: str
    @param url: URL to retrieve.

    @rtype: str
    @return: Web page retrieved for the given URL.

    @raise IOError: An exception is raised on error.
    @raise urllib2.URLError: An exception is raised on error.
    @raise urllib2.HTTPError: An exception is raised on error.
    """
    request = Request(url)
    request.add_header('User-Agent',
                       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)')
    cookie_jar.add_cookie_header(request)
    response = urlopen(request)
    cookie_jar.extract_cookies(response, request)
    html = response.read()
    response.close()
    cookie_jar.save()
    return html
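
# A minimal usage sketch (kept in a comment so importing this module has no
# side effects): any URL can be fetched this way, and cookies are persisted
# to the .google-cookie file between calls.
#
#   html = get_page('http://www.google.com/')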


# Filter links found in the Google result pages HTML code.
# Returns None if the link doesn't yield a valid result.
def filter_result(link):
    try:

        # Valid results are absolute URLs not pointing to a Google domain
        # like images.google.com or googleusercontent.com
        o = urlparse(link, 'http')
        if o.netloc and 'google' not in o.netloc:
            return link

        # Decode hidden URLs.
        if link.startswith('/url?'):
            link = parse_qs(o.query)['q'][0]

            # Valid results are absolute URLs not pointing to a Google domain
            # like images.google.com or googleusercontent.com
            o = urlparse(link, 'http')
            if o.netloc and 'google' not in o.netloc:
                return link

    # Otherwise, or on error, return None.
    except Exception:
        pass
    return None
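
# A sketch of the three cases filter_result() distinguishes (the example
# URLs are hypothetical):
#
#   filter_result('http://example.com/page')           # -> kept as-is
#   filter_result('/url?q=http://example.com/&sa=U')   # -> 'http://example.com/'
#   filter_result('/search?q=more+results')            # -> None (Google-internal)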


def search(query, tld='com', lang='en', num=10, start=0, stop=None, pause=2.0):
    """
    Search the given query string using Google.

    @type query: str
    @param query: Query string. Must NOT be url-encoded.

    @type tld: str
    @param tld: Top level domain.

    @type lang: str
    @param lang: Language.

    @type num: int
    @param num: Number of results per page.

    @type start: int
    @param start: First result to retrieve.

    @type stop: int
    @param stop: Last result to retrieve.
        Use C{None} to keep searching forever.

    @type pause: float
    @param pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!

    @rtype: generator
    @return: Generator (iterator) that yields found URLs. If the C{stop}
        parameter is C{None} the iterator will loop forever.
    """

    # Set of hashes for the results found.
    # This is used to avoid repeated results.
    hashes = set()

    # Prepare the search string.
    query = quote_plus(query)

    # Grab the cookie from the home page.
    get_page(url_home % vars())

    # Prepare the URL of the first request.
    if start:
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
    else:
        if num == 10:
            url = url_search % vars()
        else:
            url = url_search_num % vars()

    # Loop until we reach the maximum result, if any (otherwise, loop forever).
    while not stop or start < stop:

        # Sleep between requests.
        time.sleep(pause)

        # Request the Google Search results page.
        html = get_page(url)

        # Parse the response and process every anchored URL.
        soup = BeautifulSoup(html)
        anchors = soup.find(id='search').findAll('a')
        for a in anchors:

            # Get the URL from the anchor tag.
            try:
                link = a['href']
            except KeyError:
                continue

            # Filter invalid links and links pointing to Google itself.
            link = filter_result(link)
            if not link:
                continue

            # Discard repeated results.
            h = hash(link)
            if h in hashes:
                continue
            hashes.add(h)

            # Yield the result.
            yield link

        # End if there are no more results.
        if not soup.find(id='nav'):
            break

        # Prepare the URL for the next request.
        start += num
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
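
# A minimal usage sketch, assuming this file is importable as `google`:
#
#   from google import search
#   for url in search('site:python.org tutorial', stop=20):
#       print(url)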


if __name__ == "__main__":

    # Parse the command line arguments.
    from optparse import OptionParser, IndentedHelpFormatter

    class BannerHelpFormatter(IndentedHelpFormatter):
        "Just a small tweak to optparse to be able to print a banner."
        def __init__(self, banner, *argv, **argd):
            self.banner = banner
            IndentedHelpFormatter.__init__(self, *argv, **argd)
        def format_usage(self, usage):
            msg = IndentedHelpFormatter.format_usage(self, usage)
            return '%s\n%s' % (self.banner, msg)

    formatter = BannerHelpFormatter(
        "Python script to use the Google search engine\n"
        "By Mario Vilas (mvilas at gmail dot com)\n"
        "https://github.com/MarioVilas/google\n"
    )
    parser = OptionParser(formatter=formatter)
    parser.set_usage("%prog [options] query")
    parser.add_option("--tld", metavar="TLD", type="string", default="com",
                      help="top level domain to use [default: com]")
    parser.add_option("--lang", metavar="LANGUAGE", type="string", default="en",
                      help="produce results in the given language [default: en]")
    parser.add_option("--num", metavar="NUMBER", type="int", default=10,
                      help="number of results per page [default: 10]")
    parser.add_option("--start", metavar="NUMBER", type="int", default=0,
                      help="first result to retrieve [default: 0]")
    parser.add_option("--stop", metavar="NUMBER", type="int", default=0,
                      help="last result to retrieve [default: unlimited]")
    parser.add_option("--pause", metavar="SECONDS", type="float", default=2.0,
                      help="pause between HTTP requests [default: 2.0]")
    (options, args) = parser.parse_args()
    query = ' '.join(args)
    if not query:
        parser.print_help()
        sys.exit(2)
    params = [(k, v) for (k, v) in options.__dict__.items()
              if not k.startswith('_')]
    params = dict(params)

    # Run the query.
    for url in search(query, **params):
        print(url)
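
# Example invocation (assuming this file is saved as google.py):
#
#   python google.py --lang en --num 10 --stop 30 "python web scraping"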