Module google
[hide private]
[frames] | [no frames]

Source Code for Module google

  1  #!/usr/bin/env python 
  2   
  3  # Python bindings to the Google search engine 
  4  # Copyright (c) 2009-2013, Mario Vilas 
  5  # All rights reserved. 
  6  # 
  7  # Redistribution and use in source and binary forms, with or without 
  8  # modification, are permitted provided that the following conditions are met: 
  9  # 
 10  #     * Redistributions of source code must retain the above copyright notice, 
 11  #       this list of conditions and the following disclaimer. 
 12  #     * Redistributions in binary form must reproduce the above copyright 
 13  #       notice,this list of conditions and the following disclaimer in the 
 14  #       documentation and/or other materials provided with the distribution. 
 15  #     * Neither the name of the copyright holder nor the names of its 
 16  #       contributors may be used to endorse or promote products derived from 
 17  #       this software without specific prior written permission. 
 18  # 
 19  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 20  # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
 21  # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 22  # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
 23  # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
 24  # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
 25  # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
 26  # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
 27  # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
 28  # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
 29  # POSSIBILITY OF SUCH DAMAGE. 
 30   
 31  __all__ = ['search'] 
 32   
 33  import os 
 34  import sys 
 35  import time 
 36   
 37  if sys.version_info[0] > 2: 
 38      from http.cookiejar import LWPCookieJar 
 39      from urllib.request import Request, urlopen 
 40      from urllib.parse import quote_plus, urlparse, parse_qs 
 41  else: 
 42      from cookielib import LWPCookieJar 
 43      from urllib import quote_plus 
 44      from urllib2 import Request, urlopen 
 45      from urlparse import urlparse, parse_qs 
 46   
 47  # Try to use BeautifulSoup 4 if available, fall back to 3 otherwise. 
 48  try: 
 49      from bs4 import BeautifulSoup 
 50  except ImportError: 
 51      from BeautifulSoup import BeautifulSoup 
 52   
# URL templates to make Google searches.
# Each template is filled via the %-operator with a mapping of variable
# names to values (search() passes vars(), so the placeholder names below
# must match the local variable names there: tld, lang, query, num, start).
url_home          = "http://www.google.%(tld)s/"
url_search        = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&btnG=Google+Search"
url_next_page     = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&start=%(start)d"
url_search_num    = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&btnG=Google+Search"
url_next_page_num = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"
 59   
 60  # Cookie jar. Stored at the user's home folder. 
 61  home_folder = os.getenv('HOME') 
 62  if not home_folder: 
 63      home_folder = os.getenv('USERHOME') 
 64      if not home_folder: 
 65          home_folder = '.'   # Use the current folder on error. 
 66  cookie_jar = LWPCookieJar(os.path.join(home_folder, '.google-cookie')) 
 67  try: 
 68      cookie_jar.load() 
 69  except Exception: 
 70      pass 
 71   
 72  # Request the given URL and return the response page, using the cookie jar. 
def get_page(url):
    """
    Request the given URL and return the response page, using the cookie jar.

    @type url: str
    @param url: URL to retrieve.

    @rtype: str
    @return: Web page retrieved for the given URL.

    @raise IOError: An exception is raised on error.
    @raise urllib2.URLError: An exception is raised on error.
    @raise urllib2.HTTPError: An exception is raised on error.
    """
    request = Request(url)
    # Spoof a browser user agent; Google serves different (scrape-hostile)
    # markup to unknown clients.
    request.add_header('User-Agent',
                       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)')
    cookie_jar.add_cookie_header(request)
    response = urlopen(request)
    try:
        # Fixed: close the response even if extract_cookies()/read() raises;
        # previously the handle leaked on error.
        cookie_jar.extract_cookies(response, request)
        html = response.read()
    finally:
        response.close()
    cookie_jar.save()
    return html
97 98 # Filter links found in the Google result pages HTML code. 99 # Returns None if the link doesn't yield a valid result.
def filter_result(link):
    """
    Filter a raw anchor href found on a Google results page.

    @type link: str
    @param link: Value of an anchor's href attribute.

    @rtype: str or None
    @return: The target URL when the link is a usable result outside any
        Google domain; C{None} for internal/navigational links or links
        that cannot be parsed.
    """
    try:
        parsed = urlparse(link, 'http')

        # An absolute URL outside every Google domain (images.google.com,
        # googleusercontent.com, ...) is already a valid result.
        if parsed.netloc and 'google' not in parsed.netloc:
            return link

        # Some results hide the real target behind a "/url?q=..." redirect;
        # decode it and apply the same domain filter.
        if link.startswith('/url?'):
            hidden = parse_qs(parsed.query)['q'][0]
            parsed = urlparse(hidden, 'http')
            if parsed.netloc and 'google' not in parsed.netloc:
                return hidden
    except Exception:
        # Malformed or incomplete links are simply discarded.
        pass
    return None
123 124 # Returns a generator that yields URLs.
def search(query, tld='com', lang='en', num=10, start=0, stop=None, pause=2.0):
    """
    Search the given query string using Google.

    @type query: str
    @param query: Query string. Must NOT be url-encoded.

    @type tld: str
    @param tld: Top level domain.

    @type lang: str
    @param lang: Language.

    @type num: int
    @param num: Number of results per page.

    @type start: int
    @param start: First result to retrieve.

    @type stop: int
    @param stop: Last result to retrieve.
        Use C{None} to keep searching forever.

    @type pause: float
    @param pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!

    @rtype: generator
    @return: Generator (iterator) that yields found URLs. If the C{stop}
        parameter is C{None} the iterator will loop forever.
    """
    # NOTE: the url_* templates are filled with vars(), so the local variable
    # names here (query, tld, lang, num, start, ...) must match the %(name)s
    # placeholders in those templates. Do not rename these locals.

    # Set of hashes for the results found.
    # This is used to avoid repeated results.
    hashes = set()

    # Prepare the search string.
    query = quote_plus(query)

    # Grab the cookie from the home page.
    get_page(url_home % vars())

    # Prepare the URL of the first request.
    if start:
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
    else:
        if num == 10:
            url = url_search % vars()
        else:
            url = url_search_num % vars()

    # Loop until we reach the maximum result, if any (otherwise, loop forever).
    # NOTE(review): a falsy stop (0) also means "forever" here, which matches
    # the command line default of --stop=0 below.
    while not stop or start < stop:

        # Sleep between requests.
        time.sleep(pause)

        # Request the Google Search results page.
        html = get_page(url)

        # Parse the response and process every anchored URL.
        # NOTE(review): assumes Google's markup has a container with
        # id="search" holding the results and id="nav" for paging --
        # an AttributeError here means the page layout changed.
        soup = BeautifulSoup(html)
        anchors = soup.find(id='search').findAll('a')
        for a in anchors:

            # Get the URL from the anchor tag.
            try:
                link = a['href']
            except KeyError:
                continue

            # Filter invalid links and links pointing to Google itself.
            link = filter_result(link)
            if not link:
                continue

            # Discard repeated results.
            h = hash(link)
            if h in hashes:
                continue
            hashes.add(h)

            # Yield the result.
            yield link

        # End if there are no more results.
        if not soup.find(id='nav'):
            break

        # Prepare the URL for the next request.
        start += num
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
# When run as a script...
if __name__ == "__main__":

    # optparse is only needed for the command line front end, so it is
    # imported here rather than at module level.
    from optparse import OptionParser, IndentedHelpFormatter
class BannerHelpFormatter(IndentedHelpFormatter):
    """Help formatter that prints a banner above the usage message.

    Just a small tweak to optparse's standard formatter.
    """

    def __init__(self, banner, *args, **kwargs):
        # Remember the banner text; everything else is handled by optparse.
        IndentedHelpFormatter.__init__(self, *args, **kwargs)
        self.banner = banner

    def format_usage(self, usage):
        # Prepend the banner to the usage text optparse would produce.
        formatted = IndentedHelpFormatter.format_usage(self, usage)
        return '%s\n%s' % (self.banner, formatted)
    # Parse the command line arguments.
    formatter = BannerHelpFormatter(
        "Python script to use the Google search engine\n"
        "By Mario Vilas (mvilas at gmail dot com)\n"
        "https://github.com/MarioVilas/google\n"
    )
    parser = OptionParser(formatter=formatter)
    parser.set_usage("%prog [options] query")
    parser.add_option("--tld", metavar="TLD", type="string", default="com",
                      help="top level domain to use [default: com]")
    parser.add_option("--lang", metavar="LANGUAGE", type="string", default="en",
                      help="produce results in the given language [default: en]")
    parser.add_option("--num", metavar="NUMBER", type="int", default=10,
                      help="number of results per page [default: 10]")
    parser.add_option("--start", metavar="NUMBER", type="int", default=0,
                      help="first result to retrieve [default: 0]")
    # NOTE: --stop defaults to 0, which search() treats as "no limit".
    parser.add_option("--stop", metavar="NUMBER", type="int", default=0,
                      help="last result to retrieve [default: unlimited]")
    parser.add_option("--pause", metavar="SECONDS", type="float", default=2.0,
                      help="pause between HTTP requests [default: 2.0]")
    (options, args) = parser.parse_args()

    # All positional arguments form the query; bail out with the help
    # screen when none were given.
    query = ' '.join(args)
    if not query:
        parser.print_help()
        sys.exit(2)

    # Turn the parsed options into keyword arguments for search().
    # The option names were chosen to match search()'s parameter names.
    params = [(k,v) for (k,v) in options.__dict__.items() if not k.startswith('_')]
    params = dict(params)

    # Run the query.
    for url in search(query, **params):
        print(url)