Module google
[hide private]
[frames] | [no frames]

Source Code for Module google

  1  #!/usr/bin/env python 
  2   
  3  # Python bindings to the Google search engine 
  4  # Copyright (c) 2009-2013, Mario Vilas 
  5  # All rights reserved. 
  6  # 
  7  # Redistribution and use in source and binary forms, with or without 
  8  # modification, are permitted provided that the following conditions are met: 
  9  # 
 10  #     * Redistributions of source code must retain the above copyright notice, 
 11  #       this list of conditions and the following disclaimer. 
 12  #     * Redistributions in binary form must reproduce the above copyright 
 13  #       notice,this list of conditions and the following disclaimer in the 
 14  #       documentation and/or other materials provided with the distribution. 
 15  #     * Neither the name of the copyright holder nor the names of its 
 16  #       contributors may be used to endorse or promote products derived from 
 17  #       this software without specific prior written permission. 
 18  # 
 19  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 20  # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
 21  # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 22  # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
 23  # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
 24  # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
 25  # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
 26  # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
 27  # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
 28  # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
 29  # POSSIBILITY OF SUCH DAMAGE. 
 30   
 31  __all__ = ['search'] 
 32   
 33  import os 
 34  import sys 
 35  import time 
 36   
 37  if sys.version_info[0] > 2: 
 38      from http.cookiejar import LWPCookieJar 
 39      from urllib.request import Request, urlopen 
 40      from urllib.parse import quote_plus, urlparse, parse_qs 
 41  else: 
 42      from cookielib import LWPCookieJar 
 43      from urllib import quote_plus 
 44      from urllib2 import Request, urlopen 
 45      from urlparse import urlparse, parse_qs 
 46   
 47  # Try to use BeautifulSoup 4 if available, fall back to 3 otherwise. 
 48  try: 
 49      from bs4 import BeautifulSoup 
 50  except ImportError: 
 51      from BeautifulSoup import BeautifulSoup 
 52   
# URL templates to make Google searches.
# Each template is filled via the %-operator with a mapping of variable
# names to values (search() passes vars(), so the placeholder names below
# must match the local variable names there: tld, lang, query, num, start).
url_home          = "http://www.google.%(tld)s/"
url_search        = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&btnG=Google+Search"
url_next_page     = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&start=%(start)d"
url_search_num    = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&btnG=Google+Search"
url_next_page_num = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"
 59   
 60  # Cookie jar. Stored at the user's home folder. 
 61  home_folder = os.getenv('HOME') 
 62  if not home_folder: 
 63      home_folder = os.getenv('USERHOME') 
 64      if not home_folder: 
 65          home_folder = '.'   # Use the current folder on error. 
 66  cookie_jar = LWPCookieJar(os.path.join(home_folder, '.google-cookie')) 
 67  try: 
 68      cookie_jar.load() 
 69  except Exception: 
 70      pass 
 71   
 72  # Request the given URL and return the response page, using the cookie jar. 
def get_page(url):
    """
    Request the given URL and return the response page, using the cookie jar.

    @type url: str
    @param url: URL to retrieve.

    @rtype: str
    @return: Web page retrieved for the given URL.

    @raise IOError: An exception is raised on error.
    @raise urllib2.URLError: An exception is raised on error.
    @raise urllib2.HTTPError: An exception is raised on error.
    """
    request = Request(url)
    # Spoof a browser user agent; Google serves different (scrape-hostile)
    # markup to unknown clients.
    request.add_header('User-Agent',
                       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)')
    cookie_jar.add_cookie_header(request)
    response = urlopen(request)
    try:
        # Fixed: close the response even if extract_cookies()/read() raises;
        # previously the handle leaked on error.
        cookie_jar.extract_cookies(response, request)
        html = response.read()
    finally:
        response.close()
    cookie_jar.save()
    return html
97 98 # Filter links found in the Google result pages HTML code. 99 # Returns None if the link doesn't yield a valid result.
def filter_result(link):
    """
    Filter a raw anchor href found on a Google results page.

    @type link: str
    @param link: Value of an anchor's href attribute.

    @rtype: str or None
    @return: The target URL when the link is a usable result outside any
        Google domain; C{None} for internal/navigational links or links
        that cannot be parsed.
    """
    try:
        parsed = urlparse(link, 'http')

        # An absolute URL outside every Google domain (images.google.com,
        # googleusercontent.com, ...) is already a valid result.
        if parsed.netloc and 'google' not in parsed.netloc:
            return link

        # Some results hide the real target behind a "/url?q=..." redirect;
        # decode it and apply the same domain filter.
        if link.startswith('/url?'):
            hidden = parse_qs(parsed.query)['q'][0]
            parsed = urlparse(hidden, 'http')
            if parsed.netloc and 'google' not in parsed.netloc:
                return hidden
    except Exception:
        # Malformed or incomplete links are simply discarded.
        pass
    return None
123 124 # Returns a generator that yields URLs.
def search(query, tld='com', lang='en', num=10, start=0, stop=None, pause=2.0):
    """
    Search the given query string using Google.

    @type query: str
    @param query: Query string. Must NOT be url-encoded.

    @type tld: str
    @param tld: Top level domain.

    @type lang: str
    @param lang: Language.

    @type num: int
    @param num: Number of results per page.

    @type start: int
    @param start: First result to retrieve.

    @type stop: int
    @param stop: Last result to retrieve.
        Use C{None} to keep searching forever.

    @type pause: float
    @param pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!

    @rtype: generator
    @return: Generator (iterator) that yields found URLs. If the C{stop}
        parameter is C{None} the iterator will loop forever.
    """
    # NOTE: the url_* templates are filled with vars(), so the local variable
    # names here (query, tld, lang, num, start, ...) must match the %(name)s
    # placeholders in those templates. Do not rename these locals.

    # Set of hashes for the results found.
    # This is used to avoid repeated results.
    hashes = set()

    # Prepare the search string.
    query = quote_plus(query)

    # Grab the cookie from the home page.
    get_page(url_home % vars())

    # Prepare the URL of the first request.
    if start:
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
    else:
        if num == 10:
            url = url_search % vars()
        else:
            url = url_search_num % vars()

    # Loop until we reach the maximum result, if any (otherwise, loop forever).
    # NOTE(review): a falsy stop (0) also means "forever" here, which matches
    # the command line default of --stop=0 below.
    while not stop or start < stop:

        # Sleep between requests.
        time.sleep(pause)

        # Request the Google Search results page.
        html = get_page(url)

        # Parse the response and process every anchored URL.
        # NOTE(review): assumes Google's markup has a container with
        # id="search" holding the results and id="nav" for paging --
        # an AttributeError here means the page layout changed.
        soup = BeautifulSoup(html)
        anchors = soup.find(id='search').findAll('a')
        for a in anchors:

            # Get the URL from the anchor tag.
            try:
                link = a['href']
            except KeyError:
                continue

            # Filter invalid links and links pointing to Google itself.
            link = filter_result(link)
            if not link:
                continue

            # Discard repeated results.
            h = hash(link)
            if h in hashes:
                continue
            hashes.add(h)

            # Yield the result.
            yield link

        # End if there are no more results.
        if not soup.find(id='nav'):
            break

        # Prepare the URL for the next request.
        start += num
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
# When run as a script...
if __name__ == "__main__":

    # optparse is only needed for the command line front end, so it is
    # imported here rather than at module level.
    from optparse import OptionParser, IndentedHelpFormatter
class BannerHelpFormatter(IndentedHelpFormatter):
    """Help formatter that prints a banner above the usage message.

    Just a small tweak to optparse's standard formatter.
    """

    def __init__(self, banner, *args, **kwargs):
        # Remember the banner text; everything else is handled by optparse.
        IndentedHelpFormatter.__init__(self, *args, **kwargs)
        self.banner = banner

    def format_usage(self, usage):
        # Prepend the banner to the usage text optparse would produce.
        formatted = IndentedHelpFormatter.format_usage(self, usage)
        return '%s\n%s' % (self.banner, formatted)
    # Parse the command line arguments.
    formatter = BannerHelpFormatter(
        "Python script to use the Google search engine\n"
        "By Mario Vilas (mvilas at gmail dot com)\n"
        "https://github.com/MarioVilas/google\n"
    )
    parser = OptionParser(formatter=formatter)
    parser.set_usage("%prog [options] query")
    parser.add_option("--tld", metavar="TLD", type="string", default="com",
                      help="top level domain to use [default: com]")
    parser.add_option("--lang", metavar="LANGUAGE", type="string", default="en",
                      help="produce results in the given language [default: en]")
    parser.add_option("--num", metavar="NUMBER", type="int", default=10,
                      help="number of results per page [default: 10]")
    parser.add_option("--start", metavar="NUMBER", type="int", default=0,
                      help="first result to retrieve [default: 0]")
    # NOTE: --stop defaults to 0, which search() treats as "no limit".
    parser.add_option("--stop", metavar="NUMBER", type="int", default=0,
                      help="last result to retrieve [default: unlimited]")
    parser.add_option("--pause", metavar="SECONDS", type="float", default=2.0,
                      help="pause between HTTP requests [default: 2.0]")
    (options, args) = parser.parse_args()

    # All positional arguments form the query; bail out with the help
    # screen when none were given.
    query = ' '.join(args)
    if not query:
        parser.print_help()
        sys.exit(2)

    # Turn the parsed options into keyword arguments for search().
    # The option names were chosen to match search()'s parameter names.
    params = [(k,v) for (k,v) in options.__dict__.items() if not k.startswith('_')]
    params = dict(params)

    # Run the query.
    for url in search(query, **params):
        print(url)