Source code for crawley.crawlers
from eventlet.green import urllib2
from eventlet import GreenPool
from re import compile
from http.request import Request
from http.cookies import CookieHandler
from persistance import session
from extractors import XPathExtractor
from exceptions import AuthenticationError
from utils import url_matcher
class BaseCrawler(object):
    """
    User crawlers must inherit from this class, may override
    some methods, and define the start_urls list, the scrapers
    and the maximum crawling depth.
    """
    start_urls = []
    """ A list containing the start urls for the crawler """

    allowed_urls = []
    """ A list of url patterns the crawler is allowed to follow """

    scrapers = []
    """ A list of scraper classes """

    max_depth = -1
    """ The maximum crawling recursion level """

    extractor = None
    """ The extractor class. Default is XPathExtractor """

    login = None
    """ The login data. A tuple of (url, login_dict).
        Example: ("www.mypage.com/login", {'user': 'myuser', 'pass': 'mypassword'})
    """
    _url_regex = compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
    def __init__(self, storage=None):

        self.storage = storage

        if self.extractor is None:
            self.extractor = XPathExtractor

        self.extractor = self.extractor()
        self.cookie_handler = CookieHandler()
    def _get_response(self, url, data=None):
        """
        Returns the response object from a request.

        params:
            data: if this parameter is present the request is a POST.
        """

        request = Request(url, cookie_handler=self.cookie_handler)

        try:
            return request.get_response(data)
        except Exception:
            # Any network or HTTP error is swallowed; a failed request yields None.
            return None
    def _get_data(self, url, data=None):
        """
        Returns the response data from a request.

        params:
            data: if this parameter is present the request is a POST.
        """

        response = self._get_response(url, data)

        if response is None or response.getcode() != 200:
            return None

        return response.read()
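    # A scraper class, as consumed by _manage_scrapers below, is expected to
    # expose a `matching_urls` list of url patterns and a `scrape(html)` method.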
    def _manage_scrapers(self, url, data):
        """
        Checks whether some scraper is suited for data extraction on the
        current url. If so, gets the extractor object and delegates the
        scraping task to the scraper object.
        """

        for Scraper in self.scrapers:

            if any(url_matcher(url, pattern) for pattern in Scraper.matching_urls):

                html = self.extractor.get_object(data)
                Scraper().scrape(html)
                session.commit()
    def _save_urls(self, url, new_url):
        """
        Stores the crawled link (parent url and found href) in a [UrlEntity] object.
        """

        if self.storage is not None:
            self.storage(parent=url, href=new_url)
            session.commit()
    def _validate_url(self, url):
        """
        Validates that the url matches one of the patterns in the crawler's
        [allowed_urls] list. An empty list allows every url.
        """

        if not self.allowed_urls:
            return True

        return any(url_matcher(url, pattern) for pattern in self.allowed_urls)
    def _fetch(self, url, depth_level=0):
        """
        Recursive url fetching.

        Params:
            url: the url to crawl
            depth_level: the current recursion level
        """

        if not self._validate_url(url):
            return

        data = self._get_data(url)

        if data is None:
            return

        self._manage_scrapers(url, data)

        for new_url in self.get_urls(data):

            self._save_urls(url, new_url)

            if depth_level >= self.max_depth:
                return

            # Crawl the next level on a separate green thread.
            self.pool.spawn_n(self._fetch, new_url, depth_level + 1)
    def _login(self):
        """
        If the target pages are hidden behind a login, pass through it first.
        self.login can be None or a tuple containing (login_url, params_dict).
        """

        if self.login is None:
            return

        url, data = self.login

        # The login POST goes through self.cookie_handler, so any session cookie
        # it sets is reused by the subsequent requests.
        if self._get_response(url, data) is None:
            raise AuthenticationError("Can't login")
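    # start() schedules every start url on a GreenPool; pages are fetched
    # concurrently on cooperative green threads and waitall() blocks until
    # the whole crawl finishes.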
    def start(self):
        """
        Crawler's run method.
        """

        self._login()

        self.pool = GreenPool()

        for url in self.start_urls:
            self.pool.spawn_n(self._fetch, url, depth_level=0)

        self.pool.waitall()
    # Overridables

    def get_urls(self, html):
        """
        Returns a list of urls found in the current html page.
        """

        urls = []

        for url_match in self._url_regex.finditer(html):
            urls.append(url_match.group(0))

        return urls
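# --- Usage sketch (illustrative only, not part of the crawley module) --------
# A minimal, hypothetical example of the API defined above. `MyScraper`,
# `MyCrawler` and the url values are made-up names; the pattern syntax is
# whatever utils.url_matcher expects, and the scraper only implements the
# `matching_urls` / `scrape(html)` contract that _manage_scrapers relies on.
#
#   class MyScraper(object):
#
#       matching_urls = ["www.mypage.com/article"]
#
#       def scrape(self, html):
#           # `html` is the object returned by the crawler's extractor
#           # (an XPathExtractor document by default).
#           pass
#
#   class MyCrawler(BaseCrawler):
#
#       start_urls = ["http://www.mypage.com"]
#       scrapers = [MyScraper]
#       max_depth = 2
#
#   MyCrawler().start()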