Source code for crawley.crawlers

from eventlet.green import urllib2
from eventlet import GreenPool

from re import compile, match

from http.request import Request
from http.cookies import CookieHandler
from persistance import session
from extractors import XPathExtractor
from exceptions import AuthenticationError
from utils import url_matcher


class BaseCrawler(object):
    """
        User crawlers must inherit from this class. They may override some
        methods and should define the start_urls list, the scrapers and the
        max crawling depth.
    """

    start_urls = []
    """ A list containing the start urls for the crawler """

    allowed_urls = []
    """ A list of urls allowed to be crawled """

    scrapers = []
    """ A list of scraper classes """

    max_depth = -1
    """ The maximum crawling recursion level """

    extractor = None
    """ The extractor class. Default is XPathExtractor """

    login = None
    """ The login data. A tuple of (url, login_dict).
        Example: ("www.mypage.com/login", {'user': 'myuser', 'pass': 'mypassword'})
    """

    _url_regex = compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')

    def __init__(self, storage=None):

        self.storage = storage

        if self.extractor is None:
            self.extractor = XPathExtractor

        self.extractor = self.extractor()
        self.cookie_handler = CookieHandler()

    def _get_response(self, url, data=None):
        """ Returns the response object from a request.

            params:
                data: if this param is present the request is a POST.
        """

        request = Request(url, cookie_handler=self.cookie_handler)
        try:
            return request.get_response(data)
        except Exception:
            return None

    def _get_data(self, url, data=None):
        """ Returns the response data from a request.

            params:
                data: if this param is present the request is a POST.
        """

        response = self._get_response(url, data)

        if response is None or response.getcode() != 200:
            return None

        return response.read()

    def _manage_scrapers(self, url, data):
        """ Checks if some scraper is suited for data extraction on the
            current url. If so, gets the extractor object and delegates the
            scraping task to the scraper object.
        """

        for Scraper in self.scrapers:

            if any(url_matcher(url, pattern) for pattern in Scraper.matching_urls):

                html = self.extractor.get_object(data)
                Scraper().scrape(html)
                session.commit()

    def _save_urls(self, url, new_url):
        """ Stores the url in a [UrlEntity] object """

        if self.storage is not None:
            self.storage(parent=url, href=new_url)
            session.commit()

    def _validate_url(self, url):
        """ Validates the url against the crawler's [allowed_urls] list. """

        if not self.allowed_urls:
            return True

        return any(url_matcher(url, pattern) for pattern in self.allowed_urls)

    def _fetch(self, url, depth_level=0):
        """ Recursive url fetching.

            Params:
                url: the url to crawl
                depth_level: the current recursion level
        """

        if not self._validate_url(url):
            return

        data = self._get_data(url)
        if data is None:
            return

        self._manage_scrapers(url, data)

        for new_url in self.get_urls(data):

            self._save_urls(url, new_url)

            if depth_level >= self.max_depth:
                return

            self.pool.spawn_n(self._fetch, new_url, depth_level + 1)

    def _login(self):
        """ If the target pages are hidden behind a login, pass through it first.

            self.login can be None or a tuple containing (login_url, params_dict).
        """

        if self.login is None:
            return

        url, data = self.login
        if self._get_response(url, data) is None:
            raise AuthenticationError("Can't login")
    def start(self):
        """ Crawler's run method """

        self._login()

        self.pool = GreenPool()

        for url in self.start_urls:
            self.pool.spawn_n(self._fetch, url, depth_level=0)

        self.pool.waitall()

    # overridables
    def get_urls(self, html):
        """ Returns a list of urls found in the current html page """

        urls = []

        for url_match in self._url_regex.finditer(html):
            urls.append(url_match.group(0))

        return urls
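
A minimal usage sketch follows; it is not part of the module above. The ExampleScraper and ExampleCrawler classes, the example.com urls and the pattern strings are hypothetical, and the exact pattern syntax accepted by url_matcher is assumed rather than shown in this module. A scraper only needs the matching_urls list and the scrape(html) method that _manage_scrapers relies on.

class ExampleScraper(object):
    # Hypothetical scraper: _manage_scrapers only expects a `matching_urls`
    # list and a `scrape(html)` method on each class in BaseCrawler.scrapers.
    matching_urls = ["http://www.example.com/items"]    # assumed url_matcher pattern

    def scrape(self, html):
        # `html` is the object built by the extractor (XPathExtractor by default)
        pass


class ExampleCrawler(BaseCrawler):
    start_urls = ["http://www.example.com"]             # crawling starts here
    allowed_urls = ["http://www.example.com"]           # assumed url_matcher pattern
    scrapers = [ExampleScraper]
    max_depth = 2                                       # stop spawning new fetches at this depth
    login = ("http://www.example.com/login",
             {'user': 'myuser', 'pass': 'mypassword'})  # optional, see BaseCrawler.login


if __name__ == "__main__":
    ExampleCrawler().start()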