Source code for crawley.crawlers.base

from eventlet import GreenPool

from re import compile as re_compile

from crawley.config import CRAWLEY_ROOT_DIR
from crawley.http.managers import RequestManager
from crawley.http.cookies import CookieHandler
from crawley.http.response import Response
from crawley.extractors import XPathExtractor
from crawley.exceptions import AuthenticationError
from crawley.utils import url_matcher

user_crawlers = []

[docs]class CrawlerMeta(type):
    """
        This metaclass adds the user's crawlers to a list
        used by the CLI commands.
        Abstract base crawlers won't be added.
    """

    def __init__(cls, name, bases, dct):

        if not hasattr(cls, '__module__' ) or not cls.__module__.startswith(CRAWLEY_ROOT_DIR):
            user_crawlers.append(cls)
        super(CrawlerMeta, cls).__init__(name, bases, dct)


[docs]class BaseCrawler(object):
    """
        User's Crawlers must inherit from this class, may
        override some methods and define the start_urls list,
        the scrapers and the max crawling depth.
    """

    __metaclass__ = CrawlerMeta

    start_urls = []
    """ A list containing the start urls for the crawler"""

    allowed_urls = []
    """ A list of urls allowed for crawl"""

    scrapers = []
    """ A list of scrapers classes"""

    max_depth = -1
    """ The maximun crawling recursive level"""

    extractor = None
    """ The extractor class. Default is XPathExtractor"""

    post_urls = []
    """ The Post data for the urls. A List of tuples containing (url, data_dict)
        Example: ("http://www.mypage.com/post_url", {'page' : '1', 'color' : 'blue'})
    """

    login = None
    """ The login data. A tuple of (url, login_dict).
        Example: ("http://www.mypage.com/login", {'user' : 'myuser', 'pass', 'mypassword'})
    """

    _url_regex = re_compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')

    def __init__(self, storage=None, sessions=None, debug=False):
        """
            Initializes the crawler

            params:

                storages: A list of entities

                sessions: Database or Documents persistant sessions

                debug: indicates if the crawler logs to stdout debug info
        """

        self.storage = storage

        if sessions is None:
            sessions = []

        self.sessions = sessions
        self.debug = debug

        if self.extractor is None:
            self.extractor = XPathExtractor

        self.extractor = self.extractor()
        self.cookie_hanlder = CookieHandler()

        self.pool = GreenPool()
        self.request_manager = RequestManager()

    def _get_response(self, url, data=None):
        """
            Returns the response object from a request

            params:
                data: if this param is present it makes a POST.
        """

        return self.request_manager.make_request(url, self.cookie_hanlder, data)

    def _get_data(self, url, data=None):
        """
            Returns the response data from a request

            params:
                data: if this param is present it makes a POST.
        """

        for pattern, post_data in self.post_urls:
            if url_matcher(url, pattern):
                data = post_data

        return self._get_response(url, data)

    def _manage_scrapers(self, url, data):
        """
            Checks if some scraper is suited for data extraction on the current url.
            If so, gets the extractor object and delegate the scraping task
            to the scraper Object
        """
        urls = []

        for scraper_class in self.scrapers:

            if [pattern for pattern in scraper_class.matching_urls if url_matcher(url, pattern)]:

                html = self.extractor.get_object(data)

                response = Response(html, url)
                scraper = scraper_class()
                scraper.scrape(response)

                self._commit()

                urls.extend(scraper.get_urls(response))

        return urls

    def _commit(self):
        """
            Makes a Commit in all sessions
        """

        for session in self.sessions:
            session.commit()

    def _save_urls(self, url, new_url):
        """
            Stores the url in an [UrlEntity] Object
        """

        if self.storage is not None:

            self.storage(parent=url, href=new_url)
            self._commit()

    def _validate_url(self, url):
        """
            Validates if the url is in the crawler's [allowed_urls] list.
        """

        if not self.allowed_urls:
            return True

        return bool([True for pattern in self.allowed_urls if url_matcher(url, pattern)])

    def _fetch(self, url, depth_level=0):
        """
            Recursive url fetching.

            Params:
                depth_level: The maximun recursion level
                url: The url to start crawling
        """

        if not self._validate_url(url):
            return

        if self.debug:
            print "crawling -> %s" % url

        data = self._get_data(url)
        if data is None:
            return

        urls = self._manage_scrapers(url, data)
        if not urls:
            urls = self.get_urls(data)

        for new_url in urls:

            self._save_urls(url, new_url)

            if depth_level >= self.max_depth:
                return
            self.pool.spawn_n(self._fetch, new_url, depth_level + 1)

    def _login(self):
        """
            If target pages are hidden behind a login then
            pass through it first.

            self.login can be None or a tuple containing
            (login_url, params_dict)
        """
        if self.login is None:
            return

        url, data = self.login
        if self._get_response(url, data) is None:
            raise AuthenticationError("Can't login")

[docs]    def start(self):
        """
            Crawler's run method
        """
        self._login()

        for url in self.start_urls:
            self.pool.spawn_n(self._fetch, url, depth_level=0)

        self.pool.waitall()


    #overridables

[docs]    def get_urls(self, html):
        """
            Returns a list of urls found in the current html page
        """
        urls = []
        for url_match in self._url_regex.finditer(html):
            urls.append(url_match.group(0))
        return urls
Navigation

Source code for crawley.crawlers.base

Quick search

Navigation