Source code for point.plugs.spider

# p/plugs/spider.py
#
#

""" poller module. """

## IMPORTS

from point import Object, kernel
# parse_url, do_url and need_redirect are assumed to live in point.utils
# alongside parse_urls, since they are used below without another import
from point.utils import parse_urls, parse_url, do_url, need_redirect

## basic imports

import threading
import logging
import time

## Spider class

class Spider(Object):

    def __init__(p, time_sleep, *args, **kwargs):
        Object.__init__(p, *args, **kwargs)
        p.time_sleep = time_sleep
        p.time_in = time.time()
        p.errors = []      # urls that failed
        p.urls = []        # urls already seen
        p.url = Object()   # parsed parts of the start url
        p.followed = []
        p.speed = 0        # per-request sleep, grows with every fetch
        p.depth = 5        # maximum number of path elements to follow
    def crawl(p, *args, **kwargs):
        p.time_in = time.time()
        url = args[0]
        urls = []
        if not p.url:
            # remember the start url and its parsed parts on the first call
            p.url.url = url
            p.url.basepath, p.url.base, p.url.root, p.url.file = parse_url(p.url.url)
        pnr = len(url.split("/"))
        if pnr > p.depth:
            logging.warning("%s depth > %s" % (url, p.depth))
            return
        if url not in p.urls:
            p.urls.append(url)
        content = do_url("GET", url)
        newurl = need_redirect(content)
        if newurl:
            content = do_url("GET", newurl)
            logging.warning("redirecting to %s" % newurl)
            newurl2 = need_redirect(content)
            if newurl2:
                content = do_url("GET", newurl2)
                logging.warning("redirecting to %s" % newurl2)
        # back off a little more with every fetched page
        time.sleep(p.speed)
        p.speed += 0.1
        urls = parse_urls(url, content.read())
        # save the crawl result
        o = Object()
        o.spider = True
        o.orig_url = url
        o.urls = urls
        o.save()
        # queue unseen urls on the same base for further crawling
        for u in urls:
            if u in p.urls:
                continue
            if p.url.base not in u:
                continue
            if u in p.errors:
                continue
            p.put(p.crawl, u)
        return urls
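## example (a sketch, not part of the bot's command flow): the Spider can be
## driven directly; this assumes a configured point kernel so that do_url and
## Object.save work, and reuses the 300.0 sleep value of the spider command

def example_crawl():
    crawler = Spider(300.0)
    # crawl() fetches the page, saves the urls it finds and queues unseen
    # same-site links for further crawling; it returns the parsed urls
    return crawler.crawl("http://example.com/")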
## spider command

def do_spider(event):
    if "SPIDER" not in kernel.run:
        kernel.run.SPIDER = Spider(300.0)
    event.reply("crawling %s" % event.rest)
    kernel.run.SPIDER.crawl(event.rest)
kernel.cmnds.register("spider", do_spider)