# Source code for point.plugs.spider
# p/plugs/spider.py
#
#
""" poller module. """
## IMPORTS
from point import Object, kernel
from point.utils import parse_urls
## basic imports
import logging
import threading
import time
## Spider class
class Spider(Object):

    """Breadth-first web spider: fetches a page, records its outgoing
    links, and queues same-site links for further crawling via p.put().
    """

    def __init__(p, time_sleep, *args, **kwargs):
        Object.__init__(p, *args, **kwargs)
        p.time_sleep = time_sleep   # poll interval in seconds
        p.time_in = time.time()     # timestamp of the last crawl start
        p.errors = []               # urls that failed and must be skipped
        p.urls = []                 # urls already visited
        p.url = Object()            # root url info, filled on first crawl()
        p.followed = []
        p.speed = 0                 # per-request throttle; grows 0.1s per fetch
        p.depth = 5                 # maximum number of "/" path segments to follow

    def crawl(p, *args, **kwargs):
        """Fetch args[0], save the links found on it and queue new
        same-site links for crawling.  Returns the list of parsed urls
        (empty if the url was already visited or too deep).
        """
        p.time_in = time.time()
        url = args[0]
        urls = []
        if not p.url:
            # first call: remember the root so only same-site links are followed
            p.url.url = url
            # NOTE(review): parse_url is not imported here (only parse_urls is)
            # -- confirm it exists in the enclosing module's namespace.
            p.url.basepath, p.url.base, p.url.root, p.url.file = parse_url(p.url.url)
        pnr = len(url.split("/"))
        if pnr > p.depth:
            # logging.warn is deprecated; use logging.warning and report
            # the actual configured depth instead of a hard-coded 5.
            logging.warning("%s depth > %s", url, p.depth)
            return
        if url not in p.urls:
            p.urls.append(url)
            content = do_url("GET", url)
            newurl = need_redirect(content)
            if newurl:
                content = do_url("GET", newurl)
                logging.warning("redirecting to %s", newurl)
            newurl2 = need_redirect(content)
            if newurl2:
                # BUG fix: original refetched the old url here instead of
                # following the second redirect target.
                content = do_url("GET", newurl2)
                logging.warning("redirecting to %s", newurl2)
            # throttle harder the more pages we fetch
            time.sleep(p.speed)
            p.speed += 0.1
            urls = parse_urls(url, content.read())
            # persist what was found on this page
            o = Object()
            o.spider = True
            o.orig_url = url
            o.urls = urls
            o.save()
            for u in urls:
                if u in p.urls: continue          # already visited
                if not p.url.base in u: continue  # off-site link
                if u in p.errors: continue        # known-bad url
                p.put(p.crawl, u)
        return urls
# spider command
def do_spider(event):
    """Command handler: lazily create the shared Spider instance and
    crawl the url given in event.rest."""
    if "SPIDER" not in kernel.run:
        kernel.run.SPIDER = Spider(300.0)
    # BUG fix: original called event.repy (typo for event.reply)
    event.reply("crawling %s" % event.rest)
    kernel.run.SPIDER.crawl(event.rest)

kernel.cmnds.register("spider", do_spider)