Source code for core.service.spider
# core/dutch/spider.py
#
#
""" Spider plugin. """
## IMPORTS
from core.utils.url import parse_urls, parse_url, need_redirect, get_url
from core.errors import error
from core.kernel import kernel
from core.thing import Thing
import logging
import time
## DEFINES
urls = {"vergadering": "http://www.tweedekamer.nl/vergaderingen/commissievergaderingen?qry=*&srt=date:asc:date&fld_tk_categorie=Vergaderingen&fld_tk_subcategorie=Commissievergaderingen&clusterName=Vergaderingen",
"commissie": "http://www.tweedekamer.nl/vergaderingen/commissievergaderingen",
"cel": "http://nl.wikipedia.org/wiki/Cel_%28biologie%29",
"tucht": "http://coreten.overheid.nl/BWBR0006251/"
}
url_list = []
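# "urls" holds a few example targets (not referenced elsewhere in this
# module); "url_list" is a module level skip list that Spider.crawl
# consults before fetching a url.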
## Spider class
class Spider(Thing):

    def __init__(zelf, *args, **kwargs):
        Thing.__init__(zelf, *args, **kwargs)
        zelf.errors = []        # urls that raised an error
        zelf.urls = []          # urls already crawled
        zelf.url = Thing()      # start url and its parsed parts
        zelf.followed = []
        zelf.speed = 0          # seconds to sleep between fetches
        zelf.depth = 10         # maximum number of path segments
    def boot(zelf, *args, **kwargs):
        # crawl every target saved under the "spider" prefix.
        for fn in kernel.all("spider"):
            logging.warning("spider %s" % fn)
            obj = Thing().load(fn)
            zelf.crawl(obj.url)
    def crawl(zelf, *args, **kwargs):
        # throttle: each crawl waits a second longer than the previous one.
        time.sleep(zelf.speed)
        zelf.speed += 1.0
        try:
            url, search = args
        except ValueError:
            url = args[0]
            search = ""
        logging.warning("crawl %s" % url)
        urls = []
        if not zelf.url:
            # first call: remember the start url and its parsed components.
            zelf.url.url = url
            zelf.url.basepath, zelf.url.base, zelf.url.root, zelf.url.file = parse_url(zelf.url.url)
        # use the number of path segments as a rough depth measure.
        pnr = len(url.split("/"))
        if pnr > zelf.depth:
            logging.warning("%s max depth" % url)
            return
        if url not in zelf.urls and url not in url_list:
            zelf.urls.append(url)
        try:
            content = get_url("GET", url)
        except Exception:
            error()
            return
        # follow at most two redirects.
        newurl = need_redirect(content)
        if newurl:
            logging.warning("redirecting to %s" % newurl)
            content = get_url("GET", newurl)
            newurl2 = need_redirect(content)
            if newurl2:
                logging.warning("redirecting to %s" % newurl2)
                content = get_url("GET", newurl2)
        data = content.read()
        #if search:
        #    div = extract_div(search, data)
        #    if div: o.txt = strip_html(div)
        #else: o.content = str(data, "utf-8")
        urls = parse_urls(url, data)
        for u in urls:
            # skip urls already crawled, off-site or known to error.
            if u in zelf.urls:
                continue
            if zelf.url.base not in u:
                continue
            if u in zelf.errors:
                continue
            zelf.crawl(u, search)
        # persist the harvested urls.
        o = Thing()
        o.prefix = "spider.data"
        o.service = "spider"
        o.urls = urls
        o.sync()
        return urls
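# A minimal usage sketch (assuming a booted kernel and network access):
#
#     s = Spider()
#     s.depth = 5                              # give up after 5 path segments
#     found = s.crawl("http://www.tweedekamer.nl/")
#     for u in found:
#         print(u)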
## COMMANDS
def spider(event):
    try:
        url = event.parsed.args[0]
    except IndexError:
        return
    if "http" not in url:
        return
    try:
        search = event.parsed.args[1]
    except IndexError:
        search = ""
    # kernel.all("spider") yields filenames (see boot above), so load each
    # saved target and compare its url before saving a new one.
    known = [Thing().load(fn).url for fn in kernel.all("spider")]
    if url not in known:
        thing = Thing()
        thing.prefix = "spider"
        thing.url = url
        thing.search = search
        thing.save()
    s = Spider()
    s.crawl(url, search)
    for u in s.urls:
        event.reply(u)

kernel.register("spider", spider)
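# registering under "spider" makes the function above the handler for a
# "spider <url> [search]" command; the arguments are read from
# event.parsed.args.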
## INIT
def init(*args, **kwargs):
    # "s" instead of "spider" to avoid shadowing the command handler above.
    s = Spider()
    kernel.put(s.boot)
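# init() is presumably called by the plugin loader; kernel.put queues
# boot so every saved spider target is crawled once the kernel's main
# loop runs.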