
# core/service/spider.py
#
#

""" Spider plugin. """

## IMPORTS

from core.utils.url import parse_urls, parse_url, need_redirect, get_url
from core.errors import error
from core.kernel import kernel
from core.thing import Thing

import logging
import time

## DEFINES

urls = {"vergadering": "http://www.tweedekamer.nl/vergaderingen/commissievergaderingen?qry=*&srt=date:asc:date&fld_tk_categorie=Vergaderingen&fld_tk_subcategorie=Commissievergaderingen&clusterName=Vergaderingen",
        "commissie": "http://www.tweedekamer.nl/vergaderingen/commissievergaderingen",
        "cel": "http://nl.wikipedia.org/wiki/Cel_%28biologie%29",
        "tucht": "http://coreten.overheid.nl/BWBR0006251/"
       }

url_list = []   # global skip list, checked (but never filled) by Spider.crawl()
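
# parse_url() is assumed here to split an URL into its
# (basepath, base, root, file) parts -- crawl() below unpacks it that
# way and uses .base to keep the crawl on the same site. A sketch of
# the assumed shapes, not verified against core.utils.url:
#
#   basepath, base, root, file = parse_url(urls["cel"])
#   # base would be something like "http://nl.wikipedia.org"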

## Spider class

class Spider(Thing):

    """ crawl pages, staying under the base of the first URL given. """

    def __init__(zelf, *args, **kwargs):
        Thing.__init__(zelf, *args, **kwargs)
        zelf.errors = []     # URLs that errored while being fetched
        zelf.urls = []       # URLs already crawled
        zelf.url = Thing()   # parsed parts of the seed URL
        zelf.followed = []
        zelf.speed = 0       # seconds to sleep before each fetch
        zelf.depth = 10      # maximum number of "/" separated parts in an URL

    def boot(zelf, *args, **kwargs):
        """ crawl all URLs previously saved by the spider command. """
        for fn in kernel.all("spider"):
            logging.warn("spider %s" % fn)
            obj = Thing().load(fn)
            zelf.crawl(obj.url)

    def crawl(zelf, *args, **kwargs):
        """ fetch an URL, recurse into the links it contains and sync the result. """
        time.sleep(zelf.speed)
        zelf.speed += 1.0    # back off a little more on every fetch
        try:
            url, search = args
        except ValueError:
            url = args[0]
            search = ""
        logging.warn("crawl %s" % url)
        urls = []
        if not zelf.url:
            # first call: remember the seed URL and its parsed parts
            zelf.url.url = url
            zelf.url.basepath, zelf.url.base, zelf.url.root, zelf.url.file = parse_url(zelf.url.url)
        pnr = len(url.split("/"))
        if pnr > zelf.depth:
            logging.warn("%s max depth" % url)
            return
        if url not in zelf.urls and url not in url_list:
            zelf.urls.append(url)
            try:
                content = get_url("GET", url)
            except Exception:
                error()
                return
            # follow up to two redirects
            newurl = need_redirect(content)
            if newurl:
                logging.warn("redirecting to %s" % newurl)
                content = get_url("GET", newurl)
                newurl2 = need_redirect(content)
                if newurl2:
                    logging.warn("redirecting to %s" % newurl2)
                    content = get_url("GET", newurl2)
            data = content.read()
            #if search:
            #    div = extract_div(search, data)
            #    if div:
            #        o.txt = strip_html(div)
            #else:
            #    o.content = str(data, "utf-8")
            urls = parse_urls(url, data)
            for u in urls:
                if u in zelf.urls:
                    continue
                if zelf.url.base not in u:
                    continue
                if u in zelf.errors:
                    continue
                zelf.crawl(u, search)
            o = Thing()
            o.prefix = "spider.data"
            o.service = "spider"
            o.urls = urls
            o.sync()
        return urls
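
## EXAMPLE

# a minimal usage sketch (assumes the helpers in core.utils.url can
# fetch plain HTTP and that Thing.sync() persists without extra setup):
#
#   s = Spider()
#   s.crawl(urls["cel"])   # stays under the seed URL's base
#   for u in s.urls:
#       print(u)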

def spider(event):
    """ spider <url> [<search>] -- crawl an URL and reply with the pages found. """
    try:
        url = event.parsed.args[0]
    except IndexError:
        return
    if "http" not in url:
        return
    try:
        search = event.parsed.args[1]
    except IndexError:
        search = ""
    urls = kernel.all("spider")
    if url not in urls:
        # save the target so boot() picks it up on the next start
        thing = Thing()
        thing.prefix = "spider"
        thing.url = url
        thing.search = search
        thing.save()
    s = Spider()
    s.crawl(url, search)
    for url in s.urls:
        event.reply(url)

kernel.register("spider", spider)
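
# a sketch of calling the registered command by hand; Parsed and Stub
# are hypothetical stand-ins for the kernel's event object, which is
# only assumed to carry .parsed.args and a .reply() callable:
#
#   class Parsed:
#       args = ["http://nl.wikipedia.org/wiki/Cel_%28biologie%29"]
#   class Stub:
#       parsed = Parsed()
#       def reply(self, txt):
#           print(txt)
#
#   spider(Stub())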

## INIT

def init(*args, **kwargs):
    """ boot the spider plugin: queue crawling of the saved targets. """
    spider = Spider()
    kernel.put(spider.boot)
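
# a sketch of a manual start, assuming kernel.put() queues the callable
# on a worker -- calling boot() directly gives a synchronous run instead:
#
#   init()            # queue a boot of the saved spider targets
#   Spider().boot()   # or crawl them right away in this thread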