# Source code for mini.looper
# mini/looper.py
#
#
""" poller module. """
## IMPORTS
from mini import Object, kernel
from mini.utils import parse_urls
## basic imports
import logging
import threading
import time
## Spider class
class Spider(Object):
    """Crawler that fetches a page, records its links and schedules the
    links that belong to the same site.

    Attributes set by ``__init__``:
        sleeptime: value passed by the caller; stored but not read in this class.
        errors:    URLs that are skipped when scheduling follow-up crawls.
        urls:      URLs that have already been crawled.
        url:       Object describing the first URL crawled (url, basepath,
                   base, root, file); filled in on the first ``crawl`` call.
        followed:  initialised to an empty list; not used in this class.
        speed:     seconds to sleep after each fetch; grows by 0.1 per page.
        depth:     maximum number of "/"-separated parts a URL may have.
    """

    def __init__(zelf, sleeptime, *args, **kwargs):
        """Initialise crawler state; extra arguments go to ``Object.__init__``."""
        Object.__init__(zelf, *args, **kwargs)
        zelf.sleeptime = sleeptime
        zelf.errors = []
        zelf.urls = []
        zelf.url = Object()
        zelf.followed = []
        zelf.speed = 0
        zelf.depth = 5

    def crawl(zelf, *args, **kwargs):
        """Fetch ``args[0]``, save the links found on it and schedule them.

        Returns the list of links parsed from the page, an empty list when
        the URL was crawled before, or ``None`` when the URL has more than
        ``zelf.depth`` "/"-separated parts.
        """
        # NOTE(review): parse_url, do_url and need_redirect are not imported
        # in the visible part of this module -- confirm where they come from.
        url = args[0]
        urls = []
        # First call: remember the start URL and its components so later
        # links can be restricted to the same base.
        # Assumes an Object without attributes is falsy -- TODO confirm.
        if not zelf.url:
            zelf.url.url = url
            (zelf.url.basepath,
             zelf.url.base,
             zelf.url.root,
             zelf.url.file) = parse_url(zelf.url.url)
        pnr = len(url.split("/"))
        if pnr > zelf.depth:
            logging.warning("%s depth > %s", url, zelf.depth)
            return
        # Already crawled: nothing to fetch, report no new links.
        if url in zelf.urls:
            return urls
        zelf.urls.append(url)
        content = do_url("GET", url)
        # Follow at most two consecutive redirects.
        newurl = need_redirect(content)
        if newurl:
            content = do_url("GET", newurl)
            logging.warning("redirecting to %s", newurl)
        newurl2 = need_redirect(content)
        if newurl2:
            # Fetch the redirect target; the original re-fetched `url` here,
            # which never followed the second hop.
            content = do_url("GET", newurl2)
            logging.warning("redirecting to %s", newurl2)
        # Throttle: every crawled page makes the next pause 0.1s longer.
        time.sleep(zelf.speed)
        zelf.speed += 0.1
        urls = parse_urls(url, content.read())
        # Store a record of this page and the links found on it.
        o = Object()
        o.spider = True
        o.orig_url = url
        o.urls = urls
        o.save()
        for u in urls:
            if u in zelf.urls:
                continue
            # Stay on the site the crawl started from.
            if zelf.url.base not in u:
                continue
            if u in zelf.errors:
                continue
            # presumably queues the call for later execution; `put` is
            # defined outside this block -- verify against Object.
            zelf.put(zelf.crawl, u)
        return urls
## RSS class