# wet/utils/url.py
#
#
""" url utils module. """
__copyright__ = "Copyright 2015, B.H.J Thate"
## IMPORTS
import html
import html.parser
import http
import http.client
import logging
import re
import sys
import urllib.error
import urllib.parse
import urllib.request

from wet import __version__
## AGENT
def useragent():
    """Return the User-Agent string for this WET release.

    The running package version (``__version__``) is interpolated into the
    fixed identification text.
    """
    template = (
        'Mozilla/5.0 (X11; Linux x86_64); WET %s - wet voorstel voor '
        'casemanager en bewindvoering voor diegenen die niet zelfredzaam '
        'zijn.; http://pikacode.com/bthate/wet)'
    )
    return template % __version__
## HELPERS
 
def unescape(text):
    """Return *text* with HTML character references converted to characters.

    Uses ``html.unescape``; the ``HTMLParser.unescape`` method used previously
    was deprecated in Python 3.4 and removed in Python 3.9.
    """
    return html.unescape(text)
 
def get_urls(data):
    """Return the distinct ``href`` targets of all ``<a>`` tags in *data*.

    Fragments (``#...``) are stripped before comparison.  Links that are
    empty after stripping, contain ``..`` or start with ``mailto`` are
    skipped.  The hrefs are returned as written in the markup, in order of
    first appearance.
    """
    from bs4 import BeautifulSoup

    # NOTE(review): no parser is named, so bs4 chooses one itself; results
    # may differ between environments -- confirm whether that is intended.
    soup = BeautifulSoup(data)
    urls = []
    seen = set()  # O(1) duplicate check; ``urls`` keeps the original order
    for tag in soup("a"):
        href = tag.get("href")
        if not href:
            continue
        href = href.split("#")[0]
        if not href or ".." in href or href.startswith("mailto"):
            continue
        if href not in seen:
            seen.add(href)
            urls.append(href)
    logging.info("# %s urls", len(urls))
    return urls
 
def get_encoding(data):
    """Guess the character encoding of *data*.

    *data* may be markup as ``str`` or ``bytes``, or a response-like object
    exposing ``info`` -- either a header mapping or a callable returning one.

    The sources are tried in this order:

    1. the ``charset`` parameter of the ``content-type`` header,
    2. a ``<meta http-equiv="Content-Type" ...>`` tag in the markup,
    3. ``chardet`` detection, when that package is installed and *data* is
       bytes,
    4. ``sys.getdefaultencoding()``.

    A charset taken from the header is returned lower-cased; one taken from a
    meta tag is returned as written.
    """
    # 1. HTTP header.  ``info`` is a method on http.client/urllib responses,
    # but plain mappings are accepted as well.
    info = getattr(data, "info", None)
    if callable(info):
        info = info()
    content_type = info.get("content-type") if hasattr(info, "get") else None
    if content_type and "charset" in content_type.lower():
        charset = content_type.lower().split("charset", 1)[1].strip()
        if charset.startswith("="):
            charset = charset[1:].split(";")[0].strip().strip("'\"")
            if charset:
                return charset

    # 2. Meta tag.  Bytes are decoded as latin-1 only to sniff the ASCII
    # markup; that decoding can never fail.
    if isinstance(data, (bytes, bytearray)):
        text = data.decode("latin-1")
    elif isinstance(data, str):
        text = data
    else:
        text = ""  # response objects carry no markup we can inspect here
    if "<meta" in text.lower():
        for meta in re.findall(r"<meta[^>]+>", text, re.I | re.M):
            http_equiv = re.search(
                r"""http-equiv\s*=\s*['"]([^'"]+)['"]""", meta, re.I)
            if not http_equiv or http_equiv.group(1).lower() != "content-type":
                continue
            if not re.search(r"""content\s*=\s*['"]([^'"]+)['"]""", meta, re.I):
                continue
            found = re.search(r"""charset\s*=\s*([^\s'"]+)""", meta, re.I)
            if found:
                return found.group(1)

    # 3. Statistical detection: optional dependency, needs raw bytes.
    if isinstance(data, (bytes, bytearray)):
        try:
            import chardet
        except ImportError:
            chardet = None
        if chardet is not None:
            guess = chardet.detect(bytes(data)).get("encoding")
            if guess:  # chardet reports None when it cannot decide
                return guess

    # 4. Fallback.
    return sys.getdefaultencoding()
## URL
 
def get_url(type, url, myheaders={}, postdata={}, keyfile=None, certfile="", port=80):
    """Send one HTTP(S) request to *url* and return the raw response.

    Parameters:
        type: HTTP method name, e.g. ``"GET"`` or ``"POST"``.
        url: absolute URL; an ``https`` scheme selects a TLS connection.
        myheaders: extra headers, merged over the built-in defaults.
        postdata: form fields (mapping or sequence of pairs) sent
            url-encoded as the request body; ``str``/``bytes`` are sent
            as given.  Nothing is sent when it is empty.
        keyfile, certfile, port: accepted for backward compatibility but
            not used; a port given in the URL's network location is honoured.

    Returns:
        The ``http.client.HTTPResponse``.  The connection is deliberately
        left open, because closing it would also close the response before
        the caller has read it.
    """
    # NOTE(review): the Accept value separates media types with ';' where
    # HTTP expects ','; left untouched because servers may react to a change.
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'text/plain; text/html; application/json',
        'User-Agent': useragent(),
    }
    headers.update(myheaders)  # the default dict itself is never mutated
    parts = urllib.parse.urlsplit(url)
    # Decide on the parsed scheme: a substring test would also match
    # "http://host/https-info".
    if parts.scheme == "https":
        connection = http.client.HTTPSConnection(parts.netloc)
    else:
        connection = http.client.HTTPConnection(parts.netloc)
    # The request target needs the query string, and must not be empty.
    target = parts.path or "/"
    if parts.query:
        target = target + "?" + parts.query
    body = None
    if postdata:
        if isinstance(postdata, (str, bytes)):
            body = postdata
        else:
            body = urllib.parse.urlencode(postdata)
    connection.connect()
    connection.request(type, target, body=body, headers=headers)
    resp = connection.getresponse()
    logging.info("! %s/%s %s %s", type, resp.status, resp.reason, url)
    return resp
 
def need_redirect(resp):
    """Return the ``Location`` header of a 301/302 response, else ``None``.

    *resp* must expose ``status`` and ``getheader(name)``.  The header value
    is returned as is, so it is ``None`` when the server sent no ``Location``.
    """
    # NOTE(review): 303, 307 and 308 are not reported as redirects; confirm
    # whether callers expect them to be followed.
    if resp.status in (301, 302):
        return resp.getheader("Location")
    return None
## PARSING
 
def parse_url(*args, **kwargs):
    """Split a URL into directory path, base URL, site root and file name.

    The URL is the first positional argument; further arguments are ignored.

    Returns a tuple ``(basepath, base, root, file)``:

    * ``basepath`` -- path of the containing directory, without a trailing
      slash (``""`` at the site root),
    * ``base`` -- scheme + network location + ``basepath``,
    * ``root`` -- scheme + network location only,
    * ``file`` -- last path segment when it contains a ``"."``, else ``None``.

    A last segment without a dot is treated as a directory.  Query string
    and fragment are dropped.
    """
    url = args[0]
    parsed = urllib.parse.urlsplit(url)
    head, _, last = parsed.path.rpartition("/")
    if "." in last:
        # Looks like a file name: the directory is everything before it.
        basepath = head
        filename = last
    else:
        basepath = parsed.path
        filename = None
    if basepath.endswith("/"):
        basepath = basepath[:-1]
    base = urllib.parse.urlunsplit((parsed.scheme, parsed.netloc, basepath, "", ""))
    root = urllib.parse.urlunsplit((parsed.scheme, parsed.netloc, "", "", ""))
    return (basepath, base, root, filename)
 
def parse_urls(*args, **kwargs):
    """Return the distinct absolute link targets found on a page.

    Positional arguments:
        url: address of the page; relative links are resolved against it.
        content (optional): the page markup.  When omitted, the page is
            fetched with ``get_url("GET", url)``.

    Fragments are stripped.  Links that are then empty, contain ``..`` or
    use a scheme other than http/https (``mailto:``, ``javascript:``, ...)
    are skipped.  Host-relative links (``/x``) are prefixed with the site
    root, other relative links with the page's base directory as computed
    by ``parse_url``.  Order of first appearance is kept.
    """
    import bs4

    url = args[0]
    try:
        content = args[1]
    except IndexError:
        # Only the URL was given: download the page ourselves.
        content = get_url("GET", url).read()
    _basepath, base, root, _filename = parse_url(url)
    soup = bs4.BeautifulSoup(content)
    urls = []
    seen = set()  # O(1) duplicate check; ``urls`` keeps the original order
    for tag in soup("a"):
        href = tag.get("href")
        if not href:
            continue
        href = href.split("#")[0]
        if not href or ".." in href:
            continue
        try:
            scheme = urllib.parse.urlsplit(href).scheme
        except ValueError:
            continue  # malformed link, e.g. a broken IPv6 literal
        if scheme in ("http", "https"):
            pass  # already absolute
        elif scheme:
            continue  # mailto:, javascript:, tel:, ftp:, ...
        elif href.startswith("/"):
            href = root + href
        else:
            href = base + "/" + href
        # NOTE(review): the former "root in href" test sat inside the
        # relative-link branch, where it was always true, so off-site
        # absolute links were (and still are) kept; confirm the intent.
        if href not in seen:
            seen.add(href)
            urls.append(href)
    logging.warning("# %s urls gevonden", len(urls))
    return urls
 
def strip_html(text):
    """Return the text content of *text* with all HTML markup removed.

    ``str`` and ``bytes`` are handed to BeautifulSoup unchanged; any other
    object is converted with ``str()`` first.  (Calling ``str()`` on bytes
    would yield the literal ``"b'...'"`` representation rather than the
    markup.)
    """
    from bs4 import BeautifulSoup

    markup = text if isinstance(text, (str, bytes)) else str(text)
    return BeautifulSoup(markup).get_text()
 
def strip_wiki(text):
    """Remove common MediaWiki markup from *text*.

    Drops the ``[[ ]]`` and ``{{ }}`` delimiters (their content is kept),
    converts HTML character references, then removes ``<ref>`` footnotes:
    self-closing ``<ref ... />`` tags and ``<ref ...>...</ref>`` pairs,
    including pairs that span several lines.
    """
    for token in ("[[", "]]", "}}", "{{"):
        text = text.replace(token, "")
    # Unescape before looking for refs so that escaped tags
    # (``&lt;ref&gt;``) are removed as well.
    text = html.unescape(text)
    # Self-closing refs first.  ``[^>]*`` keeps the match inside one tag;
    # ``.*?`` would run from an opening <ref name=..> to the next "/>".
    text = re.sub(r"<ref\s[^>]*?/>", "", text)
    # Paired refs, with or without attributes.
    text = re.sub(r"<ref(?:\s[^>]*)?>.*?</ref>", "", text, flags=re.S)
    return text