# core/utils/url.py
#
#
""" url utils module. """
__copyright__ = "Copyright 2015, B.H.J Thate"
## IMPORTS
from core import __version__
import urllib.request, urllib.error, urllib.parse
import html
import logging
import http.client
import re
import sys

try:
    import chardet
except ImportError:
    chardet = None
## AGENT
def useragent(): return 'Mozilla/5.0 (X11; Linux x86_64); CORE %s - core proposal for case management and guardianship for those who are not self-reliant.; http://pikacode.com/bthate/core)' % __version__
## HELPERS
def unescape(text): return html.unescape(text)
def get_urls(data):
    urls = []
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(data, "html.parser")
    tags = soup('a')
    for tag in tags:
        href = tag.get("href")
        if href:
            href = href.split("#")[0]
            if not href: continue
            #if not href.endswith(".html"): continue
            if ".." in href: continue
            if href.startswith("mailto"): continue
            if href not in urls: urls.append(href)
    logging.info("# %s urls" % len(urls))
    return urls
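
# Example (illustrative, run on an inline HTML snippet):
#
#     page = '<a href="a.html">a</a> <a href="mailto:x@y">mail</a>'
#     get_urls(page)    # -> ['a.html'] (the mailto link is skipped)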
def get_encoding(data):
    if hasattr(data, 'info') and 'content-type' in data.info and 'charset' in data.info['content-type'].lower():
        charset = data.info['content-type'].lower().split('charset', 1)[1].strip()
        if charset[0] == '=':
            charset = charset[1:].strip()
            if ';' in charset: return charset.split(';')[0].strip()
        return charset
    if '<meta' in data.lower():
        metas = re.findall('<meta[^>]+>', data, re.I | re.M)
        if metas:
            for meta in metas:
                test_http_equiv = re.search(r'http-equiv\s*=\s*[\'"]([^\'"]+)[\'"]', meta, re.I)
                if test_http_equiv and test_http_equiv.group(1).lower() == 'content-type':
                    test_content = re.search(r'content\s*=\s*[\'"]([^\'"]+)[\'"]', meta, re.I)
                    if test_content:
                        test_charset = re.search(r'charset\s*=\s*([^\s\'"]+)', meta, re.I)
                        if test_charset: return test_charset.group(1)
    if chardet:
        test = chardet.detect(data)
        if 'encoding' in test: return test['encoding']
    return sys.getdefaultencoding()
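
# Example (illustrative): the charset is taken from a meta http-equiv tag.
#
#     page = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
#     get_encoding(page)    # -> 'utf-8'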
## URL
def get_url(type, url, myheaders={}, postdata={}, keyfile=None, certfile="", port=80):
    headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'text/plain; text/html; application/json', 'User-Agent': useragent()}
    headers.update(myheaders)
    urlparts = urllib.parse.urlparse(url)
    if "https" in url: connection = http.client.HTTPSConnection(urlparts[1]) # keyfile, certfile)
    else: connection = http.client.HTTPConnection(urlparts[1])
    connection.connect()
    connection.request(type, urlparts[2], headers=headers)
    resp = connection.getresponse()
    logging.info("! %s/%s %s %s" % (type, resp.status, resp.reason, url))
    return resp
def need_redirect(resp):
    if resp.status == 301 or resp.status == 302: url = resp.getheader("Location") ; return url
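
# Example (illustrative; example.org is a placeholder host): fetch a page and
# follow a single 301/302 redirect if the server sends one.
#
#     resp = get_url("GET", "http://example.org/old")
#     location = need_redirect(resp)
#     if location: resp = get_url("GET", location)
#     data = resp.read()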
## PARSING
def parse_url(*args, **kwargs):
    """
    Attribute   Index   Value                   Value if not present
    scheme      0       URL scheme specifier    empty string
    netloc      1       Network location part   empty string
    path        2       Hierarchical path       empty string
    query       3       Query component         empty string
    fragment    4       Fragment identifier     empty string
    """
    url = args[0]
    parsed = urllib.parse.urlsplit(url)
    target = parsed[2].split("/")
    if "." in target[-1]: basepath = "/".join(target[:-1]) ; file = target[-1]
    else: basepath = parsed[2] ; file = None
    if basepath.endswith("/"): basepath = basepath[:-1]
    base = urllib.parse.urlunsplit((parsed[0], parsed[1], basepath, "", ""))
    root = urllib.parse.urlunsplit((parsed[0], parsed[1], "", "", ""))
    return (basepath, base, root, file)
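
# Example (illustrative): the returned tuple is (basepath, base, root, file).
#
#     parse_url("http://example.org/docs/index.html")
#     # -> ('/docs', 'http://example.org/docs', 'http://example.org', 'index.html')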
def parse_urls(*args, **kwargs):
    import bs4
    url = args[0]
    try: content = args[1]
    except IndexError: content = get_url("GET", url).read()
    basepath, base, root, file = parse_url(url)
    s = bs4.BeautifulSoup(content, "html.parser")
    urls = []
    tags = s('a')
    for tag in tags:
        href = tag.get("href")
        if href:
            href = href.split("#")[0]
            if not href: continue
            if ".." in href: continue
            if href.startswith("mailto"): continue
            if "http" not in href:
                if href.startswith("/"): href = root + href
                else: href = base + "/" + href
            if root not in href: continue
            if href not in urls: urls.append(href)
    logging.warning("# %s urls found" % len(urls))
    return urls
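
# Example (illustrative): collect same-site links from already fetched content,
# so no network call is made.
#
#     page = '<a href="/a.html">a</a> <a href="http://other.org/">x</a>'
#     parse_urls("http://example.org/index.html", page)
#     # -> ['http://example.org/a.html'] (off-site links are dropped)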
def strip_html(text):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(str(text), "html.parser")
    return soup.get_text()
def strip_wiki(text):
    text = text.replace("[[", "")
    text = text.replace("]]", "")
    text = text.replace("}}", "")
    text = text.replace("{{", "")
    text = unescape(text)
    text = re.sub("<ref .*?/>", "", text)
    text = re.sub("<ref>.*?</ref>", "", text)
    text = re.sub("<ref .*?</ref>", "", text)
    return text
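
# Example (illustrative):
#
#     strip_wiki("[[Python]] is a language.<ref>cite</ref>")
#     # -> 'Python is a language.'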