
# core/utils/url.py
#
#

""" url utils module. """

__copyright__ = "Copyright 2015, B.H.J Thate"

## IMPORTS

from core import __version__

import urllib.request, urllib.error, urllib.parse
import html
import http.client

import logging
import re
import sys

# chardet is optional; get_encoding() falls back to it only when available.
try:
    import chardet
except ImportError:
    chardet = None

## AGENT

def useragent():
    return 'Mozilla/5.0 (X11; Linux x86_64); CORE %s - core voorstel voor casemanager en bewindvoering voor diegenen die niet zelfredzaam zijn.; http://pikacode.com/bthate/core)' % __version__
## HELPERS

def unescape(text):
    # html.parser.HTMLParser().unescape() was removed in Python 3.9;
    # html.unescape() is the supported equivalent.
    return html.unescape(text)
def get_urls(data):
    urls = []
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(data, "html.parser")
    tags = soup('a')
    for tag in tags:
        href = tag.get("href")
        if href:
            href = href.split("#")[0]
            if not href:
                continue
            #if not href.endswith(".html"): continue
            if ".." in href:
                continue
            if href.startswith("mailto"):
                continue
            if href not in urls:
                urls.append(href)
    logging.info("# %s urls" % len(urls))
    return urls
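# EXAMPLE (illustrative sketch, not part of the original module): feeding
# get_urls() a small HTML snippet; fragments are stripped and mailto links
# are filtered out.
#
#   get_urls('<a href="page.html#top">p</a> <a href="mailto:[email protected]">m</a>')
#   # -> ['page.html']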
def extract_div(search, data):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(data, "html.parser")
    divs = soup('div')
    for div in divs:
        if div.get(search):
            return div
## ENCODING

def get_encoding(data):
    # 1. prefer an explicit charset in the content-type header, if present
    if hasattr(data, 'info') and 'content-type' in data.info and 'charset' in data.info['content-type'].lower():
        charset = data.info['content-type'].lower().split('charset', 1)[1].strip()
        if charset[0] == '=':
            charset = charset[1:].strip()
            if ';' in charset:
                return charset.split(';')[0].strip()
        return charset
    # 2. fall back to a <meta http-equiv="content-type"> tag in the body
    if '<meta' in data.lower():
        metas = re.findall('<meta[^>]+>', data, re.I | re.M)
        if metas:
            for meta in metas:
                test_http_equiv = re.search(r'http-equiv\s*=\s*[\'"]([^\'"]+)[\'"]', meta, re.I)
                if test_http_equiv and test_http_equiv.group(1).lower() == 'content-type':
                    test_content = re.search(r'content\s*=\s*[\'"]([^\'"]+)[\'"]', meta, re.I)
                    if test_content:
                        test_charset = re.search(r'charset\s*=\s*([^\s\'"]+)', meta, re.I)
                        if test_charset:
                            return test_charset.group(1)
    # 3. last resorts: chardet detection, then the interpreter default
    if chardet:
        test = chardet.detect(data)
        if 'encoding' in test:
            return test['encoding']
    return sys.getdefaultencoding()
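# EXAMPLE (illustrative sketch, not part of the original module): charset
# detection from a <meta http-equiv> tag in a plain HTML string.
#
#   get_encoding('<meta http-equiv="Content-Type" content="text/html; charset=utf-8">')
#   # -> 'utf-8'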
## URL

def get_url(type, url, myheaders={}, postdata={}, keyfile=None, certfile="", port=80):
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'text/plain; text/html; application/json',
               'User-Agent': useragent()}
    headers.update(myheaders)
    urlparts = urllib.parse.urlparse(url)
    if "https" in url:
        connection = http.client.HTTPSConnection(urlparts[1]) # keyfile, certfile)
    else:
        connection = http.client.HTTPConnection(urlparts[1])
    connection.connect()
    connection.request(type, urlparts[2], headers=headers)
    resp = connection.getresponse()
    logging.info("! %s/%s %s %s" % (type, resp.status, resp.reason, url))
    return resp
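# EXAMPLE (illustrative sketch, not part of the original module; assumes
# network access and that example.com is reachable):
#
#   resp = get_url("GET", "http://example.com/")
#   print(resp.status, resp.reason)   # e.g. 200 OK
#   body = resp.read()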
def need_redirect(resp):
    if resp.status == 301 or resp.status == 302:
        url = resp.getheader("Location")
        return url
## PARSING

def parse_url(*args, **kwargs):
    """
    Split a URL into (basepath, base, root, file).

    Fields of the urlsplit() result used below:

    Attribute   Index   Value                    Value if not present
    ---------   -----   ----------------------   --------------------
    scheme      0       URL scheme specifier     empty string
    netloc      1       Network location part    empty string
    path        2       Hierarchical path        empty string
    query       3       Query component          empty string
    fragment    4       Fragment identifier      empty string
    """
    url = args[0]
    parsed = urllib.parse.urlsplit(url)
    target = parsed[2].split("/")
    if "." in target[-1]:
        basepath = "/".join(target[:-1])
        file = target[-1]
    else:
        basepath = parsed[2]
        file = None
    if basepath.endswith("/"):
        basepath = basepath[:-1]
    base = urllib.parse.urlunsplit((parsed[0], parsed[1], basepath, "", ""))
    root = urllib.parse.urlunsplit((parsed[0], parsed[1], "", "", ""))
    return (basepath, base, root, file)
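# EXAMPLE (illustrative sketch, not part of the original module):
#
#   parse_url("http://example.com/docs/index.html")
#   # -> ('/docs', 'http://example.com/docs', 'http://example.com', 'index.html')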
def parse_urls(*args, **kwargs):
    import bs4
    url = args[0]
    try:
        content = args[1]
    except IndexError:
        content = get_url("GET", url).read()
    basepath, base, root, file = parse_url(url)
    s = bs4.BeautifulSoup(content, "html.parser")
    urls = []
    tags = s('a')
    for tag in tags:
        href = tag.get("href")
        if href:
            href = href.split("#")[0]
            if not href:
                continue
            if ".." in href:
                continue
            if href.startswith("mailto"):
                continue
            if not "http" in href:
                if href.startswith("/"):
                    href = root + href
                else:
                    href = base + "/" + href
            if not root in href:
                continue
            if href not in urls:
                urls.append(href)
    logging.warning("# %s urls found" % len(urls))
    return urls
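# EXAMPLE (illustrative sketch, not part of the original module): passing
# the page content as the second argument avoids a network round trip;
# relative links are resolved against the base URL.
#
#   parse_urls("http://example.com/docs/index.html", '<a href="page.html">p</a>')
#   # -> ['http://example.com/docs/page.html']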
def strip_html(text):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(str(text), "html.parser")
    return soup.get_text()
def strip_wiki(text):
    text = text.replace("[[", "")
    text = text.replace("]]", "")
    text = text.replace("}}", "")
    text = text.replace("{{", "")
    text = unescape(text)
    text = re.sub("<ref .*?/>", "", text)
    text = re.sub("<ref>.*?</ref>", "", text)
    text = re.sub("<ref .*?</ref>", "", text)
    return text
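# EXAMPLE (illustrative sketch, not part of the original module): wiki link
# brackets, templates, <ref> tags and HTML entities are stripped.
#
#   strip_wiki("[[Python]] is a language<ref>cite</ref> &amp; more")
#   # -> 'Python is a language & more'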