# Source code for cslbot.helpers.urlutils

# -*- coding: utf-8 -*-
# Copyright (C) 2013-2015 Samuel Damashek, Peter Foley, James Forcier, Srijay Kasturi, Reed Koser, Christopher Reffett, and Fox Wilson
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

import json
import re
import socket
import ssl
from urllib import request
from urllib.error import HTTPError, URLError
from urllib.parse import urlsplit, urlunsplit

from lxml.html import parse

from requests import post
from requests.exceptions import ConnectTimeout

from . import misc
from .exception import CommandFailedException


def get_short(msg, key):
    """Shorten a URL through the Google URL Shortener API.

    Args:
        msg: The URL to shorten.
        key: API key, sent as the ``key`` query parameter.

    Returns:
        ``msg`` unchanged when it is shorter than 20 characters or when the
        API response contains an ``error`` entry; otherwise the value of the
        response's ``id`` field.

    Raises:
        ConnectTimeout: If the connection times out. The message has
            everything from ``key=`` onwards replaced so the API key is
            not exposed.
    """
    # Short inputs are returned as-is without contacting the service.
    if len(msg) < 20:
        return msg
    try:
        response = post('https://www.googleapis.com/urlshortener/v1/url', params={'key': key}, data=json.dumps({'longUrl': msg}),
                        headers={'Content-Type': 'application/json'}, timeout=10)
        data = response.json()
    except ConnectTimeout as e:
        # Sanitize the error before throwing it.  ``from None`` suppresses
        # implicit exception chaining; without it the original exception
        # (whose text still contains the key) would be shown in tracebacks.
        raise ConnectTimeout(re.sub('key=.*', 'key=<removed>', str(e))) from None
    if 'error' in data:
        return msg
    return data['id']

def ensure_prefix(url):
    """Return ``url`` with ``http://`` prepended when it carries no scheme.

    A scheme is only recognised when ``://`` occurs before any ``/``, ``?``
    or ``#``.  This keeps a scheme-less URL that merely embeds another URL
    in its path or query (e.g. ``example.com/?u=http://x.y``) from being
    mistaken for one that already has a prefix.

    Args:
        url: The URL string, with or without a leading ``scheme://``.

    Returns:
        ``url`` unchanged if it already has a scheme, otherwise
        ``'http://' + url``.
    """
    if re.match(r'[^/?#]*://', url):
        return url
    return 'http://' + url


def get_title(url):
    """Fetch ``url`` and return a short title describing what it points to.

    Args:
        url: The address to look up; a missing scheme is filled in by
            ``ensure_prefix``.

    Returns:
        One of the following, passed through ``misc.truncate_msg`` with a
        limit of 256:

        * ``'Image'`` when the Content-Type starts with ``image/``;
        * the text of the document's ``<title>`` element, with newlines
          replaced by spaces and surrounding whitespace stripped;
        * the Content-Type header when no title text is found;
        * ``'Title Not Found'`` when there is neither a title nor a
          Content-Type;
        * ``'HTTP Error <code>'`` when the server answers with an HTTP error.

    Raises:
        CommandFailedException: On a socket timeout, a certificate error, a
            connection reset, or any other ``URLError``.
    """
    # Placeholder only: every non-raising path below assigns a new value.
    title = 'No Title Found'
    try:
        url = ensure_prefix(url)
        parts = urlsplit(url)
        # IDNA-encode the network location so non-ASCII host names can be
        # requested; any remaining non-ASCII characters become '?'.
        # NOTE(review): the idna codec can raise UnicodeError for invalid
        # labels, which none of the handlers below catch -- confirm that
        # callers cope with it.
        url = urlunsplit((parts[0], parts[1].encode('idna').decode(), parts[2], parts[3], parts[4]))
        url = url.encode('ascii', 'replace').decode()
        # User-Agent is really hard to get right :(
        headers = {'User-Agent': 'Mozilla/5.0 CslBot'}
        opener = request.build_opener(request.HTTPCookieProcessor)
        # The context manager closes the response (and its socket) on every
        # exit path, including when parsing raises.
        with opener.open(request.Request(url, headers=headers), timeout=10) as req:
            ctype = req.getheader('Content-Type')
            if ctype is not None and ctype.startswith('image/'):
                title = 'Image'
            else:
                html = parse(req)
                t = html.find('.//title') if html.getroot() is not None else None
                if t is not None and t.text is not None:
                    # Try to handle multiple types of unicode: if every
                    # character fits in one byte, reinterpret those bytes
                    # as UTF-8; otherwise keep the text as parsed.
                    try:
                        title = bytes(map(ord, t.text)).decode('utf-8')
                    except (UnicodeDecodeError, ValueError):
                        title = t.text
                    title = title.replace('\n', ' ').strip()
                elif ctype is not None:
                    title = ctype
                else:
                    title = "Title Not Found"
    except (socket.timeout, ssl.CertificateError) as e:
        raise CommandFailedException(e)
    except HTTPError as e:
        # HTTPError is caught before URLError (its base class) so that HTTP
        # status errors are reported as a title rather than a failure.
        title = 'HTTP Error %d' % e.code
    except ConnectionResetError as e:
        raise CommandFailedException(e.strerror)
    except URLError as e:
        # Prefer the underlying error's strerror when one is available.
        if not hasattr(e.reason, 'strerror') or e.reason.strerror is None:
            raise CommandFailedException(e.reason)
        else:
            raise CommandFailedException(e.reason.strerror)
    title = misc.truncate_msg(title, 256)
    return title