Source code for weblayer.utils
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" :py:mod:`weblayer.utils` provides a set of utility functions for
converting and encoding.
"""
__all__ = [
'encode_to_utf8',
'decode_to_unicode',
'xhtml_escape',
'url_escape',
'unicode_urlencode',
'json_encode',
'json_decode',
'generate_hash'
]
import hashlib
import random
import time
import urllib
import xml.sax.saxutils
try: #pragma NO COVERAGE
import simplejson as json
except ImportError: #pragma NO COVERAGE
import json
def encode_to_utf8(value):
""" Converts a ``unicode`` to a utf-8 encoded ``str``::
>>> a = u'foo'
>>> a
u'foo'
>>> encode_to_utf8(a)
'foo'
>>> b = u'\u817e\u8baf\u9996\u9875'
>>> c = '\xe8\x85\xbe\xe8\xae\xaf\xe9\xa6\x96\xe9\xa1\xb5'
>>> assert encode_to_utf8(b) == c
Regular strings get left alone::
>>> d = 'foo'
>>> encode_to_utf8(d)
'foo'
Other types raise a ``ValueError``::
>>> e = None
>>> encode_to_utf8(e) #doctest: +NORMALIZE_WHITESPACE
Traceback (most recent call last):
...
ValueError: None must be a `basestring`
"""
if not isinstance(value, basestring):
raise ValueError('%s must be a `basestring`' % value)
elif isinstance(value, unicode):
return value.encode("utf-8")
return value
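# Illustrative sketch (not part of the original module): ``encode_to_utf8``
# is typically used right before writing to a byte-oriented sink.  The
# ``_write_utf8_line`` helper below is hypothetical.
def _write_utf8_line(fileobj, value):
    """ Write ``value`` to ``fileobj`` as a single utf-8 encoded line. """
    fileobj.write(encode_to_utf8(value))
    fileobj.write('\n')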
def decode_to_unicode(value):
""" Converts a (hopefully) utf-8 encoded ``str`` to a ``unicode``::
>>> a = 'foo'
>>> decode_to_unicode(a)
u'foo'
>>> b = '\xe8\x85\xbe\xe8\xae\xaf\xe9\xa6\x96\xe9\xa1\xb5'
>>> decode_to_unicode(b)
u'\u817e\u8baf\u9996\u9875'
Unicode values get left alone::
>>> c = u'foo'
>>> decode_to_unicode(c)
u'foo'
Other types raise a ``ValueError``::
>>> d = None
>>> decode_to_unicode(d) #doctest: +NORMALIZE_WHITESPACE
Traceback (most recent call last):
...
ValueError: None must be a `basestring`
"""
if not isinstance(value, basestring):
raise ValueError('%s must be a `basestring`' % value)
elif isinstance(value, str):
return value.decode("utf-8")
return value
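# Illustrative sketch (not part of the original module): decoding incoming
# byte strings at the boundary so the rest of the application only handles
# ``unicode``.  The ``_decode_params`` helper is hypothetical and assumes
# every value is a ``basestring``.
def _decode_params(params):
    """ Return a copy of ``params`` with every value decoded to ``unicode``. """
    return dict((k, decode_to_unicode(v)) for k, v in params.items())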
def unicode_urlencode(items):
""" Ensures all ``items`` are encoded to utf-8 and passed to
:py:func:`~urllib.urlencode`.
Pass it a dict, comes out like a query string::
>>> r1 = unicode_urlencode({'a': 'b'})
>>> r1
'a=b'
Ditto a list of two item tuples::
>>> r2 = unicode_urlencode([('a', 'b')])
>>> r2 == r1
True
Converting any unicode values to utf8::
>>> unicode_urlencode({'a': u'b'})
'a=b'
>>> r3 = unicode_urlencode({'a': u'\u817e\u8baf\u9996\u9875'})
>>> r3
'a=%E8%85%BE%E8%AE%AF%E9%A6%96%E9%A1%B5'
Before running them through :py:func:`~urllib.urlencode`::
>>> from urllib import urlencode
>>> r4 = urlencode({'a': encode_to_utf8(u'\u817e\u8baf\u9996\u9875')})
>>> r4 == r3
True
All values must be instances of ``basestring``::
>>> unicode_urlencode({'a': object()}) #doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
Traceback (most recent call last):
...
ValueError: <object object ... must be a `basestring`
Items in a list must unpack to at least two values::
>>> unicode_urlencode(['a', 'b']) #doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
Traceback (most recent call last):
...
ValueError: need more than 1 value to unpack
And not more than two values to unpack::
>>> unicode_urlencode([('a', 'b', 'c')]) #doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
Traceback (most recent call last):
...
ValueError: too many values to unpack
"""
if isinstance(items, dict):
items = items.items()
items = [(k, encode_to_utf8(v)) for k, v in items]
return urllib.urlencode(items)
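# Illustrative sketch (not part of the original module): building a redirect
# location from a base URL and a dict of (possibly unicode) query parameters.
# The ``_build_url`` helper is hypothetical.
def _build_url(base, params=None):
    """ Append ``params``, if any, to ``base`` as a query string. """
    if not params:
        return base
    return '%s?%s' % (base, unicode_urlencode(params))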
def xhtml_escape(value):
"""Escapes a string so it is valid within XML or XHTML::
>>> xhtml_escape('a')
'a'
>>> xhtml_escape('<')
'&lt;'
>>> xhtml_escape('&')
'&amp;'
Including double quotes::
>>> xhtml_escape('"')
'&quot;'
Encoding the result to utf-8::
>>> xhtml_escape(u'a')
'a'
"""
escaped = xml.sax.saxutils.escape(value, {'"': "&quot;"})
return encode_to_utf8(escaped)
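# Illustrative sketch (not part of the original module): escaping untrusted
# values before interpolating them into markup.  The ``_link`` helper and
# its markup are hypothetical.
def _link(href, label):
    """ Return an ``<a>`` element with attribute and text content escaped. """
    return '<a href="%s">%s</a>' % (xhtml_escape(href), xhtml_escape(label))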
def url_escape(value):
""" Returns a URL-encoded version of ``value``.
Runs the value through :py:func:`~urllib.quote_plus`::
>>> url_escape('a')
'a'
>>> url_escape(' ')
'+'
Encoding it first to utf-8::
>>> url_escape(u'a')
'a'
>>> url_escape(u'http://foo.com?bar=baz')
'http%3A%2F%2Ffoo.com%3Fbar%3Dbaz'
Which means the value must be a ``basestring``::
>>> url_escape(None) #doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: None must be a `basestring`
"""
return urllib.quote_plus(encode_to_utf8(value))
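# Illustrative sketch (not part of the original module): ``url_escape`` is
# useful when a single value is embedded in a URL by hand, e.g. a ``next``
# parameter on a login redirect.  The ``_login_redirect_path`` helper is
# hypothetical.
def _login_redirect_path(next_url):
    """ Return a login path that round-trips ``next_url``. """
    return '/login?next=%s' % url_escape(next_url)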
def json_encode(value, ensure_ascii=False, **kwargs):
""" JSON encodes the given ``value``::
>>> json_encode({'a': 'b'}) == json.dumps({'a': 'b'})
True
>>> json_encode({'a': 'b'})
'{"a": "b"}'
>>> json_encode({'a': None})
'{"a": null}'
>>> json_encode([])
'[]'
With ``ensure_ascii`` ``False`` by default::
>>> json_encode({'a': u'\u817e\u8baf\u9996\u9875'})
u'{"a": "\u817e\u8baf\u9996\u9875"}'
>>> result = json_encode({'a': u'\u817e\u8baf'}, ensure_ascii=True)
>>> result == '{"a": "\\u817e\\u8baf"}'
True
Raises a ``TypeError`` if the ``value`` isn't serializable::
>>> json_encode([object()]) #doctest: +ELLIPSIS
Traceback (most recent call last):
...
TypeError: <object object ... is not JSON serializable
"""
return json.dumps(value, ensure_ascii=ensure_ascii, **kwargs)
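# Illustrative sketch (not part of the original module): serialising a
# response body.  With ``ensure_ascii=False`` the result may be ``unicode``,
# so it still needs encoding before hitting the wire.  The
# ``_json_response_body`` helper is hypothetical.
def _json_response_body(data):
    """ Return a utf-8 encoded JSON body for ``data``. """
    return encode_to_utf8(json_encode(data))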
def json_decode(value, **kwargs):
""" If ``value`` is valid JSON, parses it into a Python object::
>>> json_decode('{}') == json.loads('{}')
True
>>> json_decode('{}')
{}
>>> json_decode('[null]')
[None]
Passing the value through :py:func:`decode_to_unicode` to start with::
>>> json_decode('{"a": "b"}')
{u'a': u'b'}
>>> json_decode('{"a": "\\u817e\\u8baf\\u9996\\u9875"}')
{u'a': u'\u817e\u8baf\u9996\u9875'}
Raises a ``ValueError`` if ``value`` can't be parsed as JSON::
>>> json_decode('{"a": object()}') #doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: No JSON object could be decoded
"""
return json.loads(decode_to_unicode(value), **kwargs)
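# Illustrative sketch (not part of the original module): parsing a JSON
# request body while tolerating malformed input.  The ``_parse_json_body``
# helper and its ``default`` argument are hypothetical.
def _parse_json_body(body, default=None):
    """ Return ``body`` parsed as JSON, or ``default`` if it isn't valid. """
    try:
        return json_decode(body)
    except ValueError:
        return default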
def generate_hash(s=None, algorithm='sha512', block_size=512):
""" Generates a :py:func:`~hashlib.hash.hexdigest` string, either randomly
or from a string or file like object (like an open file or a buffer).
By default, the hash is randomly generated and uses the ``sha512``
algorithm::
>>> s1 = generate_hash()
>>> isinstance(s1, str)
True
>>> len(s1) == 128
True
>>> s2 = generate_hash()
>>> s1 == s2
False
>>> s3 = generate_hash(algorithm='sha512')
>>> len(s1) == len(s3)
True
The hash can be generated from a seed::
>>> generate_hash(s='a')
'1f40fc92da241694750979ee6cf582f2d5d7d28e18335de05abc54d0560e0f5302860c652bf08d560252aa5e74210546f369fbbbce8c12cfc7957b2652fe9a75'
Using ``None`` as the seed (which is the default) will, as we've seen,
generate a random value::
>>> s6 = generate_hash(s=None)
>>> s7 = generate_hash(s=None)
>>> s6 == s7
False
Using a file like object (anything with a ``read()`` method) will use
the contents of the file like object::
>>> from StringIO import StringIO
>>> sock = StringIO()
>>> sock.write('abc')
>>> sock.seek(0)
>>> s8 = generate_hash(s=sock)
>>> s9 = generate_hash(s='abc')
>>> s8 == s9
True
Reading the contents into memory in blocks of ``block_size``, which
defaults to ``512``::
>>> from mock import Mock
>>> sock = Mock()
>>> sock.read.return_value = None
>>> s10 = generate_hash(s=sock)
>>> sock.read.assert_called_with(512)
>>> s10 = generate_hash(s=sock, block_size=1024)
>>> sock.read.assert_called_with(1024)
Using other types as a seed (anything that :py:mod:`hashlib` doesn't
like) will raise a ``TypeError``::
>>> generate_hash(s=[]) #doctest: +ELLIPSIS
Traceback (most recent call last):
...
TypeError: ...
The algorithm name can also be passed in::
>>> s4 = generate_hash(algorithm='md5')
>>> s5 = generate_hash(algorithm='sha224')
>>> len(s4) == 32 and len(s5) == 56
True
As long as it's available in :py:mod:`hashlib`::
>>> generate_hash(algorithm='foo')
Traceback (most recent call last):
...
AttributeError: 'module' object has no attribute 'foo'
"""
# get the hasher
hasher = getattr(hashlib, algorithm)()
# read in the data
if hasattr(s, 'read') and callable(s.read):
while True:
data = s.read(block_size)
if not data:
break
hasher.update(data)
else:
if s is None:
s = '%s%s' % (random.random(), time.time())
hasher.update(s)
# return a hexdigest of the hash
return hasher.hexdigest()
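# Illustrative sketch (not part of the original module): two common uses of
# ``generate_hash`` -- minting a random token and fingerprinting an uploaded
# file without reading it all into memory at once.  Both helpers below are
# hypothetical.
def _new_session_token():
    """ Return a random 64 character hex token. """
    return generate_hash(algorithm='sha256')

def _fingerprint_upload(fileobj):
    """ Return a sha512 hexdigest of ``fileobj``, read in 8192 byte blocks. """
    return generate_hash(s=fileobj, block_size=8192)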