# Source code for weblayer.utils
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" :py:mod:`weblayer.utils` provides a set of utility functions for
  converting and encoding.
"""
# Public API of this module: the utility functions defined below.
__all__ = [
    'encode_to_utf8',
    'decode_to_unicode',
    'unicode_urlencode',
    'xhtml_escape',
    'url_escape',
    'json_encode',
    'json_decode',
    'generate_hash',
]
import hashlib
import random
import time
import urllib
import xml.sax.saxutils

try: #pragma NO COVERAGE
    import simplejson as json
except ImportError: #pragma NO COVERAGE
    import json
def encode_to_utf8(value):
    r""" Converts a ``unicode`` to a utf-8 encoded ``str``::

          >>> encode_to_utf8(u'foo') == b'foo'
          True
          >>> b = u'\u817e\u8baf\u9996\u9875'
          >>> c = b'\xe8\x85\xbe\xe8\xae\xaf\xe9\xa6\x96\xe9\xa1\xb5'
          >>> assert encode_to_utf8(b) == c

      Regular (byte) strings get left alone::

          >>> encode_to_utf8(b'foo') == b'foo'
          True

      Other types raise a ``ValueError``::

          >>> encode_to_utf8(None)
          Traceback (most recent call last):
          ...
          ValueError: None must be a `basestring`

      Including tuples, which are reported as a whole rather than being
      unpacked into the error message::

          >>> encode_to_utf8(('a', 'b'))
          Traceback (most recent call last):
          ...
          ValueError: ('a', 'b') must be a `basestring`

      :param value: a text string or a byte string.
      :returns: ``value`` encoded to utf-8 if it is text, otherwise
        ``value`` itself.
      :raises ValueError: if ``value`` is neither text nor a byte string.
    """

    # ``unicode`` on Python 2, ``str`` on Python 3.
    text_type = type(u'')

    if isinstance(value, text_type):
        return value.encode("utf-8")
    if isinstance(value, bytes):
        return value

    # Format via a 1-tuple: ``'%s' % value`` would try to unpack a tuple
    # ``value`` and raise ``TypeError`` instead of the documented error.
    raise ValueError('%s must be a `basestring`' % (value,))
def decode_to_unicode(value):
    r""" Converts a (hopefully) utf-8 encoded ``str`` to a ``unicode``::

          >>> decode_to_unicode(b'foo') == u'foo'
          True
          >>> b = b'\xe8\x85\xbe\xe8\xae\xaf\xe9\xa6\x96\xe9\xa1\xb5'
          >>> decode_to_unicode(b) == u'\u817e\u8baf\u9996\u9875'
          True

      Unicode values get left alone::

          >>> decode_to_unicode(u'foo') == u'foo'
          True

      Other types raise a ``ValueError``::

          >>> decode_to_unicode(None)
          Traceback (most recent call last):
          ...
          ValueError: None must be a `basestring`

      :param value: a byte string or a text string.
      :returns: ``value`` decoded from utf-8 if it is a byte string,
        otherwise ``value`` itself.
      :raises ValueError: if ``value`` is neither text nor a byte string.
      :raises UnicodeDecodeError: if a byte string is not valid utf-8.
    """

    # ``unicode`` on Python 2, ``str`` on Python 3.
    text_type = type(u'')

    # ``bytes`` is an alias of ``str`` on Python 2.
    if isinstance(value, bytes):
        return value.decode("utf-8")
    if isinstance(value, text_type):
        return value

    # Format via a 1-tuple: ``'%s' % value`` would try to unpack a tuple
    # ``value`` and raise ``TypeError`` instead of the documented error.
    raise ValueError('%s must be a `basestring`' % (value,))
def unicode_urlencode(items):
    r""" Ensures all ``items`` are encoded to utf-8 and passed to
      :py:func:`~urllib.urlencode`.

      Pass it a dict, comes out like a query string::

          >>> r1 = unicode_urlencode({'a': 'b'})
          >>> r1
          'a=b'

      Ditto a list of two item tuples::

          >>> r2 = unicode_urlencode([('a', 'b')])
          >>> r2 == r1
          True

      Converting any unicode values to utf8::

          >>> unicode_urlencode({'a': u'b'})
          'a=b'
          >>> unicode_urlencode({'a': u'\u817e\u8baf\u9996\u9875'})
          'a=%E8%85%BE%E8%AE%AF%E9%A6%96%E9%A1%B5'

      All values must be strings::

          >>> unicode_urlencode({'a': object()}) #doctest: +ELLIPSIS
          Traceback (most recent call last):
          ...
          ValueError: <object object ... must be a `basestring`

      Each list item must unpack into exactly two values::

          >>> unicode_urlencode(['a', 'b']) #doctest: +ELLIPSIS
          Traceback (most recent call last):
          ...
          ValueError: ...
          >>> unicode_urlencode([('a', 'b', 'c')]) #doctest: +ELLIPSIS
          Traceback (most recent call last):
          ...
          ValueError: ...

      :param items: a ``dict`` or an iterable of ``(key, value)`` pairs.
      :returns: the url encoded query string.
      :raises ValueError: if a value is not a string, or an item is not a
        two item sequence.
    """

    # The function lives in ``urllib`` on Python 2 and in ``urllib.parse``
    # on Python 3.
    try:
        urlencode = urllib.urlencode
    except AttributeError:
        from urllib.parse import urlencode

    if isinstance(items, dict):
        items = items.items()

    # Only the values are encoded; keys are passed through untouched.
    items = [(k, encode_to_utf8(v)) for k, v in items]
    return urlencode(items)
def xhtml_escape(value):
    """ Escapes a string so it is valid within XML or XHTML::

          >>> xhtml_escape('a') == b'a'
          True
          >>> xhtml_escape('<') == b'&lt;'
          True
          >>> xhtml_escape('&') == b'&amp;'
          True

      Including double quotes::

          >>> xhtml_escape('"') == b'&quot;'
          True

      Encoding the result to utf-8::

          >>> xhtml_escape(u'a') == b'a'
          True

      :param value: the string to escape.
      :returns: the escaped string, passed through
        :py:func:`encode_to_utf8`.
    """

    # ``escape`` handles ``&``, ``<`` and ``>`` itself; the extra entity
    # map adds the double quote.
    escaped = xml.sax.saxutils.escape(value, {'"': "&quot;"})
    return encode_to_utf8(escaped)
def url_escape(value):
    r""" Returns a URL-encoded version of ``value``.

      Runs the value through :py:func:`~urllib.quote_plus`::

          >>> url_escape('a')
          'a'
          >>> url_escape(' ')
          '+'

      Encoding it first to utf-8::

          >>> url_escape(u'a')
          'a'
          >>> url_escape(u'\u817e')
          '%E8%85%BE'

      Which means the value must be a ``basestring``::

          >>> url_escape(None)
          Traceback (most recent call last):
          ...
          ValueError: None must be a `basestring`

      :param value: the text or byte string to escape.
      :returns: the URL-encoded string.
      :raises ValueError: if ``value`` is not a string.
    """

    # The function lives in ``urllib`` on Python 2 and in ``urllib.parse``
    # on Python 3.
    try:
        quote_plus = urllib.quote_plus
    except AttributeError:
        from urllib.parse import quote_plus

    return quote_plus(encode_to_utf8(value))
def json_encode(value, ensure_ascii=False, **kwargs):
    r""" JSON encodes the given ``value``::

          >>> json_encode({'a': 'b'}) == json.dumps({'a': 'b'})
          True
          >>> json_encode({'a': 'b'})
          '{"a": "b"}'
          >>> json_encode({'a': None})
          '{"a": null}'
          >>> json_encode([])
          '[]'

      With ``ensure_ascii`` ``False`` by default::

          >>> json_encode({'a': u'\u817e\u8baf'}) == u'{"a": "\u817e\u8baf"}'
          True
          >>> result = json_encode({'a': u'\u817e\u8baf'}, ensure_ascii=True)
          >>> result == '{"a": "\\u817e\\u8baf"}'
          True

      Raises a ``TypeError`` if the ``value`` isn't serializable::

          >>> json_encode([object()]) #doctest: +ELLIPSIS
          Traceback (most recent call last):
          ...
          TypeError: ...

      :param value: the object to serialize.
      :param ensure_ascii: passed to ``json.dumps``; when ``False`` (the
        default here) non-ascii characters are not escaped.
      :param kwargs: any further keyword arguments for ``json.dumps``.
      :returns: the JSON encoded string.
      :raises TypeError: if ``value`` isn't serializable.
    """

    return json.dumps(value, ensure_ascii=ensure_ascii, **kwargs)
def json_decode(value, **kwargs):
    r""" If ``value`` is valid JSON, parses it into a Python object::

          >>> json_decode('{}') == json.loads('{}')
          True
          >>> json_decode('{}')
          {}
          >>> json_decode('[null]')
          [None]

      Passing the value through :py:func:`decode_to_unicode` to start with::

          >>> json_decode('{"a": "b"}') == {u'a': u'b'}
          True
          >>> decoded = json_decode('{"a": "\\u817e\\u8baf\\u9996\\u9875"}')
          >>> decoded == {u'a': u'\u817e\u8baf\u9996\u9875'}
          True

      Raises a ``ValueError`` if the decoded ``value`` can't be parsed::

          >>> try:
          ...     json_decode('{"a": object()}')
          ... except ValueError:
          ...     print('invalid')
          invalid

      :param value: the JSON document, as a text or byte string.
      :param kwargs: any further keyword arguments for ``json.loads``.
      :returns: the parsed Python object.
      :raises ValueError: if ``value`` is not a string or is not valid JSON.
    """

    return json.loads(decode_to_unicode(value), **kwargs)
def generate_hash(s=None, algorithm='sha512', block_size=512):
    """ Generates a :py:func:`~hashlib.hash.hexdigest` string, either randomly
      or from a string or file like object (like an open file or a buffer).

      By default, the hash is randomly generated and uses the ``sha512``
      algorithm::

          >>> s1 = generate_hash()
          >>> isinstance(s1, str)
          True
          >>> len(s1) == 128
          True
          >>> s2 = generate_hash()
          >>> s1 == s2
          False
          >>> s3 = generate_hash(algorithm='sha512')
          >>> len(s1) == len(s3)
          True

      The hash can be generated from a seed::

          >>> generate_hash(s=b'a') == hashlib.sha512(b'a').hexdigest()
          True

      Using ``None`` as the seed (which is the default) will, as we've seen,
      generate a random value::

          >>> s6 = generate_hash(s=None)
          >>> s7 = generate_hash(s=None)
          >>> s6 == s7
          False

      Using a file like object (anything with a ``read()`` method) will use
      the contents of the file like object::

          >>> from io import BytesIO
          >>> sock = BytesIO(b'abc')
          >>> s8 = generate_hash(s=sock)
          >>> s9 = generate_hash(s=b'abc')
          >>> s8 == s9
          True

      Reading the contents into memory in blocks of ``block_size``, which
      defaults to ``512``::

          >>> s10 = generate_hash(s=BytesIO(b'abc'), block_size=1)
          >>> s10 == s9
          True

      Using other types as a seed (anything that :py:mod:`hashlib` doesn't
      like) will raise a ``TypeError``::

          >>> generate_hash(s=[]) #doctest: +ELLIPSIS
          Traceback (most recent call last):
          ...
          TypeError: ...

      The algorithm name can also be passed in::

          >>> s4 = generate_hash(algorithm='md5')
          >>> s5 = generate_hash(algorithm='sha224')
          >>> len(s4) == 32 and len(s5) == 56
          True

      As long as it's available in :py:mod:`hashlib`::

          >>> generate_hash(algorithm='foo') #doctest: +ELLIPSIS
          Traceback (most recent call last):
          ...
          AttributeError: ...

      :param s: the seed: ``None`` for a random value, a string, or an
        object with a callable ``read`` attribute.
      :param algorithm: name of a constructor in :py:mod:`hashlib`.
      :param block_size: size passed to each ``s.read()`` call.
      :returns: the hex digest as a string.
      :raises AttributeError: if :py:mod:`hashlib` has no ``algorithm``.
      :raises TypeError: if the seed is of a type the hasher rejects.
    """

    # ``unicode`` on Python 2, ``str`` on Python 3.
    text_type = type(u'')

    # get the hasher
    hasher = getattr(hashlib, algorithm)()

    # read in the data
    if hasattr(s, 'read') and callable(s.read):
        # feed the hasher block by block so that the whole content never
        # has to be held in memory at once
        while True:
            data = s.read(block_size)
            if not data:
                break
            if isinstance(data, text_type):
                # hashers consume bytes, so text is encoded first
                data = data.encode('utf-8')
            hasher.update(data)
    else:
        if s is None:
            # NOTE(review): ``random`` and the clock are not a
            # cryptographically strong source -- confirm these hashes are
            # not relied on as unguessable secrets.
            s = '%s%s' % (random.random(), time.time())
        if isinstance(s, text_type):
            # hashers consume bytes, so text is encoded first
            s = s.encode('utf-8')
        hasher.update(s)

    # return a hexdigest of the hash
    return hasher.hexdigest()