Source code for weblayer.utils
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" :py:mod:`weblayer.utils` provides a set of utility functions for
converting and encoding.
"""
__all__ = [
'encode_to_utf8',
'decode_to_unicode',
'xhtml_escape',
'url_escape',
'unicode_urlencode',
'json_encode',
'json_decode',
'generate_hash'
]
import hashlib
import random
import time
import urllib
import xml.sax.saxutils
try: #pragma NO COVERAGE
import simplejson as json
except ImportError: #pragma NO COVERAGE
import json
def encode_to_utf8(value):
""" Converts a ``unicode`` to a utf-8 encoded ``str``::
>>> a = u'foo'
>>> a
u'foo'
>>> encode_to_utf8(a)
'foo'
>>> b = u'\u817e\u8baf\u9996\u9875'
>>> c = '\xe8\x85\xbe\xe8\xae\xaf\xe9\xa6\x96\xe9\xa1\xb5'
>>> assert encode_to_utf8(b) == c
Regular strings get left alone::
>>> d = 'foo'
>>> encode_to_utf8(d)
'foo'
Other types raise a ``ValueError``::
>>> e = None
>>> encode_to_utf8(e) #doctest: +NORMALIZE_WHITESPACE
Traceback (most recent call last):
...
ValueError: None must be a `basestring`
"""
if not isinstance(value, basestring):
raise ValueError('%s must be a `basestring`' % value)
elif isinstance(value, unicode):
return value.encode("utf-8")
return value
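# Illustrative sketch (not part of the original module): ``encode_to_utf8``
# is typically used right before writing to a byte-oriented sink.  The
# ``_write_utf8_line`` helper below is hypothetical.
def _write_utf8_line(fileobj, value):
    """ Write ``value`` to ``fileobj`` as a single utf-8 encoded line. """
    fileobj.write(encode_to_utf8(value))
    fileobj.write('\n')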
def decode_to_unicode(value):
""" Converts a (hopefully) utf-8 encoded ``str`` to a ``unicode``::
>>> a = 'foo'
>>> decode_to_unicode(a)
u'foo'
>>> b = '\xe8\x85\xbe\xe8\xae\xaf\xe9\xa6\x96\xe9\xa1\xb5'
>>> decode_to_unicode(b)
u'\u817e\u8baf\u9996\u9875'
Unicode values get left alone::
>>> c = u'foo'
>>> decode_to_unicode(c)
u'foo'
Other types raise a ``ValueError``::
>>> d = None
>>> decode_to_unicode(d) #doctest: +NORMALIZE_WHITESPACE
Traceback (most recent call last):
...
ValueError: None must be a `basestring`
"""
if not isinstance(value, basestring):
raise ValueError('%s must be a `basestring`' % value)
elif isinstance(value, str):
return value.decode("utf-8")
return value
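# Illustrative sketch (not part of the original module): decoding incoming
# byte strings at the boundary so the rest of the application only handles
# ``unicode``.  The ``_decode_params`` helper is hypothetical and assumes
# every value is a ``basestring``.
def _decode_params(params):
    """ Return a copy of ``params`` with every value decoded to ``unicode``. """
    return dict((k, decode_to_unicode(v)) for k, v in params.items())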
def unicode_urlencode(items):
""" Ensures all ``items`` are encoded to utf-8 and passed to
:py:func:`~urllib.urlencode`.
Pass it a dict, comes out like a query string::
>>> r1 = unicode_urlencode({'a': 'b'})
>>> r1
'a=b'
Ditto a list of two item tuples::
>>> r2 = unicode_urlencode([('a', 'b')])
>>> r2 == r1
True
Converting any unicode values to utf8::
>>> unicode_urlencode({'a': u'b'})
'a=b'
>>> r3 = unicode_urlencode({'a': u'\u817e\u8baf\u9996\u9875'})
>>> r3
'a=%E8%85%BE%E8%AE%AF%E9%A6%96%E9%A1%B5'
Before running them through :py:func:`~urllib.urlencode`::
>>> from urllib import urlencode
>>> r4 = urlencode({'a': encode_to_utf8(u'\u817e\u8baf\u9996\u9875')})
>>> r4 == r3
True
All values must be instances of ``basestring``::
>>> unicode_urlencode({'a': object()}) #doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
Traceback (most recent call last):
...
ValueError: <object object ... must be a `basestring`
Items in a list must unpack to at least two values::
>>> unicode_urlencode(['a', 'b']) #doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
Traceback (most recent call last):
...
ValueError: need more than 1 value to unpack
And not more than two values to unpack::
>>> unicode_urlencode([('a', 'b', 'c')]) #doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
Traceback (most recent call last):
...
ValueError: too many values to unpack
"""
if isinstance(items, dict):
items = items.items()
items = [(k, encode_to_utf8(v)) for k, v in items]
return urllib.urlencode(items)
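# Illustrative sketch (not part of the original module): building a redirect
# location from a base URL and a dict of (possibly unicode) query parameters.
# The ``_build_url`` helper is hypothetical.
def _build_url(base, params=None):
    """ Append ``params``, if any, to ``base`` as a query string. """
    if not params:
        return base
    return '%s?%s' % (base, unicode_urlencode(params))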
def xhtml_escape(value):
"""Escapes a string so it is valid within XML or XHTML::
>>> xhtml_escape('a')
'a'
>>> xhtml_escape('<')
'&lt;'
>>> xhtml_escape('&')
'&amp;'
Including double quotes::
>>> xhtml_escape('"')
'&quot;'
Encoding the result to utf-8::
>>> xhtml_escape(u'a')
'a'
"""
escaped = xml.sax.saxutils.escape(value, {'"': "&quot;"})
return encode_to_utf8(escaped)
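# Illustrative sketch (not part of the original module): escaping untrusted
# values before interpolating them into markup.  The ``_link`` helper and
# its markup are hypothetical.
def _link(href, label):
    """ Return an ``<a>`` element with attribute and text content escaped. """
    return '<a href="%s">%s</a>' % (xhtml_escape(href), xhtml_escape(label))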
def url_escape(value):
""" Returns a URL-encoded version of ``value``.
Runs the value through :py:func:`~urllib.quote_plus`::
>>> url_escape('a')
'a'
>>> url_escape(' ')
'+'
Encoding it first to utf-8::
>>> url_escape(u'a')
'a'
>>> url_escape(u'http://foo.com?bar=baz')
'http%3A%2F%2Ffoo.com%3Fbar%3Dbaz'
Which means the value must be a ``basestring``::
>>> url_escape(None) #doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: None must be a `basestring`
"""
return urllib.quote_plus(encode_to_utf8(value))
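# Illustrative sketch (not part of the original module): ``url_escape`` is
# useful when a single value is embedded in a URL by hand, e.g. a ``next``
# parameter on a login redirect.  The ``_login_redirect_path`` helper is
# hypothetical.
def _login_redirect_path(next_url):
    """ Return a login path that round-trips ``next_url``. """
    return '/login?next=%s' % url_escape(next_url)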
def json_encode(value, ensure_ascii=False, **kwargs):
""" JSON encodes the given ``value``::
>>> json_encode({'a': 'b'}) == json.dumps({'a': 'b'})
True
>>> json_encode({'a': 'b'})
'{"a": "b"}'
>>> json_encode({'a': None})
'{"a": null}'
>>> json_encode([])
'[]'
With ``ensure_ascii`` ``False`` by default::
>>> json_encode({'a': u'\u817e\u8baf\u9996\u9875'})
u'{"a": "\u817e\u8baf\u9996\u9875"}'
>>> result = json_encode({'a': u'\u817e\u8baf'}, ensure_ascii=True)
>>> result == '{"a": "\\u817e\\u8baf"}'
True
Raises a ``TypeError`` if the ``value`` isn't serializable::
>>> json_encode([object()]) #doctest: +ELLIPSIS
Traceback (most recent call last):
...
TypeError: <object object ... is not JSON serializable
"""
return json.dumps(value, ensure_ascii=ensure_ascii, **kwargs)
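# Illustrative sketch (not part of the original module): serialising a
# response body.  With ``ensure_ascii=False`` the result may be ``unicode``,
# so it still needs encoding before hitting the wire.  The
# ``_json_response_body`` helper is hypothetical.
def _json_response_body(data):
    """ Return a utf-8 encoded JSON body for ``data``. """
    return encode_to_utf8(json_encode(data))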
def json_decode(value, **kwargs):
""" If ``value`` is valid JSON, parses it into a Python object::
>>> json_decode('{}') == json.loads('{}')
True
>>> json_decode('{}')
{}
>>> json_decode('[null]')
[None]
Passing the value through :py:func:`decode_to_unicode` to start with::
>>> json_decode('{"a": "b"}')
{u'a': u'b'}
>>> json_decode('{"a": "\\u817e\\u8baf\\u9996\\u9875"}')
{u'a': u'\u817e\u8baf\u9996\u9875'}
Raises a ``ValueError`` if ``value`` can't be parsed as JSON::
>>> json_decode('{"a": object()}') #doctest: +ELLIPSIS
Traceback (most recent call last):
...
ValueError: No JSON object could be decoded
"""
return json.loads(decode_to_unicode(value), **kwargs)
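# Illustrative sketch (not part of the original module): parsing a JSON
# request body while tolerating malformed input.  The ``_parse_json_body``
# helper and its ``default`` argument are hypothetical.
def _parse_json_body(body, default=None):
    """ Return ``body`` parsed as JSON, or ``default`` if it isn't valid. """
    try:
        return json_decode(body)
    except ValueError:
        return default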
def generate_hash(s=None, algorithm='sha512', block_size=512):
""" Generates a :py:func:`~hashlib.hash.hexdigest` string, either randomly
or from a string or file like object (like an open file or a buffer).
By default, the hash is randomly generated and uses the ``sha512``
algorithm::
>>> s1 = generate_hash()
>>> isinstance(s1, str)
True
>>> len(s1) == 128
True
>>> s2 = generate_hash()
>>> s1 == s2
False
>>> s3 = generate_hash(algorithm='sha512')
>>> len(s1) == len(s3)
True
The hash can be generated from a seed::
>>> generate_hash(s='a')
'1f40fc92da241694750979ee6cf582f2d5d7d28e18335de05abc54d0560e0f5302860c652bf08d560252aa5e74210546f369fbbbce8c12cfc7957b2652fe9a75'
Using ``None`` as the seed (which is the default) will, as we've seen,
generate a random value::
>>> s6 = generate_hash(s=None)
>>> s7 = generate_hash(s=None)
>>> s6 == s7
False
Using a file like object (anything with a ``read()`` method) will use
the contents of the file like object::
>>> from StringIO import StringIO
>>> sock = StringIO()
>>> sock.write('abc')
>>> sock.seek(0)
>>> s8 = generate_hash(s=sock)
>>> s9 = generate_hash(s='abc')
>>> s8 == s9
True
Reading the contents into memory in blocks of ``block_size``, which
defaults to ``512``::
>>> from mock import Mock
>>> sock = Mock()
>>> sock.read.return_value = None
>>> s10 = generate_hash(s=sock)
>>> sock.read.assert_called_with(512)
>>> s10 = generate_hash(s=sock, block_size=1024)
>>> sock.read.assert_called_with(1024)
Using other types as a seed (anything that :py:mod:`hashlib` doesn't
like) will raise a ``TypeError``::
>>> generate_hash(s=[]) #doctest: +ELLIPSIS
Traceback (most recent call last):
...
TypeError: ...
The algorithm name can also be passed in::
>>> s4 = generate_hash(algorithm='md5')
>>> s5 = generate_hash(algorithm='sha224')
>>> len(s4) == 32 and len(s5) == 56
True
As long as it's available in :py:mod:`hashlib`::
>>> generate_hash(algorithm='foo')
Traceback (most recent call last):
...
AttributeError: 'module' object has no attribute 'foo'
"""
# get the hasher
hasher = getattr(hashlib, algorithm)()
# read in the data
if hasattr(s, 'read') and callable(s.read):
while True:
data = s.read(block_size)
if not data:
break
hasher.update(data)
else:
if s is None:
s = '%s%s' % (random.random(), time.time())
hasher.update(s)
# return a hexdigest of the hash
return hasher.hexdigest()
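# Illustrative sketch (not part of the original module): two common uses of
# ``generate_hash`` -- minting a random token and fingerprinting an uploaded
# file without reading it all into memory at once.  Both helpers below are
# hypothetical.
def _new_session_token():
    """ Return a random 64 character hex token. """
    return generate_hash(algorithm='sha256')

def _fingerprint_upload(fileobj):
    """ Return a sha512 hexdigest of ``fileobj``, read in 8192 byte blocks. """
    return generate_hash(s=fileobj, block_size=8192)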