# coding: utf8
"""
webencodings
~~~~~~~~~~~~
This is a Python implementation of the `WHATWG Encoding standard
<http://encoding.spec.whatwg.org/>`_. See README for details.
:copyright: Copyright 2012 by Simon Sapin
:license: BSD, see LICENSE for details.
"""
from __future__ import unicode_literals
import string
import codecs
from .labels import LABELS
VERSION = '0.2'

# The ASCII whitespace code points stripped from labels:
# U+0009, U+000A, U+000C, U+000D, and U+0020.
ASCII_WHITESPACE = '\t\n\f\r '

# Code point -> code point table that lowercases A-Z only, for use with the
# Unicode flavour of .translate().
ASCII_LOWERCASE_MAP = dict(
    (ord(upper), ord(lower))
    for upper, lower in zip(string.ascii_uppercase, string.ascii_lowercase))

# Decoders used when the input starts with a byte order mark.
UTF8_SIG_DECODER = codecs.getdecoder('utf_8_sig')
UTF16_DECODER = codecs.getdecoder('utf_16')
INCREMENTAL_UTF8_SIG_DECODER = codecs.getincrementaldecoder('utf_8_sig')
INCREMENTAL_UTF16_DECODER = codecs.getincrementaldecoder('utf_16')

# Some encoding names from the Encoding standard are not valid Python codec
# aliases. Remap these to names Python knows.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874',
}

# Encoding objects already built by lookup(), keyed by encoding name.
CACHE = {}
def lookup(label):
    """
    Look for an encoding by its label.

    This is the spec's `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
    Supported labels are listed there.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.

    """
    # ASCII_WHITESPACE is Unicode, so .strip() returns Unicode and the
    # Unicode version of .translate() (a code point mapping) is the one used.
    normalized = label.strip(ASCII_WHITESPACE)
    normalized = normalized.translate(ASCII_LOWERCASE_MAP)

    name = LABELS.get(normalized)
    if name is None:
        return None

    cached = CACHE.get(name)
    if cached is not None:
        return cached

    if name == 'x-user-defined':
        # This encoding is provided by a codec shipped with the package.
        from .x_user_defined import codec_info
    else:
        # Any name that gets to here should be valid for codecs.lookup()
        # once remapped through PYTHON_NAMES.
        codec_info = codecs.lookup(PYTHON_NAMES.get(name, name))

    encoding = Encoding(name, codec_info)
    CACHE[name] = encoding
    return encoding
def _get_codec_info(encoding):
    """
    Accept either an encoding object or label.

    :param encoding: An :class:`Encoding` object or a label string.
    :returns: A :class:`codecs.CodecInfo` object.
    :raises: :exc:`~exceptions.LookupError` for an unknown label.

    """
    # Duck-type on ``codec_info``: it is the attribute Encoding objects
    # define and the one returned here. Anything without it is treated as
    # a label and resolved through lookup().
    if hasattr(encoding, 'codec_info'):
        return encoding.codec_info

    result = lookup(encoding)
    if result is None:
        raise LookupError('Unknown encoding label: %r' % encoding)
    return result.codec_info
class Encoding(object):
    """
    A character encoding: its name together with the codec implementing it.

    Within this module, instances are built by :func:`lookup`.

    :param name: The name of the encoding.
    :param codec_info: The codec object used to encode and decode.

    """

    def __init__(self, name, codec_info):
        # Name of the encoding, as shown by repr().
        self.name = name
        # Codec whose encode/decode entry points the module functions call.
        self.codec_info = codec_info

    def __repr__(self):
        return '<Encoding %s>' % self.name
#: The UTF-8 encoding. Should be used for new content and formats.
#: Resolved once at import time through :func:`lookup`; it is also the
#: default ``encoding`` argument of the encoder functions in this module.
UTF8 = lookup('utf-8')
def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single string.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        Ignored if :obj:`input` has a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A Unicode string

    """
    # Resolve the fallback before sniffing, so that an unknown label raises
    # even when a BOM ends up deciding the encoding.
    codec_info = _get_codec_info(fallback_encoding)

    # The UTF-8 and UTF-16 BOMs cannot both prefix the same input, so the
    # order of these two tests does not matter.
    if input.startswith(b'\xEF\xBB\xBF'):
        # UTF-8 BOM. Python's utf_8_sig skips it.
        chosen = UTF8_SIG_DECODER
    elif input.startswith((b'\xFF\xFE', b'\xFE\xFF')):
        # UTF-16 BOM. Python's utf_16 skips it and uses it to pick BE or LE.
        chosen = UTF16_DECODER
    else:
        chosen = codec_info.decode

    decoded = chosen(input, errors)
    return decoded[0]
def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single string.

    :param input: A Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    codec_info = _get_codec_info(encoding)
    # The codec's encode function returns a tuple; the bytes come first.
    encoded = codec_info.encode(input, errors)
    return encoded[0]
def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input: An iterable of byte strings.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        Ignored if :obj:`input` has a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of Unicode strings.

    """
    # The decoder is built here, not inside the generator, so that an
    # invalid `fallback_encoding` label fails at call time rather than on
    # the first iteration.
    push_decoder = make_incremental_decoder(fallback_encoding, errors)
    flush_input = b''
    return _iter_function(input, push_decoder, flush_input)
def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    "Pull"-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # The encoder is built here, not inside the generator, so that an
    # invalid `encoding` label fails at call time rather than on the first
    # iteration.
    push_encoder = make_incremental_encoder(encoding, errors)
    flush_input = ''
    return _iter_function(input, push_encoder, flush_input)
def _iter_function(input, function, empty):
    """
    Drive a "push"-style callable over an iterable, yielding its output.

    :param input: An iterable of chunks to feed to :obj:`function`.
    :param function:
        A callable taking ``(chunk)`` for intermediate calls and
        ``(chunk, True)`` for the final call.
    :param empty: The empty chunk passed along with the final call.
    :returns: A generator of the non-empty results of :obj:`function`.

    """
    for chunk in input:
        produced = function(chunk)
        # Skip empty results so consumers only see real data.
        if produced:
            yield produced
    # Input is exhausted: signal the end so that any pending state is flushed.
    trailing = function(empty, True)
    if trailing:
        yield trailing
def make_incremental_decoder(fallback_encoding, errors='replace'):
    """
    "Push"-based decoder.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        Ignored if the data fed to the returned decoder starts with a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An incremental decoder callable like this:

        .. currentmodule:: None
        .. function:: incremental_decoder(input, final=False)

            :param input: A byte string.
            :param final:
                Indicate that no more input is available.
                Must be :obj:`True` if this is the last call.
            :returns: A Unicode string.

    """
    fallback_factory = _get_codec_info(fallback_encoding).incrementaldecoder
    # A mutable dict stands in for ``nonlocal``, which Python 2.x lacks.
    state = dict(buffer=b'', decoder=None)

    def incremental_decoder(input, final=False):
        active = state['decoder']
        if active is not None:
            # The encoding was settled by an earlier call.
            return active(input, final)

        # Still sniffing for a BOM: look at everything received so far.
        pending = state['buffer'] + input
        if pending.startswith((b'\xFF\xFE', b'\xFE\xFF')):
            # UTF-16 BOM.
            # Python's utf_16 skips it and uses it to pick BE or LE.
            factory = INCREMENTAL_UTF16_DECODER
        elif pending.startswith(b'\xEF\xBB\xBF'):
            # UTF-8 BOM. Python's utf_8_sig skips it.
            factory = INCREMENTAL_UTF8_SIG_DECODER
        elif final or len(pending) >= 3:
            # No BOM.
            factory = fallback_factory
        else:
            # Not enough data yet to rule a BOM in or out.
            state['buffer'] = pending
            return ''

        active = factory(errors).decode
        state['decoder'] = active
        # Feed the whole buffered prefix, not just this call's input.
        return active(pending, final)

    return incremental_decoder
def make_incremental_encoder(encoding=UTF8, errors='strict'):
    """
    "Push"-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An incremental encoder callable like this:

        .. currentmodule:: None
        .. function:: incremental_encoder(input, final=False)

            :param input: A Unicode string.
            :param final:
                Indicate that no more input is available.
                Must be :obj:`True` if this is the last call.
            :returns: A byte string.

    """
    codec_info = _get_codec_info(encoding)
    incremental = codec_info.incrementalencoder(errors)
    # Hand back the bound method: it is the (input, final=False) callable.
    return incremental.encode