Source code for htmltag

# -*- coding: utf-8 -*-
#
#       Copyright 2014 Liftoff Software Corporation
#
# For license information see LICENSE.txt
from __future__ import unicode_literals

# Meta
__version__ = '1.6'
__version_info__ = (1, 6)
__license__ = "Apache 2.0"
__author__ = 'Dan McDougall <daniel.mcdougall@liftoffsoftware.com>'

# NOTE: The doctests only pass in Python 3 due to the missing u'' syntax.

__doc__ = """\
The htmltag module
==================
.. note::

    The latest, complete documentation of htmltag can be found here:
    http://liftoff.github.io/htmltag/

    The latest version of this module can be obtained from Github:
    https://github.com/LiftoffSoftware/htmltag

htmltag.py - A Python (2 *and* 3) module for wrapping whatever strings you want
in HTML tags. Example::

    >>> from htmltag import strong
    >>> print(strong("SO STRONG!"))
    <strong>SO STRONG!</strong>

What tags are supported?  All of them!  An important facet of modern web
programming is the ability to use your own custom tags.  For example::

    >>> from htmltag import foobar
    >>> foobar('Custom tag example')
    '<foobar>Custom tag example</foobar>'

To add attributes inside your tag just pass them as keyword arguments::

    >>> from htmltag import a
    >>> print(a('awesome software', href='http://liftoffsoftware.com/'))
    <a href="http://liftoffsoftware.com/">awesome software</a>

To work around the problem of reserved words as keyword arguments (i.e. can't
have 'class="foo"') just prefix the keyword with an underscore like so::

    >>> from htmltag import div
    >>> print(div("example", _class="someclass"))
    <div class="someclass">example</div>

Another option--which is useful for things like 'data-\*' attributes--is to pass
keyword arguments as a dict using the `\*\* operator
<http://docs.python.org/2/tutorial/controlflow.html#unpacking-argument-lists>`_
like so::

    >>> from htmltag import li
    >>> print(li("CEO", **{"class": "user", "data-name": "Dan McDougall"}))
    <li class="user" data-name="Dan McDougall">CEO</li>

If you want to use upper-case tags just import them in caps::

    >>> from htmltag import STRONG
    >>> print(STRONG('whatever'))
    <STRONG>whatever</STRONG>

Combining Tags and Content
--------------------------
You can combine multiple tags to create a larger HTML string like so::

    >>> from htmltag import table, tr, td
    >>> print(table(
    ...     tr(td('100'), td('200'), id="row1"),
    ...     tr(td('150'), td('250'), id="row2"),
    ... ))
    <table><tr id="row1"><td>100</td><td>200</td></tr><tr id="row2"><td>150</td><td>250</td></tr></table>

**NOTE:** If you're going to do something like the above please use a *real*
template language/module instead of `htmltag`.  You're *probably* "doing it
wrong" if you end up with something like the above in your code.  For example,
try `Tornado's template engine
<http://www.tornadoweb.org/en/stable/template.html>`_.

Special Characters
------------------
Special characters that cause trouble like, '<', '>', and '&' will be
automatically converted into HTML entities.  If you don't want that to happen
just wrap your string in :class:`htmltag.HTML` like so::

    >>> from htmltag import HTML, a
    >>> txt = HTML("<strong>I am already HTML. Don't escape me!</strong>")
    >>> a(txt, href="http://liftoffsoftware.com/")
    '<a href="http://liftoffsoftware.com/"><strong>I am already HTML. Don\\'t escape me!</strong></a>'

Since Python doesn't allow modules to have dashes (-) in their names, if you
need to create a tag like that just use an underscore and change its 'tagname'
attribute::

    >>> from htmltag import foo_bar
    >>> print(foo_bar('baz')) # Before
    <foo_bar>baz</foo_bar>
    >>> foo_bar.tagname = 'foo-bar'
    >>> print(foo_bar('baz')) # After
    <foo-bar>baz</foo-bar>

By default self-closing HTML tags like '<img>' will not include an ending slash.
To change this behavior (i.e. for XHTML) just set 'ending_slash' to `True`::

    >>> from htmltag import img
    >>> img.ending_slash = True
    >>> img(src="http://somehost/images/image.png")
    '<img src="http://somehost/images/image.png" />'
    >>> img.ending_slash = False # Reset for later doctests

Protections Against Cross-Site Scripting (XSS)
----------------------------------------------
By default all unsafe (XSS) content in HTML tags will be removed::

    >>> from htmltag import a, img
    >>> a(img(src="javascript:alert('pwned!')"), href="http://hacker/")
    '<a href="http://hacker/">(removed)</a>'

If you want to change this behavior set the tag's 'safe_mode' attribute like
so::

    >>> from htmltag import a, img
    >>> a.safe_mode = False
    >>> img.safe_mode = False
    >>> a(img(src="javascript:alert('pwned!')"), href="http://hacker/")
    '<a href="http://hacker/"><img src="javascript:alert(\\'pwned!\\')"></a>'
    >>> a.safe_mode = True # Reset for later doctests
    >>> img.safe_mode = True # Ditto

You may also change the replacement text if you like::

    >>> from htmltag import a, img
    >>> img.replacement = "No no no!"
    >>> a(img(src="javascript:alert('pwned!')"), href="http://hacker/")
    '<a href="http://hacker/">No no no!</a>'

If you set 'replacement' to 'entities' the rejected HTML will be converted to
character entities like so::

    >>> from htmltag import a, img
    >>> a.replacement = "entities"
    >>> img.replacement = "entities"
    >>> a(img(src="javascript:alert('pwned!')"), href="http://hacker/")
    '<a href="http://hacker/">&lt;img src="javascript:alert(\\'pwned!\\')"&gt;</a>'

It is also possible to create a whitelist of allowed tags.  All other tags
contained therein will automatically be replaced::

    >>> from htmltag import span
    >>> whitelist = ['span', 'b', 'i', 'strong']
    >>> span.whitelist = whitelist
    >>> span(HTML('This is <b>bold</b> new lib is <script>awesome();</script>'))
    '<span>This is <b>bold</b> new lib is (removed)awesome();(removed)</span>'

Lastly, all strings returned by `htmltag` are actually a subclass of `str`:
`~htmltag.HTML`.  It has a useful `escaped` property::

    >>> from htmltag import address
    >>> address.safe_mode = False # Turn off so we have a dangerous example ;)
    >>> html = address('1 Hacker Ln., Nowhere, USA')
    >>> print(html)
    <address>1 Hacker Ln., Nowhere, USA</address>
    >>> print(html.escaped)
    &lt;address&gt;1 Hacker Ln., Nowhere, USA&lt;/address&gt;

This can be extremely useful if you want to be double-sure that no executable
stuff ends up in your program's output.


Functions and Classes
=====================
"""

import logging
import re
import sys
from types import ModuleType

# ``cgi.escape()`` was removed in Python 3.8 and the ``cgi`` module itself in
# Python 3.13.  The name ``cgi`` is kept bound (this module calls
# ``cgi.escape``) but on Python 3 it is backed by ``html.escape`` so that
# importing and using this module keeps working on current interpreters.
try:
    from html import escape as _html_escape
except ImportError:
    # Python 2: there is no 'html' package, but cgi.escape() is available.
    import cgi
else:
    def _cgi_escape(s, quote=None):
        """
        Drop-in replacement for the removed ``cgi.escape()``.

        Converts '&', '<' and '>' in *s* into HTML entities.  If *quote* is
        true quotation marks are converted as well (``html.escape`` also
        converts the single quote, which the legacy function did not).
        """
        return _html_escape(s, quote=bool(quote))

    # str() keeps the module name a native string on every Python version.
    cgi = ModuleType(str('cgi'))
    cgi.escape = _cgi_escape

# The text type that HTML strings are built on: `unicode` on Python 2 and
# `str` on Python 3.
if sys.version_info.major == 2:
    stringtype = unicode  # noqa: F821 (only evaluated on Python 2)
else: # Python 3
    stringtype = str

# Tags that never have content or a closing tag (e.g. '<img>').
self_closing_tags = {
    'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input',
    'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
}
# Remember where this file lives; the module object gets replaced in
# sys.modules at the bottom of the file and the wrapper needs a __file__.
FILE = __file__

# Matches HTML tags (opening, closing and self-closing) together with any
# attributes they carry.  Written as a raw string so the regex escapes are
# not interpreted as (invalid) string escapes.
_RE_HTML_TAG = re.compile(
    r"""<\/?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)\/?>""",
    re.IGNORECASE)
# Matches things like 'onmouseover=' ('on<whatever>=') inside a tag.
_RE_ON_EVENT = re.compile(r'\s+on[a-z]+\s*=')
# Captures just the tag name out of a (lower-cased) tag.
_RE_TAG_NAME = re.compile(r'<\/?(\w+)')
# Substrings that mark a tag as able to execute script:
#   * 'javascript:' / 'vbscript:' -- script URLs
#   * 'fscommand' -- Flash
#   * 'seeksegmenttime' -- super obscure, but cheap to check
_XSS_MARKERS = ('javascript:', 'vbscript:', 'fscommand', 'seeksegmenttime')
# These are all pretty safe and cover most of what users would want in terms
# of formatting and sharing media (images, audio, video, etc).
_DEFAULT_WHITELIST = frozenset([
    'a', 'abbr', 'aside', 'audio', 'bdi', 'bdo', 'blockquote', 'canvas',
    'caption', 'code', 'col', 'colgroup', 'data', 'dd', 'del',
    'details', 'div', 'dl', 'dt', 'em', 'figcaption', 'figure', 'h1',
    'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins', 'kbd', 'li',
    'mark', 'ol', 'p', 'pre', 'q', 'rp', 'rt', 'ruby', 's', 'samp',
    'small', 'source', 'span', 'strong', 'sub', 'summary', 'sup',
    'table', 'td', 'th', 'time', 'tr', 'track', 'u', 'ul', 'var',
    'video', 'wbr'
])


def strip_xss(html, whitelist=None, replacement="(removed)"):
    """
    This function returns a tuple containing:

        * *html* with all non-whitelisted HTML tags replaced with *replacement*.
        * A `set()` containing the tags that were removed.

    Any tags that contain JavaScript, VBScript, or other known XSS/executable
    functions will also be removed.

    If *whitelist* is not given the following will be used::

        whitelist = set([
            'a', 'abbr', 'aside', 'audio', 'bdi', 'bdo', 'blockquote', 'canvas',
            'caption', 'code', 'col', 'colgroup', 'data', 'dd', 'del',
            'details', 'div', 'dl', 'dt', 'em', 'figcaption', 'figure', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins', 'kbd', 'li',
            'mark', 'ol', 'p', 'pre', 'q', 'rp', 'rt', 'ruby', 's', 'samp',
            'small', 'source', 'span', 'strong', 'sub', 'summary', 'sup',
            'table', 'td', 'th', 'time', 'tr', 'track', 'u', 'ul', 'var',
            'video', 'wbr'
        ])

    .. note:: To disable the whitelisting simply set `whitelist="off"`.

    Example::

        >>> html = '<span>Hello, exploit: <img src="javascript:alert("pwned!")"></span>'
        >>> html, rejects = strip_xss(html)
        >>> print("'%s', Rejected: '%s'" % (html, " ".join(rejects)))
        '<span>Hello, exploit: (removed)</span>', Rejected: '<img src="javascript:alert("pwned!")">'

    .. note:: The default *replacement* is "(removed)".

    If *replacement* is "entities" bad HTML tags will be encoded into HTML
    entities.  This allows things like <script>'whatever'</script> to be
    displayed without execution (which would be much less annoying to users that
    were merely trying to share a code example).  Here's an example::

        >>> html = '<span>Hello, exploit: <img src="javascript:alert("pwned!")"></span>'
        >>> html, rejects = strip_xss(html, replacement="entities")
        >>> print(html)
        <span>Hello, exploit: &lt;img src="javascript:alert("pwned!")"&gt;</span>
        >>> print("Rejected: '%s'" % ", ".join(rejects))
        Rejected: '<img src="javascript:alert("pwned!")">'

    **NOTE:** This function is intended to protect against `the XSS examples at
    OWASP <https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet>`_.
    Please `let us know <https://github.com/LiftoffSoftware/htmltag/issues>`_
    if you find something we missed.
    """
    # NOTE(review): detection is regex based; the tag pattern requires
    # whitespace before each attribute, so markup such as
    # '<img/onerror=...>' is not recognized as a tag and is left untouched.
    # Consider a real HTML parser if this must withstand hostile input.
    if not whitelist:
        whitelist = _DEFAULT_WHITELIST
    elif whitelist == "off":
        whitelist = None # Disable it altogether
    bad_tags = set()
    for match in _RE_HTML_TAG.finditer(html):
        tag = match.group()
        tag_lower = tag.lower()
        # Pull out just the name so '<hr/>' yields 'hr' (not 'hr/') and
        # whitelisted self-closing tags are not rejected by mistake.
        short_tag = _RE_TAG_NAME.match(tag_lower).group(1)
        if whitelist and short_tag not in whitelist:
            bad_tags.add(tag)
        # Make sure the tag can't execute any script
        elif any(marker in tag_lower for marker in _XSS_MARKERS):
            bad_tags.add(tag)
        # on<whatever> events are not allowed (just another XSS vuln)
        elif _RE_ON_EVENT.search(tag_lower):
            bad_tags.add(tag)
    if not bad_tags:
        return (html, bad_tags)
    if replacement == "entities":
        # cgi.escape() no longer exists on Python 3.8+; html.escape() with
        # quote=False converts the same three characters ('&', '<', '>').
        try:
            from html import escape as escape_markup
        except ImportError: # Python 2
            from cgi import escape as escape_markup

        def substitute(tag):
            escaped = escape_markup(tag, False)
            return escaped.encode(
                'ascii', 'xmlcharrefreplace').decode('ascii')
    else:
        def substitute(tag):
            return replacement
    # Replace the longest tags first:  A rejected tag may contain the text of
    # a shorter rejected tag (e.g. inside a quoted attribute).  Replacing the
    # shorter one first would alter the longer tag so it no longer matches
    # and the unsafe tag would be left in place.
    for bad_tag in sorted(bad_tags, key=len, reverse=True):
        html = html.replace(bad_tag, substitute(bad_tag))
    return (html, bad_tags)
class HTML(stringtype):
    """
    .. versionadded:: 1.2.0

    A subclass of Python's built-in `str` to add a simple `__html__` method
    that lets us know this string is HTML and does not need to be escaped.  It
    also has an `escaped` property that will return `self` with all special
    characters converted into HTML entities.
    """
    # Name of the outermost tag (if known); used by append() to locate the
    # closing tag.
    tagname = None

    def __html__(self):
        """
        Returns `self` (we're already a string) in unmodified form.
        """
        return self

    @property
    def escaped(self):
        """
        A property that returns `self` with all characters that have special
        meaning (in HTML/XML) replaced with HTML entities.  Example::

            >>> print(HTML('<span>These span tags will be escaped</span>').escaped)
            &lt;span&gt;These span tags will be escaped&lt;/span&gt;
        """
        # cgi.escape() no longer exists on Python 3.8+; html.escape() with
        # quote=False converts the same three characters ('&', '<', '>').
        try:
            from html import escape
        except ImportError: # Python 2
            from cgi import escape
        return escape(self, False).encode(
            'ascii', 'xmlcharrefreplace').decode('ascii')

    def append(self, *strings):
        """
        Adds any number of supplied *strings* to `self` (we're a subclass of
        `str` remember) just before the last closing tag and returns a new
        instance of `~htmltag.HTML` with the result.
        Example::

            >>> from htmltag import span, b
            >>> html = span('Test:')
            >>> print(html)
            <span>Test:</span>
            >>> html = html.append(' ', b('appended'))
            >>> print(html)
            <span>Test: <b>appended</b></span>

        In the case of self-closing tags like '<img>' the string will simply be
        appended after the tag::

            >>> from htmltag import img
            >>> image = img(src="http://company.com/image.png")
            >>> print(image.append("Appended string"))
            <img src="http://company.com/image.png">Appended string

        .. note:: Why not update ourselves in-place?  Because we're a subclass
            of `str`; in Python strings are immutable.
        """
        addition = "".join(strings)
        if self.tagname: # More accurate
            close_tag_start = self.rfind('</' + self.tagname)
        else:
            close_tag_start = self.rfind('</')
        if close_tag_start == -1: # Couldn't find closing tag
            combined = self + addition # Just tack on to the end
        else:
            combined = (
                self[:close_tag_start] + addition + self[close_tag_start:])
        # Always hand back an HTML instance (plain concatenation yields a
        # regular string) so the result stays marked as markup and can be
        # appended to again.
        new = HTML(combined)
        if self.tagname: # Preserve it
            new.tagname = self.tagname
        return new
class TagWrap(object):
    """
    Lets you wrap whatever string you want in whatever HTML tag (*tagname*) you
    want.

    **Optional Keyword Arguments:**

    :keyword safe_mode: If `True` dangerous (XSS) content will be removed
        from all HTML.  Defaults to `True`
    :keyword whitelist: If given only tags that exist in the whitelist will be
        allowed.  All else will be replaced with *replacement*.
    :keyword replacement: A string to replace unsafe HTML with.  If set to
        "entities", will convert unsafe tags to HTML entities so they display
        as-is but won't be evaluated by renderers/browsers.  The default is
        "(removed)".
    :keyword log_rejects: If `True` rejected unsafe (XSS) HTML will be
        logged using :meth:`logging.error`.  Defaults to `False`
    :keyword ending_slash: If `True` self-closing HTML tags like '<img>'
        will have a '/' placed before the '>'.  Usually only necessary with
        XML and XHTML documents (as opposed to regular HTML).  Defaults to
        `False`.
    :type safe_mode: boolean
    :type whitelist: iterable
    :type replacement: string, "entities", or "off"
    :type log_rejects: boolean
    :type ending_slash: boolean

    The `TagWrap` class may be used in a direct fashion (as opposed to the
    metaprogramming magic way: ``from htmltag import sometag``)::

        >>> from htmltag import TagWrap
        >>> img = TagWrap('img', ending_slash=True)
        >>> print(img(src="http://company.com/someimage.png"))
        <img src="http://company.com/someimage.png" />

    The `TagWrap` class also has a :meth:`~TagWrap.copy` method which can be
    useful when you want a new tag to have the same attributes as another::

        >>> from htmltag import TagWrap
        >>> whitelist = ["b", "i", "strong", "a", "em"]
        >>> replacement = "(tag not allowed)"
        >>> b = TagWrap('b', whitelist=whitelist, replacement=replacement)
        >>> i = b.copy('i')
        >>> print(i.whitelist)
        ['b', 'i', 'strong', 'a', 'em']

    Here's how you can create a number of tags with your own custom settings all
    at once::

        >>> import sys
        >>> from htmltag import TagWrap
        >>> whitelist = ["b", "i", "strong", "a", "em"] # Whitelist ourselves
        >>> replacement = "(tag not allowed)"
        >>> for tag in whitelist:
        ...     setattr(sys.modules[__name__], tag,
        ...         TagWrap(tag, whitelist=whitelist, replacement=replacement))
        >>> strong.replacement # doctest: +SKIP
        '(tag not allowed)'    # doctest: +SKIP

    .. note:: ``sys.modules[__name__]`` is the current module; the global 'self'.
    """
    # NOTE: The above doctest is skipped because it only works in reality :)
    def __init__(self, tagname, **kwargs):
        self.tagname = tagname
        self.safe_mode = kwargs.get('safe_mode', True)
        self.whitelist = kwargs.get('whitelist', "off")
        self.replacement = kwargs.get('replacement', '(removed)')
        self.log_rejects = kwargs.get('log_rejects', False)
        # This only applies to self-closing tags:
        self.ending_slash = kwargs.get('ending_slash', False)

    def escape(self, string):
        """
        Returns *string* with all instances of '<', '>', and '&' converted into
        HTML entities.
        """
        html_entities = {"&": "&amp;", '<': '&lt;', '>': '&gt;'}
        return HTML("".join(html_entities.get(c, c) for c in string))

    def wrap(self, tag, *args, **kwargs):
        """
        Returns all *args* (strings) wrapped in HTML tags like so::

            >>> b = TagWrap('b')
            >>> print(b('bold text'))
            <b>bold text</b>

        To add attributes to the tag you can pass them as keyword arguments::

            >>> a = TagWrap('a')
            >>> print(a('awesome software', href='http://liftoffsoftware.com/'))
            <a href="http://liftoffsoftware.com/">awesome software</a>

        Leading underscores are removed from keyword names (so ``_class``
        becomes ``class``) and any double quote inside an attribute value is
        converted to ``&quot;`` so the value cannot end the attribute early.

        .. note:: :meth:`~TagWrap.wrap` will automatically convert '<', '>',
            and '&' into HTML entities unless the wrapped string has an
            `__html__` method
        """
        # Tag names are case-insensitive in HTML so 'IMG' is self-closing too.
        if tag.lower() in self_closing_tags:
            # Self-closing tags don't have content
            if self.ending_slash:
                template = "<{tagstart} />"
            else:
                template = "<{tagstart}>"
        else:
            template = "<{tagstart}>{content}</{tag}>"
        pieces = []
        for string in args:
            if not hasattr(string, '__html__'): # Indicates already escaped
                string = self.escape(string)
            pieces.append(string.__html__())
        content = "".join(pieces)
        tagstart_parts = [tag]
        for key, value in kwargs.items():
            # A literal '"' would terminate the attribute and let the rest of
            # the value be parsed as additional attributes.
            value = '{0}'.format(value).replace('"', '&quot;')
            tagstart_parts.append('{key}="{value}"'.format(
                key=key.lstrip('_'), value=value))
        tagstart = " ".join(tagstart_parts)
        html = template.format(tagstart=tagstart, content=content, tag=tag)
        if self.safe_mode:
            html, rejected = strip_xss(
                html, whitelist=self.whitelist, replacement=self.replacement)
            # Only log when something was actually rejected
            if self.log_rejects and rejected:
                logging.error(
                    "{name} rejected unsafe HTML: '{rejected}'".format(
                    name=self.__class__.__name__, rejected=rejected))
        html = HTML(html)
        html.tagname = tag # So we can easily append()
        return html

    def copy(self, tagname, **kwargs):
        """
        Returns a new instance of `TagWrap` using the given *tagname* that has
        all the same attributes as this instance.  If *kwargs* is given they
        will override the attributes of the created instance.
        """
        new_kwargs = {
            'replacement': self.replacement,
            'whitelist': self.whitelist,
            'safe_mode': self.safe_mode,
            'log_rejects': self.log_rejects,
            'ending_slash': self.ending_slash
        }
        new_kwargs.update(**kwargs)
        return TagWrap(tagname, **new_kwargs)

    def __call__(self, *args, **kwargs):
        return self.wrap(self.tagname, *args, **kwargs)

    def __getitem__(self, k):
        # Item access returns this instance's own attributes; '__all__' and
        # other dunder names are refused.
        if k == "__all__":
            raise ImportError("Cannot 'import *' with htmltag.")
        if isinstance(k, str):
            if k.startswith('__') and k.endswith("__"):
                raise AttributeError
            elif k in self.__dict__:
                return self.__dict__[k]
        raise ImportError(
            "Using IPython?  Ignore that ^ traceback stuff and try again "
            "(second time usually works to get your traceback)")
class SelfWrap(ModuleType):
    """
    A module subclass that stands in for this module inside `sys.modules`.
    Any attribute that is looked up on it but does not exist yet is created on
    the fly as a `TagWrap` of the same name which is what makes this work::

        >>> from htmltag import span
    """
    def __init__(self, tagname, *args, **kwargs):
        # NOTE: *tagname* is the object whose attributes get mirrored below
        self.tagname = tagname
        # These must be carried over as-is (needed for reload() to work) so
        # they never get replaced by instances of TagWrap:
        preserved = (
            'HTML', 'SelfWrap', 'TagWrap', 'strip_xss', '__author__',
            '__builtins__', '__doc__', '__license__', '__name__',
            '__package__', '__version__', '__version_info__',
        )
        for attr in preserved:
            value = getattr(tagname, attr, None)
            setattr(self, attr, value)
        self.__path__ = [] # Required for Python 3.3
        self.__file__ = FILE # Needed for Sphinx docs

    def __getattr__(self, name):
        # "from htmltag import a" <--*name* will be 'a'
        namespace = self.__dict__
        if name not in namespace: # Never clobber something already set
            # Create (and remember) the TagWrap so it can be returned below
            setattr(self, name, TagWrap(name))
        return namespace[name]

    def __call__(self, *args, **kwargs):
        # Calling the wrapper itself builds a TagWrap from self.tagname
        wrapper = TagWrap(self.tagname, *args, **kwargs)
        return wrapper
if __name__ != "__main__":
    # Imported (not executed):  Replace this module's entry in sys.modules
    # with a SelfWrap built around the real module object.
    self = sys.modules[__name__]
    sys.modules[__name__] = SelfWrap(self)
else:
    # NOTE: Execute `python htmltag.py -v` to run the doctests.
    import doctest
    doctest.testmod()