view gpp/core/html.py @ 271:4746df47a538

Follow on to last rev (r292) for #126. Missed updating a shoutbox template. Also the repoze.timeago package uses UTC time by default. Change this to local time for now until we decide to switch over to UTC for everything.
author Brian Neal <bgneal@gmail.com>
date Sun, 26 Sep 2010 17:42:00 +0000
parents b3b11edf91d8
children
line wrap: on
line source
import html5lib
from html5lib import sanitizer, treebuilders, treewalkers, serializer

def sanitizer_factory(*args, **kwargs):
    san = sanitizer.HTMLSanitizer(*args, **kwargs)
    # This isn't available yet
    # san.strip_tokens = True
    return san

def clean_html(buf):
    """Cleans HTML of dangerous tags and content."""
    buf = buf.strip()
    if not buf:
        return buf

    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
            tokenizer=sanitizer_factory)
    dom_tree = p.parseFragment(buf)

    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = serializer.htmlserializer.HTMLSerializer(
            omit_optional_tags=False,
            quote_attr_values=True)
    return s.render(stream) 

# vim: ts=4 sw=4