view gpp/core/html.py @ 339:b871892264f2

Adding the sg101 IRC bot code to SVN. This code is pretty rough and needs love, but it gets the job done (one of my first Python apps). This fixes #150.
author Brian Neal <bgneal@gmail.com>
date Sat, 26 Feb 2011 21:27:49 +0000
parents b3b11edf91d8
children
line wrap: on
line source
import html5lib
from html5lib import sanitizer, treebuilders, treewalkers, serializer

def sanitizer_factory(*args, **kwargs):
    san = sanitizer.HTMLSanitizer(*args, **kwargs)
    # This isn't available yet
    # san.strip_tokens = True
    return san

def clean_html(buf):
    """Cleans HTML of dangerous tags and content."""
    buf = buf.strip()
    if not buf:
        return buf

    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
            tokenizer=sanitizer_factory)
    dom_tree = p.parseFragment(buf)

    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = serializer.htmlserializer.HTMLSerializer(
            omit_optional_tags=False,
            quote_attr_values=True)
    return s.render(stream) 

# vim: ts=4 sw=4