bgneal@9: import html5lib
bgneal@9: from html5lib import sanitizer, treebuilders, treewalkers, serializer
bgneal@9:
def sanitizer_factory(*args, **kwargs):
    """Build an ``HTMLSanitizer``, forwarding every argument unchanged.

    ``clean_html`` passes this callable to ``html5lib.HTMLParser`` as its
    ``tokenizer`` argument, so the signature stays fully generic.
    """
    # NOTE: setting ``strip_tokens = True`` on the sanitizer is the intent
    # here, but that option isn't available yet, so the freshly built
    # instance is returned as-is.
    return sanitizer.HTMLSanitizer(*args, **kwargs)
bgneal@9:
def clean_html(buf):
    """Return a sanitized re-serialization of the HTML fragment ``buf``.

    Leading and trailing whitespace is stripped first. When nothing is left,
    the stripped (empty) value is returned immediately without parsing.
    Otherwise the fragment is parsed into a DOM tree using
    ``sanitizer_factory`` as the tokenizer, walked, and rendered back to
    markup with optional tags kept and attribute values quoted.
    """
    fragment = buf.strip()
    if fragment:
        parser = html5lib.HTMLParser(
            tree=treebuilders.getTreeBuilder("dom"),
            tokenizer=sanitizer_factory,
        )
        tree = parser.parseFragment(fragment)

        # Turn the DOM tree into the token stream the serializer consumes.
        tokens = treewalkers.getTreeWalker("dom")(tree)

        renderer = serializer.htmlserializer.HTMLSerializer(
            omit_optional_tags=False,
            quote_attr_values=True,
        )
        return renderer.render(tokens)

    # Empty or whitespace-only input: nothing to sanitize.
    return fragment
bgneal@9:
bgneal@9: # vim: ts=4 sw=4