bgneal@9: import html5lib
bgneal@9: from html5lib import sanitizer, treebuilders, treewalkers, serializer
bgneal@9: 
def sanitizer_factory(*args, **kwargs):
    """Create an html5lib ``HTMLSanitizer``, forwarding every argument.

    ``clean_html`` hands this callable to ``html5lib.HTMLParser`` as its
    ``tokenizer``; all positional and keyword arguments are passed straight
    through to ``sanitizer.HTMLSanitizer``.
    """
    # NOTE: the intent is to also set ``strip_tokens = True`` on the
    # sanitizer, but that attribute isn't available yet.
    return sanitizer.HTMLSanitizer(*args, **kwargs)
bgneal@9: 
def clean_html(buf):
    """Clean dangerous tags and content out of an HTML string.

    Leading and trailing whitespace is stripped first. If nothing is left,
    the stripped (empty) value is returned without running the parser.
    Otherwise the text is parsed as a fragment into a DOM tree, with
    ``sanitizer_factory`` supplied as the parser's tokenizer, and the tree
    is rendered back to markup with optional tags kept and attribute values
    quoted.
    """
    text = buf.strip()
    if text:
        # Parse the fragment through the sanitizing tokenizer.
        dom_builder = treebuilders.getTreeBuilder("dom")
        parser = html5lib.HTMLParser(tree=dom_builder,
                                     tokenizer=sanitizer_factory)
        fragment = parser.parseFragment(text)

        # Walk the resulting DOM and serialize the token stream.
        tokens = treewalkers.getTreeWalker("dom")(fragment)
        renderer = serializer.htmlserializer.HTMLSerializer(
            omit_optional_tags=False,
            quote_attr_values=True,
        )
        return renderer.render(tokens)

    return text
bgneal@9: 
bgneal@9: # vim: ts=4 sw=4