"""Helpers for sanitizing user-supplied HTML with html5lib."""
import html5lib
from html5lib import sanitizer, treebuilders, treewalkers, serializer


def sanitizer_factory(*args, **kwargs):
    """Build the sanitizing tokenizer used by ``clean_html``.

    All positional and keyword arguments are forwarded unchanged to
    ``sanitizer.HTMLSanitizer``; the constructed instance is returned.
    """
    # NOTE(review): this relies on html5lib exposing ``sanitizer.HTMLSanitizer``
    # as a tokenizer class, which looks like the older (pre-1.0) API --
    # confirm the pinned html5lib version before upgrading.
    san = sanitizer.HTMLSanitizer(*args, **kwargs)
    # This isn't available yet
    # san.strip_tokens = True
    return san


def clean_html(buf):
    """Cleans HTML of dangerous tags and content.

    ``buf`` is stripped of surrounding whitespace first. If nothing is
    left, the stripped (empty) value is returned without invoking the
    parser. Otherwise the text is parsed as an HTML fragment through the
    sanitizing tokenizer, walked as a DOM tree, and re-serialized; the
    rendered result is returned.
    """
    buf = buf.strip()
    if not buf:
        return buf

    # Parse into a DOM tree, sanitizing tokens as they are produced.
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                            tokenizer=sanitizer_factory)
    dom_tree = p.parseFragment(buf)

    # Turn the DOM tree back into a token stream for the serializer.
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    # Keep optional tags and always quote attribute values in the output.
    s = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True)
    return s.render(stream)

# vim: ts=4 sw=4