Mercurial > public > sg101
diff gpp/core/html.py @ 9:b3b11edf91d8
News: removed the lxml stuff. Based on Jacob Kaplan-Moss's suggestion, use html5lib to clean HTML. Added that functionality in a new core.html module.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sun, 12 Apr 2009 02:03:03 +0000 |
parents | |
children |
line wrap: on
line diff
# File: gpp/core/html.py (added new in this changeset; previously /dev/null)
import html5lib
from html5lib import sanitizer, treebuilders, treewalkers, serializer


def sanitizer_factory(*args, **kwargs):
    """Create the sanitizing tokenizer used when parsing HTML.

    Every positional and keyword argument is passed straight through to
    ``sanitizer.HTMLSanitizer``; the resulting instance is returned.
    """
    # TODO: enable ``strip_tokens = True`` on the sanitizer once that
    # option is available; it isn't yet.
    return sanitizer.HTMLSanitizer(*args, **kwargs)


def clean_html(buf):
    """Cleans HTML of dangerous tags and content.

    Surrounding whitespace is stripped from ``buf`` first; if nothing is
    left, the stripped (empty) value is returned unchanged. Otherwise the
    text is parsed as an HTML fragment into a DOM tree, using
    ``sanitizer_factory`` as the tokenizer, and the tree is serialized
    back to markup with optional tags kept and attribute values quoted.
    """
    text = buf.strip()
    if not text:
        return text

    # Parse with the sanitizing tokenizer so filtering happens while the
    # fragment is being tokenized.
    dom_builder = treebuilders.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=dom_builder,
                                 tokenizer=sanitizer_factory)
    fragment = parser.parseFragment(text)

    # Walk the DOM tree to produce the token stream for the serializer.
    make_walker = treewalkers.getTreeWalker("dom")
    tokens = make_walker(fragment)

    html_serializer = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True)
    return html_serializer.render(tokens)

# vim: ts=4 sw=4