diff gpp/core/html.py @ 9:b3b11edf91d8

News: removed the lxml stuff. Based on Jacob Kaplan-Moss's suggestion, use html5lib to clean HTML. Added that functionality in a new core.html module.
author Brian Neal <bgneal@gmail.com>
date Sun, 12 Apr 2009 02:03:03 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gpp/core/html.py	Sun Apr 12 02:03:03 2009 +0000
@@ -0,0 +1,28 @@
+import html5lib
+from html5lib import sanitizer, treebuilders, treewalkers, serializer
+
+def sanitizer_factory(*args, **kwargs):
+    san = sanitizer.HTMLSanitizer(*args, **kwargs)
+    # This isn't available yet
+    # san.strip_tokens = True
+    return san
+
+def clean_html(buf):
+    """Cleans HTML of dangerous tags and content."""
+    buf = buf.strip()
+    if not buf:
+        return buf
+
+    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
+            tokenizer=sanitizer_factory)
+    dom_tree = p.parseFragment(buf)
+
+    walker = treewalkers.getTreeWalker("dom")
+    stream = walker(dom_tree)
+
+    s = serializer.htmlserializer.HTMLSerializer(
+            omit_optional_tags=False,
+            quote_attr_values=True)
+    return s.render(stream) 
+
+# vim: ts=4 sw=4