diff core/html.py @ 581:ee87ea74d46b

For Django 1.4, rearranged project structure for new manage.py.
author Brian Neal <bgneal@gmail.com>
date Sat, 05 May 2012 17:10:48 -0500
parents gpp/core/html.py@b3b11edf91d8
children ff645a692791
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/core/html.py	Sat May 05 17:10:48 2012 -0500
@@ -0,0 +1,28 @@
+import html5lib
+from html5lib import sanitizer, treebuilders, treewalkers, serializer
+
+def sanitizer_factory(*args, **kwargs):
+    san = sanitizer.HTMLSanitizer(*args, **kwargs)
+    # This isn't available yet
+    # san.strip_tokens = True
+    return san
+
+def clean_html(buf):
+    """Cleans HTML of dangerous tags and content."""
+    buf = buf.strip()
+    if not buf:
+        return buf
+
+    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
+            tokenizer=sanitizer_factory)
+    dom_tree = p.parseFragment(buf)
+
+    walker = treewalkers.getTreeWalker("dom")
+    stream = walker(dom_tree)
+
+    s = serializer.htmlserializer.HTMLSerializer(
+            omit_optional_tags=False,
+            quote_attr_values=True)
+    return s.render(stream) 
+
+# vim: ts=4 sw=4