diff core/html.py @ 849:ff645a692791

For issue #79, use bleach to sanitize both user input markdown & html.
author Brian Neal <bgneal@gmail.com>
date Thu, 30 Oct 2014 19:30:37 -0500
parents ee87ea74d46b
children 928b97ec55a7
line wrap: on
line diff
--- a/core/html.py	Tue Oct 28 19:33:14 2014 -0500
+++ b/core/html.py	Thu Oct 30 19:30:37 2014 -0500
@@ -1,28 +1,49 @@
-import html5lib
-from html5lib import sanitizer, treebuilders, treewalkers, serializer
+"""Common HTML related functions"""
+import bleach
 
-def sanitizer_factory(*args, **kwargs):
-    san = sanitizer.HTMLSanitizer(*args, **kwargs)
-    # This isn't available yet
-    # san.strip_tokens = True
-    return san
 
-def clean_html(buf):
+# _CLEAN_PROFILES maps a profile name to a 3-tuple of
+# (allowed_tags_list, allowed_attributes_dict, allowed_styles_list).
+# clean_html() looks a profile up by name and passes the three parts to
+# bleach.clean() as its tags, attributes and styles arguments.
+_CLEAN_PROFILES = {
+    'comments': (  # also the default profile of clean_html()
+        [  # allowed tags
+            'a', 'b', 'blockquote', 'br', 'code', 'del', 'em',
+            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+            'i', 'img', 'li', 'ol', 'p', 'pre', 'strong', 'ul',
+        ],
+        {  # allowed attributes, keyed by tag name
+            'a': ['href'],
+            'img': ['src', 'alt', 'title'],
+        },
+        [],  # allowed styles: none
+    ),
+    'news': (  # every 'comments' tag plus tables, div, dl, etc.
+        [  # allowed tags
+            'a', 'b', 'blockquote', 'br', 'caption', 'center', 'code', 'col',
+            'colgroup', 'dd', 'del', 'div', 'dl', 'dt', 'em',
+            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+            'i', 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike',
+            'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th',
+            'thead', 'tr', 'tt', 'u', 'ul',
+        ],
+        {  # allowed attributes; img additionally keeps width/height
+            'a': ['href'],
+            'img': ['src', 'alt', 'title', 'width', 'height'],
+        },
+        [],  # allowed styles: none
+    ),
+}
+
+
+def clean_html(text, profile='comments'):
     """Cleans HTML of dangerous tags and content."""
-    buf = buf.strip()
-    if not buf:
-        return buf
+    text = text.strip()
+    if not text:  # nothing left after stripping whitespace
+        return text
 
-    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
-            tokenizer=sanitizer_factory)
-    dom_tree = p.parseFragment(buf)
+    tags, attrs, styles = _CLEAN_PROFILES[profile]  # KeyError on unknown name
 
-    walker = treewalkers.getTreeWalker("dom")
-    stream = walker(dom_tree)
-
-    s = serializer.htmlserializer.HTMLSerializer(
-            omit_optional_tags=False,
-            quote_attr_values=True)
-    return s.render(stream) 
-
-# vim: ts=4 sw=4
+    return bleach.clean(text, tags=tags, attributes=attrs, styles=styles,
+        strip=True, strip_comments=True)  # presumably drops, not escapes