changeset 9:b3b11edf91d8

News: removed the lxml stuff. Based on Jacob Kaplan-Moss's suggestion, use html5lib to clean HTML. Added that functionality in a new core.html module.
author Brian Neal <bgneal@gmail.com>
date Sun, 12 Apr 2009 02:03:03 +0000
parents d6f3c38e8f50
children f43f8a956f1d
files gpp/core/functions.py gpp/core/html.py gpp/news/views.py
diffstat 3 files changed, 31 insertions(+), 27 deletions(-) [+]
line wrap: on
line diff
--- a/gpp/core/functions.py	Sun Apr 12 00:34:08 2009 +0000
+++ b/gpp/core/functions.py	Sun Apr 12 02:03:03 2009 +0000
@@ -5,25 +5,6 @@
 from django.conf import settings
 
 from core import logging
-from lxml.html.clean import Cleaner
-
-html_cleaner = Cleaner(scripts=True,
-        javascript=True,
-        comments=True,
-        style=True,
-        links=True,
-        meta=True,
-        page_structure=True,
-        processing_instructions=True,
-        embedded=True,
-        frames=True,
-        forms=True,
-        annoying_tags=True,
-        remove_unknown_tags=True,
-        safe_attrs_only=True,
-        host_whitelist=['www.youtube.com'],
-        whitelist_tags=['object', 'param', 'embed'],
-        )
 
 
 def send_mail(subject, message, from_email, recipient_list, 
@@ -61,13 +42,6 @@
             [mail_tuple[1] for mail_tuple in settings.MANAGERS])
 
 
-def clean_html(s):
-    """Cleans HTML of dangerous tags and content."""
-    if s:
-        return html_cleaner.clean_html(s)
-    return s
-
-
 def get_full_name(user):
     """Returns the user's full name if available, otherwise falls back
     to the username."""
@@ -75,3 +49,5 @@
     if full_name:
         return full_name
     return user.username
+
+# vim: ts=4 sw=4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gpp/core/html.py	Sun Apr 12 02:03:03 2009 +0000
@@ -0,0 +1,28 @@
+import html5lib
+from html5lib import sanitizer, treebuilders, treewalkers, serializer
+
+def sanitizer_factory(*args, **kwargs):
+    san = sanitizer.HTMLSanitizer(*args, **kwargs)
+    # This isn't available yet
+    # san.strip_tokens = True
+    return san
+
+def clean_html(buf):
+    """Cleans HTML of dangerous tags and content."""
+    buf = buf.strip()
+    if not buf:
+        return buf
+
+    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
+            tokenizer=sanitizer_factory)
+    dom_tree = p.parseFragment(buf)
+
+    walker = treewalkers.getTreeWalker("dom")
+    stream = walker(dom_tree)
+
+    s = serializer.htmlserializer.HTMLSerializer(
+            omit_optional_tags=False,
+            quote_attr_values=True)
+    return s.render(stream) 
+
+# vim: ts=4 sw=4
--- a/gpp/news/views.py	Sun Apr 12 00:34:08 2009 +0000
+++ b/gpp/news/views.py	Sun Apr 12 02:03:03 2009 +0000
@@ -18,7 +18,7 @@
 from tagging.models import Tag
 from tagging.models import TaggedItem
 
-from core.functions import clean_html
+from core.html import clean_html
 from core.functions import send_mail
 from core.functions import get_full_name
 from core.paginator import DiggPaginator