# HG changeset patch
# User Brian Neal
# Date 1239501783 0
# Node ID b3b11edf91d85df111045ef4bbc14d2279714930
# Parent d6f3c38e8f50010ef9051ce41b40861b91a7b693
News: removed the lxml stuff. Based on Jacob Kaplan-Moss suggestion, use html5lib to clean html. Added that functionality in a new core.html module.

diff -r d6f3c38e8f50 -r b3b11edf91d8 gpp/core/functions.py
--- a/gpp/core/functions.py	Sun Apr 12 00:34:08 2009 +0000
+++ b/gpp/core/functions.py	Sun Apr 12 02:03:03 2009 +0000
@@ -5,25 +5,6 @@
 
 from django.conf import settings
 from core import logging
-from lxml.html.clean import Cleaner
-
-html_cleaner = Cleaner(scripts=True,
-        javascript=True,
-        comments=True,
-        style=True,
-        links=True,
-        meta=True,
-        page_structure=True,
-        processing_instructions=True,
-        embedded=True,
-        frames=True,
-        forms=True,
-        annoying_tags=True,
-        remove_unknown_tags=True,
-        safe_attrs_only=True,
-        host_whitelist=['www.youtube.com'],
-        whitelist_tags=['object', 'param', 'embed'],
-        )
 
 
 def send_mail(subject, message, from_email, recipient_list,
@@ -61,13 +42,6 @@
         [mail_tuple[1] for mail_tuple in settings.MANAGERS])
 
 
-def clean_html(s):
-    """Cleans HTML of dangerous tags and content."""
-    if s:
-        return html_cleaner.clean_html(s)
-    return s
-
-
 def get_full_name(user):
     """Returns the user's full name if available, otherwise falls back to the
     username."""
@@ -75,3 +49,5 @@
     if full_name:
         return full_name
     return user.username
+
+# vim: ts=4 sw=4
diff -r d6f3c38e8f50 -r b3b11edf91d8 gpp/core/html.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gpp/core/html.py	Sun Apr 12 02:03:03 2009 +0000
@@ -0,0 +1,47 @@
+import html5lib
+from html5lib import sanitizer, treebuilders, treewalkers, serializer
+
+def sanitizer_factory(*args, **kwargs):
+    """Builds the sanitizing tokenizer handed to html5lib.HTMLParser.
+
+    All positional and keyword arguments are passed straight through to
+    sanitizer.HTMLSanitizer; the new sanitizer instance is returned.
+    """
+    san = sanitizer.HTMLSanitizer(*args, **kwargs)
+    # This isn't available yet
+    # san.strip_tokens = True
+    return san
+
+def clean_html(buf):
+    """Cleans HTML of dangerous tags and content.
+
+    buf is the markup fragment to clean. It is parsed as a fragment by
+    html5lib using the sanitizing tokenizer, the resulting DOM tree is
+    walked, and the serializer's rendered output is returned.
+
+    Falsy input (None or an empty string) is returned unchanged, matching
+    the core.functions.clean_html this function replaces; input that is
+    only whitespace comes back as an empty string.
+    """
+    # The old lxml-based clean_html tolerated None; keep callers that rely
+    # on that from hitting an AttributeError on buf.strip().
+    if not buf:
+        return buf
+
+    buf = buf.strip()
+    if not buf:
+        return buf
+
+    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
+            tokenizer=sanitizer_factory)
+    dom_tree = p.parseFragment(buf)
+
+    walker = treewalkers.getTreeWalker("dom")
+    stream = walker(dom_tree)
+
+    s = serializer.htmlserializer.HTMLSerializer(
+            omit_optional_tags=False,
+            quote_attr_values=True)
+    return s.render(stream)
+
+# vim: ts=4 sw=4
diff -r d6f3c38e8f50 -r b3b11edf91d8 gpp/news/views.py
--- a/gpp/news/views.py	Sun Apr 12 00:34:08 2009 +0000
+++ b/gpp/news/views.py	Sun Apr 12 02:03:03 2009 +0000
@@ -18,7 +18,7 @@
 from tagging.models import Tag
 from tagging.models import TaggedItem
 
-from core.functions import clean_html
+from core.html import clean_html
 from core.functions import send_mail
 from core.functions import get_full_name
 from core.paginator import DiggPaginator