changeset 849:ff645a692791

For issue #79, use bleach to sanitize both user-input Markdown and HTML.
author Brian Neal <bgneal@gmail.com>
date Thu, 30 Oct 2014 19:30:37 -0500
parents 32ebe22f0cad
children 202e0828aafe
files core/html.py core/markup.py news/forms.py news/views.py requirements.txt requirements_dev.txt
diffstat 6 files changed, 61 insertions(+), 36 deletions(-) [+]
line wrap: on
line diff
--- a/core/html.py	Tue Oct 28 19:33:14 2014 -0500
+++ b/core/html.py	Thu Oct 30 19:30:37 2014 -0500
@@ -1,28 +1,49 @@
-import html5lib
-from html5lib import sanitizer, treebuilders, treewalkers, serializer
+"""Common HTML related functions"""
+import bleach
 
-def sanitizer_factory(*args, **kwargs):
-    san = sanitizer.HTMLSanitizer(*args, **kwargs)
-    # This isn't available yet
-    # san.strip_tokens = True
-    return san
 
-def clean_html(buf):
+# _CLEAN_PROFILES maps each profile name to a 3-tuple of
+# (allowed_tags_list, allowed_attributes_dict, allowed_styles_list), which
+# clean_html() unpacks and passes to bleach.clean() as tags/attributes/styles.
+#
+_CLEAN_PROFILES = {
+    'comments': (
+        [
+            'a', 'b', 'blockquote', 'br', 'code', 'del', 'em',
+            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+            'i', 'img', 'li', 'ol', 'p', 'pre', 'strong', 'ul',
+        ],
+        {
+            'a': ['href'],
+            'img': ['src', 'alt', 'title'],
+        },
+        [],
+    ),
+    'news': (
+        [
+            'a', 'b', 'blockquote', 'br', 'caption', 'center', 'code', 'col',
+            'colgroup', 'dd', 'del', 'div', 'dl', 'dt', 'em',
+            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+            'i', 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike',
+            'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th',
+            'thead', 'tr', 'tt', 'u', 'ul',
+        ],
+        {
+            'a': ['href'],
+            'img': ['src', 'alt', 'title', 'width', 'height'],
+        },
+        [],
+    ),
+}
+
+
+def clean_html(text, profile='comments'):
     """Cleans HTML of dangerous tags and content."""
-    buf = buf.strip()
-    if not buf:
-        return buf
+    text = text.strip()
+    if not text:
+        return text
 
-    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
-            tokenizer=sanitizer_factory)
-    dom_tree = p.parseFragment(buf)
+    tags, attrs, styles = _CLEAN_PROFILES[profile]
 
-    walker = treewalkers.getTreeWalker("dom")
-    stream = walker(dom_tree)
-
-    s = serializer.htmlserializer.HTMLSerializer(
-            omit_optional_tags=False,
-            quote_attr_values=True)
-    return s.render(stream) 
-
-# vim: ts=4 sw=4
+    return bleach.clean(text, tags=tags, attributes=attrs, styles=styles,
+        strip=True, strip_comments=True)
--- a/core/markup.py	Tue Oct 28 19:33:14 2014 -0500
+++ b/core/markup.py	Thu Oct 30 19:30:37 2014 -0500
@@ -8,22 +8,23 @@
 from smiley import SmilifyMarkdown
 from core.mdexts.urlize import UrlizeExtension
 from core.mdexts.deleted import DelExtension
+from core.html import clean_html
+
 
 class Markdown(object):
     """
     This is a thin wrapper around the Markdown class.
 
     """
-    def __init__(self, safe_mode='escape'):
-        self.md = _markdown.Markdown(safe_mode=safe_mode,
-                                     extensions=[
+    def __init__(self):
+        self.md = _markdown.Markdown(extensions=[
                                          UrlizeExtension(),
                                          'markdown.extensions.nl2br',
                                          DelExtension(),
                                      ])
 
     def convert(self, s):
-        return self.md.convert(force_unicode(s))
+        return clean_html(self.md.convert(force_unicode(s)))
 
 
 def markdown(s):
--- a/news/forms.py	Tue Oct 28 19:33:14 2014 -0500
+++ b/news/forms.py	Thu Oct 30 19:30:37 2014 -0500
@@ -6,7 +6,6 @@
 from django.conf import settings
 
 from news.models import PendingStory
-from news.models import Category
 
 
 class AddNewsForm(forms.ModelForm):
--- a/news/views.py	Tue Oct 28 19:33:14 2014 -0500
+++ b/news/views.py	Thu Oct 30 19:30:37 2014 -0500
@@ -11,7 +11,6 @@
 from django.shortcuts import get_object_or_404
 from django.core.paginator import InvalidPage
 from django.core.urlresolvers import reverse
-from django.db.models import Q
 from django.contrib.sites.models import Site
 from django.http import Http404
 
@@ -24,7 +23,6 @@
 from core.functions import get_page
 from core.paginator import DiggPaginator
 from news.models import Category
-from news.models import PendingStory
 from news.models import Story
 from news.forms import AddNewsForm
 from news.forms import SendStoryForm
@@ -145,8 +143,8 @@
         if add_form.is_valid():
             pending_story = add_form.save(commit=False)
             pending_story.submitter = request.user
-            pending_story.short_text = clean_html(pending_story.short_text)
-            pending_story.long_text = clean_html(pending_story.long_text)
+            pending_story.short_text = _clean_html(pending_story.short_text)
+            pending_story.long_text = _clean_html(pending_story.long_text)
             pending_story.save()
             return HttpResponseRedirect(reverse('news.views.submit_thanks'))
     else:
@@ -239,3 +237,7 @@
         },
         context_instance = RequestContext(request))
 
+#######################################################################
+
+def _clean_html(s):
+    return clean_html(s, profile='news')
--- a/requirements.txt	Tue Oct 28 19:33:14 2014 -0500
+++ b/requirements.txt	Thu Oct 30 19:30:37 2014 -0500
@@ -1,11 +1,12 @@
 Django==1.6.6
-Markdown==2.3.1
+Markdown==2.5.1
 MySQL-python==1.2.4
 -e git+https://github.com/gremmie/django-elsewhere.git@1203bd331aba4c5d4e702cc4e64d807310f2b591#egg=django_elsewhere-dev
 django-haystack==2.1.0
 django-tagging==0.3.1
 gdata==2.0.15
-html5lib==0.90
+html5lib==0.999
+bleach==1.4
 pytz==2013b
 queued-search==2.1.0
 queues==0.6.3
--- a/requirements_dev.txt	Tue Oct 28 19:33:14 2014 -0500
+++ b/requirements_dev.txt	Thu Oct 30 19:30:37 2014 -0500
@@ -1,12 +1,13 @@
 Django==1.6.6
-Markdown==2.3.1
+Markdown==2.5.1
 MySQL-python==1.2.5
 django-debug-toolbar==1.0
 -e git+https://github.com/gremmie/django-elsewhere.git@1203bd331aba4c5d4e702cc4e64d807310f2b591#egg=django_elsewhere-master
 django-haystack==2.1.0
 django-tagging==0.3.1
 gdata==2.0.15
-html5lib==0.90
+html5lib==0.999
+bleach==1.4
 pytz==2013b
 queued-search==2.1.0
 queues==0.6.3