# HG changeset patch # User Brian Neal # Date 1414715437 18000 # Node ID ff645a69279119693d648646e01be395f7a3cec2 # Parent 32ebe22f0cadb6ef3bb01124414cf00ebf0838e4 For issue #79, use bleach to sanitize both user input markdown & html. diff -r 32ebe22f0cad -r ff645a692791 core/html.py --- a/core/html.py Tue Oct 28 19:33:14 2014 -0500 +++ b/core/html.py Thu Oct 30 19:30:37 2014 -0500 @@ -1,28 +1,49 @@ -import html5lib -from html5lib import sanitizer, treebuilders, treewalkers, serializer +"""Common HTML related functions""" +import bleach -def sanitizer_factory(*args, **kwargs): - san = sanitizer.HTMLSanitizer(*args, **kwargs) - # This isn't available yet - # san.strip_tokens = True - return san -def clean_html(buf): +# Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The +# tuple consists of (allowed_tags_list, allowed_attributes_dict, +# allowed_styles_list) +# +_CLEAN_PROFILES = { + 'comments': ( + [ + 'a', 'b', 'blockquote', 'br', 'code', 'del', 'em', + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + 'i', 'img', 'li', 'ol', 'p', 'pre', 'strong', 'ul', + ], + { + 'a': ['href'], + 'img': ['src', 'alt', 'title'], + }, + [], + ), + 'news': ( + [ + 'a', 'b', 'blockquote', 'br', 'caption', 'center', 'code', 'col', + 'colgroup', 'dd', 'del', 'div', 'dl', 'dt', 'em', + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + 'i', 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike', + 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', + 'thead', 'tr', 'tt', 'u', 'ul', + ], + { + 'a': ['href'], + 'img': ['src', 'alt', 'title', 'width', 'height'], + }, + [], + ), +} + + +def clean_html(text, profile='comments'): """Cleans HTML of dangerous tags and content.""" - buf = buf.strip() - if not buf: - return buf + text = text.strip() + if not text: + return text - p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"), - tokenizer=sanitizer_factory) - dom_tree = p.parseFragment(buf) + tags, attrs, styles = _CLEAN_PROFILES[profile] - walker = treewalkers.getTreeWalker("dom") - stream = walker(dom_tree) - - s = serializer.htmlserializer.HTMLSerializer( - omit_optional_tags=False, - quote_attr_values=True) - return s.render(stream) - -# vim: ts=4 sw=4 + return bleach.clean(text, tags=tags, attributes=attrs, styles=styles, + strip=True, strip_comments=True) diff -r 32ebe22f0cad -r ff645a692791 core/markup.py --- a/core/markup.py Tue Oct 28 19:33:14 2014 -0500 +++ b/core/markup.py Thu Oct 30 19:30:37 2014 -0500 @@ -8,22 +8,23 @@ from smiley import SmilifyMarkdown from core.mdexts.urlize import UrlizeExtension from core.mdexts.deleted import DelExtension +from core.html import clean_html + class Markdown(object): """ This is a thin wrapper around the Markdown class. """ - def __init__(self, safe_mode='escape'): - self.md = _markdown.Markdown(safe_mode=safe_mode, - extensions=[ + def __init__(self): + self.md = _markdown.Markdown(extensions=[ UrlizeExtension(), 'markdown.extensions.nl2br', DelExtension(), ]) def convert(self, s): - return self.md.convert(force_unicode(s)) + return clean_html(self.md.convert(force_unicode(s))) def markdown(s): diff -r 32ebe22f0cad -r ff645a692791 news/forms.py --- a/news/forms.py Tue Oct 28 19:33:14 2014 -0500 +++ b/news/forms.py Thu Oct 30 19:30:37 2014 -0500 @@ -6,7 +6,6 @@ from django.conf import settings from news.models import PendingStory -from news.models import Category class AddNewsForm(forms.ModelForm): diff -r 32ebe22f0cad -r ff645a692791 news/views.py --- a/news/views.py Tue Oct 28 19:33:14 2014 -0500 +++ b/news/views.py Thu Oct 30 19:30:37 2014 -0500 @@ -11,7 +11,6 @@ from django.shortcuts import get_object_or_404 from django.core.paginator import InvalidPage from django.core.urlresolvers import reverse -from django.db.models import Q from django.contrib.sites.models import Site from django.http import Http404 @@ -24,7 +23,6 @@ from core.functions import get_page from core.paginator import DiggPaginator from news.models import Category -from news.models import PendingStory from news.models import Story from news.forms import AddNewsForm from news.forms import SendStoryForm @@ -145,8 +143,8 @@ if add_form.is_valid(): pending_story = add_form.save(commit=False) pending_story.submitter = request.user - pending_story.short_text = clean_html(pending_story.short_text) - pending_story.long_text = clean_html(pending_story.long_text) + pending_story.short_text = _clean_html(pending_story.short_text) + pending_story.long_text = _clean_html(pending_story.long_text) pending_story.save() return HttpResponseRedirect(reverse('news.views.submit_thanks')) else: @@ -239,3 +237,7 @@ }, context_instance = RequestContext(request)) +####################################################################### + +def _clean_html(s): + return clean_html(s, profile='news') diff -r 32ebe22f0cad -r ff645a692791 requirements.txt --- a/requirements.txt Tue Oct 28 19:33:14 2014 -0500 +++ b/requirements.txt Thu Oct 30 19:30:37 2014 -0500 @@ -1,11 +1,12 @@ Django==1.6.6 -Markdown==2.3.1 +Markdown==2.5.1 MySQL-python==1.2.4 -e git+https://github.com/gremmie/django-elsewhere.git@1203bd331aba4c5d4e702cc4e64d807310f2b591#egg=django_elsewhere-dev django-haystack==2.1.0 django-tagging==0.3.1 gdata==2.0.15 -html5lib==0.90 +html5lib==0.999 +bleach==1.4 pytz==2013b queued-search==2.1.0 queues==0.6.3 diff -r 32ebe22f0cad -r ff645a692791 requirements_dev.txt --- a/requirements_dev.txt Tue Oct 28 19:33:14 2014 -0500 +++ b/requirements_dev.txt Thu Oct 30 19:30:37 2014 -0500 @@ -1,12 +1,13 @@ Django==1.6.6 -Markdown==2.3.1 +Markdown==2.5.1 MySQL-python==1.2.5 django-debug-toolbar==1.0 -e git+https://github.com/gremmie/django-elsewhere.git@1203bd331aba4c5d4e702cc4e64d807310f2b591#egg=django_elsewhere-master django-haystack==2.1.0 django-tagging==0.3.1 gdata==2.0.15 -html5lib==0.90 +html5lib==0.999 +bleach==1.4 pytz==2013b queued-search==2.1.0 queues==0.6.3