# HG changeset patch # User Brian Neal # Date 1433383988 18000 # Node ID 71a671dab55d0a734fc881cbf7e9dbe81818d98d # Parent e56455f4626b449a35aa8ab9352db3119e73e597 First commit of whitelisting image hosts. This is behind a feature flag courtesy of waffle. diff -r e56455f4626b -r 71a671dab55d core/html.py --- a/core/html.py Tue May 26 20:40:31 2015 -0500 +++ b/core/html.py Wed Jun 03 21:13:08 2015 -0500 @@ -1,5 +1,10 @@ """Common HTML related functions""" +from urlparse import urlparse + import bleach +from lxml import etree + +from django.conf import settings # Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The @@ -47,3 +52,47 @@ return bleach.clean(text, tags=tags, attributes=attrs, styles=styles, strip=True, strip_comments=True) + + +class ImageCheckError(Exception): + """Exception for the image_check() function""" + + +ALLOWED_HOSTS = set(settings.USER_IMAGES_SOURCES) + + +def image_check(html, allowed_hosts=None): + """Returns true if all image tags in the given html come from hosts + specified in the allowed_hosts container using https. + + An ImageCheckError is raised if the following problems are detected: + * the image src is missing altogether + * the scheme is missing or not https + * the hostname is missing + * the hostname is not in allowed_hosts + + If allowed_hosts is not None, it will be used as the whitelist of allowed + hosts. If None, USER_IMAGES_SOURCES from settings will be used as the + whitelist. + """ + if not allowed_hosts: + allowed_hosts = ALLOWED_HOSTS + + root = etree.HTML(html) + for img in root.iter('img'): + src = img.get('src') + if not src: + raise ImageCheckError("missing image source") + r = urlparse(src) + + if not r.scheme and not r.hostname: + # relative URL is ok + continue + if r.scheme != 'https': + raise ImageCheckError("image must be accessed via https") + if not r.hostname: + raise ImageCheckError("missing image hostname") + if r.hostname not in allowed_hosts: + raise ImageCheckError("invalid image source") + + return True diff -r e56455f4626b -r 71a671dab55d core/tests/test_html.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/core/tests/test_html.py Wed Jun 03 21:13:08 2015 -0500 @@ -0,0 +1,68 @@ +"""Tests for the core.html module.""" +import unittest + +from core.html import ImageCheckError +from core.html import image_check + + +TEST_HTML = """ +

Posters and Facebook events are starting to come in...

+

image

+

image

+""" + + +class ImageCheckTestCase(unittest.TestCase): + def setUp(self): + self.allowed_hosts = ['example.com'] + + def test_happy_path(self): + url1 = 'https://example.com/1.jpg' + url2 = 'https://example.com/2.jpg' + html = TEST_HTML.format(src1=url1, src2=url2) + + result = image_check(html, self.allowed_hosts) + self.assertTrue(result) + + def test_empty_image(self): + url1 = 'https://example.com/1.jpg' + url2 = '' + html = TEST_HTML.format(src1=url1, src2=url2) + + self.assertRaises(ImageCheckError, image_check, html, self.allowed_hosts) + + def test_relative_ok(self): + url1 = 'https://example.com/1.jpg' + url2 = '/some/path/2.jpg' + html = TEST_HTML.format(src1=url1, src2=url2) + + result = image_check(html, self.allowed_hosts) + self.assertTrue(result) + + def test_non_https(self): + url1 = 'http://example.com/1.jpg' + url2 = 'https://example.com/2.jpg' + html = TEST_HTML.format(src1=url1, src2=url2) + + self.assertRaises(ImageCheckError, image_check, html, self.allowed_hosts) + + def test_missing_hostname(self): + url1 = 'http:///1.jpg' + url2 = 'https://example.com/2.jpg' + html = TEST_HTML.format(src1=url1, src2=url2) + + self.assertRaises(ImageCheckError, image_check, html, self.allowed_hosts) + + def test_hostname_not_allowed1(self): + url1 = 'https://xxx.example.com/1.jpg' + url2 = 'https://example.com/2.jpg' + html = TEST_HTML.format(src1=url1, src2=url2) + + self.assertRaises(ImageCheckError, image_check, html, self.allowed_hosts) + + def test_hostname_not_allowed2(self): + url1 = 'https://xxx.example.com/1.jpg' + url2 = 'https://yyy.example.com/2.jpg' + html = TEST_HTML.format(src1=url1, src2=url2) + + self.assertRaises(ImageCheckError, image_check, html, self.allowed_hosts) diff -r e56455f4626b -r 71a671dab55d forums/forms.py --- a/forums/forms.py Tue May 26 20:40:31 2015 -0500 +++ b/forums/forms.py Wed Jun 03 21:13:08 2015 -0500 @@ -7,12 +7,15 @@ from django import forms from django.conf import settings +from core.markup import site_markup from forums.models import Forum from forums.models import Topic from forums.models import Post from forums.attachments import AttachmentProcessor import forums.permissions as perms from forums.signals import notify_new_topic, notify_new_post +from core.html import ImageCheckError +from core.html import image_check FORUMS_FORM_CSS = { @@ -117,7 +120,7 @@ raise forms.ValidationError("This field is required.") return data - def save(self, ip=None): + def save(self, ip=None, html=None): """ Creates the new Topic and first Post from the form data and supplied arguments. @@ -133,7 +136,7 @@ user=self.user, body=self.cleaned_data['body'], user_ip=ip) - post.save() + post.save(html=html) self.attach_proc.save_attachments(post) @@ -143,6 +146,23 @@ return topic +class NewTopicFormS3(NewTopicForm): + """Form for ensuring image sources come from a white-listed set of + sources. + """ + def clean_body(self): + body_data = self.cleaned_data['body'] + self.body_html = site_markup(body_data) + try: + image_check(self.body_html) + except ImageCheckError as ex: + raise forms.ValidationError(str(ex)) + return body_data + + def save(self, ip=None): + return super(NewTopicFormS3, self).save(ip, html=self.body_html) + + class PostForm(forms.ModelForm): """ Form for editing an existing post or a new, non-quick post. diff -r e56455f4626b -r 71a671dab55d forums/models.py --- a/forums/models.py Tue May 26 20:40:31 2015 -0500 +++ b/forums/models.py Wed Jun 03 21:13:08 2015 -0500 @@ -311,7 +311,9 @@ self.creation_date = datetime.datetime.now() self.update_date = self.creation_date - self.html = site_markup(self.body) + self.html = kwargs.pop('html', None) + if self.html is None: + self.html = site_markup(self.body) super(Post, self).save(*args, **kwargs) def delete(self, *args, **kwargs): diff -r e56455f4626b -r 71a671dab55d forums/views/main.py --- a/forums/views/main.py Tue May 26 20:40:31 2015 -0500 +++ b/forums/views/main.py Wed Jun 03 21:13:08 2015 -0500 @@ -20,6 +20,7 @@ from django.template import RequestContext from django.views.decorators.http import require_POST from django.db.models import F +import waffle import antispam import antispam.utils @@ -30,7 +31,7 @@ from forums.models import (Forum, Topic, Post, FlaggedPost, TopicLastVisit, ForumLastVisit) from forums.forms import (NewTopicForm, NewPostForm, PostForm, MoveTopicForm, - SplitTopicForm) + SplitTopicForm, NewTopicFormS3) from forums.unread import (get_forum_unread_status, get_topic_unread_status, get_post_unread_status, get_unread_topics) @@ -295,8 +296,11 @@ if not perms.can_access(forum.category, request.user): return HttpResponseForbidden() + form_class = (NewTopicFormS3 if waffle.flag_is_active(request, 's3_images') + else NewTopicForm) + if request.method == 'POST': - form = NewTopicForm(request.user, forum, request.POST) + form = form_class(request.user, forum, request.POST) if form.is_valid(): if antispam.utils.spam_check(request, form.cleaned_data['body']): return HttpResponseRedirect(reverse('antispam-suspended')) @@ -306,7 +310,7 @@ return HttpResponseRedirect(reverse('forums-new_topic_thanks', kwargs={'tid': topic.pk})) else: - form = NewTopicForm(request.user, forum) + form = form_class(request.user, forum) return render_to_response('forums/new_topic.html', { 'forum': forum, diff -r e56455f4626b -r 71a671dab55d sg101/settings/base.py --- a/sg101/settings/base.py Tue May 26 20:40:31 2015 -0500 +++ b/sg101/settings/base.py Wed Jun 03 21:13:08 2015 -0500 @@ -298,6 +298,14 @@ USER_PHOTOS_THUMB_SIZE = (120, 120) USER_PHOTOS_RATE_LIMIT = (50, 86400) # number / seconds +USER_IMAGES_SOURCES = [ + 'dl.dropboxusercontent.com', + 's3.amazonaws.com', + 's3-us-west-1.amazonaws.com', + 'scontent-ord1-1.xx.fbcdn.net', + 'lh3.googleusercontent.com', +] + # If this flag is False, the queued_search queue will not be processed. This is # useful when we are rebuilding the search index. SEARCH_QUEUE_ENABLED = True