changeset 955:71a671dab55d

First commit of whitelisting image hosts. This is behind a feature flag courtesy of waffle.
author Brian Neal <bgneal@gmail.com>
date Wed, 03 Jun 2015 21:13:08 -0500 (2015-06-04)
parents e56455f4626b
children 76ee6403e033
files core/html.py core/tests/test_html.py forums/forms.py forums/models.py forums/views/main.py sg101/settings/base.py
diffstat 6 files changed, 157 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/core/html.py	Tue May 26 20:40:31 2015 -0500
+++ b/core/html.py	Wed Jun 03 21:13:08 2015 -0500
@@ -1,5 +1,10 @@
 """Common HTML related functions"""
+from urlparse import urlparse
+
 import bleach
+from lxml import etree
+
+from django.conf import settings
 
 
 # Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The
@@ -47,3 +52,47 @@
 
     return bleach.clean(text, tags=tags, attributes=attrs, styles=styles,
         strip=True, strip_comments=True)
+
+
+class ImageCheckError(Exception):
+    """Exception for the image_check() function"""
+
+
+ALLOWED_HOSTS = set(settings.USER_IMAGES_SOURCES)
+
+
+def image_check(html, allowed_hosts=None):
+    """Returns true if all image tags in the given html come from hosts
+    specified in the allowed_hosts container using https.
+
+    An ImageCheckError is raised if the following problems are detected:
+        * the image src is missing altogether
+        * the scheme is missing or not https
+        * the hostname is missing
+        * the hostname is not in allowed_hosts
+
+    If allowed_hosts is not None, it will be used as the whitelist of allowed
+    hosts. If None, USER_IMAGES_SOURCES from settings will be used as the
+    whitelist.
+    """
+    if not allowed_hosts:
+        allowed_hosts = ALLOWED_HOSTS
+
+    root = etree.HTML(html)
+    for img in root.iter('img'):
+        src = img.get('src')
+        if not src:
+            raise ImageCheckError("missing image source")
+        r = urlparse(src)
+
+        if not r.scheme and not r.hostname:
+            # relative URL is ok
+            continue
+        if r.scheme != 'https':
+            raise ImageCheckError("image must be accessed via https")
+        if not r.hostname:
+            raise ImageCheckError("missing image hostname")
+        if r.hostname not in allowed_hosts:
+            raise ImageCheckError("invalid image source")
+
+    return True
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/core/tests/test_html.py	Wed Jun 03 21:13:08 2015 -0500
@@ -0,0 +1,68 @@
+"""Tests for the core.html module."""
+import unittest
+
+from core.html import ImageCheckError
+from core.html import image_check
+
+
+TEST_HTML = """
+<p>Posters and Facebook events are starting to come in...</p>
+<p><img src="{src1}" alt="image"></p>
+<p><img src="{src2}" alt="image"></p>
+"""
+
+
+class ImageCheckTestCase(unittest.TestCase):
+    def setUp(self):
+        self.allowed_hosts = ['example.com']
+
+    def test_happy_path(self):
+        url1 = 'https://example.com/1.jpg'
+        url2 = 'https://example.com/2.jpg'
+        html = TEST_HTML.format(src1=url1, src2=url2)
+
+        result = image_check(html, self.allowed_hosts)
+        self.assertTrue(result)
+
+    def test_empty_image(self):
+        url1 = 'https://example.com/1.jpg'
+        url2 = ''
+        html = TEST_HTML.format(src1=url1, src2=url2)
+
+        self.assertRaises(ImageCheckError, image_check, html, self.allowed_hosts)
+
+    def test_relative_ok(self):
+        url1 = 'https://example.com/1.jpg'
+        url2 = '/some/path/2.jpg'
+        html = TEST_HTML.format(src1=url1, src2=url2)
+
+        result = image_check(html, self.allowed_hosts)
+        self.assertTrue(result)
+
+    def test_non_https(self):
+        url1 = 'http://example.com/1.jpg'
+        url2 = 'https://example.com/2.jpg'
+        html = TEST_HTML.format(src1=url1, src2=url2)
+
+        self.assertRaises(ImageCheckError, image_check, html, self.allowed_hosts)
+
+    def test_missing_hostname(self):
+        url1 = 'http:///1.jpg'
+        url2 = 'https://example.com/2.jpg'
+        html = TEST_HTML.format(src1=url1, src2=url2)
+
+        self.assertRaises(ImageCheckError, image_check, html, self.allowed_hosts)
+
+    def test_hostname_not_allowed1(self):
+        url1 = 'https://xxx.example.com/1.jpg'
+        url2 = 'https://example.com/2.jpg'
+        html = TEST_HTML.format(src1=url1, src2=url2)
+
+        self.assertRaises(ImageCheckError, image_check, html, self.allowed_hosts)
+
+    def test_hostname_not_allowed2(self):
+        url1 = 'https://xxx.example.com/1.jpg'
+        url2 = 'https://yyy.example.com/2.jpg'
+        html = TEST_HTML.format(src1=url1, src2=url2)
+
+        self.assertRaises(ImageCheckError, image_check, html, self.allowed_hosts)
--- a/forums/forms.py	Tue May 26 20:40:31 2015 -0500
+++ b/forums/forms.py	Wed Jun 03 21:13:08 2015 -0500
@@ -7,12 +7,15 @@
 from django import forms
 from django.conf import settings
 
+from core.markup import site_markup
 from forums.models import Forum
 from forums.models import Topic
 from forums.models import Post
 from forums.attachments import AttachmentProcessor
 import forums.permissions as perms
 from forums.signals import notify_new_topic, notify_new_post
+from core.html import ImageCheckError
+from core.html import image_check
 
 
 FORUMS_FORM_CSS = {
@@ -117,7 +120,7 @@
             raise forms.ValidationError("This field is required.")
         return data
 
-    def save(self, ip=None):
+    def save(self, ip=None, html=None):
         """
         Creates the new Topic and first Post from the form data and supplied
         arguments.
@@ -133,7 +136,7 @@
                 user=self.user,
                 body=self.cleaned_data['body'],
                 user_ip=ip)
-        post.save()
+        post.save(html=html)
 
         self.attach_proc.save_attachments(post)
 
@@ -143,6 +146,23 @@
         return topic
 
 
+class NewTopicFormS3(NewTopicForm):
+    """Form for ensuring image sources come from a white-listed set of
+    sources.
+    """
+    def clean_body(self):
+        body_data = self.cleaned_data['body']
+        self.body_html = site_markup(body_data)
+        try:
+            image_check(self.body_html)
+        except ImageCheckError as ex:
+            raise forms.ValidationError(str(ex))
+        return body_data
+
+    def save(self, ip=None):
+        return super(NewTopicFormS3, self).save(ip, html=self.body_html)
+
+
 class PostForm(forms.ModelForm):
     """
     Form for editing an existing post or a new, non-quick post.
--- a/forums/models.py	Tue May 26 20:40:31 2015 -0500
+++ b/forums/models.py	Wed Jun 03 21:13:08 2015 -0500
@@ -311,7 +311,9 @@
             self.creation_date = datetime.datetime.now()
             self.update_date = self.creation_date
 
-        self.html = site_markup(self.body)
+        self.html = kwargs.pop('html', None)
+        if self.html is None:
+            self.html = site_markup(self.body)
         super(Post, self).save(*args, **kwargs)
 
     def delete(self, *args, **kwargs):
--- a/forums/views/main.py	Tue May 26 20:40:31 2015 -0500
+++ b/forums/views/main.py	Wed Jun 03 21:13:08 2015 -0500
@@ -20,6 +20,7 @@
 from django.template import RequestContext
 from django.views.decorators.http import require_POST
 from django.db.models import F
+import waffle
 
 import antispam
 import antispam.utils
@@ -30,7 +31,7 @@
 from forums.models import (Forum, Topic, Post, FlaggedPost, TopicLastVisit,
         ForumLastVisit)
 from forums.forms import (NewTopicForm, NewPostForm, PostForm, MoveTopicForm,
-        SplitTopicForm)
+        SplitTopicForm, NewTopicFormS3)
 from forums.unread import (get_forum_unread_status, get_topic_unread_status,
         get_post_unread_status, get_unread_topics)
 
@@ -295,8 +296,11 @@
     if not perms.can_access(forum.category, request.user):
         return HttpResponseForbidden()
 
+    form_class = (NewTopicFormS3 if waffle.flag_is_active(request, 's3_images')
+                                 else NewTopicForm)
+
     if request.method == 'POST':
-        form = NewTopicForm(request.user, forum, request.POST)
+        form = form_class(request.user, forum, request.POST)
         if form.is_valid():
             if antispam.utils.spam_check(request, form.cleaned_data['body']):
                 return HttpResponseRedirect(reverse('antispam-suspended'))
@@ -306,7 +310,7 @@
             return HttpResponseRedirect(reverse('forums-new_topic_thanks',
                                             kwargs={'tid': topic.pk}))
     else:
-        form = NewTopicForm(request.user, forum)
+        form = form_class(request.user, forum)
 
     return render_to_response('forums/new_topic.html', {
         'forum': forum,
--- a/sg101/settings/base.py	Tue May 26 20:40:31 2015 -0500
+++ b/sg101/settings/base.py	Wed Jun 03 21:13:08 2015 -0500
@@ -298,6 +298,14 @@
 USER_PHOTOS_THUMB_SIZE = (120, 120)
 USER_PHOTOS_RATE_LIMIT = (50, 86400)   # number / seconds
 
+USER_IMAGES_SOURCES = [
+    'dl.dropboxusercontent.com',
+    's3.amazonaws.com',
+    's3-us-west-1.amazonaws.com',
+    'scontent-ord1-1.xx.fbcdn.net',
+    'lh3.googleusercontent.com',
+]
+
 # If this flag is False, the queued_search queue will not be processed. This is
 # useful when we are rebuilding the search index.
 SEARCH_QUEUE_ENABLED = True