changeset 963:4619290d171d

Whitelist hot-linked image sources.
author Brian Neal <bgneal@gmail.com>
date Tue, 01 Sep 2015 20:33:40 -0500
parents 10e7570a3aab
children 51a2051588f5
files comments/forms.py comments/models.py comments/views.py core/html.py core/management/commands/ssl_images.py core/tests/test_ssl_images.py forums/forms.py forums/models.py forums/views/main.py sg101/settings/base.py
diffstat 10 files changed, 161 insertions(+), 35 deletions(-) [+]
line wrap: on
line diff
--- a/comments/forms.py	Tue Aug 04 16:58:17 2015 -0500
+++ b/comments/forms.py	Tue Sep 01 20:33:40 2015 -0500
@@ -7,6 +7,10 @@
 from django.contrib.contenttypes.models import ContentType
 
 from comments.models import Comment
+from core.html import ImageCheckError
+from core.html import image_check
+from core.markup import site_markup
+
 
 COMMENT_MAX_LENGTH = getattr(settings, 'COMMENT_MAX_LENGTH', 3000)
 
@@ -64,6 +68,18 @@
 
         return new
 
+    def clean_comment(self):
+        comment = self.cleaned_data['comment']
+        self.comment_html = None
+        if comment:
+            self.comment_html = site_markup(comment)
+            try:
+                image_check(self.comment_html)
+            except ImageCheckError as ex:
+                raise forms.ValidationError(str(ex))
+
+        return comment
+
     class Media:
         css = {
             'all': (settings.GPP_THIRD_PARTY_CSS['markitup'] +
--- a/comments/models.py	Tue Aug 04 16:58:17 2015 -0500
+++ b/comments/models.py	Tue Sep 01 20:33:40 2015 -0500
@@ -57,7 +57,9 @@
         if not self.id:
             self.creation_date = datetime.datetime.now()
 
-        self.html = site_markup(self.comment)
+        self.html = kwargs.pop('html', None)
+        if self.html is None:
+            self.html = site_markup(self.comment)
         super(Comment, self).save(*args, **kwargs)
 
     def get_absolute_url(self):
--- a/comments/views.py	Tue Aug 04 16:58:17 2015 -0500
+++ b/comments/views.py	Tue Sep 01 20:33:40 2015 -0500
@@ -67,7 +67,12 @@
 
     form = CommentForm(target, request.POST)
     if not form.is_valid():
-        return HttpResponseBadRequest('Invalid comment; missing parameters?')
+        # The client side javascript is pretty simplistic right now and we don't
+        # want to change it yet. It is expecting a single error string. Just grab
+        # the first error message and use that.
+        errors = form.errors.as_data()
+        msg = errors.values()[0][0].message if errors else 'Unknown error'
+        return HttpResponseBadRequest(msg)
 
     comment = form.get_comment_object(request.user, request.META.get("REMOTE_ADDR", None))
 
@@ -76,7 +81,7 @@
     if antispam.utils.spam_check(request, comment.comment):
         return HttpResponseForbidden(antispam.BUSTED_MESSAGE)
 
-    comment.save()
+    comment.save(html=form.comment_html)
 
     # return the rendered comment
     return render_to_response('comments/comment.html', {
--- a/core/html.py	Tue Aug 04 16:58:17 2015 -0500
+++ b/core/html.py	Tue Sep 01 20:33:40 2015 -0500
@@ -1,5 +1,10 @@
 """Common HTML related functions"""
+from urlparse import urlparse
+
 import bleach
+from lxml import etree
+
+from django.conf import settings
 
 
 # Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The
@@ -47,3 +52,47 @@
 
     return bleach.clean(text, tags=tags, attributes=attrs, styles=styles,
         strip=True, strip_comments=True)
+
+
+class ImageCheckError(Exception):
+    """Exception for the image_check() function"""
+
+
+ALLOWED_HOSTS = set(settings.USER_IMAGES_SOURCES)
+
+
+def image_check(html, allowed_hosts=None):
+    """Returns true if all image tags in the given html come from hosts
+    specified in the allowed_hosts container using https.
+
+    An ImageCheckError is raised if the following problems are detected:
+        * the image src is missing altogether
+        * the scheme is missing or not https
+        * the hostname is missing
+        * the hostname is not in allowed_hosts
+
+    If allowed_hosts is not None, it will be used as the whitelist of allowed
+    hosts. If None, USER_IMAGES_SOURCES from settings will be used as the
+    whitelist.
+    """
+    if not allowed_hosts:
+        allowed_hosts = ALLOWED_HOSTS
+
+    root = etree.HTML(html)
+    for img in root.iter('img'):
+        src = img.get('src')
+        if not src:
+            raise ImageCheckError("Missing image source")
+        r = urlparse(src)
+
+        if not r.scheme and not r.hostname:
+            # relative URL is ok
+            continue
+        if r.scheme != 'https':
+            raise ImageCheckError("Image must be accessed via https")
+        if not r.hostname:
+            raise ImageCheckError("Missing image hostname")
+        if r.hostname not in allowed_hosts:
+            raise ImageCheckError("Invalid image source")
+
+    return True
--- a/core/management/commands/ssl_images.py	Tue Aug 04 16:58:17 2015 -0500
+++ b/core/management/commands/ssl_images.py	Tue Sep 01 20:33:40 2015 -0500
@@ -39,6 +39,7 @@
                           re.DOTALL | re.UNICODE)
 
 SG101_HOSTS = set(['www.surfguitar101.com', 'surfguitar101.com'])
+WHITELIST_HOSTS = set(settings.USER_IMAGES_SOURCES)
 MODEL_CHOICES = ['comments', 'posts']
 
 PHOTO_MAX_SIZE = (660, 720)
@@ -224,7 +225,10 @@
             # Try a few things to get this on ssl:
             new_src = convert_to_ssl(r)
         elif r.scheme == 'https':
-            new_src = src       # already https, accept it as-is
+            if r.hostname in WHITELIST_HOSTS:
+                new_src = src   # already in whitelist
+            else:
+                new_src = convert_to_ssl(r)
 
     if new_src:
         if title:
--- a/core/tests/test_ssl_images.py	Tue Aug 04 16:58:17 2015 -0500
+++ b/core/tests/test_ssl_images.py	Tue Sep 01 20:33:40 2015 -0500
@@ -4,6 +4,7 @@
 from urlparse import urlparse
 
 import mock
+from django.conf import settings
 
 from core.management.commands.ssl_images import html_check
 from core.management.commands.ssl_images import process_post
@@ -14,6 +15,10 @@
 
     SG101_RE = re.compile(r'http://(?:www\.)?surfguitar101.com/', re.I)
 
+    def setUp(self):
+        self.assertTrue(len(settings.USER_IMAGES_SOURCES) > 0)
+        self.safe_host = settings.USER_IMAGES_SOURCES[0]
+
     def tearDown(self):
         core.management.commands.ssl_images.url_cache = {}
 
@@ -50,9 +55,9 @@
 
     def test_https_already(self):
         test_str = """An image that is already using https:
-            ![flyer](https://example.com/zzz.png)
+            ![flyer](https://{}/zzz.png)
             It's cool.
-            """
+            """.format(self.safe_host)
         result = process_post(test_str)
         self.assertEqual(test_str, result)
 
@@ -70,19 +75,19 @@
 
     def test_multiple_non_http(self):
         test_str = """An image: ![image](http://www.surfguitar101.com/img.jpg)
-        And another: ![pic](HTTPS://example.com/foo/bar/img.png).
-        More stuff here."""
+        And another: ![pic](HTTPS://{}/foo/bar/img.png).
+        More stuff here.""".format(self.safe_host)
         expected = """An image: ![image](/img.jpg)
-        And another: ![pic](HTTPS://example.com/foo/bar/img.png).
-        More stuff here."""
+        And another: ![pic](HTTPS://{}/foo/bar/img.png).
+        More stuff here.""".format(self.safe_host)
         result = process_post(test_str)
         self.assertEqual(expected, result)
 
     def test_https_already_with_title(self):
         test_str = """An image that is already using https:
-            ![flyer](https://example.com/zzz.png "the title")
+            ![flyer](https://{}/zzz.png "the title")
             It's cool.
-            """
+            """.format(self.safe_host)
         result = process_post(test_str)
         self.assertEqual(test_str, result)
 
@@ -112,13 +117,13 @@
 
     def test_https_already_brackets(self):
         test_str = """An image that is already using https:
-            ![flyer](<https://example.com/zzz.png>)
+            ![flyer](<https://{}/zzz.png>)
             It's cool.
-            """
+            """.format(self.safe_host)
         expected = """An image that is already using https:
-            ![flyer](https://example.com/zzz.png)
+            ![flyer](https://{}/zzz.png)
             It's cool.
-            """
+            """.format(self.safe_host)
         result = process_post(test_str)
         self.assertEqual(expected, result)
 
@@ -172,13 +177,13 @@
     def test_multiple_replacement_2(self, upload_mock):
         old_src = [
             'http://example.com/images/my_image.jpg',
-            'https://example.com/static/wow.gif',
+            'https://{}/static/wow.gif'.format(self.safe_host),
             'http://www.surfguitar101.com/media/a/b/c/pic.png',
             'http://surfguitar101.com/media/a/b/c/pic2.png',
         ]
         new_src = [
             'https://cloud.com/some/path/012345.jpg',
-            'https://example.com/static/wow.gif',
+            'https://{}/static/wow.gif'.format(self.safe_host),
             '/media/a/b/c/pic.png',
             '/media/a/b/c/pic2.png',
         ]
--- a/forums/forms.py	Tue Aug 04 16:58:17 2015 -0500
+++ b/forums/forms.py	Tue Sep 01 20:33:40 2015 -0500
@@ -13,6 +13,9 @@
 from forums.attachments import AttachmentProcessor
 import forums.permissions as perms
 from forums.signals import notify_new_topic, notify_new_post
+from core.html import ImageCheckError
+from core.html import image_check
+from core.markup import site_markup
 
 
 FORUMS_FORM_CSS = {
@@ -44,17 +47,26 @@
         self.attach_proc = AttachmentProcessor(attachments)
 
     def clean_body(self):
-        data = self.cleaned_data['body']
-        if not data and not self.attach_proc.has_attachments():
-            raise forms.ValidationError("This field is required.")
-        return data
+        body = self.cleaned_data['body']
+        self.body_html = None
+        if not body and not self.attach_proc.has_attachments():
+            raise forms.ValidationError("Please enter some text")
+
+        if body:
+            self.body_html = site_markup(body)
+            try:
+                image_check(self.body_html)
+            except ImageCheckError as ex:
+                raise forms.ValidationError(str(ex))
+
+        return body
 
     def clean_topic_id(self):
         id = self.cleaned_data['topic_id']
         try:
             self.topic = Topic.objects.select_related().get(pk=id)
         except Topic.DoesNotExist:
-            raise forms.ValidationError('invalid topic')
+            raise forms.ValidationError('Invalid topic')
         return id
 
     def save(self, user, ip=None):
@@ -63,7 +75,7 @@
         """
         post = Post(topic=self.topic, user=user, body=self.cleaned_data['body'],
                 user_ip=ip)
-        post.save()
+        post.save(html=self.body_html)
         self.attach_proc.save_attachments(post)
         notify_new_post(post)
         return post
@@ -112,10 +124,19 @@
                     choices=[(v, v) for v in pks])
 
     def clean_body(self):
-        data = self.cleaned_data['body']
-        if not data and not self.attach_proc.has_attachments():
+        body = self.cleaned_data['body']
+        self.body_html = None
+        if not body and not self.attach_proc.has_attachments():
             raise forms.ValidationError("This field is required.")
-        return data
+
+        if body:
+            self.body_html = site_markup(body)
+            try:
+                image_check(self.body_html)
+            except ImageCheckError as ex:
+                raise forms.ValidationError(str(ex))
+
+        return body
 
     def save(self, ip=None):
         """
@@ -133,7 +154,7 @@
                 user=self.user,
                 body=self.cleaned_data['body'],
                 user_ip=ip)
-        post.save()
+        post.save(html=self.body_html)
 
         self.attach_proc.save_attachments(post)
 
@@ -189,10 +210,19 @@
                         widget=forms.HiddenInput(attrs={'value': post.id}))
 
     def clean_body(self):
-        data = self.cleaned_data['body']
-        if not data and not self.attach_proc.has_attachments():
+        body = self.cleaned_data['body']
+        self.body_html = None
+        if not body and not self.attach_proc.has_attachments():
             raise forms.ValidationError('This field is required.')
-        return data
+
+        if body:
+            self.body_html = site_markup(body)
+            try:
+                image_check(self.body_html)
+            except ImageCheckError as ex:
+                raise forms.ValidationError(str(ex))
+
+        return body
 
     def save(self, *args, **kwargs):
         commit = kwargs.get('commit', False)
--- a/forums/models.py	Tue Aug 04 16:58:17 2015 -0500
+++ b/forums/models.py	Tue Sep 01 20:33:40 2015 -0500
@@ -311,7 +311,9 @@
             self.creation_date = datetime.datetime.now()
             self.update_date = self.creation_date
 
-        self.html = site_markup(self.body)
+        self.html = kwargs.pop('html', None)
+        if self.html is None:
+            self.html = site_markup(self.body)
         super(Post, self).save(*args, **kwargs)
 
     def delete(self, *args, **kwargs):
--- a/forums/views/main.py	Tue Aug 04 16:58:17 2015 -0500
+++ b/forums/views/main.py	Tue Sep 01 20:33:40 2015 -0500
@@ -359,7 +359,12 @@
             },
             context_instance=RequestContext(request))
 
-    return HttpResponseBadRequest("Oops, did you forget some text?");
+    # The client side javascript is pretty simplistic right now and we don't
+    # want to change it yet. It is expecting a single error string. Just grab
+    # the first error message and use that.
+    errors = form.errors.as_data()
+    msg = errors.values()[0][0].message if errors else 'Unknown error'
+    return HttpResponseBadRequest(msg)
 
 
 def _goto_post(post):
@@ -439,7 +444,7 @@
                 return HttpResponseRedirect(reverse('antispam-suspended'))
             post = form.save(commit=False)
             post.touch()
-            post.save()
+            post.save(html=form.body_html)
             notify_updated_post(post)
 
             # if we are editing a first post, save the parent topic as well
@@ -589,7 +594,7 @@
                 post.topic = topic
                 post.user = request.user
                 post.user_ip = request.META.get("REMOTE_ADDR", "")
-                post.save()
+                post.save(html=form.body_html)
                 notify_new_post(post)
 
                 # Save any attachments
--- a/sg101/settings/base.py	Tue Aug 04 16:58:17 2015 -0500
+++ b/sg101/settings/base.py	Tue Sep 01 20:33:40 2015 -0500
@@ -296,6 +296,14 @@
 USER_PHOTOS_THUMB_SIZE = (120, 120)
 USER_PHOTOS_RATE_LIMIT = (50, 86400)   # number / seconds
 
+USER_IMAGES_SOURCES = [
+    'dl.dropboxusercontent.com',
+    's3.amazonaws.com',
+    's3-us-west-1.amazonaws.com',
+    'scontent-ord1-1.xx.fbcdn.net',
+    'lh3.googleusercontent.com',
+]
+
 # If this flag is False, the queued_search queue will not be processed. This is
 # useful when we are rebuilding the search index.
 SEARCH_QUEUE_ENABLED = True