diff core/html.py @ 955:71a671dab55d

First commit of whitelisting image hosts. This is behind a feature flag courtesy of waffle.
author Brian Neal <bgneal@gmail.com>
date Wed, 03 Jun 2015 21:13:08 -0500
parents 928b97ec55a7
children
line wrap: on
line diff
--- a/core/html.py	Tue May 26 20:40:31 2015 -0500
+++ b/core/html.py	Wed Jun 03 21:13:08 2015 -0500
@@ -1,5 +1,10 @@
 """Common HTML related functions"""
+from urlparse import urlparse
+
 import bleach
+from lxml import etree
+
+from django.conf import settings
 
 
 # Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The
@@ -47,3 +52,47 @@
 
     return bleach.clean(text, tags=tags, attributes=attrs, styles=styles,
         strip=True, strip_comments=True)
+
+
+class ImageCheckError(Exception):
+    """Raised by image_check() when an image tag fails validation"""
+
+
+ALLOWED_HOSTS = set(settings.USER_IMAGES_SOURCES)  # image_check() default
+
+
+def image_check(html, allowed_hosts=None):
+    """Returns true if all image tags in the given html come from hosts
+    specified in the allowed_hosts container using https.
+
+    An ImageCheckError is raised if the following problems are detected:
+        * the image src is missing altogether
+        * the scheme is missing or not https
+        * the hostname is missing
+        * the hostname is not in allowed_hosts
+
+    If allowed_hosts is not None, it will be used as the whitelist of allowed
+    hosts. If None, USER_IMAGES_SOURCES from settings will be used as the
+    whitelist.
+    """
+    if not allowed_hosts:
+        allowed_hosts = ALLOWED_HOSTS
+
+    root = etree.HTML(html)
+    for img in root.iter('img'):
+        src = img.get('src')
+        if not src:
+            raise ImageCheckError("missing image source")
+        r = urlparse(src)
+
+        if not r.scheme and not r.hostname:
+            # relative URL is ok
+            continue
+        if r.scheme != 'https':
+            raise ImageCheckError("image must be accessed via https")
+        if not r.hostname:
+            raise ImageCheckError("missing image hostname")
+        if r.hostname not in allowed_hosts:
+            raise ImageCheckError("invalid image source")
+
+    return True