Mercurial > public > sg101
diff core/html.py @ 955:71a671dab55d
First commit of whitelisting image hosts.
This is behind a feature flag courtesy of waffle.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Wed, 03 Jun 2015 21:13:08 -0500 |
parents | 928b97ec55a7 |
children |
line wrap: on
line diff
--- a/core/html.py	Tue May 26 20:40:31 2015 -0500
+++ b/core/html.py	Wed Jun 03 21:13:08 2015 -0500
@@ -1,5 +1,10 @@
 """Common HTML related functions"""
+from urlparse import urlparse
+
 import bleach
+from lxml import etree
+
+from django.conf import settings
 
 
 # Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The
@@ -47,3 +52,47 @@
 
     return bleach.clean(text, tags=tags, attributes=attrs,
             styles=styles, strip=True, strip_comments=True)
+
+
+class ImageCheckError(Exception):
+    """Exception for the image_check() function"""
+
+
+ALLOWED_HOSTS = set(settings.USER_IMAGES_SOURCES)
+
+
+def image_check(html, allowed_hosts=None):
+    """Returns true if all image tags in the given html come from hosts
+    specified in the allowed_hosts container using https.
+
+    An ImageCheckError is raised if the following problems are detected:
+        * the image src is missing altogether
+        * the scheme is missing or not https
+        * the hostname is missing
+        * the hostname is not in allowed_hosts
+
+    If allowed_hosts is not None, it will be used as the whitelist of allowed
+    hosts. If None, USER_IMAGES_SOURCES from settings will be used as the
+    whitelist.
+    """
+    if not allowed_hosts:
+        allowed_hosts = ALLOWED_HOSTS
+
+    root = etree.HTML(html)
+    for img in root.iter('img'):
+        src = img.get('src')
+        if not src:
+            raise ImageCheckError("missing image source")
+        r = urlparse(src)
+
+        if not r.scheme and not r.hostname:
+            # relative URL is ok
+            continue
+        if r.scheme != 'https':
+            raise ImageCheckError("image must be accessed via https")
+        if not r.hostname:
+            raise ImageCheckError("missing image hostname")
+        if r.hostname not in allowed_hosts:
+            raise ImageCheckError("invalid image source")
+
+    return True