comparison core/html.py @ 955:71a671dab55d

First commit of whitelisting image hosts. This is behind a feature flag courtesy of waffle.
author Brian Neal <bgneal@gmail.com>
date Wed, 03 Jun 2015 21:13:08 -0500
parents 928b97ec55a7
children
comparison
equal deleted inserted replaced
954:e56455f4626b 955:71a671dab55d
1 """Common HTML related functions""" 1 """Common HTML related functions"""
2 from urlparse import urlparse
3
2 import bleach 4 import bleach
5 from lxml import etree
6
7 from django.conf import settings
3 8
4 9
5 # Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The 10 # Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The
6 # tuple consists of (allowed_tags_list, allowed_attributes_dict, 11 # tuple consists of (allowed_tags_list, allowed_attributes_dict,
7 # allowed_styles_list) 12 # allowed_styles_list)
45 50
46 tags, attrs, styles = _CLEAN_PROFILES[profile] 51 tags, attrs, styles = _CLEAN_PROFILES[profile]
47 52
48 return bleach.clean(text, tags=tags, attributes=attrs, styles=styles, 53 return bleach.clean(text, tags=tags, attributes=attrs, styles=styles,
49 strip=True, strip_comments=True) 54 strip=True, strip_comments=True)
55
56
class ImageCheckError(Exception):
    """Raised by image_check() when an image tag fails validation.

    The exception message names the specific problem found (for example a
    missing src attribute, a non-https scheme, or a host that is not allowed).
    """
59
60
# Default whitelist of image hosts, built once at import time from the
# USER_IMAGES_SOURCES setting. image_check() falls back to this set when the
# caller does not supply its own allowed_hosts container.
ALLOWED_HOSTS = set(settings.USER_IMAGES_SOURCES)
62
63
def image_check(html, allowed_hosts=None):
    """Return True if every image tag in the given html is acceptable.

    An image is acceptable when its src is either a relative URL (no scheme
    and no hostname) or an https URL whose hostname is in allowed_hosts.

    html -- the HTML markup to inspect. Empty or whitespace-only markup
        contains no images and therefore passes.
    allowed_hosts -- optional container of permitted hostnames. If None,
        ALLOWED_HOSTS (built from USER_IMAGES_SOURCES in settings) is used as
        the whitelist. An explicitly empty container is honored as given, so
        every absolute image URL is rejected.

    An ImageCheckError is raised if the following problems are detected:
        * the image src is missing altogether
        * the scheme is missing or not https
        * the hostname is missing
        * the hostname is not in allowed_hosts
    """
    # Compare against None rather than truthiness: an empty whitelist means
    # "allow no external hosts" and must not silently widen to the default.
    if allowed_hosts is None:
        allowed_hosts = ALLOWED_HOSTS

    # Nothing to parse means there are no images to reject.
    if not html or not html.strip():
        return True

    root = etree.HTML(html)
    # NOTE(review): the parser appears able to produce no root element for
    # markup without parsable content; guard so .iter() is never called on
    # None.
    if root is None:
        return True

    for img in root.iter('img'):
        src = img.get('src')
        if not src:
            raise ImageCheckError("missing image source")
        r = urlparse(src)

        if not r.scheme and not r.hostname:
            # relative URL is ok
            continue
        if r.scheme != 'https':
            raise ImageCheckError("image must be accessed via https")
        if not r.hostname:
            raise ImageCheckError("missing image hostname")
        if r.hostname not in allowed_hosts:
            raise ImageCheckError("invalid image source")

    return True