Mercurial > public > sg101
comparison core/html.py @ 955:71a671dab55d
First commit of whitelisting image hosts.
This is behind a feature flag courtesy of waffle.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Wed, 03 Jun 2015 21:13:08 -0500 |
parents | 928b97ec55a7 |
children |
comparison
equal
deleted
inserted
replaced
954:e56455f4626b | 955:71a671dab55d |
---|---|
1 """Common HTML related functions""" | 1 """Common HTML related functions""" |
2 from urlparse import urlparse | |
3 | |
2 import bleach | 4 import bleach |
5 from lxml import etree | |
6 | |
7 from django.conf import settings | |
3 | 8 |
4 | 9 |
5 # Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The | 10 # Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The |
6 # tuple consists of (allowed_tags_list, allowed_attributes_dict, | 11 # tuple consists of (allowed_tags_list, allowed_attributes_dict, |
7 # allowed_styles_list) | 12 # allowed_styles_list) |
45 | 50 |
46 tags, attrs, styles = _CLEAN_PROFILES[profile] | 51 tags, attrs, styles = _CLEAN_PROFILES[profile] |
47 | 52 |
48 return bleach.clean(text, tags=tags, attributes=attrs, styles=styles, | 53 return bleach.clean(text, tags=tags, attributes=attrs, styles=styles, |
49 strip=True, strip_comments=True) | 54 strip=True, strip_comments=True) |
55 | |
56 | |
class ImageCheckError(Exception):
    """Raised by image_check() when an image tag fails validation."""
59 | |
60 | |
# Default whitelist of image hosts, built once at import time from the
# USER_IMAGES_SOURCES setting; a set gives constant-time membership tests.
ALLOWED_HOSTS = set(settings.USER_IMAGES_SOURCES)
62 | |
63 | |
def image_check(html, allowed_hosts=None):
    """Returns True if all image tags in the given html come from hosts
    specified in the allowed_hosts container using https.

    Image tags whose src is a relative URL (no scheme and no hostname) are
    accepted without consulting the whitelist.

    An ImageCheckError is raised if the following problems are detected:
        * the image src is missing altogether (or is only whitespace)
        * the image src cannot be parsed as a URL
        * the scheme is missing or not https
        * the hostname is missing
        * the hostname is not in allowed_hosts

    If allowed_hosts is not None, it will be used as the whitelist of allowed
    hosts; an explicitly empty container therefore allows no external hosts.
    If None, the module-level ALLOWED_HOSTS (built from USER_IMAGES_SOURCES in
    settings) will be used as the whitelist.
    """
    # Test against None rather than truthiness so that a caller passing an
    # empty whitelist really gets "no hosts allowed" instead of the default.
    if allowed_hosts is None:
        allowed_hosts = ALLOWED_HOSTS

    # An empty or whitespace-only document contains no images; short-circuit
    # rather than depend on how the parser treats empty input.
    if not html or not html.strip():
        return True

    root = etree.HTML(html)
    if root is None:
        # The parser produced no tree, so there is nothing to check.
        return True

    for img in root.iter('img'):
        src = img.get('src')
        if src:
            # Browsers ignore surrounding whitespace and embedded
            # tabs/newlines in URLs. Normalise the value first so something
            # like " http://host/x.png" cannot pass as a relative URL.
            src = src.strip()
            for ch in '\t\n\r':
                src = src.replace(ch, '')
        if not src:
            raise ImageCheckError("missing image source")

        try:
            r = urlparse(src)
            hostname = r.hostname
        except ValueError:
            # urlparse rejects some malformed netlocs; report them through
            # the exception type callers of this function expect.
            raise ImageCheckError("invalid image source")

        if not r.scheme and not hostname:
            # relative URL is ok
            continue
        if r.scheme != 'https':
            raise ImageCheckError("image must be accessed via https")
        if not hostname:
            raise ImageCheckError("missing image hostname")
        if hostname not in allowed_hosts:
            raise ImageCheckError("invalid image source")

    return True