bgneal@849
|
1 """Common HTML related functions"""
|
bgneal@955
|
2 from urlparse import urlparse
|
bgneal@955
|
3
|
bgneal@849
|
4 import bleach
|
bgneal@955
|
5 from lxml import etree
|
bgneal@955
|
6
|
bgneal@955
|
7 from django.conf import settings
|
bgneal@9
|
8
|
bgneal@9
|
9
|
bgneal@849
|
10 # Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The
|
bgneal@849
|
11 # tuple consists of (allowed_tags_list, allowed_attributes_dict,
|
bgneal@849
|
12 # allowed_styles_list)
|
bgneal@849
|
13 #
|
bgneal@849
|
14 _CLEAN_PROFILES = {
|
bgneal@849
|
15 'comments': (
|
bgneal@849
|
16 [
|
bgneal@849
|
17 'a', 'b', 'blockquote', 'br', 'code', 'del', 'em',
|
bgneal@864
|
18 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
|
bgneal@849
|
19 'i', 'img', 'li', 'ol', 'p', 'pre', 'strong', 'ul',
|
bgneal@849
|
20 ],
|
bgneal@849
|
21 {
|
bgneal@849
|
22 'a': ['href'],
|
bgneal@849
|
23 'img': ['src', 'alt', 'title'],
|
bgneal@849
|
24 },
|
bgneal@849
|
25 [],
|
bgneal@849
|
26 ),
|
bgneal@849
|
27 'news': (
|
bgneal@849
|
28 [
|
bgneal@849
|
29 'a', 'b', 'blockquote', 'br', 'caption', 'center', 'code', 'col',
|
bgneal@849
|
30 'colgroup', 'dd', 'del', 'div', 'dl', 'dt', 'em',
|
bgneal@864
|
31 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
|
bgneal@849
|
32 'i', 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike',
|
bgneal@849
|
33 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th',
|
bgneal@849
|
34 'thead', 'tr', 'tt', 'u', 'ul',
|
bgneal@849
|
35 ],
|
bgneal@849
|
36 {
|
bgneal@849
|
37 'a': ['href'],
|
bgneal@849
|
38 'img': ['src', 'alt', 'title', 'width', 'height'],
|
bgneal@849
|
39 },
|
bgneal@849
|
40 [],
|
bgneal@849
|
41 ),
|
bgneal@849
|
42 }
|
bgneal@849
|
43
|
bgneal@849
|
44
|
bgneal@849
|
def clean_html(text, profile='comments'):
    """Sanitize an HTML fragment according to a named cleaning profile.

    Surrounding whitespace is stripped from text first. If nothing remains,
    the empty result is returned immediately and the profile is never looked
    up. Otherwise the (tags, attributes, styles) whitelist stored under
    profile in _CLEAN_PROFILES is passed to bleach.clean() together with
    strip=True and strip_comments=True, and the cleaned markup is returned.

    A profile name that is not a key of _CLEAN_PROFILES raises KeyError
    (for non-empty text only).
    """
    stripped = text.strip()
    if stripped:
        allowed_tags, allowed_attrs, allowed_styles = _CLEAN_PROFILES[profile]
        return bleach.clean(
            stripped,
            tags=allowed_tags,
            attributes=allowed_attrs,
            styles=allowed_styles,
            strip=True,
            strip_comments=True
        )

    # Empty after stripping: hand back the (empty) stripped value untouched.
    return stripped
|
bgneal@955
|
55
|
bgneal@955
|
56
|
bgneal@955
|
class ImageCheckError(Exception):
    """Raised by image_check() when an image tag fails validation."""
|
bgneal@955
|
59
|
bgneal@955
|
60
|
bgneal@955
|
# Default whitelist of image hosts, built once at import time from
# settings.USER_IMAGES_SOURCES. image_check() uses it when the caller does
# not supply a whitelist.
ALLOWED_HOSTS = set(settings.USER_IMAGES_SOURCES)


def image_check(html, allowed_hosts=None):
    """Returns True if every image tag in the given html is acceptable.

    An image is acceptable when its src is either a relative URL (no scheme
    and no hostname) or an https URL whose hostname is in the whitelist.
    Html that is empty, blank, or contains no elements has no images and so
    also returns True.

    An ImageCheckError is raised if the following problems are detected:
        * the image src is missing altogether
        * the src has a scheme or hostname, but the scheme is not https
        * the hostname is missing
        * the hostname is not in allowed_hosts

    If allowed_hosts is not None, it will be used as the whitelist of allowed
    hosts, even when it is empty (in which case every absolute image URL is
    rejected). If None, USER_IMAGES_SOURCES from settings will be used as the
    whitelist.
    """
    # Compare against None explicitly: an empty whitelist supplied by the
    # caller must not be silently widened to the site-wide default.
    if allowed_hosts is None:
        allowed_hosts = ALLOWED_HOSTS

    # Nothing to parse means there are no images to reject.
    if not html or not html.strip():
        return True

    root = etree.HTML(html)
    if root is None:
        # The parser produced no root element, so there are no img tags.
        return True

    for img in root.iter('img'):
        src = img.get('src')
        if not src:
            raise ImageCheckError("missing image source")
        r = urlparse(src)

        if not r.scheme and not r.hostname:
            # relative URL is ok
            continue
        # From here on the URL is absolute or protocol-relative; the latter
        # has an empty scheme and is rejected by the https check.
        if r.scheme != 'https':
            raise ImageCheckError("image must be accessed via https")
        if not r.hostname:
            raise ImageCheckError("missing image hostname")
        if r.hostname not in allowed_hosts:
            raise ImageCheckError("invalid image source")

    return True
|