comparison core/html.py @ 849:ff645a692791

For issue #79, use bleach to sanitize both user input markdown & html.
author Brian Neal <bgneal@gmail.com>
date Thu, 30 Oct 2014 19:30:37 -0500
parents ee87ea74d46b
children 928b97ec55a7
comparison
equal deleted inserted replaced
848:32ebe22f0cad 849:ff645a692791
1 import html5lib 1 """Common HTML related functions"""
2 from html5lib import sanitizer, treebuilders, treewalkers, serializer 2 import bleach
3 3
4 def sanitizer_factory(*args, **kwargs):
5 san = sanitizer.HTMLSanitizer(*args, **kwargs)
6 # This isn't available yet
7 # san.strip_tokens = True
8 return san
9 4
10 def clean_html(buf): 5 # Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The
6 # tuple consists of (allowed_tags_list, allowed_attributes_dict,
7 # allowed_styles_list)
8 #
9 _CLEAN_PROFILES = {
10 'comments': (
11 [
12 'a', 'b', 'blockquote', 'br', 'code', 'del', 'em',
13 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
14 'i', 'img', 'li', 'ol', 'p', 'pre', 'strong', 'ul',
15 ],
16 {
17 'a': ['href'],
18 'img': ['src', 'alt', 'title'],
19 },
20 [],
21 ),
22 'news': (
23 [
24 'a', 'b', 'blockquote', 'br', 'caption', 'center', 'code', 'col',
25 'colgroup', 'dd', 'del', 'div', 'dl', 'dt', 'em',
26 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
27 'i', 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike',
28 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th',
29 'thead', 'tr', 'tt', 'u', 'ul',
30 ],
31 {
32 'a': ['href'],
33 'img': ['src', 'alt', 'title', 'width', 'height'],
34 },
35 [],
36 ),
37 }
38
39
40 def clean_html(text, profile='comments'):
11 """Cleans HTML of dangerous tags and content.""" 41 """Cleans HTML of dangerous tags and content."""
12 buf = buf.strip() 42 text = text.strip()
13 if not buf: 43 if not text:
14 return buf 44 return text
15 45
16 p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"), 46 tags, attrs, styles = _CLEAN_PROFILES[profile]
17 tokenizer=sanitizer_factory)
18 dom_tree = p.parseFragment(buf)
19 47
20 walker = treewalkers.getTreeWalker("dom") 48 return bleach.clean(text, tags=tags, attributes=attrs, styles=styles,
21 stream = walker(dom_tree) 49 strip=True, strip_comments=True)
22
23 s = serializer.htmlserializer.HTMLSerializer(
24 omit_optional_tags=False,
25 quote_attr_values=True)
26 return s.render(stream)
27
28 # vim: ts=4 sw=4