Mercurial > public > sg101
comparison core/html.py @ 849:ff645a692791
For issue #79, use bleach to sanitize both user input markdown & html.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Thu, 30 Oct 2014 19:30:37 -0500 |
parents | ee87ea74d46b |
children | 928b97ec55a7 |
comparison
equal
deleted
inserted
replaced
848:32ebe22f0cad | 849:ff645a692791 |
---|---|
1 import html5lib | 1 """Common HTML related functions""" |
2 from html5lib import sanitizer, treebuilders, treewalkers, serializer | 2 import bleach |
3 | 3 |
4 def sanitizer_factory(*args, **kwargs): | |
5 san = sanitizer.HTMLSanitizer(*args, **kwargs) | |
6 # This isn't available yet | |
7 # san.strip_tokens = True | |
8 return san | |
9 | 4 |
10 def clean_html(buf): | 5 # Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The |
| | 6 # tuple consists of (allowed_tags_list, allowed_attributes_dict, |
| | 7 # allowed_styles_list) |
| | 8 # |
| | 9 _CLEAN_PROFILES = { |
| | 10 'comments': ( |
| | 11 [ |
| | 12 'a', 'b', 'blockquote', 'br', 'code', 'del', 'em', |
| | 13 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', |
| | 14 'i', 'img', 'li', 'ol', 'p', 'pre', 'strong', 'ul', |
| | 15 ], |
| | 16 { |
| | 17 'a': ['href'], |
| | 18 'img': ['src', 'alt', 'title'], |
| | 19 }, |
| | 20 [], |
| | 21 ), |
| | 22 'news': ( |
| | 23 [ |
| | 24 'a', 'b', 'blockquote', 'br', 'caption', 'center', 'code', 'col', |
| | 25 'colgroup', 'dd', 'del', 'div', 'dl', 'dt', 'em', |
| | 26 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', |
| | 27 'i', 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike', |
| | 28 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', |
| | 29 'thead', 'tr', 'tt', 'u', 'ul', |
| | 30 ], |
| | 31 { |
| | 32 'a': ['href'], |
| | 33 'img': ['src', 'alt', 'title', 'width', 'height'], |
| | 34 }, |
| | 35 [], |
| | 36 ), |
| | 37 } |
| | 38 |
| | 39 |
| | 40 def clean_html(text, profile='comments'): |
11 """Cleans HTML of dangerous tags and content.""" | 41 """Cleans HTML of dangerous tags and content.""" |
12 buf = buf.strip() | 42 text = text.strip() |
13 if not buf: | 43 if not text: |
14 return buf | 44 return text |
15 | 45 |
16 p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"), | 46 tags, attrs, styles = _CLEAN_PROFILES[profile] |
17 tokenizer=sanitizer_factory) | |
18 dom_tree = p.parseFragment(buf) | |
19 | 47 |
20 walker = treewalkers.getTreeWalker("dom") | 48 return bleach.clean(text, tags=tags, attributes=attrs, styles=styles, |
21 stream = walker(dom_tree) | 49 strip=True, strip_comments=True) |
22 | |
23 s = serializer.htmlserializer.HTMLSerializer( | |
24 omit_optional_tags=False, | |
25 quote_attr_values=True) | |
26 return s.render(stream) | |
27 | |
28 # vim: ts=4 sw=4 |