Mercurial > public > sg101
diff core/html.py @ 849:ff645a692791
For issue #79, use bleach to sanitize both user input markdown & html.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Thu, 30 Oct 2014 19:30:37 -0500 |
parents | ee87ea74d46b |
children | 928b97ec55a7 |
line wrap: on
line diff
--- a/core/html.py Tue Oct 28 19:33:14 2014 -0500
+++ b/core/html.py Thu Oct 30 19:30:37 2014 -0500
@@ -1,28 +1,49 @@
-import html5lib
-from html5lib import sanitizer, treebuilders, treewalkers, serializer
+"""Common HTML related functions"""
+import bleach
 
-def sanitizer_factory(*args, **kwargs):
-    san = sanitizer.HTMLSanitizer(*args, **kwargs)
-    # This isn't available yet
-    # san.strip_tokens = True
-    return san
 
-def clean_html(buf):
+# Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The
+# tuple consists of (allowed_tags_list, allowed_attributes_dict,
+# allowed_styles_list)
+#
+_CLEAN_PROFILES = {
+    'comments': (
+        [
+            'a', 'b', 'blockquote', 'br', 'code', 'del', 'em',
+            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+            'i', 'img', 'li', 'ol', 'p', 'pre', 'strong', 'ul',
+        ],
+        {
+            'a': ['href'],
+            'img': ['src', 'alt', 'title'],
+        },
+        [],
+    ),
+    'news': (
+        [
+            'a', 'b', 'blockquote', 'br', 'caption', 'center', 'code', 'col',
+            'colgroup', 'dd', 'del', 'div', 'dl', 'dt', 'em',
+            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+            'i', 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike',
+            'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th',
+            'thead', 'tr', 'tt', 'u', 'ul',
+        ],
+        {
+            'a': ['href'],
+            'img': ['src', 'alt', 'title', 'width', 'height'],
+        },
+        [],
+    ),
+}
+
+
+def clean_html(text, profile='comments'):
     """Cleans HTML of dangerous tags and content."""
-    buf = buf.strip()
-    if not buf:
-        return buf
+    text = text.strip()
+    if not text:
+        return text
 
-    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
-                            tokenizer=sanitizer_factory)
-    dom_tree = p.parseFragment(buf)
+    tags, attrs, styles = _CLEAN_PROFILES[profile]
 
-    walker = treewalkers.getTreeWalker("dom")
-    stream = walker(dom_tree)
-
-    s = serializer.htmlserializer.HTMLSerializer(
-        omit_optional_tags=False,
-        quote_attr_values=True)
-    return s.render(stream)
-
-# vim: ts=4 sw=4
+    return bleach.clean(text, tags=tags, attributes=attrs, styles=styles,
+                        strip=True, strip_comments=True)