Mercurial > public > sg101
comparison gpp/core/html.py @ 9:b3b11edf91d8
News: removed the lxml stuff. Based on Jacob Kaplan-Moss's suggestion, use html5lib to clean HTML. Added that functionality in a new core.html module.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sun, 12 Apr 2009 02:03:03 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
8:d6f3c38e8f50 | 9:b3b11edf91d8 |
---|---|
1 import html5lib | |
2 from html5lib import sanitizer, treebuilders, treewalkers, serializer | |
3 | |
def sanitizer_factory(*args, **kwargs):
    """Build an html5lib ``HTMLSanitizer``, forwarding all arguments to it.

    ``clean_html`` passes this callable as the parser's ``tokenizer``.
    """
    # Token stripping isn't available yet, so it stays disabled for now:
    # san.strip_tokens = True
    return sanitizer.HTMLSanitizer(*args, **kwargs)
9 | |
def clean_html(buf):
    """Clean an HTML fragment of dangerous tags and content.

    Leading and trailing whitespace is stripped from ``buf`` first. If
    nothing remains, the stripped (empty) value is returned immediately
    and the parser is never invoked. Otherwise the fragment is parsed by
    html5lib using the sanitizing tokenizer, walked as a DOM tree and
    serialized back to markup.
    """
    text = buf.strip()
    if not text:
        return text

    # Parse into a DOM tree, sanitizing at the tokenizer stage.
    dom_builder = treebuilders.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=dom_builder,
                                 tokenizer=sanitizer_factory)
    fragment = parser.parseFragment(text)

    # Walk the tree to obtain the token stream the serializer consumes.
    make_walker = treewalkers.getTreeWalker("dom")
    token_stream = make_walker(fragment)

    html_serializer = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True)
    return html_serializer.render(token_stream)
27 | |
28 # vim: ts=4 sw=4 |