comparison gpp/core/html.py @ 9:b3b11edf91d8

News: removed the lxml stuff. Based on Jacob Kaplan-Moss's suggestion, use html5lib to clean HTML. Added that functionality in a new core.html module.
author Brian Neal <bgneal@gmail.com>
date Sun, 12 Apr 2009 02:03:03 +0000
parents
children
comparison
equal deleted inserted replaced
8:d6f3c38e8f50 9:b3b11edf91d8
1 import html5lib
2 from html5lib import sanitizer, treebuilders, treewalkers, serializer
3
def sanitizer_factory(*args, **kwargs):
    """Build and return an html5lib ``HTMLSanitizer``.

    Every positional and keyword argument is forwarded unchanged to the
    ``sanitizer.HTMLSanitizer`` constructor. ``clean_html`` hands this
    function to ``html5lib.HTMLParser`` as its ``tokenizer`` argument.
    """
    # NOTE: html5lib does not offer ``strip_tokens`` yet. Once it does, set
    # ``strip_tokens = True`` on the sanitizer before returning it.
    return sanitizer.HTMLSanitizer(*args, **kwargs)
9
def clean_html(buf):
    """Clean HTML of dangerous tags and content.

    Leading and trailing whitespace is stripped from ``buf`` first. If
    nothing is left, the (empty) stripped string is returned and no
    parsing happens. Otherwise the text is parsed as an HTML fragment into
    a DOM tree, using ``sanitizer_factory`` as the parser's tokenizer, and
    the tree is serialized back to markup.

    Returns the serialized markup produced by the html5lib serializer.
    """
    text = buf.strip()
    if text:
        # Parse through the sanitizing tokenizer into a DOM tree.
        dom_builder = treebuilders.getTreeBuilder("dom")
        parser = html5lib.HTMLParser(tree=dom_builder,
                                     tokenizer=sanitizer_factory)
        fragment = parser.parseFragment(text)

        # Walk the DOM fragment to get a stream the serializer can render.
        make_walker = treewalkers.getTreeWalker("dom")
        token_stream = make_walker(fragment)

        # Keep optional tags and always quote attribute values on output.
        renderer = serializer.htmlserializer.HTMLSerializer(
            omit_optional_tags=False,
            quote_attr_values=True)
        return renderer.render(token_stream)

    # Nothing but whitespace was supplied: return the empty stripped string.
    return text
27
28 # vim: ts=4 sw=4