bgneal@9
|
1 import html5lib
|
bgneal@9
|
2 from html5lib import sanitizer, treebuilders, treewalkers, serializer
|
bgneal@9
|
3
|
bgneal@9
|
def sanitizer_factory(*args, **kwargs):
    """Create an html5lib ``HTMLSanitizer``, forwarding all arguments.

    Every positional and keyword argument is handed through unchanged to
    ``sanitizer.HTMLSanitizer``; the new instance is returned.
    """
    # NOTE: setting ``strip_tokens = True`` on the sanitizer isn't
    # available yet, so the instance is returned as constructed.
    return sanitizer.HTMLSanitizer(*args, **kwargs)
|
bgneal@9
|
9
|
bgneal@9
|
def clean_html(buf):
    """Cleans HTML of dangerous tags and content.

    The markup is parsed as a fragment by html5lib, using the tokenizer
    produced by ``sanitizer_factory`` and a DOM tree builder, and the
    resulting tree is then walked and serialized back to markup.

    Args:
        buf: The HTML text to clean. ``None`` is tolerated.

    Returns:
        The serialized markup produced from the sanitized parse. If
        ``buf`` is ``None``, ``None`` is returned. If ``buf`` is empty or
        contains only whitespace, the stripped (empty) string is returned
        without running the parser.
    """
    # None has no strip(); return it untouched instead of raising
    # AttributeError so callers may pass an unset value straight through.
    if buf is None:
        return None

    buf = buf.strip()
    if not buf:
        # Nothing to parse; skip the parser/serializer round trip.
        return buf

    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                                 tokenizer=sanitizer_factory)
    dom_tree = parser.parseFragment(buf)

    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    # Emit every tag explicitly and always quote attribute values.
    s = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True)
    return s.render(stream)
|
bgneal@9
|
27
|
bgneal@9
|
28 # vim: ts=4 sw=4
|