Mercurial > public > sg101
annotate gpp/core/html.py @ 469:3b30286adba5
Smarter search index updating for forums. This work is for #227.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Wed, 17 Aug 2011 01:02:08 +0000 |
parents | b3b11edf91d8 |
children |
rev | line source |
---|---|
bgneal@9 | 1 import html5lib |
bgneal@9 | 2 from html5lib import sanitizer, treebuilders, treewalkers, serializer |
bgneal@9 | 3 |
def sanitizer_factory(*args, **kwargs):
    """Create an html5lib ``HTMLSanitizer``.

    All positional and keyword arguments are forwarded unchanged to the
    ``HTMLSanitizer`` constructor, and the new instance is returned.
    """
    # NOTE: setting ``strip_tokens = True`` on the sanitizer was intended
    # here, but that option isn't available yet.
    return sanitizer.HTMLSanitizer(*args, **kwargs)
bgneal@9 | 9 |
def clean_html(buf):
    """Clean an HTML fragment of dangerous tags and content.

    Surrounding whitespace is stripped from ``buf`` first. If nothing is
    left, the (empty) stripped value is returned without any parsing.
    Otherwise the fragment is parsed into a DOM tree through the sanitizing
    tokenizer built by ``sanitizer_factory`` and serialized back to markup.
    """
    text = buf.strip()
    if not text:
        # Nothing to sanitize; skip the parser entirely.
        return text

    # Parse the fragment into a DOM tree, sanitizing tokens along the way.
    dom_builder = treebuilders.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=dom_builder,
                                 tokenizer=sanitizer_factory)
    fragment = parser.parseFragment(text)

    # Walk the sanitized tree and render it back out as HTML, keeping
    # optional tags and quoting attribute values.
    make_walker = treewalkers.getTreeWalker("dom")
    token_stream = make_walker(fragment)

    renderer = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True)
    return renderer.render(token_stream)
bgneal@9 | 27 |
bgneal@9 | 28 # vim: ts=4 sw=4 |