Mercurial > public > sg101
annotate gpp/core/html.py @ 467:b910cc1460c8
Add the ability to conditionally add model instances to the search index on update. This is not perfect, as some instances should be deleted from the index if they are updated such that they should not be in the index anymore. Will think about and address that later.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sun, 24 Jul 2011 18:12:20 +0000 |
parents | b3b11edf91d8 |
children |
rev | line source |
---|---|
bgneal@9 | 1 import html5lib |
bgneal@9 | 2 from html5lib import sanitizer, treebuilders, treewalkers, serializer |
bgneal@9 | 3 |
bgneal@9 | 4 def sanitizer_factory(*args, **kwargs): |
bgneal@9 | 5 san = sanitizer.HTMLSanitizer(*args, **kwargs) |
bgneal@9 | 6 # This isn't available yet |
bgneal@9 | 7 # san.strip_tokens = True |
bgneal@9 | 8 return san |
bgneal@9 | 9 |
bgneal@9 | 10 def clean_html(buf): |
bgneal@9 | 11 """Cleans HTML of dangerous tags and content.""" |
bgneal@9 | 12 buf = buf.strip() |
bgneal@9 | 13 if not buf: |
bgneal@9 | 14 return buf |
bgneal@9 | 15 |
bgneal@9 | 16 p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"), |
bgneal@9 | 17 tokenizer=sanitizer_factory) |
bgneal@9 | 18 dom_tree = p.parseFragment(buf) |
bgneal@9 | 19 |
bgneal@9 | 20 walker = treewalkers.getTreeWalker("dom") |
bgneal@9 | 21 stream = walker(dom_tree) |
bgneal@9 | 22 |
bgneal@9 | 23 s = serializer.htmlserializer.HTMLSerializer( |
bgneal@9 | 24 omit_optional_tags=False, |
bgneal@9 | 25 quote_attr_values=True) |
bgneal@9 | 26 return s.render(stream) |
bgneal@9 | 27 |
bgneal@9 | 28 # vim: ts=4 sw=4 |