Mercurial > public > sg101
annotate gpp/core/html.py @ 334:6805d15cda13
Adding a script I had to write on the fly to filter out posts from the posts csv file that had no parent topics. MyISAM let me get away with that, but InnoDB won't.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sat, 26 Feb 2011 01:28:22 +0000 |
parents | b3b11edf91d8 |
children |
rev | line source |
---|---|
bgneal@9 | 1 import html5lib |
bgneal@9 | 2 from html5lib import sanitizer, treebuilders, treewalkers, serializer |
bgneal@9 | 3 |
bgneal@9 | 4 def sanitizer_factory(*args, **kwargs): |
bgneal@9 | 5 san = sanitizer.HTMLSanitizer(*args, **kwargs) |
bgneal@9 | 6 # This isn't available yet |
bgneal@9 | 7 # san.strip_tokens = True |
bgneal@9 | 8 return san |
bgneal@9 | 9 |
bgneal@9 | 10 def clean_html(buf): |
bgneal@9 | 11 """Cleans HTML of dangerous tags and content.""" |
bgneal@9 | 12 buf = buf.strip() |
bgneal@9 | 13 if not buf: |
bgneal@9 | 14 return buf |
bgneal@9 | 15 |
bgneal@9 | 16 p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"), |
bgneal@9 | 17 tokenizer=sanitizer_factory) |
bgneal@9 | 18 dom_tree = p.parseFragment(buf) |
bgneal@9 | 19 |
bgneal@9 | 20 walker = treewalkers.getTreeWalker("dom") |
bgneal@9 | 21 stream = walker(dom_tree) |
bgneal@9 | 22 |
bgneal@9 | 23 s = serializer.htmlserializer.HTMLSerializer( |
bgneal@9 | 24 omit_optional_tags=False, |
bgneal@9 | 25 quote_attr_values=True) |
bgneal@9 | 26 return s.render(stream) |
bgneal@9 | 27 |
bgneal@9 | 28 # vim: ts=4 sw=4 |