annotate gpp/core/html.py @ 334:6805d15cda13

Adding a script I had to write on the fly to filter out posts from the posts csv file that had no parent topics. MyISAM let me get away with that, but InnoDB won't.
author Brian Neal <bgneal@gmail.com>
date Sat, 26 Feb 2011 01:28:22 +0000
parents b3b11edf91d8
children
rev   line source
bgneal@9 1 import html5lib
bgneal@9 2 from html5lib import sanitizer, treebuilders, treewalkers, serializer
bgneal@9 3
bgneal@9 4 def sanitizer_factory(*args, **kwargs):
bgneal@9 5 san = sanitizer.HTMLSanitizer(*args, **kwargs)
bgneal@9 6 # This isn't available yet
bgneal@9 7 # san.strip_tokens = True
bgneal@9 8 return san
bgneal@9 9
bgneal@9 10 def clean_html(buf):
bgneal@9 11 """Cleans HTML of dangerous tags and content."""
bgneal@9 12 buf = buf.strip()
bgneal@9 13 if not buf:
bgneal@9 14 return buf
bgneal@9 15
bgneal@9 16 p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
bgneal@9 17 tokenizer=sanitizer_factory)
bgneal@9 18 dom_tree = p.parseFragment(buf)
bgneal@9 19
bgneal@9 20 walker = treewalkers.getTreeWalker("dom")
bgneal@9 21 stream = walker(dom_tree)
bgneal@9 22
bgneal@9 23 s = serializer.htmlserializer.HTMLSerializer(
bgneal@9 24 omit_optional_tags=False,
bgneal@9 25 quote_attr_values=True)
bgneal@9 26 return s.render(stream)
bgneal@9 27
bgneal@9 28 # vim: ts=4 sw=4