Mercurial > public > sg101
view gpp/core/html.py @ 334:6805d15cda13
Adding a script I had to write on the fly to filter out posts from the posts csv file that had no parent topics. MyISAM let me get away with that, but InnoDB won't.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sat, 26 Feb 2011 01:28:22 +0000 |
parents | b3b11edf91d8 |
children |
line wrap: on
line source
# Helpers for sanitizing HTML markup with html5lib.
import html5lib
from html5lib import sanitizer, treebuilders, treewalkers, serializer


def sanitizer_factory(*args, **kwargs):
    """Create the sanitizing tokenizer handed to the html5lib parser.

    Every positional and keyword argument is forwarded unchanged to
    ``sanitizer.HTMLSanitizer``; the freshly built instance is returned.
    """
    # Setting ``strip_tokens = True`` on the sanitizer was noted as not being
    # available yet, so the instance is returned exactly as constructed.
    return sanitizer.HTMLSanitizer(*args, **kwargs)


def clean_html(buf):
    """Cleans HTML of dangerous tags and content.

    ``buf`` is stripped of leading and trailing whitespace first. If nothing
    is left, the stripped (empty) value is returned without any parsing.
    Otherwise the markup is parsed as a fragment into a DOM tree, using
    ``sanitizer_factory`` as the tokenizer, and the tree is serialized back
    to HTML with optional tags kept and attribute values quoted.

    Returns the value produced by the serializer's ``render`` call.
    """
    markup = buf.strip()
    if not markup:
        return markup

    # Parse with the sanitizing tokenizer so the cleaning happens while the
    # input is being tokenized.
    dom_builder = treebuilders.getTreeBuilder("dom")
    parser = html5lib.HTMLParser(tree=dom_builder, tokenizer=sanitizer_factory)
    fragment = parser.parseFragment(markup)

    # Turn the DOM fragment back into a token stream for the serializer.
    make_walker = treewalkers.getTreeWalker("dom")
    token_stream = make_walker(fragment)

    serializer_options = {
        "omit_optional_tags": False,
        "quote_attr_values": True,
    }
    html_serializer = serializer.htmlserializer.HTMLSerializer(
        **serializer_options
    )
    return html_serializer.render(token_stream)

# vim: ts=4 sw=4