Mercurial > public > sg101
diff gpp/legacy/html2md.py @ 290:64c188a9d31f
Adding a legacy app to contain management commands to convert the old data to the new database format. This first commit has the import_old_users command.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Fri, 24 Dec 2010 05:28:58 +0000 |
parents | |
children | 2367c4795c92 |
line wrap: on
line diff
"""
This module contains a class derived from Python's HTMLParser to convert HTML to
Markdown. Currently this class only supports those HTML tags that have counter-
parts in BBCode used by stock phpBB 2.x.

In other words, this class was created to help convert data from a phpBB
forum to Markdown syntax and its scope is currently limited to that task.

"""
# The module was written for Python 2; the fallbacks below bind the same names
# (HTMLParser, htmlentitydefs, unichr) when it is loaded on Python 3.
try:
    from HTMLParser import HTMLParser
except ImportError:
    from html.parser import HTMLParser

try:
    import htmlentitydefs
except ImportError:
    from html import entities as htmlentitydefs

try:
    unichr
except NameError:
    unichr = chr


# Let's call Markdown markup entities "elements" to avoid confusion
# with HTML tags.

class ElementBase(object):
    """
    Base class for all Markdown elements.

    An element accumulates text in ``data`` via add_data() and renders itself
    with markdown(). ``attrs`` is the list of (name, value) pairs supplied by
    the parser for the originating HTML tag, stored as a dict.

    """
    def __init__(self, attrs=None):
        self.data = u''
        self.attrs = dict(attrs) if attrs else {}

    def add_data(self, data):
        """Append text (or the rendered Markdown of a child) to this element."""
        self.data += data

    def markdown(self):
        """Return the Markdown text for this element."""
        return self.data


class TextElement(ElementBase):
    """
    TextElements represent text fragments not inside HTML tags. They are also
    used as a pass-through for HTML tags that have no element factory.

    """
    pass


class EmphasisElement(ElementBase):
    """
    An EmphasisElement is a Markdown element used to indicate emphasis and is
    represented by placing characters around text. E.g. _em_, **bold**

    """
    def __init__(self, tag, attrs):
        super(EmphasisElement, self).__init__(attrs)
        # The Markdown marker placed on both sides of the text.
        self.tag = tag

    def markdown(self):
        return u'%s%s%s' % (self.tag, self.data, self.tag)


def create_emphasis(tag):
    """
    Returns a function that creates an EmphasisElement using the supplied
    tag. The returned function takes the attrs list from the parser.

    """
    def inner(attrs):
        return EmphasisElement(tag, attrs)
    return inner


class HtmlElement(ElementBase):
    """
    Markdown also accepts HTML markup. This element represents a HTML tag that
    maps to itself in Markdown. Attributes are not written to the output.

    """
    def __init__(self, tag, attrs):
        super(HtmlElement, self).__init__(attrs)
        # The HTML tag name emitted around the text.
        self.tag = tag

    def markdown(self):
        return u'<%s>%s</%s>' % (self.tag, self.data, self.tag)


def create_html(tag):
    """
    Returns a function that creates a HtmlElement using the supplied tag.
    The returned function takes the attrs list from the parser.

    """
    def inner(attrs):
        return HtmlElement(tag, attrs)
    return inner


class QuoteElement(ElementBase):
    """
    Class to represent a blockquote in Markdown. Every line of the quoted
    text is prefixed with '> ' and the block is followed by a blank line.

    """
    def markdown(self):
        return u'> %s\n\n' % self.data.replace('\n', '\n> ')


class BreakElement(ElementBase):
    """
    Class to represent a linebreak in Markdown.

    """
    def markdown(self):
        # Markdown only treats a newline as a hard break when the line ends
        # with two (or more) spaces.
        return u'  \n'


class DivElement(ElementBase):
    """
    This class maps a HTML <div> into a block of text surrounded by newlines.

    """
    def markdown(self):
        return u'\n%s\n' % self.data


class LinkElement(ElementBase):
    """
    This class maps HTML <a> tags into Markdown links.
    If no data is present, the actual href is used for the link text.
    If the tag has no href (e.g. a named anchor), only the text is produced.

    """
    def markdown(self):
        url = self.attrs.get('href')
        if not url:
            # Nothing to link to; keep whatever text the tag enclosed.
            return self.data
        text = self.data if self.data else url
        return u'[%s](%s)' % (text, url)


class ImageElement(ElementBase):
    """
    This class maps HTML <img> tags into Markdown.
    This element assumes no alt text is present, and simply uses the word
    'image' for the alt text. An <img> without a src produces no output.

    """
    def markdown(self):
        url = self.attrs.get('src')
        if not url:
            return u''
        return u'![image](%s)' % url


class CodeElement(ElementBase):
    """
    This class is used to create code blocks in Markdown.

    """
    def markdown(self):
        # A Markdown code block is made by indenting every line by 4 spaces.
        return u'    %s\n' % self.data.replace('\n', '\n    ')


# List (ordered & unordered) support:

class ListElement(ElementBase):
    """
    This class creates Markdown for unordered lists. The bullet() method can be
    overridden to create ordered lists.

    Each call to add_data() records one list item. ``list_nesting`` is the
    1-based depth of this list; MarkdownWriter sets it when the list is pushed
    and it controls the indentation of the generated items.

    """
    def __init__(self, attrs=None):
        super(ListElement, self).__init__(attrs)
        self.items = []
        self.list_nesting = 1

    def add_data(self, data):
        """Record data as a new list item instead of appending to self.data."""
        self.items.append(data)

    def bullet(self):
        """Return the marker that starts every item of this list."""
        return u'*'

    def markdown(self):
        bullet_str = self.bullet()
        # Nested lists are indented 4 spaces per level below the first.
        indent = u' ' * (4 * (self.list_nesting - 1))
        return u''.join(
            u'\n%s%s %s' % (indent, bullet_str, item) for item in self.items)


class OrderedListElement(ListElement):
    """
    This class creates Markdown for ordered lists.

    """
    def bullet(self):
        return u'1.'


class ItemElement(ElementBase):
    """
    This element is used to represent ordered & unordered list items.

    """
    pass

###############################################################################
###############################################################################

class MarkdownWriter(HTMLParser):
    """
    This class is an HTMLParser that converts a subset of HTML to Markdown.

    Usage: feed() the HTML to an instance, then call markdown() to obtain the
    converted text. Open elements are kept on ``elem_stack``; when an element
    is closed its Markdown is added to its parent, or to ``elements`` if it
    has no parent.

    """

    # Maps a HTML tag name to a callable that takes the parser's attrs list
    # and returns the Markdown element for that tag.
    elem_factories = {
        'a': LinkElement,
        'blockquote': QuoteElement,
        'br': BreakElement,
        'div': DivElement,
        'em': create_emphasis('_'),
        'img': ImageElement,
        'li': ItemElement,
        'ol': OrderedListElement,
        'pre': CodeElement,
        's': create_html('strike'),
        'strong': create_emphasis('**'),
        'u': create_html('u'),
        'ul': ListElement,
    }

    # HTML tags that never have content or an end tag. They are closed as soon
    # as they are opened so that they cannot swallow the text that follows
    # them or be popped by some other tag's end tag.
    void_tags = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link',
        'meta', 'param', 'source', 'track', 'wbr',
    ])

    def __init__(self):
        HTMLParser.__init__(self)
        self.reset()

    def handle_starttag(self, tag, attrs):
        """
        Push the element for tag onto the stack. Tags without a factory are
        represented by a TextElement so their content passes through as text.

        """
        factory = self.elem_factories.get(tag)
        element = factory(attrs) if factory is not None else TextElement()
        self._push_elem(element)
        if tag in self.void_tags:
            self._pop_elem()

    def handle_endtag(self, tag):
        """
        Close the element on top of the stack. End tags for void tags are
        ignored because those elements were closed when they were opened, and
        an end tag that arrives while nothing is open is ignored as well.

        """
        if tag in self.void_tags or not self.elem_stack:
            return
        self._pop_elem()

    def handle_data(self, data):
        """Add text to the open element, opening a TextElement if none is."""
        if not self.elem_stack:
            self._push_elem(TextElement())
        self._add_data(data)

    def handle_entityref(self, name):
        """
        Convert a named entity (e.g. &amp;) to its character. An unknown
        entity is kept literally as '&name;'.

        """
        codepoint = htmlentitydefs.name2codepoint.get(name)
        if codepoint is None:
            text = u'&%s;' % name
        else:
            text = unichr(codepoint)
        self.handle_data(text)

    def handle_charref(self, name):
        """
        Convert a numeric character reference to its character. Both decimal
        (&#65;) and hexadecimal (&#x41;) forms are accepted; a reference that
        cannot be converted is kept literally as '&#name;'.

        """
        try:
            if name[:1] in (u'x', u'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            text = u'&#%s;' % name
        self.handle_data(text)

    def reset(self):
        """Reset the parser and discard all conversion state."""
        HTMLParser.reset(self)
        self.elem_stack = []
        self.elements = []
        self.list_nesting = 0

    def _push_elem(self, tag):
        """
        Push the element ``tag`` (an ElementBase instance) onto the stack.

        A TextElement on top of the stack is closed first, since text
        fragments cannot contain other elements. Lists are told their
        nesting depth here.

        """
        if self.elem_stack and isinstance(self.elem_stack[-1], TextElement):
            self._pop_elem()
        if isinstance(tag, ListElement):
            self.list_nesting += 1
            tag.list_nesting = self.list_nesting
        self.elem_stack.append(tag)

    def _pop_elem(self):
        """
        Pop the top element and hand its Markdown to its parent, or store the
        element in ``elements`` if it was the outermost one.

        """
        element = self.elem_stack.pop()
        if isinstance(element, ListElement):
            self.list_nesting -= 1
        if self.elem_stack:
            self.elem_stack[-1].add_data(element.markdown())
        else:
            self.elements.append(element)

    def _add_data(self, data):
        """Add data to the element on top of the stack."""
        self.elem_stack[-1].add_data(data)

    def markdown(self):
        """
        Close any elements that are still open and return the Markdown for
        everything parsed since the last reset().

        """
        while self.elem_stack:
            self._pop_elem()
        return u''.join(e.markdown() for e in self.elements)