Mercurial > public > sg101
view legacy/html2md.py @ 697:67f8d49a9377
Cleaned up the code a bit.
Separated the S3 stuff out into its own class.
This class maybe should be in core.
Still want to do some kind of context manager around the temporary file we are
creating to ensure it gets deleted.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sun, 08 Sep 2013 21:02:58 -0500 |
parents | ee87ea74d46b |
children |
line wrap: on
line source
""" This module contains a class derived from Python's HTMLParser to convert HTML to Markdown. Currently this class only supports those HTML tags that have counter- parts in BBCode used by stock phpBB 2.x. In other words, this class was created to help convert data from a phpBB forum to Markdown syntax and its scope is currently limited to that task. """ from HTMLParser import HTMLParser import htmlentitydefs # Let's call Markdown markup entities "elements" to avoid confusion # with HTML tags. class ElementBase(object): """ Base class for all Markdown elements. """ def __init__(self, attrs=None): self.data = u'' self.attrs = dict(attrs) if attrs else {} def add_data(self, data): self.data += data def markdown(self): return self.data class TextElement(ElementBase): """ TextElements represent text fragments not inside HTML tags. """ pass class EmphasisElement(ElementBase): """ An EmphasisElement is a Markdown element used to indicate emphasis and is represented by placing characters around text. E.g. _em_, **bold** """ def __init__(self, tag, attrs): super(EmphasisElement, self).__init__(attrs) self.tag = tag def markdown(self): return u'%s%s%s' % (self.tag, self.data, self.tag) def create_emphasis(tag): """ Returns a function that creates an EmphasisElement using the supplied tag. """ def inner(attrs): return EmphasisElement(tag, attrs) return inner class HtmlElement(ElementBase): """ Markdown also accepts HTML markup. This element represents a HTML tag that maps to itself in Markdown. """ def __init__(self, tag, attrs): super(HtmlElement, self).__init__(attrs) self.tag = tag def markdown(self): return u'<%s>%s</%s>' % (self.tag, self.data, self.tag) def create_html(tag): """ Returns a function that creates a HtmlElement using the supplied tag. """ def inner(attrs): return HtmlElement(tag, attrs) return inner class QuoteElement(ElementBase): """ Class to represent a blockquote in Markdown. """ def markdown(self): return u'> %s\n\n' % self.data.replace('\n', '\n> ') class BreakElement(ElementBase): """ Class to represent a linebreak in Markdown. """ def markdown(self): return u' \n' class DivElement(ElementBase): """ This class maps a HTML <div> into a block of text surrounded by newlines. """ def markdown(self): return u'\n%s\n' % self.data class LinkElement(ElementBase): """ This class maps HTML <a> tags into Markdown links. If no data is present, the actual href is used for the link text. """ def markdown(self): try: url = self.attrs['href'] except KeyError: return self.data if self.data else u'' text = self.data if self.data else url return u'[%s](%s)' % (text, url) class ImageElement(ElementBase): """ This class maps HTML <img> tags into Markdown. This element assumes no alt text is present, and simply uses the word 'image' for the alt text. """ def markdown(self): try: url = self.attrs['src'] except KeyError: return u' (missing image) ' return u'![image](%s)' % url class CodeElement(ElementBase): """ This class is used to create code blocks in Markdown. """ def markdown(self): return u' %s\n' % self.data.replace('\n', '\n ') # List (ordered & unordered) support: class ListElement(ElementBase): """ This class creates Markdown for unordered lists. The bullet() method can be overridden to create ordered lists. """ def __init__(self, attrs=None): super(ListElement, self).__init__(attrs) self.items = [] self.list_nesting = 1 def add_data(self, data): self.items.append(data) def bullet(self): return u'*' def markdown(self): bullet_str = self.bullet() indent = u' ' * (4 * (self.list_nesting - 1)) s = u'' for item in self.items: s += u'\n%s%s %s' % (indent, bullet_str, item) return s class OrderedListElement(ListElement): """ This class creates Markdown for ordered lists. """ def bullet(self): return '1.' class ItemElement(ElementBase): """ This element is used to represent ordered & unordered list items. """ pass ############################################################################### ############################################################################### class MarkdownWriter(HTMLParser): """ This class is an HTMLParser that converts a subset of HTML to Markdown. """ elem_factories = { 'a': LinkElement, 'blockquote': QuoteElement, 'br': BreakElement, 'div': DivElement, 'em': create_emphasis('_'), 'img': ImageElement, 'li': ItemElement, 'ol': OrderedListElement, 'pre': CodeElement, 's': create_html('strike'), 'strong': create_emphasis('**'), 'u': create_html('u'), 'ul': ListElement, } def __init__(self): HTMLParser.__init__(self) self.reset() def handle_starttag(self, tag, attrs): if tag in self.elem_factories: factory = self.elem_factories[tag] element = factory(attrs) else: element = TextElement() self._push_elem(element) def handle_endtag(self, tag): self._pop_elem() def handle_data(self, data): if len(self.elem_stack) == 0: self._push_elem(TextElement()) self._add_data(data) def handle_entityref(self, name): try: text = unichr(htmlentitydefs.name2codepoint[name]) except KeyError: text = name self.handle_data(text) def handle_charref(self, name): self.handle_data(unichr(int(name))) def reset(self): HTMLParser.reset(self) self.elem_stack = [] self.elements = [] self.list_nesting = 0 def _push_elem(self, tag): if len(self.elem_stack) and isinstance(self.elem_stack[-1], TextElement): self._pop_elem() if isinstance(tag, ListElement): self.list_nesting += 1 tag.list_nesting = self.list_nesting self.elem_stack.append(tag) def _pop_elem(self): try: element = self.elem_stack.pop() except IndexError: # pop from empty list => bad HTML input; ignore it return if isinstance(element, ListElement): self.list_nesting -= 1 if len(self.elem_stack): self.elem_stack[-1].add_data(element.markdown()) else: self.elements.append(element) def _add_data(self, data): self.elem_stack[-1].add_data(data) def markdown(self): while len(self.elem_stack): self._pop_elem() text_list = [e.markdown() for e in self.elements] return u''.join(text_list)