bgneal@290: """
bgneal@290: This module contains a class derived from Python's HTMLParser to convert HTML to
bgneal@290: Markdown. Currently this class only supports those HTML tags that have counter-
bgneal@290: parts in BBCode used by stock phpBB 2.x.
bgneal@290: 
bgneal@290: In other words, this class was created to help convert data from a phpBB
bgneal@290: forum to Markdown syntax and its scope is currently limited to that task.
bgneal@290: 
bgneal@290: """
bgneal@290: from HTMLParser import HTMLParser
bgneal@290: import htmlentitydefs
bgneal@290: 
bgneal@290: 
bgneal@290: # Let's call Markdown markup entities "elements" to avoid confusion
bgneal@290: # with HTML tags.
bgneal@290: 
class ElementBase(object):
    """
    Common ancestor of every Markdown element.

    An element accumulates text in ``data`` and remembers the HTML attributes
    it was created with in ``attrs`` (a dict, empty when none were supplied).

    """
    def __init__(self, attrs=None):
        # attrs may be any value accepted by dict(), e.g. a list of
        # (name, value) pairs; a falsy value means "no attributes".
        if attrs:
            self.attrs = dict(attrs)
        else:
            self.attrs = {}
        self.data = u''

    def add_data(self, data):
        """Append ``data`` to the text collected so far."""
        self.data = self.data + data

    def markdown(self):
        """Return this element as Markdown; the base class emits raw text."""
        return self.data
bgneal@290: 
bgneal@290: 
class TextElement(ElementBase):
    """
    A fragment of plain text that is not inside an HTML tag.

    Everything is inherited unchanged from ElementBase, so the Markdown for
    a TextElement is simply the text it has collected.

    """
bgneal@290: 
bgneal@290: 
class EmphasisElement(ElementBase):
    """
    Markdown emphasis: the collected text wrapped in a marker string on both
    sides, e.g. _em_ or **bold**.

    """
    def __init__(self, tag, attrs):
        # ``tag`` is the Markdown marker placed before and after the text.
        self.tag = tag
        super(EmphasisElement, self).__init__(attrs)

    def markdown(self):
        """Return the text surrounded by the emphasis marker."""
        marker = self.tag
        return u'%s%s%s' % (marker, self.data, marker)
bgneal@290: 
bgneal@290: 
def create_emphasis(tag):
    """
    Build an element factory for emphasis markup.

    The returned callable takes the HTML attributes and produces an
    EmphasisElement that uses ``tag`` as its Markdown marker.

    """
    return lambda attrs: EmphasisElement(tag, attrs)
bgneal@290: 
bgneal@290: 
class HtmlElement(ElementBase):
    """
    An HTML tag that is passed through to Markdown as HTML, since Markdown
    accepts inline HTML markup.

    """
    def __init__(self, tag, attrs):
        # ``tag`` is the HTML tag name written to the output.
        self.tag = tag
        super(HtmlElement, self).__init__(attrs)

    def markdown(self):
        """Return the text wrapped in opening and closing HTML tags."""
        name = self.tag
        return u'<%s>%s</%s>' % (name, self.data, name)
bgneal@290: 
bgneal@290: 
def create_html(tag):
    """
    Build an element factory for pass-through HTML markup.

    The returned callable takes the HTML attributes and produces an
    HtmlElement that writes ``tag`` as its tag name.

    """
    return lambda attrs: HtmlElement(tag, attrs)
bgneal@290: 
bgneal@290: 
class QuoteElement(ElementBase):
    """
    A Markdown blockquote.

    """
    def markdown(self):
        """Return the text with every line prefixed by '> '."""
        # Prefix the first line directly and each following line by
        # rewriting the newline that precedes it.
        quoted = self.data.replace('\n', '\n> ')
        return u'> %s\n\n' % quoted
bgneal@290: 
bgneal@290: 
class BreakElement(ElementBase):
    """
    A Markdown hard line break.

    """
    def markdown(self):
        """Return two spaces followed by a newline; collected data is ignored."""
        line_break = u'  \n'
        return line_break
bgneal@290: 
bgneal@290: 
class DivElement(ElementBase):
    """
    Renders an HTML <div> as its text with a newline before and after it.

    """
    def markdown(self):
        """Return the collected text surrounded by newlines."""
        body = self.data
        return u'\n%s\n' % body
bgneal@290: 
bgneal@290: 
class LinkElement(ElementBase):
    """
    Renders an HTML <a> tag as a Markdown link.
    When the tag has no text, the href itself is used as the link text.
    When the tag has no href attribute, only the text (if any) is emitted.

    """
    def markdown(self):
        """Return '[text](url)', or just the text when href is missing."""
        if 'href' not in self.attrs:
            return self.data or u''

        url = self.attrs['href']
        label = self.data or url
        return u'[%s](%s)' % (label, url)
bgneal@290: 
bgneal@290: 
class ImageElement(ElementBase):
    """
    Renders an HTML <img> tag as a Markdown image.
    Any alt text on the tag is not consulted; the word 'image' is always
    used as the alt text.

    """
    def markdown(self):
        """Return '![image](src)', or a placeholder when src is missing."""
        if 'src' in self.attrs:
            return u'![image](%s)' % self.attrs['src']
        return u' (missing image) '
bgneal@290: 
bgneal@290: 
class CodeElement(ElementBase):
    """
    A Markdown code block, i.e. text indented by four spaces.

    """
    def markdown(self):
        """Return the text with each line indented by four spaces."""
        # Indent the first line directly and each following line by
        # rewriting the newline that precedes it.
        indented = self.data.replace('\n', '\n    ')
        return u'    %s\n' % indented
bgneal@290: 
bgneal@290: 
bgneal@290: # List (ordered & unordered) support:
bgneal@290: 
class ListElement(ElementBase):
    """
    Renders an unordered Markdown list. Subclasses can override bullet() to
    produce ordered lists.

    Unlike other elements, data added to a list is not concatenated: every
    add_data() call stores one list item.

    """
    def __init__(self, attrs=None):
        super(ListElement, self).__init__(attrs)
        # 1 means a top-level list; each extra level indents by 4 spaces.
        self.list_nesting = 1
        self.items = []

    def add_data(self, data):
        """Store ``data`` as a new list item."""
        self.items.append(data)

    def bullet(self):
        """Return the marker written in front of every item."""
        return u'*'

    def markdown(self):
        """Return one line per item, each starting with a newline."""
        marker = self.bullet()
        pad = u' ' * (4 * (self.list_nesting - 1))
        lines = [u'\n%s%s %s' % (pad, marker, entry) for entry in self.items]
        return u''.join(lines)
bgneal@290: 
bgneal@290: 
class OrderedListElement(ListElement):
    """
    Renders an ordered Markdown list.

    Every item uses the same '1.' marker.

    """
    def bullet(self):
        """Return the marker written in front of every item."""
        # A unicode literal, consistent with ListElement.bullet() and the
        # other Markdown fragments produced in this module.
        return u'1.'
bgneal@290: 
bgneal@290: 
class ItemElement(ElementBase):
    """
    A single item of an ordered or unordered list.

    Everything is inherited unchanged from ElementBase.

    """
bgneal@290: 
bgneal@290: ###############################################################################
bgneal@290: ###############################################################################
bgneal@290: 
class MarkdownWriter(HTMLParser):
    """
    This class is an HTMLParser that converts a subset of HTML to Markdown.

    Feed HTML with feed(), then call markdown() to obtain the result.
    Open elements are kept on ``elem_stack``; when an element is closed its
    Markdown is appended to its parent, or to ``elements`` when it has no
    parent. Tags without an entry in ``elem_factories`` contribute only
    their text.

    """

    # Maps an HTML tag name to a callable that takes the tag's attributes
    # and returns the element used to render it.
    elem_factories = {
        'a': LinkElement,
        'blockquote': QuoteElement,
        'br': BreakElement,
        'div': DivElement,
        'em': create_emphasis('_'),
        'img': ImageElement,
        'li': ItemElement,
        'ol': OrderedListElement,
        'pre': CodeElement,
        's': create_html('strike'),
        'strong': create_emphasis('**'),
        'u': create_html('u'),
        'ul': ListElement,
    }

    # Tags that never have content. They are closed as soon as they are
    # opened, so "<br>" and "<br />" behave the same way.
    void_tags = frozenset(['br', 'img'])

    def __init__(self):
        HTMLParser.__init__(self)
        self.reset()

    def handle_starttag(self, tag, attrs):
        """Open an element for ``tag``; void elements are closed at once."""
        factory = self.elem_factories.get(tag)
        if factory is not None:
            element = factory(attrs)
        else:
            element = TextElement()

        self._push_elem(element)

        # A void tag written without a trailing slash gets no end-tag
        # callback. Left on the stack it would swallow all following text.
        if tag in self.void_tags:
            self._pop_elem()

    def handle_endtag(self, tag):
        """Close the innermost open element."""
        # Void elements were already closed by handle_starttag().
        if tag in self.void_tags:
            return
        self._pop_elem()

    def handle_data(self, data):
        """Add text to the innermost open element, opening one if needed."""
        if not self.elem_stack:
            self._push_elem(TextElement())
        self._add_data(data)

    def handle_entityref(self, name):
        """Convert a named entity to its character; unknown names pass as-is."""
        try:
            text = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            text = name
        self.handle_data(text)

    def handle_charref(self, name):
        """
        Convert a numeric character reference to its character.

        Both decimal ("65") and hexadecimal ("x41") forms are accepted. A
        reference that cannot be converted is emitted in its original
        "&#...;" form instead of raising.

        """
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            # Malformed number or a code point unichr() cannot represent.
            text = u'&#%s;' % name
        self.handle_data(text)

    def reset(self):
        """Reset the parser and discard all collected elements."""
        HTMLParser.reset(self)
        self.elem_stack = []
        self.elements = []
        self.list_nesting = 0

    def _push_elem(self, tag):
        """
        Push the element ``tag`` onto the stack of open elements.

        A TextElement on top of the stack is closed first. List elements
        are given their nesting depth here.

        """
        if self.elem_stack and isinstance(self.elem_stack[-1], TextElement):
            self._pop_elem()
        if isinstance(tag, ListElement):
            self.list_nesting += 1
            tag.list_nesting = self.list_nesting
        self.elem_stack.append(tag)

    def _pop_elem(self):
        """
        Close the innermost open element.

        Its Markdown is added to the parent element, or the element is
        stored in ``elements`` when it has no parent.

        """
        try:
            element = self.elem_stack.pop()
        except IndexError:
            # pop from empty list => bad HTML input; ignore it
            return

        if isinstance(element, ListElement):
            self.list_nesting -= 1
        if self.elem_stack:
            self.elem_stack[-1].add_data(element.markdown())
        else:
            self.elements.append(element)

    def _add_data(self, data):
        """Add ``data`` to the innermost open element."""
        self.elem_stack[-1].add_data(data)

    def markdown(self):
        """Close any elements still open and return the Markdown text."""
        while self.elem_stack:
            self._pop_elem()
        return u''.join([e.markdown() for e in self.elements])