"""
This module contains a class derived from Python's HTMLParser to convert HTML to
Markdown. Currently this class only supports those HTML tags that have counter-
parts in BBCode used by stock phpBB 2.x.

In other words, this class was created to help convert data from a phpBB
forum to Markdown syntax and its scope is currently limited to that task.

"""
bgneal@290: from HTMLParser import HTMLParser
bgneal@290: import htmlentitydefs
bgneal@290:
bgneal@290:
# Let's call Markdown markup entities "elements" to avoid confusion
# with HTML tags.
bgneal@290:
class ElementBase(object):
    """
    Base class for all Markdown elements.

    An element accumulates text through add_data() and renders itself through
    markdown(). This base implementation renders the accumulated text
    unchanged.

    """
    def __init__(self, attrs=None):
        """
        Create an empty element.

        attrs -- optional HTML attributes of the originating tag, given as
                 anything dict() accepts (e.g. a list of (name, value) pairs).

        """
        self.data = u''
        self.attrs = dict(attrs) if attrs else {}

    def add_data(self, data):
        """
        Append a text fragment to this element's content.

        """
        self.data += data

    def markdown(self):
        """
        Return the Markdown text for this element.

        """
        return self.data
bgneal@290:
bgneal@290:
class TextElement(ElementBase):
    """
    TextElements represent text fragments not inside HTML tags.

    They add nothing to ElementBase: the collected text is rendered as-is.

    """
bgneal@290:
bgneal@290:
class EmphasisElement(ElementBase):
    """
    An EmphasisElement is a Markdown element used to indicate emphasis and is
    represented by placing characters around text. E.g. _em_, **bold**

    """
    def __init__(self, tag, attrs):
        """
        tag   -- the Markdown marker placed on both sides of the text,
                 e.g. u'_' or u'**'.
        attrs -- HTML attributes of the originating tag (see ElementBase).

        """
        super(EmphasisElement, self).__init__(attrs)
        self.tag = tag

    def markdown(self):
        """
        Return the collected text wrapped in the emphasis marker.

        """
        marker = self.tag
        return u'%s%s%s' % (marker, self.data, marker)
bgneal@290:
bgneal@290:
def create_emphasis(tag):
    """
    Returns a function that creates an EmphasisElement using the supplied
    tag.

    The returned function takes a single argument, the HTML attribute list,
    so it can be used wherever an element class taking only attrs is expected.

    """
    def inner(attrs):
        return EmphasisElement(tag, attrs)
    return inner
bgneal@290:
bgneal@290:
class HtmlElement(ElementBase):
    """
    Markdown also accepts HTML markup. This element represents a HTML tag that
    maps to itself in Markdown.

    """
    def __init__(self, tag, attrs):
        """
        tag   -- name of the HTML tag to emit (without angle brackets).
        attrs -- HTML attributes of the originating tag (see ElementBase).
                 They are stored but not written to the output.

        """
        super(HtmlElement, self).__init__(attrs)
        self.tag = tag

    def markdown(self):
        """
        Return the collected text wrapped in an opening and a closing tag.

        """
        # The closing tag needs its own '</'; without it the output was
        # '<u>textu>' instead of '<u>text</u>'.
        return u'<%s>%s</%s>' % (self.tag, self.data, self.tag)
bgneal@290:
bgneal@290:
def create_html(tag):
    """
    Returns a function that creates a HtmlElement using the supplied tag.

    The returned function takes a single argument, the HTML attribute list,
    so it can be used wherever an element class taking only attrs is expected.

    """
    def inner(attrs):
        return HtmlElement(tag, attrs)
    return inner
bgneal@290:
bgneal@290:
class QuoteElement(ElementBase):
    """
    Class to represent a blockquote in Markdown.

    """
    def markdown(self):
        """
        Return the collected text with every line prefixed by '> ', followed
        by a blank line that ends the blockquote.

        """
        quoted = self.data.replace('\n', '\n> ')
        return u'> %s\n\n' % quoted
bgneal@290:
bgneal@290:
class BreakElement(ElementBase):
    """
    Class to represent a linebreak in Markdown.

    """
    def markdown(self):
        """
        Return a Markdown hard line break. Any collected text is ignored.

        """
        # Markdown only turns a newline into a line break when the line ends
        # with two (or more) spaces; a single space is not enough.
        return u'  \n'
bgneal@290:
bgneal@290:
class DivElement(ElementBase):
    """
    This class maps a HTML <div> into a block of text surrounded by newlines.

    """
    def markdown(self):
        """
        Return the collected text with a newline before and after it.

        """
        return u'\n%s\n' % self.data
bgneal@290:
bgneal@290:
class LinkElement(ElementBase):
    """
    This class maps HTML <a> tags into Markdown links.
    If no data is present, the actual href is used for the link text.

    """
    def markdown(self):
        """
        Return '[text](href)', or just the collected text when the tag has
        no usable href attribute.

        """
        # HTMLParser reports an attribute written without a value as None;
        # treat that like a missing href rather than emitting '[text](None)'.
        url = self.attrs.get('href')
        if url is None:
            return self.data if self.data else u''

        text = self.data if self.data else url
        return u'[%s](%s)' % (text, url)
bgneal@290:
bgneal@290:
class ImageElement(ElementBase):
    """
    This class maps HTML <img> tags into Markdown.
    This element assumes no alt text is present, and simply uses the word
    'image' for the alt text.

    """
    def markdown(self):
        """
        Return '![image](src)', or a placeholder when the tag has no usable
        src attribute.

        """
        # HTMLParser reports an attribute written without a value as None;
        # treat that like a missing src rather than emitting '![image](None)'.
        url = self.attrs.get('src')
        if url is None:
            return u' (missing image) '
        return u'![image](%s)' % url
bgneal@290:
bgneal@290:
class CodeElement(ElementBase):
    """
    This class is used to create code blocks in Markdown.

    """
    def markdown(self):
        """
        Return the collected text as an indented Markdown code block.

        """
        # Markdown recognises a code block by an indent of at least four
        # spaces on every line; a smaller indent renders as normal text.
        indent = u'    '
        body = self.data.replace('\n', '\n' + indent)
        return u'%s%s\n' % (indent, body)
bgneal@290:
bgneal@290:
# List (ordered & unordered) support:
bgneal@290:
class ListElement(ElementBase):
    """
    This class creates Markdown for unordered lists. The bullet() method can be
    overridden to create ordered lists.

    """
    def __init__(self, attrs=None):
        """
        attrs -- HTML attributes of the originating tag (see ElementBase).

        """
        super(ListElement, self).__init__(attrs)
        # One entry per fragment passed to add_data().
        self.items = []
        # Nesting depth of this list; 1 means a top-level list.
        # MarkdownWriter overwrites this when the list is pushed.
        self.list_nesting = 1

    def add_data(self, data):
        """
        Record a fragment as a new list item instead of concatenating it.

        """
        self.items.append(data)

    def bullet(self):
        """
        Return the marker placed in front of every item.

        """
        return u'*'

    def markdown(self):
        """
        Return one line per item, each starting with a newline, an indent of
        four spaces per nesting level beyond the first, and the bullet.

        """
        bullet_str = self.bullet()
        indent = u' ' * (4 * (self.list_nesting - 1))
        return u''.join(
            u'\n%s%s %s' % (indent, bullet_str, item) for item in self.items)
bgneal@290:
bgneal@290:
class OrderedListElement(ListElement):
    """
    This class creates Markdown for ordered lists.

    """
    def bullet(self):
        """
        Return the marker placed in front of every item.

        Every item uses the same '1.' marker.

        """
        # Unicode literal for consistency with ListElement.bullet().
        return u'1.'
bgneal@290:
bgneal@290:
class ItemElement(ElementBase):
    """
    This element is used to represent ordered & unordered list items.

    It adds nothing to ElementBase: the collected text is rendered as-is.

    """
bgneal@290:
bgneal@290: ###############################################################################
bgneal@290: ###############################################################################
bgneal@290:
class MarkdownWriter(HTMLParser):
    """
    This class is an HTMLParser that converts a subset of HTML to Markdown.

    Feed HTML to an instance with the inherited feed() method, then call
    markdown() to obtain the converted text. Open tags are tracked on a stack
    of Markdown elements; when an element is closed its Markdown is appended
    to its parent, or to the list of finished top-level elements.

    """

    # Maps a HTML tag name to a callable that takes the tag's attribute list
    # and returns the Markdown element used to render that tag. Tags that are
    # not listed here are treated as plain text containers.
    elem_factories = {
        'a': LinkElement,
        'blockquote': QuoteElement,
        'br': BreakElement,
        'div': DivElement,
        'em': create_emphasis('_'),
        'img': ImageElement,
        'li': ItemElement,
        'ol': OrderedListElement,
        'pre': CodeElement,
        's': create_html('strike'),
        'strong': create_emphasis('**'),
        'u': create_html('u'),
        'ul': ListElement,
    }

    # Supported tags that never have content or a closing tag. They are
    # closed as soon as they are opened, so that "<br>" behaves the same as
    # "<br />" and does not swallow the text that follows it.
    void_tags = frozenset(['br', 'img'])

    def __init__(self):
        HTMLParser.__init__(self)
        self.reset()

    def handle_starttag(self, tag, attrs):
        """
        Open a new element for the tag; void tags are closed immediately.

        """
        factory = self.elem_factories.get(tag)
        element = factory(attrs) if factory is not None else TextElement()
        self._push_elem(element)

        if tag in self.void_tags:
            self._pop_elem()

    def handle_startendtag(self, tag, attrs):
        """
        Handle a self-closing tag such as "<br />".

        """
        self.handle_starttag(tag, attrs)
        # handle_starttag() has already closed void tags; closing them again
        # would pop the enclosing element.
        if tag not in self.void_tags:
            self.handle_endtag(tag)

    def handle_endtag(self, tag):
        """
        Close the innermost open element.

        The tag name is not matched against the open element. A stray closing
        tag for a void tag is ignored, as nothing is left open for it.

        """
        if tag in self.void_tags:
            return
        self._pop_elem()

    def handle_data(self, data):
        """
        Add text to the innermost open element, opening a TextElement first
        when no element is open.

        """
        if not self.elem_stack:
            self._push_elem(TextElement())
        self._add_data(data)

    def handle_entityref(self, name):
        """
        Convert a named entity (e.g. "&amp;") to its character. Unknown
        entities are replaced by their bare name.

        """
        try:
            text = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            text = name
        self.handle_data(text)

    def handle_charref(self, name):
        """
        Convert a numeric character reference to its character.

        Both decimal ("&#65;") and hexadecimal ("&#x41;") references are
        supported. A reference that cannot be converted is bad HTML input and
        is passed through as literal text instead of raising.

        """
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            text = u'&#%s;' % name
        self.handle_data(text)

    def reset(self):
        """
        Reset the parser and discard all conversion state.

        """
        HTMLParser.reset(self)
        self.elem_stack = []    # currently open elements, innermost last
        self.elements = []      # finished top-level elements, in order
        self.list_nesting = 0   # number of currently open lists

    def _push_elem(self, tag):
        """
        Open an element. Despite its name, tag is an element object, not a
        tag name.

        """
        # A TextElement cannot contain other elements; finish it first.
        if self.elem_stack and isinstance(self.elem_stack[-1], TextElement):
            self._pop_elem()
        if isinstance(tag, ListElement):
            self.list_nesting += 1
            tag.list_nesting = self.list_nesting
        self.elem_stack.append(tag)

    def _pop_elem(self):
        """
        Close the innermost open element and hand its Markdown to its parent,
        or store it as a finished top-level element.

        """
        try:
            element = self.elem_stack.pop()
        except IndexError:
            # pop from empty list => bad HTML input; ignore it
            return

        if isinstance(element, ListElement):
            self.list_nesting -= 1
        if self.elem_stack:
            self.elem_stack[-1].add_data(element.markdown())
        else:
            self.elements.append(element)

    def _add_data(self, data):
        """
        Add text to the innermost open element.

        """
        self.elem_stack[-1].add_data(data)

    def markdown(self):
        """
        Close any elements that are still open and return the Markdown for
        everything parsed since the last reset().

        """
        while self.elem_stack:
            self._pop_elem()
        return u''.join(e.markdown() for e in self.elements)