"""
This module contains a class derived from Python's HTMLParser to convert HTML to
Markdown. Currently this class only supports those HTML tags that have counter-
parts in BBCode used by stock phpBB 2.x.

In other words, this class was created to help convert data from a phpBB
forum to Markdown syntax and its scope is currently limited to that task.

The module was written for Python 2; the import fallbacks below let the same
code run unchanged on Python 3.

"""
try:
    # Python 2 names are tried first so behavior there is untouched.
    from HTMLParser import HTMLParser
    import htmlentitydefs
except ImportError:
    # Python 3 moved both modules into the ``html`` package.
    from html.parser import HTMLParser
    import html.entities as htmlentitydefs

try:
    unichr
except NameError:
    # Python 3: chr() already returns a unicode character.
    unichr = chr


# Let's call Markdown markup entities "elements" to avoid confusion
# with HTML tags.

class ElementBase(object):
    """
    Base class for all Markdown elements.

    An element accumulates the text found inside its HTML tag in ``data``
    and keeps the tag's attributes in the ``attrs`` dictionary.

    """
    def __init__(self, attrs=None):
        # attrs arrives from HTMLParser as a list of (name, value) pairs.
        self.data = u''
        self.attrs = dict(attrs) if attrs else {}

    def add_data(self, data):
        """Append a text fragment to this element's content."""
        self.data += data

    def markdown(self):
        """Return the Markdown text for this element."""
        return self.data


class TextElement(ElementBase):
    """
    TextElements represent text fragments not inside HTML tags. They are
    also used as pass-through containers for tags that have no factory.

    """
    pass


class EmphasisElement(ElementBase):
    """
    An EmphasisElement is a Markdown element used to indicate emphasis and is
    represented by placing characters around text. E.g. _em_, **bold**

    """
    def __init__(self, tag, attrs):
        super(EmphasisElement, self).__init__(attrs)
        # Here "tag" is the Markdown marker (e.g. '_'), not an HTML tag name.
        self.tag = tag

    def markdown(self):
        return u'%s%s%s' % (self.tag, self.data, self.tag)


def create_emphasis(tag):
    """
    Returns a function that creates an EmphasisElement using the supplied
    tag.

    """
    def inner(attrs):
        return EmphasisElement(tag, attrs)
    return inner


class HtmlElement(ElementBase):
    """
    Markdown also accepts HTML markup. This element represents a HTML tag that
    maps to itself in Markdown.

    """
    def __init__(self, tag, attrs):
        super(HtmlElement, self).__init__(attrs)
        self.tag = tag

    def markdown(self):
        # Three placeholders for three arguments: opening tag, content and
        # closing tag. Attributes are not reproduced.
        return u'<%s>%s</%s>' % (self.tag, self.data, self.tag)


def create_html(tag):
    """
    Returns a function that creates a HtmlElement using the supplied tag.

    """
    def inner(attrs):
        return HtmlElement(tag, attrs)
    return inner


class QuoteElement(ElementBase):
    """
    Class to represent a blockquote in Markdown.

    """
    def markdown(self):
        # Every line of the quoted text needs its own '> ' prefix.
        return u'> %s\n\n' % self.data.replace('\n', '\n> ')


class BreakElement(ElementBase):
    """
    Class to represent a linebreak in Markdown.

    """
    def markdown(self):
        # A Markdown hard line break is two (or more) trailing spaces
        # followed by a newline; a single space is just ordinary text.
        return u'  \n'


class DivElement(ElementBase):
    """
    This class maps a HTML <div> into a block of text surrounded by newlines.

    """
    def markdown(self):
        return u'\n%s\n' % self.data


class LinkElement(ElementBase):
    """
    This class maps HTML <a> tags into Markdown links.
    If no data is present, the actual href is used for the link text.
    If there is no usable href, only the link text is produced.

    """
    def markdown(self):
        # .get() also covers a valueless attribute (<a href>), which the
        # parser reports as None; an empty href is treated the same way.
        url = self.attrs.get('href')
        if not url:
            return self.data
        return u'[%s](%s)' % (self.data or url, url)


class ImageElement(ElementBase):
    """
    This class maps HTML <img> tags into Markdown.
    This element assumes no alt text is present, and simply uses the word
    'image' for the alt text.

    """
    def markdown(self):
        # A missing, valueless or empty src cannot produce a usable image.
        url = self.attrs.get('src')
        if not url:
            return u' (missing image) '
        return u'![image](%s)' % url


class CodeElement(ElementBase):
    """
    This class is used to create code blocks in Markdown.

    """
    def markdown(self):
        # Markdown code blocks are indented by four spaces on every line.
        return u'    %s\n' % self.data.replace('\n', '\n    ')


# List (ordered & unordered) support:

class ListElement(ElementBase):
    """
    This class creates Markdown for unordered lists. The bullet() method can be
    overridden to create ordered lists.

    """
    def __init__(self, attrs=None):
        super(ListElement, self).__init__(attrs)
        self.items = []
        # 1 for a top-level list; MarkdownWriter overwrites this with the
        # actual depth when the list is pushed on its stack.
        self.list_nesting = 1

    def add_data(self, data):
        """
        Record one list item. Whitespace-only fragments (e.g. the newline
        between <ul> and <li>) are skipped so they do not become empty
        bullets.

        """
        if data.strip():
            self.items.append(data)

    def bullet(self):
        """Return the marker placed in front of every item."""
        return u'*'

    def markdown(self):
        bullet_str = self.bullet()
        # Nested lists are indented four spaces per nesting level.
        indent = u' ' * (4 * (self.list_nesting - 1))
        return u''.join(
            u'\n%s%s %s' % (indent, bullet_str, item) for item in self.items)


class OrderedListElement(ListElement):
    """
    This class creates Markdown for ordered lists.

    """
    def bullet(self):
        # Every item uses the same '1.' marker.
        return u'1.'


class ItemElement(ElementBase):
    """
    This element is used to represent ordered & unordered list items.

    """
    pass

###############################################################################
###############################################################################

class MarkdownWriter(HTMLParser):
    """
    This class is an HTMLParser that converts a subset of HTML to Markdown.

    Feed HTML with feed(), then call markdown() to obtain the result. Open
    elements are kept on ``elem_stack``; finished top-level elements are
    collected in ``elements``.

    """

    # Maps an HTML tag name to a callable taking the tag's attribute list
    # and returning the element that renders it.
    elem_factories = {
        'a': LinkElement,
        'blockquote': QuoteElement,
        'br': BreakElement,
        'div': DivElement,
        'em': create_emphasis('_'),
        'img': ImageElement,
        'li': ItemElement,
        'ol': OrderedListElement,
        'pre': CodeElement,
        's': create_html('strike'),
        'strong': create_emphasis('**'),
        'u': create_html('u'),
        'ul': ListElement,
    }

    # HTML void elements never have content or a closing tag, so they are
    # closed as soon as they are opened (see handle_starttag).
    void_tags = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    ])

    def __init__(self):
        try:
            # Python 3.4+: keep character references flowing through
            # handle_entityref / handle_charref, as on Python 2.
            HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            # Parsers that do not know the keyword (Python 2).
            HTMLParser.__init__(self)
        self.reset()

    def handle_starttag(self, tag, attrs):
        factory = self.elem_factories.get(tag)
        if factory is not None:
            element = factory(attrs)
        else:
            # Unsupported tags degrade to plain text containers.
            element = TextElement()

        self._push_elem(element)

        if tag in self.void_tags:
            # Without this, text following a bare <br> or <img> would be
            # added to that element, whose markdown() ignores its data.
            self._pop_elem()

    def handle_endtag(self, tag):
        # Void elements were already closed in handle_starttag; this also
        # covers the end half of a self-closing tag such as <br />.
        if tag in self.void_tags:
            return
        self._pop_elem()

    def handle_data(self, data):
        if not self.elem_stack:
            # Text outside of any tag gets its own container.
            self._push_elem(TextElement())
        self._add_data(data)

    def handle_entityref(self, name):
        try:
            text = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            # Unknown entity: emit its bare name.
            text = name
        self.handle_data(text)

    def handle_charref(self, name):
        # name is the text between '&#' and ';': decimal digits, or hex
        # digits prefixed with 'x' / 'X'.
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            # Malformed or unrepresentable reference: pass it through as
            # literal text instead of aborting the conversion.
            text = u'&#%s;' % name
        self.handle_data(text)

    def reset(self):
        HTMLParser.reset(self)
        self.elem_stack = []    # currently open elements, innermost last
        self.elements = []      # completed top-level elements, in order
        self.list_nesting = 0   # number of currently open lists

    def _push_elem(self, tag):
        # NOTE: despite its name, "tag" is an element instance.
        # A text container cannot hold other elements, so close it first.
        if self.elem_stack and isinstance(self.elem_stack[-1], TextElement):
            self._pop_elem()
        if isinstance(tag, ListElement):
            self.list_nesting += 1
            tag.list_nesting = self.list_nesting
        self.elem_stack.append(tag)

    def _pop_elem(self):
        try:
            element = self.elem_stack.pop()
        except IndexError:
            # pop from empty list => bad HTML input; ignore it
            return

        if isinstance(element, ListElement):
            self.list_nesting -= 1
        if self.elem_stack:
            # Nested element: its rendered text becomes parent content.
            self.elem_stack[-1].add_data(element.markdown())
        else:
            self.elements.append(element)

    def _add_data(self, data):
        self.elem_stack[-1].add_data(data)

    def markdown(self):
        """
        Close any elements that are still open and return the Markdown
        text for everything parsed so far.

        """
        while self.elem_stack:
            self._pop_elem()
        return u''.join(e.markdown() for e in self.elements)