Mercurial > public > sg101
view legacy/html2md.py @ 861:e4f8d87c3d30
Configure Markdown logger to reduce noise in logs.
Markdown is logging at the INFO level whenever it loads an extension.
This looks like it has been fixed in master at GitHub. But until then
we will explicitly configure the MARKDOWN logger to log at WARNING
or higher.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Mon, 01 Dec 2014 18:36:27 -0600 |
parents | ee87ea74d46b |
children |
line wrap: on
line source
"""
This module contains a class derived from Python's HTMLParser to convert HTML
to Markdown. Currently this class only supports those HTML tags that have
counterparts in BBCode used by stock phpBB 2.x.

In other words, this class was created to help convert data from a phpBB
forum to Markdown syntax and its scope is currently limited to that task.

Typical use::

    writer = MarkdownWriter()
    writer.feed(html)
    text = writer.markdown()

"""
try:  # Python 2 module names
    from HTMLParser import HTMLParser
    import htmlentitydefs
except ImportError:  # Python 3 renamed both modules
    from html.parser import HTMLParser
    import html.entities as htmlentitydefs

try:
    unichr  # Python 2 builtin
except NameError:  # Python 3: chr() already covers all of Unicode
    unichr = chr


# Let's call Markdown markup entities "elements" to avoid confusion
# with HTML tags.

class ElementBase(object):
    """
    Base class for all Markdown elements.

    An element accumulates text via add_data() and renders itself to
    Markdown via markdown(). ``attrs`` is the list of (name, value) pairs
    that HTMLParser supplies for a start tag; it is stored as a dict.

    """
    def __init__(self, attrs=None):
        self.data = u''
        self.attrs = dict(attrs) if attrs else {}

    def add_data(self, data):
        """Append a text fragment (or a rendered child) to this element."""
        self.data += data

    def markdown(self):
        """Return the Markdown text for this element."""
        return self.data


class TextElement(ElementBase):
    """
    TextElements represent text fragments not inside HTML tags. They are
    also used as a pass-through container for tags that have no Markdown
    mapping.

    """
    pass


class EmphasisElement(ElementBase):
    """
    An EmphasisElement is a Markdown element used to indicate emphasis and is
    represented by placing characters around text. E.g. _em_, **bold**

    """
    def __init__(self, tag, attrs):
        super(EmphasisElement, self).__init__(attrs)
        self.tag = tag

    def markdown(self):
        return u'%s%s%s' % (self.tag, self.data, self.tag)


def create_emphasis(tag):
    """
    Returns a function that creates an EmphasisElement using the supplied
    tag. The returned function takes the HTML attribute list as its only
    argument, which is the calling convention for element factories.

    """
    def inner(attrs):
        return EmphasisElement(tag, attrs)
    return inner


class HtmlElement(ElementBase):
    """
    Markdown also accepts HTML markup. This element represents a HTML tag that
    maps to itself in Markdown. Attributes are not reproduced in the output.

    """
    def __init__(self, tag, attrs):
        super(HtmlElement, self).__init__(attrs)
        self.tag = tag

    def markdown(self):
        return u'<%s>%s</%s>' % (self.tag, self.data, self.tag)


def create_html(tag):
    """
    Returns a function that creates a HtmlElement using the supplied tag.
    The returned function takes the HTML attribute list as its only argument.

    """
    def inner(attrs):
        return HtmlElement(tag, attrs)
    return inner


class QuoteElement(ElementBase):
    """
    Class to represent a blockquote in Markdown. Every line of the quoted
    text is prefixed with '> '.

    """
    def markdown(self):
        return u'> %s\n\n' % self.data.replace('\n', '\n> ')


class BreakElement(ElementBase):
    """
    Class to represent a linebreak in Markdown.

    """
    def markdown(self):
        # Markdown only treats a newline as a hard line break when the line
        # ends with at least two spaces.
        return u'  \n'


class DivElement(ElementBase):
    """
    This class maps a HTML <div> into a block of text surrounded by newlines.

    """
    def markdown(self):
        return u'\n%s\n' % self.data


class LinkElement(ElementBase):
    """
    This class maps HTML <a> tags into Markdown links.
    If no data is present, the actual href is used for the link text.
    If there is no href attribute, only the link text is produced.

    """
    def markdown(self):
        try:
            url = self.attrs['href']
        except KeyError:
            return self.data if self.data else u''

        text = self.data if self.data else url
        return u'[%s](%s)' % (text, url)


class ImageElement(ElementBase):
    """
    This class maps HTML <img> tags into Markdown.
    This element assumes no alt text is present, and simply uses the word
    'image' for the alt text.

    """
    def markdown(self):
        try:
            url = self.attrs['src']
        except KeyError:
            return u' (missing image) '
        return u'![image](%s)' % url


class CodeElement(ElementBase):
    """
    This class is used to create code blocks in Markdown.

    """
    def markdown(self):
        # A Markdown code block is made by indenting every line by at least
        # four spaces.
        return u'    %s\n' % self.data.replace('\n', '\n    ')


# List (ordered & unordered) support:

class ListElement(ElementBase):
    """
    This class creates Markdown for unordered lists. The bullet() method can be
    overridden to create ordered lists.

    Each call to add_data() contributes one list item. ``list_nesting`` is
    the 1-based depth of this list; MarkdownWriter sets it when the list is
    pushed onto its element stack.

    """
    def __init__(self, attrs=None):
        super(ListElement, self).__init__(attrs)
        self.items = []
        self.list_nesting = 1

    def add_data(self, data):
        self.items.append(data)

    def bullet(self):
        return u'*'

    def markdown(self):
        bullet_str = self.bullet()
        # Nested lists are indented four spaces per level below the first.
        indent = u' ' * (4 * (self.list_nesting - 1))
        return u''.join(
            u'\n%s%s %s' % (indent, bullet_str, item) for item in self.items)


class OrderedListElement(ListElement):
    """
    This class creates Markdown for ordered lists. Every item is numbered
    '1.'; Markdown renumbers ordered lists when rendering.

    """
    def bullet(self):
        return u'1.'


class ItemElement(ElementBase):
    """
    This element is used to represent ordered & unordered list items.

    """
    pass


###############################################################################
###############################################################################

class MarkdownWriter(HTMLParser):
    """
    This class is an HTMLParser that converts a subset of HTML to Markdown.

    Feed HTML with feed(), then call markdown() to obtain the converted
    text. Open elements are kept on a stack; when an element is closed its
    Markdown is appended to its parent, or to the list of finished top-level
    elements if it has no parent.

    """

    # Maps a HTML tag name to a callable that takes the tag's attribute list
    # and returns the element that renders it. Tags not listed here are
    # passed through as plain text containers.
    elem_factories = {
        'a': LinkElement,
        'blockquote': QuoteElement,
        'br': BreakElement,
        'div': DivElement,
        'em': create_emphasis('_'),
        'img': ImageElement,
        'li': ItemElement,
        'ol': OrderedListElement,
        'pre': CodeElement,
        's': create_html('strike'),
        'strong': create_emphasis('**'),
        'u': create_html('u'),
        'ul': ListElement,
    }

    # HTML void elements never have content or a closing tag. They must be
    # closed as soon as they are opened, otherwise "<br>" (without a
    # trailing slash) would stay on the stack and swallow the text that
    # follows it.
    void_tags = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link',
        'meta', 'param', 'source', 'track', 'wbr',
    ])

    def __init__(self):
        try:
            # Newer Python 3 parsers decode character references themselves
            # unless told otherwise; keep them flowing through
            # handle_entityref() / handle_charref() on every version.
            HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            # Python 2 and early Python 3 do not accept the argument.
            HTMLParser.__init__(self)
        self.reset()

    def handle_starttag(self, tag, attrs):
        """Open the element for ``tag``; void elements are closed at once."""
        if tag in self.elem_factories:
            factory = self.elem_factories[tag]
            element = factory(attrs)
        else:
            element = TextElement()

        self._push_elem(element)
        if tag in self.void_tags:
            self._pop_elem()

    def handle_endtag(self, tag):
        """Close the innermost open element."""
        if tag in self.void_tags:
            # Already closed by handle_starttag(). This also covers the
            # "<br />" form, which the parser reports as a start tag
            # followed by an end tag.
            return
        self._pop_elem()

    def handle_data(self, data):
        """Add text to the innermost open element."""
        if len(self.elem_stack) == 0:
            self._push_elem(TextElement())
        self._add_data(data)

    def handle_entityref(self, name):
        """
        Convert a named entity such as &amp; to its character. Unknown
        entity names are emitted as the bare name.

        """
        try:
            text = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            text = name
        self.handle_data(text)

    def handle_charref(self, name):
        """
        Convert a numeric character reference to its character.

        ``name`` is the text between '&#' and ';', so it is either decimal
        digits ('65') or an 'x' / 'X' followed by hexadecimal digits ('x41').
        References that do not name a usable code point are emitted
        literally instead of aborting the conversion.

        """
        try:
            if name[:1] in (u'x', u'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            text = u'&#%s;' % name
        self.handle_data(text)

    def reset(self):
        """Discard all parser state and any conversion in progress."""
        HTMLParser.reset(self)
        self.elem_stack = []    # currently open elements, innermost last
        self.elements = []      # finished top-level elements, in order
        self.list_nesting = 0   # number of currently open lists

    def _push_elem(self, tag):
        """Open the element ``tag`` (an ElementBase instance)."""
        # A text run ends as soon as any other element starts.
        if len(self.elem_stack) and isinstance(self.elem_stack[-1], TextElement):
            self._pop_elem()
        if isinstance(tag, ListElement):
            self.list_nesting += 1
            tag.list_nesting = self.list_nesting
        self.elem_stack.append(tag)

    def _pop_elem(self):
        """Close the innermost element and hand its Markdown to its parent."""
        try:
            element = self.elem_stack.pop()
        except IndexError:
            # pop from empty list => bad HTML input; ignore it
            return

        if isinstance(element, ListElement):
            self.list_nesting -= 1

        if len(self.elem_stack):
            self.elem_stack[-1].add_data(element.markdown())
        else:
            self.elements.append(element)

    def _add_data(self, data):
        self.elem_stack[-1].add_data(data)

    def markdown(self):
        """
        Return the Markdown for everything fed to the parser so far. Any
        elements still open are closed first.

        """
        while len(self.elem_stack):
            self._pop_elem()
        text_list = [e.markdown() for e in self.elements]
        return u''.join(text_list)