view legacy/html2md.py @ 850:202e0828aafe

Update requirements.txt for latest html5lib.
author Brian Neal <bgneal@gmail.com>
date Thu, 30 Oct 2014 19:40:26 -0500
parents ee87ea74d46b
children
line wrap: on
line source
"""
This module contains a class derived from Python's HTMLParser to convert HTML to
Markdown. Currently this class only supports those HTML tags that have
counterparts in BBCode used by stock phpBB 2.x.

In other words, this class was created to help convert data from a phpBB
forum to Markdown syntax and its scope is currently limited to that task.

"""
from HTMLParser import HTMLParser
import htmlentitydefs


# Let's call Markdown markup entities "elements" to avoid confusion
# with HTML tags.

class ElementBase(object):
    """
    Common base for every Markdown element.

    An element accumulates text in ``data`` and keeps the attributes of the
    HTML tag it was created from in ``attrs``. Subclasses override markdown()
    to decorate the accumulated text.

    """
    def __init__(self, attrs=None):
        # attrs may be any value accepted by dict() (e.g. a sequence of
        # (name, value) pairs); a falsy value yields an empty dictionary.
        if attrs:
            self.attrs = dict(attrs)
        else:
            self.attrs = {}
        self.data = u''

    def add_data(self, data):
        """Append data to the text collected so far."""
        self.data = self.data + data

    def markdown(self):
        """Return the Markdown for this element; the base class adds nothing."""
        return self.data


class TextElement(ElementBase):
    """
    A fragment of plain text that is not inside an HTML tag.

    Nothing is overridden: the Markdown for a TextElement is exactly the text
    that was added to it.

    """


class EmphasisElement(ElementBase):
    """
    Markdown element for emphasized text.

    Emphasis is expressed by placing the same marker characters on both sides
    of the text, e.g. _em_ or **bold**. The marker is supplied as ``tag``.

    """
    def __init__(self, tag, attrs):
        self.tag = tag
        super(EmphasisElement, self).__init__(attrs)

    def markdown(self):
        """Return the collected text wrapped in the emphasis marker."""
        marker = self.tag
        return u'%s%s%s' % (marker, self.data, marker)


def create_emphasis(tag):
    """
    Build a factory for EmphasisElements that all use the marker ``tag``.

    The returned callable takes the HTML attributes as its only argument and
    returns a new EmphasisElement.

    """
    return lambda attrs: EmphasisElement(tag, attrs)


class HtmlElement(ElementBase):
    """
    Element for HTML tags that are passed through to Markdown as HTML.

    Markdown accepts inline HTML, so the text is simply wrapped in an opening
    and closing tag named ``tag``. The attributes of the original tag are not
    written out.

    """
    def __init__(self, tag, attrs):
        self.tag = tag
        super(HtmlElement, self).__init__(attrs)

    def markdown(self):
        """Return the collected text enclosed in <tag>...</tag>."""
        name = self.tag
        return u'<%s>%s</%s>' % (name, self.data, name)


def create_html(tag):
    """
    Build a factory for HtmlElements that all emit the HTML tag ``tag``.

    The returned callable takes the HTML attributes as its only argument and
    returns a new HtmlElement.

    """
    return lambda attrs: HtmlElement(tag, attrs)


class QuoteElement(ElementBase):
    """
    Markdown blockquote.

    Every line of the collected text is prefixed with '> ' and the quote is
    followed by a blank line.

    """
    def markdown(self):
        # Carry the quote marker over to each continuation line.
        quoted = self.data.replace('\n', '\n> ')
        return u'> %s\n\n' % quoted


class BreakElement(ElementBase):
    """
    Markdown hard line break.

    """
    def markdown(self):
        # Two trailing spaces before the newline force a line break in
        # Markdown. Any data added to this element is not emitted.
        return u'  \n'


class DivElement(ElementBase):
    """
    Markdown for a HTML <div>: the text with one newline before it and one
    newline after it.

    """
    def markdown(self):
        body = self.data
        return u'\n%s\n' % body


class LinkElement(ElementBase):
    """
    This class maps HTML <a> tags into Markdown links.
    If no data is present, the actual href is used for the link text.
    If the tag has no href, or the href was written without a value, only the
    link text is emitted.

    """
    def markdown(self):
        # HTMLParser reports an attribute written without a value (e.g.
        # <a href>) with a value of None. Treat that the same as a missing
        # href so the output never contains a literal "(None)" target.
        url = self.attrs.get('href')
        if url is None:
            return self.data or u''

        text = self.data if self.data else url
        return u'[%s](%s)' % (text, url)


class ImageElement(ElementBase):
    """
    This class maps HTML <img> tags into Markdown.
    This element assumes no alt text is present, and simply uses the word
    'image' for the alt text.
    If the tag has no src, or the src was written without a value, a
    ' (missing image) ' placeholder is emitted instead.

    """
    def markdown(self):
        # HTMLParser reports an attribute written without a value (e.g.
        # <img src>) with a value of None. Treat that the same as a missing
        # src so the output never contains a literal "(None)" target.
        url = self.attrs.get('src')
        if url is None:
            return u' (missing image) '
        return u'![image](%s)' % url


class CodeElement(ElementBase):
    """
    Markdown code block.

    Every line of the collected text is indented by four spaces and the block
    is terminated with a newline.

    """
    def markdown(self):
        # Carry the four-space indent over to each continuation line.
        indented = self.data.replace('\n', '\n    ')
        return u'    %s\n' % indented


# List (ordered & unordered) support:

class ListElement(ElementBase):
    """
    Markdown unordered list.

    Each piece of data added to the list becomes one list item. Subclasses
    can override bullet() to produce ordered lists.

    """
    def __init__(self, attrs=None):
        super(ListElement, self).__init__(attrs)
        # Depth of this list; 1 is the outermost level.
        self.list_nesting = 1
        self.items = []

    def add_data(self, data):
        """Record data as a new list item."""
        self.items.append(data)

    def bullet(self):
        """Return the marker placed in front of every item."""
        return u'*'

    def markdown(self):
        """Return one line per item, indented 4 spaces per nesting level."""
        marker = self.bullet()
        padding = u' ' * (4 * (self.list_nesting - 1))
        lines = [u'\n%s%s %s' % (padding, marker, entry)
                 for entry in self.items]
        return u''.join(lines)


class OrderedListElement(ListElement):
    """
    Markdown ordered list: a ListElement whose items are numbered instead of
    bulleted.

    """
    def bullet(self):
        """Return the marker placed in front of every item."""
        # Every item uses the same "1." marker.
        return '1.'


class ItemElement(ElementBase):
    """
    A single item of an ordered or unordered list.

    Nothing is overridden: the Markdown for an ItemElement is exactly the
    text that was added to it.

    """

###############################################################################
###############################################################################

class MarkdownWriter(HTMLParser):
    """
    This class is an HTMLParser that converts a subset of HTML to Markdown.

    Feed HTML to an instance and then call markdown() to obtain the converted
    text. Tags listed in elem_factories are turned into the matching Markdown
    element; the content of any other tag is treated as plain text.

    """

    # Maps an HTML tag name to a callable that accepts the tag's attributes
    # and returns the Markdown element for that tag.
    elem_factories = {
        'a': LinkElement,
        'blockquote': QuoteElement,
        'br': BreakElement,
        'div': DivElement,
        'em': create_emphasis('_'),
        'img': ImageElement,
        'li': ItemElement,
        'ol': OrderedListElement,
        'pre': CodeElement,
        's': create_html('strike'),
        'strong': create_emphasis('**'),
        'u': create_html('u'),
        'ul': ListElement,
    }

    # HTML void elements. They never have content or an end tag, so they are
    # closed as soon as they are opened.
    void_tags = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link',
        'meta', 'param', 'source', 'track', 'wbr',
    ])

    def __init__(self):
        HTMLParser.__init__(self)
        self.reset()

    def handle_starttag(self, tag, attrs):
        """Open the Markdown element that corresponds to the start tag."""
        factory = self.elem_factories.get(tag)
        if factory is not None:
            element = factory(attrs)
        else:
            element = TextElement()

        self._push_elem(element)

        # A void tag such as <br> or <img> may be written without a trailing
        # slash, in which case no end tag event ever arrives. Close it right
        # away; otherwise it would stay on the stack and swallow the text
        # that follows it.
        if tag in self.void_tags:
            self._pop_elem()

    def handle_endtag(self, tag):
        """Close the element on top of the stack."""
        # Void tags were already closed in handle_starttag(). This also covers
        # the end tag event generated for the <br /> form.
        if tag in self.void_tags:
            return
        self._pop_elem()

    def handle_data(self, data):
        """Add text to the current element, opening a TextElement if needed."""
        if not self.elem_stack:
            self._push_elem(TextElement())
        self._add_data(data)

    def handle_entityref(self, name):
        """Convert a named entity (e.g. &amp;) to its character."""
        try:
            text = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            # Unknown entity: emit the bare name.
            text = name
        self.handle_data(text)

    def handle_charref(self, name):
        """
        Convert a numeric character reference to its character.

        name is the text between '&#' and ';'. Decimal ('62') and hexadecimal
        ('x3E') references are both supported. A reference that does not
        denote a representable character becomes U+FFFD.

        """
        try:
            if name[:1] in (u'x', u'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            # Malformed number or a code point unichr() cannot represent.
            text = u'\ufffd'
        self.handle_data(text)

    def reset(self):
        """Reset the parser and discard all elements collected so far."""
        HTMLParser.reset(self)
        self.elem_stack = []    # elements that are currently open
        self.elements = []      # finished top-level elements, in order
        self.list_nesting = 0   # number of lists that are currently open

    def _push_elem(self, tag):
        """
        Open a new element. Despite its name, tag is a Markdown element
        object, not a tag name.

        """
        # An open TextElement ends as soon as another element starts.
        if self.elem_stack and isinstance(self.elem_stack[-1], TextElement):
            self._pop_elem()
        if isinstance(tag, ListElement):
            self.list_nesting += 1
            tag.list_nesting = self.list_nesting
        self.elem_stack.append(tag)

    def _pop_elem(self):
        """
        Close the element on top of the stack. Its Markdown is added to the
        enclosing element, or the element is stored as a finished top-level
        element if nothing encloses it.

        """
        try:
            element = self.elem_stack.pop()
        except IndexError:
            # pop from empty list => bad HTML input; ignore it
            return

        if isinstance(element, ListElement):
            self.list_nesting -= 1
        if self.elem_stack:
            self.elem_stack[-1].add_data(element.markdown())
        else:
            self.elements.append(element)

    def _add_data(self, data):
        """Add data to the element on top of the stack."""
        self.elem_stack[-1].add_data(data)

    def markdown(self):
        """Return the Markdown for everything parsed so far."""
        # Close whatever is still open, e.g. trailing text or unclosed tags.
        while self.elem_stack:
            self._pop_elem()
        return u''.join(e.markdown() for e in self.elements)