annotate legacy/html2md.py @ 861:e4f8d87c3d30

Configure Markdown logger to reduce noise in logs. Markdown is logging at the INFO level whenever it loads an extension. This looks like it has been fixed in master at GitHub. But until then we will explicitly configure the MARKDOWN logger to log at WARNING or higher.
author Brian Neal <bgneal@gmail.com>
date Mon, 01 Dec 2014 18:36:27 -0600
parents ee87ea74d46b
children
rev   line source
bgneal@290 1 """
bgneal@290 2 This module contains a class derived from Python's HTMLParser to convert HTML to
bgneal@290 3 Markdown. Currently this class only supports those HTML tags that have counter-
bgneal@290 4 parts in BBCode used by stock phpBB 2.x.
bgneal@290 5
bgneal@290 6 In other words, this class was created to help convert data from a phpBB
bgneal@290 7 forum to Markdown syntax and its scope is currently limited to that task.
bgneal@290 8
bgneal@290 9 """
bgneal@290 10 from HTMLParser import HTMLParser
bgneal@290 11 import htmlentitydefs
bgneal@290 12
bgneal@290 13
bgneal@290 14 # Let's call Markdown markup entities "elements" to avoid confusion
bgneal@290 15 # with HTML tags.
bgneal@290 16
class ElementBase(object):
    """
    Common base for every Markdown element.

    An element accumulates text in ``data`` and remembers the attributes of
    the HTML tag it was created from in ``attrs``. Subclasses override
    markdown() to decorate the accumulated text.

    """
    def __init__(self, attrs=None):
        # attrs is an iterable of (name, value) pairs or None; a missing or
        # empty value yields an empty attribute dictionary.
        self.attrs = {} if not attrs else dict(attrs)
        self.data = u''

    def add_data(self, data):
        """Append data to the text collected so far."""
        self.data = self.data + data

    def markdown(self):
        """Return the Markdown for this element; here, the raw text."""
        return self.data
bgneal@290 31
bgneal@290 32
class TextElement(ElementBase):
    """
    A plain run of text that is not wrapped in any HTML tag. It adds nothing
    to ElementBase: its Markdown is the collected text itself.

    """
bgneal@290 38
bgneal@290 39
class EmphasisElement(ElementBase):
    """
    Markdown emphasis: the collected text is wrapped in the same marker on
    both sides, e.g. _em_ or **bold**.

    """
    def __init__(self, tag, attrs):
        # tag is the marker placed before and after the text.
        self.tag = tag
        super(EmphasisElement, self).__init__(attrs)

    def markdown(self):
        """Return the text surrounded by the emphasis marker."""
        marker = self.tag
        return u'%s%s%s' % (marker, self.data, marker)
bgneal@290 52
bgneal@290 53
def create_emphasis(tag):
    """
    Build a factory for EmphasisElement objects bound to one marker.

    The returned callable takes the HTML attribute list and returns an
    EmphasisElement that wraps its text in ``tag``.

    """
    def factory(attrs):
        element = EmphasisElement(tag, attrs)
        return element
    return factory
bgneal@290 63
bgneal@290 64
class HtmlElement(ElementBase):
    """
    An element emitted as literal HTML, since Markdown accepts inline HTML
    markup. The output is the collected text between an opening and a
    closing tag named ``tag``; attributes are not written out.

    """
    def __init__(self, tag, attrs):
        # tag is the name used for the emitted opening and closing tags.
        self.tag = tag
        super(HtmlElement, self).__init__(attrs)

    def markdown(self):
        """Return the text wrapped in <tag>...</tag>."""
        name = self.tag
        return u'<%s>%s</%s>' % (name, self.data, name)
bgneal@290 77
bgneal@290 78
def create_html(tag):
    """
    Build a factory for HtmlElement objects bound to one tag name.

    The returned callable takes the HTML attribute list and returns an
    HtmlElement that renders as <tag>...</tag>.

    """
    def factory(attrs):
        element = HtmlElement(tag, attrs)
        return element
    return factory
bgneal@290 87
bgneal@290 88
class QuoteElement(ElementBase):
    """
    A Markdown blockquote. Every line of the collected text is prefixed with
    '> ' and the quote is followed by a blank line.

    """
    def markdown(self):
        # Prefix the continuation lines; the first line gets its prefix from
        # the format string below.
        quoted = self.data.replace('\n', '\n> ')
        return u'> %s\n\n' % quoted
bgneal@290 96
bgneal@290 97
class BreakElement(ElementBase):
    """
    Class to represent a linebreak in Markdown.

    Any data added to this element is ignored; the output is always the
    line-break sequence.

    """
    def markdown(self):
        # A Markdown hard line break is two (or more) trailing spaces
        # followed by a newline; a single space is just ordinary whitespace.
        return u'  \n'
bgneal@290 105
bgneal@290 106
class DivElement(ElementBase):
    """
    A HTML <div> rendered as its text with one newline before and one after.

    """
    def markdown(self):
        body = self.data
        return u'\n%s\n' % body
bgneal@290 114
bgneal@290 115
class LinkElement(ElementBase):
    """
    This class maps HTML <a> tags into Markdown links.
    If no data is present, the actual href is used for the link text.
    If the tag has no usable href (attribute absent, valueless or empty),
    only the link text is emitted.

    """
    def markdown(self):
        # .get() covers a missing attribute; the truth test also rejects a
        # valueless href (None) and an empty one, which would otherwise
        # render as "[text](None)" or "[text]()".
        url = self.attrs.get('href')
        if not url:
            return self.data

        text = self.data if self.data else url
        return u'[%s](%s)' % (text, url)
bgneal@290 130
bgneal@290 131
class ImageElement(ElementBase):
    """
    This class maps HTML <img> tags into Markdown.
    This element assumes no alt text is present, and simply uses the word
    'image' for the alt text. If the tag has no usable src (attribute absent,
    valueless or empty), a ' (missing image) ' placeholder is emitted.

    """
    def markdown(self):
        # .get() covers a missing attribute; the truth test also rejects a
        # valueless src (None) and an empty one, which would otherwise render
        # as "![image](None)" or "![image]()".
        url = self.attrs.get('src')
        if not url:
            return u' (missing image) '
        return u'![image](%s)' % url
bgneal@290 145
bgneal@290 146
class CodeElement(ElementBase):
    """
    This class is used to create code blocks in Markdown.

    Every line of the collected text is indented by four spaces and the
    block is terminated with a newline.

    """
    # Markdown only recognises a code block when each line is indented by at
    # least four spaces (or one tab).
    INDENT = u'    '

    def markdown(self):
        indented = self.data.replace(u'\n', u'\n' + self.INDENT)
        return u'%s%s\n' % (self.INDENT, indented)
bgneal@290 154
bgneal@290 155
bgneal@290 156 # List (ordered & unordered) support:
bgneal@290 157
class ListElement(ElementBase):
    """
    This class creates Markdown for unordered lists. The bullet() method can be
    overridden to create ordered lists.

    Unlike other elements, text is not concatenated: every add_data() call
    becomes one list item (this includes whitespace-only strings).

    """
    def __init__(self, attrs=None):
        super(ListElement, self).__init__(attrs)
        self.items = []
        # Depth of this list; 1 produces no indentation and every further
        # level indents the items by four more spaces (see markdown()).
        self.list_nesting = 1

    def add_data(self, data):
        """Record data as a new list item."""
        self.items.append(data)

    def bullet(self):
        """Return the marker that starts every item."""
        return u'*'

    def markdown(self):
        """Return one line per item, each preceded by a newline."""
        prefix = u'\n%s%s ' % (u' ' * (4 * (self.list_nesting - 1)),
                               self.bullet())
        # join() avoids rebuilding the result string for every item.
        return u''.join(prefix + item for item in self.items)
bgneal@290 182
bgneal@290 183
class OrderedListElement(ListElement):
    """
    This class creates Markdown for ordered lists.

    Every item uses the marker '1.'.

    """
    def bullet(self):
        # Unicode literal for consistency with ListElement.bullet().
        return u'1.'
bgneal@290 191
bgneal@290 192
class ItemElement(ElementBase):
    """
    A single item of an ordered or unordered list. It adds nothing to
    ElementBase: its Markdown is the collected text itself.

    """
bgneal@290 199
bgneal@290 200 ###############################################################################
bgneal@290 201 ###############################################################################
bgneal@290 202
class MarkdownWriter(HTMLParser):
    """
    This class is an HTMLParser that converts a subset of HTML to Markdown.

    Feed HTML to the parser, then call markdown() to obtain the converted
    text. Open elements are kept on a stack; when an element is closed its
    Markdown is appended to its parent, or to the list of finished top-level
    elements when it has no parent. Tags without an entry in elem_factories
    contribute only their text.

    """

    # Maps a HTML tag name to a callable that takes the tag's attribute list
    # and returns the element object representing that tag.
    elem_factories = {
        'a': LinkElement,
        'blockquote': QuoteElement,
        'br': BreakElement,
        'div': DivElement,
        'em': create_emphasis('_'),
        'img': ImageElement,
        'li': ItemElement,
        'ol': OrderedListElement,
        'pre': CodeElement,
        's': create_html('strike'),
        'strong': create_emphasis('**'),
        'u': create_html('u'),
        'ul': ListElement,
    }

    # Tags that never have content. They are closed as soon as they are
    # opened, so "<br>" and "<br />" behave the same and text following
    # them is not swallowed by the element.
    void_tags = frozenset(['br', 'img'])

    def __init__(self):
        HTMLParser.__init__(self)
        self.reset()

    def handle_starttag(self, tag, attrs):
        """Open a new element for tag; unknown tags become TextElements."""
        factory = self.elem_factories.get(tag)
        if factory is not None:
            element = factory(attrs)
        else:
            element = TextElement()

        self._push_elem(element)
        if tag in self.void_tags:
            # No end tag is required for these; close the element right away.
            self._pop_elem()

    def handle_endtag(self, tag):
        """Close the innermost open element."""
        if tag in self.void_tags:
            # Already closed in handle_starttag(); popping again would close
            # the parent element instead.
            return
        self._pop_elem()

    def handle_data(self, data):
        """Add text to the innermost open element, opening one if needed."""
        if not self.elem_stack:
            self._push_elem(TextElement())
        self._add_data(data)

    def handle_entityref(self, name):
        """Convert a named entity such as &amp; into its character."""
        try:
            text = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            # Unknown entity: keep the reference intact rather than emitting
            # the bare name without its '&' and ';'.
            text = u'&%s;' % name
        self.handle_data(text)

    def handle_charref(self, name):
        """Convert a numeric reference (&#65; or &#x41;) into its character."""
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            # Malformed or out-of-range reference; bad input must not abort
            # the conversion, so pass the reference through unchanged.
            text = u'&#%s;' % name
        self.handle_data(text)

    def reset(self):
        """Reset the parser and discard all conversion state."""
        HTMLParser.reset(self)
        self.elem_stack = []    # elements that are currently open
        self.elements = []      # finished top-level elements, in order
        self.list_nesting = 0   # number of ListElements currently open

    def _push_elem(self, tag):
        """
        Open an element. Despite its name, ``tag`` is the element object.

        """
        # A TextElement never has children: close a pending one first.
        if self.elem_stack and isinstance(self.elem_stack[-1], TextElement):
            self._pop_elem()
        if isinstance(tag, ListElement):
            self.list_nesting += 1
            tag.list_nesting = self.list_nesting
        self.elem_stack.append(tag)

    def _pop_elem(self):
        """Close the innermost open element and hand its Markdown upwards."""
        try:
            element = self.elem_stack.pop()
        except IndexError:
            # pop from empty list => bad HTML input; ignore it
            return

        if isinstance(element, ListElement):
            self.list_nesting -= 1
        if self.elem_stack:
            self.elem_stack[-1].add_data(element.markdown())
        else:
            self.elements.append(element)

    def _add_data(self, data):
        """Add data to the innermost open element; the stack must not be empty."""
        self.elem_stack[-1].add_data(data)

    def markdown(self):
        """Close any elements still open and return the Markdown text."""
        while self.elem_stack:
            self._pop_elem()
        return u''.join(e.markdown() for e in self.elements)