annotate legacy/html2md.py @ 1203:8cd15df9b563

Controlling the xapian install script in tools.
author Brian Neal <bgneal@gmail.com>
date Sat, 04 Jan 2025 14:19:19 -0600
parents ee87ea74d46b
children
rev   line source
bgneal@290 1 """
bgneal@290 2 This module contains a class derived from Python's HTMLParser to convert HTML to
bgneal@290 3 Markdown. Currently this class only supports those HTML tags that have counter-
bgneal@290 4 parts in BBCode used by stock phpBB 2.x.
bgneal@290 5
bgneal@290 6 In other words, this class was created to help convert data from a phpBB
bgneal@290 7 forum to Markdown syntax and its scope is currently limited to that task.
bgneal@290 8
bgneal@290 9 """
bgneal@290 10 from HTMLParser import HTMLParser
bgneal@290 11 import htmlentitydefs
bgneal@290 12
bgneal@290 13
bgneal@290 14 # Let's call Markdown markup entities "elements" to avoid confusion
bgneal@290 15 # with HTML tags.
bgneal@290 16
class ElementBase(object):
    """
    Common ancestor of every Markdown element.

    An element accumulates text in ``data`` and keeps the attributes of the
    HTML tag it was created for in ``attrs``. Subclasses override markdown()
    to decorate the accumulated text.

    """
    def __init__(self, attrs=None):
        # attrs may be any value accepted by dict(), e.g. a list of
        # (name, value) pairs; a falsy value yields an empty dict.
        self.attrs = {} if not attrs else dict(attrs)
        self.data = u''

    def add_data(self, data):
        """Append data to the text collected so far."""
        self.data = self.data + data

    def markdown(self):
        """Return the collected text unchanged."""
        return self.data
bgneal@290 31
bgneal@290 32
class TextElement(ElementBase):
    """
    A run of plain text that is not wrapped by a recognised HTML tag.

    It adds nothing to ElementBase; the distinct type lets other code tell
    text fragments apart from tag elements.

    """
bgneal@290 38
bgneal@290 39
class EmphasisElement(ElementBase):
    """
    Markdown emphasis: the collected text wrapped in the same marker on both
    sides, e.g. _em_ or **bold**.

    """
    def __init__(self, tag, attrs):
        # tag is the marker string placed before and after the text.
        self.tag = tag
        super(EmphasisElement, self).__init__(attrs)

    def markdown(self):
        """Return the collected text surrounded by the marker."""
        marker = self.tag
        return u'%s%s%s' % (marker, self.data, marker)
bgneal@290 52
bgneal@290 53
def create_emphasis(tag):
    """
    Build a factory for EmphasisElement objects that all use the marker
    given in tag. The returned callable takes the HTML attributes as its
    only argument.

    """
    def make_element(attrs):
        return EmphasisElement(tag, attrs)

    return make_element
bgneal@290 63
bgneal@290 64
class HtmlElement(ElementBase):
    """
    An element that is written out as literal HTML, which Markdown accepts.
    The collected text is wrapped in an opening and closing tag named by
    ``tag``; attributes are not written out.

    """
    def __init__(self, tag, attrs):
        # tag is the HTML tag name used in the output.
        self.tag = tag
        super(HtmlElement, self).__init__(attrs)

    def markdown(self):
        """Return the collected text wrapped in <tag>...</tag>."""
        name = self.tag
        return u'<%s>%s</%s>' % (name, self.data, name)
bgneal@290 77
bgneal@290 78
def create_html(tag):
    """
    Build a factory for HtmlElement objects that all emit the given tag.
    The returned callable takes the HTML attributes as its only argument.

    """
    def make_element(attrs):
        return HtmlElement(tag, attrs)

    return make_element
bgneal@290 87
bgneal@290 88
class QuoteElement(ElementBase):
    """
    A Markdown blockquote.

    """
    def markdown(self):
        """
        Return the collected text with '> ' at the start of every line,
        followed by a blank line.

        """
        # Prefix each continuation line; the first line is prefixed by the
        # format string below.
        quoted = self.data.replace('\n', '\n> ')
        return u'> %s\n\n' % quoted
bgneal@290 96
bgneal@290 97
class BreakElement(ElementBase):
    """
    Class to represent a linebreak in Markdown.

    Markdown only turns a newline into a hard line break when the line ends
    with two or more spaces, so two trailing spaces are emitted before the
    newline.

    """
    def markdown(self):
        """Return the Markdown hard line break; collected data is ignored."""
        return u'  \n'
bgneal@290 105
bgneal@290 106
class DivElement(ElementBase):
    """
    Renders an HTML <div> as its text with a newline before and after it.

    """
    def markdown(self):
        """Return the collected text framed by single newlines."""
        body = self.data
        return u'\n%s\n' % body
bgneal@290 114
bgneal@290 115
class LinkElement(ElementBase):
    """
    This class maps HTML <a> tags into Markdown links.
    If no data is present, the actual href is used for the link text.
    If the tag has no usable href, only the link text is produced.

    """
    def markdown(self):
        """Return '[text](url)', or just the text when there is no href."""
        # A valueless attribute (<a href>) is stored with the value None;
        # treat it like a missing href rather than emitting "(None)".
        url = self.attrs.get('href')
        if url is None:
            return self.data or u''

        text = self.data if self.data else url
        return u'[%s](%s)' % (text, url)
bgneal@290 130
bgneal@290 131
class ImageElement(ElementBase):
    """
    This class maps HTML <img> tags into Markdown.
    This element assumes no alt text is present, and simply uses the word
    'image' for the alt text.

    """
    def markdown(self):
        """Return '![image](src)', or a placeholder when src is unusable."""
        # A valueless attribute (<img src>) is stored with the value None;
        # treat it like a missing src rather than emitting "(None)".
        url = self.attrs.get('src')
        if url is None:
            return u' (missing image) '
        return u'![image](%s)' % url
bgneal@290 145
bgneal@290 146
class CodeElement(ElementBase):
    """
    This class is used to create code blocks in Markdown.

    Markdown recognises a code block by every line being indented at least
    four spaces, so each line of the collected text gets that indent.

    """
    def markdown(self):
        """Return the collected text as an indented code block."""
        indent = u'    '
        body = self.data.replace('\n', '\n' + indent)
        return u'%s%s\n' % (indent, body)
bgneal@290 154
bgneal@290 155
bgneal@290 156 # List (ordered & unordered) support:
bgneal@290 157
class ListElement(ElementBase):
    """
    Markdown for an unordered list. Subclasses produce other list kinds by
    overriding bullet().

    """
    def __init__(self, attrs=None):
        super(ListElement, self).__init__(attrs)
        # Nesting depth of this list; 1 is the outermost level.
        self.list_nesting = 1
        # One entry per piece of data handed to add_data().
        self.items = []

    def add_data(self, data):
        """Record data as a new list item instead of appending to text."""
        self.items.append(data)

    def bullet(self):
        """Return the marker written in front of every item."""
        return u'*'

    def markdown(self):
        """
        Return every item on its own line, indented four spaces per nesting
        level beyond the first and prefixed with the bullet marker.

        """
        marker = self.bullet()
        pad = u' ' * (4 * (self.list_nesting - 1))
        lines = [u'\n%s%s %s' % (pad, marker, entry) for entry in self.items]
        return u''.join(lines)
bgneal@290 182
bgneal@290 183
class OrderedListElement(ListElement):
    """
    Markdown for an ordered list: a ListElement whose item marker is a
    number followed by a period.

    """
    def bullet(self):
        """Return the marker for ordered items; every item uses '1.'."""
        marker = '1.'
        return marker
bgneal@290 191
bgneal@290 192
class ItemElement(ElementBase):
    """
    A single entry of an ordered or unordered list. It adds nothing to
    ElementBase; its rendered text is the text it collected.

    """
bgneal@290 199
bgneal@290 200 ###############################################################################
bgneal@290 201 ###############################################################################
bgneal@290 202
class MarkdownWriter(HTMLParser):
    """
    This class is an HTMLParser that converts a subset of HTML to Markdown.

    Feed HTML to the parser, then call markdown() to obtain the converted
    text. Open tags are tracked on a stack of element objects; when an
    element is closed its Markdown is handed to the element beneath it, or
    stored as a finished top-level element when the stack is empty.

    """

    # Maps an HTML tag name to a callable that takes the tag's attribute
    # list and returns the element object used to render it.
    elem_factories = {
        'a': LinkElement,
        'blockquote': QuoteElement,
        'br': BreakElement,
        'div': DivElement,
        'em': create_emphasis('_'),
        'img': ImageElement,
        'li': ItemElement,
        'ol': OrderedListElement,
        'pre': CodeElement,
        's': create_html('strike'),
        'strong': create_emphasis('**'),
        'u': create_html('u'),
        'ul': ListElement,
    }

    # Tags that never have content or a closing tag. They are closed as soon
    # as they are opened; otherwise "<br>" written without a trailing slash
    # would stay on the stack and swallow the text that follows it.
    void_tags = frozenset(['br', 'img'])

    def __init__(self):
        HTMLParser.__init__(self)
        self.reset()

    def handle_starttag(self, tag, attrs):
        """Open an element for tag; unknown tags become plain text."""
        if tag in self.elem_factories:
            factory = self.elem_factories[tag]
            element = factory(attrs)
        else:
            element = TextElement()

        self._push_elem(element)
        if tag in self.void_tags:
            self._pop_elem()

    def handle_endtag(self, tag):
        """Close the element on top of the stack."""
        # Void tags were already closed in handle_starttag(). This also
        # covers the "<br />" form, for which the parser reports both a
        # start tag and an end tag.
        if tag in self.void_tags:
            return
        self._pop_elem()

    def handle_data(self, data):
        """Add text to the current element, opening a text element if none."""
        if not self.elem_stack:
            self._push_elem(TextElement())
        self._add_data(data)

    def handle_entityref(self, name):
        """Convert a named entity such as &amp; into its character."""
        try:
            text = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            # Unknown entity: fall back to the bare entity name.
            text = name
        self.handle_data(text)

    def handle_charref(self, name):
        """
        Convert a numeric character reference into its character. Both the
        decimal (&#65;) and hexadecimal (&#x41;) forms are accepted; a
        reference that cannot be converted becomes U+FFFD.

        """
        try:
            if name[:1] in (u'x', u'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            text = u'\ufffd'
        self.handle_data(text)

    def reset(self):
        """Reset the parser and discard all conversion state."""
        HTMLParser.reset(self)
        self.elem_stack = []    # elements that are currently open
        self.elements = []      # finished top-level elements, in order
        self.list_nesting = 0   # number of list elements currently open

    def _push_elem(self, tag):
        """Put a new element on the stack."""
        # A text element on top of the stack is closed first, so text
        # elements never receive child elements.
        if self.elem_stack and isinstance(self.elem_stack[-1], TextElement):
            self._pop_elem()
        if isinstance(tag, ListElement):
            self.list_nesting += 1
            tag.list_nesting = self.list_nesting
        self.elem_stack.append(tag)

    def _pop_elem(self):
        """
        Close the element on top of the stack, passing its Markdown to the
        element beneath it or, if there is none, storing the element as a
        finished top-level element.

        """
        try:
            element = self.elem_stack.pop()
        except IndexError:
            # pop from empty list => bad HTML input; ignore it
            return

        if isinstance(element, ListElement):
            self.list_nesting -= 1
        if self.elem_stack:
            self.elem_stack[-1].add_data(element.markdown())
        else:
            self.elements.append(element)

    def _add_data(self, data):
        """Add data to the element on top of the stack."""
        self.elem_stack[-1].add_data(data)

    def markdown(self):
        """
        Close any elements that are still open and return the Markdown of
        all finished top-level elements joined together.

        """
        while self.elem_stack:
            self._pop_elem()
        return u''.join([e.markdown() for e in self.elements])