comparison legacy/html2md.py @ 581:ee87ea74d46b

For Django 1.4, rearranged project structure for new manage.py.
author Brian Neal <bgneal@gmail.com>
date Sat, 05 May 2012 17:10:48 -0500
parents gpp/legacy/html2md.py@2367c4795c92
children
comparison
equal deleted inserted replaced
580:c525f3e0b5d0 581:ee87ea74d46b
1 """
2 This module contains a class derived from Python's HTMLParser to convert HTML to
3 Markdown. Currently this class only supports those HTML tags that have counter-
4 parts in BBCode used by stock phpBB 2.x.
5
6 In other words, this class was created to help convert data from a phpBB
7 forum to Markdown syntax and its scope is currently limited to that task.
8
9 """
10 from HTMLParser import HTMLParser
11 import htmlentitydefs
12
13
14 # Let's call Markdown markup entities "elements" to avoid confusion
15 # with HTML tags.
16
class ElementBase(object):
    """
    Common ancestor of every Markdown element.

    An element collects text in ``data`` and keeps the attributes of the HTML
    tag it was created for in ``attrs``.

    """
    def __init__(self, attrs=None):
        # attrs may be anything dict() accepts (e.g. a list of (name, value)
        # pairs); a missing or empty value yields an empty dict.
        self.attrs = {} if not attrs else dict(attrs)
        self.data = u''

    def add_data(self, data):
        """Append ``data`` to the text collected so far."""
        self.data = self.data + data

    def markdown(self):
        """Return the Markdown for this element: the collected text as-is."""
        return self.data
31
32
class TextElement(ElementBase):
    """
    A fragment of plain text that is not wrapped in an HTML tag. It adds
    nothing to ElementBase: its Markdown is simply the collected text.
    """
38
39
class EmphasisElement(ElementBase):
    """
    Markdown emphasis: the element's text wrapped in a marker string on both
    sides, e.g. _em_ or **bold**.

    """
    def __init__(self, tag, attrs):
        # tag is the Markdown marker placed before and after the text.
        super(EmphasisElement, self).__init__(attrs)
        self.tag = tag

    def markdown(self):
        """Return the collected text surrounded by the marker."""
        marker = self.tag
        pieces = (marker, self.data, marker)
        return u'%s%s%s' % pieces
52
53
def create_emphasis(tag):
    """
    Build an element factory bound to the emphasis marker ``tag``.

    The returned callable takes the HTML tag's attributes and produces an
    EmphasisElement that uses ``tag`` as its marker.

    """
    def make_element(attrs):
        element = EmphasisElement(tag, attrs)
        return element
    return make_element
63
64
class HtmlElement(ElementBase):
    """
    An HTML tag that is emitted as HTML, which Markdown accepts inline. The
    output is the collected text between an opening and closing ``tag``; the
    tag's attributes are stored but not written out.

    """
    def __init__(self, tag, attrs):
        # tag is the name of the HTML tag to emit (without angle brackets).
        super(HtmlElement, self).__init__(attrs)
        self.tag = tag

    def markdown(self):
        """Return the collected text wrapped in <tag>...</tag>."""
        name = self.tag
        pieces = (name, self.data, name)
        return u'<%s>%s</%s>' % pieces
77
78
def create_html(tag):
    """
    Build an element factory bound to the HTML tag name ``tag``.

    The returned callable takes the source tag's attributes and produces an
    HtmlElement that renders as ``tag``.

    """
    def make_element(attrs):
        element = HtmlElement(tag, attrs)
        return element
    return make_element
87
88
class QuoteElement(ElementBase):
    """
    A Markdown blockquote.

    """
    def markdown(self):
        """
        Return the collected text with '> ' in front of every line, followed
        by a blank line that ends the quote.
        """
        # Prefix each continuation line; the first line's prefix comes from
        # the format string below.
        quoted = self.data.replace('\n', '\n> ')
        return u'> %s\n\n' % quoted
96
97
class BreakElement(ElementBase):
    """
    Class to represent a linebreak (<br>) in Markdown.

    """
    def markdown(self):
        """
        Return a Markdown hard line break. Any data collected by the element
        is ignored.
        """
        # Markdown only produces a <br /> when the line ends with two or more
        # spaces before the newline; a single space is just a soft wrap.
        return u'  \n'
105
106
class DivElement(ElementBase):
    """
    An HTML <div>, rendered as its text with a newline before and after.

    """
    def markdown(self):
        """Return the collected text surrounded by single newlines."""
        body = self.data
        return u'\n%s\n' % body
114
115
class LinkElement(ElementBase):
    """
    An HTML <a> tag, rendered as a Markdown link.
    When the tag has no text, the href itself becomes the link text. When the
    tag has no href attribute, only the text (possibly empty) is produced.

    """
    def markdown(self):
        """Return '[text](href)', or just the text if there is no href."""
        label = self.data
        if 'href' not in self.attrs:
            return label if label else u''

        target = self.attrs['href']
        if not label:
            label = target
        return u'[%s](%s)' % (label, target)
130
131
class ImageElement(ElementBase):
    """
    An HTML <img> tag, rendered as a Markdown image.
    Alt text from the tag is not used; the word 'image' is always the alt
    text.

    """
    def markdown(self):
        """
        Return '![image](src)', or a placeholder string when the tag has no
        src attribute.
        """
        if 'src' in self.attrs:
            return u'![image](%s)' % self.attrs['src']
        return u' (missing image) '
145
146
class CodeElement(ElementBase):
    """
    This class is used to create code blocks in Markdown.

    """
    # Markdown treats a line as part of a code block only when it is indented
    # by at least four spaces (or one tab); less than that is ordinary text.
    INDENT = u'    '

    def markdown(self):
        """
        Return the collected text with every line indented by INDENT and a
        trailing newline.
        """
        # The first line gets its indent from the format string; replace()
        # indents each following line.
        indented = self.data.replace('\n', '\n' + self.INDENT)
        return u'%s%s\n' % (self.INDENT, indented)
154
155
156 # List (ordered & unordered) support:
157
class ListElement(ElementBase):
    """
    An unordered Markdown list. Subclasses can override bullet() to change
    the marker, e.g. to produce ordered lists.

    """
    def __init__(self, attrs=None):
        super(ListElement, self).__init__(attrs)
        # Depth of this list; 1 means outermost. Each extra level indents the
        # items by four more spaces.
        self.list_nesting = 1
        # Text of each list item, in the order received.
        self.items = []

    def add_data(self, data):
        """Record ``data`` as one more list item."""
        self.items.append(data)

    def bullet(self):
        """Return the marker placed in front of every item."""
        return u'*'

    def markdown(self):
        """
        Return one line per item, each starting with a newline, the nesting
        indent, the bullet and a space. An empty list yields an empty string.
        """
        marker = self.bullet()
        pad = u' ' * (4 * (self.list_nesting - 1))
        lines = [u'\n%s%s %s' % (pad, marker, entry) for entry in self.items]
        return u''.join(lines)
182
183
class OrderedListElement(ListElement):
    """
    This class creates Markdown for ordered lists.

    """
    def bullet(self):
        """
        Return the marker for ordered list items. Every item uses the same
        '1.' marker.
        """
        # Unicode literal, consistent with ListElement.bullet().
        return u'1.'
191
192
class ItemElement(ElementBase):
    """
    A single item of an ordered or unordered list. It adds nothing to
    ElementBase: its Markdown is the item's collected text.

    """
199
200 ###############################################################################
201 ###############################################################################
202
class MarkdownWriter(HTMLParser):
    """
    This class is an HTMLParser that converts a subset of HTML to Markdown.

    Feed it HTML through the usual HTMLParser interface, then call markdown()
    to obtain the converted text. Elements that are still open live on
    ``elem_stack``; when an element is closed its Markdown is appended to the
    element below it on the stack, or to ``elements`` if the stack is empty.

    """

    # Maps an HTML tag name to a callable that takes the tag's attributes and
    # returns the Markdown element representing it.
    elem_factories = {
        'a': LinkElement,
        'blockquote': QuoteElement,
        'br': BreakElement,
        'div': DivElement,
        'em': create_emphasis('_'),
        'img': ImageElement,
        'li': ItemElement,
        'ol': OrderedListElement,
        'pre': CodeElement,
        's': create_html('strike'),
        'strong': create_emphasis('**'),
        'u': create_html('u'),
        'ul': ListElement,
    }

    # Tags that have neither content nor a closing tag. They are closed as
    # soon as they are opened; otherwise text following a bare <br> or <img>
    # would be added to that element and dropped, since their markdown()
    # ignores collected data.
    void_tags = frozenset(['br', 'img'])

    def __init__(self):
        HTMLParser.__init__(self)
        self.reset()

    def handle_starttag(self, tag, attrs):
        """Open the element for ``tag``; unknown tags become TextElements."""
        factory = self.elem_factories.get(tag)
        if factory is not None:
            element = factory(attrs)
        else:
            element = TextElement()

        self._push_elem(element)
        if tag in self.void_tags:
            self._pop_elem()

    def handle_endtag(self, tag):
        """Close the innermost open element."""
        if tag in self.void_tags:
            # Already closed in handle_starttag (this also covers the
            # self-closing <br /> form, which reports a start and an end).
            return
        self._pop_elem()

    def handle_data(self, data):
        """Add text to the innermost open element, opening one if needed."""
        if not self.elem_stack:
            self._push_elem(TextElement())
        self._add_data(data)

    def handle_entityref(self, name):
        """
        Convert a named entity such as &amp; to its character. An unknown
        entity name is passed through as the bare name.
        """
        try:
            text = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            text = name
        self.handle_data(text)

    def handle_charref(self, name):
        """
        Convert a numeric character reference to its character. Both decimal
        (&#65;) and hexadecimal (&#x41;) forms are accepted; a reference that
        cannot be converted is emitted literally.
        """
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            # Malformed or out-of-range reference => bad HTML input; keep the
            # original text rather than aborting the conversion.
            text = u'&#%s;' % name
        self.handle_data(text)

    def reset(self):
        """Reset the parser and discard all conversion state."""
        HTMLParser.reset(self)
        self.elem_stack = []    # elements that are currently open
        self.elements = []      # finished top-level elements, in order
        self.list_nesting = 0   # number of lists currently open

    def _push_elem(self, tag):
        """
        Open a new element. Despite its name, ``tag`` is the element object
        to push, not a tag name.
        """
        # A TextElement cannot contain other elements, so close it first.
        if self.elem_stack and isinstance(self.elem_stack[-1], TextElement):
            self._pop_elem()
        if isinstance(tag, ListElement):
            self.list_nesting += 1
            tag.list_nesting = self.list_nesting
        self.elem_stack.append(tag)

    def _pop_elem(self):
        """
        Close the innermost open element and hand its Markdown to its parent,
        or store the element itself in ``elements`` if it has no parent.
        """
        try:
            element = self.elem_stack.pop()
        except IndexError:
            # pop from empty list => bad HTML input; ignore it
            return

        if isinstance(element, ListElement):
            self.list_nesting -= 1
        if self.elem_stack:
            self.elem_stack[-1].add_data(element.markdown())
        else:
            self.elements.append(element)

    def _add_data(self, data):
        """Add ``data`` to the innermost open element."""
        self.elem_stack[-1].add_data(data)

    def markdown(self):
        """
        Close any elements that are still open and return the Markdown for
        everything parsed since the last reset().
        """
        while self.elem_stack:
            self._pop_elem()
        return u''.join(e.markdown() for e in self.elements)