bgneal@290
|
1 """
|
bgneal@290
|
2 This module contains a class derived from Python's HTMLParser to convert HTML to
|
bgneal@290
|
3 Markdown. Currently this class only supports those HTML tags that have counter-
|
bgneal@290
|
4 parts in BBCode used by stock phpBB 2.x.
|
bgneal@290
|
5
|
bgneal@290
|
6 In other words, this class was created to help convert data from a phpBB
|
bgneal@290
|
7 forum to Markdown syntax and its scope is currently limited to that task.
|
bgneal@290
|
8
|
bgneal@290
|
9 """
|
bgneal@290
|
10 from HTMLParser import HTMLParser
|
bgneal@290
|
11 import htmlentitydefs
|
bgneal@290
|
12
|
bgneal@290
|
13
|
bgneal@290
|
14 # Let's call Markdown markup entities "elements" to avoid confusion
|
bgneal@290
|
15 # with HTML tags.
|
bgneal@290
|
16
|
bgneal@290
|
class ElementBase(object):
    """
    Common ancestor of every Markdown element.

    An element collects text through add_data() and renders itself through
    markdown(). This base implementation renders the collected text as-is.

    """
    def __init__(self, attrs=None):
        # attrs may be anything dict() accepts (e.g. a list of
        # (name, value) pairs); any falsy value means "no attributes".
        if attrs:
            self.attrs = dict(attrs)
        else:
            self.attrs = {}
        self.data = u''

    def add_data(self, data):
        """Append a text fragment to this element's content."""
        self.data = self.data + data

    def markdown(self):
        """Return the Markdown for this element: the collected text."""
        return self.data
|
bgneal@290
|
31
|
bgneal@290
|
32
|
bgneal@290
|
class TextElement(ElementBase):
    """
    A fragment of text that is not inside any HTML tag.

    Adds nothing to ElementBase; the distinct type lets other code
    recognise plain-text fragments with isinstance().
    """
|
bgneal@290
|
38
|
bgneal@290
|
39
|
bgneal@290
|
class EmphasisElement(ElementBase):
    """
    Markdown emphasis: the element's text with the same marker string
    placed directly before and after it, e.g. _em_ or **bold**.

    """
    def __init__(self, tag, attrs):
        # tag is the marker string that surrounds the text.
        self.tag = tag
        super(EmphasisElement, self).__init__(attrs)

    def markdown(self):
        """Return the collected text wrapped in the marker on both sides."""
        parts = {'marker': self.tag, 'body': self.data}
        return u'%(marker)s%(body)s%(marker)s' % parts
|
bgneal@290
|
52
|
bgneal@290
|
53
|
bgneal@290
|
def create_emphasis(tag):
    """
    Build an element factory bound to one emphasis marker.

    The returned callable takes the HTML attributes and produces an
    EmphasisElement that uses `tag` as its marker.

    """
    return lambda attrs: EmphasisElement(tag, attrs)
|
bgneal@290
|
63
|
bgneal@290
|
64
|
bgneal@290
|
class HtmlElement(ElementBase):
    """
    An HTML tag that is emitted as HTML, since Markdown accepts inline HTML
    markup. The output is the collected text between an opening and a
    closing tag; attributes are not written out.

    """
    def __init__(self, tag, attrs):
        # tag is the tag name written to the output.
        self.tag = tag
        super(HtmlElement, self).__init__(attrs)

    def markdown(self):
        """Return the collected text enclosed in <tag>...</tag>."""
        parts = {'name': self.tag, 'body': self.data}
        return u'<%(name)s>%(body)s</%(name)s>' % parts
|
bgneal@290
|
77
|
bgneal@290
|
78
|
bgneal@290
|
def create_html(tag):
    """
    Build an element factory bound to one output tag name.

    The returned callable takes the HTML attributes and produces an
    HtmlElement that is written out as `tag`.

    """
    return lambda attrs: HtmlElement(tag, attrs)
|
bgneal@290
|
87
|
bgneal@290
|
88
|
bgneal@290
|
class QuoteElement(ElementBase):
    """
    A Markdown blockquote: every line of the collected text is prefixed
    with "> " and the block is terminated by a blank line.

    """
    def markdown(self):
        """Return the collected text as a blockquote."""
        # Joining the lines with "\n> " prefixes every line after the
        # first; the first line gets its prefix separately.
        quoted = u'\n> '.join(self.data.split('\n'))
        return u'> ' + quoted + u'\n\n'
|
bgneal@290
|
96
|
bgneal@290
|
97
|
bgneal@290
|
class BreakElement(ElementBase):
    """
    Class to represent a hard line break in Markdown.

    Any text collected by the element is ignored; only the break is
    rendered.

    """
    def markdown(self):
        """Return the Markdown hard line break sequence."""
        # Markdown only produces a <br /> when the line ends with two or
        # more spaces before the newline. With a single trailing space the
        # newline is an ordinary soft wrap and the break is lost.
        return u'  \n'
|
bgneal@290
|
105
|
bgneal@290
|
106
|
bgneal@290
|
class DivElement(ElementBase):
    """
    Renders an HTML <div> as its text with one newline placed before it
    and one after it.

    """
    def markdown(self):
        """Return the collected text surrounded by newlines."""
        newline = u'\n'
        return newline + self.data + newline
|
bgneal@290
|
114
|
bgneal@290
|
115
|
bgneal@290
|
class LinkElement(ElementBase):
    """
    This class maps HTML <a> tags into Markdown links.
    If no data is present, the actual href is used for the link text.
    If there is no usable href, only the link text (possibly empty) is
    returned, because a Markdown link cannot be built without a target.

    """
    def markdown(self):
        """Return `[text](url)`, or just the text when there is no URL."""
        # .get() covers a missing href. The truth test additionally covers
        # a valueless attribute such as `<a href>` (HTMLParser reports its
        # value as None) and an empty href, which would otherwise be
        # rendered as "[text](None)" or "[text]()".
        url = self.attrs.get('href')
        if not url:
            return self.data if self.data else u''

        text = self.data if self.data else url
        return u'[%s](%s)' % (text, url)
|
bgneal@290
|
130
|
bgneal@290
|
131
|
bgneal@290
|
class ImageElement(ElementBase):
    """
    This class maps HTML <img> tags into Markdown.
    This element assumes no alt text is present, and simply uses the word
    'image' for the alt text.
    If there is no usable src, a placeholder text is returned instead.

    """
    def markdown(self):
        """Return `![image](url)`, or a placeholder when there is no URL."""
        # .get() covers a missing src. The truth test additionally covers
        # a valueless attribute such as `<img src>` (HTMLParser reports its
        # value as None) and an empty src, which would otherwise be
        # rendered as "![image](None)" or "![image]()".
        url = self.attrs.get('src')
        if not url:
            return u' (missing image) '
        return u'![image](%s)' % url
|
bgneal@290
|
145
|
bgneal@290
|
146
|
bgneal@290
|
class CodeElement(ElementBase):
    """
    This class is used to create code blocks in Markdown.

    """
    def markdown(self):
        """Return the collected text as an indented Markdown code block."""
        # Markdown recognises a code block only when every line is indented
        # by at least four spaces (or one tab); a smaller indent leaves the
        # text as an ordinary paragraph.
        indent = u'    '
        body = self.data.replace(u'\n', u'\n' + indent)
        return u'%s%s\n' % (indent, body)
|
bgneal@290
|
154
|
bgneal@290
|
155
|
bgneal@290
|
156 # List (ordered & unordered) support:
|
bgneal@290
|
157
|
bgneal@290
|
class ListElement(ElementBase):
    """
    Markdown for an unordered list. Subclasses can override bullet() to
    change the item marker, e.g. to produce an ordered list.

    """
    def __init__(self, attrs=None):
        super(ListElement, self).__init__(attrs)
        # Depth of this list, 1 for a list that is not nested; each extra
        # level indents the items by four more spaces.
        self.list_nesting = 1
        # One entry per add_data() call; each entry becomes a list item.
        self.items = []

    def add_data(self, data):
        """Record `data` as a new list item instead of appending text."""
        self.items.append(data)

    def bullet(self):
        """Return the marker written in front of every item."""
        return u'*'

    def markdown(self):
        """Return all items, one per line, each preceded by a newline."""
        marker = self.bullet()
        padding = u' ' * (4 * (self.list_nesting - 1))
        lines = [u'\n%s%s %s' % (padding, marker, entry)
                 for entry in self.items]
        return u''.join(lines)
|
bgneal@290
|
182
|
bgneal@290
|
183
|
bgneal@290
|
class OrderedListElement(ListElement):
    """
    This class creates Markdown for ordered lists.

    """
    def bullet(self):
        """Return the marker written in front of every item."""
        # Every item uses "1."; Markdown numbers ordered-list items itself
        # regardless of the numbers written in the source. A unicode
        # literal is used to match ListElement.bullet().
        return u'1.'
|
bgneal@290
|
191
|
bgneal@290
|
192
|
bgneal@290
|
class ItemElement(ElementBase):
    """
    A single item of an ordered or unordered list.

    Adds nothing to ElementBase: the item collects its text and renders it
    unchanged.

    """
|
bgneal@290
|
199
|
bgneal@290
|
200 ###############################################################################
|
bgneal@290
|
201 ###############################################################################
|
bgneal@290
|
202
|
bgneal@290
|
class MarkdownWriter(HTMLParser):
    """
    This class is an HTMLParser that converts a subset of HTML to Markdown.

    Feed HTML to the parser, then call markdown() to obtain the converted
    text. Open elements are kept on a stack; when an element is closed its
    Markdown is added to its parent, or to the list of top-level results
    when it has no parent.

    """

    # Maps a supported HTML tag name to a callable that takes the tag's
    # attributes and returns the Markdown element that represents it.
    elem_factories = {
        'a': LinkElement,
        'blockquote': QuoteElement,
        'br': BreakElement,
        'div': DivElement,
        'em': create_emphasis('_'),
        'img': ImageElement,
        'li': ItemElement,
        'ol': OrderedListElement,
        'pre': CodeElement,
        's': create_html('strike'),
        'strong': create_emphasis('**'),
        'u': create_html('u'),
        'ul': ListElement,
    }

    # HTML elements that never have content or an end tag. They are closed
    # as soon as they are opened, whether written as <br> or as <br />.
    void_tags = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])

    def __init__(self):
        HTMLParser.__init__(self)
        self.reset()

    def handle_starttag(self, tag, attrs):
        """Open the element for `tag` and push it on the element stack."""
        factory = self.elem_factories.get(tag)
        if factory is not None:
            element = factory(attrs)
        else:
            # Unsupported tag: use a transparent container that passes its
            # content through unchanged. It must not be a TextElement,
            # because _push_elem() closes a TextElement as soon as a child
            # opens, and the tag's own end tag would then close its parent.
            element = ElementBase()

        self._push_elem(element)

        if tag in self.void_tags:
            # No end tag will follow; leaving the element open would make
            # it swallow the text that comes after it.
            self._pop_elem()

    def handle_endtag(self, tag):
        """Close the innermost open element."""
        if tag in self.void_tags:
            # Already closed in handle_starttag(); this call comes from a
            # self-closing tag (<br />) or a stray </br>.
            return
        self._pop_elem()

    def handle_data(self, data):
        """Add text to the innermost open element."""
        if not self.elem_stack:
            # Text outside of any tag gets its own element.
            self._push_elem(TextElement())
        self._add_data(data)

    def handle_entityref(self, name):
        """Translate a named entity (&name;) to its character."""
        try:
            text = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            # Unknown entity: keep its name as plain text.
            text = name
        self.handle_data(text)

    def handle_charref(self, name):
        """Translate a numeric reference (&#65; or &#x41;) to its character."""
        try:
            # `name` is the text after "&#": decimal digits, or "x"/"X"
            # followed by hexadecimal digits.
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            # Malformed, or outside the range unichr() supports: keep the
            # reference itself rather than aborting the conversion.
            text = u'&#%s;' % name
        self.handle_data(text)

    def reset(self):
        """Reset the parser and discard all conversion state."""
        HTMLParser.reset(self)
        self.elem_stack = []    # currently open elements, innermost last
        self.elements = []      # closed top-level elements, in order
        self.list_nesting = 0   # number of currently open lists

    def _push_elem(self, tag):
        """Open the element `tag` (an element object, not a tag name)."""
        # A text fragment ends where the next element starts.
        if self.elem_stack and isinstance(self.elem_stack[-1], TextElement):
            self._pop_elem()
        if isinstance(tag, ListElement):
            # Tell the list how deep it is so it can indent its items.
            self.list_nesting += 1
            tag.list_nesting = self.list_nesting
        self.elem_stack.append(tag)

    def _pop_elem(self):
        """Close the innermost element and hand its Markdown to its parent."""
        try:
            element = self.elem_stack.pop()
        except IndexError:
            # pop from empty list => bad HTML input; ignore it
            return

        if isinstance(element, ListElement):
            self.list_nesting -= 1
        if self.elem_stack:
            self.elem_stack[-1].add_data(element.markdown())
        else:
            # No parent: rendered later by markdown().
            self.elements.append(element)

    def _add_data(self, data):
        """Add text to the innermost open element (the stack is non-empty)."""
        self.elem_stack[-1].add_data(data)

    def markdown(self):
        """Return the Markdown for everything parsed so far."""
        # Close whatever the input left open so no text is lost.
        while self.elem_stack:
            self._pop_elem()
        text_list = [e.markdown() for e in self.elements]
        return u''.join(text_list)
|