Mercurial > public > sg101
comparison legacy/html2md.py @ 581:ee87ea74d46b
For Django 1.4, rearranged project structure for new manage.py.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sat, 05 May 2012 17:10:48 -0500 |
parents | gpp/legacy/html2md.py@2367c4795c92 |
children |
comparison
equal
deleted
inserted
replaced
580:c525f3e0b5d0 | 581:ee87ea74d46b |
---|---|
1 """ | |
2 This module contains a class derived from Python's HTMLParser to convert HTML to | |
3 Markdown. Currently this class only supports those HTML tags that have counter- | |
4 parts in BBCode used by stock phpBB 2.x. | |
5 | |
6 In other words, this class was created to help convert data from a phpBB | |
7 forum to Markdown syntax and its scope is currently limited to that task. | |
8 | |
9 """ | |
10 from HTMLParser import HTMLParser | |
11 import htmlentitydefs | |
12 | |
13 | |
14 # Let's call Markdown markup entities "elements" to avoid confusion | |
15 # with HTML tags. | |
16 | |
class ElementBase(object):
    """
    Common ancestor of every Markdown element.

    An element accumulates text in ``data`` and keeps the HTML attributes it
    was created with in ``attrs``; ``markdown()`` renders the element.

    """
    def __init__(self, attrs=None):
        # attrs is any iterable of (name, value) pairs (or a mapping);
        # a falsy value yields an empty attribute dictionary.
        self.attrs = {} if not attrs else dict(attrs)
        self.data = u''

    def add_data(self, data):
        """Append a text fragment to this element's content."""
        self.data = self.data + data

    def markdown(self):
        """Return the Markdown for this element (the raw text by default)."""
        return self.data
31 | |
32 | |
class TextElement(ElementBase):
    """
    A run of plain text that is not wrapped by any supported HTML tag.
    It renders as its accumulated data, unchanged.

    """
38 | |
39 | |
class EmphasisElement(ElementBase):
    """
    Markdown emphasis: the element's text wrapped on both sides by the same
    marker string, e.g. _em_ or **bold**.

    """
    def __init__(self, tag, attrs):
        # tag is the Markdown marker placed before and after the text.
        self.tag = tag
        super(EmphasisElement, self).__init__(attrs)

    def markdown(self):
        """Return the text surrounded by the emphasis marker."""
        marker = self.tag
        return u'%s%s%s' % (marker, self.data, marker)
52 | |
53 | |
def create_emphasis(tag):
    """
    Build an element factory bound to the Markdown marker `tag`.

    The returned callable takes the HTML attribute list and produces an
    EmphasisElement that wraps its text in `tag`.

    """
    return lambda attrs: EmphasisElement(tag, attrs)
63 | |
64 | |
class HtmlElement(ElementBase):
    """
    An HTML tag that is passed through to the Markdown output as HTML, since
    Markdown accepts inline HTML markup. The attributes are not emitted.

    """
    def __init__(self, tag, attrs):
        # tag is the name used for both the opening and closing HTML tag.
        self.tag = tag
        super(HtmlElement, self).__init__(attrs)

    def markdown(self):
        """Return the text enclosed in <tag>...</tag>."""
        name = self.tag
        return u'<%s>%s</%s>' % (name, self.data, name)
77 | |
78 | |
def create_html(tag):
    """
    Build an element factory bound to the HTML tag name `tag`.

    The returned callable takes the HTML attribute list and produces a
    HtmlElement that renders as <tag>...</tag>.

    """
    return lambda attrs: HtmlElement(tag, attrs)
87 | |
88 | |
class QuoteElement(ElementBase):
    """
    A Markdown blockquote: every line of the content is prefixed with '> '
    and the block is terminated by a blank line.

    """
    def markdown(self):
        # Re-joining the lines with '\n> ' prefixes every line but the first;
        # the first line gets its prefix from the leading '> ' below.
        quoted = u'\n> '.join(self.data.split('\n'))
        return u'> %s\n\n' % quoted
96 | |
97 | |
class BreakElement(ElementBase):
    """
    Class to represent a linebreak in Markdown.

    Any data added to this element is ignored; only the break is rendered.

    """
    def markdown(self):
        # Markdown only produces a hard line break (<br />) when the line
        # ends with two or more spaces before the newline.
        return u'  \n'
105 | |
106 | |
class DivElement(ElementBase):
    """
    Renders a HTML <div> as its text with one newline before and one after.

    """
    def markdown(self):
        content = self.data
        return u'\n%s\n' % content
114 | |
115 | |
class LinkElement(ElementBase):
    """
    This class maps HTML <a> tags into Markdown links.
    If no data is present, the actual href is used for the link text.
    An anchor whose href is missing, valueless or empty is rendered as its
    plain text only.

    """
    def markdown(self):
        # HTMLParser reports a valueless attribute (<a href>) as None, so a
        # plain key lookup is not enough to detect an unusable target.
        url = self.attrs.get('href')
        if not url:
            return self.data

        text = self.data or url
        return u'[%s](%s)' % (text, url)
130 | |
131 | |
class ImageElement(ElementBase):
    """
    This class maps HTML <img> tags into Markdown.
    This element assumes no alt text is present, and simply uses the word
    'image' for the alt text. An <img> whose src is missing, valueless or
    empty is rendered as a "missing image" placeholder.

    """
    def markdown(self):
        # HTMLParser reports a valueless attribute (<img src>) as None, so a
        # plain key lookup is not enough to detect an unusable source.
        url = self.attrs.get('src')
        if not url:
            return u' (missing image) '
        return u'![image](%s)' % url
145 | |
146 | |
class CodeElement(ElementBase):
    """
    This class is used to create code blocks in Markdown.

    """
    # Markdown recognises a code block by an indent of at least four spaces
    # on every line of the block.
    INDENT = u'    '

    def markdown(self):
        indent = self.INDENT
        # Indent the first line explicitly and every following line by
        # re-inserting the indent after each newline.
        return u'%s%s\n' % (indent, self.data.replace(u'\n', u'\n' + indent))
154 | |
155 | |
156 # List (ordered & unordered) support: | |
157 | |
class ListElement(ElementBase):
    """
    This class creates Markdown for unordered lists. The bullet() method can be
    overridden to create ordered lists.

    Each fragment passed to add_data() becomes one list item. list_nesting is
    the 1-based depth of the list and controls the indentation of its items.

    """
    def __init__(self, attrs=None):
        super(ListElement, self).__init__(attrs)
        self.items = []
        self.list_nesting = 1

    def add_data(self, data):
        # Whitespace between <li> tags is layout only; storing it would
        # produce empty bullets in the output.
        if data.strip():
            self.items.append(data)

    def bullet(self):
        """Return the marker that starts each item of this list."""
        return u'*'

    def markdown(self):
        """Return one '<indent><bullet> <item>' line per stored item."""
        # Four spaces of indentation per nesting level below the first.
        indent = u' ' * (4 * (self.list_nesting - 1))
        prefix = u'\n%s%s ' % (indent, self.bullet())
        return u''.join(prefix + item for item in self.items)
182 | |
183 | |
class OrderedListElement(ListElement):
    """
    A numbered Markdown list. Every item uses the same '1.' marker.

    """
    def bullet(self):
        marker = '1.'
        return marker
191 | |
192 | |
class ItemElement(ElementBase):
    """
    A single entry (<li>) of an ordered or unordered list. It renders as its
    accumulated data; the bullet is added by the enclosing list element.

    """
199 | |
200 ############################################################################### | |
201 ############################################################################### | |
202 | |
class MarkdownWriter(HTMLParser):
    """
    This class is an HTMLParser that converts a subset of HTML to Markdown.

    Feed HTML with feed() and call markdown() to obtain the converted text.
    Open elements are kept on elem_stack; when an element is closed its
    Markdown is appended to its parent, or to the top-level elements list
    when it has no parent.

    """

    # Maps a supported HTML tag name to a callable that takes the tag's
    # attribute list and returns the matching Markdown element.
    elem_factories = {
        'a': LinkElement,
        'blockquote': QuoteElement,
        'br': BreakElement,
        'div': DivElement,
        'em': create_emphasis('_'),
        'img': ImageElement,
        'li': ItemElement,
        'ol': OrderedListElement,
        'pre': CodeElement,
        's': create_html('strike'),
        'strong': create_emphasis('**'),
        'u': create_html('u'),
        'ul': ListElement,
    }

    # Supported tags that are void in HTML: they have no content and may be
    # written without a closing tag or trailing slash (<br>, <img src=...>).
    void_tags = frozenset(['br', 'img'])

    def __init__(self):
        HTMLParser.__init__(self)
        self.reset()

    def handle_starttag(self, tag, attrs):
        """Open a new element for `tag`; unsupported tags become text."""
        factory = self.elem_factories.get(tag)
        if factory is not None:
            element = factory(attrs)
        else:
            element = TextElement()

        self._push_elem(element)

        if tag in self.void_tags:
            # A void element never receives content. Close it right away so
            # that it cannot swallow the text that follows it.
            self._pop_elem()

    def handle_endtag(self, tag):
        """Close the innermost open element."""
        if tag in self.void_tags:
            # Already closed in handle_starttag(); this also covers the end
            # event generated for the XHTML form (<br />).
            return
        self._pop_elem()

    def handle_data(self, data):
        """Add text to the innermost open element, opening one if needed."""
        if not self.elem_stack:
            self._push_elem(TextElement())
        self._add_data(data)

    def handle_entityref(self, name):
        """Convert a named entity (&name;) to its character."""
        try:
            text = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            # Unknown entity: fall back to the bare name.
            text = name
        self.handle_data(text)

    def handle_charref(self, name):
        """Convert a numeric character reference to its character.

        `name` is the text between '&#' and ';': decimal digits, or 'x'/'X'
        followed by hexadecimal digits. A reference that cannot be converted
        is emitted literally instead of aborting the conversion.

        """
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            text = u'&#%s;' % name
        self.handle_data(text)

    def reset(self):
        """Reset the parser and discard all conversion state."""
        HTMLParser.reset(self)
        self.elem_stack = []    # currently open elements, innermost last
        self.elements = []      # finished top-level elements, in order
        self.list_nesting = 0   # number of currently open lists

    def _push_elem(self, tag):
        # A TextElement cannot contain other elements; close it before
        # opening the new one.
        if self.elem_stack and isinstance(self.elem_stack[-1], TextElement):
            self._pop_elem()
        if isinstance(tag, ListElement):
            # Tell the list how deeply it is nested so it can indent itself.
            self.list_nesting += 1
            tag.list_nesting = self.list_nesting
        self.elem_stack.append(tag)

    def _pop_elem(self):
        try:
            element = self.elem_stack.pop()
        except IndexError:
            # pop from empty list => bad HTML input; ignore it
            return

        if isinstance(element, ListElement):
            self.list_nesting -= 1
        if self.elem_stack:
            # Nested element: its Markdown becomes content of the parent.
            self.elem_stack[-1].add_data(element.markdown())
        else:
            self.elements.append(element)

    def _add_data(self, data):
        self.elem_stack[-1].add_data(data)

    def markdown(self):
        """Close any still-open elements and return the Markdown text."""
        while self.elem_stack:
            self._pop_elem()
        return u''.join(e.markdown() for e in self.elements)