Mercurial > public > sg101
comparison legacy/phpbb.py @ 581:ee87ea74d46b
For Django 1.4, rearranged project structure for new manage.py.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sat, 05 May 2012 17:10:48 -0500 |
parents | gpp/legacy/phpbb.py@254db4cb6a86 |
children |
comparison
equal
deleted
inserted
replaced
580:c525f3e0b5d0 | 581:ee87ea74d46b |
---|---|
1 """ | |
2 This module contains functions for working with data from the legacy phpBB | |
3 based website. | |
4 """ | |
5 import re | |
6 import htmlentitydefs | |
7 | |
8 | |
# Names of the BBCode tags that were in use on the old site.
BBCODE_TAGS = [
    "b", "i", "u", "s", "url", "quote",
    "img", "list", "*", "code", "color", "size",
]

# phpBB stores a uid inside each BBCode tag; these regular expressions are
# used to strip it back out.  BBCODE_RES holds one (opening, closing) pair of
# compiled patterns per entry in BBCODE_TAGS, in the same order.  Group 1 of
# each pattern captures the part of the tag that is kept.
#
# NOTE(review): the tag name is interpolated without re.escape(), so for the
# "*" tag the asterisk acts as a quantifier on the preceding character rather
# than as a literal "*".  Confirm what that pattern is expected to match
# before changing it.
BBCODE_RES = []
for tag in BBCODE_TAGS:
    BBCODE_RES.append((
        re.compile(r"(\[%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}" % tag),
        re.compile(r"(\[/%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}\]" % tag),
    ))
20 | |
21 | |
def unescape(text):
    """Replace HTML/XML character references and named entities in text.

    Numeric references (decimal such as ``&#65;`` or hexadecimal such as
    ``&#x41;``) and named entities (such as ``&amp;``) are replaced by the
    character they denote.  Anything that cannot be resolved -- an unknown
    entity name, a malformed number, or a value that unichr() rejects -- is
    left in the output exactly as it appeared in the input.

    Adapted from http://effbot.org/zone/re-sub.htm#unescape-html

    text -- the HTML (or XML) source text.

    Returns the text with every resolvable reference replaced.

    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference: "&#x..;" is hex, "&#..;" decimal.
            try:
                if ref[:3] == "&#x":
                    return unichr(int(ref[3:-1], 16))
                return unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: the digits do not parse, or the code point is
                # outside the range unichr() supports.
                # OverflowError: the number is too large for unichr() to
                # accept at all (e.g. "&#99999999999999999999;").
                pass
        else:
            # Named entity: look the name (without "&" and ";") up in the
            # standard entity table.
            try:
                return unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # could not be resolved; leave as is
    return re.sub(r"&#?\w+;", fixup, text)
49 | |
50 | |
def unphpbb(s, encoding='latin-1'):
    """Converts BBCode from phpBB database data into 'pure' BBCode.

    phpBB doesn't store plain BBCode in its database. The BBCode tags have
    "uids" added to them and the data has already been HTML entity'ized.
    This function removes the uid stuff and undoes the entity'ification and
    returns the result as a unicode string.

    If the input 's' is not already unicode, it will be decoded using the
    supplied encoding; bytes that cannot be decoded are replaced rather than
    raising an error.

    """
    if not isinstance(s, unicode):
        s = s.decode(encoding, 'replace')
    for start, end in BBCODE_RES:
        # Use the compiled patterns' own sub() with no count so that every
        # occurrence is rewritten.  The 4th positional argument of re.sub()
        # is 'count', not 'flags', so passing re.MULTILINE there capped each
        # pattern at 8 replacements.
        s = start.sub(r'\1', s)
        s = end.sub(r'\1]', s)
    return unescape(s)