bgneal@290
|
1 """
|
bgneal@290
|
2 This module contains functions for working with data from the legacy phpBB
|
bgneal@290
|
3 based website.
|
bgneal@290
|
4 """
|
bgneal@290
|
5 import re
|
bgneal@290
|
6 import htmlentitydefs
|
bgneal@290
|
7
|
bgneal@290
|
8
|
bgneal@290
|
# BBCode tags used by the old site
BBCODE_TAGS = "b i u s url quote img list * code color size".split()

# Regular expressions used to get rid of phpBB's uid inside BBCode tags.
# This is a list of regular expression pairs. Element 0 of each pair
# is for the opening tag & element 1 is for the closing tag.
#
# Group 1 of each pattern captures the tag without its uid, e.g. "[b" or
# "[/b", so callers substitute r'\1' (opening) or r'\1]' (closing).
#
# Tag names are passed through re.escape() because "*" is a regex
# metacharacter. Unescaped, the "*" entry compiled to r"(\[*):..." ("zero or
# more '['"), which matched any ":" followed by 10 hex characters in ordinary
# text and caused it to be stripped.

BBCODE_RES = [(
    re.compile(r"(\[%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}" % re.escape(tag)),
    re.compile(r"(\[/%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}\]" % re.escape(tag))
) for tag in BBCODE_TAGS]
|
bgneal@290
|
20
|
bgneal@290
|
21
|
bgneal@290
|
##
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
# Source: http://effbot.org/zone/re-sub.htm#unescape-html
#
def unescape(text):
    """Replace character references and named entities in text.

    Numeric references ("&#65;", "&#x41;") and named entities found in
    htmlentitydefs.name2codepoint ("&amp;") are replaced by the character
    they denote. Any reference that cannot be resolved (unknown name,
    malformed or out-of-range number) is left in the output unchanged.
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: digits are not a valid number, or the code
                # point is outside the range unichr() supports.
                # OverflowError: the number is too large for unichr() to
                # accept at all (e.g. "&#99999999999999999999;").
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return re.sub(r"&#?\w+;", fixup, text)
|
bgneal@290
|
49
|
bgneal@290
|
50
|
bgneal@294
|
def unphpbb(s, encoding='latin-1'):
    """Converts BBCode from phpBB database data into 'pure' BBCode.

    phpBB doesn't store plain BBCode in its database. The BBCode tags have
    "uids" added to them and the data has already been HTML entity'ized.
    This function removes the uid stuff and undoes the entity'ification and
    returns the result as a unicode string.

    If the input 's' is not already unicode, it will be decoded using the
    supplied encoding; undecodable bytes are replaced rather than raising.

    """
    if not isinstance(s, unicode):
        s = s.decode(encoding, 'replace')
    for start, end in BBCODE_RES:
        # Use the compiled patterns' own sub() with no count so that every
        # occurrence is rewritten. The previous re.sub(pat, repl, s,
        # re.MULTILINE) form passed the flag as the positional `count`
        # argument, limiting each tag to re.MULTILINE (8) replacements.
        s = start.sub(r'\1', s)
        s = end.sub(r'\1]', s)
    return unescape(s)
|