comparison legacy/phpbb.py @ 581:ee87ea74d46b

For Django 1.4, rearranged project structure for new manage.py.
author Brian Neal <bgneal@gmail.com>
date Sat, 05 May 2012 17:10:48 -0500
parents gpp/legacy/phpbb.py@254db4cb6a86
children
comparison
equal deleted inserted replaced
580:c525f3e0b5d0 581:ee87ea74d46b
1 """
2 This module contains functions for working with data from the legacy phpBB
3 based website.
4 """
5 import re
6 import htmlentitydefs
7
8
# BBCode tags used by the old site
BBCODE_TAGS = "b i u s url quote img list * code color size".split()

# Regular expressions used to get rid of phpBB's uid inside BBCode tags.
# This is a list of regular expression pairs, one pair per entry of
# BBCODE_TAGS and in the same order. Element 0 of each pair is for the
# opening tag & element 1 is for the closing tag. Group 1 of every pattern
# captures the part of the tag to keep ("[tag" or "[/tag").
#
# Each tag name goes through re.escape() so that "*" (the list item tag) is
# matched literally. Unescaped, it acted as a quantifier on the preceding
# "\[", which made the opening pattern match a bare ":<10 hex digits>"
# anywhere in the text and kept the closing pattern from ever matching.

BBCODE_RES = [(
    re.compile(r"(\[%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}" % re.escape(tag)),
    re.compile(r"(\[/%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}\]" % re.escape(tag))
) for tag in BBCODE_TAGS]
20
21
def unescape(text):
    """Remove HTML or XML character references and entities from a string.

    Numeric character references (decimal ``&#38;`` or hexadecimal
    ``&#x26;`` / ``&#X26;``) and named entities (``&amp;``) are replaced by
    the character they denote. Anything that cannot be resolved -- an
    unknown entity name, malformed digits, or a number that unichr()
    rejects -- is left in the text exactly as it was found.

    Source: http://effbot.org/zone/re-sub.htm#unescape-html

    text -- the HTML (or XML) source text.

    Returns the plain text, as a Unicode string, if necessary.

    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                # The hex marker may be written as "x" or "X".
                if text[:3].lower() == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: malformed digits or a code point outside the
                # range unichr() supports. OverflowError: the number is too
                # large for unichr() to even accept. Either way, fall
                # through and keep the original text.
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return re.sub(r"&#?\w+;", fixup, text)
49
50
def unphpbb(s, encoding='latin-1'):
    """Converts BBCode from phpBB database data into 'pure' BBCode.

    phpBB doesn't store plain BBCode in its database. The BBCode tags have
    "uids" added to them and the data has already been HTML entity'ized.
    This function removes the uid stuff and undoes the entity'ification and
    returns the result as a unicode string.

    If the input 's' is not already unicode, it will be decoded using the
    supplied encoding; bytes that cannot be decoded are replaced rather
    than raising an error.

    """
    if not isinstance(s, unicode):
        s = s.decode(encoding, 'replace')
    for start, end in BBCODE_RES:
        # Call sub() on the compiled patterns directly. The previous
        # re.sub(pattern, repl, s, re.MULTILINE) form passed the flag as the
        # positional 'count' argument, which silently limited each
        # substitution to the first 8 matches.
        s = start.sub(r'\1', s)
        s = end.sub(r'\1]', s)
    return unescape(s)