Mercurial > public > sg101
diff gpp/legacy/phpbb.py @ 290:64c188a9d31f
Adding a legacy app to contain management commands to convert the old data to the new database format. This first commit has the import_old_users command.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Fri, 24 Dec 2010 05:28:58 +0000 |
parents | |
children | 254db4cb6a86 |
line wrap: on
line diff
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/legacy/phpbb.py Fri Dec 24 05:28:58 2010 +0000 @@ -0,0 +1,65 @@
"""
This module contains functions for working with data from the legacy phpBB
based website.
"""
import re

try:
    import htmlentitydefs
except ImportError:  # Python 3 renamed the module
    import html.entities as htmlentitydefs

# Text-type helpers that resolve to the right builtin on Python 2 and 3.
try:
    _unichr = unichr
    _text_type = unicode
except NameError:  # Python 3
    _unichr = chr
    _text_type = str


# BBCode tags used by the old site
BBCODE_TAGS = "b i u s url quote img list * code color size".split()

# Regular expressions used to get rid of phpBB's uid inside BBCode tags.
# This is a list of regular expression pairs. Element 0 of each pair
# is for the opening tag & element 1 is for the closing tag.
#
# The tag name is passed through re.escape() because the list item tag "*"
# is a regex metacharacter; unescaped, "\[*" means "zero or more '['", which
# makes the opening pattern match a bare ":<uid>" anywhere in the text.
BBCODE_RES = [(
    re.compile(r"(\[%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}" % re.escape(tag)),
    re.compile(r"(\[/%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}\]" % re.escape(tag))
) for tag in BBCODE_TAGS]

# Matches numeric character references (&#65; / &#x41;) and named
# entities (&amp;).
_ENTITY_RE = re.compile(r"&#?\w+;")


def unescape(text):
    """Removes HTML or XML character references and entities from a string.

    Numeric references that are malformed or out of range, and named
    entities that are not in htmlentitydefs.name2codepoint, are left in the
    text unchanged.

    Source: http://effbot.org/zone/re-sub.htm#unescape-html

    @param text The HTML (or XML) source text.
    @return The plain text, as a Unicode string, if necessary.

    """
    def fixup(m):
        entity = m.group(0)
        if entity[:2] == "&#":
            # character reference
            try:
                if entity[:3].lower() == "&#x":
                    return _unichr(int(entity[3:-1], 16))
                return _unichr(int(entity[2:-1]))
            except (ValueError, OverflowError):
                # Not a number, or a code point the platform can't
                # represent; fall through and keep the original text.
                pass
        else:
            # named entity
            try:
                return _unichr(htmlentitydefs.name2codepoint[entity[1:-1]])
            except KeyError:
                pass
        return entity  # leave as is

    return _ENTITY_RE.sub(fixup, text)


def unphpbb(s):
    """Converts BBCode from phpBB database data into 'pure' BBCode.

    phpBB doesn't store plain BBCode in its database. The BBCode tags have
    "uids" added to them and the data has already been HTML entity'ized.
    This function removes the uid stuff and undoes the entity'ification and
    returns the result as a unicode string.

    @param s The phpBB text; a byte string is decoded as UTF-8, with
             undecodable bytes replaced.
    @return The cleaned-up text as a unicode string.

    """
    if not isinstance(s, _text_type):
        s = s.decode('utf-8', 'replace')
    for start, end in BBCODE_RES:
        # Use the compiled pattern's own sub() with no count so that every
        # occurrence is rewritten. (A 4th positional argument to re.sub()
        # is a replacement count, not a flags value.)
        s = start.sub(r'\1', s)
        s = end.sub(r'\1]', s)
    return unescape(s)