sg101: gpp/legacy/phpbb.py comparison

comparison gpp/legacy/phpbb.py @ 290:64c188a9d31f

Adding a legacy app to contain management commands to convert the old data to the new database format. This first commit has the import_old_users command.

author	Brian Neal <bgneal@gmail.com>
date	Fri, 24 Dec 2010 05:28:58 +0000
parents
children	254db4cb6a86

comparison

equal deleted inserted replaced

-:0dd8989abef2
+:64c188a9d31f
+"""
+This module contains functions for working with data from the legacy phpBB
+based website.
+"""
+import re
+import htmlentitydefs
# BBCode tags used by the old site
BBCODE_TAGS = "b i u s url quote img list * code color size".split()

# Regular expressions used to get rid of phpBB's uid inside BBCode tags.
# This is a list of regular expression pairs. Element 0 of each pair
# is for the opening tag & element 1 is for the closing tag.
#
# Each pattern captures the leading "[tag" (or "[/tag") as group 1 and then
# consumes ":", an optional "<chars>:" qualifier and a 10-character hex uid.
# The opening pattern stops before the closing "]"; the closing pattern
# consumes it, so its replacement must put the "]" back.
#
# Tag names go through re.escape() so that "*" (the list item tag) is
# matched literally. Unescaped, it acts as a quantifier on the preceding
# "\[" and the opening pattern then matches any bare ":<10 hex chars>" run
# in ordinary text.
BBCODE_RES = [(
    re.compile(r"(\[%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}" % re.escape(tag)),
    re.compile(r"(\[/%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}\]" % re.escape(tag)),
) for tag in BBCODE_TAGS]
##
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
# Source: http://effbot.org/zone/re-sub.htm#unescape-html
#
def unescape(text):
    """Replace character references and named entities with characters.

    Numeric references (decimal ``&#38;`` or hexadecimal ``&#x26;``) and
    named entities (``&amp;``) are converted to the corresponding unicode
    character. Anything that cannot be resolved -- an unknown entity name,
    a malformed number, or a number ``unichr`` refuses -- is left in the
    text exactly as it was written.
    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference: "&#x..;" is hex, else decimal.
            try:
                if ref[:3] == "&#x":
                    return unichr(int(ref[3:-1], 16))
                return unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: the digits do not parse or the code point is
                # out of unichr's range. OverflowError: the number is too
                # large for unichr's argument. Either way, keep the text.
                pass
        else:
            # Named entity: look the name up without the "&" and ";".
            try:
                return unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
def unphpbb(s):
    """Converts BBCode from phpBB database data into 'pure' BBCode.

    phpBB doesn't store plain BBCode in its database. The BBCode tags have
    "uids" added to them and the data has already been HTML entity'ized.
    This function removes the uid stuff and undoes the entity'ification and
    returns the result as a unicode string.

    If s is not already a unicode object it is decoded as UTF-8, with
    undecodable bytes replaced rather than raising an error.
    """
    if not isinstance(s, unicode):
        s = s.decode('utf-8', 'replace')

    # Use the compiled patterns' own sub() so every occurrence is replaced.
    # The fourth positional argument of re.sub() is count, not flags, so
    # passing re.MULTILINE there would cap the substitutions per tag at 8.
    # No flag is needed anyway: the patterns contain no ^ or $ anchors.
    for start, end in BBCODE_RES:
        s = start.sub(r'\1', s)
        # The closing pattern consumes the trailing "]", so restore it.
        s = end.sub(r'\1]', s)

    return unescape(s)

Mercurial > public > sg101

comparison gpp/legacy/phpbb.py @ 290:64c188a9d31f