diff legacy/phpbb.py @ 581:ee87ea74d46b

For Django 1.4, rearranged project structure for new manage.py.
author Brian Neal <bgneal@gmail.com>
date Sat, 05 May 2012 17:10:48 -0500
parents gpp/legacy/phpbb.py@254db4cb6a86
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/legacy/phpbb.py	Sat May 05 17:10:48 2012 -0500
@@ -0,0 +1,68 @@
+"""
+This module contains functions for working with data from the legacy phpBB
+based website.
+"""
+import re
+import htmlentitydefs
+
+
+# BBCode tags used by the old site
+BBCODE_TAGS = "b i u s url quote img list * code color size".split()
+
+# Regular expressions used to get rid of phpBB's uid inside BBCode tags.
+# This is a list of regular expression pairs. Element 0 of each pair
+# is for the opening tag & element 1 is for the closing tag.
+
+BBCODE_RES = [(
+    re.compile(r"(\[%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}" % tag),
+    re.compile(r"(\[/%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}\]" % tag)
+) for tag in BBCODE_TAGS]
+
+
+##
+# Removes HTML or XML character references and entities from a text string.
+#
+# @param text The HTML (or XML) source text.
+# @return The plain text, as a Unicode string, if necessary.
+# Source: http://effbot.org/zone/re-sub.htm#unescape-html
+#
+def unescape(text):
+    def fixup(m):
+        text = m.group(0)
+        if text[:2] == "&#":
+            # character reference
+            try:
+                if text[:3] == "&#x":
+                    return unichr(int(text[3:-1], 16))
+                else:
+                    return unichr(int(text[2:-1]))
+            except ValueError:
+                pass
+        else:
+            # named entity
+            try:
+                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+            except KeyError:
+                pass
+        return text # leave as is
+    return re.sub("&#?\w+;", fixup, text)
+
+
+def unphpbb(s, encoding='latin-1'):
+    """Converts BBCode from phpBB database data into 'pure' BBCode.
+
+    phpBB doesn't store plain BBCode in its database. The BBCode tags have
+    "uids" added to them and the data has already been HTML entity'ized.
+    This function removes the uid stuff and undoes the entity'ification and
+    returns the result as a unicode string.
+
+    If the input 's' is not already unicode, it will be decoded using the
+    supplied encoding.
+
+    """
+    if not isinstance(s, unicode):
+        s = s.decode(encoding, 'replace')
+    for start, end in BBCODE_RES:
+        s = re.sub(start, r'\1', s, re.MULTILINE)
+        s = re.sub(end, r'\1]', s, re.MULTILINE)
+    return unescape(s)