annotate legacy/phpbb.py @ 887:9a15f7c27526

Actually save model object upon change. This commit was tested on the comments model. Additional logging added. Added check for Markdown image references. Added TODOs after observing behavior on comments.
author Brian Neal <bgneal@gmail.com>
date Tue, 03 Feb 2015 21:09:44 -0600
parents ee87ea74d46b
children
rev   line source
bgneal@290 1 """
bgneal@290 2 This module contains functions for working with data from the legacy phpBB
bgneal@290 3 based website.
bgneal@290 4 """
bgneal@290 5 import re
bgneal@290 6 import htmlentitydefs
bgneal@290 7
bgneal@290 8
# BBCode tags used by the old site
BBCODE_TAGS = "b i u s url quote img list * code color size".split()

# Regular expressions used to get rid of phpBB's uid inside BBCode tags.
# This is a list of regular expression pairs. Element 0 of each pair
# is for the opening tag & element 1 is for the closing tag.
#
# Each tag name is passed through re.escape() before being interpolated:
# the list-item tag "*" is a regex quantifier, and unescaped it turned the
# opening pattern into "zero or more '['", which matched any ":<uid>" run in
# ordinary text and kept the closing pattern from ever matching "[/*:uid]".
BBCODE_RES = [(
    re.compile(r"(\[%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}" % re.escape(tag)),
    re.compile(r"(\[/%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}\]" % re.escape(tag))
) for tag in BBCODE_TAGS]
bgneal@290 20
bgneal@290 21
# Source: http://effbot.org/zone/re-sub.htm#unescape-html
def unescape(text):
    """Remove HTML or XML character references and entities from a string.

    Numeric references (``&#65;``, ``&#x41;``) and named entities
    (``&amp;``) are replaced by the character they denote. Anything that
    cannot be resolved (unknown entity name, malformed or out-of-range
    number) is left in the output exactly as it appeared.

    text -- the HTML (or XML) source text.

    Returns the plain text, as a Unicode string if necessary.
    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference; the slice bounds drop the
            # leading "&#" / "&#x" and the trailing ";".
            try:
                if ref[:3] == "&#x":
                    return unichr(int(ref[3:-1], 16))
                return unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: digits are not a valid number or the code
                # point is out of range. OverflowError: the number is too
                # large for unichr() to even accept. Either way, keep the
                # original text rather than aborting the whole conversion.
                pass
        else:
            # Named entity; look the name (without "&" and ";") up.
            try:
                return unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is
    return re.sub(r"&#?\w+;", fixup, text)
bgneal@290 49
bgneal@290 50
def unphpbb(s, encoding='latin-1'):
    """Converts BBCode from phpBB database data into 'pure' BBCode.

    phpBB doesn't store plain BBCode in its database. The BBCode tags have
    "uids" added to them and the data has already been HTML entity'ized.
    This function removes the uid stuff and undoes the entity'ification and
    returns the result as a unicode string.

    If the input 's' is not already unicode, it will be decoded using the
    supplied encoding; bytes that cannot be decoded are replaced rather than
    raising an error.

    """
    if not isinstance(s, unicode):
        s = s.decode(encoding, 'replace')
    for start, end in BBCODE_RES:
        # Call sub() on the compiled patterns with no count so that every
        # occurrence is rewritten. The previous re.sub(pattern, repl, s,
        # re.MULTILINE) form passed the flag in the positional 'count'
        # slot, silently limiting each tag to its first 8 replacements.
        s = start.sub(r'\1', s)
        s = end.sub(r'\1]', s)
    return unescape(s)