sg101: legacy/phpbb.py annotate

annotate legacy/phpbb.py @ 887:9a15f7c27526

Actually save model object upon change. This commit was tested on the comments model. Additional logging added. Added check for Markdown image references. Added TODOs after observing behavior on comments.

author	Brian Neal <bgneal@gmail.com>
date	Tue, 03 Feb 2015 21:09:44 -0600
parents	ee87ea74d46b
children

rev	line source
bgneal@290	1 """
bgneal@290	2 This module contains functions for working with data from the legacy phpBB
bgneal@290	3 based website.
bgneal@290	4 """
bgneal@290	5 import re
bgneal@290	6 import htmlentitydefs
bgneal@290	7
bgneal@290	8
# Names of the BBCode tags that were in use on the old site.
BBCODE_TAGS = "b i u s url quote img list * code color size".split()

# phpBB stores a "uid" inside every BBCode tag; these patterns locate it so
# it can be removed. Both templates capture the leading "[tag" / "[/tag"
# text in group 1 so a substitution can keep it while dropping the
# ":<optional-flag>:<10 hex digit uid>" suffix.
#
# NOTE(review): the tag name is interpolated as-is (no re.escape), so for the
# "*" tag the star acts as a regex quantifier rather than a literal
# character. Confirm against real data before changing this.
_OPENING_TAG_TEMPLATE = r"(\[%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}"
_CLOSING_TAG_TEMPLATE = r"(\[/%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}\]"

# One (opening_regex, closing_regex) pair per entry in BBCODE_TAGS, in the
# same order.
BBCODE_RES = [
    (
        re.compile(_OPENING_TAG_TEMPLATE % tag),
        re.compile(_CLOSING_TAG_TEMPLATE % tag),
    )
    for tag in BBCODE_TAGS
]
bgneal@290	20
bgneal@290	21
def unescape(text):
    """Replace HTML/XML character references and named entities in `text`.

    Numeric references ("&#65;", "&#x41;") and named entities ("&amp;") are
    converted to the character they stand for. Anything that looks like a
    reference but cannot be decoded (unknown entity name, malformed or
    out-of-range number) is left in the output exactly as it was found.

    @param text The HTML (or XML) source text.
    @return The text with the decodable references replaced.

    Source: http://effbot.org/zone/re-sub.htm#unescape-html

    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference: hexadecimal or decimal.
            try:
                if ref[:3] == "&#x":
                    return unichr(int(ref[3:-1], 16))
                return unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number, or code point out of range.
                # OverflowError: number too large to convert at all. Either
                # way the reference is bogus, so fall through and keep it.
                pass
        else:
            # Named entity, e.g. "&amp;".
            try:
                return unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
bgneal@290	49
bgneal@290	50
def unphpbb(s, encoding='latin-1'):
    """Converts BBCode from phpBB database data into 'pure' BBCode.

    phpBB doesn't store plain BBCode in its database. The BBCode tags have
    "uids" added to them and the data has already been HTML entity'ized.
    This function removes the uid stuff and undoes the entity'ification and
    returns the result as a unicode string.

    If the input 's' is not already unicode, it will be decoded using the
    supplied encoding; bytes that cannot be decoded are replaced rather than
    raising an error.

    """
    if not isinstance(s, unicode):
        s = s.decode(encoding, 'replace')

    for start, end in BBCODE_RES:
        # Call .sub() on the compiled patterns directly. The 4th positional
        # argument of re.sub() is `count`, not `flags`, so passing
        # re.MULTILINE there capped each pattern at 8 replacements.
        s = start.sub(r'\1', s)
        s = end.sub(r'\1]', s)

    return unescape(s)

Mercurial > public > sg101

annotate legacy/phpbb.py @ 887:9a15f7c27526