bgneal@290: """
bgneal@290: This module contains functions for working with data from the legacy phpBB
bgneal@290: based website.
bgneal@290: """
bgneal@290: import re
bgneal@290: import htmlentitydefs
bgneal@290: 
bgneal@290: 
# BBCode tags used by the old site.
BBCODE_TAGS = [
    "b", "i", "u", "s", "url", "quote", "img", "list", "*", "code",
    "color", "size",
]

# Compiled regular expressions for stripping phpBB's uid out of BBCode tags.
# There is one (opening, closing) pair per entry of BBCODE_TAGS, in the same
# order. An opening pattern matches "[tag:uid" (the closing bracket is not
# consumed); a closing pattern matches "[/tag:uid]". In both, the uid is 10
# hex characters, optionally preceded by a "[0-9a-fu]+:" segment, and group 1
# captures the text in front of the uid so it can be kept by the substitution.
#
# NOTE(review): the tag is interpolated without re.escape(), so for "*" the
# patterns are r"(\[*):..." and r"(\[/*):...", where '*' acts as a
# quantifier rather than a literal. Reproduced as-is; confirm what relies on
# this before changing it.
BBCODE_RES = []
for tag in BBCODE_TAGS:
    BBCODE_RES.append((
        re.compile(r"(\[%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}" % tag),
        re.compile(r"(\[/%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}\]" % tag),
    ))
bgneal@290: 
bgneal@290: 
def unescape(text):
    """Replace HTML/XML character references and named entities in text.

    Numeric character references (decimal ``&#65;`` or hexadecimal
    ``&#x41;``) and named entities found in
    ``htmlentitydefs.name2codepoint`` (``&amp;``) are replaced by the
    character they denote. Anything that cannot be converted is left in the
    output exactly as it appeared: unknown entity names, malformed numbers,
    and numbers that ``unichr`` refuses.

    Based on http://effbot.org/zone/re-sub.htm#unescape-html

    @param text The HTML (or XML) source text.
    @return The text with every convertible reference replaced.
    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference: hexadecimal after "&#x",
            # decimal otherwise. The slices drop the prefix and the ';'.
            try:
                if ref[:3] == "&#x":
                    return unichr(int(ref[3:-1], 16))
                return unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: the digits do not parse, or the code point is
                # outside the range unichr accepts.
                # OverflowError: the number is too large to be handed to
                # unichr at all (e.g. "&#99999999999999999999;").
                pass
        else:
            # Named entity: look up the name between '&' and ';'.
            try:
                return unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # not convertible; leave as is
    return re.sub(r"&#?\w+;", fixup, text)
bgneal@290: 
bgneal@290: 
def unphpbb(s, encoding='latin-1'):
    """Converts BBCode from phpBB database data into 'pure' BBCode.

    phpBB doesn't store plain BBCode in its database. The BBCode tags have
    "uids" added to them and the data has already been HTML entity'ized.
    This function removes the uid stuff and undoes the entity'ification and
    returns the result as a unicode string.

    If the input 's' is not already unicode, it will be decoded using the
    supplied encoding; bytes that cannot be decoded are replaced rather
    than raising an error.

    """
    if not isinstance(s, unicode):
        s = s.decode(encoding, 'replace')
    for start, end in BBCODE_RES:
        # Substitute through the compiled patterns with no count limit, so
        # every occurrence is rewritten. The module-level
        # re.sub(pattern, repl, string, count, ...) takes `count` as its
        # fourth positional argument, so passing re.MULTILINE there would
        # cap the replacements at 8 per pattern instead of setting a flag.
        s = start.sub(r'\1', s)
        # The closing patterns consume the trailing ']', so put it back.
        s = end.sub(r'\1]', s)
    return unescape(s)