"""
This module contains functions for working with data from the legacy phpBB
based website.
"""
import re

try:  # Python 2
    import htmlentitydefs
except ImportError:  # Python 3 renamed the module
    import html.entities as htmlentitydefs

# Text type and code-point-to-character function for the running interpreter.
try:  # Python 2
    _unichr = unichr
    _text_type = unicode
except NameError:  # Python 3
    _unichr = chr
    _text_type = str


# BBCode tags used by the old site
BBCODE_TAGS = "b i u s url quote img list * code color size".split()

# Regular expressions used to get rid of phpBB's uid inside BBCode tags.
# This is a list of regular expression pairs. Element 0 of each pair
# is for the opening tag & element 1 is for the closing tag.
#
# NOTE(review): the tag name is interpolated without re.escape(), so for the
# "*" tag the opening pattern compiles to r"(\[*):..." and matches a uid
# preceded by any text, not just "[*". Tags carrying an attribute before the
# uid (e.g. "[color=red:<uid>]") appear to be cleaned only because of this;
# the patterns are therefore left untouched -- confirm before tightening.

BBCODE_RES = [(
    re.compile(r"(\[%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}" % tag),
    re.compile(r"(\[/%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}\]" % tag)
) for tag in BBCODE_TAGS]

# Matches both character references (&#65; / &#x41;) and named entities
# (&amp;). Compiled once because unescape() runs for every converted post.
_ENTITY_RE = re.compile(r"&#?\w+;")


def unescape(text):
    """Remove HTML or XML character references and entities from a string.

    text -- the HTML (or XML) source text.

    Returns the plain text, as a Unicode string if necessary. References
    that cannot be resolved (unknown entity name, malformed or out-of-range
    number) are left in the output unchanged.

    Source: http://effbot.org/zone/re-sub.htm#unescape-html

    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                if ref[:3] == "&#x":
                    return _unichr(int(ref[3:-1], 16))
                return _unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number, or outside the Unicode range.
                # OverflowError: number too large for unichr()/chr().
                pass
        else:
            # named entity
            try:
                return _unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    return _ENTITY_RE.sub(fixup, text)


def unphpbb(s, encoding='latin-1'):
    """Converts BBCode from phpBB database data into 'pure' BBCode.

    phpBB doesn't store plain BBCode in its database. The BBCode tags have
    "uids" added to them and the data has already been HTML entity'ized.
    This function removes the uid stuff and undoes the entity'ification and
    returns the result as a unicode string.

    If the input 's' is not already unicode, it will be decoded using the
    supplied encoding; undecodable bytes are replaced rather than raising.

    """
    if not isinstance(s, _text_type):
        s = s.decode(encoding, 'replace')
    for start, end in BBCODE_RES:
        # Use the compiled pattern's own sub() with no count so that every
        # occurrence is rewritten. (Passing re.MULTILINE as the 4th
        # positional argument of re.sub() sets count=8, not a flag.)
        s = start.sub(r'\1', s)
        s = end.sub(r'\1]', s)
    return unescape(s)