annotate legacy/phpbb.py @ 1203:8cd15df9b563

Controlling the xapian install script in tools.
author Brian Neal <bgneal@gmail.com>
date Sat, 04 Jan 2025 14:19:19 -0600
parents ee87ea74d46b
children
rev   line source
bgneal@290 1 """
bgneal@290 2 This module contains functions for working with data from the legacy phpBB
bgneal@290 3 based website.
bgneal@290 4 """
bgneal@290 5 import re
bgneal@290 6 import htmlentitydefs
bgneal@290 7
bgneal@290 8
# BBCode tags used by the old site
BBCODE_TAGS = "b i u s url quote img list * code color size".split()

# Regular expressions used to get rid of phpBB's uid inside BBCode tags.
# This is a list of regular expression pairs, one pair per entry in
# BBCODE_TAGS and in the same order. Element 0 of each pair is for the
# opening tag & element 1 is for the closing tag.
#
# In both patterns group 1 captures the part of the tag to keep ("[tag" or
# "[/tag"). The opening pattern stops right after the uid, leaving whatever
# follows it untouched; the closing pattern also consumes the final "]", so a
# replacement string for it has to put the "]" back.
#
# Tag names go through re.escape() because the list-item tag "*" is a regex
# metacharacter. Unescaped, "\[*" means "zero or more '['", which made the
# opening pattern strip any ":<10 hex digits>" found anywhere in the text and
# kept the closing pattern from ever matching "[/*:uid]".

# What phpBB appends to a tag name: ":", an optional "<chars>:" segment, and
# a uid of 10 hex digits.
_UID_SUFFIX = r":(?:[0-9a-fu]+:)?[0-9a-f]{10}"

BBCODE_RES = [
    (
        re.compile(r"(\[%s)%s" % (re.escape(tag), _UID_SUFFIX)),
        re.compile(r"(\[/%s)%s\]" % (re.escape(tag), _UID_SUFFIX)),
    )
    for tag in BBCODE_TAGS
]
bgneal@290 20
bgneal@290 21
def unescape(text):
    """Removes HTML or XML character references and entities from a string.

    Three forms are recognised: decimal character references ("&#38;"),
    hexadecimal character references ("&#x26;" or "&#X26;") and named
    entities found in htmlentitydefs.name2codepoint ("&amp;"). Each one is
    replaced by the character it stands for.

    Anything that cannot be converted is left in the text exactly as it was:
    unknown entity names, malformed digits, and numbers that unichr() refuses
    to turn into a character.

    Source: http://effbot.org/zone/re-sub.htm#unescape-html

    text -- the HTML (or XML) source text.

    Returns the plain text, as a Unicode string, if necessary.

    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                if ref[:3] in ("&#x", "&#X"):
                    return unichr(int(ref[3:-1], 16))
                return unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: the digits don't parse or the code point is
                # out of range. OverflowError: the number is too large for
                # unichr() to accept as an argument at all.
                pass
        else:
            # named entity
            try:
                return unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
bgneal@290 49
bgneal@290 50
def unphpbb(s, encoding='latin-1'):
    """Converts BBCode from phpBB database data into 'pure' BBCode.

    phpBB doesn't store plain BBCode in its database. The BBCode tags have
    "uids" added to them and the data has already been HTML entity'ized.
    This function removes the uid stuff and undoes the entity'ification and
    returns the result as a unicode string.

    If the input 's' is not already unicode, it will be decoded using the
    supplied encoding. Bytes that are not valid in that encoding are
    substituted rather than raising an error (the 'replace' error handler).

    s -- the text as read from the phpBB database.
    encoding -- codec used to decode 's' when it is not unicode.

    """
    if not isinstance(s, unicode):
        s = s.decode(encoding, 'replace')

    for start, end in BBCODE_RES:
        # Use the compiled patterns' own sub() with no count so that every
        # occurrence is rewritten. The 4th positional argument of the
        # module-level re.sub() is 'count', not 'flags'; passing
        # re.MULTILINE (8) there limited each substitution to the first 8
        # matches and left later tags with their uids intact.
        s = start.sub(r'\1', s)
        # The closing pattern swallows the trailing "]", so put it back.
        s = end.sub(r'\1]', s)

    return unescape(s)