Mercurial > public > sg101
changeset 290:64c188a9d31f
Adding a legacy app to contain management commands to convert the old data to the new database format. This first commit has the import_old_users command.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Fri, 24 Dec 2010 05:28:58 +0000 |
parents | 0dd8989abef2 |
children | a6357f2bcbbc |
files | gpp/legacy/__init__.py gpp/legacy/html2md.py gpp/legacy/management/__init__.py gpp/legacy/management/commands/__init__.py gpp/legacy/management/commands/import_old_users.py gpp/legacy/models.py gpp/legacy/phpbb.py gpp/legacy/tests.py gpp/legacy/views.py gpp/settings.py |
diffstat | 7 files changed, 550 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# gpp/legacy/html2md.py
"""
This module contains a class derived from Python's HTMLParser to convert HTML to
Markdown. Currently this class only supports those HTML tags that have counter-
parts in BBCode used by stock phpBB 2.x.

In other words, this class was created to help convert data from a phpBB
forum to Markdown syntax and its scope is currently limited to that task.

"""
try:
    from HTMLParser import HTMLParser
    import htmlentitydefs
except ImportError:  # Python 3 renamed both modules
    from html.parser import HTMLParser
    import html.entities as htmlentitydefs

try:
    unichr
except NameError:  # Python 3: chr() already returns text
    unichr = chr


# Let's call Markdown markup entities "elements" to avoid confusion
# with HTML tags.

class ElementBase(object):
    """
    Base class for all Markdown elements.

    An element accumulates text in ``data`` while its HTML tag is open and
    renders itself with ``markdown()`` when the tag is closed.

    """
    def __init__(self, attrs=None):
        # attrs is the list of (name, value) pairs HTMLParser hands to
        # handle_starttag(); it is stored as a dict for easy lookup.
        self.data = u''
        self.attrs = dict(attrs) if attrs else {}

    def add_data(self, data):
        """Append text (or the Markdown of a closed child element)."""
        self.data += data

    def markdown(self):
        """Return the Markdown text for this element."""
        return self.data


class TextElement(ElementBase):
    """
    TextElements represent text fragments not inside HTML tags.
    """
    pass


class EmphasisElement(ElementBase):
    """
    An EmphasisElement is a Markdown element used to indicate emphasis and is
    represented by placing characters around text. E.g. _em_, **bold**

    """
    def __init__(self, tag, attrs):
        super(EmphasisElement, self).__init__(attrs)
        # tag is the Markdown delimiter (e.g. '_' or '**'), not an HTML tag.
        self.tag = tag

    def markdown(self):
        return u'%s%s%s' % (self.tag, self.data, self.tag)


def create_emphasis(tag):
    """
    Returns a function that creates an EmphasisElement using the supplied
    tag.

    """
    def inner(attrs):
        return EmphasisElement(tag, attrs)
    return inner


class HtmlElement(ElementBase):
    """
    Markdown also accepts HTML markup. This element represents a HTML tag that
    maps to itself in Markdown.

    """
    def __init__(self, tag, attrs):
        super(HtmlElement, self).__init__(attrs)
        self.tag = tag

    def markdown(self):
        return u'<%s>%s</%s>' % (self.tag, self.data, self.tag)


def create_html(tag):
    """
    Returns a function that creates a HtmlElement using the supplied tag.

    """
    def inner(attrs):
        return HtmlElement(tag, attrs)
    return inner


class QuoteElement(ElementBase):
    """
    Class to represent a blockquote in Markdown.

    """
    def markdown(self):
        # Every line of the quoted text needs its own '> ' prefix.
        return u'> %s\n\n' % self.data.replace('\n', '\n> ')


class BreakElement(ElementBase):
    """
    Class to represent a linebreak in Markdown.

    """
    def markdown(self):
        # Markdown only produces a hard line break when the line ends with
        # two (or more) spaces before the newline.
        return u'  \n'


class DivElement(ElementBase):
    """
    This class maps a HTML <div> into a block of text surrounded by newlines.

    """
    def markdown(self):
        return u'\n%s\n' % self.data


class LinkElement(ElementBase):
    """
    This class maps HTML <a> tags into Markdown links.
    If no data is present, the actual href is used for the link text.
    An <a> tag without a href (e.g. a named anchor) renders as its plain text.

    """
    def markdown(self):
        url = self.attrs.get('href')
        if not url:
            return self.data
        text = self.data if self.data else url
        return u'[%s](%s)' % (text, url)


class ImageElement(ElementBase):
    """
    This class maps HTML <img> tags into Markdown.
    This element assumes no alt text is present, and simply uses the word
    'image' for the alt text. An <img> without a src renders as nothing.

    """
    def markdown(self):
        url = self.attrs.get('src')
        if not url:
            return u''
        return u'![image](%s)' % url


class CodeElement(ElementBase):
    """
    This class is used to create code blocks in Markdown.

    """
    def markdown(self):
        # Markdown code blocks are lines indented by four spaces.
        return u'    %s\n' % self.data.replace('\n', '\n    ')


# List (ordered & unordered) support:

class ListElement(ElementBase):
    """
    This class creates Markdown for unordered lists. The bullet() method can be
    overridden to create ordered lists.

    """
    def __init__(self, attrs=None):
        super(ListElement, self).__init__(attrs)
        self.items = []
        # 1 for a top-level list; MarkdownWriter overwrites this with the
        # real nesting depth when the element is pushed.
        self.list_nesting = 1

    def add_data(self, data):
        # Each chunk of data handed to a list becomes one list item.
        self.items.append(data)

    def bullet(self):
        return u'*'

    def markdown(self):
        indent = u' ' * (4 * (self.list_nesting - 1))
        prefix = u'\n%s%s ' % (indent, self.bullet())
        return u''.join([prefix + item for item in self.items])


class OrderedListElement(ListElement):
    """
    This class creates Markdown for ordered lists.

    """
    def bullet(self):
        return u'1.'


class ItemElement(ElementBase):
    """
    This element is used to represent ordered & unordered list items.

    """
    pass

###############################################################################
###############################################################################

class MarkdownWriter(HTMLParser):
    """
    This class is an HTMLParser that converts a subset of HTML to Markdown.

    Usage: feed() the HTML, then call markdown() to get the converted text.
    Call reset() before reusing an instance for another document.

    Tags without an entry in elem_factories are transparent: their text is
    kept but the tags themselves neither open nor close an element.

    """

    elem_factories = {
        'a': LinkElement,
        'blockquote': QuoteElement,
        'br': BreakElement,
        'div': DivElement,
        'em': create_emphasis('_'),
        'img': ImageElement,
        'li': ItemElement,
        'ol': OrderedListElement,
        'pre': CodeElement,
        's': create_html('strike'),
        'strong': create_emphasis('**'),
        'u': create_html('u'),
        'ul': ListElement,
    }

    # Supported tags that never have content or a closing tag. They are
    # closed as soon as they are opened so that "<br>" and "<br />" behave
    # the same and following text is not swallowed by the element.
    void_tags = frozenset(['br', 'img'])

    def __init__(self):
        HTMLParser.__init__(self)
        self.reset()

    def handle_starttag(self, tag, attrs):
        factory = self.elem_factories.get(tag)
        if factory is None:
            # Unsupported tag: keep its text, ignore the markup.
            return
        self._push_elem(factory(attrs))
        if tag in self.void_tags:
            self._pop_elem()

    def handle_endtag(self, tag):
        if tag not in self.elem_factories or tag in self.void_tags:
            # Unsupported tags never opened an element, and void tags were
            # already closed in handle_starttag().
            return
        if not self.elem_stack or isinstance(self.elem_stack[-1], TextElement):
            # Stray closing tag with no open element: nothing to close.
            return
        self._pop_elem()

    def handle_data(self, data):
        if not self.elem_stack:
            self._push_elem(TextElement())
        self._add_data(data)

    def handle_entityref(self, name):
        try:
            text = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            # Unknown entity: fall back to its bare name.
            text = name
        self.handle_data(text)

    def handle_charref(self, name):
        # name is the text between '&#' and ';', either decimal ("65") or
        # hexadecimal with an x prefix ("x41").
        try:
            if name[:1] in ('x', 'X'):
                code = int(name[1:], 16)
            else:
                code = int(name)
            text = unichr(code)
        except (ValueError, OverflowError):
            # Not a valid code point: emit the reference unchanged.
            text = u'&#%s;' % name
        self.handle_data(text)

    def reset(self):
        HTMLParser.reset(self)
        self.elem_stack = []    # currently open elements, innermost last
        self.elements = []      # finished top-level elements, in order
        self.list_nesting = 0   # depth of currently open lists

    def _push_elem(self, tag):
        """
        Open a new element. Despite its name, ``tag`` is an ElementBase
        instance. A pending top-level TextElement is finished first.

        """
        if self.elem_stack and isinstance(self.elem_stack[-1], TextElement):
            self._pop_elem()
        if isinstance(tag, ListElement):
            self.list_nesting += 1
            tag.list_nesting = self.list_nesting
        self.elem_stack.append(tag)

    def _pop_elem(self):
        """
        Close the innermost open element and hand its Markdown to its parent,
        or record it as a finished top-level element.

        """
        if not self.elem_stack:
            return
        element = self.elem_stack.pop()
        if isinstance(element, ListElement):
            self.list_nesting -= 1
        if self.elem_stack:
            self.elem_stack[-1].add_data(element.markdown())
        else:
            self.elements.append(element)

    def _add_data(self, data):
        self.elem_stack[-1].add_data(data)

    def markdown(self):
        """
        Return the Markdown for everything fed so far. Elements that are
        still open are closed first.

        """
        while self.elem_stack:
            self._pop_elem()
        return u''.join([e.markdown() for e in self.elements])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/legacy/management/commands/import_old_users.py Fri Dec 24 05:28:58 2010 +0000 @@ -0,0 +1,163 @@ +""" +import_old_users.py - For importing users from SG101 1.0 as csv files. +""" +from __future__ import with_statement +import csv +import optparse +import re +import sys +from datetime import datetime + +import postmarkup + +from django.core.management.base import LabelCommand, CommandError +from django.contrib.auth.models import User + +import bio.models +from legacy.phpbb import unphpbb +from legacy.html2md import MarkdownWriter + +TIME_ZONES = { + '-5': 'US/Eastern', + '-6': 'US/Central', + '-7': 'US/Mountain', + '-8': 'US/Pacific', +} +USERNAME_RE = re.compile(r'^[\w.@+-]+$') +USERNAME_LEN = (1, 30) # min & max length values + + +def _valid_username(username): + """ + Return true if the username is valid. + """ + return (USERNAME_LEN[0] <= len(username) <= USERNAME_LEN[1] and + USERNAME_RE.match(username)) + + +def _break_name(name): + """ + Break name into a first and last name. + Return a 2-tuple of first_name, last_name. + """ + parts = name.split() + n = len(parts) + if n == 0: + t = '', '' + elif n == 1: + t = parts[0], '' + else: + t = ' '.join(parts[:-1]), parts[-1] + return t[0][:USERNAME_LEN[1]], t[1][:USERNAME_LEN[1]] + + +class Command(LabelCommand): + args = '<filename filename ...>' + help = 'Imports users from the old database in CSV format' + option_list = LabelCommand.option_list + ( + optparse.make_option("-s", "--super-user", + help="Make the user with this name a superuser"), + optparse.make_option("-a", "--anon-user", + help="Make the user with this name the anonymous user " + "[default: Anonymous]"), + optparse.make_option("-p", "--progress", action="store_true", + help="Output a . 
after every 20 users to show progress"), + ) + bb_parser = postmarkup.create(use_pygments=False, annotate_links=False) + md_writer = MarkdownWriter() + + def handle_label(self, filename, **options): + """ + Process each line in the CSV file given by filename by + creating a new user and profile. + + """ + self.superuser = options.get('super_user') + self.anonymous = options.get('anon_user') + if self.anonymous is None: + self.anonymous = 'Anonymous' + self.show_progress = options.get('progress') + + if self.superuser == self.anonymous: + raise CommandError("super-user name should not match anon-user") + + try: + with open(filename, "rb") as f: + self.reader = csv.DictReader(f) + num_rows = 0 + try: + for row in self.reader: + self.process_row(row) + num_rows += 1 + if num_rows % 20 == 0: + sys.stdout.write('.') + sys.stdout.flush() + except csv.Error, e: + raise CommandError("CSV error: %s %s %s" % ( + filename, self.reader.line_num, e)) + + print + + except IOError: + raise CommandError("Could not open file: %s" % filename) + + def process_row(self, row): + """ + Process one row from the CSV file: create a user and user profile for + the row and save it in the database. 
+ + """ + row = dict((k, v if v != 'NULL' else '') for k, v in row.iteritems()) + + if not _valid_username(row['username']): + print "Skipping import of %s; invalid username" % row['username'] + return + + n = User.objects.filter(username=row['username']).count() + if n > 0: + print "Skipping import of %s; user already exists" % row['username'] + return + + first_name, last_name = _break_name(row['name']) + is_superuser = self.superuser == row['username'] + is_anonymous = self.anonymous == row['username'] + + u = User(id=int(row['user_id']), + username=row['username'], + first_name=first_name, + last_name=last_name, + email=row['user_email'], + password=row['user_password'] if row['user_password'] else None, + is_staff=is_superuser, + is_active=True if not is_anonymous else False, + is_superuser=is_superuser, + last_login=datetime.fromtimestamp(int(row['user_lastvisit'])), + date_joined=datetime.strptime(row['user_regdate'], "%b %d, %Y")) + + if is_anonymous: + u.set_unusable_password() + + u.save() + + p = u.get_profile() + p.location = row['user_from'] + p.occupation = row['user_occ'] + p.interests = row['user_interests'] + p.profile_text = u'' + p.hide_email = True if row['user_viewemail'] != '1' else False + p.signature = self.to_markdown(row['user_sig']) if row['user_sig'] else u'' + p.time_zone = TIME_ZONES.get(row['user_timezone'], 'US/Pacific') + p.use_24_time = False + p.forum_post_count = int(row['user_posts']) + p.status = bio.models.STA_ACTIVE if p.forum_post_count > 10 else bio.models.STA_STRANGER + p.status_date = datetime.now() + p.update_date = p.status_date + p.save() + + def to_html(self, s): + return self.bb_parser.render_to_html(unphpbb(s), cosmetic_replace=False) + + def to_markdown(self, s): + self.md_writer.reset() + self.md_writer.feed(self.to_html(s)) + return self.md_writer.markdown()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/legacy/models.py Fri Dec 24 05:28:58 2010 +0000 @@ -0,0 +1,3 @@ +from django.db import models + +# Create your models here.
# gpp/legacy/phpbb.py
"""
This module contains functions for working with data from the legacy phpBB
based website.
"""
import re

try:
    import htmlentitydefs
except ImportError:  # Python 3 moved the module
    import html.entities as htmlentitydefs

try:
    unichr
except NameError:  # Python 3: chr() already returns text
    unichr = chr

try:
    unicode
except NameError:  # Python 3: str is the text type
    unicode = str


# BBCode tags used by the old site
BBCODE_TAGS = "b i u s url quote img list * code color size".split()

# The uid phpBB appends to a tag name: a colon, an optional extra field
# (e.g. "1:" in [code:1:uid] or "u:" in [list:u:uid]) and 10 hex digits.
_UID = r":(?:[0-9a-fu]+:)?[0-9a-f]{10}"

# Regular expressions used to get rid of phpBB's uid inside BBCode tags.
# This is a list of regular expression pairs. Element 0 of each pair
# is for the opening tag & element 1 is for the closing tag.
#
# The tag name is escaped because "*" is a regex metacharacter. Opening tags
# may carry a "=value" before the uid ([color=black:uid]) or after it
# ([quote:uid="name"]); the lookahead makes sure the uid really ends the tag
# name part so unrelated ":0123456789" text is left alone.

BBCODE_RES = [(
    re.compile(r"(\[%s(?:=[^\]]*?)?)%s(?=[\]=])" % (re.escape(tag), _UID)),
    re.compile(r"(\[/%s)%s\]" % (re.escape(tag), _UID))
) for tag in BBCODE_TAGS]

_ENTITY_RE = re.compile(r"&#?\w+;")


##
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
# Source: http://effbot.org/zone/re-sub.htm#unescape-html
#
def unescape(text):
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3].lower() == "&#x":
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # not a number, or not a valid code point
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text # leave as is
    return _ENTITY_RE.sub(fixup, text)


def unphpbb(s):
    """Converts BBCode from phpBB database data into 'pure' BBCode.

    phpBB doesn't store plain BBCode in its database. The BBCode tags have
    "uids" added to them and the data has already been HTML entity'ized.
    This function removes the uid stuff and undoes the entity'ification and
    returns the result as a unicode string.

    Byte strings are decoded as UTF-8, with undecodable bytes replaced.

    """
    if not isinstance(s, unicode):
        s = s.decode('utf-8', 'replace')
    for start, end in BBCODE_RES:
        # Use the compiled pattern's own sub(): the 4th positional argument
        # of re.sub() is a replacement *count*, not a flags value.
        s = start.sub(r'\1', s)
        s = end.sub(r'\1]', s)
    return unescape(s)
# gpp/legacy/tests.py
"""
Tests for legacy app functions.
"""

from django.test import TestCase

from legacy.phpbb import unphpbb
from legacy.html2md import BreakElement, MarkdownWriter


class UnPhpBbTest(TestCase):
    """Tests for converting phpBB database text back to plain BBCode."""

    def test_unentities(self):
        # phpBB stores text with HTML entities; unphpbb() must decode them.
        s1 = "&quot;Look! No head!&quot; - Laika &amp; The Cosmonauts"
        s2 = unphpbb(s1)
        s3 = u'"Look! No head!" - Laika & The Cosmonauts'
        self.assertEqual(s2, s3)

    def test_rem_uuid1(self):
        # The 10 hex digit uid must be stripped from opening & closing tags.
        s1 = ("[url=http://www.thesurfites.com][color=black:3fdb565c83]"
              "T H E - S U R F I T E S[/color:3fdb565c83][/url]")
        s2 = unphpbb(s1)
        s3 = (u'[url=http://www.thesurfites.com][color=black]'
               'T H E - S U R F I T E S[/color][/url]')
        self.assertEqual(s2, s3)


class Html2MdTest(TestCase):
    """Tests for the HTML to Markdown converter."""

    def test_sig1(self):
        s1 = """<p><a href="http://surfguitar101.com/modules.php?name=Web_Links&l_op=visit&lid=50">Pollo Del Mar</a><br />
<a href="http://tinyurl.com/yjfmspj">Frankie & The Pool Boys</a><br />
<a href="http://tinyurl.com/cnr27t">PDM on FaceBook</a><br />
</p>"""
        md_writer = MarkdownWriter()
        md_writer.feed(s1)
        s2 = md_writer.markdown()

        # Each <br /> renders as a Markdown line break and is followed by the
        # literal newline that comes after it in the HTML. The break text is
        # taken from BreakElement so this test checks the document structure
        # without hard-coding the exact number of trailing spaces.
        sep = BreakElement().markdown() + u'\n'
        s3 = sep.join([
            u'[Pollo Del Mar](http://surfguitar101.com/modules.php?name=Web_Links&l_op=visit&lid=50)',
            u'[Frankie & The Pool Boys](http://tinyurl.com/yjfmspj)',
            u'[PDM on FaceBook](http://tinyurl.com/cnr27t)',
        ]) + sep
        self.assertEqual(s2, s3)