changeset 290:64c188a9d31f

Adding a legacy app to contain management commands to convert the old data to the new database format. This first commit has the import_old_users command.
author Brian Neal <bgneal@gmail.com>
date Fri, 24 Dec 2010 05:28:58 +0000
parents 0dd8989abef2
children a6357f2bcbbc
files gpp/legacy/__init__.py gpp/legacy/html2md.py gpp/legacy/management/__init__.py gpp/legacy/management/commands/__init__.py gpp/legacy/management/commands/import_old_users.py gpp/legacy/models.py gpp/legacy/phpbb.py gpp/legacy/tests.py gpp/legacy/views.py gpp/settings.py
diffstat 7 files changed, 550 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gpp/legacy/html2md.py	Fri Dec 24 05:28:58 2010 +0000
@@ -0,0 +1,279 @@
+"""
+This module contains a class derived from Python's HTMLParser to convert HTML to
+Markdown. Currently this class only supports those HTML tags that have counter-
+parts in BBCode used by stock phpBB 2.x.
+
+In other words, this class was created to help convert data from a phpBB
+forum to Markdown syntax and its scope is currently limited to that task.
+
+"""
+from HTMLParser import HTMLParser
+import htmlentitydefs
+
+
+# Let's call Markdown markup entities "elements" to avoid confusion
+# with HTML tags.
+
+class ElementBase(object):
+    """
+    Common ancestor of every Markdown element.
+
+    An element accumulates text in ``data`` and remembers the attributes of
+    the HTML tag it was created for in the ``attrs`` dictionary.
+
+    """
+    def __init__(self, attrs=None):
+        # attrs, when given, is anything dict() accepts, such as a list of
+        # (name, value) pairs.
+        if attrs:
+            self.attrs = dict(attrs)
+        else:
+            self.attrs = {}
+        self.data = u''
+
+    def add_data(self, data):
+        """Append a fragment of text to this element's content."""
+        self.data = self.data + data
+
+    def markdown(self):
+        """Return the Markdown for this element; by default its raw text."""
+        return self.data
+
+
+class TextElement(ElementBase):
+    """
+    A fragment of plain text: text found outside of any HTML tag, or inside
+    a tag that has no Markdown mapping. It adds nothing to ElementBase.
+
+    """
+
+
+class EmphasisElement(ElementBase):
+    """
+    Markdown emphasis: the element's text wrapped on both sides by a marker
+    string. E.g. _em_, **bold**
+
+    """
+    def __init__(self, tag, attrs):
+        # tag is the Markdown marker placed around the text ('_', '**', ...).
+        self.tag = tag
+        super(EmphasisElement, self).__init__(attrs)
+
+    def markdown(self):
+        """Return the text surrounded by the emphasis marker."""
+        marker = self.tag
+        return u'%s%s%s' % (marker, self.data, marker)
+
+
+def create_emphasis(tag):
+    """
+    Build an element factory for the given emphasis marker.
+
+    The returned callable takes the HTML attributes and returns an
+    EmphasisElement that wraps its text in `tag`.
+
+    """
+    return lambda attrs: EmphasisElement(tag, attrs)
+
+
+class HtmlElement(ElementBase):
+    """
+    An element for HTML that is passed through to Markdown as HTML: the text
+    is wrapped in an opening and closing tag named by `tag`.
+
+    """
+    def __init__(self, tag, attrs):
+        # tag is the name of the HTML tag to emit around the text.
+        self.tag = tag
+        super(HtmlElement, self).__init__(attrs)
+
+    def markdown(self):
+        """Return the text enclosed in <tag>...</tag>."""
+        name = self.tag
+        return u'<%s>%s</%s>' % (name, self.data, name)
+
+
+def create_html(tag):
+    """
+    Build an element factory for the given HTML tag name.
+
+    The returned callable takes the HTML attributes and returns a
+    HtmlElement that emits `tag` around its text.
+
+    """
+    return lambda attrs: HtmlElement(tag, attrs)
+
+
+class QuoteElement(ElementBase):
+    """
+    A Markdown blockquote: every line of the text is prefixed with '> '
+    and the block is followed by a blank line.
+
+    """
+    def markdown(self):
+        quoted = self.data.replace('\n', '\n> ')
+        return u'> %s\n\n' % quoted
+
+
+class BreakElement(ElementBase):
+    """
+    A Markdown line break. Any text collected by the element is ignored.
+
+    """
+    def markdown(self):
+        # Two spaces followed by a newline.
+        line_break = u'  \n'
+        return line_break
+
+
+class DivElement(ElementBase):
+    """
+    Renders a HTML <div> as its text with one newline before and one after.
+
+    """
+    def markdown(self):
+        body = self.data
+        return u'\n%s\n' % body
+
+
+class LinkElement(ElementBase):
+    """
+    This class maps HTML <a> tags into Markdown links.
+    If no data is present, the actual href is used for the link text.
+    An <a> tag that has no href attribute (e.g. a named anchor) cannot be
+    written as a Markdown link, so only its text is returned.
+
+    """
+    def markdown(self):
+        url = self.attrs.get('href')
+        if url is None:
+            # Indexing attrs['href'] here would raise KeyError and abort the
+            # conversion of the whole document.
+            return self.data
+        text = self.data if self.data else url
+        return u'[%s](%s)' % (text, url)
+
+
+class ImageElement(ElementBase):
+    """
+    This class maps HTML <img> tags into Markdown.
+    This element assumes no alt text is present, and simply uses the word
+    'image' for the alt text.
+    An <img> tag that has no src attribute produces no output.
+
+    """
+    def markdown(self):
+        url = self.attrs.get('src')
+        if url is None:
+            # Nothing to point the image at; indexing attrs['src'] would
+            # raise KeyError and abort the conversion of the whole document.
+            return u''
+        return u'![image](%s)' % url
+
+
+class CodeElement(ElementBase):
+    """
+    A Markdown code block: every line of the text is indented by four
+    spaces and the block is terminated with a newline.
+
+    """
+    def markdown(self):
+        indented = self.data.replace('\n', '\n    ')
+        return u'    %s\n' % indented
+
+
+# List (ordered & unordered) support:
+
+class ListElement(ElementBase):
+    """
+    This class creates Markdown for unordered lists. The bullet() method can be
+    overridden to create ordered lists.
+
+    Every piece of data added to the list becomes one list item. Each item
+    is written on its own line as "<indent><bullet> <item>", where the indent
+    is four spaces per level of nesting below the first.
+
+    """
+    def __init__(self, attrs=None):
+        super(ListElement, self).__init__(attrs)
+        self.items = []
+        # Nesting depth of this list; 1 means a top-level list.
+        self.list_nesting = 1
+
+    def add_data(self, data):
+        """Add `data` as a new list item, ignoring whitespace-only data."""
+        # Whitespace between the list tags (e.g. a newline after <ul> or
+        # between two <li> tags) would otherwise show up as an empty bullet.
+        if data.strip():
+            self.items.append(data)
+
+    def bullet(self):
+        """Return the marker placed in front of every item."""
+        return u'*'
+
+    def markdown(self):
+        """Return the Markdown for the whole list."""
+        bullet_str = self.bullet()
+        indent = u' ' * (4 * (self.list_nesting - 1))
+        return u''.join([u'\n%s%s %s' % (indent, bullet_str, item)
+                         for item in self.items])
+
+
+class OrderedListElement(ListElement):
+    """
+    A ListElement whose items are numbered instead of bulleted.
+
+    """
+    def bullet(self):
+        """Return the marker for an item; every item uses the same one."""
+        marker = '1.'
+        return marker
+
+
+class ItemElement(ElementBase):
+    """
+    A single item of an ordered or unordered list. It adds nothing to
+    ElementBase; its text is handed to the enclosing list when it closes.
+
+    """
+
+###############################################################################
+###############################################################################
+
+class MarkdownWriter(HTMLParser):
+    """
+    This class is an HTMLParser that converts a subset of HTML to Markdown.
+
+    Usage: feed() the HTML, then call markdown() to obtain the result. Call
+    reset() before reusing an instance for another document.
+
+    Elements are kept on a stack while they are open. When an element is
+    closed its Markdown is appended to the element below it on the stack, or
+    to the list of finished top-level elements if the stack is empty.
+
+    """
+
+    # Maps a HTML tag name to a callable that takes the tag's attributes and
+    # returns the element used to render it. Tags that are not listed here
+    # are represented by a TextElement.
+    elem_factories = {
+        'a': LinkElement,
+        'blockquote': QuoteElement,
+        'br': BreakElement,
+        'div': DivElement,
+        'em': create_emphasis('_'),
+        'img': ImageElement,
+        'li': ItemElement,
+        'ol': OrderedListElement,
+        'pre': CodeElement,
+        's': create_html('strike'),
+        'strong': create_emphasis('**'),
+        'u': create_html('u'),
+        'ul': ListElement,
+    }
+
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.reset()
+
+    def handle_starttag(self, tag, attrs):
+        """Create the element for `tag` and push it on the stack."""
+        factory = self.elem_factories.get(tag)
+        if factory is not None:
+            element = factory(attrs)
+        else:
+            element = TextElement()
+
+        self._push_elem(element)
+
+    def handle_endtag(self, tag):
+        """Close the element on top of the stack."""
+        self._pop_elem()
+
+    def handle_data(self, data):
+        """Add text to the open element, opening a TextElement if needed."""
+        if not self.elem_stack:
+            self._push_elem(TextElement())
+        self._add_data(data)
+
+    def handle_entityref(self, name):
+        """Add the character for a named entity; unknown names are added
+        as the bare name."""
+        try:
+            text = unichr(htmlentitydefs.name2codepoint[name])
+        except KeyError:
+            text = name
+        self.handle_data(text)
+
+    def handle_charref(self, name):
+        """Add the character for a numeric reference (&#65; or &#x41;)."""
+        try:
+            # Hexadecimal references arrive with a leading 'x' or 'X';
+            # calling int() on them directly raises ValueError.
+            if name[:1] in ('x', 'X'):
+                code = int(name[1:], 16)
+            else:
+                code = int(name)
+            text = unichr(code)
+        except (ValueError, OverflowError):
+            # Not a usable code point: keep the reference as written.
+            text = u'&#%s;' % name
+        self.handle_data(text)
+
+    def reset(self):
+        """Discard all parser state so the instance can be reused."""
+        HTMLParser.reset(self)
+        self.elem_stack = []    # elements whose tags are still open
+        self.elements = []      # finished top-level elements, in order
+        self.list_nesting = 0   # number of lists currently open
+
+    def _push_elem(self, tag):
+        # A TextElement never encloses another element: close it first.
+        if self.elem_stack and isinstance(self.elem_stack[-1], TextElement):
+            self._pop_elem()
+        if isinstance(tag, ListElement):
+            self.list_nesting += 1
+            tag.list_nesting = self.list_nesting
+        self.elem_stack.append(tag)
+
+    def _pop_elem(self):
+        if not self.elem_stack:
+            # An end tag with nothing open, e.g. the </p> in
+            # "<p><b>x</b></p>": the element opened for <p> was already
+            # closed when <b> started. Popping here would raise IndexError.
+            return
+        element = self.elem_stack.pop()
+        if isinstance(element, ListElement):
+            self.list_nesting -= 1
+        if self.elem_stack:
+            self.elem_stack[-1].add_data(element.markdown())
+        else:
+            self.elements.append(element)
+
+    def _add_data(self, data):
+        self.elem_stack[-1].add_data(data)
+
+    def markdown(self):
+        """Close any elements still open and return the Markdown text."""
+        while self.elem_stack:
+            self._pop_elem()
+        text_list = [e.markdown() for e in self.elements]
+        return u''.join(text_list)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gpp/legacy/management/commands/import_old_users.py	Fri Dec 24 05:28:58 2010 +0000
@@ -0,0 +1,163 @@
+"""
+import_old_users.py - For importing users from SG101 1.0 as csv files.
+"""
+from __future__ import with_statement
+import csv
+import optparse
+import re
+import sys
+from datetime import datetime
+
+import postmarkup
+
+from django.core.management.base import LabelCommand, CommandError
+from django.contrib.auth.models import User
+
+import bio.models
+from legacy.phpbb import unphpbb
+from legacy.html2md import MarkdownWriter
+
+# Maps the user_timezone value of an imported row to the time zone name
+# stored in the new profile. Values that are not listed here fall back to
+# 'US/Pacific' in Command.process_row().
+# NOTE(review): the keys look like UTC offsets in hours -- confirm.
+TIME_ZONES = {
+    '-5': 'US/Eastern',
+    '-6': 'US/Central',
+    '-7': 'US/Mountain',
+    '-8': 'US/Pacific',
+}
+# A username may only contain word characters and the symbols . @ + -
+USERNAME_RE = re.compile(r'^[\w.@+-]+$')
+USERNAME_LEN = (1, 30)      # min & max length values
+
+
+def _valid_username(username):
+    """
+    Return a true value if the username has an acceptable length and
+    consists only of the characters allowed by USERNAME_RE.
+    """
+    shortest, longest = USERNAME_LEN
+    if not (shortest <= len(username) <= longest):
+        return False
+    return USERNAME_RE.match(username)
+
+
+def _break_name(name):
+    """
+    Split a full name on whitespace into a first and a last name.
+    The last word becomes the last name and everything before it, joined
+    by single spaces, the first name. Both are cut to USERNAME_LEN[1]
+    characters.
+    Return a 2-tuple of first_name, last_name.
+    """
+    words = name.split()
+    limit = USERNAME_LEN[1]
+    if not words:
+        first, last = '', ''
+    elif len(words) == 1:
+        first, last = words[0], ''
+    else:
+        first, last = ' '.join(words[:-1]), words[-1]
+    return first[:limit], last[:limit]
+
+
+class Command(LabelCommand):
+    """
+    Management command that imports users from CSV dumps of the old site.
+
+    Every filename given on the command line is read with csv.DictReader
+    and each row is turned into a User and its profile. Rows whose username
+    is invalid or already present in the database are reported and skipped.
+
+    """
+    args = '<filename filename ...>'
+    help = 'Imports users from the old database in CSV format'
+    option_list = LabelCommand.option_list + (
+        optparse.make_option("-s", "--super-user",
+            help="Make the user with this name a superuser"),
+        optparse.make_option("-a", "--anon-user",
+            help="Make the user with this name the anonymous user "
+                "[default: Anonymous]"),
+        optparse.make_option("-p", "--progress", action="store_true",
+            help="Output a . after every 20 users to show progress"),
+    )
+    # Converters shared by all rows: BBCode -> HTML and HTML -> Markdown.
+    bb_parser = postmarkup.create(use_pygments=False, annotate_links=False)
+    md_writer = MarkdownWriter()
+
+    def handle_label(self, filename, **options):
+        """
+        Process each line in the CSV file given by filename by
+        creating a new user and profile.
+
+        Raises CommandError if the super-user and anon-user names are the
+        same, if the file cannot be read, or if the CSV data is malformed.
+
+        """
+        self.superuser = options.get('super_user')
+        self.anonymous = options.get('anon_user')
+        if self.anonymous is None:
+            self.anonymous = 'Anonymous'
+        self.show_progress = options.get('progress')
+
+        if self.superuser == self.anonymous:
+            raise CommandError("super-user name should not match anon-user")
+
+        try:
+            with open(filename, "rb") as f:
+                self.reader = csv.DictReader(f)
+                num_rows = 0
+                try:
+                    for row in self.reader:
+                        self.process_row(row)
+                        num_rows += 1
+                        # Progress dots are only wanted with --progress.
+                        if self.show_progress and num_rows % 20 == 0:
+                            sys.stdout.write('.')
+                            sys.stdout.flush()
+                except csv.Error, e:
+                    raise CommandError("CSV error: %s %s %s" % (
+                        filename, self.reader.line_num, e))
+
+                if self.show_progress:
+                    # Terminate the line of dots.
+                    print
+
+        except IOError:
+            raise CommandError("Could not open file: %s" % filename)
+
+    def process_row(self, row):
+        """
+        Process one row from the CSV file: create a user and user profile for
+        the row and save it in the database.
+
+        """
+        # The literal string NULL stands for a missing value in the dump.
+        row = dict((k, v if v != 'NULL' else '') for k, v in row.iteritems())
+
+        if not _valid_username(row['username']):
+            print "Skipping import of %s; invalid username" % row['username']
+            return
+
+        n = User.objects.filter(username=row['username']).count()
+        if n > 0:
+            print "Skipping import of %s; user already exists" % row['username']
+            return
+
+        first_name, last_name = _break_name(row['name'])
+        is_superuser = self.superuser == row['username']
+        is_anonymous = self.anonymous == row['username']
+
+        u = User(id=int(row['user_id']),
+                username=row['username'],
+                first_name=first_name,
+                last_name=last_name,
+                email=row['user_email'],
+                password=row['user_password'],
+                is_staff=is_superuser,
+                is_active=not is_anonymous,
+                is_superuser=is_superuser,
+                last_login=datetime.fromtimestamp(int(row['user_lastvisit'])),
+                date_joined=datetime.strptime(row['user_regdate'], "%b %d, %Y"))
+
+        # The anonymous user must not be able to log in, and a user without
+        # a stored password has nothing to log in with either; mark both as
+        # unusable rather than storing None as the password.
+        if is_anonymous or not row['user_password']:
+            u.set_unusable_password()
+
+        u.save()
+
+        p = u.get_profile()
+        p.location = row['user_from']
+        p.occupation = row['user_occ']
+        p.interests = row['user_interests']
+        p.profile_text = u''
+        p.hide_email = row['user_viewemail'] != '1'
+        p.signature = self.to_markdown(row['user_sig']) if row['user_sig'] else u''
+        p.time_zone = TIME_ZONES.get(row['user_timezone'], 'US/Pacific')
+        p.use_24_time = False
+        p.forum_post_count = int(row['user_posts'])
+        p.status = bio.models.STA_ACTIVE if p.forum_post_count > 10 else bio.models.STA_STRANGER
+        p.status_date = datetime.now()
+        p.update_date = p.status_date
+        p.save()
+
+    def to_html(self, s):
+        """Convert phpBB database text to HTML via the BBCode parser."""
+        return self.bb_parser.render_to_html(unphpbb(s), cosmetic_replace=False)
+
+    def to_markdown(self, s):
+        """Convert phpBB database text to Markdown, going through HTML."""
+        # md_writer is shared, so clear what the previous row left behind.
+        self.md_writer.reset()
+        self.md_writer.feed(self.to_html(s))
+        return self.md_writer.markdown()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gpp/legacy/models.py	Fri Dec 24 05:28:58 2010 +0000
@@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gpp/legacy/phpbb.py	Fri Dec 24 05:28:58 2010 +0000
@@ -0,0 +1,65 @@
+"""
+This module contains functions for working with data from the legacy phpBB
+based website.
+"""
+import re
+import htmlentitydefs
+
+
+# BBCode tags used by the old site
+BBCODE_TAGS = "b i u s url quote img list * code color size".split()
+
+# Regular expressions used to get rid of phpBB's uid inside BBCode tags.
+# This is a list of regular expression pairs. Element 0 of each pair
+# is for the opening tag & element 1 is for the closing tag.
+#
+# Each pattern captures the leading "[tag" (or "[/tag") in group 1 so that a
+# substitution can keep it while dropping the ":uid" part that follows. The
+# uid is 10 lowercase hex digits, optionally preceded by one more segment
+# that ends in a colon.
+#
+# NOTE(review): the tag names are interpolated without re.escape(), so for
+# the "*" tag the opening pattern becomes r"(\[*):..." and can match a ":uid"
+# that is not directly preceded by "[*" (group 1 may then be empty). That
+# looks like what strips the uid from tags carrying a value, such as
+# "[color=black:3fdb565c83]" -- confirm before escaping the tag names.
+
+BBCODE_RES = [(
+    re.compile(r"(\[%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}" % tag),
+    re.compile(r"(\[/%s):(?:[0-9a-fu]+:)?[0-9a-f]{10}\]" % tag)
+) for tag in BBCODE_TAGS]
+
+
+##
+# Replaces HTML or XML character references and named entities in a text
+# string with the characters they stand for. References that cannot be
+# resolved are left in the text unchanged.
+#
+# @param text The HTML (or XML) source text.
+# @return The plain text, as a Unicode string, if necessary.
+# Source: http://effbot.org/zone/re-sub.htm#unescape-html
+#
+def unescape(text):
+    def fixup(m):
+        entity = m.group(0)
+        if entity[:2] != "&#":
+            # named entity
+            try:
+                return unichr(htmlentitydefs.name2codepoint[entity[1:-1]])
+            except KeyError:
+                return entity # leave as is
+        # character reference, hexadecimal or decimal
+        try:
+            if entity[:3] == "&#x":
+                return unichr(int(entity[3:-1], 16))
+            return unichr(int(entity[2:-1]))
+        except ValueError:
+            return entity # leave as is
+    return re.sub("&#?\w+;", fixup, text)
+
+
+def unphpbb(s):
+    """Converts BBCode from phpBB database data into 'pure' BBCode.
+
+    phpBB doesn't store plain BBCode in its database. The BBCode tags have
+    "uids" added to them and the data has already been HTML entity'ized.
+    This function removes the uid stuff and undoes the entity'ification and
+    returns the result as a unicode string.
+
+    A byte string argument is decoded as UTF-8, with undecodable bytes
+    replaced, before it is processed.
+
+    """
+    if not isinstance(s, unicode):
+        s = s.decode('utf-8', 'replace')
+    for start, end in BBCODE_RES:
+        # Call sub() on the compiled patterns. The fourth positional
+        # argument of re.sub() is the maximum number of replacements, not
+        # the flags, so passing re.MULTILINE there capped every pattern at
+        # 8 replacements.
+        s = start.sub(r'\1', s)
+        s = end.sub(r'\1]', s)
+    return unescape(s)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gpp/legacy/tests.py	Fri Dec 24 05:28:58 2010 +0000
@@ -0,0 +1,38 @@
+"""
+Tests for legacy app functions.
+"""
+
+from django.test import TestCase
+
+from legacy.phpbb import unphpbb
+from legacy.html2md import MarkdownWriter
+
+class UnPhpBbTest(TestCase):
+    """Checks for legacy.phpbb.unphpbb()."""
+
+    def test_unentities(self):
+        # HTML entities in the stored text come back as plain characters.
+        raw = "&quot;Look! No head!&quot; - Laika &amp; The Cosmonauts"
+        expected = u'"Look! No head!" - Laika & The Cosmonauts'
+        self.assertEqual(unphpbb(raw), expected)
+
+    def test_rem_uuid1(self):
+        # The uid is removed from both the opening and the closing tag.
+        raw = ("[url=http://www.thesurfites.com][color=black:3fdb565c83]"
+                "T H E - S U R F I T E S[/color:3fdb565c83][/url]")
+        expected = (u'[url=http://www.thesurfites.com][color=black]'
+                'T H E - S U R F I T E S[/color][/url]')
+        self.assertEqual(unphpbb(raw), expected)
+
+
+class Html2MdTest(TestCase):
+    """Checks for legacy.html2md.MarkdownWriter."""
+
+    def test_sig1(self):
+        # A signature made of three links, each followed by a line break.
+        html = """<p><a href="http://surfguitar101.com/modules.php?name=Web_Links&amp;l_op=visit&amp;lid=50">Pollo Del Mar</a><br />
+<a href="http://tinyurl.com/yjfmspj">Frankie &amp; The Pool Boys</a><br />
+<a href="http://tinyurl.com/cnr27t">PDM on FaceBook</a><br />
+</p>"""
+        expected = u'[Pollo Del Mar](http://surfguitar101.com/modules.php?name=Web_Links&l_op=visit&lid=50)  \n\n[Frankie & The Pool Boys](http://tinyurl.com/yjfmspj)  \n\n[PDM on FaceBook](http://tinyurl.com/cnr27t)  \n\n'
+        writer = MarkdownWriter()
+        writer.feed(html)
+        self.assertEqual(writer.markdown(), expected)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gpp/legacy/views.py	Fri Dec 24 05:28:58 2010 +0000
@@ -0,0 +1,1 @@
+# Create your views here.
--- a/gpp/settings.py	Thu Dec 23 23:36:53 2010 +0000
+++ b/gpp/settings.py	Fri Dec 24 05:28:58 2010 +0000
@@ -140,6 +140,7 @@
     'forums',
     'gcalendar',
     'irc',
+    'legacy',
     'mailer',
     'membermap',
     'messages',