view gpp/legacy/management/commands/translate_old_posts.py @ 294:254db4cb6a86

Changes / scripts to import forums. Other tweaks and moving other import scripts to the legacy application.
author Brian Neal <bgneal@gmail.com>
date Wed, 05 Jan 2011 04:09:35 +0000
parents
children 28de6caa4e6d
line wrap: on
line source
"""
translate_old_posts.py - A management command to join the bbposts and 
bbposts_text tables together and output as a .csv file, suitable for use as an
input to mysqlimport into the new database. This method bypasses the Django ORM
as it was too slow given the number of old posts to import.

"""
from __future__ import with_statement
import csv
import optparse
import sys
from datetime import datetime

import MySQLdb
import postmarkup

from django.core.management.base import NoArgsCommand, CommandError

from legacy.phpbb import unphpbb
from legacy.html2md import MarkdownWriter
from core.markup import SiteMarkup


def convert_ip(s):
    """
    Return the dotted-quad form of an IP address given as a hex string.

    The string is parsed as a base-16 integer and the low 32 bits are split
    into four octets, most significant first; for example "7f000001" becomes
    "127.0.0.1".
    """
    value = int(s, 16)
    # One shift per octet, from the most significant byte down to the least.
    octets = [(value >> shift) & 0xff for shift in (24, 16, 8, 0)]
    return ".".join(["%d" % octet for octet in octets])


class Command(NoArgsCommand):
    """
    Management command that dumps the legacy forum posts as a CSV file.

    Each row of the sln_bbposts / sln_bbposts_text join is converted by
    process_row() and written with the csv module to the output file.
    """
    help = """\
This command joins converts the SG101 1.0 posts to 2.0 format and outputs the
data as a .csv file suitable for importing into the new database scheme with
the mysqlimport utility.
"""
    option_list = NoArgsCommand.option_list + (
        optparse.make_option("-s", "--progress", action="store_true",
            help="Output a . after every 100 posts to show progress"),
        optparse.make_option("-a", "--host", help="set MySQL host name"),
        optparse.make_option("-u", "--user", help="set MySQL user name"),
        optparse.make_option("-p", "--password", help="set MySQL user password"),
        optparse.make_option("-d", "--database", help="set MySQL database name"),
        optparse.make_option("-o", "--out-file", help="set output filename"),
    )
    # Converters shared by every row; built once when the class is defined.
    bb_parser = postmarkup.create(use_pygments=False, annotate_links=False)
    md_writer = MarkdownWriter()
    site_markup = SiteMarkup()

    def handle_noargs(self, **opts):
        """
        Entry point: read the options, open the output file, connect to the
        legacy database and export every post.

        Options that are missing or empty fall back to host 'localhost',
        user 'root', an empty password and the file 'forums_post.csv'.

        Raises CommandError if no database name was given or if the
        connection to MySQL fails.
        """
        host = opts.get('host') or 'localhost'
        user = opts.get('user') or 'root'
        password = opts.get('password') or ''
        database = opts.get('database')
        out_filename = opts.get('out_file') or 'forums_post.csv'
        show_progress = bool(opts.get('progress'))

        if database is None:
            raise CommandError("Please specify a database option")

        # database columns (fieldnames) for the output CSV file:
        cols = ('id', 'topic_id', 'user_id', 'creation_date', 'update_date',
                'body', 'html', 'user_ip')

        # The with block guarantees the CSV file is flushed and closed even
        # when connecting, querying or converting a row raises.
        with open(out_filename, "wb") as out_file:
            # process_row() writes through self.writer
            self.writer = csv.writer(out_file)

            # Write an initial row of fieldnames to the output file
            self.writer.writerow(cols)

            # connect to the legacy database
            try:
                db = MySQLdb.connect(host=host,
                        user=user,
                        passwd=password,
                        db=database)
            except MySQLdb.DatabaseError as e:
                raise CommandError(str(e))

            try:
                self._export_posts(db, show_progress)
            finally:
                db.close()

    def _export_posts(self, db, show_progress):
        """
        Query the legacy post tables on the open connection db and hand each
        row to process_row().

        When show_progress is true a '.' is written to standard output after
        every 100 posts. The cursor is always closed, even on error.
        """
        sql = ('SELECT * FROM sln_bbposts as p, sln_bbposts_text as t WHERE '
                'p.post_id = t.post_id ORDER BY p.post_id')

        c = db.cursor(MySQLdb.cursors.DictCursor)
        try:
            c.execute(sql)

            # fetchone() returns None once the result set is exhausted, so
            # None serves as the sentinel that ends the iteration.
            count = 0
            for row in iter(c.fetchone, None):
                self.process_row(row)

                count += 1
                if show_progress and count % 100 == 0:
                    sys.stdout.write('.')
                    # flush so the dots appear while the export is running
                    sys.stdout.flush()
        finally:
            c.close()

    def to_html(self, s):
        """
        Pass s through unphpbb() and render the result to HTML with the
        postmarkup parser (cosmetic replacements disabled).
        """
        return self.bb_parser.render_to_html(unphpbb(s), cosmetic_replace=False)

    def to_markdown(self, s):
        """
        Convert s to HTML with to_html() and feed that HTML through the
        shared MarkdownWriter, returning the writer's markdown() output.
        """
        # The writer is shared by all rows, so it is reset before each use.
        self.md_writer.reset()
        self.md_writer.feed(self.to_html(s))
        return self.md_writer.markdown()

    def process_row(self, row):
        """
        This function accepts one row from the legacy database and converts the
        contents to the new database format, and calls the writer to write the new
        row to the output file.

        row is a dict keyed by column name; the keys read are post_id,
        topic_id, poster_id, post_time, post_edit_time, post_text and
        poster_ip. self.writer must already have been set by handle_noargs().
        """
        creation_date = datetime.fromtimestamp(float(row['post_time']))

        # A post with no (or a zero) edit time reuses its creation date.
        if row['post_edit_time']:
            update_date = datetime.fromtimestamp(float(row['post_edit_time']))
        else:
            update_date = creation_date

        body = self.to_markdown(row['post_text'])
        html = self.site_markup.convert(body)

        # The order of the values matches the fieldnames row written by
        # handle_noargs().
        self.writer.writerow([row['post_id'],
                row['topic_id'],
                row['poster_id'],
                creation_date,
                update_date,
                body.encode("utf-8"),
                html.encode("utf-8"),
                convert_ip(row['poster_ip'])])