diff gpp/legacy/management/commands/translate_old_posts.py @ 294:254db4cb6a86

Changes / scripts to import forums. Other tweaks and moving other import scripts to the legacy application.
author Brian Neal <bgneal@gmail.com>
date Wed, 05 Jan 2011 04:09:35 +0000
parents
children 28de6caa4e6d
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gpp/legacy/management/commands/translate_old_posts.py	Wed Jan 05 04:09:35 2011 +0000
@@ -0,0 +1,134 @@
+"""
+translate_old_posts.py - A management command to join the bbposts and 
+bbposts_text tables together and output as a .csv file, suitable for use as an
+input to mysqlimport into the new database. This method bypasses the Django ORM
+as it was too slow given the number of old posts to import.
+
+"""
+from __future__ import with_statement
+import csv
+import optparse
+from datetime import datetime
+
+import MySQLdb
+import postmarkup
+
+from django.core.management.base import NoArgsCommand, CommandError
+
+from legacy.phpbb import unphpbb
+from legacy.html2md import MarkdownWriter
+from core.markup import SiteMarkup
+
+
+def convert_ip(s):
+    """
+    Return the dotted notation for an IP address given as a hex string.
+    """
+    value = int(s, 16)
+    # Extract the four octets, most significant first.
+    octets = [(value >> shift) & 0xff for shift in (24, 16, 8, 0)]
+    return "%d.%d.%d.%d" % tuple(octets)
+
+
+class Command(NoArgsCommand):
+    """
+    Management command that exports the legacy forum posts to a CSV file.
+
+    Rows are read straight from the legacy MySQL database through MySQLdb.
+    Each post's text is passed through unphpbb, the BBCode parser and the
+    Markdown writer, then rendered again with SiteMarkup, and the result is
+    written as one CSV row.
+    """
+    help = """\
+This command joins converts the SG101 1.0 posts to 2.0 format and outputs the
+data as a .csv file suitable for importing into the new database scheme with
+the mysqlimport utility.
+"""
+    option_list = NoArgsCommand.option_list + (
+        optparse.make_option("-s", "--progress", action="store_true",
+            help="Output a . after every 100 posts to show progress"),
+        optparse.make_option("-a", "--host", help="set MySQL host name"),
+        optparse.make_option("-u", "--user", help="set MySQL user name"),
+        optparse.make_option("-p", "--password", help="set MySQL user password"),
+        optparse.make_option("-d", "--database", help="set MySQL database name"),
+        optparse.make_option("-o", "--out-file", help="set output filename"),
+    )
+    # The converters are created once at class level and shared by all rows.
+    bb_parser = postmarkup.create(use_pygments=False, annotate_links=False)
+    md_writer = MarkdownWriter()
+    site_markup = SiteMarkup()
+
+    def handle_noargs(self, **opts):
+        """
+        Export every legacy post to the output CSV file.
+
+        Options read from opts: host, user, password, database, out_file
+        and progress. Raises CommandError if no database is given or if
+        the connection to the legacy database fails.
+        """
+        # Local import: only the progress indicator needs it.
+        import sys
+
+        host = opts.get('host', 'localhost') or 'localhost'
+        user = opts.get('user', 'root') or 'root'
+        password = opts.get('password', '') or ''
+        database = opts.get('database')
+        out_filename = opts.get('out_file', 'forums_post.csv') or 'forums_post.csv'
+        show_progress = bool(opts.get('progress'))
+
+        if database is None:
+            raise CommandError("Please specify a database option")
+
+        # database columns (fieldnames) for the output CSV file:
+        cols = ('id', 'topic_id', 'user_id', 'creation_date', 'update_date',
+                'body', 'html', 'user_ip')
+
+        # Connect to the legacy database before touching the output file so
+        # that a failed connection does not truncate an existing export.
+        try:
+            db = MySQLdb.connect(host=host,
+                    user=user,
+                    passwd=password,
+                    db=database)
+        except MySQLdb.DatabaseError, e:
+            raise CommandError(str(e))
+
+        # query the legacy database
+        sql = ('SELECT * FROM sln_bbposts as p, sln_bbposts_text as t WHERE '
+                'p.post_id = t.post_id ORDER BY p.post_id')
+
+        # The file, cursor and connection are released even if a row fails
+        # to convert part way through the export.
+        try:
+            with open(out_filename, "wb") as out_file:
+                self.writer = csv.writer(out_file)
+
+                # Write an initial row of fieldnames to the output file
+                self.writer.writerow(cols)
+
+                c = db.cursor(MySQLdb.cursors.DictCursor)
+                try:
+                    c.execute(sql)
+
+                    # convert the old data and write the output to the file
+                    count = 0
+                    while True:
+                        row = c.fetchone()
+                        if row is None:
+                            break
+
+                        self.process_row(row)
+
+                        count += 1
+                        if show_progress and count % 100 == 0:
+                            sys.stdout.write('.')
+                            sys.stdout.flush()
+                finally:
+                    c.close()
+        finally:
+            db.close()
+
+        if show_progress:
+            # Terminate the line of progress dots.
+            sys.stdout.write('\n')
+
+    def to_html(self, s):
+        """
+        Pass s through unphpbb and render the result to HTML with the
+        BBCode parser.
+        """
+        return self.bb_parser.render_to_html(unphpbb(s), cosmetic_replace=False)
+
+    def to_markdown(self, s):
+        """
+        Convert s to HTML with to_html() and feed that HTML to the shared
+        Markdown writer, returning the writer's Markdown output.
+        """
+        # The writer is shared between rows, so clear its state first.
+        self.md_writer.reset()
+        self.md_writer.feed(self.to_html(s))
+        return self.md_writer.markdown()
+
+    def process_row(self, row):
+        """
+        This function accepts one row from the legacy database and converts the
+        contents to the new database format, and calls the writer to write the new
+        row to the output file.
+        """
+        creation_date = datetime.fromtimestamp(float(row['post_time']))
+
+        # A falsy edit time falls back to the creation date.
+        if row['post_edit_time']:
+            update_date = datetime.fromtimestamp(float(row['post_edit_time']))
+        else:
+            update_date = creation_date
+
+        body = self.to_markdown(row['post_text'])
+        html = self.site_markup.convert(body)
+
+        # Field order must match the fieldnames row written by handle_noargs.
+        self.writer.writerow([row['post_id'],
+                row['topic_id'],
+                row['poster_id'],
+                creation_date,
+                update_date,
+                body.encode("utf-8"),
+                html.encode("utf-8"),
+                convert_ip(row['poster_ip'])])