annotate gpp/legacy/management/commands/translate_old_posts.py @ 334:6805d15cda13

Adding a script I had to write on the fly to filter out posts from the posts csv file that had no parent topics. MyISAM let me get away with that, but InnoDB won't.
author Brian Neal <bgneal@gmail.com>
date Sat, 26 Feb 2011 01:28:22 +0000
parents 28de6caa4e6d
children
rev   line source
bgneal@294 1 """
bgneal@321 2 translate_old_posts.py - A management command to join the bbposts and
bgneal@294 3 bbposts_text tables together and output as a .csv file, suitable for use as an
bgneal@294 4 input to mysqlimport into the new database. This method bypasses the Django ORM
bgneal@294 5 as it was too slow given the number of old posts to import.
bgneal@294 6
bgneal@294 7 """
bgneal@294 8 from __future__ import with_statement
bgneal@294 9 import csv
bgneal@294 10 import optparse
bgneal@294 11 from datetime import datetime
bgneal@294 12
bgneal@294 13 import MySQLdb
bgneal@294 14 import postmarkup
bgneal@294 15
bgneal@294 16 from django.core.management.base import NoArgsCommand, CommandError
bgneal@294 17
bgneal@294 18 from legacy.phpbb import unphpbb
bgneal@294 19 from legacy.html2md import MarkdownWriter
bgneal@294 20 from core.markup import SiteMarkup
bgneal@294 21
bgneal@294 22
def convert_ip(s):
    """
    Turn a hexadecimal string holding an IPv4 address into dotted-quad text.

    The string is parsed as a base-16 integer; the four low-order bytes are
    emitted most-significant first, e.g. "7f000001" -> "127.0.0.1". Any bits
    above the low 32 are masked away.
    """
    value = int(s, 16)
    # Walk from the most significant octet down to the least significant.
    octets = tuple((value >> shift) & 0xff for shift in (24, 16, 8, 0))
    return "%d.%d.%d.%d" % octets
bgneal@294 33
bgneal@294 34
class Command(NoArgsCommand):
    """
    Management command that dumps the legacy forum posts as a CSV file.

    The legacy ``sln_bbposts`` and ``sln_bbposts_text`` tables are joined on
    ``post_id``; each resulting row is converted (dates, markup, poster IP)
    and written as one CSV row for later bulk import.
    """
    help = """\
This command joins the SG101 1.0 posts to 2.0 format and outputs the
data as a .csv file suitable for importing into the new database scheme with
the mysqlimport utility.
"""
    option_list = NoArgsCommand.option_list + (
        optparse.make_option("-s", "--progress", action="store_true",
            help="Output a . after every 100 posts to show progress"),
        optparse.make_option("-a", "--host", help="set MySQL host name"),
        optparse.make_option("-u", "--user", help="set MySQL user name"),
        optparse.make_option("-p", "--password", help="set MySQL user password"),
        optparse.make_option("-d", "--database", help="set MySQL database name"),
        optparse.make_option("-o", "--out-file", help="set output filename"),
    )
    # Converters are created once at class level and reused for every post.
    bb_parser = postmarkup.create(use_pygments=False, annotate_links=False)
    md_writer = MarkdownWriter()
    site_markup = SiteMarkup()

    def handle_noargs(self, **opts):
        """
        Entry point: read the options, query the legacy database and write
        one CSV row per legacy post.

        Recognised options: ``host`` (default 'localhost'), ``user`` (default
        'root'), ``password`` (default ''), ``database`` (required),
        ``out_file`` (default 'forums_post.csv') and ``progress``.

        Raises CommandError if no database was given or if the connection to
        the legacy database fails.
        """
        # Local import: needed only here, for progress output and for
        # fetching the active exception in a version-neutral way.
        import sys

        host = opts.get('host') or 'localhost'
        user = opts.get('user') or 'root'
        password = opts.get('password') or ''
        database = opts.get('database')
        out_filename = opts.get('out_file') or 'forums_post.csv'
        show_progress = bool(opts.get('progress'))

        if database is None:
            raise CommandError("Please specify a database option")

        # The with-block guarantees the output file is closed even when the
        # connection or a row conversion fails part-way through.
        with open(out_filename, "wb") as out_file:

            # database columns (fieldnames) for the output CSV file:
            cols = ('id', 'topic_id', 'user_id', 'creation_date', 'update_date',
                    'body', 'html', 'user_ip')
            self.writer = csv.writer(out_file)

            # Write an initial row of fieldnames to the output file
            self.writer.writerow(cols)

            # connect to the legacy database
            try:
                db = MySQLdb.connect(host=host,
                                     user=user,
                                     passwd=password,
                                     db=database)
            except MySQLdb.DatabaseError:
                # sys.exc_info() avoids both the Python-2-only "except X, e"
                # and the 2.6+ "except X as e" spellings.
                raise CommandError(str(sys.exc_info()[1]))

            try:
                c = db.cursor(MySQLdb.cursors.DictCursor)
                try:
                    # query the legacy database
                    sql = ('SELECT * FROM sln_bbposts as p, sln_bbposts_text as t WHERE '
                           'p.post_id = t.post_id ORDER BY p.post_id')
                    c.execute(sql)

                    # convert the old data and write the output to the file
                    count = 0
                    while True:
                        row = c.fetchone()
                        if row is None:
                            break

                        self.process_row(row)

                        count += 1
                        if show_progress and count % 100 == 0:
                            sys.stdout.write('.')
                            sys.stdout.flush()

                    # Finish the line of dots, if any were printed.
                    if show_progress and count >= 100:
                        sys.stdout.write('\n')
                finally:
                    c.close()
            finally:
                db.close()

    def to_html(self, s):
        """
        Pass the legacy post text through unphpbb() and render the result to
        HTML with the BBCode parser (cosmetic replacement disabled).
        """
        return self.bb_parser.render_to_html(unphpbb(s), cosmetic_replace=False)

    def to_markdown(self, s):
        """
        Convert legacy post text to Markdown by rendering it to HTML first and
        feeding that HTML to the (reset) MarkdownWriter.
        """
        self.md_writer.reset()
        self.md_writer.feed(self.to_html(s))
        return self.md_writer.markdown()

    def process_row(self, row):
        """
        This function accepts one row from the legacy database and converts the
        contents to the new database format, and calls the writer to write the new
        row to the output file.

        ``row`` is a dict-like row providing the keys 'post_id', 'topic_id',
        'poster_id', 'post_time', 'post_edit_time', 'post_text' and
        'poster_ip'. ``self.writer`` must already be set (handle_noargs does
        this).
        """
        # fromtimestamp() without a tz argument yields naive local time.
        creation_date = datetime.fromtimestamp(float(row['post_time']))

        # A falsy edit time (0 / None / '') means the post was never edited,
        # so the update date falls back to the creation date.
        if row['post_edit_time']:
            update_date = datetime.fromtimestamp(float(row['post_edit_time']))
        else:
            update_date = creation_date

        body = self.to_markdown(row['post_text'])
        html = self.site_markup.convert(body)

        # Column order must match the header row written by handle_noargs.
        self.writer.writerow([row['post_id'],
                              row['topic_id'],
                              row['poster_id'],
                              creation_date,
                              update_date,
                              body.encode("utf-8"),
                              html.encode("utf-8"),
                              convert_ip(row['poster_ip'])])