annotate legacy/management/commands/translate_old_posts.py @ 1203:8cd15df9b563

Controlling the xapian install script in tools.
author Brian Neal <bgneal@gmail.com>
date Sat, 04 Jan 2025 14:19:19 -0600
parents ee87ea74d46b
children
rev   line source
bgneal@294 1 """
bgneal@321 2 translate_old_posts.py - A management command to join the bbposts and
bgneal@294 3 bbposts_text tables together and output as a .csv file, suitable for use as an
bgneal@294 4 input to mysqlimport into the new database. This method bypasses the Django ORM
bgneal@294 5 as it was too slow given the number of old posts to import.
bgneal@294 6
bgneal@294 7 """
bgneal@294 8 from __future__ import with_statement
bgneal@294 9 import csv
bgneal@294 10 import optparse
bgneal@294 11 from datetime import datetime
bgneal@294 12
bgneal@294 13 import MySQLdb
bgneal@294 14 import postmarkup
bgneal@294 15
bgneal@294 16 from django.core.management.base import NoArgsCommand, CommandError
bgneal@294 17
bgneal@294 18 from legacy.phpbb import unphpbb
bgneal@294 19 from legacy.html2md import MarkdownWriter
bgneal@294 20 from core.markup import SiteMarkup
bgneal@294 21
bgneal@294 22
def convert_ip(s):
    """
    Return the dotted-quad form of an IP address given as a hex string.

    The string is parsed as a base-16 integer and its four low-order bytes
    are emitted most-significant first, e.g. "7f000001" -> "127.0.0.1".
    """
    value = int(s, 16)
    # One shift per octet, highest byte first; the mask keeps a single byte.
    octets = [(value >> shift) & 0xff for shift in (24, 16, 8, 0)]
    return ".".join(str(octet) for octet in octets)
bgneal@294 33
bgneal@294 34
class Command(NoArgsCommand):
    """
    Management command that exports the legacy forum posts as a CSV file.

    The legacy sln_bbposts and sln_bbposts_text tables are joined on post_id,
    each row is converted by process_row(), and the result is written with
    the csv module to the file named by --out-file.
    """
    help = """\
This command joins the SG101 1.0 posts to 2.0 format and outputs the
data as a .csv file suitable for importing into the new database scheme with
the mysqlimport utility.
"""
    option_list = NoArgsCommand.option_list + (
        optparse.make_option("-s", "--progress", action="store_true",
            help="Output a . after every 100 posts to show progress"),
        optparse.make_option("-a", "--host", help="set MySQL host name"),
        optparse.make_option("-u", "--user", help="set MySQL user name"),
        optparse.make_option("-p", "--password", help="set MySQL user password"),
        optparse.make_option("-d", "--database", help="set MySQL database name"),
        optparse.make_option("-o", "--out-file", help="set output filename"),
    )

    # Converters are created once at class level and shared by every row.
    bb_parser = postmarkup.create(use_pygments=False, annotate_links=False)
    md_writer = MarkdownWriter()
    site_markup = SiteMarkup()

    # Database columns (fieldnames) written as the first row of the CSV file.
    _COLUMNS = ('id', 'topic_id', 'user_id', 'creation_date', 'update_date',
                'body', 'html', 'user_ip')

    # Number of posts between progress dots when --progress is given.
    _PROGRESS_INTERVAL = 100

    def handle_noargs(self, **opts):
        """
        Run the export.

        Options read from opts: host, user, password, database, out_file and
        progress. Missing or empty values fall back to 'localhost', 'root',
        '' and 'forums_post.csv' respectively; database has no default.

        Raises CommandError if no database was given or if the connection to
        the legacy database fails.
        """
        # Function-scope import so this block does not depend on the
        # module-level import list.
        import sys

        host = opts.get('host') or 'localhost'
        user = opts.get('user') or 'root'
        password = opts.get('password') or ''
        database = opts.get('database')
        out_filename = opts.get('out_file') or 'forums_post.csv'
        show_progress = bool(opts.get('progress'))

        if not database:
            raise CommandError("Please specify a database option")

        # Connect before touching the output file so that a failed
        # connection does not leave a stray, half-written CSV behind.
        try:
            db = MySQLdb.connect(host=host,
                                 user=user,
                                 passwd=password,
                                 db=database)
        except MySQLdb.DatabaseError:
            # sys.exc_info() instead of "except X, e" / "except X as e" keeps
            # this line valid on both old and new Python versions.
            raise CommandError(str(sys.exc_info()[1]))

        try:
            cursor = db.cursor(MySQLdb.cursors.DictCursor)
            try:
                # query the legacy database
                sql = ('SELECT * FROM sln_bbposts as p, sln_bbposts_text as t WHERE '
                       'p.post_id = t.post_id ORDER BY p.post_id')
                cursor.execute(sql)

                # "with" guarantees the file is closed even if a row fails
                # to convert part-way through the export.
                with open(out_filename, "wb") as out_file:
                    self._write_posts(cursor, out_file, show_progress, sys.stdout)
            finally:
                cursor.close()
        finally:
            db.close()

    def _write_posts(self, cursor, out_file, show_progress, progress_stream):
        """
        Write the header row and every converted post from cursor to out_file.

        When show_progress is true a '.' is written to progress_stream after
        every _PROGRESS_INTERVAL posts, followed by a final newline.
        """
        self.writer = csv.writer(out_file)

        # Write an initial row of fieldnames to the output file
        self.writer.writerow(self._COLUMNS)

        count = 0
        while True:
            row = cursor.fetchone()
            if row is None:
                break

            self.process_row(row)
            count += 1

            if show_progress and count % self._PROGRESS_INTERVAL == 0:
                progress_stream.write('.')
                progress_stream.flush()

        if show_progress:
            progress_stream.write('\n')

    def to_html(self, s):
        """
        Pass s through unphpbb() and render the result to HTML with the
        shared postmarkup parser (cosmetic_replace disabled).
        """
        return self.bb_parser.render_to_html(unphpbb(s), cosmetic_replace=False)

    def to_markdown(self, s):
        """
        Convert s to Markdown by rendering it to HTML with to_html() and
        feeding that HTML to the shared MarkdownWriter.
        """
        # The writer is shared between rows, so clear its state first.
        self.md_writer.reset()
        self.md_writer.feed(self.to_html(s))
        return self.md_writer.markdown()

    def process_row(self, row):
        """
        This function accepts one row from the legacy database and converts the
        contents to the new database format, and calls the writer to write the new
        row to the output file.

        Keys read from row: post_id, topic_id, poster_id, post_time,
        post_edit_time, post_text and poster_ip.
        """
        creation_date = datetime.fromtimestamp(float(row['post_time']))

        # A falsy edit time is treated as "never edited": the update date
        # then equals the creation date.
        if row['post_edit_time']:
            update_date = datetime.fromtimestamp(float(row['post_edit_time']))
        else:
            update_date = creation_date

        body = self.to_markdown(row['post_text'])
        html = self.site_markup.convert(body)

        self.writer.writerow([row['post_id'],
                              row['topic_id'],
                              row['poster_id'],
                              creation_date,
                              update_date,
                              body.encode("utf-8"),
                              html.encode("utf-8"),
                              convert_ip(row['poster_ip'])])