changeset 292:2367c4795c92

Added a legacy management command to import old news comments.
author Brian Neal <bgneal@gmail.com>
date Fri, 24 Dec 2010 22:20:30 +0000
parents a6357f2bcbbc
children c92fb89dbc7d
files gpp/legacy/data.py gpp/legacy/html2md.py gpp/legacy/management/commands/import_old_news.py gpp/legacy/management/commands/import_old_news_comments.py
diffstat 4 files changed, 159 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gpp/legacy/data.py	Fri Dec 24 22:20:30 2010 +0000
@@ -0,0 +1,17 @@
+"""
+Misc data for the legacy management commands.
+
+"""
+
+# Over time various users asked me to change their username. The legacy site
+# rarely stored foreign keys to users; instead it stored the name of the user
+# at the time. This dictionary maps each old username (key) to that user's
+# new username (value), so records stored under the old name can still be
+# matched to the right account.
+KNOWN_USERNAME_CHANGES = {
+    'cavefishbutchdelux': 'butchdelux',
+    'Findicator1': 'WaveOhhh',
+    'Tikimania': 'Tikitena',
+    'sandyfeet': 'RickRhoades',
+}
+
--- a/gpp/legacy/html2md.py	Fri Dec 24 20:45:33 2010 +0000
+++ b/gpp/legacy/html2md.py	Fri Dec 24 22:20:30 2010 +0000
@@ -120,7 +120,11 @@
 
     """
     def markdown(self):
-        url = self.attrs['href']
+        try:
+            url = self.attrs['href']
+        except KeyError:
+            return self.data if self.data else u''
+
         text = self.data if self.data else url
         return u'[%s](%s)' % (text, url)
 
@@ -133,7 +137,10 @@
 
     """
     def markdown(self):
-        url = self.attrs['src']
+        try:
+            url = self.attrs['src']
+        except KeyError:
+            return u' (missing image) '
         return u'![image](%s)' % url
 
 
@@ -261,7 +268,12 @@
         self.elem_stack.append(tag)
 
     def _pop_elem(self):
-        element = self.elem_stack.pop()
+        try:
+            element = self.elem_stack.pop()
+        except IndexError:
+            # pop from empty list => bad HTML input; ignore it
+            return
+
         if isinstance(element, ListElement):
             self.list_nesting -= 1
         if len(self.elem_stack):
--- a/gpp/legacy/management/commands/import_old_news.py	Fri Dec 24 20:45:33 2010 +0000
+++ b/gpp/legacy/management/commands/import_old_news.py	Fri Dec 24 22:20:30 2010 +0000
@@ -12,11 +12,8 @@
 
 from news.models import Category, Story
 from legacy.phpbb import unescape
+import legacy.data
 
-KNOWN_USERNAME_CHANGES = {
-    'cavefishbutchdelux': 'butchdelux',
-    'Findicator1': 'WaveOhhh',
-}
 
 class Command(LabelCommand):
     args = '<filename filename ...>'
@@ -114,7 +111,8 @@
             user = User.objects.get(username=username)
         except User.DoesNotExist:
             try:
-                user = User.objects.get(username=KNOWN_USERNAME_CHANGES[username])
+                user = User.objects.get(
+                        username=legacy.data.KNOWN_USERNAME_CHANGES[username])
             except KeyError:
                 raise User.DoesNotExist
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gpp/legacy/management/commands/import_old_news_comments.py	Fri Dec 24 22:20:30 2010 +0000
@@ -0,0 +1,124 @@
+"""
+import_old_news_comments.py - Imports comments on news stories from SG101 1.0 CSV files.
+"""
+from __future__ import with_statement
+import csv
+import optparse
+import sys
+from datetime import datetime
+
+from django.core.management.base import LabelCommand, CommandError
+from django.contrib.auth.models import User
+from django.contrib.contenttypes.models import ContentType
+
+from comments.models import Comment
+from news.models import Story
+import legacy.data
+from legacy.html2md import MarkdownWriter
+
+
+class Command(LabelCommand):
+    """Imports comments on news stories from SG101 1.0 CSV files."""
+    args = '<filename filename ...>'
+    help = 'Imports news story comments from the old database in CSV format'
+    option_list = LabelCommand.option_list + (
+        optparse.make_option("-p", "--progress", action="store_true",
+            help="Output a . after every 20 comments to show progress"),
+    )
+    md_writer = MarkdownWriter()
+
+    def handle_label(self, filename, **options):
+        """
+        Process each line in the CSV file given by filename by
+        creating a new story comment.
+        Raises CommandError if the file cannot be read or parsed.
+        """
+        self.show_progress = options.get('progress')
+        self.users = {}    # cache of User objects keyed by legacy username
+
+        try:
+            with open(filename, "rb") as f:
+                self.reader = csv.DictReader(f)
+                num_rows = 0
+                try:
+                    for row in self.reader:
+                        self.process_row(row)
+                        num_rows += 1
+                        if self.show_progress and num_rows % 20 == 0:
+                            sys.stdout.write('.')
+                            sys.stdout.flush()
+                except csv.Error, e:
+                    raise CommandError("CSV error: %s %s %s" % (
+                        filename, self.reader.line_num, e))
+
+                print
+        except IOError:
+            raise CommandError("Could not open file: %s" % filename)
+
+    def process_row(self, row):
+        """
+        Process one row from the CSV file: create a Comment object for
+        the row and save it in the database. A row with an unknown user or
+        story, or an unparsable id or date, is reported and skipped.
+        """
+        row = dict((k, v if v != 'NULL' else '') for k, v in row.iteritems())
+
+        # int() and strptime() raise on the '' left by the NULL mapping above.
+        try:
+            comment_id = int(row['tid'])
+            story_id = int(row['sid'])
+            created = datetime.strptime(row['date'], "%Y-%m-%d %H:%M:%S")
+        except (TypeError, ValueError):
+            print "Bad id or date for comment %s; skipping." % row['tid']
+            return
+
+        try:
+            user = self._get_user(row['name'])
+        except User.DoesNotExist:
+            print "Could not find user %s for comment %s; skipping." % (
+                    row['name'], row['tid'])
+            return
+
+        try:
+            story = Story.objects.get(id=story_id)
+        except Story.DoesNotExist:
+            print "Could not find story %s for comment %s; skipping." % (
+                    row['sid'], row['tid'])
+            return
+
+        Comment(
+            id=comment_id,
+            content_type=ContentType.objects.get_for_model(story),
+            object_id=story.id,
+            user=user,
+            comment=self.to_markdown(row['comment']),
+            creation_date=created,
+            ip_address=row['host_name'],
+            is_public=True,
+            is_removed=False,
+        ).save()
+
+    def _get_user(self, username):
+        """
+        Returns the user object with the given username, falling back to
+        the new name listed in legacy.data.KNOWN_USERNAME_CHANGES.
+        Raises User.DoesNotExist if not found.
+        """
+        if username not in self.users:
+            try:
+                user = User.objects.get(username=username)
+            except User.DoesNotExist:
+                renamed = legacy.data.KNOWN_USERNAME_CHANGES.get(username)
+                if renamed is None:
+                    raise
+                user = User.objects.get(username=renamed)
+            self.users[username] = user
+        return self.users[username]
+
+    def to_markdown(self, s):
+        """Returns s converted to Markdown; byte strings are read as UTF-8."""
+        self.md_writer.reset()
+        if not isinstance(s, unicode):
+            s = s.decode('utf-8', 'replace')
+        self.md_writer.feed(s)
+        return self.md_writer.markdown()