Mercurial > public > sg101

diff tools/load_ygroup.py @ 323:0c18dfb1da1c
Fixing #149; adding the ygroup application: an archive of the old Yahoo Group messages.
author: Brian Neal <bgneal@gmail.com>
date: Sun, 20 Feb 2011 00:31:54 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/load_ygroup.py	Sun Feb 20 00:31:54 2011 +0000
@@ -0,0 +1,230 @@
+"""
+load_ygroup.py
+
+This application reads the Yahoo Group posts database and creates .csv files
+for populating the ygroup application tables. The .csv files can be used
+with the mysqlimport command to load the ygroup_thread and ygroup_post tables.
+
+E.g.:
+mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 gremmies_portal /path/to/ygroup_thread.csv
+
+"""
+import csv
+import datetime
+import optparse
+import re
+import sys
+from email.utils import parseaddr
+
+import MySQLdb
+
+
+USAGE = "usage: %prog [options]"
+DESCRIPTION = """\
+This program reads the Yahoo Groups database and outputs 2 .csv files suitable
+for import with mysqlimport for loading the ygroup application tables. Threads
+and posts are created from the initial data.
+"""
+###############################################################################
+
+class ConvertPosts(object):
+    REPLY_RE = re.compile(r"^Re:", re.IGNORECASE)
+    SG101_REPLY_RE = re.compile(r"^Re:\s*\[SurfGuitar101\]", re.IGNORECASE)
+
+    def __init__(self, db, show_progress=False):
+        self.db = db
+        self.show_progress = show_progress
+        self.thread_writer = csv.writer(open('ygroup_thread.csv', 'wb'))
+        self.post_writer = csv.writer(open('ygroup_post.csv', 'wb'))
+        self.thread_cache = {}
+        self.last_date = None
+
+    def process(self):
+        """
+        Main processing function. Processes a row at a time from the legacy
+        database, creating csv records in the thread and posts files as
+        appropriate.
+
+        """
+        c = self.db.cursor(MySQLdb.cursors.DictCursor)
+
+        # query the legacy database
+        sql = "SELECT * FROM post ORDER BY id"
+        c.execute(sql)
+
+        # convert the old data and write the output to the file
+        i = 0
+        while True:
+            row = c.fetchone()
+            if row is None:
+                break
+            i += 1
+            if i % 100 == 0:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+
+            self._process_row(row)
+
+        print
+        c.close()
+
+    def _process_row(self, row):
+        """
+        Process one row from the legacy database, creating a csv record
+        in the thread or post files as appropriate.
+
+        """
+        # Create a unified author name from the Yahoo ID and email address
+        # fields in the original post:
+        row['author'] = self.get_author(row['name'], row['email'])
+
+        # Some posts (mainly from 1 user...) have no date; we'll just
+        # make one up by using the last date we saw + 1 second
+
+        if row['date'] is None:
+            assert self.last_date is not None
+            row['date'] = self.last_date + datetime.timedelta(seconds=1)
+
+        self.last_date = row['date']
+
+        # determine if this is a new thread or a reply
+
+        if self.REPLY_RE.match(row['title']):
+            # This appears to be a reply.
+            # Remove all the leading Re: etc., cruft
+            stripped_title = self._strip_title(row['title'])
+            thread_id = self.thread_cache.get(stripped_title)
+            if thread_id:
+                self._create_post(thread_id, row)
+            else:
+                # Huh, no parent thread..?; create a new one
+                # and cache under stripped name so replies will find it
+                self._create_thread(row)
+                self.thread_cache[stripped_title] = row['id']
+        else:
+            # At first glance, not a reply; see if another thread
+            # already exists with the exact same title:
+            thread_id = self.thread_cache.get(row['title'])
+            if thread_id:
+                # Duplicate; Yahoo or someone lopped off the Re:
+                # or it is just a coincidence. Either way, make it
+                # a post of an existing thread.
+                self._create_post(thread_id, row)
+            else:
+                self._create_thread(row)
+                self.thread_cache[row['title']] = row['id']
+
+
+    def _create_thread(self, row):
+        """
+        Create a new thread from the post data by writing a record in the
+        thread .csv file and a record in the post file.
+
+        """
+        self.thread_writer.writerow((row['id'],
+            row['title'].encode('utf-8'),
+            row['date'],
+            row['author'].encode('utf-8'),
+            0))
+        self._create_post(row['id'], row)
+
+    def _create_post(self, thread_id, row):
+        """
+        Create a new post from the post data by writing a record in the
+        post .csv file.
+
+        """
+        self.post_writer.writerow((row['id'],
+            thread_id,
+            row['title'].encode('utf-8'),
+            row['date'],
+            row['author'].encode('utf-8'),
+            row['msg'].encode('utf-8'),
+            ''))
+
+    def _strip_title(self, title):
+        """
+        Strip out all the Re: and [SurfGuitar101] stuff to get a bare
+        title.
+
+        """
+        s = title
+        while self.REPLY_RE.match(s):
+            s = self.SG101_REPLY_RE.sub('', s).strip()
+            s = self.REPLY_RE.sub('', s).strip()
+
+        return s
+
+    @staticmethod
+    def get_author(yahoo_id, email):
+
+        def anti_spam(s):
+            return s.replace('.', ' dot ').replace('@', ' at ')
+
+        name, addr = parseaddr(email)
+
+        if name == addr:
+            name = anti_spam(name)
+        else:
+            # For some weird reason, sometimes Yahoo (?) put the email address
+            # in the name field: "John Doe <doe@example.com" <doe@example.com>"
+            name2, addr = parseaddr(name)
+            if name2:
+                name = name2
+
+        if name and yahoo_id and name != yahoo_id:
+            author = "%s (%s)" % (name, yahoo_id)
+        elif name:
+            author = name
+        elif yahoo_id:
+            author = yahoo_id
+        else:
+            author = anti_spam(email)
+        return author
+
+###############################################################################
+
+def main(argv=None):
+    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
+    parser.set_defaults(
+        progress=False,
+        host='localhost',
+        user='root',
+        password='',
+        database='sg101_yahoo_group',
+    )
+    parser.add_option("-s", "--progress", action="store_true",
+        help="Output a . after every 100 posts to show progress [default: %default]")
+    parser.add_option("-a", "--host",
+        help="set MySQL host name [default: %default]"),
+    parser.add_option("-u", "--user",
+        help="set MySQL user name [default: %default]")
+    parser.add_option("-p", "--password",
+        help="set MySQL user password [default: %default]"),
+    parser.add_option("-d", "--database",
+        help="set MySQL database name [default: %default]")
+    opts, args = parser.parse_args(args=argv)
+
+    # connect to the legacy database
+    try:
+        db = MySQLdb.connect(host=opts.host,
+                user=opts.user,
+                passwd=opts.password,
+                db=opts.database,
+                use_unicode=True)
+    except MySQLdb.DatabaseError, e:
+        sys.exit("Can't connect to database: %s" % e)
+
+    converter = ConvertPosts(db, opts.progress)
+    converter.process()
+    db.close()
+
+###############################################################################
+
+if __name__ == '__main__':
+    try:
+        main()
+    except IOError, ex:
+        sys.exit("IO Error: %s" % ex)
+    except KeyboardInterrupt:
+        sys.exit("Control-C interrupt")
author	Brian Neal <bgneal@gmail.com>
date	Sun, 20 Feb 2011 00:31:54 +0000
parents
children