bgneal@323: """
bgneal@323: load_ygroup.py
bgneal@323: 
bgneal@323: This application reads the Yahoo Group posts database and creates .csv files
bgneal@323: for populating the ygroup application tables. The .csv files can be used
bgneal@323: with the mysqlimport command to load the ygroup_thread and ygroup_post tables.
bgneal@323: 
bgneal@323: E.g.:
bgneal@323: mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 gremmies_portal /path/to/ygroup_thread.csv
bgneal@323: 
bgneal@323: """
bgneal@323: import csv
bgneal@323: import datetime
bgneal@323: import optparse
bgneal@323: import re
bgneal@323: import sys
bgneal@323: from email.utils import parseaddr
bgneal@323: 
bgneal@323: import MySQLdb
bgneal@323: 
bgneal@323: 
bgneal@323: USAGE = "usage: %prog [options]"
bgneal@323: DESCRIPTION = """\
bgneal@323: This program reads the Yahoo Groups database and outputs 2 .csv files suitable
bgneal@323: for import with mysqlimport for loading the ygroup application tables. Threads
bgneal@323: and posts are created from the initial data.
bgneal@323: """
bgneal@323: ###############################################################################
bgneal@323: 
bgneal@323: class ConvertPosts(object):
bgneal@323:     REPLY_RE = re.compile(r"^Re:", re.IGNORECASE)
bgneal@323:     SG101_REPLY_RE = re.compile(r"^Re:\s*\[SurfGuitar101\]", re.IGNORECASE)
bgneal@323: 
bgneal@323:     def __init__(self, db, show_progress=False):
bgneal@323:         self.db = db
bgneal@323:         self.show_progress = show_progress
bgneal@323:         self.thread_writer = csv.writer(open('ygroup_thread.csv', 'wb'))
bgneal@323:         self.post_writer = csv.writer(open('ygroup_post.csv', 'wb'))
bgneal@323:         self.thread_cache = {}
bgneal@323:         self.last_date = None
bgneal@323: 
bgneal@323:     def process(self):
bgneal@323:         """
bgneal@323:         Main processing function. Processes a row at a time from the legacy
bgneal@323:         database, creating csv records in the thread and posts files as
bgneal@323:         appropriate.
bgneal@323: 
bgneal@323:         """
bgneal@323:         c = self.db.cursor(MySQLdb.cursors.DictCursor)
bgneal@323: 
bgneal@323:         # query the legacy database
bgneal@323:         sql = "SELECT * FROM post ORDER BY id"
bgneal@323:         c.execute(sql)
bgneal@323: 
bgneal@323:         # convert the old data and write the output to the file
bgneal@323:         i = 0
bgneal@323:         while True:
bgneal@323:             row = c.fetchone()
bgneal@323:             if row is None:
bgneal@323:                 break
bgneal@323:             i += 1
bgneal@323:             if i % 100 == 0:
bgneal@323:                 sys.stdout.write('.')
bgneal@323:                 sys.stdout.flush()
bgneal@323: 
bgneal@323:             self._process_row(row)
bgneal@323: 
bgneal@323:         print
bgneal@323:         c.close()
bgneal@323: 
bgneal@323:     def _process_row(self, row):
bgneal@323:         """
bgneal@323:         Process one row from the legacy database, creating a csv record
bgneal@323:         in the thread or post files as appropriate.
bgneal@323: 
bgneal@323:         """
bgneal@323:         # Create a unified author name from the Yahoo ID and email address
bgneal@323:         # fields in the original post:
bgneal@323:         row['author'] = self.get_author(row['name'], row['email'])
bgneal@323: 
bgneal@323:         # Some posts (mainly from 1 user...) have no date; we'll just
bgneal@323:         # make one up by using the last date we saw + 1 second
bgneal@323: 
bgneal@323:         if row['date'] is None:
bgneal@323:             assert self.last_date is not None
bgneal@323:             row['date'] = self.last_date + datetime.timedelta(seconds=1)
bgneal@323: 
bgneal@323:         self.last_date = row['date']
bgneal@323: 
bgneal@323:         # determine if this is a new thread or a reply
bgneal@323: 
bgneal@323:         if self.REPLY_RE.match(row['title']):
bgneal@323:             # This appears to be a reply.
bgneal@323:             # Remove all the leading Re: etc., cruft
bgneal@323:             stripped_title = self._strip_title(row['title'])
bgneal@323:             thread_id = self.thread_cache.get(stripped_title)
bgneal@323:             if thread_id:
bgneal@323:                 self._create_post(thread_id, row)
bgneal@323:             else:
bgneal@323:                 # Huh, no parent thread..?; create a new one
bgneal@323:                 # and cache under stripped name so replies will find it
bgneal@323:                 self._create_thread(row)
bgneal@323:                 self.thread_cache[stripped_title] = row['id']
bgneal@323:         else:
bgneal@323:             # At first glance, not a reply; see if another thread
bgneal@323:             # already exists with the exact same title:
bgneal@323:             thread_id = self.thread_cache.get(row['title'])
bgneal@323:             if thread_id:
bgneal@323:                 # Duplicate; Yahoo or someone lopped off the Re:
bgneal@323:                 # or it is just a coincidence. Either way, make it
bgneal@323:                 # a post of an existing thread.
bgneal@323:                 self._create_post(thread_id, row)
bgneal@323:             else:
bgneal@323:                 self._create_thread(row)
bgneal@323:                 self.thread_cache[row['title']] = row['id']
bgneal@323: 
bgneal@323: 
bgneal@323:     def _create_thread(self, row):
bgneal@323:         """
bgneal@323:         Create a new thread from the post data by writing a record in the
bgneal@323:         thread .csv file and a record in the post file.
bgneal@323: 
bgneal@323:         """
bgneal@323:         self.thread_writer.writerow((row['id'],
bgneal@323:             row['title'].encode('utf-8'),
bgneal@323:             row['date'],
bgneal@323:             row['author'].encode('utf-8'),
bgneal@323:             0))
bgneal@323:         self._create_post(row['id'], row)
bgneal@323: 
bgneal@323:     def _create_post(self, thread_id, row):
bgneal@323:         """
bgneal@323:         Create a new post from the post data by writing a record in the
bgneal@323:         post .csv file.
bgneal@323: 
bgneal@323:         """
bgneal@323:         self.post_writer.writerow((row['id'],
bgneal@323:             thread_id,
bgneal@323:             row['title'].encode('utf-8'),
bgneal@323:             row['date'],
bgneal@323:             row['author'].encode('utf-8'),
bgneal@323:             row['msg'].encode('utf-8'),
bgneal@323:             ''))
bgneal@323: 
bgneal@323:     def _strip_title(self, title):
bgneal@323:         """
bgneal@323:         Strip out all the Re: and [SurfGuitar101] stuff to get a bare
bgneal@323:         title.
bgneal@323: 
bgneal@323:         """
bgneal@323:         s = title
bgneal@323:         while self.REPLY_RE.match(s):
bgneal@323:             s = self.SG101_REPLY_RE.sub('', s).strip()
bgneal@323:             s = self.REPLY_RE.sub('', s).strip()
bgneal@323: 
bgneal@323:         return s
bgneal@323: 
bgneal@323:     @staticmethod
bgneal@323:     def get_author(yahoo_id, email):
bgneal@323: 
bgneal@323:         def anti_spam(s):
bgneal@323:             return s.replace('.', ' dot ').replace('@', ' at ')
bgneal@323: 
bgneal@323:         name, addr = parseaddr(email)
bgneal@323: 
bgneal@323:         if name == addr:
bgneal@323:             name = anti_spam(name)
bgneal@323:         else:
bgneal@323:             # For some weird reason, sometimes Yahoo (?) put the email address
bgneal@323:             # in the name field: "John Doe <doe@example.com" <doe@example.com>"
bgneal@323:             name2, addr = parseaddr(name)
bgneal@323:             if name2:
bgneal@323:                 name = name2
bgneal@323: 
bgneal@323:         if name and yahoo_id and name != yahoo_id:
bgneal@323:             author = "%s (%s)" % (name, yahoo_id)
bgneal@323:         elif name:
bgneal@323:             author = name
bgneal@323:         elif yahoo_id:
bgneal@323:             author = yahoo_id
bgneal@323:         else:
bgneal@323:             author = anti_spam(email)
bgneal@323:         return author
bgneal@323: 
bgneal@323: ###############################################################################
bgneal@323: 
bgneal@323: def main(argv=None):
bgneal@323:     parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
bgneal@323:     parser.set_defaults(
bgneal@323:         progress=False,
bgneal@323:         host='localhost',
bgneal@323:         user='root',
bgneal@323:         password='',
bgneal@323:         database='sg101_yahoo_group',
bgneal@323:     )
bgneal@323:     parser.add_option("-s", "--progress", action="store_true",
bgneal@323:         help="Output a . after every 100 posts to show progress [default: %default]")
bgneal@323:     parser.add_option("-a", "--host",
bgneal@323:         help="set MySQL host name [default: %default]"),
bgneal@323:     parser.add_option("-u", "--user",
bgneal@323:         help="set MySQL user name [default: %default]")
bgneal@323:     parser.add_option("-p", "--password",
bgneal@323:         help="set MySQL user password [default: %default]"),
bgneal@323:     parser.add_option("-d", "--database",
bgneal@323:         help="set MySQL database name [default: %default]")
bgneal@323:     opts, args = parser.parse_args(args=argv)
bgneal@323: 
bgneal@323:     # connect to the legacy database
bgneal@323:     try:
bgneal@323:         db = MySQLdb.connect(host=opts.host,
bgneal@323:                 user=opts.user,
bgneal@323:                 passwd=opts.password,
bgneal@323:                 db=opts.database,
bgneal@323:                 use_unicode=True)
bgneal@323:     except MySQLdb.DatabaseError, e:
bgneal@323:         sys.exit("Can't connect to database: %s" % e)
bgneal@323: 
bgneal@323:     converter = ConvertPosts(db, opts.progress)
bgneal@323:     converter.process()
bgneal@323:     db.close()
bgneal@323: 
bgneal@323: ###############################################################################
bgneal@323: 
bgneal@323: if __name__ == '__main__':
bgneal@323:     try:
bgneal@323:         main()
bgneal@323:     except IOError, ex:
bgneal@323:         sys.exit("IO Error: %s" % ex)
bgneal@323:     except KeyboardInterrupt:
bgneal@323:         sys.exit("Control-C interrupt")