bgneal@323: """ bgneal@323: load_ygroup.py bgneal@323: bgneal@323: This application reads the Yahoo Group posts database and creates .csv files bgneal@323: for populating the ygroup application tables. The .csv files can be used bgneal@323: with the mysqlimport command to load the ygroup_thread and ygroup_post tables. bgneal@323: bgneal@323: E.g.: bgneal@323: mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 gremmies_portal /path/to/ygroup_thread.csv bgneal@323: bgneal@323: """ bgneal@323: import csv bgneal@323: import datetime bgneal@323: import optparse bgneal@323: import re bgneal@323: import sys bgneal@323: from email.utils import parseaddr bgneal@323: bgneal@323: import MySQLdb bgneal@323: bgneal@323: bgneal@323: USAGE = "usage: %prog [options]" bgneal@323: DESCRIPTION = """\ bgneal@323: This program reads the Yahoo Groups database and outputs 2 .csv files suitable bgneal@323: for import with mysqlimport for loading the ygroup application tables. Threads bgneal@323: and posts are created from the initial data. bgneal@323: """ bgneal@323: ############################################################################### bgneal@323: bgneal@323: class ConvertPosts(object): bgneal@323: REPLY_RE = re.compile(r"^Re:", re.IGNORECASE) bgneal@323: SG101_REPLY_RE = re.compile(r"^Re:\s*\[SurfGuitar101\]", re.IGNORECASE) bgneal@323: bgneal@323: def __init__(self, db, show_progress=False): bgneal@323: self.db = db bgneal@323: self.show_progress = show_progress bgneal@323: self.thread_writer = csv.writer(open('ygroup_thread.csv', 'wb')) bgneal@323: self.post_writer = csv.writer(open('ygroup_post.csv', 'wb')) bgneal@323: self.thread_cache = {} bgneal@323: self.last_date = None bgneal@323: bgneal@323: def process(self): bgneal@323: """ bgneal@323: Main processing function. Processes a row at a time from the legacy bgneal@323: database, creating csv records in the thread and posts files as bgneal@323: appropriate. bgneal@323: bgneal@323: """ bgneal@323: c = self.db.cursor(MySQLdb.cursors.DictCursor) bgneal@323: bgneal@323: # query the legacy database bgneal@323: sql = "SELECT * FROM post ORDER BY id" bgneal@323: c.execute(sql) bgneal@323: bgneal@323: # convert the old data and write the output to the file bgneal@323: i = 0 bgneal@323: while True: bgneal@323: row = c.fetchone() bgneal@323: if row is None: bgneal@323: break bgneal@323: i += 1 bgneal@323: if i % 100 == 0: bgneal@323: sys.stdout.write('.') bgneal@323: sys.stdout.flush() bgneal@323: bgneal@323: self._process_row(row) bgneal@323: bgneal@323: print bgneal@323: c.close() bgneal@323: bgneal@323: def _process_row(self, row): bgneal@323: """ bgneal@323: Process one row from the legacy database, creating a csv record bgneal@323: in the thread or post files as appropriate. bgneal@323: bgneal@323: """ bgneal@323: # Create a unified author name from the Yahoo ID and email address bgneal@323: # fields in the original post: bgneal@323: row['author'] = self.get_author(row['name'], row['email']) bgneal@323: bgneal@323: # Some posts (mainly from 1 user...) have no date; we'll just bgneal@323: # make one up by using the last date we saw + 1 second bgneal@323: bgneal@323: if row['date'] is None: bgneal@323: assert self.last_date is not None bgneal@323: row['date'] = self.last_date + datetime.timedelta(seconds=1) bgneal@323: bgneal@323: self.last_date = row['date'] bgneal@323: bgneal@323: # determine if this is a new thread or a reply bgneal@323: bgneal@323: if self.REPLY_RE.match(row['title']): bgneal@323: # This appears to be a reply. bgneal@323: # Remove all the leading Re: etc., cruft bgneal@323: stripped_title = self._strip_title(row['title']) bgneal@323: thread_id = self.thread_cache.get(stripped_title) bgneal@323: if thread_id: bgneal@323: self._create_post(thread_id, row) bgneal@323: else: bgneal@323: # Huh, no parent thread..?; create a new one bgneal@323: # and cache under stripped name so replies will find it bgneal@323: self._create_thread(row) bgneal@323: self.thread_cache[stripped_title] = row['id'] bgneal@323: else: bgneal@323: # At first glance, not a reply; see if another thread bgneal@323: # already exists with the exact same title: bgneal@323: thread_id = self.thread_cache.get(row['title']) bgneal@323: if thread_id: bgneal@323: # Duplicate; Yahoo or someone lopped off the Re: bgneal@323: # or it is just a coincidence. Either way, make it bgneal@323: # a post of an existing thread. bgneal@323: self._create_post(thread_id, row) bgneal@323: else: bgneal@323: self._create_thread(row) bgneal@323: self.thread_cache[row['title']] = row['id'] bgneal@323: bgneal@323: bgneal@323: def _create_thread(self, row): bgneal@323: """ bgneal@323: Create a new thread from the post data by writing a record in the bgneal@323: thread .csv file and a record in the post file. bgneal@323: bgneal@323: """ bgneal@323: self.thread_writer.writerow((row['id'], bgneal@323: row['title'].encode('utf-8'), bgneal@323: row['date'], bgneal@323: row['author'].encode('utf-8'), bgneal@323: 0)) bgneal@323: self._create_post(row['id'], row) bgneal@323: bgneal@323: def _create_post(self, thread_id, row): bgneal@323: """ bgneal@323: Create a new post from the post data by writing a record in the bgneal@323: post .csv file. bgneal@323: bgneal@323: """ bgneal@323: self.post_writer.writerow((row['id'], bgneal@323: thread_id, bgneal@323: row['title'].encode('utf-8'), bgneal@323: row['date'], bgneal@323: row['author'].encode('utf-8'), bgneal@323: row['msg'].encode('utf-8'), bgneal@323: '')) bgneal@323: bgneal@323: def _strip_title(self, title): bgneal@323: """ bgneal@323: Strip out all the Re: and [SurfGuitar101] stuff to get a bare bgneal@323: title. bgneal@323: bgneal@323: """ bgneal@323: s = title bgneal@323: while self.REPLY_RE.match(s): bgneal@323: s = self.SG101_REPLY_RE.sub('', s).strip() bgneal@323: s = self.REPLY_RE.sub('', s).strip() bgneal@323: bgneal@323: return s bgneal@323: bgneal@323: @staticmethod bgneal@323: def get_author(yahoo_id, email): bgneal@323: bgneal@323: def anti_spam(s): bgneal@323: return s.replace('.', ' dot ').replace('@', ' at ') bgneal@323: bgneal@323: name, addr = parseaddr(email) bgneal@323: bgneal@323: if name == addr: bgneal@323: name = anti_spam(name) bgneal@323: else: bgneal@323: # For some weird reason, sometimes Yahoo (?) put the email address bgneal@323: # in the name field: "John Doe " bgneal@323: name2, addr = parseaddr(name) bgneal@323: if name2: bgneal@323: name = name2 bgneal@323: bgneal@323: if name and yahoo_id and name != yahoo_id: bgneal@323: author = "%s (%s)" % (name, yahoo_id) bgneal@323: elif name: bgneal@323: author = name bgneal@323: elif yahoo_id: bgneal@323: author = yahoo_id bgneal@323: else: bgneal@323: author = anti_spam(email) bgneal@323: return author bgneal@323: bgneal@323: ############################################################################### bgneal@323: bgneal@323: def main(argv=None): bgneal@323: parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION) bgneal@323: parser.set_defaults( bgneal@323: progress=False, bgneal@323: host='localhost', bgneal@323: user='root', bgneal@323: password='', bgneal@323: database='sg101_yahoo_group', bgneal@323: ) bgneal@323: parser.add_option("-s", "--progress", action="store_true", bgneal@323: help="Output a . after every 100 posts to show progress [default: %default]") bgneal@323: parser.add_option("-a", "--host", bgneal@323: help="set MySQL host name [default: %default]"), bgneal@323: parser.add_option("-u", "--user", bgneal@323: help="set MySQL user name [default: %default]") bgneal@323: parser.add_option("-p", "--password", bgneal@323: help="set MySQL user password [default: %default]"), bgneal@323: parser.add_option("-d", "--database", bgneal@323: help="set MySQL database name [default: %default]") bgneal@323: opts, args = parser.parse_args(args=argv) bgneal@323: bgneal@323: # connect to the legacy database bgneal@323: try: bgneal@323: db = MySQLdb.connect(host=opts.host, bgneal@323: user=opts.user, bgneal@323: passwd=opts.password, bgneal@323: db=opts.database, bgneal@323: use_unicode=True) bgneal@323: except MySQLdb.DatabaseError, e: bgneal@323: sys.exit("Can't connect to database: %s" % e) bgneal@323: bgneal@323: converter = ConvertPosts(db, opts.progress) bgneal@323: converter.process() bgneal@323: db.close() bgneal@323: bgneal@323: ############################################################################### bgneal@323: bgneal@323: if __name__ == '__main__': bgneal@323: try: bgneal@323: main() bgneal@323: except IOError, ex: bgneal@323: sys.exit("IO Error: %s" % ex) bgneal@323: except KeyboardInterrupt: bgneal@323: sys.exit("Control-C interrupt")