Mercurial > public > sg101
view tools/load_ygroup.py @ 693:ad69236e8501
For issue #52, update many 3rd party JavaScript libraries.
Updated to jQuery 1.10.2, jQuery UI 1.10.3.
This broke a lot of stuff.
- Found a newer version of the jquery cycle all plugin (3.0.3).
- Updated JPlayer to 2.4.0.
- Updated to MarkItUp 1.1.14. This also required me to add multiline attributes
set to true on various buttons in the markdown set.
- As per a stackoverflow post, added some code to get multiline titles in
a jQuery UI dialog. They removed that functionality but allow you to put it
back.
Tweaked the MarkItUp preview CSS to show blockquotes in italic.
Did not update TinyMCE at this time. I'm not using the jQuery version and this
version appears to work ok for now.
What I should do is make a repo for MarkItUp and do a vendor branch thing so
I don't have to futz around diffing directories to figure out if I'll lose
changes when I update.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Wed, 04 Sep 2013 19:55:20 -0500 |
parents | 0c18dfb1da1c |
children |
line wrap: on
line source
""" load_ygroup.py This application reads the Yahoo Group posts database and creates .csv files for populating the ygroup application tables. The .csv files can be used with the mysqlimport command to load the ygroup_thread and ygroup_post tables. E.g.: mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 gremmies_portal /path/to/ygroup_thread.csv """ import csv import datetime import optparse import re import sys from email.utils import parseaddr import MySQLdb USAGE = "usage: %prog [options]" DESCRIPTION = """\ This program reads the Yahoo Groups database and outputs 2 .csv files suitable for import with mysqlimport for loading the ygroup application tables. Threads and posts are created from the initial data. """ ############################################################################### class ConvertPosts(object): REPLY_RE = re.compile(r"^Re:", re.IGNORECASE) SG101_REPLY_RE = re.compile(r"^Re:\s*\[SurfGuitar101\]", re.IGNORECASE) def __init__(self, db, show_progress=False): self.db = db self.show_progress = show_progress self.thread_writer = csv.writer(open('ygroup_thread.csv', 'wb')) self.post_writer = csv.writer(open('ygroup_post.csv', 'wb')) self.thread_cache = {} self.last_date = None def process(self): """ Main processing function. Processes a row at a time from the legacy database, creating csv records in the thread and posts files as appropriate. 
""" c = self.db.cursor(MySQLdb.cursors.DictCursor) # query the legacy database sql = "SELECT * FROM post ORDER BY id" c.execute(sql) # convert the old data and write the output to the file i = 0 while True: row = c.fetchone() if row is None: break i += 1 if i % 100 == 0: sys.stdout.write('.') sys.stdout.flush() self._process_row(row) print c.close() def _process_row(self, row): """ Process one row from the legacy database, creating a csv record in the thread or post files as appropriate. """ # Create a unified author name from the Yahoo ID and email address # fields in the original post: row['author'] = self.get_author(row['name'], row['email']) # Some posts (mainly from 1 user...) have no date; we'll just # make one up by using the last date we saw + 1 second if row['date'] is None: assert self.last_date is not None row['date'] = self.last_date + datetime.timedelta(seconds=1) self.last_date = row['date'] # determine if this is a new thread or a reply if self.REPLY_RE.match(row['title']): # This appears to be a reply. # Remove all the leading Re: etc., cruft stripped_title = self._strip_title(row['title']) thread_id = self.thread_cache.get(stripped_title) if thread_id: self._create_post(thread_id, row) else: # Huh, no parent thread..?; create a new one # and cache under stripped name so replies will find it self._create_thread(row) self.thread_cache[stripped_title] = row['id'] else: # At first glance, not a reply; see if another thread # already exists with the exact same title: thread_id = self.thread_cache.get(row['title']) if thread_id: # Duplicate; Yahoo or someone lopped off the Re: # or it is just a coincidence. Either way, make it # a post of an existing thread. self._create_post(thread_id, row) else: self._create_thread(row) self.thread_cache[row['title']] = row['id'] def _create_thread(self, row): """ Create a new thread from the post data by writing a record in the thread .csv file and a record in the post file. 
""" self.thread_writer.writerow((row['id'], row['title'].encode('utf-8'), row['date'], row['author'].encode('utf-8'), 0)) self._create_post(row['id'], row) def _create_post(self, thread_id, row): """ Create a new post from the post data by writing a record in the post .csv file. """ self.post_writer.writerow((row['id'], thread_id, row['title'].encode('utf-8'), row['date'], row['author'].encode('utf-8'), row['msg'].encode('utf-8'), '')) def _strip_title(self, title): """ Strip out all the Re: and [SurfGuitar101] stuff to get a bare title. """ s = title while self.REPLY_RE.match(s): s = self.SG101_REPLY_RE.sub('', s).strip() s = self.REPLY_RE.sub('', s).strip() return s @staticmethod def get_author(yahoo_id, email): def anti_spam(s): return s.replace('.', ' dot ').replace('@', ' at ') name, addr = parseaddr(email) if name == addr: name = anti_spam(name) else: # For some weird reason, sometimes Yahoo (?) put the email address # in the name field: "John Doe <doe@example.com" <doe@example.com>" name2, addr = parseaddr(name) if name2: name = name2 if name and yahoo_id and name != yahoo_id: author = "%s (%s)" % (name, yahoo_id) elif name: author = name elif yahoo_id: author = yahoo_id else: author = anti_spam(email) return author ############################################################################### def main(argv=None): parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION) parser.set_defaults( progress=False, host='localhost', user='root', password='', database='sg101_yahoo_group', ) parser.add_option("-s", "--progress", action="store_true", help="Output a . 
after every 100 posts to show progress [default: %default]") parser.add_option("-a", "--host", help="set MySQL host name [default: %default]"), parser.add_option("-u", "--user", help="set MySQL user name [default: %default]") parser.add_option("-p", "--password", help="set MySQL user password [default: %default]"), parser.add_option("-d", "--database", help="set MySQL database name [default: %default]") opts, args = parser.parse_args(args=argv) # connect to the legacy database try: db = MySQLdb.connect(host=opts.host, user=opts.user, passwd=opts.password, db=opts.database, use_unicode=True) except MySQLdb.DatabaseError, e: sys.exit("Can't connect to database: %s" % e) converter = ConvertPosts(db, opts.progress) converter.process() db.close() ############################################################################### if __name__ == '__main__': try: main() except IOError, ex: sys.exit("IO Error: %s" % ex) except KeyboardInterrupt: sys.exit("Control-C interrupt")