bgneal@319: """
bgneal@319: translate_yahoo_group.py - A management command to read the old Yahoo group
bgneal@319: posts (in JSON format) and output as a .csv file, suitable for use as an
bgneal@319: input to mysqlimport into a database.
bgneal@319: 
bgneal@319: How to import:
bgneal@319: mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 sg101_yahoo_group /path/to/post.csv
bgneal@319: 
bgneal@319: """
from __future__ import with_statement
import csv
import optparse
import datetime
import json
import re
import sys


USAGE = "usage: %prog [options] filename1 filename2 ..."
DESCRIPTION = """\
This command converts the SG101 Yahoo Group posts from JSON and outputs the
data as a .csv file suitable for importing into the new database schema with
the mysqlimport utility.
"""

DATE_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})Z")

# My scraping wasn't perfect, and I forgot that Yahoo truncates long Yahoo
# IDs. This translation table fixes up the truncated names.
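# Unlisted names pass through unchanged: process_line() falls back to the
# scraped name via NAME_EXPANSIONS.get(name, name).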

NAME_EXPANSIONS = {
    "atomic_mosqu...": "atomic_mosquitos",
    "balsafliesbe...": "balsafliesbetter",
    "bambinheadli...": "bambinheadlights",
    "batinyourfac...": "batinyourface2002",
    "billyblastof...": "billyblastoff805",
    "bluegrassban...": "bluegrassbanjobabe",
    "bluesbrother...": "bluesbrotherbrian",
    "brother_reyn...": "brother_reynolds",
    "cannistraro_...": "cannistraro_80303",
    "cantemaza_de...": "cantemaza_de_miye",
    "captainraven...": "captainravenwood",
    "captainsprin...": "captainspringfield",
    "colonel_must...": "colonel_mustache",
    "cowboy_playi...": "cowboy_playing_dead",
    "creature_fea...": "creature_features_fan",
    "c_e_l_t_i_c_...": "c_e_l_t_i_c_g_o_t_h",
    "dana_lyn_vin...": "dana_lyn_vincent",
    "destishtarte...": "destishtartenfort",
    "efrem_the_re...": "efrem_the_retarded_rabbit",
    "el_capitan_k...": "el_capitan_kosmo",
    "epiphonecasi...": "epiphonecasino2001",
    "fenderjaguar...": "fenderjaguar1963",
    "fenderplayer...": "fenderplayer52886",
    "fiberglassro...": "fiberglassrocket",
    "formerlyleft...": "formerlylefthanded",
    "guildaristoc...": "guildaristocrat62",
    "guitardo_mon...": "guitardo_montalbone",
    "guitargeekfe...": "guitargeekfestival",
    "harrygpelleg...": "harrygpellegrin2003",
    "iamtheorigin...": "iamtheoriginaltoblave",
    "indiana_surf...": "indiana_surfguitar",
    "invisiblesur...": "invisiblesurfers",
    "jimmywilseyl...": "jimmywilseylives",
    "johnnywrecki...": "johnnywreckingball",
    "killers_from...": "killers_from_space",
    "los_twang_ma...": "los_twang_marvels",
    "midwest_surf...": "midwest_surf_guy",
    "murphy_the_s...": "murphy_the_surfie",
    "national_dis...": "national_disunion421",
    "onlytwothing...": "onlytwothingstodo",
    "photog4surfb...": "photog4surfbands",
    "poseidonsurf...": "poseidonsurfband",
    "reluctantaqu...": "reluctantaquanauts",
    "rich_from_th...": "rich_from_the_balboas",
    "roland_bette...": "roland_bettenville",
    "rvstewartpro...": "rvstewartproductions",
    "screamindeat...": "screamindeathmonkey",
    "sheckysgotre...": "sheckysgotreverb",
    "skatalite_of...": "skatalite_of_love",
    "slippy_the_b...": "slippy_the_backflipping_frog",
    "somethingabo...": "somethingaboutwendy",
    "storm_static...": "storm_static_sleep",
    "supertwangre...": "supertwangreverb",
    "surfalicious...": "surfaliciousdude1",
    "surf_guitar_...": "surf_guitar_lizzy",
    "taylorsnyder...": "taylorsnyderband",
    "teenageflami...": "teenageflamingovandals",
    "thecaveofthe...": "thecaveofthedead",
    "tikitakitiki...": "tikitakitikitakitak",
    "troublewitht...": "troublewithtreble",
    "tumamaentang...": "tumamaentangadeluchador",
    "warped_skate...": "warped_skater_182",
    "weird_jack_s...": "weird_jack_stinson",
    "whatever5050...": "whatever50505050",
    "windanseabea...": "windanseabeachboy",
    "Wookiee_Smug...": "Wookiee_Smuggler_Drayyken",
}

def convert_date(s):
    """
    Converts the timestamp string of the form "yyyy-mm-ddTHH:MM:SSZ" into a
    datetime object.
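
    For example (illustrative, hypothetical timestamp):

    >>> convert_date("2003-07-04T12:30:05Z")
    datetime.datetime(2003, 7, 4, 12, 30, 5)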
    """
    m = DATE_RE.match(s)
    assert m is not None

    parts = [int(part) for part in m.groups()]
    return datetime.datetime(year=parts[0], month=parts[1], day=parts[2],
            hour=parts[3], minute=parts[4], second=parts[5])


def process_line(line, writer):
    """
    Process one line from the JSON file: create a CSV line for the post.

    """
    # Remove trailing newline
    line = line.strip()

    # The JSON files consist of one large array of dicts, one dict per line.
    # The first line in the file thus has an opening '[' we need to get rid of.
    # The last char in each line is either a ',' or a ']', so we need to
    # lose the last char as well. That should leave us with just a dict.
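    # A typical middle line might look like this (hypothetical values; keys
    # as accessed below):
    #   {"msg_id": 1234, "title": "...", "date": "2003-07-04T12:30:05Z",
    #    "name": "surfer_joe", "email": "...", "msg": "..."},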
    if line.startswith('['):
        line = line[1:]
    line = line[:-1].strip()
    parts = json.loads(line)

    # Convert the post date. For some strange reason, some dates were not
    # available; in those cases the date value is an empty string, and we
    # output NULL instead.
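    # (Relying on the mysqlimport command in the module docstring: with
    # --fields-escaped-by="" an unquoted field containing the bare word NULL
    # is normally loaded as SQL NULL. This is an assumption about LOAD DATA's
    # behavior, not something this script checks.)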
    post_date = parts['date']
    if not post_date:
        post_date = 'NULL'
    else:
        post_date = str(convert_date(post_date))

    name = parts['name']
    name = NAME_EXPANSIONS.get(name, name)

    # write the parts as CSV
    writer.writerow((parts['msg_id'],
            parts['title'].encode('utf-8'),
            post_date,
            name.encode('utf-8'),
            parts['email'].encode('utf-8'),
            parts['msg'].encode('utf-8')))


def main(argv=None):

    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
    parser.set_defaults(
        progress=False,
        output='post.csv',
    )
    parser.add_option("-p", "--progress", action="store_true",
        help="Output a . after every 100 posts to show progress [default: %default]")
    parser.add_option("-o", "--output",
        help="Name of the output file [default: %default]")

    opts, args = parser.parse_args(args=argv)

    with open(opts.output, "wb") as out_file:
        writer = csv.writer(out_file)

        for filename in args:
            print "Processing " + filename
            with open(filename, "r") as in_file:
                posts = 0
                for line in in_file:
                    process_line(line, writer)
                    posts += 1
                    if opts.progress and posts % 100 == 0:
                        sys.stdout.write('.')
                        sys.stdout.flush()
                print


if __name__ == '__main__':
    try:
        main()
    except IOError as ex:
        sys.exit("IO Error: %s" % ex)
    except KeyboardInterrupt:
        sys.exit("Control-C interrupt")