changeset 319:c5055573e8d7
Added a tool to translate the old Yahoo Group posts from JSON to CSV suitable for mysqlimport.
author      Brian Neal <bgneal@gmail.com>
date        Tue, 08 Feb 2011 03:42:04 +0000 (2011-02-08)
parents     c550933ff5b6
children    66cc2f8c4603
files       tools/translate_yahoo_group.py
diffstat    1 files changed, 187 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/translate_yahoo_group.py	Tue Feb 08 03:42:04 2011 +0000
@@ -0,0 +1,187 @@
+"""
+translate_yahoo_group.py - A command line tool to read the old Yahoo group
+posts (in JSON format) and output them as a .csv file, suitable for use as
+input to mysqlimport into a database.
+
+How to import:
+mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 sg101_yahoo_group /path/to/post.csv
+
+"""
+from __future__ import with_statement
+import csv
+import optparse
+import datetime
+import json
+import re
+import sys
+
+
+USAGE = "usage: %prog [options] filename1 filename2 ..."
+DESCRIPTION = """\
+This command converts the SG101 Yahoo Group posts from JSON and outputs the
+data as a .csv file suitable for importing into the new database schema with
+the mysqlimport utility.
+"""
+
+DATE_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})Z")
+
+# My scraping wasn't perfect, and I forgot that Yahoo truncates long Yahoo
+# IDs. Provide this translation table to fix up the truncated names.
+
+NAME_EXPANSIONS = {
+    "atomic_mosqu...": "atomic_mosquitos",
+    "balsafliesbe...": "balsafliesbetter",
+    "bambinheadli...": "bambinheadlights",
+    "batinyourfac...": "batinyourface2002",
+    "billyblastof...": "billyblastoff805",
+    "bluegrassban...": "bluegrassbanjobabe",
+    "bluesbrother...": "bluesbrotherbrian",
+    "brother_reyn...": "brother_reynolds",
+    "cannistraro_...": "cannistraro_80303",
+    "cantemaza_de...": "cantemaza_de_miye",
+    "captainraven...": "captainravenwood",
+    "captainsprin...": "captainspringfield",
+    "colonel_must...": "colonel_mustache",
+    "cowboy_playi...": "cowboy_playing_dead",
+    "creature_fea...": "creature_features_fan",
+    "c_e_l_t_i_c_...": "c_e_l_t_i_c_g_o_t_h",
+    "dana_lyn_vin...": "dana_lyn_vincent",
+    "destishtarte...": "destishtartenfort",
+    "efrem_the_re...": "efrem_the_retarded_rabbit",
+    "el_capitan_k...": "el_capitan_kosmo",
+    "epiphonecasi...": "epiphonecasino2001",
+    "fenderjaguar...": "fenderjaguar1963",
+    "fenderplayer...": "fenderplayer52886",
+    "fiberglassro...": "fiberglassrocket",
+    "formerlyleft...": "formerlylefthanded",
+    "guildaristoc...": "guildaristocrat62",
+    "guitardo_mon...": "guitardo_montalbone",
+    "guitargeekfe...": "guitargeekfestival",
+    "harrygpelleg...": "harrygpellegrin2003",
+    "iamtheorigin...": "iamtheoriginaltoblave",
+    "indiana_surf...": "indiana_surfguitar",
+    "invisiblesur...": "invisiblesurfers",
+    "jimmywilseyl...": "jimmywilseylives",
+    "johnnywrecki...": "johnnywreckingball",
+    "killers_from...": "killers_from_space",
+    "los_twang_ma...": "los_twang_marvels",
+    "midwest_surf...": "midwest_surf_guy",
+    "murphy_the_s...": "murphy_the_surfie",
+    "national_dis...": "national_disunion421",
+    "onlytwothing...": "onlytwothingstodo",
+    "photog4surfb...": "photog4surfbands",
+    "poseidonsurf...": "poseidonsurfband",
+    "reluctantaqu...": "reluctantaquanauts",
+    "rich_from_th...": "rich_from_the_balboas",
+    "roland_bette...": "roland_bettenville",
+    "rvstewartpro...": "rvstewartproductions",
+    "screamindeat...": "screamindeathmonkey",
+    "sheckysgotre...": "sheckysgotreverb",
+    "skatalite_of...": "skatalite_of_love",
+    "slippy_the_b...": "slippy_the_backflipping_frog",
+    "somethingabo...": "somethingaboutwendy",
+    "storm_static...": "storm_static_sleep",
+    "supertwangre...": "supertwangreverb",
+    "surfalicious...": "surfaliciousdude1",
+    "surf_guitar_...": "surf_guitar_lizzy",
+    "taylorsnyder...": "taylorsnyderband",
+    "teenageflami...": "teenageflamingovandals",
+    "thecaveofthe...": "thecaveofthedead",
+    "tikitakitiki...": "tikitakitikitakitak",
+    "troublewitht...": "troublewithtreble",
+    "tumamaentang...": "tumamaentangadeluchador",
+    "warped_skate...": "warped_skater_182",
+    "weird_jack_s...": "weird_jack_stinson",
+    "whatever5050...": "whatever50505050",
+    "windanseabea...": "windanseabeachboy",
+    "Wookiee_Smug...": "Wookiee_Smuggler_Drayyken",
+}
+
+
+def convert_date(s):
+    """
+    Converts a timestamp string of the form "yyyy-mm-ddTHH:MM:SSZ" into a
+    datetime object.
+
+    """
+    m = DATE_RE.match(s)
+    if m is None:
+        raise ValueError("Invalid date string: %s" % s)
+
+    parts = [int(part) for part in m.groups()]
+    return datetime.datetime(year=parts[0], month=parts[1], day=parts[2],
+                             hour=parts[3], minute=parts[4], second=parts[5])
+
+
+def process_line(line, writer):
+    """
+    Process one line from the JSON file: create a CSV line for the post.
+
+    """
+    # Remove the trailing newline
+    line = line.strip()
+
+    # The JSON files consist of one large array of dicts, one dict per line.
+    # The first line in the file thus has an opening '[' we need to get rid
+    # of. The last char in each line is either a ',' or a ']', so we need to
+    # lose the last char as well. That should leave us with just a dict.
+    if line.startswith('['):
+        line = line[1:]
+    line = line[:-1].strip()
+    parts = json.loads(line)
+
+    # Convert the post date. For some strange reason, some dates were not
+    # available, and in those cases the date value is an empty string; output
+    # a NULL instead.
+    post_date = parts['date']
+    if not post_date:
+        post_date = 'NULL'
+    else:
+        post_date = str(convert_date(post_date))
+
+    # Expand the author name if Yahoo truncated it
+    name = parts['name']
+    name = NAME_EXPANSIONS.get(name, name)
+
+    # Write the parts out as one CSV row
+    writer.writerow((parts['msg_id'],
+                     parts['title'].encode('utf-8'),
+                     post_date,
+                     name.encode('utf-8'),
+                     parts['email'].encode('utf-8'),
+                     parts['msg'].encode('utf-8')))
+
+
+def main(argv=None):
+    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
+    parser.set_defaults(
+        progress=False,
+        output='post.csv',
+    )
+    parser.add_option("-p", "--progress", action="store_true",
+        help="Output a . after every 100 posts to show progress "
+             "[default: %default]")
+    parser.add_option("-o", "--output",
+        help="Name of the output file [default: %default]")
+
+    opts, args = parser.parse_args(args=argv)
+
+    with open(opts.output, "wb") as out_file:
+        writer = csv.writer(out_file)
+
+        for filename in args:
+            print "Processing " + filename
+            with open(filename, "r") as in_file:
+                posts = 0
+                for line in in_file:
+                    process_line(line, writer)
+                    posts += 1
+                    if opts.progress and posts % 100 == 0:
+                        sys.stdout.write('.')
+                        sys.stdout.flush()
+            print
+
+
+if __name__ == '__main__':
+    try:
+        main()
+    except IOError, ex:
+        sys.exit("IO Error: %s" % ex)
+    except KeyboardInterrupt:
+        sys.exit("Control-C interrupt")