bgneal@319: """ bgneal@319: translate_yahoo_group.py - A management command to read the old Yahoo group bgneal@319: posts (in JSON format) and output as a .csv file, suitable for use as an bgneal@319: input to mysqlimport into a database. bgneal@319: bgneal@319: How to import: bgneal@319: mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 sg101_yahoo_group /path/to/post.csv bgneal@319: bgneal@319: """ bgneal@319: from __future__ import with_statement bgneal@319: import csv bgneal@319: import optparse bgneal@319: import datetime bgneal@319: import json bgneal@319: import re bgneal@319: import sys bgneal@319: bgneal@319: bgneal@319: USAGE = "usage: %prog [options] filename1 filename2 ..." bgneal@319: DESCRIPTION = """\ bgneal@319: This command converts the SG101 Yahoo Group posts from JSON and outputs the bgneal@319: data as a .csv file suitable for importing into the new database scheme with bgneal@319: the mysqlimport utility. bgneal@319: """ bgneal@319: bgneal@319: DATE_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})Z") bgneal@319: bgneal@319: # My scraping wasn't perfect and I forgot Yahoo truncated long Yahoo IDs. Provide bgneal@319: # this translation table to fix up the names. bgneal@319: bgneal@319: NAME_EXPANSIONS = { bgneal@319: "atomic_mosqu...": "atomic_mosquitos", bgneal@319: "balsafliesbe...": "balsafliesbetter", bgneal@319: "bambinheadli...": "bambinheadlights", bgneal@319: "batinyourfac...": "batinyourface2002", bgneal@319: "billyblastof...": "billyblastoff805", bgneal@319: "bluegrassban...": "bluegrassbanjobabe", bgneal@319: "bluesbrother...": "bluesbrotherbrian", bgneal@319: "brother_reyn...": "brother_reynolds", bgneal@319: "cannistraro_...": "cannistraro_80303", bgneal@319: "cantemaza_de...": "cantemaza_de_miye", bgneal@319: "captainraven...": "captainravenwood", bgneal@319: "captainsprin...": "captainspringfield", bgneal@319: "colonel_must...": "colonel_mustache", bgneal@319: "cowboy_playi...": "cowboy_playing_dead", bgneal@319: "creature_fea...": "creature_features_fan", bgneal@319: "c_e_l_t_i_c_...": "c_e_l_t_i_c_g_o_t_h", bgneal@319: "dana_lyn_vin...": "dana_lyn_vincent", bgneal@319: "destishtarte...": "destishtartenfort", bgneal@319: "efrem_the_re...": "efrem_the_retarded_rabbit", bgneal@319: "el_capitan_k...": "el_capitan_kosmo", bgneal@319: "epiphonecasi...": "epiphonecasino2001", bgneal@319: "fenderjaguar...": "fenderjaguar1963", bgneal@319: "fenderplayer...": "fenderplayer52886", bgneal@319: "fiberglassro...": "fiberglassrocket", bgneal@319: "formerlyleft...": "formerlylefthanded", bgneal@319: "guildaristoc...": "guildaristocrat62", bgneal@319: "guitardo_mon...": "guitardo_montalbone", bgneal@319: "guitargeekfe...": "guitargeekfestival", bgneal@319: "harrygpelleg...": "harrygpellegrin2003", bgneal@319: "iamtheorigin...": "iamtheoriginaltoblave", bgneal@319: "indiana_surf...": "indiana_surfguitar", bgneal@319: "invisiblesur...": "invisiblesurfers", bgneal@319: "jimmywilseyl...": "jimmywilseylives", bgneal@319: "johnnywrecki...": "johnnywreckingball", bgneal@319: "killers_from...": "killers_from_space", bgneal@319: "los_twang_ma...": "los_twang_marvels", bgneal@319: "midwest_surf...": "midwest_surf_guy", bgneal@319: "murphy_the_s...": "murphy_the_surfie", bgneal@319: "national_dis...": "national_disunion421", bgneal@319: "onlytwothing...": "onlytwothingstodo", bgneal@319: "photog4surfb...": "photog4surfbands", bgneal@319: "poseidonsurf...": "poseidonsurfband", bgneal@319: "reluctantaqu...": "reluctantaquanauts", bgneal@319: "rich_from_th...": "rich_from_the_balboas", bgneal@319: "roland_bette...": "roland_bettenville", bgneal@319: "rvstewartpro...": "rvstewartproductions", bgneal@319: "screamindeat...": "screamindeathmonkey", bgneal@319: "sheckysgotre...": "sheckysgotreverb", bgneal@319: "skatalite_of...": "skatalite_of_love", bgneal@319: "slippy_the_b...": "slippy_the_backflipping_frog", bgneal@319: "somethingabo...": "somethingaboutwendy", bgneal@319: "storm_static...": "storm_static_sleep", bgneal@319: "supertwangre...": "supertwangreverb", bgneal@319: "surfalicious...": "surfaliciousdude1", bgneal@319: "surf_guitar_...": "surf_guitar_lizzy", bgneal@319: "taylorsnyder...": "taylorsnyderband", bgneal@319: "teenageflami...": "teenageflamingovandals", bgneal@319: "thecaveofthe...": "thecaveofthedead", bgneal@319: "tikitakitiki...": "tikitakitikitakitak", bgneal@319: "troublewitht...": "troublewithtreble", bgneal@319: "tumamaentang...": "tumamaentangadeluchador", bgneal@319: "warped_skate...": "warped_skater_182", bgneal@319: "weird_jack_s...": "weird_jack_stinson", bgneal@319: "whatever5050...": "whatever50505050", bgneal@319: "windanseabea...": "windanseabeachboy", bgneal@319: "Wookiee_Smug...": "Wookiee_Smuggler_Drayyken", bgneal@319: } bgneal@319: bgneal@319: def convert_date(s): bgneal@319: """ bgneal@319: Converts the timestamp string of the form "yyyy-mm-ddTHH:MM:SSZ" into a bgneal@319: datetime object. bgneal@319: """ bgneal@319: m = DATE_RE.match(s) bgneal@320: assert m is not None bgneal@319: bgneal@319: parts = [int(part) for part in m.groups()] bgneal@319: return datetime.datetime(year=parts[0], month=parts[1], day=parts[2], bgneal@319: hour=parts[3], minute=parts[4], second=parts[5]) bgneal@319: bgneal@319: bgneal@319: def process_line(line, writer): bgneal@319: """ bgneal@319: Process one line from the JSON file: create a CSV line for the post. bgneal@319: bgneal@319: """ bgneal@319: # Remove trailing newline bgneal@319: line = line.strip() bgneal@319: bgneal@319: # The JSON files consist of one large array of dicts, one dict per line. bgneal@319: # The first line in the file thus has an opening '[' we need to get rid of. bgneal@319: # The last char in each line is either a ',' or a ']', so we need to bgneal@319: # lose the last char as well. That should leave us with just a dict. bgneal@319: if line.startswith('['): bgneal@319: line = line[1:] bgneal@319: line = line[:-1].strip() bgneal@319: parts = json.loads(line) bgneal@319: bgneal@319: # Convert the post date; for some strange reason, some dates were not available bgneal@319: # and the date value will be an empty string. In these cases we'll output a NULL. bgneal@319: post_date = parts['date'] bgneal@319: if not post_date: bgneal@319: post_date = 'NULL' bgneal@319: else: bgneal@319: post_date = str(convert_date(post_date)) bgneal@319: bgneal@319: name = parts['name'] bgneal@319: name = NAME_EXPANSIONS.get(name, name) bgneal@319: bgneal@319: # write the parts as CSV bgneal@319: writer.writerow((parts['msg_id'], bgneal@319: parts['title'].encode('utf-8'), bgneal@319: post_date, bgneal@319: name.encode('utf-8'), bgneal@319: parts['email'].encode('utf-8'), bgneal@319: parts['msg'].encode('utf-8'))) bgneal@319: bgneal@319: bgneal@319: def main(argv=None): bgneal@319: bgneal@319: parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION) bgneal@319: parser.set_defaults( bgneal@319: progress=False, bgneal@319: output='post.csv', bgneal@319: ) bgneal@319: parser.add_option("-p", "--progress", action="store_true", bgneal@319: help="Output a . after every 100 posts to show progress [default: %default]") bgneal@319: parser.add_option("-o", "--output", bgneal@319: help="Name of the output file [default: %default]") bgneal@319: bgneal@319: opts, args = parser.parse_args(args=argv) bgneal@319: bgneal@319: with open(opts.output, "wb") as out_file: bgneal@319: writer = csv.writer(out_file) bgneal@319: bgneal@319: for filename in args: bgneal@319: print "Processing " + filename bgneal@319: with open(filename, "r") as in_file: bgneal@319: posts = 0 bgneal@319: for line in in_file: bgneal@319: process_line(line, writer) bgneal@319: posts += 1 bgneal@319: if opts.progress and posts % 100 == 0: bgneal@319: sys.stdout.write('.') bgneal@319: sys.stdout.flush() bgneal@319: print bgneal@319: bgneal@319: bgneal@319: if __name__ == '__main__': bgneal@319: try: bgneal@319: main() bgneal@319: except IOError, ex: bgneal@319: sys.exit("IO Error: %s" % ex) bgneal@319: except KeyboardInterrupt: bgneal@319: sys.exit("Control-C interrupt")