changeset 319:c5055573e8d7

Added a tool to translate the old Yahoo Group posts from JSON to CSV suitable for mysqlimport.
author Brian Neal <bgneal@gmail.com>
date Tue, 08 Feb 2011 03:42:04 +0000 (2011-02-08)
parents c550933ff5b6
children 66cc2f8c4603
files tools/translate_yahoo_group.py
diffstat 1 files changed, 187 insertions(+), 0 deletions(-) [+]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/translate_yahoo_group.py	Tue Feb 08 03:42:04 2011 +0000
@@ -0,0 +1,187 @@
+"""
+translate_yahoo_group.py - A command-line tool to read the old Yahoo Group
+posts (in JSON format) and output them as a .csv file suitable for loading
+into a database with mysqlimport.
+
+How to import:
+mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 sg101_yahoo_group /path/to/post.csv
+
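+How to run (the input file names below are illustrative):
+python translate_yahoo_group.py --progress --output=post.csv posts1.json posts2.json
+
+Output columns, in order: msg_id, title, date (or the literal NULL), name,
+email, msg. This matches the writer.writerow() call in process_line().
+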
+"""
+from __future__ import with_statement
+import csv
+import optparse
+import datetime
+import json
+import re
+import sys
+
+
+USAGE = "usage: %prog [options] filename1 filename2 ..."
+DESCRIPTION = """\
+This command converts the SG101 Yahoo Group posts from JSON and outputs the
+data as a .csv file suitable for importing into the new database schema with
+the mysqlimport utility.
+"""
+
+DATE_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})Z")
+
+# My scraping wasn't perfect: I forgot that Yahoo truncated long Yahoo IDs.
+# This translation table expands the truncated names to their full forms.
+
+NAME_EXPANSIONS = {
+    "atomic_mosqu...": "atomic_mosquitos",
+    "balsafliesbe...": "balsafliesbetter",
+    "bambinheadli...": "bambinheadlights",
+    "batinyourfac...": "batinyourface2002",
+    "billyblastof...": "billyblastoff805",
+    "bluegrassban...": "bluegrassbanjobabe",
+    "bluesbrother...": "bluesbrotherbrian",
+    "brother_reyn...": "brother_reynolds",
+    "cannistraro_...": "cannistraro_80303",
+    "cantemaza_de...": "cantemaza_de_miye",
+    "captainraven...": "captainravenwood",
+    "captainsprin...": "captainspringfield",
+    "colonel_must...": "colonel_mustache",
+    "cowboy_playi...": "cowboy_playing_dead",
+    "creature_fea...": "creature_features_fan",
+    "c_e_l_t_i_c_...": "c_e_l_t_i_c_g_o_t_h",
+    "dana_lyn_vin...": "dana_lyn_vincent",
+    "destishtarte...": "destishtartenfort",
+    "efrem_the_re...": "efrem_the_retarded_rabbit",
+    "el_capitan_k...": "el_capitan_kosmo",
+    "epiphonecasi...": "epiphonecasino2001",
+    "fenderjaguar...": "fenderjaguar1963",
+    "fenderplayer...": "fenderplayer52886",
+    "fiberglassro...": "fiberglassrocket",
+    "formerlyleft...": "formerlylefthanded",
+    "guildaristoc...": "guildaristocrat62",
+    "guitardo_mon...": "guitardo_montalbone",
+    "guitargeekfe...": "guitargeekfestival",
+    "harrygpelleg...": "harrygpellegrin2003",
+    "iamtheorigin...": "iamtheoriginaltoblave",
+    "indiana_surf...": "indiana_surfguitar",
+    "invisiblesur...": "invisiblesurfers",
+    "jimmywilseyl...": "jimmywilseylives",
+    "johnnywrecki...": "johnnywreckingball",
+    "killers_from...": "killers_from_space",
+    "los_twang_ma...": "los_twang_marvels",
+    "midwest_surf...": "midwest_surf_guy",
+    "murphy_the_s...": "murphy_the_surfie",
+    "national_dis...": "national_disunion421",
+    "onlytwothing...": "onlytwothingstodo",
+    "photog4surfb...": "photog4surfbands",
+    "poseidonsurf...": "poseidonsurfband",
+    "reluctantaqu...": "reluctantaquanauts",
+    "rich_from_th...": "rich_from_the_balboas",
+    "roland_bette...": "roland_bettenville",
+    "rvstewartpro...": "rvstewartproductions",
+    "screamindeat...": "screamindeathmonkey",
+    "sheckysgotre...": "sheckysgotreverb",
+    "skatalite_of...": "skatalite_of_love",
+    "slippy_the_b...": "slippy_the_backflipping_frog",
+    "somethingabo...": "somethingaboutwendy",
+    "storm_static...": "storm_static_sleep",
+    "supertwangre...": "supertwangreverb",
+    "surfalicious...": "surfaliciousdude1",
+    "surf_guitar_...": "surf_guitar_lizzy",
+    "taylorsnyder...": "taylorsnyderband",
+    "teenageflami...": "teenageflamingovandals",
+    "thecaveofthe...": "thecaveofthedead",
+    "tikitakitiki...": "tikitakitikitakitak",
+    "troublewitht...": "troublewithtreble",
+    "tumamaentang...": "tumamaentangadeluchador",
+    "warped_skate...": "warped_skater_182",
+    "weird_jack_s...": "weird_jack_stinson",
+    "whatever5050...": "whatever50505050",
+    "windanseabea...": "windanseabeachboy",
+    "Wookiee_Smug...": "Wookiee_Smuggler_Drayyken",
+}
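+
+# Names not found in the table pass through unchanged; process_line() uses
+# NAME_EXPANSIONS.get(name, name). For example (the second ID is made up):
+#   NAME_EXPANSIONS.get("atomic_mosqu...", "atomic_mosqu...")  # -> "atomic_mosquitos"
+#   NAME_EXPANSIONS.get("shortname", "shortname")              # -> "shortname"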
+
+def convert_date(s):
+    """
+    Converts the timestamp string of the form "yyyy-mm-ddTHH:MM:SSZ" into a
+    datetime object.
+    """
+    m = DATE_RE.match(s)
+    if m is None:
+        raise ValueError("Invalid date string: %s" % s)
+
+    parts = [int(part) for part in m.groups()]
+    return datetime.datetime(year=parts[0], month=parts[1], day=parts[2],
+            hour=parts[3], minute=parts[4], second=parts[5])
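+
+# Example (a quick sanity check): convert_date("2011-02-08T03:42:04Z")
+# returns datetime.datetime(2011, 2, 8, 3, 42, 4).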
+
+
+def process_line(line, writer):
+    """
+    Process one line from the JSON file: create a CSV line for the post.
+
+    """
+    # Remove trailing newline
+    line = line.strip()
+
+    # The JSON files consist of one large array of dicts, one dict per line.
+    # The first line in the file thus has an opening '[' we need to get rid of.
+    # The last char in each line is either a ',' or a ']', so we need to
+    # lose the last char as well. That should leave us with just a dict.
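+    # A made-up example of one such line (keys taken from the writerow call
+    # below; values abbreviated):
+    #   {"msg_id": 42, "title": "...", "date": "2011-02-08T03:42:04Z",
+    #    "name": "...", "email": "...", "msg": "..."},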
+    if line.startswith('['):
+        line = line[1:]
+    line = line[:-1].strip()
+    parts = json.loads(line)
+
+    # Convert the post date. Some dates were not available in the scraped
+    # data; in those cases the date value is an empty string and we output
+    # NULL instead.
+    post_date = parts['date']
+    if not post_date:
+        post_date = 'NULL'
+    else:
+        post_date = str(convert_date(post_date))
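+    # With --fields-escaped-by="" (see the docstring), mysqlimport reads an
+    # unquoted literal NULL field as SQL NULL; csv's minimal quoting leaves
+    # the bare word NULL unquoted.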
+
+    name = parts['name']
+    name = NAME_EXPANSIONS.get(name, name)
+
+    # Write the fields as one CSV row.
+    writer.writerow((parts['msg_id'],
+            parts['title'].encode('utf-8'),
+            post_date,
+            name.encode('utf-8'),
+            parts['email'].encode('utf-8'),
+            parts['msg'].encode('utf-8')))
+
+
+def main(argv=None):
+
+    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
+    parser.set_defaults(
+        progress=False,
+        output='post.csv',
+    )
+    parser.add_option("-p", "--progress", action="store_true",
+        help="Output a . after every 100 posts to show progress [default: %default]")
+    parser.add_option("-o", "--output",
+        help="Name of the output file [default: %default]")
+
+    opts, args = parser.parse_args(args=argv)
+
+    with open(opts.output, "wb") as out_file:
+        writer = csv.writer(out_file)
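+        # The csv module's default dialect (comma-delimited, fields quoted
+        # with '"' only when necessary, \r\n line endings) matches the
+        # mysqlimport flags given in the module docstring.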
+
+        for filename in args:
+            print "Processing " + filename
+            with open(filename, "r") as in_file:
+                posts = 0
+                for line in in_file:
+                    process_line(line, writer)
+                    posts += 1
+                    if opts.progress and posts % 100 == 0:
+                        sys.stdout.write('.')
+                        sys.stdout.flush()
+                print
+
+
+if __name__ == '__main__':
+    try:
+        main()
+    except IOError, ex:
+        sys.exit("IO Error: %s" % ex)
+    except KeyboardInterrupt:
+        sys.exit("Control-C interrupt")