view tools/translate_yahoo_group.py @ 334:6805d15cda13

Adding a script I had to write on the fly to filter out posts that had no parent topics from the posts .csv file. MyISAM let me get away with that, but InnoDB won't.
author Brian Neal <bgneal@gmail.com>
date Sat, 26 Feb 2011 01:28:22 +0000
parents 66cc2f8c4603
"""
translate_yahoo_group.py - A management command to read the old Yahoo group
posts (in JSON format) and output as a .csv file, suitable for use as an
input to mysqlimport into a database.

How to import:
mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 sg101_yahoo_group /path/to/post.csv
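
Note: mysqlimport derives the target table name from the .csv file's base
name, so post.csv above is loaded into a table named "post" in the
sg101_yahoo_group database.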

"""
from __future__ import with_statement
import csv
import optparse
import datetime
import json
import re
import sys


USAGE = "usage: %prog [options] filename1 filename2 ..."
DESCRIPTION = """\
This command converts the SG101 Yahoo Group posts from JSON and outputs the
data as a .csv file suitable for importing into the new database scheme with
the mysqlimport utility.
"""

DATE_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})Z")

# My scraping wasn't perfect: I forgot that Yahoo truncates long Yahoo IDs.
# Provide this translation table to fix up the truncated names.

NAME_EXPANSIONS = {
    "atomic_mosqu...": "atomic_mosquitos",
    "balsafliesbe...": "balsafliesbetter",
    "bambinheadli...": "bambinheadlights",
    "batinyourfac...": "batinyourface2002",
    "billyblastof...": "billyblastoff805",
    "bluegrassban...": "bluegrassbanjobabe",
    "bluesbrother...": "bluesbrotherbrian",
    "brother_reyn...": "brother_reynolds",
    "cannistraro_...": "cannistraro_80303",
    "cantemaza_de...": "cantemaza_de_miye",
    "captainraven...": "captainravenwood",
    "captainsprin...": "captainspringfield",
    "colonel_must...": "colonel_mustache",
    "cowboy_playi...": "cowboy_playing_dead",
    "creature_fea...": "creature_features_fan",
    "c_e_l_t_i_c_...": "c_e_l_t_i_c_g_o_t_h",
    "dana_lyn_vin...": "dana_lyn_vincent",
    "destishtarte...": "destishtartenfort",
    "efrem_the_re...": "efrem_the_retarded_rabbit",
    "el_capitan_k...": "el_capitan_kosmo",
    "epiphonecasi...": "epiphonecasino2001",
    "fenderjaguar...": "fenderjaguar1963",
    "fenderplayer...": "fenderplayer52886",
    "fiberglassro...": "fiberglassrocket",
    "formerlyleft...": "formerlylefthanded",
    "guildaristoc...": "guildaristocrat62",
    "guitardo_mon...": "guitardo_montalbone",
    "guitargeekfe...": "guitargeekfestival",
    "harrygpelleg...": "harrygpellegrin2003",
    "iamtheorigin...": "iamtheoriginaltoblave",
    "indiana_surf...": "indiana_surfguitar",
    "invisiblesur...": "invisiblesurfers",
    "jimmywilseyl...": "jimmywilseylives",
    "johnnywrecki...": "johnnywreckingball",
    "killers_from...": "killers_from_space",
    "los_twang_ma...": "los_twang_marvels",
    "midwest_surf...": "midwest_surf_guy",
    "murphy_the_s...": "murphy_the_surfie",
    "national_dis...": "national_disunion421",
    "onlytwothing...": "onlytwothingstodo",
    "photog4surfb...": "photog4surfbands",
    "poseidonsurf...": "poseidonsurfband",
    "reluctantaqu...": "reluctantaquanauts",
    "rich_from_th...": "rich_from_the_balboas",
    "roland_bette...": "roland_bettenville",
    "rvstewartpro...": "rvstewartproductions",
    "screamindeat...": "screamindeathmonkey",
    "sheckysgotre...": "sheckysgotreverb",
    "skatalite_of...": "skatalite_of_love",
    "slippy_the_b...": "slippy_the_backflipping_frog",
    "somethingabo...": "somethingaboutwendy",
    "storm_static...": "storm_static_sleep",
    "supertwangre...": "supertwangreverb",
    "surfalicious...": "surfaliciousdude1",
    "surf_guitar_...": "surf_guitar_lizzy",
    "taylorsnyder...": "taylorsnyderband",
    "teenageflami...": "teenageflamingovandals",
    "thecaveofthe...": "thecaveofthedead",
    "tikitakitiki...": "tikitakitikitakitak",
    "troublewitht...": "troublewithtreble",
    "tumamaentang...": "tumamaentangadeluchador",
    "warped_skate...": "warped_skater_182",
    "weird_jack_s...": "weird_jack_stinson",
    "whatever5050...": "whatever50505050",
    "windanseabea...": "windanseabeachboy",
    "Wookiee_Smug...": "Wookiee_Smuggler_Drayyken",
}

def convert_date(s):
    """
    Converts the timestamp string of the form "yyyy-mm-ddTHH:MM:SSZ" into a
    datetime object.
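
    A quick example:

    >>> convert_date("2011-02-26T01:28:22Z")
    datetime.datetime(2011, 2, 26, 1, 28, 22)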
    """
    m = DATE_RE.match(s)
    assert m is not None

    parts = [int(part) for part in m.groups()]
    return datetime.datetime(year=parts[0], month=parts[1], day=parts[2],
            hour=parts[3], minute=parts[4], second=parts[5])


def process_line(line, writer):
    """
    Process one line from the JSON file: create a CSV line for the post.

    """
    # Remove trailing newline
    line = line.strip()

    # The JSON files consist of one large array of dicts, one dict per line.
    # The first line in the file thus has an opening '[' we need to get rid of.
    # The last char in each line is either a ',' or a ']', so we need to 
    # lose the last char as well. That should leave us with just a dict.
    if line.startswith('['):
        line = line[1:]
    line = line[:-1].strip()
    if not line:
        # Guard against blank lines or a closing ']' on a line by itself.
        return
    parts = json.loads(line)

    # Convert the post date. For some strange reason, some dates were not
    # available; in those cases the date value is an empty string, and we
    # output NULL instead.
    post_date = parts['date']
    if not post_date:
        post_date = 'NULL'
    else:
        post_date = str(convert_date(post_date))

    name = parts['name']
    name = NAME_EXPANSIONS.get(name, name)

    # write the parts as CSV
    writer.writerow((parts['msg_id'],
            parts['title'].encode('utf-8'),
            post_date,
            name.encode('utf-8'),
            parts['email'].encode('utf-8'),
            parts['msg'].encode('utf-8')))


def main(argv=None):

    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
    parser.set_defaults(
        progress=False,
        output='post.csv',
    )
    parser.add_option("-p", "--progress", action="store_true",
        help="Output a . after every 100 posts to show progress [default: %default]")
    parser.add_option("-o", "--output",
        help="Name of the output file [default: %default]")

    opts, args = parser.parse_args(args=argv)
    if not args:
        parser.error("at least one input filename is required")

    with open(opts.output, "wb") as out_file:
        writer = csv.writer(out_file)

        for filename in args:
            print "Processing " + filename
            with open(filename, "r") as in_file:
                posts = 0
                for line in in_file:
                    process_line(line, writer)
                    posts += 1
                    if opts.progress and posts % 100 == 0:
                        sys.stdout.write('.')
                        sys.stdout.flush()
                print


if __name__ == '__main__':
    try:
        main()
    except IOError, ex:
        sys.exit("IO Error: %s" % ex)
    except KeyboardInterrupt:
        sys.exit("Control-C interrupt")