Mercurial > public > sg101
view tools/translate_yahoo_group.py @ 467:b910cc1460c8
Add the ability to conditionally add model instances to the search index on update. This is not perfect, as some instances should be deleted from the index if they are updated such that they should not be in the index anymore. Will think about and address that later.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sun, 24 Jul 2011 18:12:20 +0000 |
parents | 66cc2f8c4603 |
children |
line wrap: on
line source
""" translate_yahoo_group.py - A management command to read the old Yahoo group posts (in JSON format) and output as a .csv file, suitable for use as an input to mysqlimport into a database. How to import: mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 sg101_yahoo_group /path/to/post.csv """ from __future__ import with_statement import csv import optparse import datetime import json import re import sys USAGE = "usage: %prog [options] filename1 filename2 ..." DESCRIPTION = """\ This command converts the SG101 Yahoo Group posts from JSON and outputs the data as a .csv file suitable for importing into the new database scheme with the mysqlimport utility. """ DATE_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})Z") # My scraping wasn't perfect and I forgot Yahoo truncated long Yahoo IDs. Provide # this translation table to fix up the names. NAME_EXPANSIONS = { "atomic_mosqu...": "atomic_mosquitos", "balsafliesbe...": "balsafliesbetter", "bambinheadli...": "bambinheadlights", "batinyourfac...": "batinyourface2002", "billyblastof...": "billyblastoff805", "bluegrassban...": "bluegrassbanjobabe", "bluesbrother...": "bluesbrotherbrian", "brother_reyn...": "brother_reynolds", "cannistraro_...": "cannistraro_80303", "cantemaza_de...": "cantemaza_de_miye", "captainraven...": "captainravenwood", "captainsprin...": "captainspringfield", "colonel_must...": "colonel_mustache", "cowboy_playi...": "cowboy_playing_dead", "creature_fea...": "creature_features_fan", "c_e_l_t_i_c_...": "c_e_l_t_i_c_g_o_t_h", "dana_lyn_vin...": "dana_lyn_vincent", "destishtarte...": "destishtartenfort", "efrem_the_re...": "efrem_the_retarded_rabbit", "el_capitan_k...": "el_capitan_kosmo", "epiphonecasi...": "epiphonecasino2001", "fenderjaguar...": "fenderjaguar1963", "fenderplayer...": "fenderplayer52886", "fiberglassro...": "fiberglassrocket", "formerlyleft...": "formerlylefthanded", "guildaristoc...": "guildaristocrat62", "guitardo_mon...": "guitardo_montalbone", "guitargeekfe...": "guitargeekfestival", "harrygpelleg...": "harrygpellegrin2003", "iamtheorigin...": "iamtheoriginaltoblave", "indiana_surf...": "indiana_surfguitar", "invisiblesur...": "invisiblesurfers", "jimmywilseyl...": "jimmywilseylives", "johnnywrecki...": "johnnywreckingball", "killers_from...": "killers_from_space", "los_twang_ma...": "los_twang_marvels", "midwest_surf...": "midwest_surf_guy", "murphy_the_s...": "murphy_the_surfie", "national_dis...": "national_disunion421", "onlytwothing...": "onlytwothingstodo", "photog4surfb...": "photog4surfbands", "poseidonsurf...": "poseidonsurfband", "reluctantaqu...": "reluctantaquanauts", "rich_from_th...": "rich_from_the_balboas", "roland_bette...": "roland_bettenville", "rvstewartpro...": "rvstewartproductions", "screamindeat...": "screamindeathmonkey", "sheckysgotre...": "sheckysgotreverb", "skatalite_of...": "skatalite_of_love", "slippy_the_b...": "slippy_the_backflipping_frog", "somethingabo...": "somethingaboutwendy", "storm_static...": "storm_static_sleep", "supertwangre...": "supertwangreverb", "surfalicious...": "surfaliciousdude1", "surf_guitar_...": "surf_guitar_lizzy", "taylorsnyder...": "taylorsnyderband", "teenageflami...": "teenageflamingovandals", "thecaveofthe...": "thecaveofthedead", "tikitakitiki...": "tikitakitikitakitak", "troublewitht...": "troublewithtreble", "tumamaentang...": "tumamaentangadeluchador", "warped_skate...": "warped_skater_182", "weird_jack_s...": "weird_jack_stinson", "whatever5050...": "whatever50505050", "windanseabea...": "windanseabeachboy", "Wookiee_Smug...": "Wookiee_Smuggler_Drayyken", } def convert_date(s): """ Converts the timestamp string of the form "yyyy-mm-ddTHH:MM:SSZ" into a datetime object. """ m = DATE_RE.match(s) assert m is not None parts = [int(part) for part in m.groups()] return datetime.datetime(year=parts[0], month=parts[1], day=parts[2], hour=parts[3], minute=parts[4], second=parts[5]) def process_line(line, writer): """ Process one line from the JSON file: create a CSV line for the post. """ # Remove trailing newline line = line.strip() # The JSON files consist of one large array of dicts, one dict per line. # The first line in the file thus has an opening '[' we need to get rid of. # The last char in each line is either a ',' or a ']', so we need to # lose the last char as well. That should leave us with just a dict. if line.startswith('['): line = line[1:] line = line[:-1].strip() parts = json.loads(line) # Convert the post date; for some strange reason, some dates were not available # and the date value will be an empty string. In these cases we'll output a NULL. post_date = parts['date'] if not post_date: post_date = 'NULL' else: post_date = str(convert_date(post_date)) name = parts['name'] name = NAME_EXPANSIONS.get(name, name) # write the parts as CSV writer.writerow((parts['msg_id'], parts['title'].encode('utf-8'), post_date, name.encode('utf-8'), parts['email'].encode('utf-8'), parts['msg'].encode('utf-8'))) def main(argv=None): parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION) parser.set_defaults( progress=False, output='post.csv', ) parser.add_option("-p", "--progress", action="store_true", help="Output a . after every 100 posts to show progress [default: %default]") parser.add_option("-o", "--output", help="Name of the output file [default: %default]") opts, args = parser.parse_args(args=argv) with open(opts.output, "wb") as out_file: writer = csv.writer(out_file) for filename in args: print "Processing " + filename with open(filename, "r") as in_file: posts = 0 for line in in_file: process_line(line, writer) posts += 1 if opts.progress and posts % 100 == 0: sys.stdout.write('.') sys.stdout.flush() print if __name__ == '__main__': try: main() except IOError, ex: sys.exit("IO Error: %s" % ex) except KeyboardInterrupt: sys.exit("Control-C interrupt")