Mercurial > public > sg101
view tools/translate_yahoo_group.py @ 645:99f7917702ca
Fix 081a88b3bfc8, javascript resize of forum images.
Commit 081a88b3bfc8 broke those pages that loaded forums.js but did not load
the imagesLoaded jQuery extension. Now we have arranged it so that only the
forums topic view loads imagesLoaded and put the resizing javascript inline.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Mon, 11 Mar 2013 15:30:25 -0500 |
parents | 66cc2f8c4603 |
children |
line wrap: on
line source
""" translate_yahoo_group.py - A management command to read the old Yahoo group posts (in JSON format) and output as a .csv file, suitable for use as an input to mysqlimport into a database. How to import: mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 sg101_yahoo_group /path/to/post.csv """ from __future__ import with_statement import csv import optparse import datetime import json import re import sys USAGE = "usage: %prog [options] filename1 filename2 ..." DESCRIPTION = """\ This command converts the SG101 Yahoo Group posts from JSON and outputs the data as a .csv file suitable for importing into the new database scheme with the mysqlimport utility. """ DATE_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})Z") # My scraping wasn't perfect and I forgot Yahoo truncated long Yahoo IDs. Provide # this translation table to fix up the names. NAME_EXPANSIONS = { "atomic_mosqu...": "atomic_mosquitos", "balsafliesbe...": "balsafliesbetter", "bambinheadli...": "bambinheadlights", "batinyourfac...": "batinyourface2002", "billyblastof...": "billyblastoff805", "bluegrassban...": "bluegrassbanjobabe", "bluesbrother...": "bluesbrotherbrian", "brother_reyn...": "brother_reynolds", "cannistraro_...": "cannistraro_80303", "cantemaza_de...": "cantemaza_de_miye", "captainraven...": "captainravenwood", "captainsprin...": "captainspringfield", "colonel_must...": "colonel_mustache", "cowboy_playi...": "cowboy_playing_dead", "creature_fea...": "creature_features_fan", "c_e_l_t_i_c_...": "c_e_l_t_i_c_g_o_t_h", "dana_lyn_vin...": "dana_lyn_vincent", "destishtarte...": "destishtartenfort", "efrem_the_re...": "efrem_the_retarded_rabbit", "el_capitan_k...": "el_capitan_kosmo", "epiphonecasi...": "epiphonecasino2001", "fenderjaguar...": "fenderjaguar1963", "fenderplayer...": "fenderplayer52886", "fiberglassro...": "fiberglassrocket", "formerlyleft...": "formerlylefthanded", "guildaristoc...": "guildaristocrat62", "guitardo_mon...": "guitardo_montalbone", "guitargeekfe...": "guitargeekfestival", "harrygpelleg...": "harrygpellegrin2003", "iamtheorigin...": "iamtheoriginaltoblave", "indiana_surf...": "indiana_surfguitar", "invisiblesur...": "invisiblesurfers", "jimmywilseyl...": "jimmywilseylives", "johnnywrecki...": "johnnywreckingball", "killers_from...": "killers_from_space", "los_twang_ma...": "los_twang_marvels", "midwest_surf...": "midwest_surf_guy", "murphy_the_s...": "murphy_the_surfie", "national_dis...": "national_disunion421", "onlytwothing...": "onlytwothingstodo", "photog4surfb...": "photog4surfbands", "poseidonsurf...": "poseidonsurfband", "reluctantaqu...": "reluctantaquanauts", "rich_from_th...": "rich_from_the_balboas", "roland_bette...": "roland_bettenville", "rvstewartpro...": "rvstewartproductions", "screamindeat...": "screamindeathmonkey", "sheckysgotre...": "sheckysgotreverb", "skatalite_of...": "skatalite_of_love", "slippy_the_b...": "slippy_the_backflipping_frog", "somethingabo...": "somethingaboutwendy", "storm_static...": "storm_static_sleep", "supertwangre...": "supertwangreverb", "surfalicious...": "surfaliciousdude1", "surf_guitar_...": "surf_guitar_lizzy", "taylorsnyder...": "taylorsnyderband", "teenageflami...": "teenageflamingovandals", "thecaveofthe...": "thecaveofthedead", "tikitakitiki...": "tikitakitikitakitak", "troublewitht...": "troublewithtreble", "tumamaentang...": "tumamaentangadeluchador", "warped_skate...": "warped_skater_182", "weird_jack_s...": "weird_jack_stinson", "whatever5050...": "whatever50505050", "windanseabea...": "windanseabeachboy", "Wookiee_Smug...": "Wookiee_Smuggler_Drayyken", } def convert_date(s): """ Converts the timestamp string of the form "yyyy-mm-ddTHH:MM:SSZ" into a datetime object. """ m = DATE_RE.match(s) assert m is not None parts = [int(part) for part in m.groups()] return datetime.datetime(year=parts[0], month=parts[1], day=parts[2], hour=parts[3], minute=parts[4], second=parts[5]) def process_line(line, writer): """ Process one line from the JSON file: create a CSV line for the post. """ # Remove trailing newline line = line.strip() # The JSON files consist of one large array of dicts, one dict per line. # The first line in the file thus has an opening '[' we need to get rid of. # The last char in each line is either a ',' or a ']', so we need to # lose the last char as well. That should leave us with just a dict. if line.startswith('['): line = line[1:] line = line[:-1].strip() parts = json.loads(line) # Convert the post date; for some strange reason, some dates were not available # and the date value will be an empty string. In these cases we'll output a NULL. post_date = parts['date'] if not post_date: post_date = 'NULL' else: post_date = str(convert_date(post_date)) name = parts['name'] name = NAME_EXPANSIONS.get(name, name) # write the parts as CSV writer.writerow((parts['msg_id'], parts['title'].encode('utf-8'), post_date, name.encode('utf-8'), parts['email'].encode('utf-8'), parts['msg'].encode('utf-8'))) def main(argv=None): parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION) parser.set_defaults( progress=False, output='post.csv', ) parser.add_option("-p", "--progress", action="store_true", help="Output a . after every 100 posts to show progress [default: %default]") parser.add_option("-o", "--output", help="Name of the output file [default: %default]") opts, args = parser.parse_args(args=argv) with open(opts.output, "wb") as out_file: writer = csv.writer(out_file) for filename in args: print "Processing " + filename with open(filename, "r") as in_file: posts = 0 for line in in_file: process_line(line, writer) posts += 1 if opts.progress and posts % 100 == 0: sys.stdout.write('.') sys.stdout.flush() print if __name__ == '__main__': try: main() except IOError, ex: sys.exit("IO Error: %s" % ex) except KeyboardInterrupt: sys.exit("Control-C interrupt")