Mercurial > public > sg101
view tools/translate_yahoo_group.py @ 693:ad69236e8501
For issue #52, update many 3rd party Javascript libraries.
Updated to jquery 1.10.2, jquery ui 1.10.3.
This broke a lot of stuff.
- Found a newer version of the jquery cycle all plugin (3.0.3).
- Updated JPlayer to 2.4.0.
- Updated to MarkItUp 1.1.14. This also required me to add multiline attributes
set to true on various buttons in the markdown set.
- As per a stackoverflow post, added some code to get multiline titles in
a jQuery UI dialog. They removed that functionality but allow you to put it
back.
Tweaked the MarkItUp preview CSS to show blockquotes in italic.
Did not update TinyMCE at this time. I'm not using the JQuery version and this
version appears to work ok for now.
What I should do is make a repo for MarkItUp and do a vendor branch thing so
I don't have to futz around diffing directories to figure out if I'll lose
changes when I update.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Wed, 04 Sep 2013 19:55:20 -0500 |
parents | 66cc2f8c4603 |
children |
line wrap: on
line source
""" translate_yahoo_group.py - A management command to read the old Yahoo group posts (in JSON format) and output as a .csv file, suitable for use as an input to mysqlimport into a database. How to import: mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 sg101_yahoo_group /path/to/post.csv """ from __future__ import with_statement import csv import optparse import datetime import json import re import sys USAGE = "usage: %prog [options] filename1 filename2 ..." DESCRIPTION = """\ This command converts the SG101 Yahoo Group posts from JSON and outputs the data as a .csv file suitable for importing into the new database scheme with the mysqlimport utility. """ DATE_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})Z") # My scraping wasn't perfect and I forgot Yahoo truncated long Yahoo IDs. Provide # this translation table to fix up the names. NAME_EXPANSIONS = { "atomic_mosqu...": "atomic_mosquitos", "balsafliesbe...": "balsafliesbetter", "bambinheadli...": "bambinheadlights", "batinyourfac...": "batinyourface2002", "billyblastof...": "billyblastoff805", "bluegrassban...": "bluegrassbanjobabe", "bluesbrother...": "bluesbrotherbrian", "brother_reyn...": "brother_reynolds", "cannistraro_...": "cannistraro_80303", "cantemaza_de...": "cantemaza_de_miye", "captainraven...": "captainravenwood", "captainsprin...": "captainspringfield", "colonel_must...": "colonel_mustache", "cowboy_playi...": "cowboy_playing_dead", "creature_fea...": "creature_features_fan", "c_e_l_t_i_c_...": "c_e_l_t_i_c_g_o_t_h", "dana_lyn_vin...": "dana_lyn_vincent", "destishtarte...": "destishtartenfort", "efrem_the_re...": "efrem_the_retarded_rabbit", "el_capitan_k...": "el_capitan_kosmo", "epiphonecasi...": "epiphonecasino2001", "fenderjaguar...": "fenderjaguar1963", "fenderplayer...": "fenderplayer52886", "fiberglassro...": "fiberglassrocket", "formerlyleft...": "formerlylefthanded", "guildaristoc...": "guildaristocrat62", "guitardo_mon...": "guitardo_montalbone", "guitargeekfe...": "guitargeekfestival", "harrygpelleg...": "harrygpellegrin2003", "iamtheorigin...": "iamtheoriginaltoblave", "indiana_surf...": "indiana_surfguitar", "invisiblesur...": "invisiblesurfers", "jimmywilseyl...": "jimmywilseylives", "johnnywrecki...": "johnnywreckingball", "killers_from...": "killers_from_space", "los_twang_ma...": "los_twang_marvels", "midwest_surf...": "midwest_surf_guy", "murphy_the_s...": "murphy_the_surfie", "national_dis...": "national_disunion421", "onlytwothing...": "onlytwothingstodo", "photog4surfb...": "photog4surfbands", "poseidonsurf...": "poseidonsurfband", "reluctantaqu...": "reluctantaquanauts", "rich_from_th...": "rich_from_the_balboas", "roland_bette...": "roland_bettenville", "rvstewartpro...": "rvstewartproductions", "screamindeat...": "screamindeathmonkey", "sheckysgotre...": "sheckysgotreverb", "skatalite_of...": "skatalite_of_love", "slippy_the_b...": "slippy_the_backflipping_frog", "somethingabo...": "somethingaboutwendy", "storm_static...": "storm_static_sleep", "supertwangre...": "supertwangreverb", "surfalicious...": "surfaliciousdude1", "surf_guitar_...": "surf_guitar_lizzy", "taylorsnyder...": "taylorsnyderband", "teenageflami...": "teenageflamingovandals", "thecaveofthe...": "thecaveofthedead", "tikitakitiki...": "tikitakitikitakitak", "troublewitht...": "troublewithtreble", "tumamaentang...": "tumamaentangadeluchador", "warped_skate...": "warped_skater_182", "weird_jack_s...": "weird_jack_stinson", "whatever5050...": "whatever50505050", "windanseabea...": "windanseabeachboy", "Wookiee_Smug...": "Wookiee_Smuggler_Drayyken", } def convert_date(s): """ Converts the timestamp string of the form "yyyy-mm-ddTHH:MM:SSZ" into a datetime object. """ m = DATE_RE.match(s) assert m is not None parts = [int(part) for part in m.groups()] return datetime.datetime(year=parts[0], month=parts[1], day=parts[2], hour=parts[3], minute=parts[4], second=parts[5]) def process_line(line, writer): """ Process one line from the JSON file: create a CSV line for the post. """ # Remove trailing newline line = line.strip() # The JSON files consist of one large array of dicts, one dict per line. # The first line in the file thus has an opening '[' we need to get rid of. # The last char in each line is either a ',' or a ']', so we need to # lose the last char as well. That should leave us with just a dict. if line.startswith('['): line = line[1:] line = line[:-1].strip() parts = json.loads(line) # Convert the post date; for some strange reason, some dates were not available # and the date value will be an empty string. In these cases we'll output a NULL. post_date = parts['date'] if not post_date: post_date = 'NULL' else: post_date = str(convert_date(post_date)) name = parts['name'] name = NAME_EXPANSIONS.get(name, name) # write the parts as CSV writer.writerow((parts['msg_id'], parts['title'].encode('utf-8'), post_date, name.encode('utf-8'), parts['email'].encode('utf-8'), parts['msg'].encode('utf-8'))) def main(argv=None): parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION) parser.set_defaults( progress=False, output='post.csv', ) parser.add_option("-p", "--progress", action="store_true", help="Output a . after every 100 posts to show progress [default: %default]") parser.add_option("-o", "--output", help="Name of the output file [default: %default]") opts, args = parser.parse_args(args=argv) with open(opts.output, "wb") as out_file: writer = csv.writer(out_file) for filename in args: print "Processing " + filename with open(filename, "r") as in_file: posts = 0 for line in in_file: process_line(line, writer) posts += 1 if opts.progress and posts % 100 == 0: sys.stdout.write('.') sys.stdout.flush() print if __name__ == '__main__': try: main() except IOError, ex: sys.exit("IO Error: %s" % ex) except KeyboardInterrupt: sys.exit("Control-C interrupt")