annotate tools/translate_yahoo_group.py @ 989:2908859c2fe4

Smilies now use relative links. This is for upcoming switch to SSL. Currently we do not need absolute URLs for smilies. If this changes we can add it later.
author Brian Neal <bgneal@gmail.com>
date Thu, 29 Oct 2015 20:54:34 -0500
parents 66cc2f8c4603
children
rev   line source
bgneal@319 1 """
bgneal@319 2 translate_yahoo_group.py - A management command to read the old Yahoo group
bgneal@319 3 posts (in JSON format) and output as a .csv file, suitable for use as an
bgneal@319 4 input to mysqlimport into a database.
bgneal@319 5
bgneal@319 6 How to import:
bgneal@319 7 mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 sg101_yahoo_group /path/to/post.csv
bgneal@319 8
bgneal@319 9 """
bgneal@319 10 from __future__ import with_statement
bgneal@319 11 import csv
bgneal@319 12 import optparse
bgneal@319 13 import datetime
bgneal@319 14 import json
bgneal@319 15 import re
bgneal@319 16 import sys
bgneal@319 17
bgneal@319 18
bgneal@319 19 USAGE = "usage: %prog [options] filename1 filename2 ..."
bgneal@319 20 DESCRIPTION = """\
bgneal@319 21 This command converts the SG101 Yahoo Group posts from JSON and outputs the
bgneal@319 22 data as a .csv file suitable for importing into the new database scheme with
bgneal@319 23 the mysqlimport utility.
bgneal@319 24 """
bgneal@319 25
bgneal@319 26 DATE_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})Z")
bgneal@319 27
bgneal@319 28 # My scraping wasn't perfect and I forgot Yahoo truncated long Yahoo IDs. Provide
bgneal@319 29 # this translation table to fix up the names.
bgneal@319 30
bgneal@319 31 NAME_EXPANSIONS = {
bgneal@319 32 "atomic_mosqu...": "atomic_mosquitos",
bgneal@319 33 "balsafliesbe...": "balsafliesbetter",
bgneal@319 34 "bambinheadli...": "bambinheadlights",
bgneal@319 35 "batinyourfac...": "batinyourface2002",
bgneal@319 36 "billyblastof...": "billyblastoff805",
bgneal@319 37 "bluegrassban...": "bluegrassbanjobabe",
bgneal@319 38 "bluesbrother...": "bluesbrotherbrian",
bgneal@319 39 "brother_reyn...": "brother_reynolds",
bgneal@319 40 "cannistraro_...": "cannistraro_80303",
bgneal@319 41 "cantemaza_de...": "cantemaza_de_miye",
bgneal@319 42 "captainraven...": "captainravenwood",
bgneal@319 43 "captainsprin...": "captainspringfield",
bgneal@319 44 "colonel_must...": "colonel_mustache",
bgneal@319 45 "cowboy_playi...": "cowboy_playing_dead",
bgneal@319 46 "creature_fea...": "creature_features_fan",
bgneal@319 47 "c_e_l_t_i_c_...": "c_e_l_t_i_c_g_o_t_h",
bgneal@319 48 "dana_lyn_vin...": "dana_lyn_vincent",
bgneal@319 49 "destishtarte...": "destishtartenfort",
bgneal@319 50 "efrem_the_re...": "efrem_the_retarded_rabbit",
bgneal@319 51 "el_capitan_k...": "el_capitan_kosmo",
bgneal@319 52 "epiphonecasi...": "epiphonecasino2001",
bgneal@319 53 "fenderjaguar...": "fenderjaguar1963",
bgneal@319 54 "fenderplayer...": "fenderplayer52886",
bgneal@319 55 "fiberglassro...": "fiberglassrocket",
bgneal@319 56 "formerlyleft...": "formerlylefthanded",
bgneal@319 57 "guildaristoc...": "guildaristocrat62",
bgneal@319 58 "guitardo_mon...": "guitardo_montalbone",
bgneal@319 59 "guitargeekfe...": "guitargeekfestival",
bgneal@319 60 "harrygpelleg...": "harrygpellegrin2003",
bgneal@319 61 "iamtheorigin...": "iamtheoriginaltoblave",
bgneal@319 62 "indiana_surf...": "indiana_surfguitar",
bgneal@319 63 "invisiblesur...": "invisiblesurfers",
bgneal@319 64 "jimmywilseyl...": "jimmywilseylives",
bgneal@319 65 "johnnywrecki...": "johnnywreckingball",
bgneal@319 66 "killers_from...": "killers_from_space",
bgneal@319 67 "los_twang_ma...": "los_twang_marvels",
bgneal@319 68 "midwest_surf...": "midwest_surf_guy",
bgneal@319 69 "murphy_the_s...": "murphy_the_surfie",
bgneal@319 70 "national_dis...": "national_disunion421",
bgneal@319 71 "onlytwothing...": "onlytwothingstodo",
bgneal@319 72 "photog4surfb...": "photog4surfbands",
bgneal@319 73 "poseidonsurf...": "poseidonsurfband",
bgneal@319 74 "reluctantaqu...": "reluctantaquanauts",
bgneal@319 75 "rich_from_th...": "rich_from_the_balboas",
bgneal@319 76 "roland_bette...": "roland_bettenville",
bgneal@319 77 "rvstewartpro...": "rvstewartproductions",
bgneal@319 78 "screamindeat...": "screamindeathmonkey",
bgneal@319 79 "sheckysgotre...": "sheckysgotreverb",
bgneal@319 80 "skatalite_of...": "skatalite_of_love",
bgneal@319 81 "slippy_the_b...": "slippy_the_backflipping_frog",
bgneal@319 82 "somethingabo...": "somethingaboutwendy",
bgneal@319 83 "storm_static...": "storm_static_sleep",
bgneal@319 84 "supertwangre...": "supertwangreverb",
bgneal@319 85 "surfalicious...": "surfaliciousdude1",
bgneal@319 86 "surf_guitar_...": "surf_guitar_lizzy",
bgneal@319 87 "taylorsnyder...": "taylorsnyderband",
bgneal@319 88 "teenageflami...": "teenageflamingovandals",
bgneal@319 89 "thecaveofthe...": "thecaveofthedead",
bgneal@319 90 "tikitakitiki...": "tikitakitikitakitak",
bgneal@319 91 "troublewitht...": "troublewithtreble",
bgneal@319 92 "tumamaentang...": "tumamaentangadeluchador",
bgneal@319 93 "warped_skate...": "warped_skater_182",
bgneal@319 94 "weird_jack_s...": "weird_jack_stinson",
bgneal@319 95 "whatever5050...": "whatever50505050",
bgneal@319 96 "windanseabea...": "windanseabeachboy",
bgneal@319 97 "Wookiee_Smug...": "Wookiee_Smuggler_Drayyken",
bgneal@319 98 }
bgneal@319 99
bgneal@319 100 def convert_date(s):
bgneal@319 101 """
bgneal@319 102 Converts the timestamp string of the form "yyyy-mm-ddTHH:MM:SSZ" into a
bgneal@319 103 datetime object.
bgneal@319 104 """
bgneal@319 105 m = DATE_RE.match(s)
bgneal@320 106 assert m is not None
bgneal@319 107
bgneal@319 108 parts = [int(part) for part in m.groups()]
bgneal@319 109 return datetime.datetime(year=parts[0], month=parts[1], day=parts[2],
bgneal@319 110 hour=parts[3], minute=parts[4], second=parts[5])
bgneal@319 111
bgneal@319 112
bgneal@319 113 def process_line(line, writer):
bgneal@319 114 """
bgneal@319 115 Process one line from the JSON file: create a CSV line for the post.
bgneal@319 116
bgneal@319 117 """
bgneal@319 118 # Remove trailing newline
bgneal@319 119 line = line.strip()
bgneal@319 120
bgneal@319 121 # The JSON files consist of one large array of dicts, one dict per line.
bgneal@319 122 # The first line in the file thus has an opening '[' we need to get rid of.
bgneal@319 123 # The last char in each line is either a ',' or a ']', so we need to
bgneal@319 124 # lose the last char as well. That should leave us with just a dict.
bgneal@319 125 if line.startswith('['):
bgneal@319 126 line = line[1:]
bgneal@319 127 line = line[:-1].strip()
bgneal@319 128 parts = json.loads(line)
bgneal@319 129
bgneal@319 130 # Convert the post date; for some strange reason, some dates were not available
bgneal@319 131 # and the date value will be an empty string. In these cases we'll output a NULL.
bgneal@319 132 post_date = parts['date']
bgneal@319 133 if not post_date:
bgneal@319 134 post_date = 'NULL'
bgneal@319 135 else:
bgneal@319 136 post_date = str(convert_date(post_date))
bgneal@319 137
bgneal@319 138 name = parts['name']
bgneal@319 139 name = NAME_EXPANSIONS.get(name, name)
bgneal@319 140
bgneal@319 141 # write the parts as CSV
bgneal@319 142 writer.writerow((parts['msg_id'],
bgneal@319 143 parts['title'].encode('utf-8'),
bgneal@319 144 post_date,
bgneal@319 145 name.encode('utf-8'),
bgneal@319 146 parts['email'].encode('utf-8'),
bgneal@319 147 parts['msg'].encode('utf-8')))
bgneal@319 148
bgneal@319 149
bgneal@319 150 def main(argv=None):
bgneal@319 151
bgneal@319 152 parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
bgneal@319 153 parser.set_defaults(
bgneal@319 154 progress=False,
bgneal@319 155 output='post.csv',
bgneal@319 156 )
bgneal@319 157 parser.add_option("-p", "--progress", action="store_true",
bgneal@319 158 help="Output a . after every 100 posts to show progress [default: %default]")
bgneal@319 159 parser.add_option("-o", "--output",
bgneal@319 160 help="Name of the output file [default: %default]")
bgneal@319 161
bgneal@319 162 opts, args = parser.parse_args(args=argv)
bgneal@319 163
bgneal@319 164 with open(opts.output, "wb") as out_file:
bgneal@319 165 writer = csv.writer(out_file)
bgneal@319 166
bgneal@319 167 for filename in args:
bgneal@319 168 print "Processing " + filename
bgneal@319 169 with open(filename, "r") as in_file:
bgneal@319 170 posts = 0
bgneal@319 171 for line in in_file:
bgneal@319 172 process_line(line, writer)
bgneal@319 173 posts += 1
bgneal@319 174 if opts.progress and posts % 100 == 0:
bgneal@319 175 sys.stdout.write('.')
bgneal@319 176 sys.stdout.flush()
bgneal@319 177 print
bgneal@319 178
bgneal@319 179
bgneal@319 180 if __name__ == '__main__':
bgneal@319 181 try:
bgneal@319 182 main()
bgneal@319 183 except IOError, ex:
bgneal@319 184 sys.exit("IO Error: %s" % ex)
bgneal@319 185 except KeyboardInterrupt:
bgneal@319 186 sys.exit("Control-C interrupt")