comparison tools/translate_yahoo_group.py @ 319:c5055573e8d7

Added a tool to translate the old Yahoo Group posts from JSON to CSV suitable for mysqlimport.
author Brian Neal <bgneal@gmail.com>
date Tue, 08 Feb 2011 03:42:04 +0000
parents
children 66cc2f8c4603
comparison
equal deleted inserted replaced
318:c550933ff5b6 319:c5055573e8d7
1 """
2 translate_yahoo_group.py - A management command to read the old Yahoo group
3 posts (in JSON format) and output as a .csv file, suitable for use as an
4 input to mysqlimport into a database.
5
6 How to import:
7 mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 sg101_yahoo_group /path/to/post.csv
8
9 """
10 from __future__ import with_statement
11 import csv
12 import optparse
13 import datetime
14 import json
15 import re
16 import sys
17
18
# Command-line help text consumed by optparse in main().
USAGE = "usage: %prog [options] filename1 filename2 ..."
DESCRIPTION = """\
This command converts the SG101 Yahoo Group posts from JSON and outputs the
data as a .csv file suitable for importing into the new database scheme with
the mysqlimport utility.
"""

# Matches scraped timestamps of the form "yyyy-mm-ddTHH:MM:SSZ"; the month,
# day, hour, minute, and second fields may be 1 or 2 digits. Each field is
# captured as its own group for convert_date() below.
DATE_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})Z")

# My scraping wasn't perfect and I forgot Yahoo truncated long Yahoo IDs
# (they show up with a trailing "..."). This translation table maps each
# truncated name back to the poster's full Yahoo ID so the database gets
# the real names.

NAME_EXPANSIONS = {
    "atomic_mosqu...": "atomic_mosquitos",
    "balsafliesbe...": "balsafliesbetter",
    "bambinheadli...": "bambinheadlights",
    "batinyourfac...": "batinyourface2002",
    "billyblastof...": "billyblastoff805",
    "bluegrassban...": "bluegrassbanjobabe",
    "bluesbrother...": "bluesbrotherbrian",
    "brother_reyn...": "brother_reynolds",
    "cannistraro_...": "cannistraro_80303",
    "cantemaza_de...": "cantemaza_de_miye",
    "captainraven...": "captainravenwood",
    "captainsprin...": "captainspringfield",
    "colonel_must...": "colonel_mustache",
    "cowboy_playi...": "cowboy_playing_dead",
    "creature_fea...": "creature_features_fan",
    "c_e_l_t_i_c_...": "c_e_l_t_i_c_g_o_t_h",
    "dana_lyn_vin...": "dana_lyn_vincent",
    "destishtarte...": "destishtartenfort",
    "efrem_the_re...": "efrem_the_retarded_rabbit",
    "el_capitan_k...": "el_capitan_kosmo",
    "epiphonecasi...": "epiphonecasino2001",
    "fenderjaguar...": "fenderjaguar1963",
    "fenderplayer...": "fenderplayer52886",
    "fiberglassro...": "fiberglassrocket",
    "formerlyleft...": "formerlylefthanded",
    "guildaristoc...": "guildaristocrat62",
    "guitardo_mon...": "guitardo_montalbone",
    "guitargeekfe...": "guitargeekfestival",
    "harrygpelleg...": "harrygpellegrin2003",
    "iamtheorigin...": "iamtheoriginaltoblave",
    "indiana_surf...": "indiana_surfguitar",
    "invisiblesur...": "invisiblesurfers",
    "jimmywilseyl...": "jimmywilseylives",
    "johnnywrecki...": "johnnywreckingball",
    "killers_from...": "killers_from_space",
    "los_twang_ma...": "los_twang_marvels",
    "midwest_surf...": "midwest_surf_guy",
    "murphy_the_s...": "murphy_the_surfie",
    "national_dis...": "national_disunion421",
    "onlytwothing...": "onlytwothingstodo",
    "photog4surfb...": "photog4surfbands",
    "poseidonsurf...": "poseidonsurfband",
    "reluctantaqu...": "reluctantaquanauts",
    "rich_from_th...": "rich_from_the_balboas",
    "roland_bette...": "roland_bettenville",
    "rvstewartpro...": "rvstewartproductions",
    "screamindeat...": "screamindeathmonkey",
    "sheckysgotre...": "sheckysgotreverb",
    "skatalite_of...": "skatalite_of_love",
    "slippy_the_b...": "slippy_the_backflipping_frog",
    "somethingabo...": "somethingaboutwendy",
    "storm_static...": "storm_static_sleep",
    "supertwangre...": "supertwangreverb",
    "surfalicious...": "surfaliciousdude1",
    "surf_guitar_...": "surf_guitar_lizzy",
    "taylorsnyder...": "taylorsnyderband",
    "teenageflami...": "teenageflamingovandals",
    "thecaveofthe...": "thecaveofthedead",
    "tikitakitiki...": "tikitakitikitakitak",
    "troublewitht...": "troublewithtreble",
    "tumamaentang...": "tumamaentangadeluchador",
    "warped_skate...": "warped_skater_182",
    "weird_jack_s...": "weird_jack_stinson",
    "whatever5050...": "whatever50505050",
    "windanseabea...": "windanseabeachboy",
    "Wookiee_Smug...": "Wookiee_Smuggler_Drayyken",
}
99
def convert_date(s):
    """
    Convert a timestamp string of the form "yyyy-mm-ddTHH:MM:SSZ" into a
    datetime.datetime object.

    Raises ValueError if the string does not match the expected format.
    (The original code raised CommandError, a leftover from the Django
    management-command version of this script; that name is not defined
    here, so a malformed date would have crashed with a NameError.)
    """
    # The re module caches compiled patterns internally, so matching the
    # pattern directly is cheap even when called once per post.
    m = re.match(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})Z", s)
    if m is None:
        raise ValueError("Invalid date string: %s" % s)

    # Groups are (year, month, day, hour, minute, second), all decimal.
    parts = [int(part) for part in m.groups()]
    return datetime.datetime(year=parts[0], month=parts[1], day=parts[2],
        hour=parts[3], minute=parts[4], second=parts[5])
113
def process_line(line, writer):
    """
    Convert one line of the JSON dump into a single CSV row via writer.

    """
    # The JSON files hold one big array of dicts, serialized one dict per
    # line. Strip the surrounding array syntax: the very first line of a
    # file opens with '[', and every line ends with either a ',' or the
    # closing ']'. What remains is a single JSON dict.
    text = line.strip()
    if text.startswith('['):
        text = text[1:]
    record = json.loads(text[:-1].strip())

    # Some posts were scraped without a date (empty string); emit a SQL
    # NULL for those, otherwise the datetime rendered as a string.
    raw_date = record['date']
    post_date = str(convert_date(raw_date)) if raw_date else 'NULL'

    # Undo Yahoo's truncation of long IDs where the full name is known.
    author = record['name']
    author = NAME_EXPANSIONS.get(author, author)

    # Emit the row; text fields are UTF-8 encoded for mysqlimport.
    writer.writerow((
        record['msg_id'],
        record['title'].encode('utf-8'),
        post_date,
        author.encode('utf-8'),
        record['email'].encode('utf-8'),
        record['msg'].encode('utf-8'),
    ))
149
150
151 def main(argv=None):
152
153 parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
154 parser.set_defaults(
155 progress=False,
156 output='post.csv',
157 )
158 parser.add_option("-p", "--progress", action="store_true",
159 help="Output a . after every 100 posts to show progress [default: %default]")
160 parser.add_option("-o", "--output",
161 help="Name of the output file [default: %default]")
162
163 opts, args = parser.parse_args(args=argv)
164
165 with open(opts.output, "wb") as out_file:
166 writer = csv.writer(out_file)
167
168 for filename in args:
169 print "Processing " + filename
170 with open(filename, "r") as in_file:
171 posts = 0
172 for line in in_file:
173 process_line(line, writer)
174 posts += 1
175 if opts.progress and posts % 100 == 0:
176 sys.stdout.write('.')
177 sys.stdout.flush()
178 print
179
180
if __name__ == '__main__':
    # Translate expected runtime failures into a clean exit message rather
    # than a traceback. The old-style "except IOError, ex" syntax keeps
    # this runnable on the Python 2.x interpreters this script targets.
    try:
        main()
    except IOError, ex:
        sys.exit("IO Error: %s" % ex)
    except KeyboardInterrupt:
        sys.exit("Control-C interrupt")