Mercurial > public > sg101
comparison tools/translate_yahoo_group.py @ 319:c5055573e8d7
Added a tool to translate the old Yahoo Group posts from JSON to CSV suitable for mysqlimport.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Tue, 08 Feb 2011 03:42:04 +0000 |
parents | |
children | 66cc2f8c4603 |
comparison
equal
deleted
inserted
replaced
318:c550933ff5b6 | 319:c5055573e8d7 |
---|---|
1 """ | |
2 translate_yahoo_group.py - A management command to read the old Yahoo group | |
3 posts (in JSON format) and output as a .csv file, suitable for use as an | |
4 input to mysqlimport into a database. | |
5 | |
6 How to import: | |
7 mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 sg101_yahoo_group /path/to/post.csv | |
8 | |
9 """ | |
10 from __future__ import with_statement | |
11 import csv | |
12 import optparse | |
13 import datetime | |
14 import json | |
15 import re | |
16 import sys | |
17 | |
18 | |
# Command-line help text for optparse.
USAGE = "usage: %prog [options] filename1 filename2 ..."
DESCRIPTION = """\
This command converts the SG101 Yahoo Group posts from JSON and outputs the
data as a .csv file suitable for importing into the new database scheme with
the mysqlimport utility.
"""

# Matches timestamps of the form "yyyy-mm-ddTHH:MM:SSZ"; the month, day,
# hour, minute and second fields may be 1 or 2 digits. Note the pattern is
# not anchored at the end, so trailing text after the "Z" is tolerated.
DATE_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})Z")

# My scraping wasn't perfect and I forgot Yahoo truncated long Yahoo IDs. Provide
# this translation table to fix up the names.
# Maps the truncated ID (always ending in "...") to the full Yahoo ID.

NAME_EXPANSIONS = {
    "atomic_mosqu...": "atomic_mosquitos",
    "balsafliesbe...": "balsafliesbetter",
    "bambinheadli...": "bambinheadlights",
    "batinyourfac...": "batinyourface2002",
    "billyblastof...": "billyblastoff805",
    "bluegrassban...": "bluegrassbanjobabe",
    "bluesbrother...": "bluesbrotherbrian",
    "brother_reyn...": "brother_reynolds",
    "cannistraro_...": "cannistraro_80303",
    "cantemaza_de...": "cantemaza_de_miye",
    "captainraven...": "captainravenwood",
    "captainsprin...": "captainspringfield",
    "colonel_must...": "colonel_mustache",
    "cowboy_playi...": "cowboy_playing_dead",
    "creature_fea...": "creature_features_fan",
    "c_e_l_t_i_c_...": "c_e_l_t_i_c_g_o_t_h",
    "dana_lyn_vin...": "dana_lyn_vincent",
    "destishtarte...": "destishtartenfort",
    "efrem_the_re...": "efrem_the_retarded_rabbit",
    "el_capitan_k...": "el_capitan_kosmo",
    "epiphonecasi...": "epiphonecasino2001",
    "fenderjaguar...": "fenderjaguar1963",
    "fenderplayer...": "fenderplayer52886",
    "fiberglassro...": "fiberglassrocket",
    "formerlyleft...": "formerlylefthanded",
    "guildaristoc...": "guildaristocrat62",
    "guitardo_mon...": "guitardo_montalbone",
    "guitargeekfe...": "guitargeekfestival",
    "harrygpelleg...": "harrygpellegrin2003",
    "iamtheorigin...": "iamtheoriginaltoblave",
    "indiana_surf...": "indiana_surfguitar",
    "invisiblesur...": "invisiblesurfers",
    "jimmywilseyl...": "jimmywilseylives",
    "johnnywrecki...": "johnnywreckingball",
    "killers_from...": "killers_from_space",
    "los_twang_ma...": "los_twang_marvels",
    "midwest_surf...": "midwest_surf_guy",
    "murphy_the_s...": "murphy_the_surfie",
    "national_dis...": "national_disunion421",
    "onlytwothing...": "onlytwothingstodo",
    "photog4surfb...": "photog4surfbands",
    "poseidonsurf...": "poseidonsurfband",
    "reluctantaqu...": "reluctantaquanauts",
    "rich_from_th...": "rich_from_the_balboas",
    "roland_bette...": "roland_bettenville",
    "rvstewartpro...": "rvstewartproductions",
    "screamindeat...": "screamindeathmonkey",
    "sheckysgotre...": "sheckysgotreverb",
    "skatalite_of...": "skatalite_of_love",
    "slippy_the_b...": "slippy_the_backflipping_frog",
    "somethingabo...": "somethingaboutwendy",
    "storm_static...": "storm_static_sleep",
    "supertwangre...": "supertwangreverb",
    "surfalicious...": "surfaliciousdude1",
    "surf_guitar_...": "surf_guitar_lizzy",
    "taylorsnyder...": "taylorsnyderband",
    "teenageflami...": "teenageflamingovandals",
    "thecaveofthe...": "thecaveofthedead",
    "tikitakitiki...": "tikitakitikitakitak",
    "troublewitht...": "troublewithtreble",
    "tumamaentang...": "tumamaentangadeluchador",
    "warped_skate...": "warped_skater_182",
    "weird_jack_s...": "weird_jack_stinson",
    "whatever5050...": "whatever50505050",
    "windanseabea...": "windanseabeachboy",
    "Wookiee_Smug...": "Wookiee_Smuggler_Drayyken",
}
99 | |
def convert_date(s):
    """
    Convert a timestamp string of the form "yyyy-mm-ddTHH:MM:SSZ" (month,
    day, hour, minute and second may be 1 or 2 digits) into a naive
    datetime.datetime object.

    Raises ValueError if the string does not match the expected format or a
    field value is out of range.
    """
    # The original code raised CommandError on a failed match, but that name
    # is neither defined nor imported in this module (it belongs to Django
    # management commands), so a malformed date crashed with a NameError.
    # strptime performs the equivalent parse and raises the conventional
    # ValueError, and additionally rejects trailing garbage after the "Z".
    return datetime.datetime.strptime(s, "%Y-%m-%dT%H:%M:%SZ")
112 | |
113 | |
def process_line(line, writer):
    """
    Translate one line of a JSON input file into a single CSV row.

    The row is emitted through *writer* (a csv.writer instance).
    """
    text = line.strip()

    # Each input file is one big JSON array holding one dict per line. The
    # very first line therefore begins with '[', and every line ends with
    # either ',' or ']'. Trimming those leaves a bare JSON dict.
    if text.startswith('['):
        text = text[1:]
    record = json.loads(text[:-1].strip())

    # For some strange reason a few posts were scraped without a date (empty
    # string); emit the literal NULL so mysqlimport stores a NULL column.
    raw_date = record['date']
    post_date = str(convert_date(raw_date)) if raw_date else 'NULL'

    # Undo Yahoo's truncation of long IDs where we know the full name.
    poster = record['name']
    poster = NAME_EXPANSIONS.get(poster, poster)

    # Emit the CSV row; text fields are UTF-8 encoded byte strings.
    row = (
        record['msg_id'],
        record['title'].encode('utf-8'),
        post_date,
        poster.encode('utf-8'),
        record['email'].encode('utf-8'),
        record['msg'].encode('utf-8'),
    )
    writer.writerow(row)
149 | |
150 | |
def main(argv=None):
    """
    Entry point: parse command-line options, then translate each named JSON
    file into rows of the single CSV output file.
    """
    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
    parser.set_defaults(
        progress=False,
        output='post.csv',
    )
    parser.add_option("-p", "--progress", action="store_true",
        help="Output a . after every 100 posts to show progress [default: %default]")
    parser.add_option("-o", "--output",
        help="Name of the output file [default: %default]")

    opts, args = parser.parse_args(args=argv)

    # Binary mode: the csv module under Python 2 expects a byte stream.
    with open(opts.output, "wb") as out_file:
        writer = csv.writer(out_file)

        for filename in args:
            sys.stdout.write("Processing " + filename + "\n")
            count = 0
            with open(filename, "r") as in_file:
                for line in in_file:
                    process_line(line, writer)
                    count += 1
                    # Optional progress indicator: one dot per 100 posts.
                    if opts.progress and count % 100 == 0:
                        sys.stdout.write('.')
                        sys.stdout.flush()
            # Finish the dot line (or emit a blank line) after each file.
            sys.stdout.write('\n')
179 | |
180 | |
if __name__ == '__main__':
    try:
        main()
    except IOError, ex:
        # Report file problems (bad path, permissions, ...) as a clean
        # one-line error exit instead of a traceback.
        sys.exit("IO Error: %s" % ex)
    except KeyboardInterrupt:
        # Exit quietly on Ctrl-C.
        sys.exit("Control-C interrupt")