bgneal@319
|
1 """
|
bgneal@319
|
2 translate_yahoo_group.py - A management command to read the old Yahoo group
|
bgneal@319
|
3 posts (in JSON format) and output as a .csv file, suitable for use as an
|
bgneal@319
|
4 input to mysqlimport into a database.
|
bgneal@319
|
5
|
bgneal@319
|
6 How to import:
|
bgneal@319
|
7 mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 sg101_yahoo_group /path/to/post.csv
|
bgneal@319
|
8
|
bgneal@319
|
9 """
|
bgneal@319
|
10 from __future__ import with_statement
|
bgneal@319
|
11 import csv
|
bgneal@319
|
12 import optparse
|
bgneal@319
|
13 import datetime
|
bgneal@319
|
14 import json
|
bgneal@319
|
15 import re
|
bgneal@319
|
16 import sys
|
bgneal@319
|
17
|
bgneal@319
|
18
|
bgneal@319
|
19 USAGE = "usage: %prog [options] filename1 filename2 ..."
|
bgneal@319
|
20 DESCRIPTION = """\
|
bgneal@319
|
21 This command converts the SG101 Yahoo Group posts from JSON and outputs the
|
bgneal@319
|
22 data as a .csv file suitable for importing into the new database scheme with
|
bgneal@319
|
23 the mysqlimport utility.
|
bgneal@319
|
24 """
|
bgneal@319
|
25
|
bgneal@319
|
26 DATE_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})Z")
|
bgneal@319
|
27
|
bgneal@319
|
28 # My scraping wasn't perfect and I forgot Yahoo truncated long Yahoo IDs. Provide
|
bgneal@319
|
29 # this translation table to fix up the names.
|
bgneal@319
|
30
|
bgneal@319
|
31 NAME_EXPANSIONS = {
|
bgneal@319
|
32 "atomic_mosqu...": "atomic_mosquitos",
|
bgneal@319
|
33 "balsafliesbe...": "balsafliesbetter",
|
bgneal@319
|
34 "bambinheadli...": "bambinheadlights",
|
bgneal@319
|
35 "batinyourfac...": "batinyourface2002",
|
bgneal@319
|
36 "billyblastof...": "billyblastoff805",
|
bgneal@319
|
37 "bluegrassban...": "bluegrassbanjobabe",
|
bgneal@319
|
38 "bluesbrother...": "bluesbrotherbrian",
|
bgneal@319
|
39 "brother_reyn...": "brother_reynolds",
|
bgneal@319
|
40 "cannistraro_...": "cannistraro_80303",
|
bgneal@319
|
41 "cantemaza_de...": "cantemaza_de_miye",
|
bgneal@319
|
42 "captainraven...": "captainravenwood",
|
bgneal@319
|
43 "captainsprin...": "captainspringfield",
|
bgneal@319
|
44 "colonel_must...": "colonel_mustache",
|
bgneal@319
|
45 "cowboy_playi...": "cowboy_playing_dead",
|
bgneal@319
|
46 "creature_fea...": "creature_features_fan",
|
bgneal@319
|
47 "c_e_l_t_i_c_...": "c_e_l_t_i_c_g_o_t_h",
|
bgneal@319
|
48 "dana_lyn_vin...": "dana_lyn_vincent",
|
bgneal@319
|
49 "destishtarte...": "destishtartenfort",
|
bgneal@319
|
50 "efrem_the_re...": "efrem_the_retarded_rabbit",
|
bgneal@319
|
51 "el_capitan_k...": "el_capitan_kosmo",
|
bgneal@319
|
52 "epiphonecasi...": "epiphonecasino2001",
|
bgneal@319
|
53 "fenderjaguar...": "fenderjaguar1963",
|
bgneal@319
|
54 "fenderplayer...": "fenderplayer52886",
|
bgneal@319
|
55 "fiberglassro...": "fiberglassrocket",
|
bgneal@319
|
56 "formerlyleft...": "formerlylefthanded",
|
bgneal@319
|
57 "guildaristoc...": "guildaristocrat62",
|
bgneal@319
|
58 "guitardo_mon...": "guitardo_montalbone",
|
bgneal@319
|
59 "guitargeekfe...": "guitargeekfestival",
|
bgneal@319
|
60 "harrygpelleg...": "harrygpellegrin2003",
|
bgneal@319
|
61 "iamtheorigin...": "iamtheoriginaltoblave",
|
bgneal@319
|
62 "indiana_surf...": "indiana_surfguitar",
|
bgneal@319
|
63 "invisiblesur...": "invisiblesurfers",
|
bgneal@319
|
64 "jimmywilseyl...": "jimmywilseylives",
|
bgneal@319
|
65 "johnnywrecki...": "johnnywreckingball",
|
bgneal@319
|
66 "killers_from...": "killers_from_space",
|
bgneal@319
|
67 "los_twang_ma...": "los_twang_marvels",
|
bgneal@319
|
68 "midwest_surf...": "midwest_surf_guy",
|
bgneal@319
|
69 "murphy_the_s...": "murphy_the_surfie",
|
bgneal@319
|
70 "national_dis...": "national_disunion421",
|
bgneal@319
|
71 "onlytwothing...": "onlytwothingstodo",
|
bgneal@319
|
72 "photog4surfb...": "photog4surfbands",
|
bgneal@319
|
73 "poseidonsurf...": "poseidonsurfband",
|
bgneal@319
|
74 "reluctantaqu...": "reluctantaquanauts",
|
bgneal@319
|
75 "rich_from_th...": "rich_from_the_balboas",
|
bgneal@319
|
76 "roland_bette...": "roland_bettenville",
|
bgneal@319
|
77 "rvstewartpro...": "rvstewartproductions",
|
bgneal@319
|
78 "screamindeat...": "screamindeathmonkey",
|
bgneal@319
|
79 "sheckysgotre...": "sheckysgotreverb",
|
bgneal@319
|
80 "skatalite_of...": "skatalite_of_love",
|
bgneal@319
|
81 "slippy_the_b...": "slippy_the_backflipping_frog",
|
bgneal@319
|
82 "somethingabo...": "somethingaboutwendy",
|
bgneal@319
|
83 "storm_static...": "storm_static_sleep",
|
bgneal@319
|
84 "supertwangre...": "supertwangreverb",
|
bgneal@319
|
85 "surfalicious...": "surfaliciousdude1",
|
bgneal@319
|
86 "surf_guitar_...": "surf_guitar_lizzy",
|
bgneal@319
|
87 "taylorsnyder...": "taylorsnyderband",
|
bgneal@319
|
88 "teenageflami...": "teenageflamingovandals",
|
bgneal@319
|
89 "thecaveofthe...": "thecaveofthedead",
|
bgneal@319
|
90 "tikitakitiki...": "tikitakitikitakitak",
|
bgneal@319
|
91 "troublewitht...": "troublewithtreble",
|
bgneal@319
|
92 "tumamaentang...": "tumamaentangadeluchador",
|
bgneal@319
|
93 "warped_skate...": "warped_skater_182",
|
bgneal@319
|
94 "weird_jack_s...": "weird_jack_stinson",
|
bgneal@319
|
95 "whatever5050...": "whatever50505050",
|
bgneal@319
|
96 "windanseabea...": "windanseabeachboy",
|
bgneal@319
|
97 "Wookiee_Smug...": "Wookiee_Smuggler_Drayyken",
|
bgneal@319
|
98 }
|
bgneal@319
|
99
|
bgneal@319
|
100 def convert_date(s):
|
bgneal@319
|
101 """
|
bgneal@319
|
102 Converts the timestamp string of the form "yyyy-mm-ddTHH:MM:SSZ" into a
|
bgneal@319
|
103 datetime object.
|
bgneal@319
|
104 """
|
bgneal@319
|
105 m = DATE_RE.match(s)
|
bgneal@320
|
106 assert m is not None
|
bgneal@319
|
107
|
bgneal@319
|
108 parts = [int(part) for part in m.groups()]
|
bgneal@319
|
109 return datetime.datetime(year=parts[0], month=parts[1], day=parts[2],
|
bgneal@319
|
110 hour=parts[3], minute=parts[4], second=parts[5])
|
bgneal@319
|
111
|
bgneal@319
|
112
|
bgneal@319
|
113 def process_line(line, writer):
|
bgneal@319
|
114 """
|
bgneal@319
|
115 Process one line from the JSON file: create a CSV line for the post.
|
bgneal@319
|
116
|
bgneal@319
|
117 """
|
bgneal@319
|
118 # Remove trailing newline
|
bgneal@319
|
119 line = line.strip()
|
bgneal@319
|
120
|
bgneal@319
|
121 # The JSON files consist of one large array of dicts, one dict per line.
|
bgneal@319
|
122 # The first line in the file thus has an opening '[' we need to get rid of.
|
bgneal@319
|
123 # The last char in each line is either a ',' or a ']', so we need to
|
bgneal@319
|
124 # lose the last char as well. That should leave us with just a dict.
|
bgneal@319
|
125 if line.startswith('['):
|
bgneal@319
|
126 line = line[1:]
|
bgneal@319
|
127 line = line[:-1].strip()
|
bgneal@319
|
128 parts = json.loads(line)
|
bgneal@319
|
129
|
bgneal@319
|
130 # Convert the post date; for some strange reason, some dates were not available
|
bgneal@319
|
131 # and the date value will be an empty string. In these cases we'll output a NULL.
|
bgneal@319
|
132 post_date = parts['date']
|
bgneal@319
|
133 if not post_date:
|
bgneal@319
|
134 post_date = 'NULL'
|
bgneal@319
|
135 else:
|
bgneal@319
|
136 post_date = str(convert_date(post_date))
|
bgneal@319
|
137
|
bgneal@319
|
138 name = parts['name']
|
bgneal@319
|
139 name = NAME_EXPANSIONS.get(name, name)
|
bgneal@319
|
140
|
bgneal@319
|
141 # write the parts as CSV
|
bgneal@319
|
142 writer.writerow((parts['msg_id'],
|
bgneal@319
|
143 parts['title'].encode('utf-8'),
|
bgneal@319
|
144 post_date,
|
bgneal@319
|
145 name.encode('utf-8'),
|
bgneal@319
|
146 parts['email'].encode('utf-8'),
|
bgneal@319
|
147 parts['msg'].encode('utf-8')))
|
bgneal@319
|
148
|
bgneal@319
|
149
|
bgneal@319
|
150 def main(argv=None):
|
bgneal@319
|
151
|
bgneal@319
|
152 parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
|
bgneal@319
|
153 parser.set_defaults(
|
bgneal@319
|
154 progress=False,
|
bgneal@319
|
155 output='post.csv',
|
bgneal@319
|
156 )
|
bgneal@319
|
157 parser.add_option("-p", "--progress", action="store_true",
|
bgneal@319
|
158 help="Output a . after every 100 posts to show progress [default: %default]")
|
bgneal@319
|
159 parser.add_option("-o", "--output",
|
bgneal@319
|
160 help="Name of the output file [default: %default]")
|
bgneal@319
|
161
|
bgneal@319
|
162 opts, args = parser.parse_args(args=argv)
|
bgneal@319
|
163
|
bgneal@319
|
164 with open(opts.output, "wb") as out_file:
|
bgneal@319
|
165 writer = csv.writer(out_file)
|
bgneal@319
|
166
|
bgneal@319
|
167 for filename in args:
|
bgneal@319
|
168 print "Processing " + filename
|
bgneal@319
|
169 with open(filename, "r") as in_file:
|
bgneal@319
|
170 posts = 0
|
bgneal@319
|
171 for line in in_file:
|
bgneal@319
|
172 process_line(line, writer)
|
bgneal@319
|
173 posts += 1
|
bgneal@319
|
174 if opts.progress and posts % 100 == 0:
|
bgneal@319
|
175 sys.stdout.write('.')
|
bgneal@319
|
176 sys.stdout.flush()
|
bgneal@319
|
177 print
|
bgneal@319
|
178
|
bgneal@319
|
179
|
bgneal@319
|
180 if __name__ == '__main__':
|
bgneal@319
|
181 try:
|
bgneal@319
|
182 main()
|
bgneal@319
|
183 except IOError, ex:
|
bgneal@319
|
184 sys.exit("IO Error: %s" % ex)
|
bgneal@319
|
185 except KeyboardInterrupt:
|
bgneal@319
|
186 sys.exit("Control-C interrupt")
|