annotate tools/translate_yahoo_group.py @ 715:820e57e621e8

Use |safe filter on Haystack templates to get better results w/quotes. Content was getting escaped, so text with quotes around it was seemingly missing from the search index. This change fixed that. I verified that the search results will not leak raw HTML to the page so this should be safe to do.
author Brian Neal <bgneal@gmail.com>
date Tue, 17 Sep 2013 20:26:49 -0500
parents 66cc2f8c4603
children
rev   line source
bgneal@319 1 """
bgneal@319 2 translate_yahoo_group.py - A management command to read the old Yahoo group
bgneal@319 3 posts (in JSON format) and output as a .csv file, suitable for use as an
bgneal@319 4 input to mysqlimport into a database.
bgneal@319 5
bgneal@319 6 How to import:
bgneal@319 7 mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 sg101_yahoo_group /path/to/post.csv
bgneal@319 8
bgneal@319 9 """
bgneal@319 10 from __future__ import with_statement
bgneal@319 11 import csv
bgneal@319 12 import optparse
bgneal@319 13 import datetime
bgneal@319 14 import json
bgneal@319 15 import re
bgneal@319 16 import sys
bgneal@319 17
bgneal@319 18
bgneal@319 19 USAGE = "usage: %prog [options] filename1 filename2 ..."
bgneal@319 20 DESCRIPTION = """\
bgneal@319 21 This command converts the SG101 Yahoo Group posts from JSON and outputs the
bgneal@319 22 data as a .csv file suitable for importing into the new database scheme with
bgneal@319 23 the mysqlimport utility.
bgneal@319 24 """
bgneal@319 25
bgneal@319 26 DATE_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})Z")
bgneal@319 27
bgneal@319 28 # My scraping wasn't perfect and I forgot Yahoo truncated long Yahoo IDs. Provide
bgneal@319 29 # this translation table to fix up the names.
bgneal@319 30
bgneal@319 31 NAME_EXPANSIONS = {
bgneal@319 32 "atomic_mosqu...": "atomic_mosquitos",
bgneal@319 33 "balsafliesbe...": "balsafliesbetter",
bgneal@319 34 "bambinheadli...": "bambinheadlights",
bgneal@319 35 "batinyourfac...": "batinyourface2002",
bgneal@319 36 "billyblastof...": "billyblastoff805",
bgneal@319 37 "bluegrassban...": "bluegrassbanjobabe",
bgneal@319 38 "bluesbrother...": "bluesbrotherbrian",
bgneal@319 39 "brother_reyn...": "brother_reynolds",
bgneal@319 40 "cannistraro_...": "cannistraro_80303",
bgneal@319 41 "cantemaza_de...": "cantemaza_de_miye",
bgneal@319 42 "captainraven...": "captainravenwood",
bgneal@319 43 "captainsprin...": "captainspringfield",
bgneal@319 44 "colonel_must...": "colonel_mustache",
bgneal@319 45 "cowboy_playi...": "cowboy_playing_dead",
bgneal@319 46 "creature_fea...": "creature_features_fan",
bgneal@319 47 "c_e_l_t_i_c_...": "c_e_l_t_i_c_g_o_t_h",
bgneal@319 48 "dana_lyn_vin...": "dana_lyn_vincent",
bgneal@319 49 "destishtarte...": "destishtartenfort",
bgneal@319 50 "efrem_the_re...": "efrem_the_retarded_rabbit",
bgneal@319 51 "el_capitan_k...": "el_capitan_kosmo",
bgneal@319 52 "epiphonecasi...": "epiphonecasino2001",
bgneal@319 53 "fenderjaguar...": "fenderjaguar1963",
bgneal@319 54 "fenderplayer...": "fenderplayer52886",
bgneal@319 55 "fiberglassro...": "fiberglassrocket",
bgneal@319 56 "formerlyleft...": "formerlylefthanded",
bgneal@319 57 "guildaristoc...": "guildaristocrat62",
bgneal@319 58 "guitardo_mon...": "guitardo_montalbone",
bgneal@319 59 "guitargeekfe...": "guitargeekfestival",
bgneal@319 60 "harrygpelleg...": "harrygpellegrin2003",
bgneal@319 61 "iamtheorigin...": "iamtheoriginaltoblave",
bgneal@319 62 "indiana_surf...": "indiana_surfguitar",
bgneal@319 63 "invisiblesur...": "invisiblesurfers",
bgneal@319 64 "jimmywilseyl...": "jimmywilseylives",
bgneal@319 65 "johnnywrecki...": "johnnywreckingball",
bgneal@319 66 "killers_from...": "killers_from_space",
bgneal@319 67 "los_twang_ma...": "los_twang_marvels",
bgneal@319 68 "midwest_surf...": "midwest_surf_guy",
bgneal@319 69 "murphy_the_s...": "murphy_the_surfie",
bgneal@319 70 "national_dis...": "national_disunion421",
bgneal@319 71 "onlytwothing...": "onlytwothingstodo",
bgneal@319 72 "photog4surfb...": "photog4surfbands",
bgneal@319 73 "poseidonsurf...": "poseidonsurfband",
bgneal@319 74 "reluctantaqu...": "reluctantaquanauts",
bgneal@319 75 "rich_from_th...": "rich_from_the_balboas",
bgneal@319 76 "roland_bette...": "roland_bettenville",
bgneal@319 77 "rvstewartpro...": "rvstewartproductions",
bgneal@319 78 "screamindeat...": "screamindeathmonkey",
bgneal@319 79 "sheckysgotre...": "sheckysgotreverb",
bgneal@319 80 "skatalite_of...": "skatalite_of_love",
bgneal@319 81 "slippy_the_b...": "slippy_the_backflipping_frog",
bgneal@319 82 "somethingabo...": "somethingaboutwendy",
bgneal@319 83 "storm_static...": "storm_static_sleep",
bgneal@319 84 "supertwangre...": "supertwangreverb",
bgneal@319 85 "surfalicious...": "surfaliciousdude1",
bgneal@319 86 "surf_guitar_...": "surf_guitar_lizzy",
bgneal@319 87 "taylorsnyder...": "taylorsnyderband",
bgneal@319 88 "teenageflami...": "teenageflamingovandals",
bgneal@319 89 "thecaveofthe...": "thecaveofthedead",
bgneal@319 90 "tikitakitiki...": "tikitakitikitakitak",
bgneal@319 91 "troublewitht...": "troublewithtreble",
bgneal@319 92 "tumamaentang...": "tumamaentangadeluchador",
bgneal@319 93 "warped_skate...": "warped_skater_182",
bgneal@319 94 "weird_jack_s...": "weird_jack_stinson",
bgneal@319 95 "whatever5050...": "whatever50505050",
bgneal@319 96 "windanseabea...": "windanseabeachboy",
bgneal@319 97 "Wookiee_Smug...": "Wookiee_Smuggler_Drayyken",
bgneal@319 98 }
bgneal@319 99
bgneal@319 100 def convert_date(s):
bgneal@319 101 """
bgneal@319 102 Converts the timestamp string of the form "yyyy-mm-ddTHH:MM:SSZ" into a
bgneal@319 103 datetime object.
bgneal@319 104 """
bgneal@319 105 m = DATE_RE.match(s)
bgneal@320 106 assert m is not None
bgneal@319 107
bgneal@319 108 parts = [int(part) for part in m.groups()]
bgneal@319 109 return datetime.datetime(year=parts[0], month=parts[1], day=parts[2],
bgneal@319 110 hour=parts[3], minute=parts[4], second=parts[5])
bgneal@319 111
bgneal@319 112
bgneal@319 113 def process_line(line, writer):
bgneal@319 114 """
bgneal@319 115 Process one line from the JSON file: create a CSV line for the post.
bgneal@319 116
bgneal@319 117 """
bgneal@319 118 # Remove trailing newline
bgneal@319 119 line = line.strip()
bgneal@319 120
bgneal@319 121 # The JSON files consist of one large array of dicts, one dict per line.
bgneal@319 122 # The first line in the file thus has an opening '[' we need to get rid of.
bgneal@319 123 # The last char in each line is either a ',' or a ']', so we need to
bgneal@319 124 # lose the last char as well. That should leave us with just a dict.
bgneal@319 125 if line.startswith('['):
bgneal@319 126 line = line[1:]
bgneal@319 127 line = line[:-1].strip()
bgneal@319 128 parts = json.loads(line)
bgneal@319 129
bgneal@319 130 # Convert the post date; for some strange reason, some dates were not available
bgneal@319 131 # and the date value will be an empty string. In these cases we'll output a NULL.
bgneal@319 132 post_date = parts['date']
bgneal@319 133 if not post_date:
bgneal@319 134 post_date = 'NULL'
bgneal@319 135 else:
bgneal@319 136 post_date = str(convert_date(post_date))
bgneal@319 137
bgneal@319 138 name = parts['name']
bgneal@319 139 name = NAME_EXPANSIONS.get(name, name)
bgneal@319 140
bgneal@319 141 # write the parts as CSV
bgneal@319 142 writer.writerow((parts['msg_id'],
bgneal@319 143 parts['title'].encode('utf-8'),
bgneal@319 144 post_date,
bgneal@319 145 name.encode('utf-8'),
bgneal@319 146 parts['email'].encode('utf-8'),
bgneal@319 147 parts['msg'].encode('utf-8')))
bgneal@319 148
bgneal@319 149
bgneal@319 150 def main(argv=None):
bgneal@319 151
bgneal@319 152 parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
bgneal@319 153 parser.set_defaults(
bgneal@319 154 progress=False,
bgneal@319 155 output='post.csv',
bgneal@319 156 )
bgneal@319 157 parser.add_option("-p", "--progress", action="store_true",
bgneal@319 158 help="Output a . after every 100 posts to show progress [default: %default]")
bgneal@319 159 parser.add_option("-o", "--output",
bgneal@319 160 help="Name of the output file [default: %default]")
bgneal@319 161
bgneal@319 162 opts, args = parser.parse_args(args=argv)
bgneal@319 163
bgneal@319 164 with open(opts.output, "wb") as out_file:
bgneal@319 165 writer = csv.writer(out_file)
bgneal@319 166
bgneal@319 167 for filename in args:
bgneal@319 168 print "Processing " + filename
bgneal@319 169 with open(filename, "r") as in_file:
bgneal@319 170 posts = 0
bgneal@319 171 for line in in_file:
bgneal@319 172 process_line(line, writer)
bgneal@319 173 posts += 1
bgneal@319 174 if opts.progress and posts % 100 == 0:
bgneal@319 175 sys.stdout.write('.')
bgneal@319 176 sys.stdout.flush()
bgneal@319 177 print
bgneal@319 178
bgneal@319 179
bgneal@319 180 if __name__ == '__main__':
bgneal@319 181 try:
bgneal@319 182 main()
bgneal@319 183 except IOError, ex:
bgneal@319 184 sys.exit("IO Error: %s" % ex)
bgneal@319 185 except KeyboardInterrupt:
bgneal@319 186 sys.exit("Control-C interrupt")