annotate tools/post_sub.py @ 955:71a671dab55d

First commit of whitelisting image hosts. This is behind a feature flag courtesy of waffle.
author Brian Neal <bgneal@gmail.com>
date Wed, 03 Jun 2015 21:13:08 -0500 (2015-06-04)
parents e877b9c05740
children
rev   line source
bgneal@359 1 """
bgneal@359 2 This script reads a .csv dump of the forums post table. It writes a new file,
bgneal@359 3 performing a search and replace over a given field.
bgneal@359 4
bgneal@359 5 The output file can be imported into MySQL with:
bgneal@359 6
bgneal@359 7 LOAD DATA LOCAL INFILE 'forums_post.csv' REPLACE INTO TABLE forums_post
bgneal@359 8 CHARACTER SET utf8
bgneal@359 9 FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY ''
bgneal@359 10 LINES TERMINATED BY '\r\n';
bgneal@359 11 SHOW WARNINGS;
bgneal@359 12
bgneal@359 13 """
bgneal@359 14 from __future__ import with_statement
bgneal@359 15 import csv
bgneal@359 16 import re
bgneal@359 17 import optparse
bgneal@359 18 import sys
bgneal@359 19
bgneal@359 20
USAGE = "usage: %prog [options] infile outfile"
DESCRIPTION = """\
Performs a search and replace on a field in a forums post .csv file.
"""

# Column names written to the output file, in this order. The output file has
# no header row.
POST_FIELDS = ('id', 'topic_id', 'user_id', 'creation_date', 'update_date',
               'body', 'html', 'user_ip')

# Number of processed rows between two progress dots (see --progress).
_PROGRESS_INTERVAL = 100


def _open_csv(path, mode):
    """Open ``path`` the way the csv module expects on this interpreter.

    ``mode`` is ``"r"`` or ``"w"``. Python 2's csv module wants binary files;
    Python 3's wants text files opened with ``newline=""`` so that the csv
    module itself controls the line terminators.
    """
    if sys.version_info[0] < 3:
        return open(path, mode + "b")
    # surrogateescape lets bytes that are not valid UTF-8 round-trip
    # unchanged, mirroring the byte-transparent behaviour under Python 2.
    return open(path, mode, newline="", encoding="utf-8",
                errors="surrogateescape")


def main(argv=None):
    """Run the search-and-replace over one field of a post .csv dump.

    The first row of the input file is taken as the header. For every
    following row, the regular expression given by ``--search`` is replaced
    with the ``--replace`` text in the column named by ``--field``, and the
    row is written to the output file using the POST_FIELDS column order.

    ``argv`` is the list of command line arguments; when None, optparse falls
    back to ``sys.argv[1:]``.

    Exits via ``sys.exit`` with a message when the two file arguments, the
    search pattern or the replacement text are missing, when the search
    pattern is not a valid regular expression, or when the requested field is
    not in the input header. IOError from opening the files is not caught
    here.

    With ``--progress`` a '.' is written to stdout after every
    _PROGRESS_INTERVAL rows, followed by a final newline; without it nothing
    is written to stdout.
    """
    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
    parser.set_defaults(
        progress=False,
        field='body',
    )
    parser.add_option("-p", "--progress", action="store_true",
        help="Output a . after every 100 posts to show progress [default: %default]")
    parser.add_option("-f", "--field",
        help="Name of the field to search [default: %default]")
    parser.add_option("-s", "--search", help="The search pattern")
    parser.add_option("-r", "--replace", help="The replacement text")

    opts, args = parser.parse_args(args=argv)

    if len(args) != 2:
        sys.exit("Please supply input and output file arguments.")

    if opts.search is None:
        sys.exit("Please specify a search pattern.")
    try:
        search_re = re.compile(opts.search)
    except re.error as ex:
        # Report a bad pattern like every other option error, not a traceback.
        sys.exit("Error, invalid search pattern: %s" % ex)

    if opts.replace is None:
        sys.exit("Please specify replacement text.")

    with _open_csv(args[0], "r") as infile:
        reader = csv.DictReader(infile)
        # fieldnames is None for an empty input file; treat that as "no fields".
        if opts.field not in (reader.fieldnames or ()):
            sys.exit("Error, invalid field option: %s" % opts.field)

        with _open_csv(args[1], "w") as outfile:
            writer = csv.DictWriter(outfile, POST_FIELDS)

            # Count rows from 1 so a dot appears after each full interval.
            for n, row in enumerate(reader, 1):
                row[opts.field] = search_re.sub(opts.replace, row[opts.field])
                writer.writerow(row)

                if opts.progress and n % _PROGRESS_INTERVAL == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()

    if opts.progress:
        # Terminate the line of progress dots.
        sys.stdout.write('\n')
bgneal@359 73
if __name__ == '__main__':
    # Script entry point: turn the expected failure modes into a short
    # message on stderr and a non-zero exit status instead of a traceback.
    try:
        main()
    except IOError as ex:
        # "except IOError, ex" is Python-2-only syntax; the "as" form is
        # accepted by Python 2.6+ and Python 3.
        sys.exit("IO Error: %s" % ex)
    except KeyboardInterrupt:
        sys.exit("Control-C interrupt")