annotate tools/post_sub.py @ 861:e4f8d87c3d30

Configure Markdown logger to reduce noise in logs. Markdown is logging at the INFO level whenever it loads an extension. This looks like it has been fixed in master at GitHub. But until then we will explicitly configure the MARKDOWN logger to log at WARNING or higher.
author Brian Neal <bgneal@gmail.com>
date Mon, 01 Dec 2014 18:36:27 -0600
parents e877b9c05740
children
rev   line source
bgneal@359 1 """
bgneal@359 2 This script reads a .csv dump of the forums post table. It writes a new file,
bgneal@359 3 performing a search and replace over a given field.
bgneal@359 4
bgneal@359 5 The output file can be imported into MySQL with:
bgneal@359 6
bgneal@359 7 LOAD DATA LOCAL INFILE 'forums_post.csv' REPLACE INTO TABLE forums_post
bgneal@359 8 CHARACTER SET utf8
bgneal@359 9 FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY ''
bgneal@359 10 LINES TERMINATED BY '\r\n';
bgneal@359 11 SHOW WARNINGS;
bgneal@359 12
bgneal@359 13 """
bgneal@359 14 from __future__ import with_statement
bgneal@359 15 import csv
bgneal@359 16 import re
bgneal@359 17 import optparse
bgneal@359 18 import sys
bgneal@359 19
bgneal@359 20
# Usage line handed to optparse; optparse expands %prog to the script name.
USAGE = "usage: %prog [options] infile outfile"

# One-paragraph description handed to optparse for the --help output.
DESCRIPTION = """\
Performs a search and replace on a field in a forums post .csv file.
"""

# Column names, in the order the output rows are written by csv.DictWriter.
# Presumably this mirrors the column order of the forums_post table named in
# the module docstring's LOAD DATA statement -- confirm against the schema.
POST_FIELDS = (
    'id',
    'topic_id',
    'user_id',
    'creation_date',
    'update_date',
    'body',
    'html',
    'user_ip',
)
bgneal@359 28
bgneal@359 29
def _open_csv(path, mode):
    """Open *path* for use with the csv module; *mode* is "r" or "w".

    Python 2's csv module wants binary-mode files, while Python 3's wants
    text-mode files opened with newline='' so the csv module controls the
    line endings itself.
    """
    if sys.version_info[0] >= 3:
        # The module docstring's LOAD DATA statement declares the data as utf8.
        return open(path, mode, newline='', encoding='utf-8')
    return open(path, mode + 'b')


def main(argv=None):
    """Run a regex search-and-replace over one field of a posts .csv file.

    Reads the input file (whose first row must be a header naming the
    columns), applies ``re.sub`` with the --search pattern and --replace
    text to the chosen field of every row, and writes the rows to the
    output file in POST_FIELDS column order. No header row is written to
    the output file.

    argv -- list of command-line arguments (without the program name);
            None makes optparse fall back to sys.argv[1:].

    Exits via sys.exit() with a message when the two file arguments, the
    search pattern or the replacement text are missing, when the pattern
    is not a valid regular expression, or when the requested field is not
    a column of the input file. IOError from opening the files propagates
    to the caller.
    """
    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
    parser.set_defaults(
        progress=False,
        field='body',
    )
    parser.add_option("-p", "--progress", action="store_true",
        help="Output a . after every 100 posts to show progress [default: %default]")
    parser.add_option("-f", "--field",
        help="Name of the field to search [default: %default]")
    parser.add_option("-s", "--search", help="The search pattern")
    parser.add_option("-r", "--replace", help="The replacement text")

    opts, args = parser.parse_args(args=argv)

    if len(args) != 2:
        sys.exit("Please supply input and output file arguments.")
    in_path, out_path = args

    if opts.search is None:
        sys.exit("Please specify a search pattern.")
    try:
        search_re = re.compile(opts.search)
    except re.error as ex:
        # Report a bad pattern the same way as the other option errors
        # instead of letting a traceback escape.
        sys.exit("Error, invalid search pattern: %s" % ex)

    if opts.replace is None:
        sys.exit("Please specify replacement text.")

    with _open_csv(in_path, "r") as infile:
        reader = csv.DictReader(infile)
        # fieldnames is None for an empty input file; treat that like an
        # unknown field rather than crashing on the membership test.
        if not reader.fieldnames or opts.field not in reader.fieldnames:
            sys.exit("Error, invalid field option: %s" % opts.field)

        with _open_csv(out_path, "w") as outfile:
            writer = csv.DictWriter(outfile, POST_FIELDS)

            # Count rows from 1 so a dot appears after the 100th, 200th, ...
            for n, row in enumerate(reader, 1):
                row[opts.field] = search_re.sub(opts.replace, row[opts.field])
                writer.writerow(row)

                if opts.progress and n % 100 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()

    if opts.progress:
        # Terminate the line of progress dots.
        sys.stdout.write('\n')
bgneal@359 73
# Script entry point: run main() and turn the expected failure modes into a
# short message on stderr with a non-zero exit status instead of a traceback.
if __name__ == '__main__':
    try:
        main()
    except IOError as ex:
        # "except IOError, ex" is Python-2-only syntax; the "as" form is
        # accepted by Python 2.6+ and Python 3 alike.
        sys.exit("IO Error: %s" % ex)
    except KeyboardInterrupt:
        sys.exit("Control-C interrupt")