bgneal@359
|
1 """
|
bgneal@359
|
2 This script reads a .csv dump of the forums post table. It writes a new file,
|
bgneal@359
|
3 performing a search and replace over a given field.
|
bgneal@359
|
4
|
bgneal@359
|
5 The output file can be imported into MySQL with:
|
bgneal@359
|
6
|
bgneal@359
|
7 LOAD DATA LOCAL INFILE 'forums_post.csv' REPLACE INTO TABLE forums_post
|
bgneal@359
|
8 CHARACTER SET utf8
|
bgneal@359
|
9 FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY ''
|
bgneal@359
|
10 LINES TERMINATED BY '\r\n';
|
bgneal@359
|
11 SHOW WARNINGS;
|
bgneal@359
|
12
|
bgneal@359
|
13 """
|
bgneal@359
|
14 from __future__ import with_statement
|
bgneal@359
|
15 import csv
|
bgneal@359
|
16 import re
|
bgneal@359
|
17 import optparse
|
bgneal@359
|
18 import sys
|
bgneal@359
|
19
|
bgneal@359
|
20
|
bgneal@359
|
# Usage line handed to optparse; optparse expands %prog to the script name.
USAGE = "usage: %prog [options] infile outfile"

# Description text handed to optparse for the --help output.
DESCRIPTION = (
    "Performs a search and replace on a field in a forums post .csv file.\n"
)

# Field names given to the csv.DictWriter; output columns are written in
# this order.
POST_FIELDS = (
    'id',
    'topic_id',
    'user_id',
    'creation_date',
    'update_date',
    'body',
    'html',
    'user_ip',
)
|
bgneal@359
|
28
|
bgneal@359
|
29
|
bgneal@359
|
def main(argv=None):
    """Search and replace within one field of a forums post .csv file.

    Reads the input .csv (its first row supplies the field names), applies a
    regular-expression substitution to the selected field of every row, and
    writes the rows to the output file in POST_FIELDS column order. No header
    row is written to the output.

    argv -- list of command-line arguments to parse; when None, optparse
            falls back to sys.argv[1:].

    Calls sys.exit() with a message when the file arguments, search pattern,
    replacement text or field name are missing or invalid. IOError raised
    while opening or reading the files is not caught here.
    """
    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
    parser.set_defaults(
        progress=False,
        field='body',
    )
    parser.add_option("-p", "--progress", action="store_true",
        help="Output a . after every 100 posts to show progress [default: %default]")
    parser.add_option("-f", "--field",
        help="Name of the field to search [default: %default]")
    parser.add_option("-s", "--search", help="The search pattern")
    parser.add_option("-r", "--replace", help="The replacement text")

    opts, args = parser.parse_args(args=argv)

    if len(args) != 2:
        sys.exit("Please supply input and output file arguments.")

    if opts.search is None:
        sys.exit("Please specify a search pattern.")
    try:
        search_re = re.compile(opts.search)
    except re.error:
        # Report a bad pattern the same way as the other argument errors
        # instead of letting a traceback escape.
        sys.exit("Error, invalid search pattern: %s" % opts.search)

    if opts.replace is None:
        sys.exit("Please specify replacement text.")

    # Binary mode is what the Python 2 csv module expects for its files.
    with open(args[0], "rb") as infile:
        reader = csv.DictReader(infile)

        # fieldnames is None when the input file is empty; treat that like
        # an unknown field rather than failing on the membership test.
        fieldnames = reader.fieldnames
        if not fieldnames or opts.field not in fieldnames:
            sys.exit("Error, invalid field option: %s" % opts.field)

        with open(args[1], "wb") as outfile:
            writer = csv.DictWriter(outfile, POST_FIELDS)

            n = 0
            for row in reader:
                row[opts.field] = search_re.sub(opts.replace, row[opts.field])
                writer.writerow(row)

                # Count the post just written and emit one dot per 100 posts,
                # but only when progress output was requested.
                n += 1
                if opts.progress and n % 100 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()

    # Terminate the line of progress dots.
    if opts.progress:
        sys.stdout.write('\n')
|
bgneal@359
|
73
|
bgneal@359
|
if __name__ == '__main__':
    # Report I/O failures and Ctrl-C as a short message via sys.exit()
    # instead of a traceback.
    try:
        main()
    except IOError as ex:
        # "except ... as" is accepted by Python 2.6+ and Python 3; the older
        # "except IOError, ex" spelling is a syntax error on Python 3.
        sys.exit("IO Error: %s" % ex)
    except KeyboardInterrupt:
        sys.exit("Control-C interrupt")
|