comparison tools/filter_posts.py @ 334:6805d15cda13

Adding a script I had to write on the fly to filter out posts from the posts csv file that had no parent topics. MyISAM let me get away with that, but InnoDB won't.
author Brian Neal <bgneal@gmail.com>
date Sat, 26 Feb 2011 01:28:22 +0000
parents
children
comparison
equal deleted inserted replaced
333:0bf5a5677067 334:6805d15cda13
1 """
2 filter_posts.py - A script to filter out posts that have no parent topic in the
3 new database.
4
5 """
6 from __future__ import with_statement
7 import csv
8 import optparse
9 import sys
10
11
USAGE = "usage: %prog [options]"
DESCRIPTION = """Filters out posts that have no parent topic."""

# Number of post rows read between progress dots when --progress is given.
_PROGRESS_INTERVAL = 100


def _open_csv(path, mode):
    """Open *path* the way the csv module needs it on the running Python.

    mode is "r" or "w".  Python 2's csv module works on binary-mode files,
    while Python 3's works on text-mode files opened with newline=''.
    """
    if sys.version_info[0] >= 3:
        # latin-1 maps every byte to exactly one code point, so rows are
        # copied through byte-for-byte whatever encoding the dump uses,
        # matching what the binary-mode Python 2 path does.
        return open(path, mode, newline='', encoding='latin-1')
    return open(path, mode + 'b')


def main(argv=None):
    """Copy the posts whose topic id appears in the topics file.

    The first column of every row in the topics csv file is read as an
    integer topic id.  The first row of the posts csv file is not filtered
    or copied; it is echoed to stdout instead.  Every remaining posts row
    whose second column (read as an integer) is a known topic id is written
    to the output csv file.  Blank csv lines in either input are ignored.

    argv -- list of command line arguments; None means use sys.argv[1:].

    Raises IOError if a file cannot be opened, ValueError if an id column
    is not an integer, and IndexError if a non-blank posts row has fewer
    than two columns.
    """
    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
    parser.set_defaults(
        # Explicit default so the help text shows "False" instead of "None".
        progress=False,
        topic_file='forums_topic.csv',
        post_file='forums_post.csv',
        output_file='forums_post_filtered.csv',
    )
    parser.add_option("-s", "--progress", action="store_true",
        help="Output a . after every 100 posts to show progress [default: %default]")
    parser.add_option("-t", "--topic-file",
        help="Name of the topics csv file [default: %default]")
    parser.add_option("-p", "--post-file",
        help="Name of the posts csv file [default: %default]")
    parser.add_option("-o", "--output-file",
        help="Name of the output posts csv file [default: %default]")

    opts, _ = parser.parse_args(args=argv)

    # Collect the ids of every topic that exists.
    topics = set()
    with _open_csv(opts.topic_file, "r") as topic_file:
        for row in csv.reader(topic_file):
            # csv.reader yields [] for a blank line; there is no id to read.
            if row:
                topics.add(int(row[0]))

    with _open_csv(opts.post_file, "r") as post_file:
        reader = csv.reader(post_file)
        # Skip the first row.  The None default keeps an empty posts file
        # from escaping as an uncaught StopIteration.
        first_row = next(reader, None)
        if first_row is not None:
            print(first_row)
        with _open_csv(opts.output_file, "w") as output_file:
            writer = csv.writer(output_file)
            for count, row in enumerate(reader, 1):
                if row and int(row[1]) in topics:
                    writer.writerow(row)
                if opts.progress and count % _PROGRESS_INTERVAL == 0:
                    sys.stdout.write(".")
                    # Flush so the dot shows up immediately.
                    sys.stdout.flush()
            if opts.progress:
                # Terminate the line of progress dots.
                sys.stdout.write("\n")
51
52
if __name__ == '__main__':
    # Script entry point: turn the expected failure modes into a short
    # message and a non-zero exit status instead of a traceback.
    try:
        main()
    except IOError as ex:
        # "except ... as" is valid on Python 2.6+ and Python 3; the older
        # "except IOError, ex" form is a syntax error on Python 3.
        sys.exit("IO Error: %s" % ex)
    except KeyboardInterrupt:
        sys.exit("Control-C interrupt")