bgneal@334
|
1 """
|
bgneal@334
|
2 filter_posts.py - A script to filter out posts that have no parent topic in the
|
bgneal@334
|
3 new database.
|
bgneal@334
|
4
|
bgneal@334
|
5 """
|
bgneal@334
|
6 from __future__ import with_statement
|
bgneal@334
|
7 import csv
|
bgneal@334
|
8 import optparse
|
bgneal@334
|
9 import sys
|
bgneal@334
|
10
|
bgneal@334
|
11
|
bgneal@334
|
USAGE = "usage: %prog [options]"
DESCRIPTION = """Filters out posts that have no parent topic."""


def _open_csv(path, mode):
    """Open *path* the way the csv module needs it; *mode* is "r" or "w".

    Python 2's csv module wants binary files, while Python 3's wants text
    files opened with newline="" so that the csv module handles line endings
    itself.
    """
    if sys.version_info[0] >= 3:
        # latin-1 maps each byte to exactly one code point and back again, so
        # rows are copied through byte-for-byte whatever encoding the export
        # actually uses (mirroring Python 2's byte pass-through).
        return open(path, mode, newline="", encoding="latin-1")
    return open(path, mode + "b")


def main(argv=None):
    """Copy the posts whose topic id appears in the topics file.

    The first column of every row in the topics file is read as an integer
    topic id. The first row of the posts file is treated as a header: it is
    echoed to stdout and not written to the output. Every following post row
    is written to the output file only if its second column, read as an
    integer, is one of the collected topic ids.

    Args:
        argv: Command line arguments to parse; when None, optparse falls back
            to sys.argv[1:].

    Raises:
        IOError: If one of the files cannot be opened.
        ValueError: If a topic id column does not hold an integer.
        IndexError: If a non-blank post row has fewer than two columns.

    Requires Python 2.6 or newer (uses the next() builtin).
    """
    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
    parser.set_defaults(
        progress=False,
        topic_file='forums_topic.csv',
        post_file='forums_post.csv',
        output_file='forums_post_filtered.csv',
    )
    parser.add_option("-s", "--progress", action="store_true",
        help="Output a . after every 100 posts to show progress [default: %default]")
    parser.add_option("-t", "--topic-file",
        help="Name of the topics csv file [default: %default]")
    parser.add_option("-p", "--post-file",
        help="Name of the posts csv file [default: %default]")
    parser.add_option("-o", "--output-file",
        help="Name of the output posts csv file [default: %default]")

    opts, args = parser.parse_args(args=argv)

    # NOTE(review): unlike the posts file, no header row is skipped here;
    # confirm that the topics export really has none.
    topics = set()
    with _open_csv(opts.topic_file, "r") as topic_file:
        for row in csv.reader(topic_file):
            if row:  # csv yields [] for blank lines; they carry no id
                topics.add(int(row[0]))

    with _open_csv(opts.post_file, "r") as post_file:
        reader = csv.reader(post_file)
        # Skip the first (header) row, echoing it as before. The None default
        # keeps an empty posts file from raising StopIteration.
        header = next(reader, None)
        if header is not None:
            print(header)
        with _open_csv(opts.output_file, "w") as output_file:
            writer = csv.writer(output_file)
            seen = 0
            for row in reader:
                if not row:
                    continue
                seen += 1
                if int(row[1]) in topics:
                    writer.writerow(row)
                if opts.progress and seen % 100 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
            if opts.progress and seen >= 100:
                # Finish the line of dots so later output starts cleanly.
                sys.stdout.write('\n')
|
bgneal@334
|
51
|
bgneal@334
|
52
|
bgneal@334
|
if __name__ == '__main__':
    # Report the expected failure modes as a short message instead of a
    # traceback; sys.exit() with a string prints it to stderr and exits with
    # status 1.
    try:
        main()
    except IOError as ex:  # "as" form parses on Python 2.6+ and Python 3
        sys.exit("IO Error: %s" % ex)
    except KeyboardInterrupt:
        sys.exit("Control-C interrupt")
|