bgneal@334: """
bgneal@334: filter_posts.py - A script to filter out posts that have no parent topic in the
bgneal@334: new database. 
bgneal@334: 
bgneal@334: """
bgneal@334: from __future__ import with_statement
bgneal@334: import csv
bgneal@334: import optparse
bgneal@334: import sys
bgneal@334: 
bgneal@334: 
USAGE = "usage: %prog [options]"
DESCRIPTION = """Filters out posts that have no parent topic."""

# Number of posts read between two progress dots when --progress is given.
_PROGRESS_INTERVAL = 100


def _open_csv(path, mode):
    """Open *path* for the csv module; *mode* is "r" or "w".

    Python 2's csv module expects binary-mode files. Python 3's expects
    text-mode files opened with newline="" so that it controls line endings
    itself. latin-1 is used on Python 3 because it maps every byte to exactly
    one character, so field contents pass through unchanged whatever encoding
    the export really uses -- the same byte-transparent behaviour as Python 2.
    """
    if sys.version_info[0] < 3:
        return open(path, mode + "b")
    return open(path, mode, newline="", encoding="latin-1")


def _read_topic_ids(path):
    """Return the set of integer ids found in the first column of *path*."""
    topic_ids = set()
    with _open_csv(path, "r") as topic_file:
        # NOTE(review): unlike the posts file, no header row is skipped here;
        # every non-blank row must start with an integer -- confirm that the
        # topics export really has no header line.
        for row in csv.reader(topic_file):
            if row:     # csv yields [] for a blank line
                topic_ids.add(int(row[0]))
    return topic_ids


def main(argv=None):
    """Copy the posts whose topic id is known to the output csv file.

    The first column of every row of the topics file is collected as an
    integer topic id. The first row of the posts file is echoed to stdout and
    not copied; each following row is written to the output file only when
    the integer in its second column is one of the collected topic ids.
    Blank lines in either input are ignored.

    argv -- list of command line arguments; None makes optparse fall back to
            sys.argv[1:].

    Raises IOError if a file cannot be opened, ValueError if an id column
    does not hold an integer, and IndexError if a post row has fewer than
    two columns.

    """
    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
    parser.set_defaults(
        # Explicit default so the help text reads "[default: False]".
        progress=False,
        topic_file='forums_topic.csv',
        post_file='forums_post.csv',
        output_file='forums_post_filtered.csv',
    )
    parser.add_option("-s", "--progress", action="store_true",
        help="Output a . after every 100 posts to show progress [default: %default]")
    parser.add_option("-t", "--topic-file",
        help="Name of the topics csv file [default: %default]")
    parser.add_option("-p", "--post-file",
        help="Name of the posts csv file [default: %default]")
    parser.add_option("-o", "--output-file",
        help="Name of the output posts csv file [default: %default]")

    opts = parser.parse_args(args=argv)[0]

    topics = _read_topic_ids(opts.topic_file)

    # The posts file is opened first so that a missing input is reported
    # before the output file is created or truncated.
    with _open_csv(opts.post_file, "r") as post_file:
        with _open_csv(opts.output_file, "w") as output_file:
            writer = csv.writer(output_file)
            header_seen = False
            posts_seen = 0
            for row in csv.reader(post_file):
                if not header_seen:
                    # The first row is shown to the user and not copied.
                    header_seen = True
                    sys.stdout.write("%s\n" % (row,))
                    continue
                if not row:     # blank line in the export
                    continue

                posts_seen += 1
                if int(row[1]) in topics:
                    writer.writerow(row)

                if opts.progress and posts_seen % _PROGRESS_INTERVAL == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()  # make the dot visible immediately

            # Finish the line of dots, if any were printed.
            if opts.progress and posts_seen >= _PROGRESS_INTERVAL:
                sys.stdout.write("\n")
bgneal@334:                     
bgneal@334: 
if __name__ == '__main__':
    # Report the two expected failure modes as a one-line message instead of
    # a traceback; sys.exit() with a string prints it to stderr and exits
    # with status 1.
    try:
        main()
    except IOError:
        # The exception is fetched with sys.exc_info() because that parses on
        # every Python version, unlike "except IOError, ex" (Python 2 only)
        # or "except IOError as ex" (2.6 and later).
        sys.exit("IO Error: %s" % sys.exc_info()[1])
    except KeyboardInterrupt:
        sys.exit("Control-C interrupt")