"""
filter_posts.py - A script to filter out posts that have no parent topic in the
new database.

"""
from __future__ import with_statement
import csv
import optparse
import sys


USAGE = "usage: %prog [options]"
DESCRIPTION = """Filters out posts that have no parent topic."""

# Number of post rows read between progress dots when --progress is given.
PROGRESS_INTERVAL = 100


def _open_csv(path, mode):
    """Open ``path`` for use with the csv module.

    ``mode`` is "r" or "w". Python 2's csv module expects binary-mode files,
    while Python 3's expects text-mode files opened with ``newline=""`` so
    that the csv module controls line endings itself.
    """
    if sys.version_info[0] < 3:
        return open(path, mode + "b")
    return open(path, mode, newline="")


def main(argv=None):
    """Copy the posts whose topic exists in the topics file to a new csv file.

    The integer in the first column of every topic row is collected into a
    set. The first row of the posts file is skipped (it is echoed to stdout
    and not copied); every following post row whose second column, read as an
    integer, is in that set is written to the output file unchanged.

    argv -- list of command line arguments; when None, optparse falls back to
            sys.argv[1:].

    Raises IOError if a file cannot be opened, and ValueError / IndexError if
    a non-blank row lacks the expected integer column.
    """
    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
    parser.set_defaults(
        progress=False,
        topic_file='forums_topic.csv',
        post_file='forums_post.csv',
        output_file='forums_post_filtered.csv',
    )
    parser.add_option("-s", "--progress", action="store_true",
        help="Output a . after every 100 posts to show progress [default: %default]")
    parser.add_option("-t", "--topic-file",
        help="Name of the topics csv file [default: %default]")
    parser.add_option("-p", "--post-file",
        help="Name of the posts csv file [default: %default]")
    parser.add_option("-o", "--output-file",
        help="Name of the output posts csv file [default: %default]")

    opts, _ = parser.parse_args(args=argv)

    topics = set()
    with _open_csv(opts.topic_file, "r") as topic_file:
        for row in csv.reader(topic_file):
            # The csv reader yields [] for blank lines; there is no id to add.
            if row:
                topics.add(int(row[0]))

    with _open_csv(opts.post_file, "r") as post_file:
        reader = csv.reader(post_file)

        # Skip the first row. next() with a default keeps an empty posts file
        # from raising StopIteration; the output is then simply empty.
        header = next(reader, None)
        if header is not None:
            sys.stdout.write("%s\n" % (header,))

        with _open_csv(opts.output_file, "w") as output_file:
            writer = csv.writer(output_file)
            for count, row in enumerate(reader, 1):
                # A blank row has no topic column, so it is dropped.
                if row and int(row[1]) in topics:
                    writer.writerow(row)
                if opts.progress and count % PROGRESS_INTERVAL == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()

    if opts.progress:
        # Finish the line of progress dots.
        sys.stdout.write("\n")
if __name__ == '__main__':
    # Report the expected failure modes as a short message instead of a
    # traceback; sys.exit() with a string prints it to stderr and exits with
    # status 1.
    try:
        main()
    except IOError as ex:
        sys.exit("IO Error: %s" % ex)
    except KeyboardInterrupt:
        sys.exit("Control-C interrupt")