annotate tools/filter_posts.py @ 387:b15726767ab8

Fixing #191; terrible performance on the combined forums RSS feed query. Use an .extra() clause to force the WHERE on a query to use the primary key.
author Brian Neal <bgneal@gmail.com>
date Sat, 19 Mar 2011 01:52:41 +0000
parents 6805d15cda13
children
rev   line source
bgneal@334 1 """
bgneal@334 2 filter_posts.py - A script to filter out posts that have no parent topic in the
bgneal@334 3 new database.
bgneal@334 4
bgneal@334 5 """
bgneal@334 6 from __future__ import with_statement
bgneal@334 7 import csv
bgneal@334 8 import optparse
bgneal@334 9 import sys
bgneal@334 10
bgneal@334 11
USAGE = "usage: %prog [options]"
DESCRIPTION = """Filters out posts that have no parent topic."""

# Number of processed posts represented by each "." when --progress is given.
_PROGRESS_INTERVAL = 100


def _open_csv(path, mode):
    """Open *path* for use with the csv module.

    *mode* is "r" or "w".  Python 2's csv module wants binary files, while
    Python 3's wants text files opened with newline="" so that the module
    controls line endings itself.
    """
    if sys.version_info[0] >= 3:
        # latin-1 maps every byte to exactly one code point, so arbitrary
        # input bytes survive the read -> filter -> write round trip
        # unchanged, just as they do with binary files on Python 2.
        return open(path, mode, newline="", encoding="latin-1")
    return open(path, mode + "b")


def main(argv=None):
    """Copy the posts whose parent topic exists to a new csv file.

    The first column of every row in the topic file is read as an integer
    topic id.  The first row of the post file is treated as a header: it is
    printed to stdout and not copied.  Every following post row whose second
    column is one of the known topic ids is written to the output file.
    Blank rows in either input file are ignored.

    argv -- list of command line arguments, without the program name; when
            None, optparse falls back to sys.argv[1:].

    Raises IOError if a file cannot be opened, ValueError if an id column is
    not an integer, and IndexError if a non-blank post row has fewer than two
    columns.
    """
    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
    parser.set_defaults(
        # Explicit default so "%default" in the help text renders sensibly.
        progress=False,
        topic_file='forums_topic.csv',
        post_file='forums_post.csv',
        output_file='forums_post_filtered.csv',
    )
    parser.add_option("-s", "--progress", action="store_true",
        help="Output a . after every 100 posts to show progress [default: %default]")
    parser.add_option("-t", "--topic-file",
        help="Name of the topics csv file [default: %default]")
    parser.add_option("-p", "--post-file",
        help="Name of the posts csv file [default: %default]")
    parser.add_option("-o", "--output-file",
        help="Name of the output posts csv file [default: %default]")

    opts, _ = parser.parse_args(args=argv)

    # Collect the ids of every topic that exists.
    topics = set()
    with _open_csv(opts.topic_file, "r") as topic_file:
        for row in csv.reader(topic_file):
            if row:
                topics.add(int(row[0]))

    with _open_csv(opts.post_file, "r") as post_file:
        reader = csv.reader(post_file)
        # Skip the first (header) row, echoing it to stdout.  An empty post
        # file has no header; it simply yields an empty output file.
        header = next(reader, None)
        if header is not None:
            print(header)

        with _open_csv(opts.output_file, "w") as output_file:
            writer = csv.writer(output_file)
            seen = 0
            for row in reader:
                if not row:
                    # csv.reader yields [] for a blank line.
                    continue
                if int(row[1]) in topics:
                    writer.writerow(row)
                seen += 1
                if opts.progress and seen % _PROGRESS_INTERVAL == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()

    if opts.progress and seen >= _PROGRESS_INTERVAL:
        # Terminate the line of progress dots.
        sys.stdout.write("\n")
bgneal@334 52
if __name__ == '__main__':
    # Script entry point: run main() and turn the expected failure modes into
    # a short message on stderr with a non-zero exit status instead of a
    # traceback.
    try:
        main()
    except IOError as ex:
        # "except ... as" is valid on Python 2.6+ and Python 3; the older
        # "except IOError, ex" spelling is a syntax error on Python 3.
        sys.exit("IO Error: %s" % ex)
    except KeyboardInterrupt:
        sys.exit("Control-C interrupt")