Mercurial > public > sg101
view tools/filter_posts.py @ 467:b910cc1460c8
Add the ability to conditionally add model instances to the search index on update. This is not perfect, as some instances should be deleted from the index if they are updated such that they should not be in the index anymore. Will think about and address that later.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sun, 24 Jul 2011 18:12:20 +0000 |
parents | 6805d15cda13 |
children |
line wrap: on
line source
""" filter_posts.py - A script to filter out posts that have no parent topic in the new database. """ from __future__ import with_statement import csv import optparse import sys USAGE = "usage: %prog [options]" DESCRIPTION = """Filters out posts that have no parent topic.""" def main(argv=None): parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION) parser.set_defaults( topic_file='forums_topic.csv', post_file='forums_post.csv', output_file='forums_post_filtered.csv', ) parser.add_option("-s", "--progress", action="store_true", help="Output a . after every 100 posts to show progress [default: %default]") parser.add_option("-t", "--topic-file", help="Name of the topics csv file [default: %default]") parser.add_option("-p", "--post-file", help="Name of the posts csv file [default: %default]") parser.add_option("-o", "--output-file", help="Name of the output posts csv file [default: %default]") opts, args = parser.parse_args(args=argv) topics = set() with open(opts.topic_file, "rb") as topic_file: reader = csv.reader(topic_file) for row in reader: topics.add(int(row[0])) with open(opts.post_file, "rb") as post_file: reader = csv.reader(post_file) # skip first row print reader.next() with open(opts.output_file, "wb") as output_file: writer = csv.writer(output_file) for row in reader: topic = int(row[1]) if topic in topics: writer.writerow(row) if __name__ == '__main__': try: main() except IOError, ex: sys.exit("IO Error: %s" % ex) except KeyboardInterrupt: sys.exit("Control-C interrupt")