Mercurial > public > think_complexity
diff zipf.py @ 27:78116556b491
Exercise 1 in chapter 5.1 (Zipf's law).
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sat, 05 Jan 2013 16:38:24 -0600 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/zipf.py Sat Jan 05 16:38:24 2013 -0600 @@ -0,0 +1,90 @@ +"""Exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity + +"Write a program that reads a text from a file, counts word frequencies, and +prints one line for each word, in descending order of frequency. You can test it +by downloading an out-of-copyright book in plain text format from gutenberg.net. +You might want to remove punctuation from the words. If you need some help +getting started, you can download thinkcomplex.com/Pmf.py, which provides an +object named Hist that maps from value to frequencies. + +Plot the results and check whether they form a straight line. For plotting +suggestions, see Section 3.6. Can you estimate the value of s? + +You can download my solution from thinkcomplex.com/Zipf.py" + + +""" +import argparse +import collections +import string + +from matplotlib import pyplot + + +DESCRIPTION = """\ +This program reads words from files and analyzes their frequency. +The words can be printed in descending order of frequency or plotted. + +See exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity book. +""" + +def word_generator(fp): + """A generator function to produce words from a file-like object. + + """ + for line in fp: + line = line.replace('--', ' ') + words = line.split() + for word in words: + if word.endswith("'s"): + word = word[:-2] + word = word.lower().strip(string.punctuation) + yield word + + +def process_file(fp, counter): + + word_iter = word_generator(fp) + for word in word_iter: + counter[word] += 1 + +def show_plot(counter): + """Display a plot of log f vs. log r to demonstrate Zipf's law.""" + + data = [(r + 1, pair[1]) for r, pair in enumerate(counter.most_common())] + r_vals, f_vals = zip(*data) + + pyplot.clf() + pyplot.xscale('log') + pyplot.yscale('log') + pyplot.title('log f vs log r') + pyplot.xlabel('r') + pyplot.ylabel('f') + pyplot.plot(r_vals, f_vals, label='f vs r', color='green', linewidth=3) + pyplot.legend(loc=4) + pyplot.show() + +def main(args=None): + + parser = argparse.ArgumentParser(description=DESCRIPTION) + parser.add_argument('-p', '--plot', action='store_true', default=False, + help='display a plot of the results instead of printing') + parser.add_argument('files', nargs='+', type=argparse.FileType('r'), + metavar='filename', + help='filename to read words from') + + opts = parser.parse_args(args=args) + + counter = collections.Counter() + + for fp in opts.files: + process_file(fp, counter) + + if opts.plot: + show_plot(counter) + else: + for word, count in counter.most_common(): + print word, count + +if __name__ == '__main__': + main()