# HG changeset patch # User Brian Neal # Date 1357425504 21600 # Node ID 78116556b491bf6e25d6c580ae7b3e1684a38753 # Parent f6073c18792674e5ff373e3f1074e25aec251e26 Exercise 1 in chapter 5.1 (Zipf's law). diff -r f6073c187926 -r 78116556b491 zipf.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/zipf.py Sat Jan 05 16:38:24 2013 -0600 @@ -0,0 +1,90 @@ +"""Exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity + +"Write a program that reads a text from a file, counts word frequencies, and +prints one line for each word, in descending order of frequency. You can test it +by downloading an out-of-copyright book in plain text format from gutenberg.net. +You might want to remove punctuation from the words. If you need some help +getting started, you can download thinkcomplex.com/Pmf.py, which provides an +object named Hist that maps from value to frequencies. + +Plot the results and check whether they form a straight line. For plotting +suggestions, see Section 3.6. Can you estimate the value of s? + +You can download my solution from thinkcomplex.com/Zipf.py" + + +""" +import argparse +import collections +import string + +from matplotlib import pyplot + + +DESCRIPTION = """\ +This program reads words from files and analyzes their frequency. +The words can be printed in descending order of frequency or plotted. + +See exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity book. +""" + +def word_generator(fp): + """A generator function to produce words from a file-like object. + + """ + for line in fp: + line = line.replace('--', ' ') + words = line.split() + for word in words: + if word.endswith("'s"): + word = word[:-2] + word = word.lower().strip(string.punctuation) + yield word + + +def process_file(fp, counter): + + word_iter = word_generator(fp) + for word in word_iter: + counter[word] += 1 + +def show_plot(counter): + """Display a plot of log f vs. log r to demonstrate Zipf's law.""" + + data = [(r + 1, pair[1]) for r, pair in enumerate(counter.most_common())] + r_vals, f_vals = zip(*data) + + pyplot.clf() + pyplot.xscale('log') + pyplot.yscale('log') + pyplot.title('log f vs log r') + pyplot.xlabel('r') + pyplot.ylabel('f') + pyplot.plot(r_vals, f_vals, label='f vs r', color='green', linewidth=3) + pyplot.legend(loc=4) + pyplot.show() + +def main(args=None): + + parser = argparse.ArgumentParser(description=DESCRIPTION) + parser.add_argument('-p', '--plot', action='store_true', default=False, + help='display a plot of the results instead of printing') + parser.add_argument('files', nargs='+', type=argparse.FileType('r'), + metavar='filename', + help='filename to read words from') + + opts = parser.parse_args(args=args) + + counter = collections.Counter() + + for fp in opts.files: + process_file(fp, counter) + + if opts.plot: + show_plot(counter) + else: + for word, count in counter.most_common(): + print word, count + +if __name__ == '__main__': + main()