changeset 27:78116556b491

Exercise 1 in chapter 5.1 (Zipf's law).
author Brian Neal <bgneal@gmail.com>
date Sat, 05 Jan 2013 16:38:24 -0600
parents f6073c187926
children 15ff31ecec7a
files zipf.py
diffstat 1 files changed, 90 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/zipf.py	Sat Jan 05 16:38:24 2013 -0600
@@ -0,0 +1,90 @@
+"""Exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity
+
+"Write a program that reads a text from a file, counts word frequencies, and
+prints one line for each word, in descending order of frequency. You can test it
+by downloading an out-of-copyright book in plain text format from gutenberg.net.
+You might want to remove punctuation from the words.  If you need some help
+getting started, you can download thinkcomplex.com/Pmf.py, which provides an
+object named Hist that maps from value to frequencies.
+
+Plot the results and check whether they form a straight line. For plotting
+suggestions, see Section 3.6. Can you estimate the value of s?
+
+You can download my solution from thinkcomplex.com/Zipf.py"
+
+
+"""
+import argparse
+import collections
+import string
+
+from matplotlib import pyplot
+
+
+DESCRIPTION = """\
+This program reads words from files and analyzes their frequency.
+The words can be printed in descending order of frequency or plotted.
+
+See exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity book.
+"""
+
+def word_generator(fp):
+    """A generator function to produce words from a file-like object.
+
+    """
+    for line in fp:
+        line = line.replace('--', ' ')
+        words = line.split()
+        for word in words:
+            if word.endswith("'s"):
+                word = word[:-2]
+            word = word.lower().strip(string.punctuation)
+            yield word
+
+
+def process_file(fp, counter):
+
+    word_iter = word_generator(fp)
+    for word in word_iter:
+        counter[word] += 1
+
+def show_plot(counter):
+    """Display a plot of log f vs. log r to demonstrate Zipf's law."""
+
+    data = [(r + 1, pair[1]) for r, pair in enumerate(counter.most_common())]
+    r_vals, f_vals = zip(*data)
+
+    pyplot.clf()
+    pyplot.xscale('log')
+    pyplot.yscale('log')
+    pyplot.title('log f vs log r')
+    pyplot.xlabel('r')
+    pyplot.ylabel('f')
+    pyplot.plot(r_vals, f_vals, label='f vs r', color='green', linewidth=3)
+    pyplot.legend(loc=4)
+    pyplot.show()
+
+def main(args=None):
+
+    parser = argparse.ArgumentParser(description=DESCRIPTION)
+    parser.add_argument('-p', '--plot', action='store_true', default=False,
+            help='display a plot of the results instead of printing')
+    parser.add_argument('files', nargs='+', type=argparse.FileType('r'),
+            metavar='filename',
+            help='filename to read words from')
+
+    opts = parser.parse_args(args=args)
+
+    counter = collections.Counter()
+
+    for fp in opts.files:
+        process_file(fp, counter)
+
+    if opts.plot:
+        show_plot(counter)
+    else:
+        for word, count in counter.most_common():
+            print word, count
+
+if __name__ == '__main__':
+    main()