comparison zipf.py @ 27:78116556b491

Exercise 1 in chapter 5.1 (Zipf's law).
author Brian Neal <bgneal@gmail.com>
date Sat, 05 Jan 2013 16:38:24 -0600
parents
children
comparison
equal deleted inserted replaced
26:f6073c187926 27:78116556b491
1 """Exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity
2
3 "Write a program that reads a text from a file, counts word frequencies, and
4 prints one line for each word, in descending order of frequency. You can test it
5 by downloading an out-of-copyright book in plain text format from gutenberg.net.
6 You might want to remove punctuation from the words. If you need some help
7 getting started, you can download thinkcomplex.com/Pmf.py, which provides an
8 object named Hist that maps from value to frequencies.
9
10 Plot the results and check whether they form a straight line. For plotting
11 suggestions, see Section 3.6. Can you estimate the value of s?
12
13 You can download my solution from thinkcomplex.com/Zipf.py"
14
15
16 """
17 import argparse
18 import collections
19 import string
20
21 from matplotlib import pyplot
22
23
# Help text shown by argparse as the program description.
DESCRIPTION = (
    "This program reads words from files and analyzes their frequency.\n"
    "The words can be printed in descending order of frequency or plotted.\n"
    "\n"
    "See exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity book.\n"
)
30
def word_generator(fp):
    """A generator function to produce words from a file-like object.

    fp -- any iterable of text lines (an open file, a list of strings, ...).

    Each line has '--' replaced by a space and is then split on whitespace.
    Every token is lower-cased, stripped of leading and trailing punctuation,
    and relieved of a trailing possessive "'s". Tokens that are empty after
    this clean-up (for example a lone '-' or '***') are skipped rather than
    yielded as an empty word.

    """
    for line in fp:
        # Treat a double dash as a word separator, not as part of a word.
        for token in line.replace('--', ' ').split():
            # Normalise first so the possessive test below also matches
            # tokens such as "Bob's." or "DOG'S".
            word = token.lower().strip(string.punctuation)
            if word.endswith("'s"):
                # Dropping the suffix may expose more punctuation, as in
                # '"john"\'s', so trim the right-hand side once more.
                word = word[:-2].rstrip(string.punctuation)
            if word:
                yield word
43
44
def process_file(fp, counter):
    """Tally every word read from fp into counter.

    fp is handed to word_generator(); counter is modified in place, with the
    entry for each produced word incremented by one. Nothing is returned.

    """
    for token in word_generator(fp):
        counter[token] += 1
50
def show_plot(counter):
    """Display a plot of log f vs. log r to demonstrate Zipf's law.

    counter -- an object providing most_common(), e.g. collections.Counter,
               mapping each word to its frequency.

    The word with the highest frequency gets rank 1, the next rank 2, and so
    on. Frequencies are plotted against ranks with both axes on a log scale.
    An empty counter yields an empty plot instead of raising ValueError.

    """
    # Build the two series separately: unpacking zip(*pairs) fails when there
    # are no pairs at all.
    f_vals = [count for _, count in counter.most_common()]
    r_vals = list(range(1, len(f_vals) + 1))

    pyplot.clf()
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.title('log f vs log r')
    pyplot.xlabel('r')
    pyplot.ylabel('f')
    pyplot.plot(r_vals, f_vals, label='f vs r', color='green', linewidth=3)
    pyplot.legend(loc=4)
    pyplot.show()
66
def main(args=None):
    """Count the words in the named files, then print or plot the result.

    args -- optional list of command-line argument strings passed straight
            to argparse's parse_args(); the default None is passed through
            unchanged.

    Words from every file are tallied into a single collections.Counter.
    With -p/--plot the frequencies are shown as a plot; otherwise one
    "word count" line is printed per word, most frequent first.

    """
    import sys  # needed only to recognise the shared stdin stream below

    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('-p', '--plot', action='store_true', default=False,
            help='display a plot of the results instead of printing')
    parser.add_argument('files', nargs='+', type=argparse.FileType('r'),
            metavar='filename',
            help='filename to read words from')

    opts = parser.parse_args(args=args)

    counter = collections.Counter()

    # argparse.FileType has already opened every file by the time parsing
    # returns, so release all of them even if processing fails part-way.
    try:
        for fp in opts.files:
            process_file(fp, counter)
    finally:
        for fp in opts.files:
            # A '-' argument maps to sys.stdin, which is not ours to close.
            if fp is not sys.stdin:
                fp.close()

    if opts.plot:
        show_plot(counter)
    else:
        for word, count in counter.most_common():
            # Printing a pre-formatted string gives the same "word count"
            # output whether print is a statement (Python 2) or a function.
            print('%s %s' % (word, count))
88
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()