Mercurial > public > think_complexity
comparison zipf.py @ 27:78116556b491
Exercise 1 in chapter 5.1 (Zipf's law).
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sat, 05 Jan 2013 16:38:24 -0600 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
26:f6073c187926 | 27:78116556b491 |
---|---|
1 """Exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity | |
2 | |
3 "Write a program that reads a text from a file, counts word frequencies, and | |
4 prints one line for each word, in descending order of frequency. You can test it | |
5 by downloading an out-of-copyright book in plain text format from gutenberg.net. | |
6 You might want to remove punctuation from the words. If you need some help | |
7 getting started, you can download thinkcomplex.com/Pmf.py, which provides an | |
8 object named Hist that maps from value to frequencies. | |
9 | |
10 Plot the results and check whether they form a straight line. For plotting | |
11 suggestions, see Section 3.6. Can you estimate the value of s? | |
12 | |
13 You can download my solution from thinkcomplex.com/Zipf.py" | |
14 | |
15 | |
16 """ | |
17 import argparse | |
18 import collections | |
19 import string | |
20 | |
21 from matplotlib import pyplot | |
22 | |
23 | |
# Help text passed to argparse.ArgumentParser as the program description.
DESCRIPTION = """\
This program reads words from files and analyzes their frequency.
The words can be printed in descending order of frequency or plotted.

See exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity book.
"""
30 | |
def word_generator(fp):
    """Yield normalized words from a file-like object.

    Each line has double hyphens ('--') replaced by a space and is then
    split on whitespace. Every token is lower-cased, stripped of leading
    and trailing punctuation, and has a trailing possessive "'s" removed.
    Tokens that end up empty (e.g. a lone '-' or '...') are skipped so they
    are never counted as words.

    fp -- any iterable of text lines, such as an open file.

    """
    for line in fp:
        # A double hyphen stands in for a dash joining two words.
        for token in line.replace('--', ' ').split():
            # Normalize before testing for the possessive so that tokens
            # such as "John's," or "JOHN'S" are handled too.
            word = token.lower().strip(string.punctuation)
            if word.endswith("'s"):
                # Strip again in case punctuation preceded the apostrophe.
                word = word[:-2].rstrip(string.punctuation)
            if word:
                yield word
43 | |
44 | |
def process_file(fp, counter):
    """Tally every word read from fp into counter.

    fp -- a file-like object handed to word_generator().
    counter -- a mapping of word to count, updated in place. Each word's
    entry is incremented with +=, so missing keys must default to 0 (as
    they do with collections.Counter).

    """
    for token in word_generator(fp):
        counter[token] += 1
50 | |
def show_plot(counter):
    """Display a plot of log f vs. log r to demonstrate Zipf's law.

    counter -- a collections.Counter (or any object with a compatible
    most_common() method) mapping words to their frequencies.

    The most frequent word gets rank r = 1, the next r = 2, and so on; the
    frequency f is plotted against r with both axes on a log scale, and the
    figure is shown with pyplot.show(). An empty counter produces an empty
    plot instead of raising ValueError.

    """
    # most_common() yields (word, count) pairs in descending order of count.
    f_vals = [count for _, count in counter.most_common()]
    # Ranks start at 1. Building the two sequences separately (rather than
    # unpacking zip(*pairs)) keeps an empty counter from raising ValueError.
    r_vals = list(range(1, len(f_vals) + 1))

    pyplot.clf()
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.title('log f vs log r')
    pyplot.xlabel('r')
    pyplot.ylabel('f')
    pyplot.plot(r_vals, f_vals, label='f vs r', color='green', linewidth=3)
    pyplot.legend(loc=4)
    pyplot.show()
66 | |
def main(args=None):
    """Parse the command line, count words, and print or plot the result.

    args -- list of argument strings handed to argparse; None (the
    default) makes argparse read sys.argv[1:].

    With -p/--plot the word frequencies are plotted via show_plot();
    otherwise one "word count" line is printed per word, in descending
    order of frequency.

    """
    import sys

    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('-p', '--plot', action='store_true', default=False,
            help='display a plot of the results instead of printing')
    parser.add_argument('files', nargs='+', type=argparse.FileType('r'),
            metavar='filename',
            help='filename to read words from')

    opts = parser.parse_args(args=args)

    counter = collections.Counter()

    # argparse.FileType opens every file during parsing, so make sure all of
    # them are closed even if processing one of them fails.
    try:
        for fp in opts.files:
            process_file(fp, counter)
    finally:
        for fp in opts.files:
            # A '-' argument maps to sys.stdin, which is not ours to close.
            if fp is not sys.stdin:
                fp.close()

    if opts.plot:
        show_plot(counter)
    else:
        for word, count in counter.most_common():
            # The single-argument form prints the same text under Python 2's
            # print statement and Python 3's print function.
            print('%s %s' % (word, count))
88 | |
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()