bgneal@27
|
1 """Exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity
|
bgneal@27
|
2
|
bgneal@27
|
3 "Write a program that reads a text from a file, counts word frequencies, and
|
bgneal@27
|
4 prints one line for each word, in descending order of frequency. You can test it
|
bgneal@27
|
5 by downloading an out-of-copyright book in plain text format from gutenberg.net.
|
bgneal@27
|
6 You might want to remove punctuation from the words. If you need some help
|
bgneal@27
|
7 getting started, you can download thinkcomplex.com/Pmf.py, which provides an
|
bgneal@27
|
8 object named Hist that maps from value to frequencies.
|
bgneal@27
|
9
|
bgneal@27
|
10 Plot the results and check whether they form a straight line. For plotting
|
bgneal@27
|
11 suggestions, see Section 3.6. Can you estimate the value of s?
|
bgneal@27
|
12
|
bgneal@27
|
13 You can download my solution from thinkcomplex.com/Zipf.py"
|
bgneal@27
|
14
|
bgneal@27
|
15
|
bgneal@27
|
16 """
|
bgneal@27
|
17 import argparse
|
bgneal@27
|
18 import collections
|
bgneal@27
|
19 import string
|
bgneal@27
|
20
|
bgneal@27
|
21 from matplotlib import pyplot
|
bgneal@27
|
22
|
bgneal@27
|
23
|
bgneal@27
|
24 DESCRIPTION = """\
|
bgneal@27
|
25 This program reads words from files and analyzes their frequency.
|
bgneal@27
|
26 The words can be printed in descending order of frequency or plotted.
|
bgneal@27
|
27
|
bgneal@27
|
28 See exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity book.
|
bgneal@27
|
29 """
|
bgneal@27
|
30
|
bgneal@27
|
31 def word_generator(fp):
|
bgneal@27
|
32 """A generator function to produce words from a file-like object.
|
bgneal@27
|
33
|
bgneal@27
|
34 """
|
bgneal@27
|
35 for line in fp:
|
bgneal@27
|
36 line = line.replace('--', ' ')
|
bgneal@27
|
37 words = line.split()
|
bgneal@27
|
38 for word in words:
|
bgneal@27
|
39 if word.endswith("'s"):
|
bgneal@27
|
40 word = word[:-2]
|
bgneal@27
|
41 word = word.lower().strip(string.punctuation)
|
bgneal@27
|
42 yield word
|
bgneal@27
|
43
|
bgneal@27
|
44
|
bgneal@27
|
45 def process_file(fp, counter):
|
bgneal@27
|
46
|
bgneal@27
|
47 word_iter = word_generator(fp)
|
bgneal@27
|
48 for word in word_iter:
|
bgneal@27
|
49 counter[word] += 1
|
bgneal@27
|
50
|
bgneal@27
|
51 def show_plot(counter):
|
bgneal@27
|
52 """Display a plot of log f vs. log r to demonstrate Zipf's law."""
|
bgneal@27
|
53
|
bgneal@27
|
54 data = [(r + 1, pair[1]) for r, pair in enumerate(counter.most_common())]
|
bgneal@27
|
55 r_vals, f_vals = zip(*data)
|
bgneal@27
|
56
|
bgneal@27
|
57 pyplot.clf()
|
bgneal@27
|
58 pyplot.xscale('log')
|
bgneal@27
|
59 pyplot.yscale('log')
|
bgneal@27
|
60 pyplot.title('log f vs log r')
|
bgneal@27
|
61 pyplot.xlabel('r')
|
bgneal@27
|
62 pyplot.ylabel('f')
|
bgneal@27
|
63 pyplot.plot(r_vals, f_vals, label='f vs r', color='green', linewidth=3)
|
bgneal@27
|
64 pyplot.legend(loc=4)
|
bgneal@27
|
65 pyplot.show()
|
bgneal@27
|
66
|
bgneal@27
|
67 def main(args=None):
|
bgneal@27
|
68
|
bgneal@27
|
69 parser = argparse.ArgumentParser(description=DESCRIPTION)
|
bgneal@27
|
70 parser.add_argument('-p', '--plot', action='store_true', default=False,
|
bgneal@27
|
71 help='display a plot of the results instead of printing')
|
bgneal@27
|
72 parser.add_argument('files', nargs='+', type=argparse.FileType('r'),
|
bgneal@27
|
73 metavar='filename',
|
bgneal@27
|
74 help='filename to read words from')
|
bgneal@27
|
75
|
bgneal@27
|
76 opts = parser.parse_args(args=args)
|
bgneal@27
|
77
|
bgneal@27
|
78 counter = collections.Counter()
|
bgneal@27
|
79
|
bgneal@27
|
80 for fp in opts.files:
|
bgneal@27
|
81 process_file(fp, counter)
|
bgneal@27
|
82
|
bgneal@27
|
83 if opts.plot:
|
bgneal@27
|
84 show_plot(counter)
|
bgneal@27
|
85 else:
|
bgneal@27
|
86 for word, count in counter.most_common():
|
bgneal@27
|
87 print word, count
|
bgneal@27
|
88
|
bgneal@27
|
89 if __name__ == '__main__':
|
bgneal@27
|
90 main()
|