"""Exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity

"Write a program that reads a text from a file, counts word frequencies, and
prints one line for each word, in descending order of frequency. You can test it
by downloading an out-of-copyright book in plain text format from gutenberg.net.
You might want to remove punctuation from the words. If you need some help
getting started, you can download thinkcomplex.com/Pmf.py, which provides an
object named Hist that maps from value to frequencies.

Plot the results and check whether they form a straight line. For plotting
suggestions, see Section 3.6. Can you estimate the value of s?

You can download my solution from thinkcomplex.com/Zipf.py"

"""
import argparse
import collections
import string
import sys

try:
    from matplotlib import pyplot
except ImportError:
    # Plotting is optional: printing the frequencies must keep working on
    # machines without matplotlib.  show_plot() reports the problem if used.
    pyplot = None


DESCRIPTION = """\
This program reads words from files and analyzes their frequency.
The words can be printed in descending order of frequency or plotted.

See exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity book.
"""


def word_generator(fp):
    """Yield normalized words from a file-like object.

    `fp` may be any iterable of text lines.  Each line is split on
    whitespace after double hyphens ('--', used as dashes in plain-text
    books) are turned into spaces.  Every token is lower-cased, stripped of
    leading/trailing punctuation and has a trailing possessive "'s" removed.
    Tokens that end up empty (e.g. a lone "-" or "...") are skipped so they
    are never reported as a word.

    """
    for line in fp:
        for token in line.replace('--', ' ').split():
            # Normalize first so that "DOG'S" and "dog's," are handled the
            # same way as a plain "dog's".
            word = token.lower().strip(string.punctuation)
            if word.endswith("'s"):
                word = word[:-2].rstrip(string.punctuation)
            if word:
                yield word


def process_file(fp, counter):
    """Add the frequency of every word found in `fp` to `counter`.

    `counter` is updated in place; it can be a collections.Counter or any
    mapping that supports ``counter[word] += 1`` for unseen words.

    """
    for word in word_generator(fp):
        counter[word] += 1


def show_plot(counter):
    """Display a plot of log f vs. log r to demonstrate Zipf's law.

    `counter` is a collections.Counter mapping words to frequencies; rank 1
    is the most common word.

    Raises ValueError if `counter` is empty and ImportError if matplotlib is
    not available.

    """
    f_vals = [count for _, count in counter.most_common()]
    if not f_vals:
        raise ValueError('no word counts to plot')
    if pyplot is None:
        raise ImportError('matplotlib is required to plot the results')

    r_vals = list(range(1, len(f_vals) + 1))

    pyplot.clf()
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.title('log f vs log r')
    pyplot.xlabel('r')
    pyplot.ylabel('f')
    pyplot.plot(r_vals, f_vals, label='f vs r', color='green', linewidth=3)
    pyplot.legend(loc=4)
    pyplot.show()


def main(args=None):
    """Command-line entry point.

    `args` is the list of command-line arguments to parse; None means use
    sys.argv.  The words of all given files are counted together and either
    printed, one "word count" line each in descending order of frequency, or
    plotted when -p/--plot is given.

    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('-p', '--plot', action='store_true', default=False,
            help='display a plot of the results instead of printing')
    parser.add_argument('files', nargs='+', type=argparse.FileType('r'),
            metavar='filename',
            help='filename to read words from')

    opts = parser.parse_args(args=args)

    counter = collections.Counter()

    try:
        for fp in opts.files:
            process_file(fp, counter)
    finally:
        # argparse opened the files for us; close them, but leave stdin
        # (filename '-') alone since we do not own it.
        for fp in opts.files:
            if fp is not sys.stdin:
                fp.close()

    if opts.plot:
        show_plot(counter)
    else:
        for word, count in counter.most_common():
            # A single parenthesized argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print('%s %s' % (word, count))


if __name__ == '__main__':
    main()