bgneal@27: """Exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity
bgneal@27: 
bgneal@27: "Write a program that reads a text from a file, counts word frequencies, and
bgneal@27: prints one line for each word, in descending order of frequency. You can test it
bgneal@27: by downloading an out-of-copyright book in plain text format from gutenberg.net.
bgneal@27: You might want to remove punctuation from the words.  If you need some help
bgneal@27: getting started, you can download thinkcomplex.com/Pmf.py, which provides an
bgneal@27: object named Hist that maps from value to frequencies.
bgneal@27: 
bgneal@27: Plot the results and check whether they form a straight line. For plotting
bgneal@27: suggestions, see Section 3.6. Can you estimate the value of s?
bgneal@27: 
bgneal@27: You can download my solution from thinkcomplex.com/Zipf.py"
bgneal@27: 
bgneal@27: 
bgneal@27: """
bgneal@27: import argparse
bgneal@27: import collections
bgneal@27: import string
bgneal@27: 
bgneal@27: from matplotlib import pyplot
bgneal@27: 
bgneal@27: 
# Program description passed to argparse.ArgumentParser in main().
DESCRIPTION = """\
This program reads words from files and analyzes their frequency.
The words can be printed in descending order of frequency or plotted.

See exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity book.
"""
bgneal@27: 
def word_generator(fp):
    """A generator function to produce normalized words from a file-like
    object.

    Each line is split on whitespace after every '--' has been replaced by a
    space, so words joined by a double hyphen are counted separately. Each
    token is lower-cased and stripped of leading/trailing punctuation, and a
    trailing possessive "'s" is removed. Tokens that are empty after
    normalization (e.g. a lone '-' or '***') are skipped rather than being
    yielded as the empty string.

    fp -- any iterable of text lines, such as an open file.

    """
    for line in fp:
        for token in line.replace('--', ' ').split():
            # Strip punctuation *before* looking for the possessive so that
            # tokens such as "dog's." or "dog's," are recognized too.
            word = token.lower().strip(string.punctuation)
            if word.endswith("'s"):
                # Removing the suffix can expose more punctuation, e.g.
                # "(x)'s" -> "x)" -> "x".
                word = word[:-2].strip(string.punctuation)
            if word:
                yield word
bgneal@27: 
bgneal@27: 
def process_file(fp, counter):
    """Tally every word read from fp into counter.

    fp -- file-like object handed to word_generator().
    counter -- mapping from word to count, updated in place; main() passes a
               collections.Counter.

    """
    for token in word_generator(fp):
        counter[token] += 1
bgneal@27: 
def show_plot(counter):
    """Display a plot of log f vs. log r to demonstrate Zipf's law.

    counter -- a collections.Counter mapping words to their frequencies.

    The words are ranked by descending frequency (rank 1 is the most common
    word) and frequency is plotted against rank on log-log axes. An empty
    counter produces an empty plot instead of raising an error.

    """
    # Build the two axes independently; unpacking zip(*pairs) raises
    # ValueError when there are no pairs at all.
    f_vals = [count for _, count in counter.most_common()]
    r_vals = list(range(1, len(f_vals) + 1))

    pyplot.clf()
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.title('log f vs log r')
    pyplot.xlabel('r')
    pyplot.ylabel('f')
    pyplot.plot(r_vals, f_vals, label='f vs r', color='green', linewidth=3)
    pyplot.legend(loc=4)
    pyplot.show()
bgneal@27: 
def main(args=None):
    """Parse the command line, count the words in the named files, and then
    either print the counts or plot them.

    args -- list of argument strings to parse; None (the default) lets
            argparse read them from sys.argv.

    Without -p/--plot, one "word count" line is printed per distinct word in
    descending order of frequency.

    """
    import sys

    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('-p', '--plot', action='store_true', default=False,
            help='display a plot of the results instead of printing')
    parser.add_argument('files', nargs='+', type=argparse.FileType('r'),
            metavar='filename',
            help='filename to read words from')

    opts = parser.parse_args(args=args)

    counter = collections.Counter()

    for fp in opts.files:
        try:
            process_file(fp, counter)
        finally:
            # argparse.FileType opened these files for us; release each one
            # as soon as it has been read. '-' maps to sys.stdin, which is
            # not ours to close.
            if fp is not sys.stdin:
                fp.close()

    if opts.plot:
        show_plot(counter)
    else:
        for word, count in counter.most_common():
            # Formatting first and printing a single string gives identical
            # output under both Python 2 and Python 3.
            print('%s %s' % (word, count))
bgneal@27: 
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()