view zipf.py @ 31:a2358c64d9af

Chapter 5.4, exercise 5: Pareto distribution and city populations.
author Brian Neal <bgneal@gmail.com>
date Mon, 07 Jan 2013 20:41:44 -0600
parents 78116556b491
children
line wrap: on
line source
"""Exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity

"Write a program that reads a text from a file, counts word frequencies, and
prints one line for each word, in descending order of frequency. You can test it
by downloading an out-of-copyright book in plain text format from gutenberg.net.
You might want to remove punctuation from the words.  If you need some help
getting started, you can download thinkcomplex.com/Pmf.py, which provides an
object named Hist that maps from value to frequencies.

Plot the results and check whether they form a straight line. For plotting
suggestions, see Section 3.6. Can you estimate the value of s?

You can download my solution from thinkcomplex.com/Zipf.py"


"""
import argparse
import collections
import string

from matplotlib import pyplot


DESCRIPTION = """\
This program reads words from files and analyzes their frequency.
The words can be printed in descending order of frequency or plotted.

See exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity book.
"""

def word_generator(fp):
    """A generator function to produce words from a file-like object.

    """
    for line in fp:
        line = line.replace('--', ' ')
        words = line.split()
        for word in words:
            if word.endswith("'s"):
                word = word[:-2]
            word = word.lower().strip(string.punctuation)
            yield word


def process_file(fp, counter):

    word_iter = word_generator(fp)
    for word in word_iter:
        counter[word] += 1

def show_plot(counter):
    """Display a plot of log f vs. log r to demonstrate Zipf's law."""

    data = [(r + 1, pair[1]) for r, pair in enumerate(counter.most_common())]
    r_vals, f_vals = zip(*data)

    pyplot.clf()
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.title('log f vs log r')
    pyplot.xlabel('r')
    pyplot.ylabel('f')
    pyplot.plot(r_vals, f_vals, label='f vs r', color='green', linewidth=3)
    pyplot.legend(loc=4)
    pyplot.show()

def main(args=None):

    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('-p', '--plot', action='store_true', default=False,
            help='display a plot of the results instead of printing')
    parser.add_argument('files', nargs='+', type=argparse.FileType('r'),
            metavar='filename',
            help='filename to read words from')

    opts = parser.parse_args(args=args)

    counter = collections.Counter()

    for fp in opts.files:
        process_file(fp, counter)

    if opts.plot:
        show_plot(counter)
    else:
        for word, count in counter.most_common():
            print word, count

if __name__ == '__main__':
    main()