think_complexity: zipf.py annotate

annotate zipf.py @ 38:d876711b88b5

Comment out the calculation of L as it may take a long time.

author	Brian Neal <bgneal@gmail.com>
date	Thu, 10 Jan 2013 20:24:56 -0600
parents	78116556b491
children

rev	line source
bgneal@27	1 """Exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity
bgneal@27	2
bgneal@27	3 "Write a program that reads a text from a file, counts word frequencies, and
bgneal@27	4 prints one line for each word, in descending order of frequency. You can test it
bgneal@27	5 by downloading an out-of-copyright book in plain text format from gutenberg.net.
bgneal@27	6 You might want to remove punctuation from the words. If you need some help
bgneal@27	7 getting started, you can download thinkcomplex.com/Pmf.py, which provides an
bgneal@27	8 object named Hist that maps from value to frequencies.
bgneal@27	9
bgneal@27	10 Plot the results and check whether they form a straight line. For plotting
bgneal@27	11 suggestions, see Section 3.6. Can you estimate the value of s?
bgneal@27	12
bgneal@27	13 You can download my solution from thinkcomplex.com/Zipf.py"
bgneal@27	14
bgneal@27	15
bgneal@27	16 """
bgneal@27	17 import argparse
bgneal@27	18 import collections
bgneal@27	19 import string
bgneal@27	20
bgneal@27	21 from matplotlib import pyplot
bgneal@27	22
bgneal@27	23
bgneal@27	24 DESCRIPTION = """\
bgneal@27	25 This program reads words from files and analyzes their frequency.
bgneal@27	26 The words can be printed in descending order of frequency or plotted.
bgneal@27	27
bgneal@27	28 See exercise 1 in Chapter 5.1 of Allen Downey's Think Complexity book.
bgneal@27	29 """
bgneal@27	30
bgneal@27	31 def word_generator(fp):
bgneal@27	32 """A generator function to produce words from a file-like object.
bgneal@27	33
bgneal@27	34 """
bgneal@27	35 for line in fp:
bgneal@27	36 line = line.replace('--', ' ')
bgneal@27	37 words = line.split()
bgneal@27	38 for word in words:
bgneal@27	39 if word.endswith("'s"):
bgneal@27	40 word = word[:-2]
bgneal@27	41 word = word.lower().strip(string.punctuation)
bgneal@27	42 yield word
bgneal@27	43
bgneal@27	44
bgneal@27	45 def process_file(fp, counter):
bgneal@27	46
bgneal@27	47 word_iter = word_generator(fp)
bgneal@27	48 for word in word_iter:
bgneal@27	49 counter[word] += 1
bgneal@27	50
bgneal@27	51 def show_plot(counter):
bgneal@27	52 """Display a plot of log f vs. log r to demonstrate Zipf's law."""
bgneal@27	53
bgneal@27	54 data = [(r + 1, pair[1]) for r, pair in enumerate(counter.most_common())]
bgneal@27	55 r_vals, f_vals = zip(*data)
bgneal@27	56
bgneal@27	57 pyplot.clf()
bgneal@27	58 pyplot.xscale('log')
bgneal@27	59 pyplot.yscale('log')
bgneal@27	60 pyplot.title('log f vs log r')
bgneal@27	61 pyplot.xlabel('r')
bgneal@27	62 pyplot.ylabel('f')
bgneal@27	63 pyplot.plot(r_vals, f_vals, label='f vs r', color='green', linewidth=3)
bgneal@27	64 pyplot.legend(loc=4)
bgneal@27	65 pyplot.show()
bgneal@27	66
bgneal@27	67 def main(args=None):
bgneal@27	68
bgneal@27	69 parser = argparse.ArgumentParser(description=DESCRIPTION)
bgneal@27	70 parser.add_argument('-p', '--plot', action='store_true', default=False,
bgneal@27	71 help='display a plot of the results instead of printing')
bgneal@27	72 parser.add_argument('files', nargs='+', type=argparse.FileType('r'),
bgneal@27	73 metavar='filename',
bgneal@27	74 help='filename to read words from')
bgneal@27	75
bgneal@27	76 opts = parser.parse_args(args=args)
bgneal@27	77
bgneal@27	78 counter = collections.Counter()
bgneal@27	79
bgneal@27	80 for fp in opts.files:
bgneal@27	81 process_file(fp, counter)
bgneal@27	82
bgneal@27	83 if opts.plot:
bgneal@27	84 show_plot(counter)
bgneal@27	85 else:
bgneal@27	86 for word, count in counter.most_common():
bgneal@27	87 print word, count
bgneal@27	88
bgneal@27	89 if __name__ == '__main__':
bgneal@27	90 main()

Mercurial > public > think_complexity

annotate zipf.py @ 38:d876711b88b5