comparison ch5ex5.py @ 31:a2358c64d9af

Chapter 5.4, exercise 5: Pareto distribution and city populations.
author Brian Neal <bgneal@gmail.com>
date Mon, 07 Jan 2013 20:41:44 -0600
parents
children
comparison
equal deleted inserted replaced
30:5a9a6d1dbf1b 31:a2358c64d9af
1 """Chapter 5.4 exercise 5 in Allen Downey's Think Complexity book.
2
3 "The distribution of populations for cities and towns has been proposed as an
4 example of a real-world phenomenon that can be described with a Pareto
5 distribution.
6
7 The U.S. Census Bureau publishes data on the population of every incorporated
8 city and town in the United States. I wrote a small program that downloads this
9 data and converts it into a convenient form. You can download it from
10 thinkcomplex.com/populations.py.
11
12 Read over the program to make sure you know what it does and then write
13 a program that computes and plots the distribution of populations for the 14593
14 cities and towns in the dataset.
15
16 Plot the CDF on linear and log-x scales so you can get a sense of the shape of
17 the distribution. Then plot the CCDF on a log-log scale to see if it has the
18 characteristic shape of a Pareto distribution.
19
20 What conclusion do you draw about the distribution of sizes for cities and
21 towns?"
22
23 """
24 import sys
25
26 import matplotlib.pyplot as pyplot
27
28
def plot_ccdf_log_log(x_vals, y_vals, title=''):
    """Plot the complement of a set of y-values on log-log axes.

    Given a set of x-values and y-values from a continuous distribution, plot
    the complementary distribution (CCDF), i.e. ``1 - y`` against ``x``, on a
    log-log scale. The current pyplot figure is cleared first and the plot
    is displayed with ``pyplot.show()``.

    x_vals -- sequence of x coordinates
    y_vals -- sequence of y values; must have the same length as x_vals
    title  -- optional title for the plot

    Raises ValueError if x_vals and y_vals differ in length.

    """
    if len(x_vals) != len(y_vals):
        raise ValueError(
            'x_vals and y_vals must have the same length '
            '(got {} and {})'.format(len(x_vals), len(y_vals)))

    # Complement of each value. Any point where 1 - y <= 0 (e.g. y == 1.0),
    # or where x <= 0, has no position on a logarithmic axis.
    ys = [1.0 - y for y in y_vals]

    pyplot.clf()
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.title(title)
    pyplot.xlabel('x')
    pyplot.ylabel('1-y')
    pyplot.plot(x_vals, ys, label='1-y', color='green', linewidth=3)
    pyplot.legend(loc='upper right')
    pyplot.show()
48
49
def _plot_populations(x_vals, y_vals, xscale, label):
    """Show y_vals against x_vals with a linear y-axis and the given x scale."""
    pyplot.clf()
    pyplot.xscale(xscale)
    pyplot.yscale('linear')
    pyplot.title('Populations')
    pyplot.xlabel('x')
    pyplot.ylabel('y')
    pyplot.plot(x_vals, y_vals, label=label, color='green', linewidth=3)
    pyplot.legend(loc='upper right')
    pyplot.show()


def main(script, filename):
    """Read populations from a file and show three plots of them.

    script   -- name of the running script (sys.argv[0]); unused
    filename -- path of a text file holding one integer population per line;
                blank lines are ignored

    The populations are sorted largest first and plotted against their index
    in that ordering: once on linear axes, once with a log x-axis, and finally
    (scaled by the largest population) via plot_ccdf_log_log().

    Raises ValueError if the file contains no populations, if a non-blank
    line is not an integer, or if the largest population is not positive.

    """
    with open(filename, 'r') as fp:
        # Skip blank lines (such as a trailing empty line) instead of letting
        # int() fail on them.
        y_vals = [int(line) for line in fp if line.strip()]

    # Single-argument call form prints identically on Python 2 and Python 3.
    print('Read {} populations'.format(len(y_vals)))

    # Validate before any plot window is opened.
    if not y_vals:
        raise ValueError('no populations found in {!r}'.format(filename))

    y_vals.sort(reverse=True)

    max_p = y_vals[0]
    if max_p <= 0:
        # Guards the division used for normalization below.
        raise ValueError(
            'largest population must be positive, got {}'.format(max_p))

    # x is the zero-based position in the sorted list. A logarithmic x-axis
    # has no position for x == 0, which is the largest population.
    x_vals = list(range(len(y_vals)))

    _plot_populations(x_vals, y_vals, 'linear', 'Population Linear')
    _plot_populations(x_vals, y_vals, 'log', 'Population Log-x')

    # normalize to 0-1
    # NOTE(review): ys is population / largest population indexed by rank, so
    # the curve drawn below is 1 - (relative size) against rank rather than an
    # empirical CDF of population -- confirm this is the intended reading.
    ys = [y / float(max_p) for y in y_vals]

    plot_ccdf_log_log(x_vals, ys, 'Population CCDF log-log')
85
if __name__ == '__main__':
    # main() takes exactly the script name plus one data file path; give a
    # usage message instead of a TypeError traceback for any other count.
    if len(sys.argv) != 2:
        sys.exit('usage: {} FILENAME'.format(sys.argv[0]))
    main(*sys.argv)