bgneal@31
|
1 """Chapter 5.3 exercise 5 in Allen Downey's Think Complexity book.
|
bgneal@31
|
2
|
bgneal@31
|
3 "The distribution of populations for cities and towns has been proposed as an
|
bgneal@31
|
4 example of a real-world phenomenon that can be described with a Pareto
|
bgneal@31
|
5 distribution.
|
bgneal@31
|
6
|
bgneal@31
|
7 The U.S. Census Bureau publishes data on the population of every incorporated
|
bgneal@31
|
8 city and town in the United States. I wrote a small program that downloads this
|
bgneal@31
|
9 data and converts it into a convenient form. You can download it from
|
bgneal@31
|
10 thinkcomplex.com/populations.py.
|
bgneal@31
|
11
|
bgneal@31
|
12 Read over the program to make sure you know what it does and then write
|
bgneal@31
|
13 a program that computes and plots the distribution of populations for the 14593
|
bgneal@31
|
14 cities and towns in the dataset.
|
bgneal@31
|
15
|
bgneal@31
|
16 Plot the CDF on linear and log-x scales so you can get a sense of the shape of
|
bgneal@31
|
17 the distribution. Then plot the CCDF on a log-log scale to see if it has the
|
bgneal@31
|
18 characteristic shape of a Pareto distribution.
|
bgneal@31
|
19
|
bgneal@31
|
20 What conclusion do you draw about the distribution of sizes for cities and
|
bgneal@31
|
21 towns?"
|
bgneal@31
|
22
|
bgneal@31
|
23 """
|
bgneal@31
|
24 import sys
|
bgneal@31
|
25
|
bgneal@31
|
26 import matplotlib.pyplot as pyplot
|
bgneal@31
|
27
|
bgneal@31
|
28
|
bgneal@31
|
def plot_ccdf_log_log(x_vals, y_vals, title=''):
    """Plot the complementary distribution (CCDF) on a log-log scale.

    Every entry of `y_vals` is replaced by ``1.0 - y`` and drawn against the
    matching entry of `x_vals` with both axes logarithmic. The current pyplot
    figure is cleared first and the result is displayed with ``pyplot.show()``.

    x_vals -- sequence of x coordinates
    y_vals -- sequence of y values, same length as `x_vals`
              (presumably CDF values in [0, 1] -- confirm against callers)
    title  -- optional plot title

    Raises ValueError if the two sequences differ in length.

    """
    if len(x_vals) != len(y_vals):
        raise ValueError(
            'x_vals and y_vals must have the same length ({} != {})'.format(
                len(x_vals), len(y_vals)))

    # Complement of each y value; this is what ends up on the y-axis.
    ys = [1.0 - y for y in y_vals]

    pyplot.clf()
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.title(title)
    pyplot.xlabel('x')
    pyplot.ylabel('1-y')
    pyplot.plot(x_vals, ys, label='1-y', color='green', linewidth=3)
    pyplot.legend(loc='upper right')
    pyplot.show()
|
bgneal@31
|
48
|
bgneal@31
|
49
|
bgneal@31
|
def _read_populations(filename):
    """Return the integers stored one per line in `filename`, skipping blank lines."""
    with open(filename, 'r') as fp:
        return [int(line) for line in fp if line.strip()]


def _plot_populations(x_vals, y_vals, xscale, label):
    """Show `y_vals` against `x_vals` with a linear y-axis and the given x-axis scale."""
    pyplot.clf()
    pyplot.xscale(xscale)
    pyplot.yscale('linear')
    pyplot.title('Populations')
    pyplot.xlabel('x')
    pyplot.ylabel('y')
    pyplot.plot(x_vals, y_vals, label=label, color='green', linewidth=3)
    pyplot.legend(loc='upper right')
    pyplot.show()


def main(script, filename):
    """Read populations from `filename` and show three plots of them.

    The populations are sorted largest first and plotted against their index
    in that order: once on linear axes, once with a log x-axis, and finally,
    after dividing by the largest population, through plot_ccdf_log_log().

    script   -- program name (sys.argv[0]); unused
    filename -- path of a text file holding one integer population per line

    Raises ValueError if a line is not an integer, if the file holds no
    populations, or if the largest population is not positive.

    """
    y_vals = _read_populations(filename)

    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print('Read {} populations'.format(len(y_vals)))
    if not y_vals:
        raise ValueError('no populations found in {}'.format(filename))

    y_vals.sort(reverse=True)

    # x is the 0-based position in the descending order.
    # NOTE(review): position 0 has no location on a log axis, so how the
    # largest entry is drawn on the log-x plots is left to matplotlib.
    x_vals = range(len(y_vals))

    _plot_populations(x_vals, y_vals, 'linear', 'Population Linear')
    _plot_populations(x_vals, y_vals, 'log', 'Population Log-x')

    # normalize to 0-1
    max_p = y_vals[0]
    if max_p <= 0:
        raise ValueError('largest population must be positive to normalize')
    # NOTE(review): ys is each population relative to the largest one, not an
    # empirical CDF -- confirm this matches the intent of the exercise.
    ys = [y / float(max_p) for y in y_vals]

    plot_ccdf_log_log(x_vals, ys, 'Population CCDF log-log')
|
bgneal@31
|
85
|
bgneal@31
|
if __name__ == '__main__':
    # main() takes exactly the script name plus one data file path; report a
    # wrong argument count as a usage error instead of a TypeError traceback.
    if len(sys.argv) != 2:
        sys.exit('usage: {} populations_file'.format(sys.argv[0]))
    main(*sys.argv)
|