Mercurial > public > think_complexity
comparison ch5ex5.py @ 31:a2358c64d9af
Chapter 5.4, exercise 5: Pareto distribution and city populations.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Mon, 07 Jan 2013 20:41:44 -0600 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
30:5a9a6d1dbf1b | 31:a2358c64d9af |
---|---|
1 """Chapter 5.4 exercise 5 in Allen Downey's Think Complexity book. | |
2 | |
3 "The distribution of populations for cities and towns has been proposed as an | |
4 example of a real-world phenomenon that can be described with a Pareto | |
5 distribution. | |
6 | |
7 The U.S. Census Bureau publishes data on the population of every incorporated | |
8 city and town in the United States. I wrote a small program that downloads this | |
9 data and converts it into a convenient form. You can download it from | |
10 thinkcomplex.com/populations.py. | |
11 | |
12 Read over the program to make sure you know what it does and then write | |
13 a program that computes and plots the distribution of populations for the 14593 | |
14 cities and towns in the dataset. | |
15 | |
16 Plot the CDF on linear and log-x scales so you can get a sense of the shape of | |
17 the distribution. Then plot the CCDF on a log-log scale to see if it has the | |
18 characteristic shape of a Pareto distribution. | |
19 | |
20 What conclusion do you draw about the distribution of sizes for cities and | |
21 towns?" | |
22 | |
23 """ | |
24 import sys | |
25 | |
26 import matplotlib.pyplot as pyplot | |
27 | |
28 | |
def plot_ccdf_log_log(x_vals, y_vals, title=''):
    """Plot the complementary CDF (CCDF) of a distribution on log-log axes.

    x_vals -- the sample values, one per point
    y_vals -- the CDF evaluated at each x value; the curve drawn is 1 - y
    title  -- optional title for the figure

    Points whose x value or complement is not strictly positive are left out,
    because they have no position on a logarithmic axis.

    Raises ValueError if x_vals and y_vals have different lengths.

    """
    if len(x_vals) != len(y_vals):
        raise ValueError(
            'x_vals and y_vals must be the same length (got {} and {})'.format(
                len(x_vals), len(y_vals)))

    # Pair each x with its complement, then keep only the points a log scale
    # can represent (a CDF value of exactly 1.0 gives a complement of 0).
    points = [(x, 1.0 - y) for x, y in zip(x_vals, y_vals)]
    points = [(x, c) for x, c in points if x > 0 and c > 0]
    xs = [x for x, _ in points]
    complements = [c for _, c in points]

    pyplot.clf()
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.title(title)
    pyplot.xlabel('x')
    pyplot.ylabel('1-y')
    pyplot.plot(xs, complements, label='1-y', color='green', linewidth=3)
    pyplot.legend(loc='upper right')
    pyplot.show()
48 | |
49 | |
def _plot_cdf(x_vals, y_vals, xscale, label):
    """Show one figure of y_vals against x_vals using the given x-axis scale."""
    pyplot.clf()
    pyplot.xscale(xscale)
    pyplot.yscale('linear')
    pyplot.title('Populations')
    pyplot.xlabel('x')
    pyplot.ylabel('y')
    pyplot.plot(x_vals, y_vals, label=label, color='green', linewidth=3)
    pyplot.legend(loc='upper right')
    pyplot.show()


def main(script, filename):
    """Read city populations from a file and plot their distribution.

    script   -- the program name (sys.argv[0]); unused
    filename -- path to a text file holding one integer population per line

    Shows three figures in turn: the empirical CDF on linear axes, the same
    CDF with a logarithmic x-axis, and the CCDF on log-log axes.

    Raises ValueError if the file contains no populations.

    """
    with open(filename, 'r') as fp:
        # Skip blank lines (e.g. a trailing newline) instead of letting int()
        # fail on them.
        populations = [int(line) for line in fp if line.strip()]

    # Parenthesized so the line parses under both Python 2 and Python 3.
    print('Read {} populations'.format(len(populations)))
    if not populations:
        raise ValueError('no populations found in {}'.format(filename))

    # Empirical CDF: after an ascending sort the i-th smallest city (0-based)
    # sits at cumulative fraction (i + 1) / n.
    populations.sort()
    count = float(len(populations))
    cdf = [(i + 1) / count for i in range(len(populations))]

    _plot_cdf(populations, cdf, 'linear', 'Population Linear')
    _plot_cdf(populations, cdf, 'log', 'Population Log-x')

    plot_ccdf_log_log(populations, cdf, 'Population CCDF log-log')
85 | |
if __name__ == '__main__':
    # main() takes exactly the script name plus one data file path; check the
    # count up front so a bad invocation gets a usage message, not a TypeError.
    if len(sys.argv) != 2:
        sys.exit('usage: {} POPULATION_FILE'.format(sys.argv[0]))
    main(*sys.argv)