annotate ch5ex5.py @ 31:a2358c64d9af

Chapter 5.4, exercise 5: Pareto distribution and city populations.
author Brian Neal <bgneal@gmail.com>
date Mon, 07 Jan 2013 20:41:44 -0600
parents
children
rev   line source
bgneal@31 1 """Chapter 5.4, exercise 5 in Allen Downey's Think Complexity book.
bgneal@31 2
bgneal@31 3 "The distribution of populations for cities and towns has been proposed as an
bgneal@31 4 example of a real-world phenomenon that can be described with a Pareto
bgneal@31 5 distribution.
bgneal@31 6
bgneal@31 7 The U.S. Census Bureau publishes data on the population of every incorporated
bgneal@31 8 city and town in the United States. I wrote a small program that downloads this
bgneal@31 9 data and converts it into a convenient form. You can download it from
bgneal@31 10 thinkcomplex.com/populations.py.
bgneal@31 11
bgneal@31 12 Read over the program to make sure you know what it does and then write
bgneal@31 13 a program that computes and plots the distribution of populations for the 14593
bgneal@31 14 cities and towns in the dataset.
bgneal@31 15
bgneal@31 16 Plot the CDF on linear and log-x scales so you can get a sense of the shape of
bgneal@31 17 the distribution. Then plot the CCDF on a log-log scale to see if it has the
bgneal@31 18 characteristic shape of a Pareto distribution.
bgneal@31 19
bgneal@31 20 What conclusion do you draw about the distribution of sizes for cities and
bgneal@31 21 towns?"
bgneal@31 22
bgneal@31 23 """
bgneal@31 24 import sys
bgneal@31 25
bgneal@31 26 import matplotlib.pyplot as pyplot
bgneal@31 27
bgneal@31 28
def plot_ccdf_log_log(x_vals, y_vals, title=''):
    """Plot the complementary distribution (CCDF) on a log-log scale.

    x_vals -- sequence of x-coordinates
    y_vals -- sequence of cumulative values, one per entry of x_vals; each
              value y is drawn as 1.0 - y
    title  -- optional title for the plot

    Raises ValueError if x_vals and y_vals do not have the same length.

    The current pyplot figure is cleared, the curve is drawn and the result
    is displayed with pyplot.show().

    """
    if len(x_vals) != len(y_vals):
        raise ValueError(
            'x_vals and y_vals must have the same length '
            '(got {} and {})'.format(len(x_vals), len(y_vals)))

    # Complement of each cumulative value.
    # NOTE(review): a log axis cannot represent values <= 0, so any point
    # with x <= 0 or y >= 1.0 will not appear on the plot.
    ccdf = [1.0 - y for y in y_vals]

    pyplot.clf()
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.title(title)
    pyplot.xlabel('x')
    pyplot.ylabel('1-y')
    pyplot.plot(x_vals, ccdf, label='1-y', color='green', linewidth=3)
    pyplot.legend(loc='upper right')
    pyplot.show()
bgneal@31 48
bgneal@31 49
def _read_populations(filename):
    """Return the integers stored one per line in the file `filename`."""
    with open(filename, 'r') as fp:
        # Skip blank lines (for example a trailing empty line) so that they
        # do not make int() raise.
        return [int(line) for line in fp if line.strip()]


def _plot_populations(x_vals, y_vals, xscale, label):
    """Show y_vals against x_vals with the given x-axis scale and legend."""
    pyplot.clf()
    pyplot.xscale(xscale)
    pyplot.yscale('linear')
    pyplot.title('Populations')
    pyplot.xlabel('x')
    pyplot.ylabel('y')
    pyplot.plot(x_vals, y_vals, label=label, color='green', linewidth=3)
    pyplot.legend(loc='upper right')
    pyplot.show()


def main(script, filename):
    """Read city populations from `filename` and display three plots.

    script   -- name of the running script (sys.argv[0]); unused
    filename -- path to a text file holding one integer population per line

    The populations are sorted from largest to smallest and plotted against
    their position in that ordering, first with a linear x-axis and then with
    a logarithmic x-axis. Finally the populations are divided by the largest
    one and handed to plot_ccdf_log_log().

    Raises ValueError if the file contains no populations, if a line is not
    an integer, or if the largest population is not positive.

    """
    y_vals = _read_populations(filename)

    # Single-argument call form: prints the same text on Python 2 and 3.
    print('Read {} populations'.format(len(y_vals)))

    # Validate before any window is shown, so bad input fails immediately
    # with a clear message instead of part-way through the plots.
    if not y_vals:
        raise ValueError('no populations found in {!r}'.format(filename))

    y_vals.sort(reverse=True)
    max_p = y_vals[0]
    if max_p <= 0:
        raise ValueError(
            'largest population must be positive, got {}'.format(max_p))

    # Position of each city in the descending ordering (0 = largest).
    # NOTE(review): position 0 cannot be drawn on a logarithmic x-axis, and
    # the exercise text asks for a CDF whereas these are population-vs-rank
    # plots; confirm whether that is intended.
    x_vals = list(range(len(y_vals)))

    _plot_populations(x_vals, y_vals, 'linear', 'Population Linear')
    _plot_populations(x_vals, y_vals, 'log', 'Population Log-x')

    # normalize to 0-1
    ys = [y / float(max_p) for y in y_vals]

    plot_ccdf_log_log(x_vals, ys, 'Population CCDF log-log')
bgneal@31 85
if __name__ == '__main__':
    # main() takes exactly (script, filename); check the argument count up
    # front so a missing or extra argument produces a usage message rather
    # than a TypeError traceback.
    if len(sys.argv) != 2:
        sys.exit('usage: {} <populations-file>'.format(sys.argv[0]))
    main(*sys.argv)