# HG changeset patch
# User Brian Neal
# Date 1357612904 21600
# Node ID a2358c64d9afe498ff106f1775473f84c5f3a9ed
# Parent 5a9a6d1dbf1b51d2c7313a936d8921e45c505e89
# Chapter 5.4, exercise 5: Pareto distribution and city populations.
#
# diff -r 5a9a6d1dbf1b -r a2358c64d9af ch5ex5.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/ch5ex5.py Mon Jan 07 20:41:44 2013 -0600
# @@ -0,0 +1,87 @@
"""Chapter 5.3 exercise 5 in Allen Downey's Think Complexity book.

"The distribution of populations for cities and towns has been proposed as an
example of a real-world phenomenon that can be described with a Pareto
distribution.

The U.S. Census Bureau publishes data on the population of every incorporated
city and town in the United States. I wrote a small program that downloads this
data and converts it into a convenient form. You can download it from
thinkcomplex.com/populations.py.

Read over the program to make sure you know what it does and then write
a program that computes and plots the distribution of populations for the 14593
cities and towns in the dataset.

Plot the CDF on linear and log-x scales so you can get a sense of the shape of
the distribution. Then plot the CCDF on a log-log scale to see if it has the
characteristic shape of a Pareto distribution.

What conclusion do you draw about the distribution of sizes for cities and
towns?"

"""
import sys


def read_populations(lines):
    """Parse an iterable of text lines into a list of integer populations.

    Each non-blank line must hold one integer (surrounding whitespace is
    allowed). Blank lines are skipped so that a trailing empty line in the
    data file does not abort the run.

    Raises ValueError if a non-blank line is not an integer.

    """
    return [int(line) for line in lines if line.strip()]


def empirical_cdf(values):
    """Return the empirical CDF of values as a pair of lists (xs, ps).

    xs holds the values sorted in ascending order and ps[i] is (i + 1) / n,
    the fraction of the n samples found at or before position i of that
    ordering. Repeated values each keep their own point, so in a line plot
    they show up as a vertical step. An empty input yields ([], []).

    """
    xs = sorted(values)
    count = len(xs)
    ps = [(i + 1) / float(count) for i in range(count)]
    return xs, ps


def plottable_points(x_vals, y_vals, xscale, yscale):
    """Drop the points that cannot be drawn on the requested axis scales.

    A coordinate on a 'log' axis must be strictly positive; points that
    violate this are removed explicitly here instead of being left to the
    plotting backend. Any other scale name leaves that axis unfiltered.

    Returns the surviving points as a pair of lists (xs, ys).

    """
    kept = [
        (x, y) for x, y in zip(x_vals, y_vals)
        if (xscale != 'log' or x > 0) and (yscale != 'log' or y > 0)
    ]
    return [x for x, _ in kept], [y for _, y in kept]


def _show_line_plot(x_vals, y_vals, title, label, xscale, yscale,
                    xlabel='x', ylabel='y'):
    """Clear the current figure, draw one green line plot and display it."""
    # Imported here rather than at module level so that the pure helpers in
    # this module can be imported on a machine without matplotlib.
    import matplotlib.pyplot as pyplot

    xs, ys = plottable_points(x_vals, y_vals, xscale, yscale)

    pyplot.clf()
    pyplot.xscale(xscale)
    pyplot.yscale(yscale)
    pyplot.title(title)
    pyplot.xlabel(xlabel)
    pyplot.ylabel(ylabel)
    pyplot.plot(xs, ys, label=label, color='green', linewidth=3)
    pyplot.legend(loc='upper right')
    pyplot.show()


def plot_ccdf_log_log(x_vals, y_vals, title=''):
    """Given a set of x-values and y-values from a continuous distribution, plot
    the complementary distribution (CCDF) on a log-log scale.

    y_vals are the CDF values that belong to x_vals; the plotted quantity is
    1 - y. Points whose x or 1 - y is not strictly positive (for example the
    last CDF point, where 1 - y is 0) are left out because they have no
    position on a logarithmic axis.

    Raises ValueError if x_vals and y_vals differ in length.

    """
    if len(x_vals) != len(y_vals):
        raise ValueError('x_vals and y_vals must have the same length')

    ccdf = [1.0 - y for y in y_vals]
    _show_line_plot(x_vals, ccdf, title, '1-y', 'log', 'log', ylabel='1-y')


def main(script, filename):
    """Read one population per line from filename and plot its distribution.

    Shows, one after the other, the empirical CDF on linear axes, the CDF with
    a logarithmic x-axis, and the CCDF on log-log axes. script (the program
    name from sys.argv) is accepted so that main(*sys.argv) works; it is not
    used.

    """
    with open(filename, 'r') as fp:
        populations = read_populations(fp)

    # Single-argument call form: valid both as a Python 2 print statement and
    # as a Python 3 function call.
    print('Read {} populations'.format(len(populations)))
    if not populations:
        # Nothing to plot.
        return

    # x is the population and y the cumulative probability. Plotting the
    # sorted populations against their rank index is not a CDF.
    x_vals, cdf = empirical_cdf(populations)

    _show_line_plot(x_vals, cdf, 'Populations', 'Population Linear',
                    'linear', 'linear')
    _show_line_plot(x_vals, cdf, 'Populations', 'Population Log-x',
                    'log', 'linear')
    plot_ccdf_log_log(x_vals, cdf, 'Population CCDF log-log')


if __name__ == '__main__':
    if len(sys.argv) != 2:
        sys.exit('usage: {} POPULATIONS_FILE'.format(sys.argv[0]))
    main(*sys.argv)