changeset 31:a2358c64d9af

Chapter 5.4, exercise 5: Pareto distribution and city populations.
author Brian Neal <bgneal@gmail.com>
date Mon, 07 Jan 2013 20:41:44 -0600
parents 5a9a6d1dbf1b
children a13c00c0dfe5
files ch5ex5.py
diffstat 1 files changed, 87 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ch5ex5.py	Mon Jan 07 20:41:44 2013 -0600
@@ -0,0 +1,87 @@
+"""Chapter 5.4, exercise 5 in Allen Downey's Think Complexity book.
+
+"The distribution of populations for cities and towns has been proposed as an
+example of a real-world phenomenon that can be described with a Pareto
+distribution.
+
+The U.S. Census Bureau publishes data on the population of every incorporated
+city and town in the United States. I wrote a small program that downloads this
+data and converts it into a convenient form. You can download it from
+thinkcomplex.com/populations.py.
+
+Read over the program to make sure you know what it does and then write
+a program that computes and plots the distribution of populations for the 14593
+cities and towns in the dataset.
+
+Plot the CDF on linear and log-x scales so you can get a sense of the shape of
+the distribution. Then plot the CCDF on a log-log scale to see if it has the
+characteristic shape of a Pareto distribution.
+
+What conclusion do you draw about the distribution of sizes for cities and
+towns?"
+
+"""
+import sys
+
+import matplotlib.pyplot as pyplot
+
+
+def plot_ccdf_log_log(x_vals, y_vals, title=''):
+    """Draw the complementary CDF (CCDF) of a distribution on log-log axes.
+
+    x_vals and y_vals are equal-length sequences; every y is plotted as
+    1.0 - y against its x. The current figure is cleared first and the plot
+    is shown before returning. A length mismatch raises ValueError.
+
+    """
+    if len(y_vals) != len(x_vals):
+        raise ValueError
+
+    complement = [1.0 - value for value in y_vals]
+
+    pyplot.clf()
+    pyplot.title(title)
+    pyplot.xlabel('x')
+    pyplot.ylabel('1-y')
+    pyplot.xscale('log')
+    pyplot.yscale('log')
+    pyplot.plot(x_vals, complement, label='1-y', color='green', linewidth=3)
+    pyplot.legend(loc='upper right')
+    pyplot.show()
+
+
+def _read_populations(filename):
+    """Return the populations in filename (one integer per line) as a list.
+
+    Blank lines are skipped so that an empty line (for example at the end of
+    the file) does not make int() raise ValueError.
+
+    """
+    with open(filename, 'r') as fp:
+        return [int(line) for line in fp if line.strip()]
+
+
+def _plot_cdf(x_vals, y_vals, xscale, label):
+    """Plot y_vals against x_vals with a linear y-axis and the given x-axis
+    scale ('linear' or 'log'), then show the figure.
+
+    """
+    pyplot.clf()
+    pyplot.xscale(xscale)
+    pyplot.yscale('linear')
+    pyplot.title('Populations')
+    pyplot.xlabel('x')
+    pyplot.ylabel('y')
+    pyplot.plot(x_vals, y_vals, label=label, color='green', linewidth=3)
+    pyplot.legend(loc='upper right')
+    pyplot.show()
+
+
+def main(script, filename):
+    """Plot the distribution of the populations stored in filename.
+
+    Three figures are shown in turn: the empirical CDF on linear axes, the
+    CDF with a log x-axis, and the CCDF on log-log axes.
+
+    script is the program name from sys.argv and is not used. Raises
+    ValueError if the file contains no populations.
+
+    """
+    populations = _read_populations(filename)
+    # Single-argument print() behaves the same on Python 2 and Python 3.
+    print('Read {} populations'.format(len(populations)))
+    if not populations:
+        raise ValueError('no populations found in {}'.format(filename))
+
+    # Empirical CDF: after an ascending sort, i + 1 of the n samples are less
+    # than or equal to the value at index i, so CDF(x_vals[i]) = (i + 1) / n.
+    x_vals = sorted(populations)
+    count = float(len(x_vals))
+    cdf = [(i + 1) / count for i in range(len(x_vals))]
+
+    _plot_cdf(x_vals, cdf, 'linear', 'Population Linear')
+    _plot_cdf(x_vals, cdf, 'log', 'Population Log-x')
+
+    # The last point has CCDF = 1 - 1 = 0, which has no logarithm, so it is
+    # left out of the log-log plot.
+    plot_ccdf_log_log(x_vals[:-1], cdf[:-1], 'Population CCDF log-log')
+
+# Script entry point: sys.argv is unpacked into main(script, filename), so the
+# program expects exactly one command-line argument (the populations file);
+# any other argument count makes the call raise TypeError.
+if __name__ == '__main__':
+    main(*sys.argv)