view ch5ex5.py @ 31:a2358c64d9af

Chapter 5.4, exercise 5: Pareto distribution and city populations.
author Brian Neal <bgneal@gmail.com>
date Mon, 07 Jan 2013 20:41:44 -0600
parents
children
line wrap: on
line source
"""Chapter 5.4 exercise 5 in Allen Downey's Think Complexity book.

"The distribution of populations for cities and towns has been proposed as an
example of a real-world phenomenon that can be described with a Pareto
distribution.

The U.S. Census Bureau publishes data on the population of every incorporated
city and town in the United States. I wrote a small program that downloads this
data and converts it into a convenient form. You can download it from
thinkcomplex.com/populations.py.

Read over the program to make sure you know what it does and then write
a program that computes and plots the distribution of populations for the 14593
cities and towns in the dataset.

Plot the CDF on linear and log-x scales so you can get a sense of the shape of
the distribution. Then plot the CCDF on a log-log scale to see if it has the
characteristic shape of a Pareto distribution.

What conclusion do you draw about the distribution of sizes for cities and
towns?"

"""
import sys

import matplotlib.pyplot as pyplot


def plot_ccdf_log_log(x_vals, y_vals, title=''):
    """Plot the complementary distribution (CCDF) on a log-log scale.

    x_vals -- sequence of x-values.
    y_vals -- sequence of cumulative values, one per x-value; each value y is
              drawn as 1 - y.
    title  -- optional title for the plot.

    Raises ValueError if x_vals and y_vals do not have the same length.

    The current pyplot figure is cleared, the curve is drawn with both axes
    set to a logarithmic scale, and pyplot.show() is called.

    """
    if len(x_vals) != len(y_vals):
        # Report both lengths so the caller can see which input is off.
        raise ValueError(
            'x_vals and y_vals must have the same length '
            '(got {} and {})'.format(len(x_vals), len(y_vals)))

    # Complement of every cumulative value. A value of exactly 1.0 becomes 0,
    # which has no position on a logarithmic axis.
    ccdf = [1.0 - y for y in y_vals]

    pyplot.clf()
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.title(title)
    pyplot.xlabel('x')
    pyplot.ylabel('1-y')
    pyplot.plot(x_vals, ccdf, label='1-y', color='green', linewidth=3)
    pyplot.legend(loc='upper right')
    pyplot.show()


def _read_populations(filename):
    """Return the populations stored in filename, one integer per line.

    Blank lines (such as a trailing empty line) are skipped. Any other line
    that cannot be parsed by int() raises ValueError.

    """
    with open(filename, 'r') as fp:
        return [int(line) for line in fp if line.strip()]


def _empirical_cdf(values):
    """Return (xs, ps) describing the empirical CDF of values.

    xs holds the distinct values in ascending order; ps[i] is the fraction of
    all values that are less than or equal to xs[i].

    """
    ordered = sorted(values)
    total = float(len(ordered))  # float so the division is never truncated

    xs = []
    ps = []
    for count, value in enumerate(ordered, 1):
        if xs and xs[-1] == value:
            # Repeated value: keep a single point and let it carry the count
            # of the last occurrence.
            ps[-1] = count / total
        else:
            xs.append(value)
            ps.append(count / total)
    return xs, ps


def _plot_cdf(x_vals, y_vals, xscale, label):
    """Show y_vals against x_vals with a linear y-axis and the given x scale."""
    pyplot.clf()
    pyplot.xscale(xscale)
    pyplot.yscale('linear')
    pyplot.title('Populations')
    pyplot.xlabel('x')
    pyplot.ylabel('y')
    pyplot.plot(x_vals, y_vals, label=label, color='green', linewidth=3)
    pyplot.legend(loc='upper right')
    pyplot.show()


def main(script, filename):
    """Read populations from filename and plot their distribution.

    script   -- name of the running script (sys.argv[0]); not used.
    filename -- path of a text file with one integer population per line.

    Three plots are shown in turn: the CDF with a linear x-axis, the CDF with
    a logarithmic x-axis, and the CCDF on log-log axes.

    Raises ValueError if the file contains no populations or a non-blank line
    is not an integer.

    """
    populations = _read_populations(filename)

    # Single-argument call form: prints the same text on Python 2 and 3.
    print('Read {} populations'.format(len(populations)))

    if not populations:
        # Fail before any plotting; there is no distribution to show.
        raise ValueError('no population data found in {!r}'.format(filename))

    # The x-axis is the population and the y-axis the fraction of cities at or
    # below it. Plotting population against rank, or scaling populations by
    # the maximum, does not give a cumulative distribution.
    x_vals, cdf = _empirical_cdf(populations)

    _plot_cdf(x_vals, cdf, 'linear', 'Population Linear')
    _plot_cdf(x_vals, cdf, 'log', 'Population Log-x')

    plot_ccdf_log_log(x_vals, cdf, 'Population CCDF log-log')

if __name__ == '__main__':
    # main() takes exactly the script name plus one filename. Check the
    # argument count here so a wrong invocation gets a usage message instead
    # of a TypeError from unpacking sys.argv.
    if len(sys.argv) != 2:
        sys.exit('usage: {} <populations-file>'.format(sys.argv[0]))
    main(*sys.argv)