bgneal@31: """Chapter 5.3 exercise 5 in Allen Downey's Think Complexity book.
bgneal@31: 
bgneal@31: "The distribution of populations for cities and towns has been proposed as an
bgneal@31: example of a real-world phenomenon that can be described with a Pareto
bgneal@31: distribution.
bgneal@31: 
bgneal@31: The U.S. Census Bureau publishes data on the population of every incorporated
bgneal@31: city and town in the United States. I wrote a small program that downloads this
bgneal@31: data and converts it into a convenient form. You can download it from
bgneal@31: thinkcomplex.com/populations.py.
bgneal@31: 
bgneal@31: Read over the program to make sure you know what it does and then write
bgneal@31: a program that computes and plots the distribution of populations for the 14593
bgneal@31: cities and towns in the dataset.
bgneal@31: 
bgneal@31: Plot the CDF on linear and log-x scales so you can get a sense of the shape of
bgneal@31: the distribution. Then plot the CCDF on a log-log scale to see if it has the
bgneal@31: characteristic shape of a Pareto distribution.
bgneal@31: 
bgneal@31: What conclusion do you draw about the distribution of sizes for cities and
bgneal@31: towns?"
bgneal@31: 
bgneal@31: """
bgneal@31: import sys
bgneal@31: 
bgneal@31: import matplotlib.pyplot as pyplot
bgneal@31: 
bgneal@31: 
def plot_ccdf_log_log(x_vals, y_vals, title=''):
    """Plot the complementary CDF (1 - y) against x on a log-log scale.

    x_vals -- sequence of x-coordinates.
    y_vals -- sequence of CDF values, one per x-value; each y is plotted
              as 1 - y.
    title  -- optional title for the figure.

    Points whose x or 1 - y is not strictly positive are left out because
    they have no position on a logarithmic axis (this always includes the
    last point of a full CDF, where 1 - y reaches 0).

    Raises ValueError if x_vals and y_vals differ in length.

    """
    if len(x_vals) != len(y_vals):
        raise ValueError(
            'x_vals and y_vals must have the same length '
            '(got {} and {})'.format(len(x_vals), len(y_vals)))

    # Build (x, 1 - y) pairs and keep only those drawable on log axes.
    points = [(x, 1.0 - y) for x, y in zip(x_vals, y_vals)]
    points = [(x, c) for x, c in points if x > 0 and c > 0]
    xs = [x for x, _ in points]
    ccdf = [c for _, c in points]

    pyplot.clf()
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.title(title)
    pyplot.xlabel('x')
    pyplot.ylabel('1-y')
    pyplot.plot(xs, ccdf, label='1-y', color='green', linewidth=3)
    pyplot.legend(loc='upper right')
    pyplot.show()
bgneal@31: 
bgneal@31: 
def _empirical_cdf(values):
    """Return (xs, ps): the distinct values in ascending order and, for each,
    the fraction of all values that are less than or equal to it.

    """
    ordered = sorted(values)
    total = float(len(ordered))
    xs = []
    ps = []
    for count, value in enumerate(ordered, 1):
        if xs and xs[-1] == value:
            # Repeated value: keep a single point at the highest count.
            ps[-1] = count / total
        else:
            xs.append(value)
            ps.append(count / total)
    return xs, ps


def _plot_cdf(xs, ps, xscale, label):
    """Show the CDF given by xs and ps using the requested x-axis scale."""
    if xscale == 'log':
        # Non-positive x-values have no position on a logarithmic axis.
        kept = [(x, p) for x, p in zip(xs, ps) if x > 0]
        xs = [x for x, _ in kept]
        ps = [p for _, p in kept]

    pyplot.clf()
    pyplot.xscale(xscale)
    pyplot.yscale('linear')
    pyplot.title('Populations')
    pyplot.xlabel('population')
    pyplot.ylabel('CDF')
    pyplot.plot(xs, ps, label=label, color='green', linewidth=3)
    pyplot.legend(loc='lower right')
    pyplot.show()


def main(script, filename):
    """Read populations from a file and plot their distribution.

    script   -- name of the running program (sys.argv[0]); not used.
    filename -- path to a text file holding one integer population per
                line; blank lines are ignored.

    Shows three figures in turn: the CDF with a linear x-axis, the CDF with
    a log x-axis, and the CCDF on log-log axes. If the file holds no
    populations, only the count is printed and nothing is plotted.

    Raises ValueError if a non-blank line is not an integer.

    """
    with open(filename, 'r') as fp:
        # Skip blank lines (such as a trailing empty line) instead of
        # letting int('') abort the whole run.
        populations = [int(line) for line in fp if line.strip()]

    # Parenthesized single argument prints the same on Python 2 and 3.
    print('Read {} populations'.format(len(populations)))
    if not populations:
        return

    # The x-axis is population and the y-axis is cumulative probability;
    # plotting population against rank index would not be a CDF.
    xs, ps = _empirical_cdf(populations)

    _plot_cdf(xs, ps, 'linear', 'Population Linear')
    _plot_cdf(xs, ps, 'log', 'Population Log-x')

    plot_ccdf_log_log(xs, ps, 'Population CCDF log-log')
bgneal@31: 
if __name__ == '__main__':
    # main() takes exactly the script name plus one data file; report a
    # usage error rather than failing with a TypeError from the call.
    if len(sys.argv) != 2:
        sys.exit('Usage: {} POPULATIONS_FILE'.format(sys.argv[0]))
    main(*sys.argv)