{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Zip code distribution analysis for \"geochunk\" algorithm\n", "\n", "Our goal is to group zip codes by variable-length prefixes in such a way that the chunk sizes tend to be roughly equal, with no giant leftover chunks.\n", "\n", "But first, we need to import some standard Python data libraries." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We have some zip code census data from [this blog post](https://blog.splitwise.com/2013/09/18/the-2010-us-census-population-by-zip-code-totally-free/) that we want to load and examine. We need to do this `np.object_` business to force the CSV parser to treat zip codes as strings rather than as integers (which would lose the leading zeroes)." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Zip Code ZCTA | \n", "2010 Census Population | \n", "
---|---|---|
0 | \n", "01001 | \n", "16769 | \n", "
1 | \n", "01002 | \n", "29049 | \n", "
2 | \n", "01003 | \n", "10372 | \n", "
3 | \n", "01005 | \n", "5079 | \n", "
4 | \n", "01007 | \n", "14649 | \n", "
5 | \n", "01008 | \n", "1263 | \n", "
6 | \n", "01009 | \n", "741 | \n", "
7 | \n", "01010 | \n", "3609 | \n", "
8 | \n", "01011 | \n", "1370 | \n", "
9 | \n", "01012 | \n", "661 | \n", "
\n", " | Zip Code ZCTA | \n", "2010 Census Population | \n", "
---|---|---|
20640 | \n", "60629 | \n", "113916 | \n", "
27727 | \n", "79936 | \n", "111086 | \n", "
2748 | \n", "11368 | \n", "109931 | \n", "
30109 | \n", "90650 | \n", "105549 | \n", "
29986 | \n", "90011 | \n", "103892 | \n", "
30192 | \n", "91331 | \n", "103689 | \n", "
2720 | \n", "11226 | \n", "101572 | \n", "
30045 | \n", "90201 | \n", "101279 | \n", "
2753 | \n", "11373 | \n", "100820 | \n", "
2714 | \n", "11220 | \n", "99598 | \n", "
2759 | \n", "11385 | \n", "98592 | \n", "
2527 | \n", "10467 | \n", "97060 | \n", "
30460 | \n", "92335 | \n", "95397 | \n", "
26671 | \n", "77084 | \n", "95137 | \n", "
2451 | \n", "10025 | \n", "94600 | \n", "
2702 | \n", "11208 | \n", "94469 | \n", "
30075 | \n", "90280 | \n", "94396 | \n", "
26773 | \n", "77449 | \n", "94382 | \n", "
2729 | \n", "11236 | \n", "93877 | \n", "
27192 | \n", "78521 | \n", "93818 | \n", "
\n", " | Zip Code ZCTA | \n", "2010 Census Population | \n", "
---|---|---|
75 | \n", "01199 | \n", "0 | \n", "
383 | \n", "02203 | \n", "0 | \n", "
1423 | \n", "05481 | \n", "0 | \n", "
1558 | \n", "06020 | \n", "0 | \n", "
1580 | \n", "06059 | \n", "0 | \n", "
\n", " | Zip Prefix | \n", "2010 Census Population | \n", "
---|---|---|
533 | \n", "1121 | \n", "710148 | \n", "
534 | \n", "1122 | \n", "621986 | \n", "
498 | \n", "1046 | \n", "605938 | \n", "
3773 | \n", "6062 | \n", "605797 | \n", "
535 | \n", "1123 | \n", "588379 | \n", "
532 | \n", "1120 | \n", "584187 | \n", "
497 | \n", "1045 | \n", "560238 | \n", "
3772 | \n", "6061 | \n", "543914 | \n", "
481 | \n", "1002 | \n", "505290 | \n", "
3774 | \n", "6063 | \n", "497137 | \n", "
\n", " | Zip Prefix | \n", "2010 Census Population | \n", "
---|---|---|
699 | \n", "770 | \n", "2906700 | \n", "
555 | \n", "606 | \n", "2680484 | \n", "
91 | \n", "112 | \n", "2504700 | \n", "
803 | \n", "900 | \n", "2404395 | \n", "
843 | \n", "945 | \n", "2274102 | \n", "
276 | \n", "300 | \n", "2177710 | \n", "
679 | \n", "750 | \n", "2079278 | \n", "
818 | \n", "917 | \n", "1924074 | \n", "
306 | \n", "331 | \n", "1885845 | \n", "
59 | \n", "070 | \n", "1683417 | \n", "
\n", " | Zip Prefix | \n", "2010 Census Population | \n", "
---|---|---|
31 | \n", "33 | \n", "9285786 | \n", "
90 | \n", "92 | \n", "9033701 | \n", "
58 | \n", "60 | \n", "8783167 | \n", "
9 | \n", "11 | \n", "7579159 | \n", "
28 | \n", "30 | \n", "7254703 | \n", "
75 | \n", "77 | \n", "7012758 | \n", "
93 | \n", "95 | \n", "6786586 | \n", "
46 | \n", "48 | \n", "6634334 | \n", "
76 | \n", "78 | \n", "6358782 | \n", "
30 | \n", "32 | \n", "6263654 | \n", "
\n", " | Zip Prefix | \n", "2010 Census Population | \n", "
---|---|---|
67 | \n", "69 | \n", "184566 | \n", "
49 | \n", "51 | \n", "486400 | \n", "
80 | \n", "82 | \n", "520260 | \n", "
86 | \n", "88 | \n", "620867 | \n", "
4 | \n", "05 | \n", "625741 | \n", "
84 | \n", "86 | \n", "644295 | \n", "
56 | \n", "58 | \n", "671781 | \n", "
24 | \n", "26 | \n", "795935 | \n", "
55 | \n", "57 | \n", "813464 | \n", "
79 | \n", "81 | \n", "832537 | \n", "
\n", " | Zip Prefix | \n", "2010 Census Population | \n", "
---|---|---|
0 | \n", "0 | \n", "23236989 | \n", "
1 | \n", "1 | \n", "32978141 | \n", "
2 | \n", "2 | \n", "30390138 | \n", "
3 | \n", "3 | \n", "42575790 | \n", "
4 | \n", "4 | \n", "32249466 | \n", "
5 | \n", "5 | \n", "16513346 | \n", "
6 | \n", "6 | \n", "23499661 | \n", "
7 | \n", "7 | \n", "36345281 | \n", "
8 | \n", "8 | \n", "21075552 | \n", "
9 | \n", "9 | \n", "49875567 | \n", "
\n", " | 2010 Census Population | \n", "Prefixes | \n", "
---|---|---|
Chunk | \n", "\n", " | \n", " |
010_0 | \n", "77922 | \n", "[0100] | \n", "
010_1 | \n", "96062 | \n", "[0101, 0102] | \n", "
010_2 | \n", "76753 | \n", "[0103, 0104] | \n", "
010_3 | \n", "73002 | \n", "[0105, 0106] | \n", "
010_4 | \n", "37535 | \n", "[0107] | \n", "
010_5 | \n", "88283 | \n", "[0108] | \n", "
010_6 | \n", "19818 | \n", "[0109] | \n", "
0110_0 | \n", "92014 | \n", "[01100, 01101, 01102, 01103, 01104, 01105, 011... | \n", "
0110_1 | \n", "30250 | \n", "[01109] | \n", "
011_0 | \n", "46571 | \n", "[0111, 0112, 0113, 0114, 0115, 0116, 0117, 011... | \n", "
012_0 | \n", "89837 | \n", "[0120, 0121, 0122, 0123] | \n", "
012_1 | \n", "40852 | \n", "[0124, 0125, 0126, 0127, 0128, 0129] | \n", "
014_0 | \n", "82626 | \n", "[0140, 0141, 0142, 0143, 0144] | \n", "
014_1 | \n", "60658 | \n", "[0145] | \n", "
014_2 | \n", "68528 | \n", "[0146, 0147, 0148, 0149] | \n", "
015_0 | \n", "82551 | \n", "[0150, 0151] | \n", "
015_1 | \n", "85393 | \n", "[0152, 0153] | \n", "
015_2 | \n", "76897 | \n", "[0154, 0155] | \n", "
015_3 | \n", "79853 | \n", "[0156, 0157] | \n", "
015_4 | \n", "49025 | \n", "[0158, 0159] | \n", "
0160_0 | \n", "77412 | \n", "[01600, 01601, 01602, 01603, 01604] | \n", "
0160_1 | \n", "79958 | \n", "[01605, 01606, 01607, 01608, 01609] | \n", "
016_0 | \n", "31032 | \n", "[0161, 0162, 0163, 0164, 0165, 0166, 0167, 016... | \n", "
017_0 | \n", "73888 | \n", "[0170, 0171] | \n", "
017_1 | \n", "52571 | \n", "[0172, 0173] | \n", "
017_2 | \n", "81318 | \n", "[0174] | \n", "
017_3 | \n", "82505 | \n", "[0175] | \n", "
017_4 | \n", "88745 | \n", "[0176, 0177, 0178, 0179] | \n", "
0184_0 | \n", "76383 | \n", "[01840, 01841, 01842, 01843] | \n", "
0184_1 | \n", "75601 | \n", "[01844, 01845, 01846, 01847, 01848, 01849] | \n", "
... | \n", "... | \n", "... | \n", "
988_1 | \n", "68451 | \n", "[9882, 9883] | \n", "
988_2 | \n", "58751 | \n", "[9884, 9885, 9886, 9887, 9888, 9889] | \n", "
9890_0 | \n", "91008 | \n", "[98900, 98901, 98902, 98903, 98904, 98905, 989... | \n", "
9890_1 | \n", "35240 | \n", "[98908, 98909] | \n", "
989_0 | \n", "77532 | \n", "[9891, 9892, 9893] | \n", "
989_1 | \n", "80875 | \n", "[9894, 9895, 9896, 9897, 9898, 9899] | \n", "
990_0 | \n", "82533 | \n", "[9900, 9901] | \n", "
990_1 | \n", "59475 | \n", "[9902, 9903, 9904, 9905, 9906, 9907, 9908, 9909] | \n", "
991_0 | \n", "67655 | \n", "[9910, 9911, 9912, 9913, 9914, 9915] | \n", "
991_1 | \n", "47583 | \n", "[9916, 9917, 9918, 9919] | \n", "
9920_0 | \n", "61795 | \n", "[99200, 99201, 99202, 99203, 99204] | \n", "
9920_1 | \n", "76939 | \n", "[99205, 99206] | \n", "
9920_2 | \n", "80047 | \n", "[99207, 99208, 99209] | \n", "
992_0 | \n", "76513 | \n", "[9921] | \n", "
992_1 | \n", "47562 | \n", "[9922, 9923, 9924, 9925, 9926, 9927, 9928, 9929] | \n", "
993_0 | \n", "99875 | \n", "[9930, 9931, 9932] | \n", "
993_1 | \n", "91450 | \n", "[9933] | \n", "
993_2 | \n", "33446 | \n", "[9934] | \n", "
993_3 | \n", "79378 | \n", "[9935] | \n", "
993_4 | \n", "44835 | \n", "[9936, 9937, 9938, 9939] | \n", "
9950_0 | \n", "97248 | \n", "[99500, 99501, 99502, 99503, 99504] | \n", "
9950_1 | \n", "87630 | \n", "[99505, 99506, 99507, 99508, 99509] | \n", "
995_0 | \n", "97276 | \n", "[9951, 9952, 9953, 9954, 9955, 9956] | \n", "
995_1 | \n", "38506 | \n", "[9957, 9958, 9959] | \n", "
996_0 | \n", "78544 | \n", "[9960, 9961, 9962, 9963, 9964] | \n", "
996_1 | \n", "89277 | \n", "[9965, 9966, 9967] | \n", "
996_2 | \n", "16262 | \n", "[9968, 9969] | \n", "
997_0 | \n", "96393 | \n", "[9970, 9971] | \n", "
997_1 | \n", "37796 | \n", "[9972, 9973, 9974, 9975, 9976, 9977, 9978, 9979] | \n", "
99_0 | \n", "92694 | \n", "[994, 998, 999] | \n", "
4444 rows × 2 columns
\n", "\n", " | 2010 Census Population | \n", "Prefixes | \n", "
---|---|---|
Chunk | \n", "\n", " | \n", " |
010_0 | \n", "210857 | \n", "[0100, 0101, 0102, 0103] | \n", "
010_1 | \n", "238700 | \n", "[0104, 0105, 0106, 0107, 0108] | \n", "
010_2 | \n", "19818 | \n", "[0109] | \n", "
015_0 | \n", "244841 | \n", "[0150, 0151, 0152, 0153, 0154, 0155] | \n", "
015_1 | \n", "128878 | \n", "[0156, 0157, 0158, 0159] | \n", "
017_0 | \n", "207777 | \n", "[0170, 0171, 0172, 0173, 0174] | \n", "
017_1 | \n", "171250 | \n", "[0175, 0176, 0177, 0178, 0179] | \n", "
018_0 | \n", "184413 | \n", "[0180, 0181, 0182] | \n", "
018_1 | \n", "227505 | \n", "[0183, 0184] | \n", "
018_2 | \n", "210817 | \n", "[0185, 0186, 0187] | \n", "
018_3 | \n", "90391 | \n", "[0188, 0189] | \n", "
019_0 | \n", "231079 | \n", "[0190, 0191, 0192] | \n", "
019_1 | \n", "245235 | \n", "[0193, 0194, 0195, 0196, 0197, 0198, 0199] | \n", "
01_0 | \n", "168835 | \n", "[011] | \n", "
01_1 | \n", "213800 | \n", "[012, 013] | \n", "
01_2 | \n", "211812 | \n", "[014] | \n", "
01_3 | \n", "188402 | \n", "[016] | \n", "
020_0 | \n", "184275 | \n", "[0200, 0201, 0202, 0203, 0204] | \n", "
020_1 | \n", "216096 | \n", "[0205, 0206, 0207, 0208, 0209] | \n", "
0212_0 | \n", "243585 | \n", "[02120, 02121, 02122, 02123, 02124, 02125, 021... | \n", "
0212_1 | \n", "16439 | \n", "[02129] | \n", "
0213_0 | \n", "220173 | \n", "[02130, 02131, 02132, 02133, 02134, 02135, 021... | \n", "
0213_1 | \n", "36349 | \n", "[02139] | \n", "
021_0 | \n", "136539 | \n", "[0210, 0211] | \n", "
021_1 | \n", "207555 | \n", "[0214] | \n", "
021_2 | \n", "220030 | \n", "[0215, 0216] | \n", "
021_3 | \n", "203272 | \n", "[0217, 0218, 0219] | \n", "
023_0 | \n", "241125 | \n", "[0230, 0231, 0232, 0233, 0234] | \n", "
023_1 | \n", "199938 | \n", "[0235, 0236, 0237, 0238, 0239] | \n", "
024_0 | \n", "242552 | \n", "[0240, 0241, 0242, 0243, 0244, 0245, 0246] | \n", "
... | \n", "... | \n", "... | \n", "
981_3 | \n", "207457 | \n", "[9815, 9816, 9817, 9818, 9819] | \n", "
982_0 | \n", "168159 | \n", "[9820, 9821] | \n", "
982_1 | \n", "230928 | \n", "[9822, 9823] | \n", "
982_2 | \n", "138281 | \n", "[9824, 9825, 9826] | \n", "
982_3 | \n", "249350 | \n", "[9827, 9828] | \n", "
982_4 | \n", "90767 | \n", "[9829] | \n", "
983_0 | \n", "240218 | \n", "[9830, 9831, 9832, 9833, 9834] | \n", "
983_1 | \n", "148730 | \n", "[9835, 9836] | \n", "
983_2 | \n", "162860 | \n", "[9837] | \n", "
983_3 | \n", "161800 | \n", "[9838, 9839] | \n", "
984_0 | \n", "214804 | \n", "[9840, 9841, 9842, 9843] | \n", "
984_1 | \n", "185311 | \n", "[9844, 9845, 9846, 9847, 9848, 9849] | \n", "
985_0 | \n", "241456 | \n", "[9850, 9851, 9852] | \n", "
985_1 | \n", "234098 | \n", "[9853, 9854, 9855, 9856, 9857, 9858, 9859] | \n", "
986_0 | \n", "227225 | \n", "[9860, 9861, 9862, 9863, 9864, 9865] | \n", "
986_1 | \n", "191810 | \n", "[9866, 9867] | \n", "
986_2 | \n", "154295 | \n", "[9868, 9869] | \n", "
989_0 | \n", "203780 | \n", "[9890, 9891, 9892, 9893] | \n", "
989_1 | \n", "80875 | \n", "[9894, 9895, 9896, 9897, 9898, 9899] | \n", "
98_0 | \n", "219761 | \n", "[987, 988] | \n", "
992_0 | \n", "218781 | \n", "[9920] | \n", "
992_1 | \n", "124075 | \n", "[9921, 9922, 9923, 9924, 9925, 9926, 9927, 992... | \n", "
993_0 | \n", "224771 | \n", "[9930, 9931, 9932, 9933, 9934] | \n", "
993_1 | \n", "124213 | \n", "[9935, 9936, 9937, 9938, 9939] | \n", "
995_0 | \n", "184878 | \n", "[9950] | \n", "
995_1 | \n", "135782 | \n", "[9951, 9952, 9953, 9954, 9955, 9956, 9957, 995... | \n", "
99_0 | \n", "142008 | \n", "[990] | \n", "
99_1 | \n", "136934 | \n", "[991, 994] | \n", "
99_2 | \n", "184083 | \n", "[996] | \n", "
99_3 | \n", "205187 | \n", "[997, 998, 999] | \n", "
1780 rows × 2 columns
\n", "\n", " | 2010 Census Population | \n", "Prefixes | \n", "
---|---|---|
Chunk | \n", "\n", " | \n", " |
018_0 | \n", "411918 | \n", "[0180, 0181, 0182, 0183, 0184] | \n", "
018_1 | \n", "301208 | \n", "[0185, 0186, 0187, 0188, 0189] | \n", "
01_0 | \n", "469375 | \n", "[010] | \n", "
01_1 | \n", "382635 | \n", "[011, 012, 013] | \n", "
01_2 | \n", "211812 | \n", "[014] | \n", "
01_3 | \n", "373719 | \n", "[015] | \n", "
01_4 | \n", "188402 | \n", "[016] | \n", "
01_5 | \n", "379027 | \n", "[017] | \n", "
01_6 | \n", "476314 | \n", "[019] | \n", "
021_0 | \n", "396563 | \n", "[0210, 0211, 0212] | \n", "
021_1 | \n", "464077 | \n", "[0213, 0214] | \n", "
021_2 | \n", "423302 | \n", "[0215, 0216, 0217, 0218, 0219] | \n", "
027_0 | \n", "461337 | \n", "[0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277] | \n", "
027_1 | \n", "64999 | \n", "[0278, 0279] | \n", "
028_0 | \n", "451970 | \n", "[0280, 0281, 0282, 0283, 0284, 0285, 0286, 0287] | \n", "
028_1 | \n", "221007 | \n", "[0288, 0289] | \n", "
02_0 | \n", "428586 | \n", "[020, 022] | \n", "
02_1 | \n", "441063 | \n", "[023] | \n", "
02_2 | \n", "408771 | \n", "[024] | \n", "
02_3 | \n", "264504 | \n", "[025, 026] | \n", "
02_4 | \n", "379604 | \n", "[029] | \n", "
03_0 | \n", "377547 | \n", "[030] | \n", "
03_1 | \n", "481457 | \n", "[031, 032, 033, 034] | \n", "
03_2 | \n", "457569 | \n", "[035, 036, 037, 038] | \n", "
03_3 | \n", "48241 | \n", "[039] | \n", "
04_0 | \n", "449276 | \n", "[040, 041] | \n", "
04_1 | \n", "484241 | \n", "[042, 043, 044, 045] | \n", "
04_2 | \n", "346497 | \n", "[046, 047, 048, 049] | \n", "
05_0 | \n", "480908 | \n", "[050, 051, 052, 053, 054, 055, 056] | \n", "
05_1 | \n", "144833 | \n", "[057, 058, 059] | \n", "
... | \n", "... | \n", "... | \n", "
972_0 | \n", "392534 | \n", "[9720, 9721] | \n", "
972_1 | \n", "449177 | \n", "[9722, 9723, 9724, 9725, 9726, 9727, 9728, 9729] | \n", "
973_0 | \n", "462185 | \n", "[9730, 9731, 9732, 9733, 9734, 9735] | \n", "
973_1 | \n", "135381 | \n", "[9736, 9737, 9738, 9739] | \n", "
974_0 | \n", "394715 | \n", "[9740, 9741, 9742, 9743, 9744, 9745, 9746] | \n", "
974_1 | \n", "161028 | \n", "[9747, 9748, 9749] | \n", "
97_0 | \n", "319301 | \n", "[971] | \n", "
97_1 | \n", "355650 | \n", "[975, 976] | \n", "
97_2 | \n", "389914 | \n", "[977, 978, 979] | \n", "
980_0 | \n", "341896 | \n", "[9800, 9801] | \n", "
980_1 | \n", "226002 | \n", "[9802] | \n", "
980_2 | \n", "382521 | \n", "[9803, 9804] | \n", "
980_3 | \n", "398103 | \n", "[9805, 9806, 9807, 9808, 9809] | \n", "
981_0 | \n", "428717 | \n", "[9810, 9811] | \n", "
981_1 | \n", "432100 | \n", "[9812, 9813, 9814, 9815, 9816, 9817, 9818, 9819] | \n", "
982_0 | \n", "447430 | \n", "[9820, 9821, 9822, 9823, 9824] | \n", "
982_1 | \n", "430055 | \n", "[9825, 9826, 9827, 9828, 9829] | \n", "
983_0 | \n", "388948 | \n", "[9830, 9831, 9832, 9833, 9834, 9835, 9836] | \n", "
983_1 | \n", "324660 | \n", "[9837, 9838, 9839] | \n", "
986_0 | \n", "419035 | \n", "[9860, 9861, 9862, 9863, 9864, 9865, 9866, 9867] | \n", "
986_1 | \n", "154295 | \n", "[9868, 9869] | \n", "
98_0 | \n", "400115 | \n", "[984] | \n", "
98_1 | \n", "475554 | \n", "[985, 987] | \n", "
98_2 | \n", "219761 | \n", "[988] | \n", "
98_3 | \n", "284655 | \n", "[989] | \n", "
99_0 | \n", "257246 | \n", "[990, 991] | \n", "
99_1 | \n", "342856 | \n", "[992] | \n", "
99_2 | \n", "370680 | \n", "[993, 994] | \n", "
99_3 | \n", "320660 | \n", "[995] | \n", "
99_4 | \n", "389270 | \n", "[996, 997, 998, 999] | \n", "
875 rows × 2 columns
\n", "