From b8fe3d01ea4b5dfa482d1c19b61ad725b6bfdae7 Mon Sep 17 00:00:00 2001 From: Joshua Croff Date: Fri, 6 Sep 2024 23:26:15 -0700 Subject: [PATCH] Update pull_county_decennial_census.ipynb --- .../pull_county_decennial_census.ipynb | 691 +++--------------- 1 file changed, 109 insertions(+), 582 deletions(-) diff --git a/Project-Documentation/Adhoc-Spatial-Analysis/pull_county_decennial_census.ipynb b/Project-Documentation/Adhoc-Spatial-Analysis/pull_county_decennial_census.ipynb index b9282ca..a01c557 100644 --- a/Project-Documentation/Adhoc-Spatial-Analysis/pull_county_decennial_census.ipynb +++ b/Project-Documentation/Adhoc-Spatial-Analysis/pull_county_decennial_census.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 59, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -79,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -89,7 +89,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -111,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -171,612 +171,139 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 57, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
GEO_IDNAMERace Total PopulationHispanic or LatinoNot Hispanic or LatinoPopulation of One RaceWhiteBlack or African AmericanAmerican Indian and Alaska NativeAsianNative Hawaiian and Other Pacific IslanderSome Other RaceTwo or More Races
00500000US06001Alameda County, California1682353393749128860412000674722771594994131540511132091044088537
10500000US06013Contra Costa County, California11659273149008510277845744554219799425532145205720836666453
20500000US06041Marin County, California26232149410212911198496173149612055516175457204014415
30500000US06055Napa County, California138019488298919083462689092300507105203169105728
40500000US06075San Francisco County, California8739651367617372046917583413064507115702942203244634745446
50500000US06081San Mateo County, California7644421913865730565340872759021470110212277838840584038969
60500000US06085Santa Clara County, California19362594873571448902137063555570842148324075339959451019578267
70500000US06095Solano County, California453491128155325336294516155125600511624709533775298830820
80500000US06097Sonoma County, California48886314143834742532282628579271253053222391708290924599
\n", - "
" - ], - "text/plain": [ - " GEO_ID NAME Race Total Population \\\n", - "0 0500000US06001 Alameda County, California 1682353 \n", - "1 0500000US06013 Contra Costa County, California 1165927 \n", - "2 0500000US06041 Marin County, California 262321 \n", - "3 0500000US06055 Napa County, California 138019 \n", - "4 0500000US06075 San Francisco County, California 873965 \n", - "5 0500000US06081 San Mateo County, California 764442 \n", - "6 0500000US06085 Santa Clara County, California 1936259 \n", - "7 0500000US06095 Solano County, California 453491 \n", - "8 0500000US06097 Sonoma County, California 488863 \n", - "\n", - " Hispanic or Latino Not Hispanic or Latino Population of One Race White \\\n", - "0 393749 1288604 1200067 472277 \n", - "1 314900 851027 784574 455421 \n", - "2 49410 212911 198496 173149 \n", - "3 48829 89190 83462 68909 \n", - "4 136761 737204 691758 341306 \n", - "5 191386 573056 534087 275902 \n", - "6 487357 1448902 1370635 555708 \n", - "7 128155 325336 294516 155125 \n", - "8 141438 347425 322826 285792 \n", - "\n", - " Black or African American American Indian and Alaska Native Asian \\\n", - "0 159499 4131 540511 \n", - "1 97994 2553 214520 \n", - "2 6120 555 16175 \n", - "3 2300 507 10520 \n", - "4 45071 1570 294220 \n", - "5 14701 1021 227783 \n", - "6 42148 3240 753399 \n", - "7 60051 1624 70953 \n", - "8 7125 3053 22239 \n", - "\n", - " Native Hawaiian and Other Pacific Islander Some Other Race Two or More Races \n", - "0 13209 10440 88537 \n", - "1 5720 8366 66453 \n", - "2 457 2040 14415 \n", - "3 316 910 5728 \n", - "4 3244 6347 45446 \n", - "5 8840 5840 38969 \n", - "6 5945 10195 78267 \n", - "7 3775 2988 30820 \n", - "8 1708 2909 24599 " - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "race_out = list(race_rename_dict.values())\n", "# add the GEO_ID and NAME columns\n", "race_out.insert(0, \"GEO_ID\") \n", "race_out.insert(1, \"NAME\")\n", "\n", - "race_df[race_out]" + "# race_df[race_out]" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "# calculate shares of the race total population\n", + "share_cols = race_out.copy()\n", + "share_cols.remove(\"GEO_ID\")\n", + "share_cols.remove(\"NAME\")\n", + "share_cols.remove(\"Race Total Population\")\n", + "\n", + "for group in share_cols:\n", + " total_col = \"Race Total Population\"\n", + "\n", + " race_df[f\"Share {group}\"] = round(race_df[f\"{group}\"].astype(int) / race_df[total_col].astype(int), 4)" ] }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 59, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
GEO_IDAge Total PopulationTotal MaleMale Under 5Male 5 to 9Male 10 to 14Male 15 to 17Male 18 to 19Male 20Male 21...Female 50 to 54Female 55 to 59Female 60 to 61Female 62 to 64Female 65 to 66Female 67 to 69Female 70 to 74Female 75 to 79Female 80 to 84Female 85 and over
00500000US06001168235382442645635492005139729725225131140711379...53682539062029629019179952417734736230721532818428
10500000US060131165927568066315383671740960252761504473476671...40687414101599122546139001898528511192641261214456
20500000US060412623211285935933721885775422293012801201...10380993639345536390755798742610338824271
30500000US060551380196777731293757438428111725793820...4718497520002856186427013867280619162451
40500000US060758739654461441772615530153099029636237393628...25034240549557138109193129011947612526991912947
50500000US0608176444237754620593219312298813734837641183846...264282638110014141678963118741775212428889710763
60500000US06085193625997531352488571036174137671248021209311031...64880612272292331337190172490335506265471920922196
70500000US060954534912247841279413696150208866599529563116...1447616079658393845998779411057693745144911
80500000US060974888632385351154213354148599099606928922817...156331790474311110774491051415649991859427058
\n", - "

9 rows × 50 columns

\n", - "
" - ], - "text/plain": [ - " GEO_ID Age Total Population Total Male Male Under 5 Male 5 to 9 \\\n", - "0 0500000US06001 1682353 824426 45635 49200 \n", - "1 0500000US06013 1165927 568066 31538 36717 \n", - "2 0500000US06041 262321 128593 5933 7218 \n", - "3 0500000US06055 138019 67777 3129 3757 \n", - "4 0500000US06075 873965 446144 17726 15530 \n", - "5 0500000US06081 764442 377546 20593 21931 \n", - "6 0500000US06085 1936259 975313 52488 57103 \n", - "7 0500000US06095 453491 224784 12794 13696 \n", - "8 0500000US06097 488863 238535 11542 13354 \n", - "\n", - " Male 10 to 14 Male 15 to 17 Male 18 to 19 Male 20 Male 21 ... \\\n", - "0 51397 29725 22513 11407 11379 ... \n", - "1 40960 25276 15044 7347 6671 ... \n", - "2 8577 5422 2930 1280 1201 ... \n", - "3 4384 2811 1725 793 820 ... \n", - "4 15309 9029 6362 3739 3628 ... \n", - "5 22988 13734 8376 4118 3846 ... \n", - "6 61741 37671 24802 12093 11031 ... \n", - "7 15020 8866 5995 2956 3116 ... \n", - "8 14859 9099 6069 2892 2817 ... \n", - "\n", - " Female 50 to 54 Female 55 to 59 Female 60 to 61 Female 62 to 64 \\\n", - "0 53682 53906 20296 29019 \n", - "1 40687 41410 15991 22546 \n", - "2 10380 9936 3934 5536 \n", - "3 4718 4975 2000 2856 \n", - "4 25034 24054 9557 13810 \n", - "5 26428 26381 10014 14167 \n", - "6 64880 61227 22923 31337 \n", - "7 14476 16079 6583 9384 \n", - "8 15633 17904 7431 11107 \n", - "\n", - " Female 65 to 66 Female 67 to 69 Female 70 to 74 Female 75 to 79 \\\n", - "0 17995 24177 34736 23072 \n", - "1 13900 18985 28511 19264 \n", - "2 3907 5579 8742 6103 \n", - "3 1864 2701 3867 2806 \n", - "4 9193 12901 19476 12526 \n", - "5 8963 11874 17752 12428 \n", - "6 19017 24903 35506 26547 \n", - "7 5998 7794 11057 6937 \n", - "8 7449 10514 15649 9918 \n", - "\n", - " Female 80 to 84 Female 85 and over \n", - "0 15328 18428 \n", - "1 12612 14456 \n", - "2 3882 4271 \n", - "3 1916 2451 \n", - "4 9919 12947 \n", - "5 8897 10763 \n", - "6 19209 22196 \n", - "7 4514 4911 \n", - "8 5942 7058 \n", - "\n", - "[9 rows x 50 columns]" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], + "source": [ + "# remove column names that are not needed starting with \"P2\"\n", + "final_race_out = race_df.columns.to_list()\n", + "\n", + "# Remove columns that start with \"P2\"\n", + "final_race_out = [col for col in final_race_out if not col.startswith(\"P2\")]\n", + "final_race_out.remove(\"ucgid\")" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], "source": [ "age_out = list(age_rename_dict.values())\n", "# add the GEO_ID and NAME columns\n", "age_out.insert(0, \"GEO_ID\")\n", "# age_out.insert(1, \"NAME\")\n", "\n", - "age_df[age_out]" + "# age_df[age_out]" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the age groups\n", + "age_groups = [\n", + " \"Under 5\", \"5 to 9\", \"10 to 14\", \"15 to 17\", \"18 to 19\", \"20\", \"21\", \n", + " \"22 to 24\", \"25 to 29\", \"30 to 34\", \"35 to 39\", \"40 to 44\", \"45 to 49\", \n", + " \"50 to 54\", \"55 to 59\", \"60 to 61\", \"62 to 64\", \"65 to 66\", \"67 to 69\", \n", + " \"70 to 74\", \"75 to 79\", \"80 to 84\", \"85 and over\"\n", + "]\n", + "\n", + "# Calculate totals for each age group\n", + "total_cols = []\n", + "for group in age_groups:\n", + " male_col = f\"Male {group}\"\n", + " female_col = f\"Female {group}\"\n", + " total_col = f\"Total {group}\"\n", + " total_cols.append(total_col)\n", + " \n", + " # Ensure the columns exist in the DataFrame\n", + " if male_col in age_df.columns and female_col in age_df.columns:\n", + " age_df[total_col] = age_df[male_col].astype(int) + age_df[female_col].astype(int)\n", + "\n", + "for col in total_cols:\n", + " age_out.append(col)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "# calculate shares of the age total population for each sex and age group\n", + "share_cols = age_out.copy()\n", + "share_cols.remove(\"GEO_ID\")\n", + "share_cols.remove(\"Age Total Population\")\n", + "\n", + "for group in share_cols:\n", + " total_col = \"Age Total Population\"\n", + "\n", + " age_df[f\"Share {group}\"] = round(age_df[f\"{group}\"].astype(int) / age_df[total_col].astype(int), 4)" ] }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "# remove column names that are not needed starting with \"P12\"\n", + "final_age_out = age_df.columns.to_list()\n", + "\n", + "final_age_out = [col for col in final_age_out if not col.startswith(\"P12\")]\n", + "final_age_out.remove(\"NAME\")\n", + "final_age_out.remove(\"ucgid\")" + ] + }, + { + "cell_type": "code", + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "# join the two dataframes\n", "\n", - "final_df = pd.merge(race_df[race_out], age_df[age_out], on=\"GEO_ID\")" + "final_df = pd.merge(race_df[final_race_out], age_df[final_age_out], on=\"GEO_ID\")" ] }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [