# %% [markdown]
# # Pull 2020 Decennial Census data for the Bay Area
#
# Downloads race / Hispanic-origin counts (PL 94-171, table P2) and sex-by-age
# counts (DHC, table P12) for the nine Bay Area counties and their incorporated
# places, adds share-of-total columns, joins the two tables and writes one CSV.
#
# The notebook is written to survive **Restart Kernel -> Run All**: every step
# is a function, and the last cell runs the whole pipeline.

# %%
import getpass
import os
import pathlib

import numpy as np
import pandas as pd

# NOTE: `requests` and `dotenv` are imported inside the functions that need
# them so the pure data-shaping helpers below can be imported and tested on a
# machine without network libraries installed.

# %% [markdown]
# ## Configuration

# %%
CENSUS_API_KEY_ENV = "CENSUS_API_KEY"
CENSUS_API_BASE = "https://api.census.gov/data/2020/dec"
CA_PLACES_URL = "https://www2.census.gov/geo/docs/reference/codes2020/place/st06_ca_place2020.txt"
REQUEST_TIMEOUT = 60  # seconds; without a timeout requests.get can block forever

# ucgid codes for the nine Bay Area counties (state 06).
BAY_AREA_COUNTY_UCGIDS = (
    "0500000US06001",
    "0500000US06013",
    "0500000US06041",
    "0500000US06055",
    "0500000US06075",
    "0500000US06081",
    "0500000US06085",
    "0500000US06095",
    "0500000US06097",
)

# County names as they appear in the COUNTIES column of the place reference file.
BAY_AREA_COUNTIES = (
    "San Francisco County",
    "Alameda County",
    "Contra Costa County",
    "Marin County",
    "Napa County",
    "San Mateo County",
    "Santa Clara County",
    "Solano County",
    "Sonoma County",
)

ID_COLS = ["GEO_ID", "NAME"]
RACE_UNIVERSE_COL = "Race Total Population"
AGE_UNIVERSE_COL = "Age Total Population"

RACE_RENAME_DICT = {
    "P2_001N": RACE_UNIVERSE_COL,
    "P2_002N": "Hispanic or Latino",
    "P2_003N": "Not Hispanic or Latino",
    "P2_004N": "Population of One Race",
    "P2_005N": "White",
    "P2_006N": "Black or African American",
    "P2_007N": "American Indian and Alaska Native",
    "P2_008N": "Asian",
    "P2_009N": "Native Hawaiian and Other Pacific Islander",
    "P2_010N": "Some Other Race",
    "P2_011N": "Two or More Races",
}

# Age brackets of table P12, in table order. The same 23 brackets repeat for
# males (P12_003N..P12_025N) and females (P12_027N..P12_049N).
AGE_BRACKETS = (
    "Under 5",
    "5 to 9",
    "10 to 14",
    "15 to 17",
    "18 to 19",
    "20",
    "21",
    "22 to 24",
    "25 to 29",
    "30 to 34",
    "35 to 39",
    "40 to 44",
    "45 to 49",
    "50 to 54",
    "55 to 59",
    "60 to 61",
    "62 to 64",
    "65 to 66",
    "67 to 69",
    "70 to 74",
    "75 to 79",
    "80 to 84",
    "85 and over",
)

# %% [markdown]
# ## Helpers


# %%
def default_out_file():
    """Return the CSV output path inside the current user's Box folder.

    NOTE(review): the path is macOS-specific (`/Users/<user>/Library/...`);
    it is kept exactly as the original notebook defined it.

    Returns:
        pathlib.Path: Full path of the output CSV.
    """
    user = getpass.getuser()
    work_dir = pathlib.Path(
        f"/Users/{user}/Library/CloudStorage/Box-Box/DataViz Projects/Bay_Area_Census_Website/census_decennial_download"
    )
    return work_dir / "census_2020_race_sex_age_jc.csv"


def build_census_urls(api_key=None):
    """Build the four Census API request URLs used by this notebook.

    The `key` parameter is appended only when an API key is available, so a
    missing environment variable no longer produces a literal `key=None` in
    the query string. The key is applied to the county requests too, for
    consistency with the place requests.

    Args:
        api_key (str | None): Census API key, or None/"" for keyless requests.
    Returns:
        dict: Keys `race_url_cty`, `race_url_pl`, `age_url_cty`, `age_url_pl`
            mapped to the request URL strings.
    """
    key_suffix = f"&key={api_key}" if api_key else ""
    county_geo = "ucgid=" + ",".join(BAY_AREA_COUNTY_UCGIDS)
    place_geo = "for=place:*&in=state:06"
    race_query = f"{CENSUS_API_BASE}/pl?get=group(P2)&"
    age_query = f"{CENSUS_API_BASE}/dhc?get=group(P12)&"
    return {
        "race_url_cty": f"{race_query}{county_geo}{key_suffix}",
        "race_url_pl": f"{race_query}{place_geo}{key_suffix}",
        "age_url_cty": f"{age_query}{county_geo}{key_suffix}",
        "age_url_pl": f"{age_query}{place_geo}{key_suffix}",
    }


def build_age_rename_dict():
    """Map P12 variable codes to readable column names.

    P12_001N is the total; P12_002N / P12_026N are the male / female totals;
    each is followed by one variable per entry of `AGE_BRACKETS`.

    Returns:
        dict: `{"P12_001N": "Age Total Population", ..., "P12_049N": "Female 85 and over"}`.
    """
    mapping = {"P12_001N": AGE_UNIVERSE_COL}
    code = 2
    for sex in ("Male", "Female"):
        mapping[f"P12_{code:03d}N"] = f"Total {sex}"
        code += 1
        for bracket in AGE_BRACKETS:
            mapping[f"P12_{code:03d}N"] = f"{sex} {bracket}"
            code += 1
    return mapping


def census_rows_to_frame(data):
    """Convert a Census API JSON payload (list of lists) to a DataFrame.

    Args:
        data (list): First element is the list of column headers, remaining
            elements are the data rows.
    Returns:
        pd.DataFrame: One row per geography. Every column except the
            identifier columns (GEOID, GEO_ID, NAME, state, place) is coerced
            to numeric; values that cannot be parsed become NaN.
    Raises:
        ValueError: If the payload is empty.
    """
    if not data:
        raise ValueError("Census API returned an empty payload")

    columns, *rows = data
    df = pd.DataFrame(rows, columns=columns)

    # identifier columns must stay strings (leading zeros in FIPS codes matter)
    str_cols = ["GEOID", "GEO_ID", "NAME", "state", "place"]
    num_cols = [col for col in df.columns if col not in str_cols]
    if num_cols:
        df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")
    return df


def fetch_census_data(url, timeout=REQUEST_TIMEOUT):
    """Request a Census API URL and return the result as a DataFrame.

    Args:
        url (str): Fully built Census API request URL.
        timeout (float): Seconds to wait for the server before giving up.
    Returns:
        pd.DataFrame | None: The parsed table, or None (after printing the
            HTTP status code) when the response status is not 200.
    """
    import requests  # lazy import, see note in the imports cell

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # the URL is deliberately not printed: it may contain the API key
        print(f"Failed to retrieve data. HTTP Status code: {response.status_code}")
        return None
    return census_rows_to_frame(response.json())


def require_frame(df, label):
    """Fail loudly when a fetch returned None instead of crashing later.

    Args:
        df (pd.DataFrame | None): Result of `fetch_census_data`.
        label (str): Human-readable name of the request, used in the error.
    Returns:
        pd.DataFrame: `df`, unchanged.
    Raises:
        RuntimeError: If `df` is None.
    """
    if df is None:
        raise RuntimeError(f"Census request for {label} data failed; see the HTTP status printed above.")
    return df


def create_share_columns(df, universe_column, share_column_dict):
    """Calculate share columns based on a single population or universe column.

    Author: Joshua Croff

    Args:
        df (pd.DataFrame): The Dataframe to calculate share columns on.
        universe_column (str): The name of the column that represents the total population or universe.
        share_column_dict (dictionary): Key value pairs dictionary. Key should be population column
            name and value should be expected share column output name.
    Returns:
        pd.DataFrame: A copy of the DataFrame with the share columns added. Shares are rounded
            to 3 decimals and set to 0 where the universe is 0.
    Raises:
        ValueError: If the universe column or any population column is missing.
    """
    df = df.copy()

    if universe_column not in df.columns:
        raise ValueError(f"Universe column '{universe_column}' not found in DataFrame")

    universe = df[universe_column]
    empty_universe = universe == 0
    for key, value in share_column_dict.items():
        if key not in df.columns:
            raise ValueError(f"Population column '{key}' not found in DataFrame")
        # np.where keeps the 0-universe rows at 0 instead of inf/NaN
        df[value] = np.where(empty_universe, 0, (df[key] / universe).round(3))
    return df


def add_share_columns(df, universe_col):
    """Add a `Share <column>` column for every population column of `df`.

    Every column other than GEO_ID, NAME and the universe column is treated
    as a population column.

    Args:
        df (pd.DataFrame): Table with identifier, universe and population columns.
        universe_col (str): Name of the total-population column.
    Returns:
        pd.DataFrame: Copy of `df` with the share columns appended.
    """
    excluded = [*ID_COLS, universe_col]
    share_dict = {col: f"Share {col}" for col in df.columns if col not in excluded}
    return create_share_columns(df=df, universe_column=universe_col, share_column_dict=share_dict)


def add_age_group_totals(df, brackets=AGE_BRACKETS):
    """Add `Total <bracket>` = `Male <bracket>` + `Female <bracket>` columns.

    Args:
        df (pd.DataFrame): Table holding the male and female bracket columns.
        brackets (iterable of str): Age bracket labels to total.
    Returns:
        pd.DataFrame: Copy of `df` with one total column per bracket appended.
    """
    totals = {f"Total {bracket}": df[f"Male {bracket}"] + df[f"Female {bracket}"] for bracket in brackets}
    return df.assign(**totals)


def select_bay_area_place_codes(place_df, counties=BAY_AREA_COUNTIES):
    """Return the PLACEFP codes of incorporated places in the given counties.

    Args:
        place_df (pd.DataFrame): Census place reference table with string
            columns COUNTIES, TYPE and PLACEFP.
        counties (iterable of str): County names to keep.
    Returns:
        list: PLACEFP codes (strings) of the matching rows.
    """
    in_region = place_df["COUNTIES"].isin(list(counties))
    incorporated = place_df["TYPE"] == "INCORPORATED PLACE"
    return place_df.loc[in_region & incorporated, "PLACEFP"].tolist()


def load_bay_area_place_codes(url=CA_PLACES_URL, counties=BAY_AREA_COUNTIES):
    """Download the California place reference file and pick Bay Area places.

    Args:
        url (str): Location of the pipe-delimited place reference file.
        counties (iterable of str): County names to keep.
    Returns:
        list: PLACEFP codes (strings) of incorporated Bay Area places.
    """
    # dtype=str keeps the leading zeros of the FIPS codes
    place_df = pd.read_csv(url, sep="|", header=0, dtype=str)
    return select_bay_area_place_codes(place_df, counties)


def pull_table(place_url, county_url, place_codes, rename_dict, label):
    """Fetch one table for places and counties and return the renamed columns.

    Args:
        place_url (str): Request URL returning every California place.
        county_url (str): Request URL returning the Bay Area counties.
        place_codes (list): Place FIPS codes to keep from the place request.
        rename_dict (dict): Census variable code -> output column name; also
            defines which variables are kept.
        label (str): Name of the table, used in error messages.
    Returns:
        pd.DataFrame: Place rows followed by county rows with columns GEO_ID,
            NAME and the renamed variables.
    Raises:
        RuntimeError: If either request fails.
    """
    place_df = require_frame(fetch_census_data(place_url), f"{label} place")
    # the place request covers all of California; keep Bay Area places only
    place_df = place_df.loc[place_df["place"].isin(place_codes)]
    county_df = require_frame(fetch_census_data(county_url), f"{label} county")

    combined = pd.concat([place_df, county_df], axis=0, ignore_index=True)
    return combined[[*ID_COLS, *rename_dict]].rename(columns=rename_dict)


def clean_final_frame(df):
    """Apply the final NAME clean-up to the joined table.

    Drops the "San Francisco County, California" row, strips everything after
    the first comma in NAME, removes " city" / " town", and renames
    "St. Helena" to "St Helena". The input frame is not modified.

    Args:
        df (pd.DataFrame): Joined table with a NAME column.
    Returns:
        pd.DataFrame: Cleaned copy of `df`.
    """
    kept = df.loc[df["NAME"] != "San Francisco County, California"]
    names = kept["NAME"].str.split(",").str[0].str.replace(r" city| town", "", regex=True)
    names = names.mask(names == "St. Helena", "St Helena")
    # assign() builds a new frame, which avoids SettingWithCopyWarning on the filtered slice
    return kept.assign(NAME=names)


# %% [markdown]
# ## Pipeline


# %%
def main():
    """Run the whole download -> transform -> export pipeline.

    Returns:
        pd.DataFrame: The table that was written to the output CSV.
    """
    from dotenv import load_dotenv  # lazy import, see note in the imports cell

    load_dotenv()
    urls = build_census_urls(os.environ.get(CENSUS_API_KEY_ENV))
    place_codes = load_bay_area_place_codes()

    # Pull and pre-process race data
    race_df = pull_table(urls["race_url_pl"], urls["race_url_cty"], place_codes, RACE_RENAME_DICT, "race")
    race_df = add_share_columns(race_df, RACE_UNIVERSE_COL)

    # Pull and pre-process age data
    age_df = pull_table(urls["age_url_pl"], urls["age_url_cty"], place_codes, build_age_rename_dict(), "age")
    age_df = add_age_group_totals(age_df)
    age_df = add_share_columns(age_df, AGE_UNIVERSE_COL)

    # Join the two dataframes on GEO_ID and NAME
    joined_df = pd.merge(race_df, age_df, on=ID_COLS)

    # Final data cleaning
    final_df = clean_final_frame(joined_df)

    final_df.to_csv(default_out_file(), index=False)
    return final_df


# %%
if __name__ == "__main__":
    final_df = main()