def check_nulls(df, dataset_name, select_columns=None):
    """Log, column by column, how many null values a DataFrame contains.

    :param df: DataFrame (or GeoDataFrame) to inspect.
    :param dataset_name: Label for the dataset, used only in log messages.
    :param select_columns: Optional column name or list of column names to
        restrict the check to. ``None`` or an empty selection checks every
        column. Requested columns that are absent from ``df`` are logged as
        errors and skipped instead of raising ``KeyError``.
    """
    if isinstance(select_columns, str):
        select_columns = [select_columns]

    missing_columns = []
    # ``len`` rather than truthiness so array-like selections do not raise.
    if select_columns is not None and len(select_columns) > 0:
        missing_columns = [column for column in select_columns if column not in df.columns]
        df = df[[column for column in select_columns if column in df.columns]]

    nulls = df.isnull().sum()

    # Log the results
    logging.info(f"Checking for missing values in the {dataset_name} data \n")

    for column in missing_columns:
        logging.error(f"Column '{column}' does not exist in the DataFrame.")

    for column, null_count in nulls.items():
        if null_count > 0:
            logging.warning(f"Column '{column}' has {null_count} null values")
        else:
            logging.info(f"Column '{column}' has no null values")

    # Inform the user that the missing values check is complete
    logging.info("Missing values check completed for dataset: " + dataset_name + "\n")


def check_data_types(df, expected_dtypes, dataset_name):
    """Log whether each listed column has the expected dtype.

    :param df: DataFrame (or GeoDataFrame) to inspect.
    :param expected_dtypes: Mapping of column name to the expected dtype,
        compared against ``str(df[column].dtype)`` (e.g. ``"int64"``).
    :param dataset_name: Label for the dataset, used only in log messages.
    """
    # Log the results
    logging.info(f"Checking data types for the {dataset_name} data \n")

    for column, expected_dtype in expected_dtypes.items():
        # Report a missing column the same way the other checks do instead of
        # aborting the whole check with a KeyError.
        if column not in df.columns:
            logging.error(f"Column '{column}' does not exist in the DataFrame.")
            continue

        actual_dtype = df[column].dtype
        if str(actual_dtype) == str(expected_dtype):
            logging.info(f"Column '{column}' matches the expected data type: {expected_dtype}")
        else:
            logging.warning(
                f"Column '{column}' has data type '{actual_dtype}' but expected '{expected_dtype}'"
            )

    # Inform the user that the data type check is complete
    logging.info("Data type check completed for dataset: " + dataset_name + "\n")


def check_value_ranges(df, column_ranges, dataset_name):
    """Log whether every value of each listed column lies in its allowed range.

    :param df: DataFrame to inspect.
    :param column_ranges: Mapping of column name to a ``(min_value, max_value)``
        pair; both bounds are inclusive (``Series.between`` defaults).
    :param dataset_name: Label for the dataset, used only in log messages.

    Null values fail ``Series.between`` and are therefore reported as being
    outside the range.
    """
    # Log the results
    logging.info(f"Checking value ranges for the {dataset_name} data \n")

    for column, (min_value, max_value) in column_ranges.items():
        if column not in df.columns:
            logging.error(f"Column '{column}' does not exist in the DataFrame.")
            continue

        if df[column].between(min_value, max_value).all():
            logging.info(
                f"All values in column '{column}' are within the range {min_value}-{max_value}."
            )
        else:
            logging.warning(
                f"Some values in column '{column}' are outside the range {min_value}-{max_value}."
            )

    # Inform the user that the value range check is complete
    logging.info("Value range check completed for dataset: " + dataset_name + "\n")


def check_value_matching(df, column_values, dataset_name):
    """Log whether each listed column only contains acceptable values.

    :param df: DataFrame to inspect.
    :param column_values: Mapping of column name to the list of acceptable
        values for that column.
    :param dataset_name: Label for the dataset, used only in log messages.
    """
    # Log the results
    logging.info(f"Checking value matching for the {dataset_name} data \n")

    for column, acceptable_values in column_values.items():
        if column not in df.columns:
            logging.error(f"Column '{column}' does not exist in the DataFrame.")
            continue

        # Check which values match the acceptable list
        matches = df[column].isin(acceptable_values)
        if matches.all():
            logging.info(
                f"All values in column '{column}' match the acceptable list: {acceptable_values}."
            )
        elif matches.any():
            logging.warning(
                f"Some values in column '{column}' do not match the acceptable list: {acceptable_values}."
            )
        else:
            logging.error(
                f"None of the values in column '{column}' match the acceptable list: {acceptable_values}."
            )

    # Inform the user that the value matching check is complete
    logging.info("Value matching check completed for dataset: " + dataset_name + "\n")


def check_one_to_one_join(df_left, df_right, key, merged_df, dataset_name, how="inner"):
    """Log whether a join between two DataFrames looks one-to-one.

    :param df_left: Left input of the join.
    :param df_right: Right input of the join.
    :param key: Join column name (or list of names) present in both inputs.
    :param merged_df: Result of the join that was already performed.
    :param dataset_name: Label for the joined dataset, used only in log messages.
    :param how: Join type that produced ``merged_df``; the row-count
        comparison is only made for ``"inner"`` joins.
    """
    # log the results
    logging.info(
        f"Checking validity of one-to-one join between the left and right DataFrames resulting in {dataset_name} \n"
    )

    # Check for duplicates in the key columns of both DataFrames to ensure a one-to-one relationship
    if df_left[key].duplicated().any() or df_right[key].duplicated().any():
        logging.error(
            "One-to-one join condition violated: Duplicate keys found in one or both DataFrames."
        )
    else:
        logging.info("One-to-one join check passed: No duplicate keys found in the join columns.")

    # Additionally, check if the merged DataFrame has the same length as the original DataFrames (if 'inner' join)
    if how == "inner" and (len(merged_df) != len(df_left) or len(merged_df) != len(df_right)):
        logging.warning(
            f"The merged DataFrame's length ({len(merged_df)}) does not match the original DataFrames' lengths (left: {len(df_left)}, right: {len(df_right)}). Some rows might not have matched."
        )

    # Inform the user that the one-to-one join check is complete
    logging.info("One-to-one join check completed for dataset: " + dataset_name + "\n")


def setup_logging(log_file, level=logging.INFO):
    """
    Sets up root-logger output to ``<log_file>.log``, starting from an empty file.

    The function can be called repeatedly (e.g. when a notebook cell is
    re-run): a file handler previously installed for the same path is closed
    and replaced, and the file is reopened in write mode so earlier content
    is discarded.

    :param log_file: The path to the log file, without the ``.log`` suffix.
    :param level: The logging level (e.g., logging.INFO, logging.DEBUG).
    :return: The root logger.
    """
    logger = logging.getLogger()

    # ``FileHandler.baseFilename`` is stored as an absolute path, so compare
    # against the absolute form of the file that is actually written.
    log_path = os.path.abspath(f"{log_file}.log")

    # Drop any handler from an earlier call; otherwise it would keep the old
    # file open and every record would be written twice.
    for handler in list(logger.handlers):
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path:
            logger.removeHandler(handler)
            handler.close()

    # mode="w" truncates an existing log, so no separate delete step is needed.
    file_handler = logging.FileHandler(log_path, mode="w", encoding="utf-8")
    file_handler.setFormatter(logging.Formatter("%(levelname)s:%(message)s"))
    logger.addHandler(file_handler)
    logger.setLevel(level)

    return logger
acs data\n", + "check_nulls(acs_df, \"American Community Survey (ACS) raw data\")" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# check data types for the acs data\n", + "acs_df_dtypes = {\n", + " \"B03002_001E\": \"int64\",\n", + " \"B03002_003E\": \"int64\",\n", + " \"B01001_001E\": \"int64\",\n", + " \"B01001_023E\": \"int64\",\n", + " \"B01001_024E\": \"int64\",\n", + " \"B01001_025E\": \"int64\",\n", + " \"B01001_047E\": \"int64\",\n", + " \"B01001_048E\": \"int64\",\n", + " \"B01001_049E\": \"int64\",\n", + " \"C17002_001E\": \"int64\",\n", + " \"C17002_008E\": \"int64\",\n", + " \"C18108_001E\": \"int64\",\n", + " \"C18108_005E\": \"int64\",\n", + " \"C18108_009E\": \"int64\",\n", + " \"C18108_013E\": \"int64\",\n", + " \"B08201_001E\": \"int64\",\n", + " \"B08201_002E\": \"int64\",\n", + " \"B11004_001E\": \"int64\",\n", + " \"B11004_010E\": \"int64\",\n", + " \"B11004_016E\": \"int64\",\n", + " \"B16005_001E\": \"int64\",\n", + " \"B16005_007E\": \"int64\",\n", + " \"B16005_008E\": \"int64\",\n", + " \"B16005_012E\": \"int64\",\n", + " \"B16005_013E\": \"int64\",\n", + " \"B16005_017E\": \"int64\",\n", + " \"B16005_018E\": \"int64\",\n", + " \"B16005_022E\": \"int64\",\n", + " \"B16005_023E\": \"int64\",\n", + " \"B16005_029E\": \"int64\",\n", + " \"B16005_030E\": \"int64\",\n", + " \"B16005_034E\": \"int64\",\n", + " \"B16005_035E\": \"int64\",\n", + " \"B16005_039E\": \"int64\",\n", + " \"B16005_040E\": \"int64\",\n", + " \"B16005_044E\": \"int64\",\n", + " \"B16005_045E\": \"int64\",\n", + " \"B25070_010E\": \"int64\",\n", + " \"fipco\": \"object\",\n", + " \"tract_geoid\": \"object\",\n", + "}\n", + "\n", + "check_data_types(acs_df, acs_df_dtypes, \"American Community Survey (ACS) raw data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# pull american community survey geographic data\n", - "acs_gdf = 
pull_census_tracts_geodata(year=2022, cartographic=True)" + "acs_gdf = pull_census_tracts_geodata(year=2022, cartographic=True)\n", + "\n", + "# check for missing values in the acs data\n", + "check_nulls(acs_gdf, \"American Community Survey (ACS) geographic data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# check data types for the acs geographic data\n", + "acs_gdf_dtypes = {\n", + " \"tract_geoid\": \"object\",\n", + " \"geometry\": \"geometry\",\n", + "}\n", + "\n", + "check_data_types(acs_gdf, acs_gdf_dtypes, \"American Community Survey (ACS) geographic data\")" ] }, { @@ -417,7 +664,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -445,7 +692,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -494,6 +741,25 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# check calculated columns for missing values\n", + "calculated_columns = [\n", + " \"pop_poc\",\n", + " \"pop_over75\",\n", + " \"pop_spfam\",\n", + " \"pop_lep\",\n", + " \"pop_below2\",\n", + " \"pop_disabi\",\n", + "]\n", + "\n", + "check_nulls(acs_df, \"American Community Survey (ACS) data calculated columns\", calculated_columns)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -503,7 +769,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -527,6 +793,26 @@ "acs_df[\"pct_hus_re\"] = np.where(acs_df[\"tot_hh\"] == 0, 0, (acs_df[\"pop_hus_re\"] / acs_df[\"tot_hh\"]))" ] }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "min_max_ranges = {\n", + " \"pct_poc\": (0, 1),\n", + " \"pct_over75\": (0, 1),\n", + " \"pct_spfam\": (0, 1),\n", + " \"pct_lep\": (0, 1),\n", + " 
\"pct_below2\": (0, 1),\n", + " \"pct_disab\": (0, 1),\n", + " \"pct_zvhhs\": (0, 1),\n", + " \"pct_hus_re\": (0, 1),\n", + "}\n", + "\n", + "check_value_ranges(acs_df, min_max_ranges, \"American Community Survey (ACS) data calculated columns\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -543,7 +829,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -563,7 +849,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -589,7 +875,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -609,7 +895,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -629,7 +915,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -655,7 +941,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -675,7 +961,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -695,7 +981,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -721,7 +1007,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -741,7 +1027,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -759,7 +1045,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -768,27 +1054,39 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 41, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"epc_class\n", - "High 174\n", - "Higher 131\n", - "Highest 48\n", - "NA 0\n", - "Name: epc_2050p, dtype: int64" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "acs_df.groupby(\"epc_class\")[\"epc_2050p\"].agg(\"sum\")" + "# check the epc class column column have the expected values\n", + "\n", + "acceptable_values = [\"Highest\", \"Higher\", \"High\", \"NA\"]\n", + "column_values = {\"epc_class\": acceptable_values}\n", + "\n", + "check_value_matching(acs_df, column_values, \"American Community Survey (ACS) data calculated columns\")" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "class_summary = acs_df.groupby(\"epc_class\")[\"epc_2050p\"].agg(\"sum\").reset_index().rename(\n", + " columns={\"epc_2050p\": \"count\"}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "logger.info(\"Summary of EPC class counts \\n\")\n", + "logger.info(class_summary.to_string())\n", + "\n", + "logger.info(\"End of summary of EPC class counts \\n\")" ] }, { @@ -800,7 +1098,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -824,7 +1122,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -833,7 +1131,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -852,7 +1150,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -869,7 +1167,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -888,149 +1186,18 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 49, "metadata": {}, - "outputs": 
[ - { - "data": { - "text/html": [ - "
\n", - " | factors | \n", - "mean | \n", - "std | \n", - "plus_half_sd | \n", - "plus_one_sd | \n", - "plus_one_half_sd | \n", - "
---|---|---|---|---|---|---|
0 | \n", - "Seniors 75 Years and Over | \n", - "0.07 | \n", - "0.06 | \n", - "0.10 | \n", - "0.13 | \n", - "0.16 | \n", - "
1 | \n", - "People of Color | \n", - "0.61 | \n", - "0.23 | \n", - "0.72 | \n", - "0.84 | \n", - "0.96 | \n", - "
2 | \n", - "Limited English Proficiency | \n", - "0.07 | \n", - "0.08 | \n", - "0.11 | \n", - "0.15 | \n", - "0.19 | \n", - "
3 | \n", - "Single Parent Families | \n", - "0.12 | \n", - "0.09 | \n", - "0.16 | \n", - "0.21 | \n", - "0.26 | \n", - "
4 | \n", - "Low-Income (<200% Federal Poverty Level-FPL) | \n", - "0.18 | \n", - "0.13 | \n", - "0.24 | \n", - "0.31 | \n", - "0.38 | \n", - "
5 | \n", - "People with Disability | \n", - "0.10 | \n", - "0.05 | \n", - "0.12 | \n", - "0.15 | \n", - "0.18 | \n", - "
6 | \n", - "Zero-Vehicle Household | \n", - "0.10 | \n", - "0.13 | \n", - "0.16 | \n", - "0.23 | \n", - "0.30 | \n", - "
7 | \n", - "Rent-Burdened | \n", - "0.10 | \n", - "0.08 | \n", - "0.14 | \n", - "0.18 | \n", - "0.22 | \n", - "