Skip to content

Commit

Permalink
Update fastrak_geocoding.ipynb
Browse files Browse the repository at this point in the history
  • Loading branch information
joshuacroff committed Oct 9, 2024
1 parent 7874f21 commit ad34051
Showing 1 changed file with 47 additions and 56 deletions.
103 changes: 47 additions & 56 deletions Project-Documentation/Adhoc-Spatial-Analysis/fastrak_geocoding.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,9 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 79,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Info: Found credentials at: /Users/jcroff/Library/CloudStorage/Box-Box/dvutils-creds-jcroff.json\n"
]
}
],
"outputs": [],
"source": [
"import getpass\n",
"import logging\n",
Expand All @@ -27,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -52,20 +44,20 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"work_dir = pathlib.Path(\n",
" f\"/Users/{user}/Library/CloudStorage/Box-Box/DataViz Projects/Data Services/FasTrak Data\"\n",
")\n",
"ft_data = work_dir / \"Fastrak Accounts Cleaned\" / \"Final Geocode Results\" / \"fastrak_accounts_10_2023_dedup.csv\"\n",
"ft_data = work_dir / \"Fastrak Accounts Cleaned\" / \"Final Geocode Results\" / \"unmatched_ggbd_address_list.csv\"\n",
"gc_data = work_dir / \"Fastrak Accounts Cleaned\" / \"bay_area_fastrak_accounts_geocoded.csv\"\n",
"final_gc_data_csv = work_dir / \"Fastrak Accounts Cleaned\" / \"Final Geocode Results\" / \"bay_area_fastrak_accounts_geocoded_final_9_20_24.csv\"\n",
"final_gc_data_geojson = work_dir / \"Fastrak Accounts Cleaned\" / \"Final Geocode Results\" / \"bay_area_fastrak_accounts_geocoded_final_9_20_24.geojson\"\n",
"summary_data_xlsx = work_dir / \"Fastrak Accounts Cleaned\" / \"Final Geocode Results\" / \"bay_area_fastrak_accounts_geocoded_summary_9_20_24.xlsx\"\n",
"final_aggregated_data_csv = work_dir / \"Fastrak Accounts Cleaned\" / \"Final Geocode Results\" / \"bay_area_fastrak_accounts_geocoded_final_aggregated_9_20_24.csv\"\n",
"final_aggregated_data_geojson = work_dir / \"Fastrak Accounts Cleaned\" / \"Final Geocode Results\" / \"bay_area_fastrak_accounts_geocoded_final_aggregated_9_20_24.geojson\"\n",
"final_gc_data_csv = work_dir / \"Fastrak Accounts Cleaned\" / \"Final Geocode Results\" / \"bay_area_fastrak_accounts_geocoded_final_10_8_24.csv\"\n",
"final_gc_data_geojson = work_dir / \"Fastrak Accounts Cleaned\" / \"Final Geocode Results\" / \"bay_area_fastrak_accounts_geocoded_final_10_8_24.geojson\"\n",
"summary_data_xlsx = work_dir / \"Fastrak Accounts Cleaned\" / \"Final Geocode Results\" / \"bay_area_fastrak_accounts_geocoded_summary_10_8_24.xlsx\"\n",
"final_aggregated_data_csv = work_dir / \"Fastrak Accounts Cleaned\" / \"Final Geocode Results\" / \"bay_area_fastrak_accounts_geocoded_final_aggregated_10_8_24.csv\"\n",
"final_aggregated_data_geojson = work_dir / \"Fastrak Accounts Cleaned\" / \"Final Geocode Results\" / \"bay_area_fastrak_accounts_geocoded_final_aggregated_10_8_24.geojson\"\n",
"epc_data = (\n",
" \"https://services3.arcgis.com/i2dkYWmb4wHvYPda/arcgis/rest/services/\"\n",
" \"draft_equity_priority_communities_pba2050plus_acs2022a/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson\"\n",
Expand All @@ -78,7 +70,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -87,7 +79,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -106,7 +98,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -120,7 +112,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -134,7 +126,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -154,7 +146,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -176,7 +168,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -193,15 +185,15 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"def create_required_cols(df):\n",
" \"\"\"Create the required columns for geocoding.\"\"\"\n",
" log_or_print(\"Creating required columns for geocoding\", LOGGER)\n",
"\n",
" required_columns = [\"ADDR\", \"CITY\", \"STATE\", \"ZIP_CODE\"]\n",
" required_columns = [\"addr\", \"city\", \"state\", \"zip\"]\n",
" \n",
" # Check if all required columns exist in the DataFrame\n",
" missing_columns = [col for col in required_columns if col not in df.columns]\n",
Expand All @@ -211,16 +203,16 @@
" raise ValueError(error_message)\n",
"\n",
" df = df.copy()\n",
" df['ZIP_CODE'] = df[\"ZIP_CODE\"].fillna(0).astype(int).astype(str)\n",
" df[\"FULL_ADDRESS\"] = df[\"ADDR\"] + \", \" + df[\"CITY\"] + \", \" + df[\"STATE\"] + \" \" + df[\"ZIP_CODE\"]\n",
" df['zip'] = df[\"zip\"].fillna(0).astype(int).astype(str)\n",
" df[\"full_address\"] = df[\"addr\"] + \", \" + df[\"city\"] + \", \" + df[\"state\"] + \" \" + df[\"zip\"]\n",
"\n",
" log_or_print(\"Created FULL_ADDRESS column\", LOGGER)\n",
" log_or_print(\"Created full_address column\", LOGGER)\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -258,9 +250,9 @@
" log_or_print(f\"Read {len(results_df)} records from geocoded data\", LOGGER)\n",
" return results_df\n",
"\n",
" # check if In_Region_Account and match columns exist in the DataFrame. If In_Region_Account = True, and match is null, then geocode\n",
" if \"match\" in df.columns and \"In_Region_Account\" in df.columns:\n",
" df = df[(df[\"In_Region_Account\"] == True) & (df[\"match\"].isnull())]\n",
" # check if in_region and match columns exist in the DataFrame. If In_Region_Account = True, and match is null, then geocode\n",
" if \"match\" in df.columns and \"in_region\" in df.columns:\n",
" df = df[(df[\"in_region\"] == True) & (df[\"match\"].isnull() | df[\"match\"] == False)]\n",
" log_or_print(\n",
" f\"Starting geocode on {len(df)} records that do not have a geocode match and are in region\",\n",
" LOGGER,\n",
Expand All @@ -277,7 +269,7 @@
"\n",
" try:\n",
" results_df = google_geocode_batch(\n",
" address_list=df[\"FULL_ADDRESS\"].tolist(),\n",
" address_list=df[\"full_address\"].tolist(),\n",
" include_details=True,\n",
" allowed_location_types=[\"ROOFTOP\", \"RANGE_INTERPOLATED\"],\n",
" )\n",
Expand Down Expand Up @@ -320,7 +312,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -342,7 +334,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -352,13 +344,12 @@
"\n",
" # check fastrak columns\n",
" req_cols = [\n",
" \"In_Region_Account\",\n",
" \"ACCTNO\",\n",
" \"ADDR\",\n",
" \"CITY\",\n",
" \"STATE\",\n",
" \"ZIP_CODE\",\n",
" \"FULL_ADDRESS\",\n",
" \"acctno\",\n",
" \"addr\",\n",
" \"city\",\n",
" \"state\",\n",
" \"zip\",\n",
" \"full_address\",\n",
" ]\n",
"\n",
" # drop all columns that are not in the required columns\n",
Expand All @@ -370,7 +361,7 @@
" LOGGER,\n",
" )\n",
" # rename ft data address column\n",
" ft_data = ft_data.rename(columns={\"FULL_ADDRESS\": \"address_orig\"})\n",
" ft_data = ft_data.rename(columns={\"full_address\": \"address_orig\"})\n",
"\n",
" # strip whitespace from address_orig\n",
" ft_data[\"address_orig\"] = ft_data[\"address_orig\"].str.strip()\n",
Expand All @@ -384,7 +375,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -428,7 +419,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -501,7 +492,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -529,7 +520,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -554,7 +545,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -568,7 +559,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -589,7 +580,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -623,7 +614,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -642,7 +633,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -655,9 +646,9 @@
"\n",
" # drop duplicated addresses\n",
" log_or_print(\n",
" f\"Dropping {ft_df.duplicated(subset=['FULL_ADDRESS']).sum()} duplicated addresses\", LOGGER\n",
" f\"Dropping {ft_df.duplicated(subset=['full_address']).sum()} duplicated addresses\", LOGGER\n",
" )\n",
" ft_dedup_df = ft_df.drop_duplicates(subset=[\"FULL_ADDRESS\"])\n",
" ft_dedup_df = ft_df.drop_duplicates(subset=[\"full_address\"])\n",
"\n",
" # geocode the addresses\n",
" results_gdf = batch_geocode_addresses(\n",
Expand Down

0 comments on commit ad34051

Please sign in to comment.