From 604f8633b93e38f9479515a1f689f31615b0e720 Mon Sep 17 00:00:00 2001 From: Qiusheng Wu Date: Thu, 28 Nov 2024 15:02:59 -0500 Subject: [PATCH] Add notebook for visualizing PDFM (#4) * Add notebook for visualizing PDFM * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add notebook to toc --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- README.md | 13 +- docs/PDFM/map_pdfm_features.ipynb | 615 ++++++++++++++++++++++++++++++ docs/index.md | 13 +- mkdocs.yml | 1 + 4 files changed, 640 insertions(+), 2 deletions(-) create mode 100644 docs/PDFM/map_pdfm_features.ipynb diff --git a/README.md b/README.md index a1f9f11..97e4f5e 100644 --- a/README.md +++ b/README.md @@ -6,4 +6,15 @@ A collection of Jupyter notebook examples for using GeoAI Tutorials for Using Google's [Population Dynamics Foundation Model](https://github.com/google-research/population-dynamics) (PDFM) -- [Predicting US Housing Prices with PDFM and Zillow Data](https://geoai-tutorials.gishub.org/PDFM/zillow_home_value/) +- [Predicting US Housing Prices with PDFM and Zillow Data](https://geoai-tutorials.gishub.org/PDFM/zillow_home_value) +- [Visualizing PDFM Features and Predicted Home Values](https://geoai-tutorials.gishub.org/PDFM/map_pdfm_features) + +## Demos + +- Visualize US meidan home values at the county level + + ![image](https://github.com/user-attachments/assets/9052e391-a26a-4a75-9ffe-d4abac3c0fce) + +- Visualize PDFM features + + ![image](https://github.com/user-attachments/assets/fd1b233b-2be0-47c3-b2d0-f53601020604) diff --git a/docs/PDFM/map_pdfm_features.ipynb b/docs/PDFM/map_pdfm_features.ipynb new file mode 100644 index 0000000..eef0229 --- /dev/null +++ b/docs/PDFM/map_pdfm_features.ipynb @@ -0,0 +1,615 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/map_pdfm_features.ipynb)\n", + "\n", + "**Mapping PDFM Features and Predicted Housing Prices**\n", + "\n", + "## Useful Resources\n", + "\n", + "- [Google's Population Dynamics Foundation Model (PDFM)](https://github.com/google-research/population-dynamics)\n", + "- Request access to PDFM embeddings [here](https://github.com/google-research/population-dynamics?tab=readme-ov-file#getting-access-to-the-embeddings)\n", + "- Zillow data can be accessed [here](https://www.zillow.com/research/data/)\n", + "\n", + "## Installation\n", + "\n", + "Uncomment and run the following cell to install the required libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# %pip install \"leafmap[maplibre]\" scikit-learn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file\n", + "import leafmap.maplibregl as leafmap" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download Zillow Data\n", + "\n", + "Download the Zillow home value data at the county level." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zhvi_url = \"https://github.com/opengeos/datasets/releases/download/us/zillow_home_value_index_by_county.csv\"\n", + "zhvi_file = \"data/zillow_home_value_index_by_county.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(zhvi_file):\n", + " download_file(zhvi_url, zhvi_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Process Zillow Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zhvi_df = pd.read_csv(\n", + " zhvi_file, dtype={\"StateCodeFIPS\": \"string\", \"MunicipalCodeFIPS\": \"string\"}\n", + ")\n", + "zhvi_df.index = \"geoId/\" + zhvi_df[\"StateCodeFIPS\"] + zhvi_df[\"MunicipalCodeFIPS\"]\n", + "zhvi_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Request access to PDFM Embeddings\n", + "\n", + "To request access to PDFM embeddings, please follow the instructions [here](https://github.com/google-research/population-dynamics?tab=readme-ov-file#getting-access-to-the-embeddings)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "county_geojson = \"data/county.geojson\"\n", + "if not os.path.exists(county_geojson):\n", + " raise FileNotFoundError(\"Please request the embeddings from Google\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load county boundaries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "county_gdf = gpd.read_file(county_geojson)\n", + "county_gdf.set_index(\"place\", inplace=True)\n", + "county_gdf.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Join home value data and county boundaries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = zhvi_df.join(county_gdf)\n", + "zhvi_gdf = gpd.GeoDataFrame(df, geometry=\"geometry\")\n", + "zhvi_gdf.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "column = \"2024-10-31\"\n", + "gdf = zhvi_gdf[[\"RegionName\", \"State\", column, \"geometry\"]]\n", + "gdf.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize home values in 2D" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m = leafmap.Map(style=\"liberty\")\n", + "first_symbol_id = m.find_first_symbol_layer()[\"id\"]\n", + "m.add_data(\n", + " gdf,\n", + " cmap=\"Blues\",\n", + " column=column,\n", + " legend_title=\"Median Home Value\",\n", + " name=\"Median Home Value\",\n", + " before_id=first_symbol_id,\n", + ")\n", + "m.add_layer_control()\n", + "m" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![image](https://github.com/user-attachments/assets/df616803-87e9-4326-be8c-9de5a72a2d95)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize home values in 3D" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m = leafmap.Map(style=\"liberty\", pitch=60)\n", + "m.add_data(\n", + " gdf,\n", + " cmap=\"Blues\",\n", + " column=column,\n", + " legend_title=\"Median Home Value\",\n", + " extrude=True,\n", + " scale_factor=3,\n", + " before_id=first_symbol_id,\n", + " name=\"Median Home Value\",\n", + ")\n", + "m.add_layer_control()\n", + "m" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![image](https://github.com/user-attachments/assets/65f085bb-b781-41b8-8408-440ed202b766)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load PDFM county embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embeddings_file_path = \"data/county_embeddings.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embeddings_df = pd.read_csv(embeddings_file_path).set_index(\"place\")\n", + "embeddings_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = embeddings_df.join(county_gdf)\n", + "embeddings_gdf = gpd.GeoDataFrame(df, geometry=\"geometry\")\n", + "embeddings_gdf.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize PDFM features\n", + "\n", + "Select any of the 329 PDFM features to visualize." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "column = \"feature329\" # Change this to the feature you want to use\n", + "gdf = embeddings_gdf[[column, \"state\", \"county\", \"geometry\"]]\n", + "gdf.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m = leafmap.Map(style=\"liberty\")\n", + "m.add_data(\n", + " gdf,\n", + " cmap=\"Blues\",\n", + " column=column,\n", + " legend_title=column,\n", + " before_id=first_symbol_id,\n", + " name=column,\n", + ")\n", + "m.add_layer_control()\n", + "m" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![image](https://github.com/user-attachments/assets/efa4983d-89d2-4dcc-9cd6-fefeee069bf7)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m = leafmap.Map(style=\"liberty\", pitch=60)\n", + "m.add_data(\n", + " gdf,\n", + " cmap=\"Blues\",\n", + " column=column,\n", + " legend_title=column,\n", + " before_id=first_symbol_id,\n", + " name=column,\n", + " extrude=True,\n", + " scale_factor=0.00005,\n", + ")\n", + "m.add_layer_control()\n", + "m" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![image](https://github.com/user-attachments/assets/fd1b233b-2be0-47c3-b2d0-f53601020604)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Join Zillow and PDFM Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = zhvi_df.join(embeddings_df, how=\"inner\")\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_features = [f\"feature{x}\" for x in range(330)]\n", + "label = \"2024-10-31\" # Change this to the date you want to predict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = data.dropna(subset=[label])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Split Train and Test Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = data[embedding_features + [label]]\n", + "X = data[embedding_features]\n", + "y = data[label]\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fit Linear Regression Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = LinearRegression()\n", + "model.fit(X_train, y_train)\n", + "y_pred = model.predict(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate Linear Regression Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluation_df = pd.DataFrame({\"y\": y_test, \"y_pred\": y_pred})\n", + "metrics = evaluate_model(evaluation_df)\n", + "print(metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "xy_lim = (0, 1_000_000)\n", + "plot_actual_vs_predicted(\n", + " evaluation_df,\n", + " xlim=xy_lim,\n", + " ylim=xy_lim,\n", + " title=\"Actual vs Predicted Home Values\",\n", + " x_label=\"Actual Home Value\",\n", + " y_label=\"Predicted Home Value\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![image](https://github.com/user-attachments/assets/a16a15c6-508b-40c9-aa57-8e98b8b8e216)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Join predicted values with county boundaries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = evaluation_df.join(gdf)\n", + "df[\"difference\"] = df[\"y_pred\"] - df[\"y\"]\n", + "evaluation_gdf = gpd.GeoDataFrame(df, geometry=\"geometry\")\n", + "evaluation_gdf.drop(columns=[\"category\", \"color\", column], inplace=True)\n", + "evaluation_gdf.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize actual home values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m = leafmap.Map(style=\"liberty\", pitch=60)\n", + "m.add_data(\n", + " evaluation_gdf,\n", + " cmap=\"Blues\",\n", + " column=\"y\",\n", + " legend_title=\"Actual Home Value\",\n", + " before_id=first_symbol_id,\n", + " name=\"Actual Home Value\",\n", + " extrude=True,\n", + " scale_factor=3,\n", + ")\n", + "m.add_layer_control()\n", + "m" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![image](https://github.com/user-attachments/assets/a86401d5-defd-4277-877b-c8f8c5e07651)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize predicted home values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m = leafmap.Map(style=\"liberty\", pitch=60)\n", + "m.add_data(\n", + " evaluation_gdf,\n", + " cmap=\"Blues\",\n", + " column=\"y_pred\",\n", + " legend_title=\"Predicted Home Value\",\n", + " before_id=first_symbol_id,\n", + " name=\"Predicted Home Value\",\n", + " extrude=True,\n", + " scale_factor=3,\n", + ")\n", + "m.add_layer_control()\n", + "m" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![image](https://github.com/user-attachments/assets/6bfe1c89-968e-4eef-84cc-98bcf07c8acb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize difference between predicted and actual home values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "m = leafmap.Map(style=\"liberty\", pitch=60)\n", + "m.add_data(\n", + " evaluation_gdf,\n", + " cmap=\"coolwarm\",\n", + " column=\"difference\",\n", + " legend_title=\"y_pred-y\",\n", + " before_id=first_symbol_id,\n", + " name=\"Difference\",\n", + " extrude=True,\n", + " scale_factor=3,\n", + ")\n", + "m.add_layer_control()\n", + "m" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![image](https://github.com/user-attachments/assets/4ae58eb1-ab28-49e5-b352-d769a32d3e5c)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/index.md b/docs/index.md index a1f9f11..97e4f5e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,4 +6,15 @@ A collection of Jupyter notebook examples for using GeoAI Tutorials for Using Google's [Population Dynamics Foundation Model](https://github.com/google-research/population-dynamics) (PDFM) -- [Predicting US Housing Prices with PDFM and Zillow Data](https://geoai-tutorials.gishub.org/PDFM/zillow_home_value/) +- [Predicting US Housing Prices with PDFM and Zillow Data](https://geoai-tutorials.gishub.org/PDFM/zillow_home_value) +- [Visualizing PDFM Features and Predicted Home Values](https://geoai-tutorials.gishub.org/PDFM/map_pdfm_features) + +## Demos + +- Visualize US meidan home values at the county level + + ![image](https://github.com/user-attachments/assets/9052e391-a26a-4a75-9ffe-d4abac3c0fce) + +- Visualize PDFM features + + ![image](https://github.com/user-attachments/assets/fd1b233b-2be0-47c3-b2d0-f53601020604) diff --git a/mkdocs.yml b/mkdocs.yml index ae6b222..0a7486b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -58,6 +58,7 @@ nav: - Home: index.md - PDFM: - PDFM/zillow_home_value.ipynb + - PDFM/map_pdfm_features.ipynb - Opengeos: https://github.com/opengeos - YouTube Channel: https://youtube.com/@giswqs - Report Issues: https://github.com/opengeos/GeoAI-Tutorials/issues