diff --git a/environment.yml b/environment.yml index 867edb3a..8f74a990 100644 --- a/environment.yml +++ b/environment.yml @@ -19,3 +19,4 @@ dependencies: - mapclassify - descartes - joblib + - esda diff --git a/tobler/__init__.py b/tobler/__init__.py index 31b6f23f..1c76137f 100644 --- a/tobler/__init__.py +++ b/tobler/__init__.py @@ -8,3 +8,4 @@ from . import dasymetric from . import model from . import util +from . import diagnostics diff --git a/tobler/area_weighted/area_interpolate.py b/tobler/area_weighted/area_interpolate.py index 7911f850..9c17777b 100644 --- a/tobler/area_weighted/area_interpolate.py +++ b/tobler/area_weighted/area_interpolate.py @@ -6,12 +6,13 @@ import numpy as np import geopandas as gpd from ._vectorized_raster_interpolation import _fast_append_profile_in_gdf -import warnings +from warnings import warn from scipy.sparse import dok_matrix, diags, coo_matrix import pandas as pd import os from tobler.util.util import _check_crs, _nan_check, _inf_check, _check_presence_of_crs +from tobler.diagnostics import _smaup def _chunk_dfs(geoms_to_chunk, geoms_full, n_jobs): @@ -268,6 +269,7 @@ def _area_interpolate_binning( spatial_index="auto", n_jobs=1, categorical_variables=None, + smaup_kwds=None ): """ Area interpolation for extensive, intensive and categorical variables. @@ -307,6 +309,11 @@ def _area_interpolate_binning( of `pygeos` and `geopandas`. categorical_variables : list [Optional. Default=None] Columns in dataframes for categorical variables + smaup_kwds : dict + [Optional. Default = None] Keyword arguments for tobler's smaup wrapper + Requires the following values: + int: 'k' for number of regions to be tested and + libpysal.weights: 'w' to calculate Moran's I. Returns ------- @@ -348,6 +355,14 @@ def _area_interpolate_binning( source_df = source_df.copy() target_df = target_df.copy() + if smaup_kwds is not None: + for var in intensive_variables,extensive_variables: + stat = _smaup(smaup_kwds["k"], source_df[var].to_numpy(), smaup_kwds["w"]) + if stat.summary.find('H0 is rejected'): + warn(f"{var} is affected by the MAUP. Interpolations of this variable may not be accourate!") + else: + print(f"{var} is not affected by the MAUP.") + if _check_crs(source_df, target_df): pass else: @@ -614,7 +629,7 @@ def _area_tables_raster( res_union_pre = gpd.overlay(source_df, target_df, how="union") # Establishing a CRS for the generated union - warnings.warn( + warn( "The CRS for the generated union will be set to be the same as source_df." ) res_union_pre.crs = source_df.crs diff --git a/tobler/dasymetric/masked_area_interpolate.py b/tobler/dasymetric/masked_area_interpolate.py index e42f1651..392fd333 100644 --- a/tobler/dasymetric/masked_area_interpolate.py +++ b/tobler/dasymetric/masked_area_interpolate.py @@ -2,6 +2,9 @@ from ..area_weighted._vectorized_raster_interpolation import * +from tobler.diagnostics import _smaup +from warnings import warn + def masked_area_interpolate( source_df, @@ -13,6 +16,7 @@ def masked_area_interpolate( intensive_variables=None, allocate_total=True, tables=None, + smaup_kwds=None, ): """Interpolate data between two polygonal datasets using an auxiliary raster to mask out uninhabited land. @@ -38,6 +42,11 @@ def masked_area_interpolate( whether to allocate the total from the source geometries (the default is True). tables : tuple of two numpy.array (optional) As generated from `tobler.area_weighted.area_tables_raster` (the default is None). + smaup_kwds : dict + [Optional. Default = None] Keyword arguments for tobler's smaup wrapper + Requires the following values: + int: 'k' for number of regions to be tested and + libpysal.weights: 'w' to calculate Moran's I. Returns ------- @@ -53,6 +62,14 @@ def masked_area_interpolate( "You must pass the path to a raster that can be read with rasterio" ) + if smaup_kwds is not None: + for var in intensive_variables,extensive_variables: + stat = _smaup(smaup_kwds["k"], source_df[var].to_numpy(), smaup_kwds["w"]) + if stat.summary.find('H0 is rejected'): + warn(f"{var} is affected by the MAUP. Interpolations of this variable may not be accourate!") + else: + print(f"{var} is not affected by the MAUP.") + if not tables: tables = _area_tables_raster( source_df, diff --git a/tobler/diagnostics/__init__.py b/tobler/diagnostics/__init__.py new file mode 100644 index 00000000..3c74d2a2 --- /dev/null +++ b/tobler/diagnostics/__init__.py @@ -0,0 +1 @@ +from .smaup import _smaup diff --git a/tobler/diagnostics/smaup.py b/tobler/diagnostics/smaup.py new file mode 100644 index 00000000..7526fa15 --- /dev/null +++ b/tobler/diagnostics/smaup.py @@ -0,0 +1,35 @@ +""" +A wrapper for using the S-maup statistical test in tobler interpolation + +""" + +from esda.smaup import Smaup +from esda.moran import Moran + +def _smaup(k, y, w,): + """ + A function that wraps esda's smaup function and automates some of the process of calculating smaup. + For more about smaup read here: https://pysal.org/esda/generated/esda.Smaup.html#esda.Smaup + + Parameters + ---------- + + k : int + number of regions + y : array + data for autocorellation calculation + w : libpysal.weights object + pysal spatial weights object for autocorellation calculation + + Returns + ------- + esda.smaup.Smaup: + statistic that contains information regarding the extent to which the variable is affected by the MAUP. + + + """ + rho = Moran(y, w).I + n = len(y) + stat = Smaup(n,k,rho) + return stat + \ No newline at end of file diff --git a/tobler/model/glm.py b/tobler/model/glm.py index 9e8f82ef..50ac996e 100644 --- a/tobler/model/glm.py +++ b/tobler/model/glm.py @@ -12,6 +12,7 @@ _return_weights_from_regression, ) from tobler.util import project_gdf +from tobler.diagnostics import _smaup def glm_pixel_adjusted( @@ -100,6 +101,7 @@ def glm( likelihood="poisson", force_crs_match=True, return_model=False, + smaup_kwds=None ): """Estimate interpolated values using raster data as input to a generalized linear model. @@ -130,6 +132,11 @@ def glm( return model : bool whether to return the fitted model in addition to the interpolated geodataframe. If true, this will return (geodataframe, model) + smaup_kwds : dict + [Optional. Default = None] Keyword arguments for tobler's smaup wrapper + Requires the following values: + int: 'k' for number of regions to be tested and + libpysal.weights: 'w' to calculate Moran's I. Returns -------- @@ -143,6 +150,14 @@ def glm( source_df = source_df.copy() target_df = target_df.copy() _check_presence_of_crs(source_df) + + if smaup_kwds is not None: + stat = _smaup(smaup_kwds["k"], source_df[variable].to_numpy(), smaup_kwds["w"]) + if stat.summary.find('H0 is rejected'): + warn(f"{variable} is affected by the MAUP. Interpolations of this variable may not be accourate!") + else: + print(f"{variable} is not affected by the MAUP.") + liks = {"poisson": Poisson, "gaussian": Gaussian, "neg_binomial": NegativeBinomial} if likelihood not in liks.keys():