diff --git a/geocube/api/core.py b/geocube/api/core.py index 52fe525..8c3841c 100644 --- a/geocube/api/core.py +++ b/geocube/api/core.py @@ -2,6 +2,8 @@ """ GeoCube client core functionality """ +import numpy + from geocube.geo_utils.geobox import GeoBoxMaker from geocube.vector_to_cube import VectorToCube @@ -15,7 +17,7 @@ def make_geocube( align=None, geom=None, like=None, - fill=None, + fill=numpy.nan, group_by=None, interpolate_na_method=None, categorical_enums=None, @@ -59,7 +61,7 @@ def make_geocube( gcds = make_geocube(vector_data='my_vector.geopackage', like=other_gcds) fill: float, optional - The value to fill in the grid with for nodata. Default is -9999.0. + The value to fill in the grid with for nodata. Default is NaN. group_by: str, optional When specified, perform basic combining/reducing of the data on this column. interpolate_na_method: {‘linear’, ‘nearest’, ‘cubic’}, optional diff --git a/geocube/cli/commands/make_geocube.py b/geocube/cli/commands/make_geocube.py index 00d37db..5471439 100644 --- a/geocube/cli/commands/make_geocube.py +++ b/geocube/cli/commands/make_geocube.py @@ -4,6 +4,7 @@ """ import click +import numpy import xarray from geocube.api import core @@ -27,7 +28,8 @@ "-f", "--fill", type=float, - help="The value to fill in the grid with for nodata. Default is -9999.0.", + help="The value to fill in the grid with for nodata. Default is NaN.", + default=numpy.nan, required=False, ) @click.option( diff --git a/geocube/rasterize.py b/geocube/rasterize.py index 17a57fe..26a2ad6 100644 --- a/geocube/rasterize.py +++ b/geocube/rasterize.py @@ -14,12 +14,26 @@ from geocube.logger import get_logger +def _remove_missing_data(data_values, geometry_array): + """ + Missing data causes issues with interpolation of point data + https://github.com/corteva/geocube/issues/9 + + This filters the data so those issues don't cause problems. + """ + not_missing_data = ~pandas.isnull(data_values) + geometry_array = geometry_array[not_missing_data] + data_values = data_values[not_missing_data] + return data_values, geometry_array + + def rasterize_image( geometry_array, data_values, geobox, - fill=-9999.0, + fill, merge_alg=MergeAlg.replace, + filter_nan=False, **ignored_kwargs ): """ @@ -33,10 +47,13 @@ def rasterize_image( Data values associated with the list of geojson shapes geobox: :obj:`datacube.utils.geometry.GeoBox` Transform of the resulting image. - fill: float, optional - The value to fill in the grid with for nodata. Default is -9999.0. + fill: float + The value to fill in the grid with for nodata. merge_alg: `rasterio.enums.MergeAlg`, optional The algorithm for merging values into one cell. Default is `MergeAlg.replace`. + filter_nan: bool, optional + If True, will remove nodata values from the data before rasterization. + Default is False. **ignored_kwargs: These are there to be flexible with additional rasterization methods and will be ignored. @@ -50,6 +67,10 @@ def rasterize_image( logger = get_logger() try: + if filter_nan: + data_values, geometry_array = _remove_missing_data( + data_values, geometry_array + ) image = rasterio.features.rasterize( zip(geometry_array.apply(mapping).values, data_values), out_shape=(geobox.height, geobox.width), @@ -66,26 +87,14 @@ def rasterize_image( raise -def _remove_missing_data(data_values, geometry_array): - """ - Missing data causes issues with interpolation of point data - https://github.com/corteva/geocube/issues/9 - - This filters the data so those issues don't cause problems. - """ - not_missing_data = ~pandas.isnull(data_values) - geometry_array = geometry_array[not_missing_data] - data_values = data_values[not_missing_data] - return data_values, geometry_array - - def rasterize_points_griddata( geometry_array, data_values, grid_coords, - fill=-9999.0, + fill, method="nearest", rescale=False, + filter_nan=False, **ignored_kwargs ): """ @@ -100,12 +109,15 @@ def rasterize_points_griddata( Data values associated with the list of geojson shapes grid_coords: dict Output from `rioxarray.rioxarray.affine_to_coords` - fill: float, optional - The value to fill in the grid with for nodata. Default is -9999.0. + fill: float + The value to fill in the grid with for nodata. method: {‘linear’, ‘nearest’, ‘cubic’}, optional The method to use for interpolation in `scipy.interpolate.griddata`. rescale: bool, optional Rescale points to unit cube before performing interpolation. Default is false. + filter_nan: bool, optional + If True, will remove nodata values from the data before rasterization. + Default is False. **ignored_kwargs: These are there to be flexible with additional rasterization methods and will be ignored. @@ -118,7 +130,10 @@ def rasterize_points_griddata( if data_values.dtype == object: return None try: - data_values, geometry_array = _remove_missing_data(data_values, geometry_array) + if filter_nan: + data_values, geometry_array = _remove_missing_data( + data_values, geometry_array + ) return griddata( points=(geometry_array.x, geometry_array.y), values=data_values, @@ -134,7 +149,12 @@ def rasterize_points_griddata( def rasterize_points_radial( - geometry_array, data_values, grid_coords, method="linear", **ignored_kwargs + geometry_array, + data_values, + grid_coords, + method="linear", + filter_nan=False, + **ignored_kwargs ): """ This method uses scipy.interpolate.Rbf to interpolate point data @@ -148,13 +168,13 @@ def rasterize_points_radial( Data values associated with the list of geojson shapes grid_coords: dict Output from `rioxarray.rioxarray.affine_to_coords` - fill: float, optional - The value to fill in the grid with for nodata. Default is -9999.0. method: str, optional The function to use for interpolation in `scipy.interpolate.Rbf`. {'multiquadric', 'inverse', 'gaussian', 'linear', 'cubic', 'quintic', 'thin_plate'} - + filter_nan: bool, optional + If True, will remove nodata values from the data before rasterization. + Default is False. **ignored_kwargs: These are there to be flexible with additional rasterization methods and will be ignored. @@ -167,7 +187,10 @@ def rasterize_points_radial( logger = get_logger() try: - data_values, geometry_array = _remove_missing_data(data_values, geometry_array) + if filter_nan: + data_values, geometry_array = _remove_missing_data( + data_values, geometry_array + ) interp = Rbf(geometry_array.x, geometry_array.y, data_values, function=method) return interp(*numpy.meshgrid(grid_coords["x"], grid_coords["y"])) except ValueError as ter: diff --git a/geocube/vector_to_cube.py b/geocube/vector_to_cube.py index 5db5a93..a0672e9 100644 --- a/geocube/vector_to_cube.py +++ b/geocube/vector_to_cube.py @@ -62,7 +62,7 @@ def __init__(self, vector_data, geobox_maker, fill=None, categorical_enums=None) geobox_maker: :obj:`geocube.geo_utils.geobox.GeoBoxMaker` The geobox for the grid to be generated from the vector data. fill: float, optional - The value to fill in the grid with for nodata. Default is -9999.0. + The value to fill in the grid with for nodata. Default is NaN. categorical_enums: dict, optional A dictionary of all categories for the table columns containing categorical data. @@ -74,7 +74,7 @@ def __init__(self, vector_data, geobox_maker, fill=None, categorical_enums=None) self.grid_coords = affine_to_coords( self.geobox.affine, self.geobox.width, self.geobox.height ) - self.fill = fill if fill is not None else -9999.0 + self.fill = fill if fill is not None else numpy.nan if categorical_enums is not None: for column_name, categories in categorical_enums.items(): category_type = pandas.api.types.CategoricalDtype( diff --git a/sphinx/history.rst b/sphinx/history.rst index 73d9863..7f355fa 100644 --- a/sphinx/history.rst +++ b/sphinx/history.rst @@ -3,7 +3,8 @@ History 0.0.10 ------ -- Filter out missing data when interpolating from point data (issue #9) +- Added filter_nan kwarg to filter out missing data when rasterizing (issue #9) +- Change default fill value to NaN when rasterizing (pull #11) 0.0.9 ----- diff --git a/test/integration/api/test_core_integration.py b/test/integration/api/test_core_integration.py index 0644655..4514e00 100644 --- a/test/integration/api/test_core_integration.py +++ b/test/integration/api/test_core_integration.py @@ -60,6 +60,7 @@ def test_make_geocube(input_geodata, tmpdir): output_crs=TEST_GARS_PROJ, geom=json.dumps(mapping(TEST_GARS_POLY)), resolution=(-10, 10), + fill=-9999.0, ) # test writing to netCDF @@ -92,6 +93,7 @@ def test_make_geocube__categorical(input_geodata, tmpdir): geom=json.dumps(mapping(TEST_GARS_POLY)), resolution=(-10, 10), categorical_enums={"soil_type": ("sand", "silt", "clay")}, + fill=-9999.0, ) # test writing to netCDF out_grid.to_netcdf( @@ -132,6 +134,7 @@ def test_make_geocube__interpolate_na(input_geodata, tmpdir): geom=json.dumps(mapping(TEST_GARS_POLY)), resolution=(-10, 10), interpolate_na_method="nearest", + fill=-9999.0, ) # test writing to netCDF @@ -170,7 +173,10 @@ def test_make_geocube__like(input_geodata, tmpdir): os.path.join(TEST_COMPARE_DATA_DIR, "soil_grid_flat.nc"), mask_and_scale=False ) as xdc: out_grid = make_geocube( - vector_data=input_geodata, measurements=soil_attribute_list, like=xdc + vector_data=input_geodata, + measurements=soil_attribute_list, + like=xdc, + fill=-9999.0, ) # test writing to netCDF @@ -203,6 +209,7 @@ def test_make_geocube__only_resolution(input_geodata, tmpdir): vector_data=input_geodata, measurements=soil_attribute_list, resolution=(-0.001, 0.001), + fill=-9999.0, ) # test writing to netCDF @@ -231,6 +238,7 @@ def test_make_geocube__convert_time(input_geodata, tmpdir): measurements=["test_attr", "test_time_attr", "test_str_attr"], datetime_measurements=["test_time_attr"], resolution=(-0.00001, 0.00001), + fill=-9999.0, ) # test writing to netCDF @@ -275,6 +283,7 @@ def test_make_geocube__like_error_invalid_args(load_extra_kwargs): vector_data=os.path.join(TEST_INPUT_DATA_DIR, "soil_data_flat.geojson"), measurements=soil_attribute_list, like=xdc, + fill=-9999.0, **load_extra_kwargs ) @@ -292,6 +301,7 @@ def test_make_geocube__no_measurements(input_geodata, tmpdir): output_crs=TEST_GARS_PROJ, geom=json.dumps(mapping(TEST_GARS_POLY)), resolution=(-10, 10), + fill=-9999.0, ) # test writing to netCDF @@ -311,6 +321,7 @@ def test_make_geocube__no_geom(tmpdir): vector_data=os.path.join(TEST_INPUT_DATA_DIR, "soil_data_flat.geojson"), measurements=["sandtotal_r"], resolution=(-0.001, 0.001), + fill=-9999.0, ) # test writing to netCDF @@ -350,6 +361,7 @@ def test_make_geocube__no_resolution_error(): measurements=["sandtotal_r"], output_crs=TEST_GARS_PROJ, geom=json.dumps(mapping(TEST_GARS_POLY)), + fill=-9999.0, ) @@ -378,6 +390,7 @@ def test_make_geocube__group_by(input_geodata, tmpdir): geom=json.dumps(mapping(TEST_GARS_POLY)), group_by="hzdept_r", resolution=(-10, 10), + fill=-9999.0, ) # test writing to netCDF @@ -417,6 +430,7 @@ def test_make_geocube__group_by__categorical(input_geodata, tmpdir): group_by="hzdept_r", resolution=(-10, 10), categorical_enums={"soil_type": ("sand", "silt", "clay")}, + fill=-9999.0, ) # test writing to netCDF @@ -461,6 +475,7 @@ def test_make_geocube__group_by_like(input_geodata, tmpdir): measurements=soil_attribute_list, group_by="hzdept_r", like=xdc, + fill=-9999.0, ) # test writing to netCDF @@ -488,6 +503,7 @@ def test_make_geocube__group_by_only_resolution(input_geodata, tmpdir): measurements=soil_attribute_list, group_by="hzdept_r", resolution=(-0.001, 0.001), + fill=-9999.0, ) # test writing to netCDF @@ -518,6 +534,7 @@ def test_make_geocube__group_by_time(input_geodata, tmpdir): datetime_measurements=["test_time_attr"], resolution=(-0.00001, 0.00001), group_by="test_time_attr", + fill=-9999.0, ) # test writing to netCDF @@ -547,6 +564,7 @@ def test_make_geocube__group_by_convert_with_time(input_geodata, tmpdir): datetime_measurements=["test_time_attr"], resolution=(-0.00001, 0.00001), group_by="test_attr", + fill=-9999.0, ) # test writing to netCDF @@ -599,6 +617,7 @@ def test_make_geocube__group_by_like_error_invalid_args(load_extra_kwargs): measurements=soil_attribute_list, like=xdc, group_by="hzdept_r", + fill=-9999.0, **load_extra_kwargs ) @@ -617,6 +636,7 @@ def test_make_geocube__group_by_no_measurements(input_geodata, tmpdir): geom=json.dumps(mapping(TEST_GARS_POLY)), group_by="hzdept_r", resolution=(-10, 10), + fill=-9999.0, ) # test writing to netCDF @@ -639,6 +659,7 @@ def test_make_geocube__group_by__no_geom(tmpdir): measurements=["sandtotal_r"], group_by="hzdept_r", resolution=(-0.001, 0.001), + fill=-9999.0, ) # test writing to netCDF @@ -664,6 +685,7 @@ def test_make_geocube__group_by__no_resolution_error(): output_crs=TEST_GARS_PROJ, geom=json.dumps(mapping(TEST_GARS_POLY)), group_by="hzdept_r", + fill=-9999.0, ) @@ -672,6 +694,7 @@ def test_make_geocube__new_bounds_crs(): vector_data=os.path.join(TEST_INPUT_DATA_DIR, "wgs84_geom.geojson"), output_crs="epsg:32614", resolution=(-1, 1), + fill=-9999.0, ) assert_almost_equal( utm_cube.id.rio.bounds(), (1665478.0, 7018306.0, 1665945.0, 7018509.0) @@ -701,6 +724,7 @@ def test_make_geocube__custom_rasterize_function(function, compare_name, tmpdir) measurements=["test_attr", "test_time_attr", "test_str_attr"], resolution=(-0.00001, 0.00001), rasterize_function=function, + fill=-9999.0, ) # test writing to netCDF @@ -716,12 +740,22 @@ def test_make_geocube__custom_rasterize_function(function, compare_name, tmpdir) @pytest.mark.parametrize( "function,compare_name", [ - (rasterize_points_griddata, "rasterize_griddata_nearest_nodata.nc"), ( - partial(rasterize_points_griddata, method="cubic"), + partial(rasterize_points_griddata, filter_nan=True), + "rasterize_griddata_nearest_nodata.nc", + ), + ( + partial(rasterize_points_griddata, method="cubic", filter_nan=True), "rasterize_griddata_cubic_nodata.nc", ), - (rasterize_points_radial, "rasterize_radial_linear_nodata.nc"), + ( + partial(rasterize_points_radial, filter_nan=True), + "rasterize_radial_linear_nodata.nc", + ), + ( + partial(rasterize_image, merge_alg=MergeAlg.add, filter_nan=True), + "rasterize_image_sum_nodata.nc", + ), ], ) def test_make_geocube__custom_rasterize_function__filter_null( diff --git a/test/test_data/compare/rasterize_griddata_cubic_nodata.nc b/test/test_data/compare/rasterize_griddata_cubic_nodata.nc index 7f638ba..063922f 100644 Binary files a/test/test_data/compare/rasterize_griddata_cubic_nodata.nc and b/test/test_data/compare/rasterize_griddata_cubic_nodata.nc differ diff --git a/test/test_data/compare/rasterize_griddata_nearest_nodata.nc b/test/test_data/compare/rasterize_griddata_nearest_nodata.nc index c161d89..c985595 100644 Binary files a/test/test_data/compare/rasterize_griddata_nearest_nodata.nc and b/test/test_data/compare/rasterize_griddata_nearest_nodata.nc differ diff --git a/test/test_data/compare/rasterize_image_sum_nodata.nc b/test/test_data/compare/rasterize_image_sum_nodata.nc new file mode 100644 index 0000000..a4ec719 Binary files /dev/null and b/test/test_data/compare/rasterize_image_sum_nodata.nc differ diff --git a/test/test_data/compare/rasterize_radial_linear_nodata.nc b/test/test_data/compare/rasterize_radial_linear_nodata.nc index f3758d7..1196cfe 100644 Binary files a/test/test_data/compare/rasterize_radial_linear_nodata.nc and b/test/test_data/compare/rasterize_radial_linear_nodata.nc differ diff --git a/test/unit/cli/commands/test_make_geocube.py b/test/unit/cli/commands/test_make_geocube.py index 7601e97..c724e2a 100644 --- a/test/unit/cli/commands/test_make_geocube.py +++ b/test/unit/cli/commands/test_make_geocube.py @@ -1,5 +1,6 @@ import os +import numpy import pytest import xarray from click.testing import CliRunner @@ -12,7 +13,7 @@ def _get_called_dict(**kwargs): default = dict( align=None, - fill=None, + fill=numpy.nan, geom=None, group_by=None, interpolate_na_method=None,