From 3160423af0ff45d994e96fb3fae0bfbf436c9d22 Mon Sep 17 00:00:00 2001 From: Xee authors Date: Mon, 9 Oct 2023 10:32:32 -0700 Subject: [PATCH 01/15] Setting Minimum Versions (Python and EE) We use 3.9-style type annotations in this codebase. To support 3.8 in the future, we'll need to use a compatible style type annotation. In addition, we set the minimum version of the EE client to prevent errors in computing pixels. Fixes #48 and Fixes #56. PiperOrigin-RevId: 571976897 --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index df7116e..aedd555 100644 --- a/setup.py +++ b/setup.py @@ -43,14 +43,14 @@ license='Apache 2.0', author='Google LLC', author_email='noreply@google.com', - install_requires=['xarray', 'earthengine-api', 'pyproj', 'affine'], + install_requires=['xarray', 'earthengine-api>=0.1.374', 'pyproj', 'affine'], extras_require={ 'tests': tests_requires, 'examples': examples_require, }, url='https://github.com/google/xee', packages=setuptools.find_packages(exclude=['examples']), - python_requires='>=3.8', + python_requires='>=3.9', entry_points={ 'xarray.backends': ['ee=xee:EarthEngineBackendEntrypoint'], } From f870e2376b5b92c1247a9f09d2a269b52982194c Mon Sep 17 00:00:00 2001 From: Xee authors Date: Mon, 9 Oct 2023 14:30:50 -0700 Subject: [PATCH 02/15] Ensure dataset attributes are valid. Fixes #49. 
PiperOrigin-RevId: 572042222 --- xee/ext.py | 30 ++++++++++++++++++++++++++++-- xee/ext_integration_test.py | 18 +++++++++++++++++- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/xee/ext.py b/xee/ext.py index 65bd9e0..b094cbe 100644 --- a/xee/ext.py +++ b/xee/ext.py @@ -89,6 +89,17 @@ class EarthEngineStore(common.AbstractDataStore): DEFAULT_MASK_VALUE = np.iinfo(np.int32).max + ATTRS_VALID_TYPES = ( + str, + int, + float, + complex, + np.ndarray, + np.number, + list, + tuple + ) + @classmethod def open( cls, @@ -164,7 +175,7 @@ def __init__( coordinates=f'{self.primary_dim_name} {x_dim_name} {y_dim_name}', crs=self.crs_arg, ) - + self._props = self._make_attrs_valid(self._props) # Scale in the projection's units. Typically, either meters or degrees. # If we use the default CRS i.e. EPSG:3857, the units is in meters. default_scale = self.SCALE_UNITS.get(self.scale_units, 1) @@ -324,13 +335,28 @@ def _band_attrs(self, band_name: str) -> types.BandInfo: def _bands(self) -> list[str]: return [b['id'] for b in self._img_info['bands']] + def _make_attrs_valid( + self, attrs: dict[str, Any] + ) -> dict[ + str, + Union[ + str, int, float, complex, np.ndarray, np.number, list[Any], tuple[Any] + ], + ]: + return { + key: (str(value) + if not isinstance(value, self.ATTRS_VALID_TYPES) + else value) + for key, value in attrs.items() + } + def open_store_variable(self, name: str) -> xarray.Variable: arr = EarthEngineBackendArray(name, self) data = indexing.LazilyIndexedArray(arr) x_dim_name, y_dim_name = self.dimension_names dimensions = [self.primary_dim_name, x_dim_name, y_dim_name] - attrs = self._band_attrs(name) + attrs = self._make_attrs_valid(self._band_attrs(name)) encoding = { 'source': attrs['id'], 'scale_factor': arr.scale, diff --git a/xee/ext_integration_test.py b/xee/ext_integration_test.py index 0f69ad0..4e5b089 100644 --- a/xee/ext_integration_test.py +++ b/xee/ext_integration_test.py @@ -13,7 +13,6 @@ # limitations under the License. 
# ============================================================================== r"""Integration tests for the Google Earth Engine backend for Xarray.""" - import pathlib from absl.testing import absltest @@ -358,6 +357,23 @@ def test_data_sanity_check(self): self.assertNotEqual(temperature_2m.min(), 0.0) self.assertNotEqual(temperature_2m.max(), 0.0) + def test_validate_band_attrs(self): + ds = self.entry.open_dataset( + 'ee:LANDSAT/LC08/C01/T1', + drop_variables=tuple(f'B{i}' for i in range(3, 12)), + scale=25.0, # in degrees + n_images=3, + ) + valid_types = (str, int, float, complex, np.ndarray, np.number, list, tuple) + + # Check attrs on the dataset itself + for _, value in ds.attrs.items(): + self.assertIsInstance(value, valid_types) + + # Check attrs on each variable within the dataset + for variable in ds.variables.values(): + for _, value in variable.attrs.items(): + self.assertIsInstance(value, valid_types) if __name__ == '__main__': absltest.main() From 68b503fd403676b22f70fe3dd0aac0e45b667046 Mon Sep 17 00:00:00 2001 From: Xee authors Date: Tue, 10 Oct 2023 09:12:10 -0700 Subject: [PATCH 03/15] Made changes to default chunks to not exceed 48 MB request limit. Fixes #50. PiperOrigin-RevId: 572269859 --- xee/ext.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xee/ext.py b/xee/ext.py index b094cbe..15e3f35 100644 --- a/xee/ext.py +++ b/xee/ext.py @@ -70,9 +70,9 @@ class EarthEngineStore(common.AbstractDataStore): # "Safe" default chunks that won't exceed the request limit. PREFERRED_CHUNKS: dict[str, int] = { - 'index': 24, + 'index': 48, 'width': 512, - 'height': 512, + 'height': 256, } SCALE_UNITS: dict[str, int] = { From 4a6f7d839df23b59ec99c27f6c11839c70824087 Mon Sep 17 00:00:00 2001 From: Xee authors Date: Tue, 10 Oct 2023 09:43:35 -0700 Subject: [PATCH 04/15] Updating Integration Test Constants. Updating constants as we begin to automate running integration tests. 
More attention is needed here to tell if the coordinates are accurate. This will be addressed in #57. PiperOrigin-RevId: 572278845 --- xee/ext_integration_test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/xee/ext_integration_test.py b/xee/ext_integration_test.py index 4e5b089..1f539c4 100644 --- a/xee/ext_integration_test.py +++ b/xee/ext_integration_test.py @@ -58,14 +58,14 @@ def setUp(self): def test_creates_lat_long_array(self): arr = xee.EarthEngineBackendArray('longitude', self.lnglat_store) - self.assertEqual((1, 360, 180), arr.shape) + self.assertEqual((1, 360, 179), arr.shape) def test_can_create_object(self): arr = xee.EarthEngineBackendArray('B4', self.store) self.assertIsNotNone(arr) - self.assertEqual((64, 360, 180), arr.shape) + self.assertEqual((64, 360, 179), arr.shape) self.assertEqual(np.int32, arr.dtype) self.assertEqual('B4', arr.variable_name) @@ -258,7 +258,7 @@ def test_open_dataset__sanity_check(self): n_images=3, ) self.assertEqual( - dict(ds.dims), {'time': 3, 'lon': 15, 'lat': 8} + dict(ds.dims), {'time': 3, 'lon': 15, 'lat': 7} ) self.assertNotEmpty(dict(ds.coords)) self.assertEqual( @@ -268,7 +268,7 @@ def test_open_dataset__sanity_check(self): for v in ds.values(): self.assertIsNotNone(v.data) self.assertFalse(v.isnull().all(), 'All values are null!') - self.assertEqual(v.shape, (3, 15, 8)) + self.assertEqual(v.shape, (3, 15, 7)) def test_open_dataset__n_images(self): ds = self.entry.open_dataset( @@ -308,7 +308,7 @@ def test_honors_geometry(self): engine=xee.EarthEngineBackendEntrypoint, ) - self.assertEqual(ds.dims, {'time': 4248, 'lon': 42, 'lat': 36}) + self.assertEqual(ds.dims, {'time': 4248, 'lon': 42, 'lat': 34}) self.assertNotEqual(ds.dims, standard_ds.dims) def test_honors_projection(self): @@ -335,14 +335,14 @@ def test_parses_ee_url(self): scale=25.0, # in degrees n_images=3, ) - self.assertEqual(dict(ds.dims), {'time': 3, 'lon': 15, 'lat': 8}) + self.assertEqual(dict(ds.dims), {'time': 
3, 'lon': 15, 'lat': 7}) ds = self.entry.open_dataset( 'ee:LANDSAT/LC08/C01/T1', drop_variables=tuple(f'B{i}' for i in range(3, 12)), scale=25.0, # in degrees n_images=3, ) - self.assertEqual(dict(ds.dims), {'time': 3, 'lon': 15, 'lat': 8}) + self.assertEqual(dict(ds.dims), {'time': 3, 'lon': 15, 'lat': 7}) def test_data_sanity_check(self): # This simple test uncovered a bug with the default definition of `scale`. From 74ec35b3e4974ab4c41f016da21c3925baf87f87 Mon Sep 17 00:00:00 2001 From: Xee authors Date: Tue, 10 Oct 2023 11:11:15 -0700 Subject: [PATCH 05/15] Have Earth Engine calculate coordinate information. Fixes #57. Per the suggestion of @raspstephan, we avoid computing lat/lng information ourselves and instead pull our coordinate information via RPC calls into EE. There didn't seem to be a good way to do this within `get_info()`, so this introduces two additional IO calls. This should only add overhead to the `open_dataset()` call (i.e. once) and not to every data index operation. Running benchmarks now, so far: ``` open_dataset():avg=33.11,std=8.23,best=22.81,worst=48.00 open_and_chunk():avg=37.61,std=8.79,best=23.95,worst=53.59 ``` PiperOrigin-RevId: 572310584 --- xee/ext.py | 218 +++++++++++++++++++----------------- xee/ext_integration_test.py | 9 +- 2 files changed, 120 insertions(+), 107 deletions(-) diff --git a/xee/ext.py b/xee/ext.py index 15e3f35..20df379 100644 --- a/xee/ext.py +++ b/xee/ext.py @@ -65,6 +65,13 @@ } +class _GetComputedPixels: + """Wrapper around `ee.data.computePixels()` to make retries simple.""" + + def __getitem__(self, params) -> np.ndarray: + return ee.data.computePixels(params) + + class EarthEngineStore(common.AbstractDataStore): """Read-only Data Store for Google Earth Engine.""" @@ -205,12 +212,12 @@ def __init__( ) # We add and subtract the scale to solve an off-by-one error. With this # adjustment, we achieve parity with a pure `computePixels()` call. 
- x_min, y_min = self.project(x_min_0 - self.scale_x, y_min_0) + x_min, y_min = self.transform(x_min_0 - self.scale_x, y_min_0) if _bounds_are_invalid(x_min, y_min, self.scale_units == 'degree'): - x_min, y_min = self.project(x_min_0, y_min_0) - x_max, y_max = self.project(x_max_0, y_max_0 + self.scale_y) + x_min, y_min = self.transform(x_min_0, y_min_0) + x_max, y_max = self.transform(x_max_0, y_max_0 + self.scale_y) if _bounds_are_invalid(x_max, y_max, self.scale_units == 'degree'): - x_max, y_max = self.project(x_max_0, y_max_0) + x_max, y_max = self.transform(x_max_0, y_max_0) self.bounds = x_min, y_min, x_max, y_max self.chunks = self.PREFERRED_CHUNKS.copy() @@ -318,12 +325,96 @@ def _assign_preferred_chunks(self) -> Chunks: chunks[y_dim_name] = self.chunks['height'] return chunks - def project(self, xs: float, ys: float) -> tuple[float, float]: + def transform(self, xs: float, ys: float) -> tuple[float, float]: transformer = pyproj.Transformer.from_crs( self.crs.geodetic_crs, self.crs, always_xy=True ) return transformer.transform(xs, ys) + def project(self, bbox: types.BBox) -> types.Grid: + """Translate a bounding box (pixel space) to a grid (projection space). + + Here, we calculate a simple affine transformation to get a specific region + when computing pixels. + + Args: + bbox: Bounding box in pixel space. + + Returns: + A Grid, to be passed into `computePixels()`'s "grid" keyword. Defines the + appropriate region of data to return according to the Array's configured + projection and scale. + """ + # The origin of the image is in the top left corner. X is the minimum value + # and Y is the maximum value. + x_origin, _, _, y_origin = self.bounds # x_min, x_max, y_min, y_max + x_start, y_start, x_end, y_end = bbox + width = x_end - x_start + height = y_end - y_start + + return { + # The size of the bounding box. The affine transform and project will be + # applied, so we can think in terms of pixels. 
+ 'dimensions': { + 'width': width, + 'height': height, + }, + 'affineTransform': { + # Since the origin is in the top left corner, we want to translate + # the start of the grid to the positive direction for X and the + # negative direction for Y. + 'translateX': x_origin + self.scale_x * x_start, + 'translateY': y_origin + self.scale_y * y_start, + # Define the scale for each pixel (e.g. the number of meters between + # each value). + 'scaleX': self.scale_x, + 'scaleY': self.scale_y, + }, + 'crsCode': self.crs_arg, + } + + def image_to_array( + self, + image: ee.Image, + pixels_getter=_GetComputedPixels(), + dtype=np.float32, + **kwargs, + ) -> np.ndarray: + """Gets the pixels for a given image as a numpy array. + + This method includes exponential backoff (with jitter) when trying to get + pixel data. + + Args: + image: An EE image. + pixels_getter: An object whose `__getitem__()` method calls + `computePixels()`. + dtype: a np.dtype. The returned array will be in this dtype. + **kwargs: Additional settings for `params` in `computePixels(params)`. For + example, a `grid` dictionary. + + Returns: + An numpy array containing the pixels computed based on the given image. + """ + image = image.unmask(self.mask_value) + params = { + 'expression': image, + 'fileFormat': 'NUMPY_NDARRAY', + **kwargs, + } + raw = common.robust_getitem( + pixels_getter, params, catch=ee.ee_exception.EEException + ) + + # TODO(#9): Find a way to make this more efficient. This is needed because + # `raw` is a structured array of all the same dtype (i.e. number of images). + arr = np.array(raw.tolist(), dtype=dtype) + data = arr.T + + # Sets EE nodata masked value to NaNs. + data = np.where(data == self.mask_value, np.nan, data) + return data + @functools.lru_cache() def _band_attrs(self, band_name: str) -> types.BandInfo: try: @@ -403,9 +494,17 @@ def get_variables(self) -> utils.Frozen[str, xarray.Variable]: f'ImageCollection due to: {e}.' 
) - x_min_0, y_min_0, x_max_0, y_max_0 = self.bounds - width_coord = np.linspace(x_min_0, x_max_0, v0.shape[1]) - height_coord = np.linspace(y_max_0, y_min_0, v0.shape[2]) + lnglat_img = ee.Image.pixelLonLat() + lon_grid = self.project((0, 0, v0.shape[1], 1)) + lat_grid = self.project((0, 0, 1, v0.shape[2])) + lon = self.image_to_array( + lnglat_img, grid=lon_grid, dtype=np.float32, bandIds=['longitude'] + ) + lat = self.image_to_array( + lnglat_img, grid=lat_grid, dtype=np.float32, bandIds=['latitude'] + ) + width_coord = np.squeeze(lon) + height_coord = np.squeeze(lat) x_dim_name, y_dim_name = self.dimension_names @@ -478,13 +577,6 @@ def geometry_to_bounds(geom: ee.Geometry) -> types.Bounds: return _ee_bounds_to_bounds(bounds) -class _GetComputedPixels: - """Wrapper around `ee.data.computePixels()` to make retries simple.""" - - def __getitem__(self, params) -> np.ndarray: - return ee.data.computePixels(params) - - class EarthEngineBackendArray(backends.BackendArray): """Array backend for Earth Engine.""" @@ -492,11 +584,7 @@ def __init__(self, variable_name: str, ee_store: EarthEngineStore): self.variable_name = variable_name self.store = ee_store - self.scale_x = ee_store.scale_x - self.scale_y = ee_store.scale_y self.scale = ee_store.scale - self.crs_arg = ee_store.crs_arg - self.crs = ee_store.crs self.bounds = ee_store.bounds # It looks like different bands have different dimensions & transforms! 
@@ -506,51 +594,14 @@ def __init__(self, variable_name: str, ee_store: EarthEngineStore): x_min, y_min, x_max, y_max = self.bounds - x_size = int(np.ceil((x_max - x_min) / np.abs(self.scale_x))) - y_size = int(np.ceil((y_max - y_min) / np.abs(self.scale_y))) + x_size = int(np.ceil((x_max - x_min) / np.abs(self.store.scale_x))) + y_size = int(np.ceil((y_max - y_min) / np.abs(self.store.scale_y))) self.shape = (ee_store.n_images, x_size, y_size) self._apparent_chunks = {k: 1 for k in self.store.PREFERRED_CHUNKS.keys()} if isinstance(self.store.chunks, dict): self._apparent_chunks = self.store.chunks.copy() - def _to_array( - self, image: ee.Image, pixels_getter=_GetComputedPixels(), **kwargs - ) -> np.ndarray: - """Gets the pixels for a given image as a numpy array. - - This method includes exponential backoff (with jitter) when trying to get - pixel data. - - Args: - image: An EE image. - pixels_getter: An object whose `__getitem__()` method calls - `computePixels()`. - **kwargs: Additional settings for `params` in `computePixels(params)`. For - example, a `grid` dictionary. - - Returns: - An numpy array containing the pixels computed based on the given image. - """ - image = image.unmask(self.store.mask_value) - params = { - 'expression': image, - 'fileFormat': 'NUMPY_NDARRAY', - **kwargs, - } - raw = common.robust_getitem( - pixels_getter, params, catch=ee.ee_exception.EEException - ) - - # TODO(#9): Find a way to make this more efficient. This is needed because - # `raw` is a structured array of all the same dtype (i.e. number of images). - arr = np.array(raw.tolist(), dtype=self.dtype) - data = arr.T - - # Sets EE nodata masked value to NaNs. 
- data = np.where(data == self.store.mask_value, np.nan, data) - return data - def __getitem__(self, key: indexing.ExplicitIndexer) -> np.typing.ArrayLike: return indexing.explicit_indexing_adapter( key, @@ -559,48 +610,6 @@ def __getitem__(self, key: indexing.ExplicitIndexer) -> np.typing.ArrayLike: self._raw_indexing_method, ) - def _project(self, bbox: types.BBox) -> types.Grid: - """Translate a bounding box (pixel space) to a grid (projection space). - - Here, we calculate a simple affine transformation to get a specific region - when computing pixels. - - Args: - bbox: Bounding box in pixel space. - - Returns: - A Grid, to be passed into `computePixels()`'s "grid" keyword. Defines the - appropriate region of data to return according to the Array's configured - projection and scale. - """ - # The origin of the image is in the top left corner. X is the minimum value - # and Y is the maximum value. - x_origin, _, _, y_origin = self.bounds # x_min, x_max, y_min, y_max - x_start, y_start, x_end, y_end = bbox - width = x_end - x_start - height = y_end - y_start - - return { - # The size of the bounding box. The affine transform and project will be - # applied, so we can think in terms of pixels. - 'dimensions': { - 'width': width, - 'height': height, - }, - 'affineTransform': { - # Since the origin is in the top left corner, we want to translate - # the start of the grid to the positive direction for X and the - # negative direction for Y. - 'translateX': x_origin + self.scale_x * x_start, - 'translateY': y_origin + self.scale_y * y_start, - # Define the scale for each pixel (e.g. the number of meters between - # each value). - 'scaleX': self.scale_x, - 'scaleY': self.scale_y, - }, - 'crsCode': self.crs_arg, - } - def _key_to_slices( self, key: tuple[Union[int, slice], ...] ) -> tuple[tuple[slice, ...], tuple[int, ...]]: @@ -676,7 +685,9 @@ def _raw_indexing_method( # User does not want to use any chunks... 
if self.store.chunks == -1: target_image = self._slice_collection(key[0]) - out = self._to_array(target_image, grid=self._project(bbox)) + out = self.store.image_to_array( + target_image, grid=self.store.project(bbox), dtype=self.dtype + ) if squeeze_axes: out = np.squeeze(out, squeeze_axes) @@ -728,9 +739,8 @@ def _make_tile( """Get a numpy array from EE for a specific 3D bounding box (a 'tile').""" tile_idx, (istart, iend, *bbox) = tile_index target_image = self._slice_collection(slice(istart, iend)) - return tile_idx, self._to_array( - target_image, grid=self._project(tuple(bbox)) - ) + return tile_idx, self.store.image_to_array( + target_image, grid=self.store.project(tuple(bbox)), dtype=self.dtype) def _tile_indexes( self, index_range: slice, bbox: types.BBox diff --git a/xee/ext_integration_test.py b/xee/ext_integration_test.py index 1f539c4..40a5b03 100644 --- a/xee/ext_integration_test.py +++ b/xee/ext_integration_test.py @@ -216,10 +216,13 @@ def __getitem__(self, params): return ee.data.computePixels(params) arr = xee.EarthEngineBackendArray('B5', self.store) - grid = arr._project((0, 10, 0, 10)) + grid = self.store.project((0, 10, 0, 10)) getter = ErroneousPixelsGetter() - arr._to_array( - self.store.image_collection.first(), pixels_getter=getter, grid=grid + self.store.image_to_array( + self.store.image_collection.first(), + pixels_getter=getter, + grid=grid, + dtype=arr.dtype, ) self.assertEqual(getter.count, 3) From 03eb6be3e8eb939b192a3d2f33e3de9f37f975b4 Mon Sep 17 00:00:00 2001 From: Xee authors Date: Tue, 10 Oct 2023 11:42:11 -0700 Subject: [PATCH 06/15] Adding PyPi release automation. Stealing the publish.yml Github Action from google/weather-tools. Here, releases will to PyPi will be automatic and tested. 
PiperOrigin-RevId: 572320497 --- .github/workflows/publish.yml | 101 ++++++++++++++++++++++++++++++++++ setup.py | 23 +++++++- 2 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/publish.yml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..172e06d --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,101 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +name: Publish to PyPi + +on: + release: + types: [published] + +jobs: + build-artifacts: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2.3.1 + with: + python-version: 3.9 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install setuptools setuptools-scm wheel twine check-manifest + + - name: Build tarball and wheels + run: | + git clean -xdf + git restore -SW . + python -m build --sdist --wheel . 
+ - name: Check built artifacts + run: | + python -m twine check dist/* + pwd + if [ -f dist/xee-0.0.0.tar.gz ]; then + echo "❌ INVALID VERSION NUMBER" + exit 1 + else + echo "✅ Looks good" + fi + - uses: actions/upload-artifact@v2 + with: + name: releases + path: dist + + test-built-dist: + needs: build-artifacts + runs-on: ubuntu-latest + steps: + - uses: actions/setup-python@v2.3.1 + name: Install Python + with: + python-version: 3.9 + - uses: actions/download-artifact@v2 + with: + name: releases + path: dist + - name: List contents of built dist + run: | + ls -ltrh + ls -ltrh dist + - name: Publish package to TestPyPI + if: github.event_name == 'push' + uses: pypa/gh-action-pypi-publish@v1.4.2 + with: + user: __token__ + password: ${{ secrets.TESTPYPI_TOKEN }} + repository_url: https://test.pypi.org/legacy/ + verbose: true + + - name: Check uploaded package + if: github.event_name == 'push' + run: | + sleep 3 + python -m pip install --upgrade pip + python -m pip install --extra-index-url https://test.pypi.org/simple --upgrade xee + python -c "import xee; print(xee.__version__)" + upload-to-pypi: + needs: test-built-dist + if: github.event_name == 'release' + runs-on: ubuntu-latest + steps: + - uses: actions/download-artifact@v2 + with: + name: releases + path: dist + - name: Publish package to PyPI + uses: pypa/gh-action-pypi-publish@v1.4.2 + with: + user: __token__ + password: ${{ secrets.PYPI_TOKEN }} + verbose: true \ No newline at end of file diff --git a/setup.py b/setup.py index aedd555..1a193d4 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,9 @@ license='Apache 2.0', author='Google LLC', author_email='noreply@google.com', + description='A Google Earth Engine extension for Xarray.', + long_description=open('README.md', 'r', encoding='utf-8').read(), + long_description_content_type='text/markdown', install_requires=['xarray', 'earthengine-api>=0.1.374', 'pyproj', 'affine'], extras_require={ 'tests': tests_requires, @@ -53,5 +56,23 @@ 
python_requires='>=3.9', entry_points={ 'xarray.backends': ['ee=xee:EarthEngineBackendEntrypoint'], - } + }, + classifiers=[ + 'Development Status :: 4 - Beta', + 'Intended Audience :: Science/Research', + 'Intended Audience :: Developers', + 'Intended Audience :: Information Technology', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: MacOS :: MacOS X', + 'Operating System :: Microsoft :: Windows', + 'Operating System :: POSIX', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Topic :: Scientific/Engineering :: Atmospheric Science', + + ], + project_urls={ + 'Issue Tracking': 'https://github.com/google/Xee/issues', + }, ) From 2d6e9fcf35e0fac5b767e9b031aafe0de4c3e03f Mon Sep 17 00:00:00 2001 From: Xee authors Date: Tue, 10 Oct 2023 13:18:56 -0700 Subject: [PATCH 07/15] Remove weather-tools-specific check. We want to publish 0.0.0 :) PiperOrigin-RevId: 572347201 --- .github/workflows/publish.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 172e06d..a5e9a4c 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -41,12 +41,6 @@ jobs: run: | python -m twine check dist/* pwd - if [ -f dist/xee-0.0.0.tar.gz ]; then - echo "❌ INVALID VERSION NUMBER" - exit 1 - else - echo "✅ Looks good" - fi - uses: actions/upload-artifact@v2 with: name: releases From cfcd2334ad3be8094dbd234879385bb16222b8f1 Mon Sep 17 00:00:00 2001 From: Qiusheng Wu Date: Tue, 10 Oct 2023 13:44:19 -0700 Subject: [PATCH 08/15] Fix logo on PyPI and add badge --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4ed69cc..48fee14 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # Xee: Xarray + Google Earth Engine -![Xee Logo](docs/xee-logo.png) 
+[![image](https://img.shields.io/pypi/v/xee.svg)](https://pypi.python.org/pypi/xee) + +![Xee Logo](https://raw.githubusercontent.com/google/Xee/main/docs/xee-logo.png) _An Xarray extension for Google Earth Engine._ @@ -84,4 +86,4 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -``` \ No newline at end of file +``` From 085de84b860300b5b1f5dd8f7ce7e9faf9f4befc Mon Sep 17 00:00:00 2001 From: Xee authors Date: Tue, 10 Oct 2023 14:08:00 -0700 Subject: [PATCH 09/15] `pip install xee` now supported. This PR updates our documentation to instruct users to install `Xee` from PyPI. I've also added an extra step in the README walkthrough (imports). Fixes #27. PiperOrigin-RevId: 572361717 --- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 48fee14..7e643a5 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,10 @@ _An Xarray extension for Google Earth Engine._ ## How to use -Install with pip (distributions on PyPi will come soon): +Install with pip: ```shell -pip install git+https://github.com/google/xee.git +pip install --upgrade xee ``` Then, authenticate Earth Engine: @@ -20,6 +20,13 @@ Then, authenticate Earth Engine: earthengine authenticate --quiet ``` +Now, in your Python environment, make the following imports: + +```python +import ee +import xarray +``` + Next, initialize the EE client with the high volume API: ```python From 77acef8eaf9daae883bf9cb4186f99b727d0264a Mon Sep 17 00:00:00 2001 From: Xee authors Date: Tue, 10 Oct 2023 15:44:23 -0700 Subject: [PATCH 10/15] Automatically calculate optimal IO Chunks. By default, users should be able to get all 48 MBs worth of data in each request to EE. 
In this change, we change the default `io_chunk` behavior to make an educated guess such that users get as many bytes as possible under the request byte limit. Fixes #43. PiperOrigin-RevId: 572387933 --- xee/ext.py | 67 ++++++++++++++++++++++++++++++++++++-- xee/ext_test.py | 85 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 3 deletions(-) create mode 100644 xee/ext_test.py diff --git a/xee/ext.py b/xee/ext.py index 20df379..ab22e5b 100644 --- a/xee/ext.py +++ b/xee/ext.py @@ -64,6 +64,24 @@ 'double': np.float64, } +# While this documentation says that the limit is 10 MB... +# https://developers.google.com/earth-engine/guides/usage#request_payload_size +# actual byte limit seems to depend on other factors. This has been found via +# trial & error. +REQUEST_BYTE_LIMIT = 2**20 * 48 # 48 MBs + + +def _check_request_limit(chunks: dict[str, int], dtype_size: int, limit: int): + """Checks that the actual number of bytes exceeds the limit.""" + index, width, height = chunks['index'], chunks['width'], chunks['height'] + actual_bytes = index * width * height * dtype_size + if actual_bytes > limit: + raise ValueError( + f'`chunks="auto"` failed! Actual bytes {actual_bytes!r} exceeds limit' + f' {limit!r}. Please choose another value for `chunks` (and file a' + ' bug).' 
+ ) + class _GetComputedPixels: """Wrapper around `ee.data.computePixels()` to make retries simple.""" @@ -121,6 +139,7 @@ def open( primary_dim_name: Optional[str] = None, primary_dim_property: Optional[str] = None, mask_value: Optional[float] = None, + request_byte_limit: int = REQUEST_BYTE_LIMIT, ) -> 'EarthEngineStore': if mode != 'r': raise ValueError( @@ -138,6 +157,7 @@ def open( primary_dim_name=primary_dim_name, primary_dim_property=primary_dim_property, mask_value=mask_value, + request_byte_limit=request_byte_limit, ) def __init__( @@ -152,6 +172,7 @@ def __init__( primary_dim_name: Optional[str] = None, primary_dim_property: Optional[str] = None, mask_value: Optional[float] = None, + request_byte_limit: int = REQUEST_BYTE_LIMIT, ): self.image_collection = image_collection if n_images != -1: @@ -220,10 +241,13 @@ def __init__( x_max, y_max = self.transform(x_max_0, y_max_0) self.bounds = x_min, y_min, x_max, y_max - self.chunks = self.PREFERRED_CHUNKS.copy() + max_dtype = self._max_itemsize() + + # TODO(b/291851322): Consider support for laziness when chunks=None. + # By default, automatically optimize io_chunks. + self.chunks = self._auto_chunks(max_dtype, request_byte_limit) if chunks == -1: self.chunks = -1 - # TODO(b/291851322): Consider support for laziness when chunks=None. 
elif chunks is not None and chunks != 'auto': self.chunks = self._assign_index_chunks(chunks) @@ -282,6 +306,38 @@ def image_ids(self) -> list[str]: image_ids, _ = self.image_collection_properties return image_ids + def _max_itemsize(self) -> int: + return max( + _parse_dtype(b['data_type']).itemsize for b in self._img_info['bands'] + ) + + @classmethod + def _auto_chunks( + cls, dtype_bytes: int, request_byte_limit: int = REQUEST_BYTE_LIMIT + ) -> dict[str, int]: + """Given the data type size and request limit, calculate optimal chunks.""" + # Taking the data type number of bytes into account, let's try to have the + # height and width follow round numbers (powers of two) and allocate the + # remaining bytes available for the index length. To illustrate this logic, + # let's follow through with an example where: + # request_byte_limit = 2 ** 20 * 10 # = 10 MBs + # dtype_bytes = 8 + log_total = np.log2(request_byte_limit) # e.g.=23.32... + log_dtype = np.log2(dtype_bytes) # e.g.=3 + log_limit = 10 * (log_total // 10) # e.g.=20 + log_index = log_total - log_limit # e.g.=3.32... + + # Motivation: How do we divide a number N into the closest sum of two ints? + d = (log_limit - np.ceil(log_dtype)) / 2 # e.g.=17/2=8.5 + wd, ht = np.ceil(d), np.floor(d) # e.g. wd=9, ht=8 + + # Put back to byte space, then round to the nearst integer number of bytes. + index = int(np.rint(2**log_index)) # e.g.=10 + width = int(np.rint(2**wd)) # e.g.=512 + height = int(np.rint(2**ht)) # e.g.=256 + + return {'index': index, 'width': width, 'height': height} + def _assign_index_chunks( self, input_chunk_store: dict[Any, Any] ) -> dict[Any, Any]: @@ -808,6 +864,7 @@ def open_dataset( primary_dim_name: Optional[str] = None, primary_dim_property: Optional[str] = None, ee_mask_value: Optional[float] = None, + request_byte_limit: int = REQUEST_BYTE_LIMIT, ) -> xarray.Dataset: """Open an Earth Engine ImageCollection as an Xarray Dataset. 
@@ -816,7 +873,8 @@ def open_dataset( ee.ImageCollection object. drop_variables (optional): Variables or bands to drop before opening. io_chunks (optional): Specifies the chunking strategy for loading data - from EE. + from EE. By default, this automatically calculates optimal chunks based + on the `request_byte_limit`. n_images (optional): The max number of EE images in the collection to open. Useful when there are a large number of images in the collection since calculating collection size can be slow. -1 indicates that all @@ -869,6 +927,8 @@ def open_dataset( 'system:time_start'. ee_mask_value (optional): Value to mask to EE nodata values. By default, this is 'np.iinfo(np.int32).max' i.e. 2147483647. + request_byte_limit: the max allowed bytes to request at a time from Earth + Engine. By default, it is 48MBs. Returns: An xarray.Dataset that streams in remote data from Earth Engine. @@ -895,6 +955,7 @@ def open_dataset( primary_dim_name=primary_dim_name, primary_dim_property=primary_dim_property, mask_value=ee_mask_value, + request_byte_limit=request_byte_limit, ) store_entrypoint = backends_store.StoreBackendEntrypoint() diff --git a/xee/ext_test.py b/xee/ext_test.py new file mode 100644 index 0000000..74b47f4 --- /dev/null +++ b/xee/ext_test.py @@ -0,0 +1,85 @@ +"""Xee Unit Tests.""" + +from absl.testing import absltest +from absl.testing import parameterized + +import numpy as np +import xee + +from xee import ext + + +class EEStoreStandardDatatypesTest(parameterized.TestCase): + + @parameterized.named_parameters( + dict( + testcase_name='int8', + dtype=np.dtype('int8'), + expected_chunks={'index': 48, 'width': 1024, 'height': 1024}, + ), + dict( + testcase_name='int32', + dtype=np.dtype('int32'), + expected_chunks={'index': 48, 'width': 512, 'height': 512}, + ), + dict( + testcase_name='int64', + dtype=np.dtype('int64'), + expected_chunks={'index': 48, 'width': 512, 'height': 256}, + ), + dict( + testcase_name='float32', + dtype=np.dtype('float32'), + 
expected_chunks={'index': 48, 'width': 512, 'height': 512}, + ), + dict( + testcase_name='float64', + dtype=np.dtype('float64'), + expected_chunks={'index': 48, 'width': 512, 'height': 256}, + ), + dict( + testcase_name='complex64', + dtype=np.dtype('complex64'), + expected_chunks={'index': 48, 'width': 512, 'height': 256}, + ), + ) + def test_auto_chunks__handles_standard_dtypes(self, dtype, expected_chunks): + self.assertEqual( + xee.EarthEngineStore._auto_chunks(dtype.itemsize), + expected_chunks, + '%r fails.' % dtype, + ) + + +class EEStoreTest(absltest.TestCase): + + def test_auto_chunks__handles_range_of_dtype_sizes(self): + dt = 0 + try: + for dt in range(1, 1024): + _ = xee.EarthEngineStore._auto_chunks(dt) + except ValueError: + self.fail(f'Could not handle data type size {dt}.') + + def test_auto_chunks__is_optimal_for_powers_of_two(self): + for p in range(10): + dt = 2**p + chunks = xee.EarthEngineStore._auto_chunks(dt) + self.assertEqual( + xee.REQUEST_BYTE_LIMIT, np.prod(list(chunks.values())) * dt + ) + + def test_exceeding_byte_limit__raises_error(self): + dtype_size = 8 + # does not fail + chunks = {'index': 48, 'width': 512, 'height': 256} + ext._check_request_limit(chunks, dtype_size, xee.REQUEST_BYTE_LIMIT) + + # fails + chunks = {'index': 1024, 'width': 1024, 'height': 1024} + with self.assertRaises(ValueError): + ext._check_request_limit(chunks, dtype_size, xee.REQUEST_BYTE_LIMIT) + + +if __name__ == '__main__': + absltest.main() From c023cbbf4bdde444bf45351f251be6a2288d3600 Mon Sep 17 00:00:00 2001 From: Qiusheng Wu Date: Tue, 10 Oct 2023 18:01:20 -0700 Subject: [PATCH 11/15] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7e643a5..b280a7f 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ ds = xarray.open_dataset(ic, engine='ee', crs='EPSG:4326', scale=0.25) Open an ImageCollection with a specific EE projection or geometry: ```python -ic = 
ee.ImageCollection('ee://ECMWF/ERA5_LAND/HOURLY').filterDate('1992-10-05', '1993-03-31') +ic = ee.ImageCollection('ECMWF/ERA5_LAND/HOURLY').filterDate('1992-10-05', '1993-03-31') leg1 = ee.Geometry.Rectangle(113.33, -43.63, 153.56, -10.66) ds = xarray.open_dataset( ic, From 723c831647eebcbe1ce9ff30febb2b1fcc3dfe77 Mon Sep 17 00:00:00 2001 From: Qiusheng Wu Date: Tue, 10 Oct 2023 21:48:10 -0700 Subject: [PATCH 12/15] Add conda-forge package --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 7e643a5..23c07fa 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,10 @@ # Xee: Xarray + Google Earth Engine [![image](https://img.shields.io/pypi/v/xee.svg)](https://pypi.python.org/pypi/xee) +[![image](https://static.pepy.tech/badge/xee)](https://pepy.tech/project/xee) +[![Conda Recipe](https://img.shields.io/badge/recipe-xee-green.svg)](https://github.com/conda-forge/xee-feedstock) +[![image](https://img.shields.io/conda/vn/conda-forge/xee.svg)](https://anaconda.org/conda-forge/xee) +[![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/xee.svg)](https://anaconda.org/conda-forge/xee) ![Xee Logo](https://raw.githubusercontent.com/google/Xee/main/docs/xee-logo.png) @@ -14,6 +18,12 @@ Install with pip: pip install --upgrade xee ``` +Install with conda: + +```shell +conda install -c conda-forge xee +``` + Then, authenticate Earth Engine: ```shell From 955fced3425e49025bdd25b506201b2041a60865 Mon Sep 17 00:00:00 2001 From: Qiusheng Wu Date: Tue, 10 Oct 2023 22:09:12 -0700 Subject: [PATCH 13/15] Move badges --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 23c07fa..c8343da 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,15 @@ # Xee: Xarray + Google Earth Engine +![Xee Logo](https://raw.githubusercontent.com/google/Xee/main/docs/xee-logo.png) + +_An Xarray extension for Google Earth Engine._ + 
[![image](https://img.shields.io/pypi/v/xee.svg)](https://pypi.python.org/pypi/xee) [![image](https://static.pepy.tech/badge/xee)](https://pepy.tech/project/xee) [![Conda Recipe](https://img.shields.io/badge/recipe-xee-green.svg)](https://github.com/conda-forge/xee-feedstock) [![image](https://img.shields.io/conda/vn/conda-forge/xee.svg)](https://anaconda.org/conda-forge/xee) [![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/xee.svg)](https://anaconda.org/conda-forge/xee) -![Xee Logo](https://raw.githubusercontent.com/google/Xee/main/docs/xee-logo.png) - -_An Xarray extension for Google Earth Engine._ - ## How to use Install with pip: From 9c55df6976b2d756f30069a8be9a7f904baa1257 Mon Sep 17 00:00:00 2001 From: Xee authors Date: Wed, 11 Oct 2023 11:07:50 -0700 Subject: [PATCH 14/15] Fix: Now using default ranges. Fixes #71. I've turned off range adjustments as well as the constants in the integration tests. PiperOrigin-RevId: 572627965 --- xee/ext.py | 25 +++---------------------- xee/ext_integration_test.py | 14 +++++++------- 2 files changed, 10 insertions(+), 29 deletions(-) diff --git a/xee/ext.py b/xee/ext.py index ab22e5b..f96c365 100644 --- a/xee/ext.py +++ b/xee/ext.py @@ -231,14 +231,9 @@ def __init__( x_min_0, y_min_0, x_max_0, y_max_0 = _ee_bounds_to_bounds( self.get_info['bounds'] ) - # We add and subtract the scale to solve an off-by-one error. With this - # adjustment, we achieve parity with a pure `computePixels()` call. - x_min, y_min = self.transform(x_min_0 - self.scale_x, y_min_0) - if _bounds_are_invalid(x_min, y_min, self.scale_units == 'degree'): - x_min, y_min = self.transform(x_min_0, y_min_0) - x_max, y_max = self.transform(x_max_0, y_max_0 + self.scale_y) - if _bounds_are_invalid(x_max, y_max, self.scale_units == 'degree'): - x_max, y_max = self.transform(x_max_0, y_max_0) + # TODO(#40): Investigate data discrepancy (off-by-one) issue. 
+ x_min, y_min = self.transform(x_min_0, y_min_0) + x_max, y_max = self.transform(x_max_0, y_max_0) self.bounds = x_min, y_min, x_max, y_max max_dtype = self._max_itemsize() @@ -579,20 +574,6 @@ def close(self) -> None: del self.image_collection -def _bounds_are_invalid(x: float, y: float, is_degrees=False) -> bool: - """Check for obviously bad x and y projection values.""" - bad_num = math.isnan(x) or math.isnan(y) or math.isinf(x) or math.isinf(y) - - invalid_degree = ( - y < -90.0 - or y > 90.0 - or x < -180.0 - or x > 360.0 # degrees could be from 0 to 360... - ) - - return bad_num or (is_degrees and invalid_degree) - - def _parse_dtype(data_type: types.DataType): """Parse a np.dtype from the 'data_type' section of ee.Image.getInfo(). diff --git a/xee/ext_integration_test.py b/xee/ext_integration_test.py index 40a5b03..b44f0d5 100644 --- a/xee/ext_integration_test.py +++ b/xee/ext_integration_test.py @@ -58,14 +58,14 @@ def setUp(self): def test_creates_lat_long_array(self): arr = xee.EarthEngineBackendArray('longitude', self.lnglat_store) - self.assertEqual((1, 360, 179), arr.shape) + self.assertEqual((1, 360, 180), arr.shape) def test_can_create_object(self): arr = xee.EarthEngineBackendArray('B4', self.store) self.assertIsNotNone(arr) - self.assertEqual((64, 360, 179), arr.shape) + self.assertEqual((64, 360, 180), arr.shape) self.assertEqual(np.int32, arr.dtype) self.assertEqual('B4', arr.variable_name) @@ -261,7 +261,7 @@ def test_open_dataset__sanity_check(self): n_images=3, ) self.assertEqual( - dict(ds.dims), {'time': 3, 'lon': 15, 'lat': 7} + dict(ds.dims), {'time': 3, 'lon': 15, 'lat': 8} ) self.assertNotEmpty(dict(ds.coords)) self.assertEqual( @@ -271,7 +271,7 @@ def test_open_dataset__sanity_check(self): for v in ds.values(): self.assertIsNotNone(v.data) self.assertFalse(v.isnull().all(), 'All values are null!') - self.assertEqual(v.shape, (3, 15, 7)) + self.assertEqual(v.shape, (3, 15, 8)) def test_open_dataset__n_images(self): ds = 
self.entry.open_dataset( @@ -311,7 +311,7 @@ def test_honors_geometry(self): engine=xee.EarthEngineBackendEntrypoint, ) - self.assertEqual(ds.dims, {'time': 4248, 'lon': 42, 'lat': 34}) + self.assertEqual(ds.dims, {'time': 4248, 'lon': 41, 'lat': 35}) self.assertNotEqual(ds.dims, standard_ds.dims) def test_honors_projection(self): @@ -328,7 +328,7 @@ def test_honors_projection(self): engine=xee.EarthEngineBackendEntrypoint, ) - self.assertEqual(ds.dims, {'time': 4248, 'lon': 3600, 'lat': 1799}) + self.assertEqual(ds.dims, {'time': 4248, 'lon': 3600, 'lat': 1800}) self.assertNotEqual(ds.dims, standard_ds.dims) def test_parses_ee_url(self): @@ -345,7 +345,7 @@ def test_parses_ee_url(self): scale=25.0, # in degrees n_images=3, ) - self.assertEqual(dict(ds.dims), {'time': 3, 'lon': 15, 'lat': 7}) + self.assertEqual(dict(ds.dims), {'time': 3, 'lon': 15, 'lat': 8}) def test_data_sanity_check(self): # This simple test uncovered a bug with the default definition of `scale`. From 6a45ce8aec024a896b32b355705193dad5431d4c Mon Sep 17 00:00:00 2001 From: Xee authors Date: Wed, 11 Oct 2023 13:07:32 -0700 Subject: [PATCH 15/15] Incrementing package version. In the future, we should consider using scm version. PiperOrigin-RevId: 572663830 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1a193d4..59ab72f 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setuptools.setup( name='xee', - version='0.0.0', + version='0.0.1', license='Apache 2.0', author='Google LLC', author_email='noreply@google.com',