From 2bee900deac9f03e7099adc2597620e6a38e105c Mon Sep 17 00:00:00 2001 From: Xee authors Date: Fri, 6 Oct 2023 01:16:58 +0000 Subject: [PATCH] Starting Sphinx docs for Xee. Adding sphinx boilerplate (taken from Xarray-Beam). I've added an index page and a "Why Xee" section. This PR also includes an outline of what the remaining documentation will look like. We include other project clean-up work here, too. PiperOrigin-RevId: 571189219 --- README.md | 6 ++-- docs/Makefile | 20 +++++++++++ docs/README.md | 1 + docs/api.md | 25 +++++++++++++ docs/conf.py | 83 ++++++++++++++++++++++++++++++++++++++++++ docs/index.md | 34 ++++++++++++++++++ docs/make.bat | 35 ++++++++++++++++++ docs/requirements.txt | 10 ++++++ docs/why-xee.md | 84 +++++++++++++++++++++++++++++++++++++++++++ xee/__init__.py | 23 +----------- 10 files changed, 296 insertions(+), 25 deletions(-) create mode 100644 docs/Makefile create mode 120000 docs/README.md create mode 100644 docs/api.md create mode 100644 docs/conf.py create mode 100644 docs/index.md create mode 100644 docs/make.bat create mode 100644 docs/requirements.txt create mode 100644 docs/why-xee.md diff --git a/README.md b/README.md index 4c0bce1..4ed69cc 100644 --- a/README.md +++ b/README.md @@ -2,14 +2,14 @@ ![Xee Logo](docs/xee-logo.png) -_A Google Earth Engine backend for Xarray | An Xarray Client for Google Earth Engine._ +_An Xarray extension for Google Earth Engine._ ## How to use Install with pip (distributions on PyPi will come soon): ```shell -pip install git+https://github.com/googlestaging/xee.git +pip install git+https://github.com/google/xee.git ``` Then, authenticate Earth Engine: @@ -64,7 +64,7 @@ ds = xarray.open_mfdataset(['ee://ECMWF/ERA5_LAND/HOURLY', 'ee://NASA/GDDP-CMIP6 engine='ee', crs='EPSG:4326', scale=0.25) ``` -See [examples/](examples/) for more uses and integrations. +See [examples/](examples/) or [docs](docs/) for more uses and integrations. 
## License diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md new file mode 120000 index 0000000..dd0ea36 --- /dev/null +++ b/docs/README.md @@ -0,0 +1 @@ +index.md \ No newline at end of file diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 0000000..650f8c2 --- /dev/null +++ b/docs/api.md @@ -0,0 +1,25 @@ +# API docs + +```{eval-rst} +.. currentmodule:: xee +``` + +## Core extension + +```{eval-rst} +.. autosummary:: + :toctree: _autosummary + + EarthEngineBackendEntrypoint + EarthEngineStore + EarthEngineBackendArray +``` + +## Utility functions + +```{eval-rst} +.. autosummary:: + :toctree: _autosummary + + geometry_to_bounds +``` \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..e8b1df8 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,83 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. 
For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + +# Print Python environment info for easier debugging on ReadTheDocs + +import sys +import subprocess +import xee # verify this works + +print('python exec:', sys.executable) +print('sys.path:', sys.path) +print('pip environment:') +subprocess.run([sys.executable, '-m', 'pip', 'list']) # pylint: disable=subprocess-run-check + +print(f'xee: {xee.__version__}, {xee.__file__}') + +# -- Project information ----------------------------------------------------- + +project = 'Xee' +copyright = '2023, Google LLC' # pylint: disable=redefined-builtin +author = 'The Xee authors' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.napoleon', + 'myst_nb', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ['_build', '_templates', 'Thumbs.db', '.DS_Store'] + +intersphinx_mapping = { + 'xarray': ('https://xarray.pydata.org/en/latest/', None), +} + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# -- Extension config + +autosummary_generate = True + +# https://myst-nb.readthedocs.io/en/latest/use/execute.html +jupyter_execute_notebooks = 'cache' +# https://myst-nb.readthedocs.io/en/latest/use/formatting_outputs.html#removing-stdout-and-stderr +nb_output_stderr = 'remove-warn' + +# https://stackoverflow.com/a/66295922/809705 +autodoc_typehints = 'description' diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..548ef8c --- /dev/null +++ b/docs/index.md @@ -0,0 +1,34 @@ +# Xee: A Google Earth Engine extension for Xarray + +Xee is an Xarray extension for Google Earth Engine. It aims to help users view +Earth Engine's [data catalog](https://developers.google.com/earth-engine/datasets) +through the lens of arrays. + +In this documentation, we assume readers have some familiarity with +[Earth Engine](https://earthengine.google.com/), [Xarray](https://xarray.dev/), +and Python. Here, we'll dive into core concepts related to the integration +between these tools.
+ +## Contents + + + +```{toctree} +:maxdepth: 1 +why-xee.md +api.md +``` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..2119f51 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..ca28d1d --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,10 @@ +# doc requirements +Jinja2==3.1.2 +myst-nb==0.17.2 +myst-parser==0.18.1 +sphinx_rtd_theme==1.2.1 +sphinx==5.3.0 +scipy==1.10.1 + +# xee requirements +xee[examples] @ git+https://github.com/google/xee.git \ No newline at end of file diff --git a/docs/why-xee.md b/docs/why-xee.md new file mode 100644 index 0000000..36ce87e --- /dev/null +++ b/docs/why-xee.md @@ -0,0 +1,84 @@ +# Why Xee? + +We noticed two clusters of users working with climate and weather data at +Google Research: Some were [Xarray](https://xarray.dev) (and +[Zarr](https://zarr.dev/)) centric and others, Google Earth Engine centric. Xee +came about as an effort to bring these two groups of developers closer together. 
+ +## Goals + +Primary Goals: + +- Make [EE-curated data](https://developers.google.com/earth-engine/datasets) + accessible to users in the Xarray community and to the wider scientific Python + ecosystem. +- Make it trivial to avoid quota limits when computing pixels from Earth Engine. +- Provide an easy way for scientists and ML practitioners to coalesce Earth data + at different scales into a common resolution. + +Secondary Goals: + +- Provide a succinct interface for querying Earth Engine data at scale (i.e. via + [Xarray-Beam](https://xarray-beam.readthedocs.io/)). +- Make it trivial to quickly [export Earth Engine data to Zarr](https://github.com/google/xee/tree/main/examples#export-earth-engine-imagecollections-to-zarr-with-xarray-beam). +- Provide a compelling alternative to the need to export Zarr in the first + place (e.g. during the ML training process). + +## Approach + +With the addition of Earth Engine's [Pixel API](https://medium.com/google-earth/pixels-to-the-people-2d3c14a46da6), +it became possible to easily get NumPy array data from `ee.Image`s. In building +tools on top of this, we noticed that the best practices for managing data were +Xarray-shaped. For example: + +- Our codebases involved many similar LOC to translate between Earth Engine and + arrays: Users typically thought in NumPy and molded EE's Python client to fit + those idioms. +- We often needed to page `computePixels()` requests in a way that's strikingly + similar to Dask/Xarray's concept of [`chunks`](https://docs.xarray.dev/en/stable/user-guide/dask.html#what-is-a-dask-array). +- Users were wrapping NumPy arrays within dataclasses to associate metadata and + labels with data. + +In an attempt to group these disparate solutions into a single interface, we +experimented with wrapping `computePixels()` into +[Xarray's standard mechanism for defining backends](https://docs.xarray.dev/en/stable/internals/how-to-add-new-backend.html). The result of this effort is Xee.
+ + +## An array by any other name? (Xee vs Zarr) + +[Zarr](https://zarr.dev/) has been growing in relevance to the world of [cloud-based scientific data](https://doi.org/10.1109/MCSE.2021.3059437). +Members of the open source community have [demonstrated](https://www.youtube.com/watch?v=0bqpxX3Nn_A) +that Zarr is more of a data protocol than a data format. In many ways, +Xee is inspired by this work. To this end, we'd like to point out some +similarities and differences between Zarr-backed and Earth Engine-backed data in +Xarray. + +Similarities: +- **Xarray-compatible**: Of course, this library proves that both types of data + stores can be compatible with Xarray. [Zarr](https://docs.xarray.dev/en/stable/user-guide/io.html#zarr) + reading and writing is deeply integrated into Xarray as well. +- **Optimal IO Chunks**: Ultimately, cloud-based data stores will inherently + involve networking overhead. There are similarities in the best way to page + data across a network into a local context: the optimal Zarr chunk + size is around [10-100 MB](https://esipfed.github.io/cloud-computing-cluster/optimization-practices.html#chunk-size). With Earth Engine's backend, the maximum chunk size possible + is 48 MB. + +Differences: +- **Quota vs No Quota**: Since Earth Engine is API-based, there are quota + restrictions that limit IO, namely a 100 QPS limit on data requests. Readers + all need to be authenticated and tied to a GCP project quota. Zarr, on the + other hand, has a lower-level access pattern. Reading is delegated to basic + permissions on cloud buckets. +- **On the fly vs up-front data shaping**: In Zarr, the representation of data + at rest fundamentally influences performance at query time. For this reason, + [rechunking](https://xarray-beam.readthedocs.io/en/latest/rechunking.html) and + projecting is a common routine performed up front on Zarr when data does not + quite fit the problem at hand.
Earth Engine provides a more flexible interface + than this. Since datasets are pyramided (either at [ingestion](https://developers.google.com/earth-engine/help_collection_criteria) or server-side), users are free to request the + resolution and projection of the data during dataset open. Similarly, while + Earth Engine's internal dataset does fit an internal chunking scheme, chunking + schemes are a lot more fungible. + +We hope that this comparison provides the user with a set of useful precedents +for working with cloud-based datasets. + diff --git a/xee/__init__.py b/xee/__init__.py index a895f56..83f21cd 100644 --- a/xee/__init__.py +++ b/xee/__init__.py @@ -12,26 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""A Google Earth Engine backend for Xarray. - -Warning: Experimental! Use at your own risk. - -Supported today (2023-06-13): -- Pixel Space Chunking: Will split up large pixel requests into smaller chunks - to get around EE `computePixels()` byte limits -- User-defined projections: Users can specify a CRS and Scale when opening a - dataset. All bands should appear in a specific projection space. -- Index Chunking: Users can open all images in the collection (metadata lookup - may be slow) or specify `n_images` to open at at time. - -Needs to be done: -- Full Xarray API support: There are features like cf encoding that are standard - in Xarray that have been put off till later. -- Performance Tuning (with micro benchmarks): We need to methodically optimize - the numpy, EE client, and parallelism bits of this client. -- Robustness testing: Again, this is experimental. Beware of sharp edges! - -Contributions are welcome! Before committing your change, please check if there -is an existing Github issue. -""" +"""A Google Earth Engine extension for Xarray.""" from .ext import *