
Commit

Merge branch 'main' into kyle/stac-geoarrow
kylebarron committed Apr 15, 2024
2 parents bc40430 + b399d23 commit dd2c6a8
Showing 9 changed files with 474 additions and 165 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -1,3 +1,5 @@
__pycache__
.venv
dist
dist
.direnv
stac_geoparquet/_version.py
10 changes: 8 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[build-system]
requires = ["flit_core >=3.2,<4"]
build-backend = "flit_core.buildapi"
requires = ["hatchling>=1.22.2", "hatch-vcs>=0.3.0"]
build-backend = "hatchling.build"

[project]
name = "stac_geoparquet"
@@ -20,6 +20,12 @@ dependencies = [
"shapely",
]

[tool.hatch.version]
source = "vcs"

[tool.hatch.build.hooks.vcs]
version-file = "stac_geoparquet/_version.py"

[project.optional-dependencies]
pgstac = [
"fsspec",
107 changes: 107 additions & 0 deletions spec/stac-geoparquet-spec.md
@@ -0,0 +1,107 @@
# STAC GeoParquet Specification

## Overview

This document specifies how to map a set of [STAC Items](https://github.com/radiantearth/stac-spec/tree/v1.0.0/item-spec) into
[GeoParquet](https://geoparquet.org). It is directly inspired by the [STAC GeoParquet](https://github.com/stac-utils/stac-geoparquet)
library, but aims to provide guidance for anyone putting STAC data into GeoParquet.

## Use cases

* Provide a STAC GeoParquet file that mirrors a static Collection, so the whole dataset can be queried without reading each individual GeoJSON file.
* As an output format for STAC API responses that is more efficient than paging through thousands of pages of GeoJSON.
* Provide efficient access to specific fields of a STAC item, thanks to Parquet's columnar format.

## Guidelines

Each row in the Parquet dataset represents a single STAC Item. Nearly all fields in a STAC Item should be mapped to a column in GeoParquet. We embrace Parquet structures where possible, mapping
from JSON into nested structures. The one exception is that we pull the properties up to the top level, so that they are easier to query and use. The names of
most fields should be the same in STAC and in GeoParquet.

| Field | GeoParquet Type | Required | Details |
|--------------------|----------------------|----------|--------------------------------------------------------------------------------------------------------------------------------|
| type               | String               | Optional | Only needed for GeoJSON, so it is optional and not recommended to include in GeoParquet                                         |
| stac_extensions    | List of Strings      | Required | This column is required, but can be empty if no STAC extensions were used                                                        |
| id                 | String               | Required | Required, should be unique within each collection                                                                                |
| geometry           | Binary (WKB)         | Required | For GeoParquet 1.0 this must be well-known binary (WKB)                                                                          |
| bbox               | Struct of Floats     | Required | A struct of 4 or 6 values, depending on the dimension of the data                                                                |
| links | List of Link structs | Required | See [Link Struct](#link-struct) for more info |
| assets | An Assets struct | Required | See [Asset Struct](#asset-struct) for more info |
| collection | String | Optional | The ID of the collection this Item is a part of. See notes below on 'Collection' and 'Collection JSON' in the Parquet metadata |
| *property columns* | *varies* | - | Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field |

* Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible.
* We strongly recommend having only one GeoParquet file per STAC 'Collection'. Mixing collections leads to an expanded GeoParquet schema (the union of the schemas of all collections) with lots of empty data.
* Any field in 'properties' of the STAC Item should be moved up to be a top-level field in the GeoParquet.
* STAC GeoParquet does not support properties that are named such that they collide with a top-level key.
* Datetime columns should be stored as a [native timestamp][timestamp], not as a string.
* The Collection JSON should be included in the Parquet metadata. See [Collection JSON](#including-a-stac-collection-json-in-a-stac-geoparquet-collection) below.
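
As a concrete sketch of these guidelines (non-normative), the [stac-geoparquet](https://github.com/stac-utils/stac-geoparquet) library can produce a conforming file. The input filename below is hypothetical, and the `dtype_backend` keyword is the one added in this commit:

```python
import json

import stac_geoparquet

# Hypothetical input: a JSON array of STAC Item dicts, all from one collection.
with open("items.json") as f:
    items = json.load(f)

# Properties are hoisted to top-level columns and datetime fields are parsed
# into native timestamps rather than kept as strings.
gdf = stac_geoparquet.to_geodataframe(items, dtype_backend="pyarrow")

# geopandas writes the GeoParquet metadata for the WKB geometry column.
gdf.to_parquet("collection.parquet")
```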

### Link Struct

The GeoParquet dataset can contain zero or more Link Structs. Each Link Struct has 2 required fields and 2 optional ones:

| Field Name | Type | Description |
|------------|--------|-------------------------------------------------------------------------------------------------------------------------------------|
| href       | string | **REQUIRED.** The actual link in the format of a URL. Relative and absolute links are both allowed.                                  |
| rel | string | **REQUIRED.** Relationship between the current document and the linked document. See chapter "Relation types" for more information. |
| type | string | [Media type][media-type] of the referenced entity. |
| title | string | A human readable title to be used in rendered displays of the link. |

See [Link Object][link] for more.
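
As a non-normative sketch, the `links` column could be declared with the following Arrow types; nullability follows the table above:

```python
import pyarrow as pa

# Each link is a struct; href and rel are required, type and title optional.
link_type = pa.struct(
    [
        pa.field("href", pa.string(), nullable=False),
        pa.field("rel", pa.string(), nullable=False),
        pa.field("type", pa.string()),
        pa.field("title", pa.string()),
    ]
)

# The links column holds zero or more of these structs per item.
links_field = pa.field("links", pa.list_(link_type))
```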

### Asset Struct

The GeoParquet dataset can contain zero or more Asset Structs. Each Asset Struct can have the following fields:

| Field Name | Type | Description |
|-------------|-----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| href        | string    | **REQUIRED.** URI to the asset object. Relative and absolute URIs are both allowed.                                                                                                            |
| title | string | The displayed title for clients and users. |
| description | string | A description of the Asset providing additional details, such as how it was processed or created. [CommonMark 0.29](http://commonmark.org/) syntax MAY be used for rich text representation. |
| type | string | [Media type][media-type] of the asset. See the [common media types][common-media-types] in the best practice doc for commonly used asset types. |
| roles | \[string] | The [semantic roles][asset-roles] of the asset, similar to the use of `rel` in links. |

The `assets` struct contains each full asset key and object as a sub-struct; it is a direct mapping from the JSON to Parquet.

To take advantage of Parquet's columnar nature and compression, the assets should be uniform so they can be represented by a simple schema, which in turn means every item should probably come from the same STAC collection.

See [Asset Object][asset] for more.
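
As a non-normative sketch, an `assets` column for items that all carry two assets (the `data` and `thumbnail` keys are hypothetical) could be typed like this, mirroring the JSON shape directly:

```python
import pyarrow as pa

# Every asset object shares one schema so the column compresses well.
asset_type = pa.struct(
    [
        pa.field("href", pa.string(), nullable=False),
        pa.field("title", pa.string()),
        pa.field("description", pa.string()),
        pa.field("type", pa.string()),
        pa.field("roles", pa.list_(pa.string())),
    ]
)

# Each asset key from the JSON becomes a named sub-struct field.
assets_field = pa.field(
    "assets",
    pa.struct(
        [
            pa.field("data", asset_type),
            pa.field("thumbnail", asset_type),
        ]
    ),
)
```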

## Including a STAC Collection JSON in a STAC GeoParquet Collection

To make a stac-geoparquet file a fully self-contained representation, you can
include the Collection JSON in the Parquet metadata. If present in the [Parquet
file metadata][parquet-metadata], the key must be `stac:collection` and the
value must be a JSON string with the Collection JSON.
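
A minimal sketch with pyarrow; the `table` and `collection` values here are hypothetical stand-ins for the real items table and Collection JSON:

```python
import json

import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical stand-ins for the real items table and Collection dict.
table = pa.table({"id": ["item-1"]})
collection = {"type": "Collection", "id": "example", "stac_version": "1.0.0"}

# Parquet file metadata maps bytes to bytes; merge rather than overwrite so
# any existing schema metadata (e.g. GeoParquet's "geo" key) is preserved.
metadata = {
    **(table.schema.metadata or {}),
    b"stac:collection": json.dumps(collection).encode("utf-8"),
}
pq.write_table(table.replace_schema_metadata(metadata), "collection.parquet")
```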

## Referencing a STAC GeoParquet Collection in a STAC Collection JSON

A common use case of stac-geoparquet is to create a mirror of a STAC collection. To refer to this mirror in the original collection, use an [Asset Object](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md#asset-object) at the collection level of the STAC JSON, with the `application/vnd.apache.parquet` media type and the `collection-mirror` role to describe the function of the GeoParquet STAC Collection Asset.

For example:

| Field Name | Type | Value |
|-------------|-----------|-------------------------------------|
| href        | string    | s3://example/uri/to/file.geoparquet |
| title | string | An example STAC geoparquet. |
| description | string | Example description. |
| type | string | application/vnd.apache.parquet |
| roles | \[string] | [collection-mirror]* |

*Note that IANA has not yet approved the new media type `application/vnd.apache.parquet`; it has been [submitted for approval](https://issues.apache.org/jira/browse/PARQUET-1889).

The description should ideally include details about the spatial partitioning method.
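
Rendered as a sketch of the asset object itself (the `geoparquet` key name is illustrative), this example would sit under the Collection's top-level `assets` key:

```python
# The example row above as a Collection-level asset object.
collection_assets = {
    "geoparquet": {
        "href": "s3://example/uri/to/file.geoparquet",
        "title": "An example STAC geoparquet.",
        "description": "Example description.",
        "type": "application/vnd.apache.parquet",
        "roles": ["collection-mirror"],
    }
}
```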


## Mapping to other geospatial data formats

The principles here can likely be used to map into other geospatial data formats (GeoPackage, FlatGeobuf, etc.), but we embrace Parquet's nested 'structs' for some of the mappings, so other formats will need to do something different. The obvious thing to do is to dump JSON into those fields, but that is outside the scope of this document, and we recommend creating a general document for that.

[media-type]: https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#asset-media-type
[asset]: https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#asset-object
[asset-roles]: https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#asset-roles
[link]: https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#link-object
[common-media-types]: https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#common-media-types-in-stac
[timestamp]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp
[parquet-metadata]: https://github.com/apache/parquet-format#metadata
11 changes: 8 additions & 3 deletions stac_geoparquet/__init__.py
@@ -1,7 +1,12 @@
"""stac-geoparquet"""
__version__ = "0.3.2"

from .stac_geoparquet import to_geodataframe, to_dict, to_item_collection
from ._version import __version__


__all__ = ["__version__", "to_geodataframe", "to_dict", "to_item_collection"]
__all__ = [
"__version__",
"to_geodataframe",
"to_dict",
"to_item_collection",
"__version__",
]
153 changes: 120 additions & 33 deletions stac_geoparquet/stac_geoparquet.py
@@ -1,13 +1,17 @@
"""
Generate geoparquet from a sequence of STAC items.
"""

from __future__ import annotations
import collections

from typing import Sequence, Any
from typing import Sequence, Any, Literal
import warnings

import pystac
import geopandas
import pandas as pd
import pyarrow as pa
import numpy as np
import shapely.geometry

@@ -16,7 +20,7 @@
from stac_geoparquet.utils import fix_empty_multipolygon

STAC_ITEM_TYPES = ["application/json", "application/geo+json"]

DTYPE_BACKEND = Literal["numpy_nullable", "pyarrow"]
SELF_LINK_COLUMN = "self_link"


@@ -31,7 +35,10 @@ def _fix_array(v):


def to_geodataframe(
items: Sequence[dict[str, Any]], add_self_link: bool = False
items: Sequence[dict[str, Any]],
add_self_link: bool = False,
dtype_backend: DTYPE_BACKEND | None = None,
datetime_precision: str = "ns",
) -> geopandas.GeoDataFrame:
"""
Convert a sequence of STAC items to a :class:`geopandas.GeoDataFrame`.
@@ -42,19 +49,72 @@ def to_geodataframe(
Parameters
----------
items: A sequence of STAC items.
add_self_link: Add the absolute link (if available) to the source STAC Item as a separate column named "self_link"
add_self_link: bool, default False
Add the absolute link (if available) to the source STAC Item
as a separate column named "self_link"
dtype_backend: {'pyarrow', 'numpy_nullable'}, optional
The dtype backend to use for storing arrays.
By default, this will use 'numpy_nullable' and emit a
FutureWarning that the default will change to 'pyarrow' in
the next release.
Set to 'numpy_nullable' to silence the warning and accept the
old behavior.
Set to 'pyarrow' to silence the warning and accept the new behavior.
There are some differences in the output as well: with
``dtype_backend="pyarrow"``, struct-like fields will explicitly
contain null values for fields that appear in only some of the
records. For example, given an ``assets`` like::
{
"a": {
"href": "a.tif",
},
"b": {
"href": "b.tif",
"title": "B",
}
}
The ``assets`` field of the output for the first row with
``dtype_backend="numpy_nullable"`` will be a Python dictionary with
just ``{"href": "a.tif"}``.
With ``dtype_backend="pyarrow"``, this will be a pyarrow struct
with fields ``{"href": "a.tif", "title": None}``. pyarrow will
infer that the struct field ``asset.title`` is nullable.
datetime_precision: str, default "ns"
The precision to use for the datetime columns. For example,
"us" is microsecond and "ns" is nanosecond.
Returns
-------
The converted GeoDataFrame.
"""
items2 = []
items2 = collections.defaultdict(list)

for item in items:
item2 = {k: v for k, v in item.items() if k != "properties"}
keys = set(item) - {"properties", "geometry"}

for k in keys:
items2[k].append(item[k])

item_geometry = item["geometry"]
if item_geometry:
item_geometry = fix_empty_multipolygon(item_geometry)

items2["geometry"].append(item_geometry)

for k, v in item["properties"].items():
if k in item2:
raise ValueError("k", k)
item2[k] = v
if k in item:
msg = f"Key '{k}' appears in both 'properties' and the top level."
raise ValueError(msg)
items2[k].append(v)

if add_self_link:
self_href = None
for link in item["links"]:
@@ -65,23 +125,11 @@
):
self_href = link["href"]
break
item2[SELF_LINK_COLUMN] = self_href
items2.append(item2)

# Filter out missing geoms in MultiPolygons
# https://github.com/shapely/shapely/issues/1407
# geometry = [shapely.geometry.shape(x["geometry"]) for x in items2]

geometry = []
for item2 in items2:
item_geometry = item2["geometry"]
if item_geometry:
item_geometry = fix_empty_multipolygon(item_geometry) # type: ignore
geometry.append(item_geometry)

gdf = geopandas.GeoDataFrame(items2, geometry=geometry, crs="WGS84")
items2[SELF_LINK_COLUMN].append(self_href)

for column in [
# TODO: Ideally we wouldn't have to hard-code this list.
# Could we get it from the JSON schema.
DATETIME_COLUMNS = {
"datetime", # common metadata
"start_datetime",
"end_datetime",
@@ -90,9 +138,43 @@
"expires", # timestamps extension
"published",
"unpublished",
]:
if column in gdf.columns:
gdf[column] = pd.to_datetime(gdf[column], format="ISO8601")
}

items2["geometry"] = geopandas.array.from_shapely(items2["geometry"])

if dtype_backend is None:
msg = (
"The default argument for 'dtype_backend' will change from "
"'numpy_nullable' to 'pyarrow'. To keep the previous default "
"specify ``dtype_backend='numpy_nullable'``. To accept the future "
"behavior specify ``dtype_backend='pyarrow'."
)
warnings.warn(FutureWarning(msg))
dtype_backend = "numpy_nullable"

if dtype_backend == "pyarrow":
for k, v in items2.items():
if k in DATETIME_COLUMNS:
dt = pd.to_datetime(v, format="ISO8601").as_unit(datetime_precision)
items2[k] = pd.arrays.ArrowExtensionArray(pa.array(dt))

elif k != "geometry":
items2[k] = pd.arrays.ArrowExtensionArray(pa.array(v))

elif dtype_backend == "numpy_nullable":
for k, v in items2.items():
if k in DATETIME_COLUMNS:
items2[k] = pd.to_datetime(v, format="ISO8601").as_unit(
datetime_precision
)

if k in {"type", "stac_version", "id", "collection", SELF_LINK_COLUMN}:
items2[k] = pd.array(v, dtype="string")
else:
msg = f"Invalid 'dtype_backend={dtype_backend}'."
raise TypeError(msg)

gdf = geopandas.GeoDataFrame(items2, geometry="geometry", crs="WGS84")

columns = [
"type",
@@ -111,10 +193,6 @@
columns.remove(col)

gdf = pd.concat([gdf[columns], gdf.drop(columns=columns)], axis="columns")
for k in ["type", "stac_version", "id", "collection", SELF_LINK_COLUMN]:
if k in gdf:
gdf[k] = gdf[k].astype("string")

return gdf


@@ -144,12 +222,16 @@ def to_dict(record: dict) -> dict:

if k == SELF_LINK_COLUMN:
continue
elif k == "assets":
item[k] = {k2: v2 for k2, v2 in v.items() if v2 is not None}
elif k in top_level_keys:
item[k] = v
else:
properties[k] = v

item["geometry"] = shapely.geometry.mapping(item["geometry"])
if item["geometry"]:
item["geometry"] = shapely.geometry.mapping(item["geometry"])

item["properties"] = properties

return item
@@ -175,6 +257,11 @@ def to_item_collection(df: geopandas.GeoDataFrame) -> pystac.ItemCollection:
include=["datetime64[ns, UTC]", "datetime64[ns]"]
).columns
for k in datelike:
# %f isn't implemented in pyarrow
# https://github.com/apache/arrow/issues/20146
if isinstance(df2[k].dtype, pd.ArrowDtype):
df2[k] = df2[k].astype("datetime64[ns, utc]")

df2[k] = (
df2[k].dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ").fillna("").replace({"": None})
)
