From fb798f4cac590bd4520b513e170963d40dc952f4 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sun, 3 Mar 2024 11:22:07 -0600
Subject: [PATCH 1/3] Optionally use pyarrow types in to_geodataframe

This updates to_geodataframe to optionally use pyarrow types, rather than
NumPy. These types let us faithfully represent the actual nested types,
rather than casting everything to `object`.
---
 stac_geoparquet/stac_geoparquet.py | 147 ++++++++++++----
 stac_geoparquet/utils.py           |  44 ++++-
 tests/test_pgstac_reader.py        |   4 +-
 tests/test_stac_geoparquet.py      | 260 ++++++++++++++++-------------
 tests/test_to_dict.py              |  31 ++++
 5 files changed, 327 insertions(+), 159 deletions(-)

diff --git a/stac_geoparquet/stac_geoparquet.py b/stac_geoparquet/stac_geoparquet.py
index b42cce4..0974547 100644
--- a/stac_geoparquet/stac_geoparquet.py
+++ b/stac_geoparquet/stac_geoparquet.py
@@ -1,13 +1,17 @@
 """
 Generate geoparquet from a sequence of STAC items.
 """
+
 from __future__ import annotations
+import collections
 
-from typing import Sequence, Any
+from typing import Sequence, Any, Literal
+import warnings
 
 import pystac
 import geopandas
 import pandas as pd
+import pyarrow as pa
 import numpy as np
 import shapely.geometry
 
@@ -16,7 +20,7 @@ from stac_geoparquet.utils import fix_empty_multipolygon
 
 STAC_ITEM_TYPES = ["application/json", "application/geo+json"]
 
-
+DTYPE_BACKEND = Literal["numpy_nullable", "pyarrow"]
 SELF_LINK_COLUMN = "self_link"
 
 
@@ -31,7 +35,9 @@ def _fix_array(v):
 
 
 def to_geodataframe(
-    items: Sequence[dict[str, Any]], add_self_link: bool = False
+    items: Sequence[dict[str, Any]],
+    add_self_link: bool = False,
+    dtype_backend: DTYPE_BACKEND | None = None,
 ) -> geopandas.GeoDataFrame:
     """
     Convert a sequence of STAC items to a :class:`geopandas.GeoDataFrame`.
@@ -42,19 +48,68 @@
     Parameters
     ----------
     items: A sequence of STAC items.
-    add_self_link: Add the absolute link (if available) to the source STAC Item as a separate column named "self_link"
+    add_self_link: bool, default False
+        Add the absolute link (if available) to the source STAC Item
+        as a separate column named "self_link"
+    dtype_backend: {'pyarrow', 'numpy_nullable'}, optional
+        The dtype backend to use for storing arrays.
+
+        By default, this will use 'numpy_nullable' and emit a
+        FutureWarning that the default will change to 'pyarrow' in
+        the next release.
+
+        Set to 'numpy_nullable' to silence the warning and accept the
+        old behavior.
+
+        Set to 'pyarrow' to silence the warning and accept the new behavior.
+
+        There are some differences in the output as well: with
+        ``dtype_backend="pyarrow"``, struct-like fields will explicitly
+        contain null values for fields that appear in only some of the
+        records. For example, given an ``assets`` like::
+
+            {
+                "a": {
+                    "href": "a.tif",
+                },
+                "b": {
+                    "href": "b.tif",
+                    "title": "B",
+                }
+            }
+
+        The ``assets`` field of the output for the first row with
+        ``dtype_backend="numpy_nullable"`` will be a Python dictionary with
+        just ``{"href": "a.tif"}``.
+
+        With ``dtype_backend="pyarrow"``, this will be a pyarrow struct
+        with fields ``{"href": "a.tif", "title": None}``. pyarrow will
+        infer that the struct field ``asset.title`` is nullable.
 
     Returns
     -------
     The converted GeoDataFrame.
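+
+    Examples
+    --------
+    A rough sketch of the difference (``item`` stands for any STAC item
+    dict whose ``assets`` look like the example above)::
+
+        import stac_geoparquet
+
+        gdf_np = stac_geoparquet.to_geodataframe(
+            [item], dtype_backend="numpy_nullable"
+        )
+        gdf_pa = stac_geoparquet.to_geodataframe(
+            [item], dtype_backend="pyarrow"
+        )
+
+        gdf_np["assets"].dtype  # plain object dtype holding Python dicts
+        gdf_pa["assets"].dtype  # ArrowDtype wrapping a pyarrow struct type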
""" - items2 = [] + items2 = collections.defaultdict(list) + for item in items: - item2 = {k: v for k, v in item.items() if k != "properties"} + keys = set(item) - {"properties", "geometry"} + + for k in keys: + items2[k].append(item[k]) + + item_geometry = item["geometry"] + if item_geometry: + item_geometry = fix_empty_multipolygon(item_geometry) + + items2["geometry"].append(item_geometry) + for k, v in item["properties"].items(): - if k in item2: - raise ValueError("k", k) - item2[k] = v + if k in item: + msg = f"Key '{k}' appears in both 'properties' and the top level." + raise ValueError(msg) + items2[k].append(v) + if add_self_link: self_href = None for link in item["links"]: @@ -65,23 +120,11 @@ def to_geodataframe( ): self_href = link["href"] break - item2[SELF_LINK_COLUMN] = self_href - items2.append(item2) - - # Filter out missing geoms in MultiPolygons - # https://github.com/shapely/shapely/issues/1407 - # geometry = [shapely.geometry.shape(x["geometry"]) for x in items2] - - geometry = [] - for item2 in items2: - item_geometry = item2["geometry"] - if item_geometry: - item_geometry = fix_empty_multipolygon(item_geometry) # type: ignore - geometry.append(item_geometry) - - gdf = geopandas.GeoDataFrame(items2, geometry=geometry, crs="WGS84") + items2[SELF_LINK_COLUMN].append(self_href) - for column in [ + # TODO: Ideally we wouldn't have to hard-code this list. + # Could we get it from the JSON schema. + DATETIME_COLUMNS = { "datetime", # common metadata "start_datetime", "end_datetime", @@ -90,9 +133,42 @@ def to_geodataframe( "expires", # timestamps extension "published", "unpublished", - ]: - if column in gdf.columns: - gdf[column] = pd.to_datetime(gdf[column], format="ISO8601") + } + + items2["geometry"] = geopandas.array.from_shapely(items2["geometry"]) + + if dtype_backend is None: + msg = ( + "The default argument for 'dtype_backend' will change from " + "'numpy_nullable' to 'pyarrow'. To keep the previous default " + "specify ``dtype_backend='numpy_nullable'``. To accept the future " + "behavior specify ``dtype_backend='pyarrow'." + ) + warnings.warn(FutureWarning(msg)) + dtype_backend = "numpy_nullable" + + if dtype_backend == "pyarrow": + for k, v in items2.items(): + if k in DATETIME_COLUMNS: + items2[k] = pd.arrays.ArrowExtensionArray( + pa.array(pd.to_datetime(v, format="ISO8601")) + ) + + elif k != "geometry": + items2[k] = pd.arrays.ArrowExtensionArray(pa.array(v)) + + elif dtype_backend == "numpy_nullable": + for k, v in items2.items(): + if k in DATETIME_COLUMNS: + items2[k] = pd.to_datetime(v, format="ISO8601") + + if k in {"type", "stac_version", "id", "collection", SELF_LINK_COLUMN}: + items2[k] = pd.array(v, dtype="string") + else: + msg = f"Invalid 'dtype_backend={dtype_backend}'." 
+ raise TypeError(msg) + + gdf = geopandas.GeoDataFrame(items2, geometry="geometry", crs="WGS84") columns = [ "type", @@ -111,10 +187,6 @@ def to_geodataframe( columns.remove(col) gdf = pd.concat([gdf[columns], gdf.drop(columns=columns)], axis="columns") - for k in ["type", "stac_version", "id", "collection", SELF_LINK_COLUMN]: - if k in gdf: - gdf[k] = gdf[k].astype("string") - return gdf @@ -144,12 +216,16 @@ def to_dict(record: dict) -> dict: if k == SELF_LINK_COLUMN: continue + elif k == "assets": + item[k] = {k2: v2 for k2, v2 in v.items() if v2 is not None} elif k in top_level_keys: item[k] = v else: properties[k] = v - item["geometry"] = shapely.geometry.mapping(item["geometry"]) + if item["geometry"]: + item["geometry"] = shapely.geometry.mapping(item["geometry"]) + item["properties"] = properties return item @@ -175,6 +251,11 @@ def to_item_collection(df: geopandas.GeoDataFrame) -> pystac.ItemCollection: include=["datetime64[ns, UTC]", "datetime64[ns]"] ).columns for k in datelike: + # %f isn't implemented in pyarrow + # https://github.com/apache/arrow/issues/20146 + if isinstance(df2[k].dtype, pd.ArrowDtype): + df2[k] = df2[k].astype("datetime64[ns, utc]") + df2[k] = ( df2[k].dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ").fillna("").replace({"": None}) ) diff --git a/stac_geoparquet/utils.py b/stac_geoparquet/utils.py index 6c912b1..68e4eba 100644 --- a/stac_geoparquet/utils.py +++ b/stac_geoparquet/utils.py @@ -8,23 +8,27 @@ @functools.singledispatch -def assert_equal(result: Any, expected: Any) -> bool: +def assert_equal(result: Any, expected: Any, ignore_none: bool = False) -> bool: raise TypeError(f"Invalid type {type(result)}") @assert_equal.register(pystac.ItemCollection) def assert_equal_ic( - result: pystac.ItemCollection, expected: pystac.ItemCollection + result: pystac.ItemCollection, + expected: pystac.ItemCollection, + ignore_none: bool = False, ) -> None: assert type(result) == type(expected) assert len(result) == len(expected) assert result.extra_fields == expected.extra_fields for a, b in zip(result.items, expected.items): - assert_equal(a, b) + assert_equal(a, b, ignore_none=ignore_none) @assert_equal.register(pystac.Item) -def assert_equal_item(result: pystac.Item, expected: pystac.Item) -> None: +def assert_equal_item( + result: pystac.Item, expected: pystac.Item, ignore_none: bool = False +) -> None: assert type(result) == type(expected) assert result.id == expected.id assert shapely.geometry.shape(result.geometry) == shapely.geometry.shape( @@ -41,20 +45,44 @@ def assert_equal_item(result: pystac.Item, expected: pystac.Item) -> None: expected_links = sorted(expected.links, key=lambda x: x.href) assert len(result_links) == len(expected_links) for a, b in zip(result_links, expected_links): - assert_equal(a, b) + assert_equal(a, b, ignore_none=ignore_none) assert set(result.assets) == set(expected.assets) for k in result.assets: - assert_equal(result.assets[k], expected.assets[k]) + assert_equal(result.assets[k], expected.assets[k], ignore_none=ignore_none) @assert_equal.register(pystac.Link) @assert_equal.register(pystac.Asset) def assert_link_equal( - result: pystac.Link | pystac.Asset, expected: pystac.Link | pystac.Asset + result: pystac.Link | pystac.Asset, + expected: pystac.Link | pystac.Asset, + ignore_none: bool = False, ) -> None: assert type(result) == type(expected) - assert result.to_dict() == expected.to_dict() + resultd = result.to_dict() + expectedd = expected.to_dict() + + left = {} + + if ignore_none: + for k, v in resultd.items(): + if v is None and k 
not in expectedd: + pass + elif isinstance(v, list) and k in expectedd: + out = [] + for val in v: + if isinstance(val, dict): + out.append({k: v2 for k, v2 in val.items() if v2 is not None}) + else: + out.append(val) + left[k] = out + else: + left[k] = v + else: + left = resultd + + assert left == expectedd def fix_empty_multipolygon( diff --git a/tests/test_pgstac_reader.py b/tests/test_pgstac_reader.py index 9cdc8cc..d921a6e 100644 --- a/tests/test_pgstac_reader.py +++ b/tests/test_pgstac_reader.py @@ -113,7 +113,7 @@ def test_naip_item(): expected.remove_links(rel=pystac.RelType.SELF) result.remove_links(rel=pystac.RelType.SELF) - assert_equal(result, expected) + assert_equal(result, expected, ignore_none=True) def test_sentinel2_l2a(): @@ -139,7 +139,7 @@ def test_sentinel2_l2a(): result.remove_links(rel=pystac.RelType.SELF) expected.remove_links(rel=pystac.RelType.LICENSE) - assert_equal(result, expected) + assert_equal(result, expected, ignore_none=True) def test_generate_endpoints(): diff --git a/tests/test_stac_geoparquet.py b/tests/test_stac_geoparquet.py index 10c484c..3ddca61 100644 --- a/tests/test_stac_geoparquet.py +++ b/tests/test_stac_geoparquet.py @@ -5,6 +5,7 @@ import shapely.geometry import pandas as pd import pandas.testing +import pyarrow as pa import pystac import geopandas import requests @@ -63,9 +64,9 @@ def test_assert_equal(): "roles": ["data"], "title": "RGBIR COG tile", "eo:bands": [ - {"name": "Red", "common_name": "red"}, - {"name": "Green", "common_name": "green"}, - {"name": "Blue", "common_name": "blue"}, + {"name": "Red", "common_name": "red", "description": "Red"}, + {"name": "Green", "common_name": "green", "description": "Green"}, + {"name": "Blue", "common_name": "blue", "description": "Blue"}, {"name": "NIR", "common_name": "nir", "description": "near-infrared"}, ], }, @@ -125,111 +126,131 @@ def test_assert_equal(): "stac_version": "1.0.0", } -EXPECTED_GDF = { - "type": {0: "Feature"}, - "stac_version": {0: "1.0.0"}, - "stac_extensions": { - 0: [ - "https://stac-extensions.github.io/eo/v1.0.0/schema.json", - "https://stac-extensions.github.io/projection/v1.0.0/schema.json", - ] - }, - "id": {0: "ia_m_4209150_sw_15_060_20190828_20191105"}, - "geometry": {0: shapely.geometry.shape(ITEM["geometry"])}, - "bbox": {0: [-91.879788, 42.121621, -91.807132, 42.191372]}, - "links": { - 0: [ - { - "rel": "collection", - "type": "application/json", - "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", - }, - { - "rel": "parent", - "type": "application/json", - "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", - }, - { - "rel": "root", - "type": "application/json", - "href": "https://planetarycomputer.microsoft.com/api/stac/v1/", - }, - { - "rel": "self", - "type": "application/geo+json", - "href": ITEM_SELF_HREF, - }, + +EXPECTED_GDF = geopandas.GeoDataFrame( + { + "type": ["Feature"], + "stac_version": ["1.0.0"], + "stac_extensions": [ + [ + "https://stac-extensions.github.io/eo/v1.0.0/schema.json", + "https://stac-extensions.github.io/projection/v1.0.0/schema.json", + ] + ], + "id": ["ia_m_4209150_sw_15_060_20190828_20191105"], + "geometry": geopandas.array.from_shapely( + [shapely.geometry.shape(ITEM["geometry"])] + ), + "bbox": [[-91.879788, 42.121621, -91.807132, 42.191372]], + "links": [ + [ + { + "rel": "collection", + "type": "application/json", + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", + }, + { + "rel": "parent", + "type": "application/json", + "href": 
"https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", + }, + { + "rel": "root", + "type": "application/json", + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/", + }, + { + "rel": "self", + "type": "application/geo+json", + "href": ITEM_SELF_HREF, + }, + { + "rel": "preview", + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/map?collection=naip&item=ia_m_4209150_sw_15_060_20190828_20191105", # noqa: E501 + "title": "Map of item", + "type": "text/html", + }, + ] + ], + "assets": [ { - "rel": "preview", - "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/map?collection=naip&item=ia_m_4209150_sw_15_060_20190828_20191105", # noqa: E501 - "title": "Map of item", - "type": "text/html", - }, - ] - }, - "assets": { - 0: { - "image": { - "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ia/2019/ia_60cm_2019/42091/m_4209150_sw_15_060_20190828.tif", # noqa: E501 - "type": "image/tiff; application=geotiff; profile=cloud-optimized", - "roles": ["data"], - "title": "RGBIR COG tile", - "eo:bands": [ - {"name": "Red", "common_name": "red"}, - {"name": "Green", "common_name": "green"}, - {"name": "Blue", "common_name": "blue"}, - { - "name": "NIR", - "common_name": "nir", - "description": "near-infrared", - }, - ], - }, - "metadata": { - "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ia/2019/ia_fgdc_2019/42091/m_4209150_sw_15_060_20190828.txt", # noqa: E501 - "type": "text/plain", - "roles": ["metadata"], - "title": "FGDC Metdata", - }, - "thumbnail": { - "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ia/2019/ia_60cm_2019/42091/m_4209150_sw_15_060_20190828.200.jpg", # noqa: E501 - "type": "image/jpeg", - "roles": ["thumbnail"], - "title": "Thumbnail", - }, - "tilejson": { - "title": "TileJSON with default rendering", - "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/tilejson.json?collection=naip&item=ia_m_4209150_sw_15_060_20190828_20191105&assets=image&asset_bidx=image%7C1%2C2%2C3", # noqa: E501 - "type": "application/json", - "roles": ["tiles"], - }, - "rendered_preview": { - "title": "Rendered preview", - "rel": "preview", - "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/preview.png?collection=naip&item=ia_m_4209150_sw_15_060_20190828_20191105&assets=image&asset_bidx=image%7C1%2C2%2C3", # noqa: E501 - "roles": ["overview"], - "type": "image/png", - }, - } - }, - "collection": {0: "naip"}, - "gsd": {0: 0.6}, - "datetime": {0: pd.Timestamp("2019-08-28 00:00:00+0000", tz="UTC")}, - "naip:year": {0: "2019"}, - "proj:bbox": {0: [592596.0, 4663966.8, 598495.8, 4671633.0]}, - "proj:epsg": {0: 26915}, - "naip:state": {0: "ia"}, - "proj:shape": {0: [12777, 9833]}, - "proj:transform": {0: [0.6, 0.0, 592596.0, 0.0, -0.6, 4671633.0, 0.0, 0.0, 1.0]}, -} + "image": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ia/2019/ia_60cm_2019/42091/m_4209150_sw_15_060_20190828.tif", # noqa: E501 + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "roles": ["data"], + "title": "RGBIR COG tile", + "eo:bands": [ + {"name": "Red", "common_name": "red", "description": "Red"}, + { + "name": "Green", + "common_name": "green", + "description": "Green", + }, + { + "name": "Blue", + "common_name": "blue", + "description": "Blue", + }, + { + "name": "NIR", + "common_name": "nir", + "description": "near-infrared", + }, + ], + }, + "metadata": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ia/2019/ia_fgdc_2019/42091/m_4209150_sw_15_060_20190828.txt", 
# noqa: E501 + "type": "text/plain", + "roles": ["metadata"], + "title": "FGDC Metdata", + }, + "thumbnail": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ia/2019/ia_60cm_2019/42091/m_4209150_sw_15_060_20190828.200.jpg", # noqa: E501 + "type": "image/jpeg", + "roles": ["thumbnail"], + "title": "Thumbnail", + }, + "tilejson": { + "title": "TileJSON with default rendering", + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/tilejson.json?collection=naip&item=ia_m_4209150_sw_15_060_20190828_20191105&assets=image&asset_bidx=image%7C1%2C2%2C3", # noqa: E501 + "type": "application/json", + "roles": ["tiles"], + }, + "rendered_preview": { + "title": "Rendered preview", + "rel": "preview", + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/preview.png?collection=naip&item=ia_m_4209150_sw_15_060_20190828_20191105&assets=image&asset_bidx=image%7C1%2C2%2C3", # noqa: E501 + "roles": ["overview"], + "type": "image/png", + }, + } + ], + "collection": ["naip"], + "gsd": [0.6], + "datetime": pd.to_datetime(["2019-08-28 00:00:00+0000"]), + "naip:year": ["2019"], + "proj:bbox": [[592596.0, 4663966.8, 598495.8, 4671633.0]], + "proj:epsg": [26915], + "naip:state": ["ia"], + "proj:shape": [[12777, 9833]], + "proj:transform": [[0.6, 0.0, 592596.0, 0.0, -0.6, 4671633.0, 0.0, 0.0, 1.0]], + } +) -def test_to_geodataframe(): - result = stac_geoparquet.to_geodataframe([ITEM]) - expected = geopandas.GeoDataFrame(EXPECTED_GDF) - for k in ["type", "stac_version", "id", "collection"]: - if k in expected: - expected[k] = expected[k].astype("string") +@pytest.mark.parametrize("dtype_backend", ["numpy_nullable", "pyarrow"]) +def test_to_geodataframe(dtype_backend): + result = stac_geoparquet.to_geodataframe([ITEM], dtype_backend=dtype_backend) + expected = EXPECTED_GDF.copy() + + if dtype_backend == "numpy_nullable": + for k in ["type", "stac_version", "id", "collection"]: + expected[k] = expected[k].astype(pd.StringDtype()) + + else: + for k, v in EXPECTED_GDF.items(): + if k != "geometry": + expected[k] = pd.arrays.ArrowExtensionArray(pa.array(v)) pandas.testing.assert_frame_equal(result, expected) @@ -238,14 +259,21 @@ def test_to_geodataframe(): assert_equal(ic1, ic2) +def test_dtype_backend_warns(): + with pytest.warns(FutureWarning, match="dtype_backend"): + stac_geoparquet.to_geodataframe([ITEM]) + + def test_to_geodataframe_with_self_link(): - result = stac_geoparquet.to_geodataframe([ITEM], add_self_link=True) - gdf = EXPECTED_GDF.copy() - gdf["self_link"] = {0: ITEM_SELF_HREF} - expected = geopandas.GeoDataFrame(gdf) - for k in ["type", "stac_version", "id", "collection", "self_link"]: - if k in expected: - expected[k] = expected[k].astype("string") + result = stac_geoparquet.to_geodataframe( + [ITEM], add_self_link=True, dtype_backend="pyarrow" + ) + expected = EXPECTED_GDF.copy() + expected["self_link"] = pd.arrays.ArrowExtensionArray(pa.array([ITEM_SELF_HREF])) + + for k, v in EXPECTED_GDF.items(): + if k != "geometry": + expected[k] = pd.arrays.ArrowExtensionArray(pa.array(v)) pandas.testing.assert_frame_equal(result, expected) @@ -269,7 +297,7 @@ def test_s1_grd(): item["stac_extensions"][i] = EO_V11 item["geometry"] = fix_empty_multipolygon(item["geometry"]).__geo_interface__ - df = stac_geoparquet.to_geodataframe([item]) + df = stac_geoparquet.to_geodataframe([item], dtype_backend="pyarrow") result = to_item_collection(df)[0] assert_equal(result, pystac.read_dict(item)) @@ -354,11 +382,11 @@ def test_smoke(collection_id): ) r.raise_for_status() items = 
r.json()["features"] - df = stac_geoparquet.to_geodataframe(items) + df = stac_geoparquet.to_geodataframe(items, dtype_backend="pyarrow") result = to_item_collection(df) expected = pystac.ItemCollection(items) - assert_equal(result, expected) + assert_equal(result, expected, ignore_none=True) def test_mixed_date_format(): @@ -369,7 +397,7 @@ def test_mixed_date_format(): a["geometry"] = {"type": "Point", "coordinates": [0, 0]} b["geometry"] = {"type": "Point", "coordinates": [0, 0]} - result = stac_geoparquet.to_geodataframe([a, b]) + result = stac_geoparquet.to_geodataframe([a, b], dtype_backend="pyarrow") expected = [ pd.Timestamp("2000-12-10 22:04:58+0000", tz="UTC"), pd.Timestamp("2000-12-10 22:04:57.998000+0000", tz="UTC"), diff --git a/tests/test_to_dict.py b/tests/test_to_dict.py index 6df2fd8..9b5b336 100644 --- a/tests/test_to_dict.py +++ b/tests/test_to_dict.py @@ -116,3 +116,34 @@ def test_to_dict(naip): "type": "Feature", } assert result[0].to_dict() == expected + + +def test_to_dict_optional_asset(): + items = [ + { + "id": "a", + "geometry": None, + "bbox": None, + "links": [], + "type": "Feature", + "stac_version": "1.0.0", + "properties": {"datetime": "2021-01-01T00:00:00Z"}, + "assets": {"a": {"href": "a.txt"}, "b": {"href": "b.txt"}}, + }, + { + "id": "b", + "geometry": None, + "bbox": None, + "links": [], + "type": "Feature", + "stac_version": "1.0.0", + "properties": {"datetime": "2021-01-01T00:00:00Z"}, + "assets": {"a": {"href": "a.txt"}}, + }, + ] + df = stac_geoparquet.to_geodataframe(items, dtype_backend="pyarrow") + result = stac_geoparquet.to_item_collection(df) + assert result[0].assets["a"].to_dict() == {"href": "a.txt"} + assert result[0].assets["b"].to_dict() == {"href": "b.txt"} + assert result[1].assets["a"].to_dict() == {"href": "a.txt"} + assert "b" not in result[1].assets From 5c646cba236a1106b8c890bd826bc71504500958 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 24 Mar 2024 15:19:11 -0500 Subject: [PATCH 2/3] ts resolution --- stac_geoparquet/stac_geoparquet.py | 14 ++++++++++---- tests/test_stac_geoparquet.py | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/stac_geoparquet/stac_geoparquet.py b/stac_geoparquet/stac_geoparquet.py index 0974547..2efc299 100644 --- a/stac_geoparquet/stac_geoparquet.py +++ b/stac_geoparquet/stac_geoparquet.py @@ -38,6 +38,7 @@ def to_geodataframe( items: Sequence[dict[str, Any]], add_self_link: bool = False, dtype_backend: DTYPE_BACKEND | None = None, + datetime_precision: str = "us", ) -> geopandas.GeoDataFrame: """ Convert a sequence of STAC items to a :class:`geopandas.GeoDataFrame`. @@ -86,6 +87,10 @@ def to_geodataframe( with fields ``{"href": "a.tif", "title", None}``. pyarrow will infer that the struct field ``asset.title`` is nullable. + datetime_precision: str, default "us" + The precision to use for the datetime columns. For example, + "us" is microsecond and "ns" is nanosecond. + Returns ------- The converted GeoDataFrame. 
@@ -150,9 +155,8 @@ def to_geodataframe( if dtype_backend == "pyarrow": for k, v in items2.items(): if k in DATETIME_COLUMNS: - items2[k] = pd.arrays.ArrowExtensionArray( - pa.array(pd.to_datetime(v, format="ISO8601")) - ) + dt = pd.to_datetime(v, format="ISO8601").as_unit(datetime_precision) + items2[k] = pd.arrays.ArrowExtensionArray(pa.array(dt)) elif k != "geometry": items2[k] = pd.arrays.ArrowExtensionArray(pa.array(v)) @@ -160,7 +164,9 @@ def to_geodataframe( elif dtype_backend == "numpy_nullable": for k, v in items2.items(): if k in DATETIME_COLUMNS: - items2[k] = pd.to_datetime(v, format="ISO8601") + items2[k] = pd.to_datetime(v, format="ISO8601").as_unit( + datetime_precision + ) if k in {"type", "stac_version", "id", "collection", SELF_LINK_COLUMN}: items2[k] = pd.array(v, dtype="string") diff --git a/tests/test_stac_geoparquet.py b/tests/test_stac_geoparquet.py index 3ddca61..b9e18b7 100644 --- a/tests/test_stac_geoparquet.py +++ b/tests/test_stac_geoparquet.py @@ -227,7 +227,7 @@ def test_assert_equal(): ], "collection": ["naip"], "gsd": [0.6], - "datetime": pd.to_datetime(["2019-08-28 00:00:00+0000"]), + "datetime": pd.to_datetime(["2019-08-28 00:00:00+0000"]).as_unit("us"), "naip:year": ["2019"], "proj:bbox": [[592596.0, 4663966.8, 598495.8, 4671633.0]], "proj:epsg": [26915], From 9c602199e1d990c02a467bb35f3a89f89b4b5800 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 29 Mar 2024 12:42:11 -0500 Subject: [PATCH 3/3] parameter for datetime precision --- stac_geoparquet/stac_geoparquet.py | 4 ++-- tests/test_stac_geoparquet.py | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/stac_geoparquet/stac_geoparquet.py b/stac_geoparquet/stac_geoparquet.py index 2efc299..74f1872 100644 --- a/stac_geoparquet/stac_geoparquet.py +++ b/stac_geoparquet/stac_geoparquet.py @@ -38,7 +38,7 @@ def to_geodataframe( items: Sequence[dict[str, Any]], add_self_link: bool = False, dtype_backend: DTYPE_BACKEND | None = None, - datetime_precision: str = "us", + datetime_precision: str = "ns", ) -> geopandas.GeoDataFrame: """ Convert a sequence of STAC items to a :class:`geopandas.GeoDataFrame`. @@ -87,7 +87,7 @@ def to_geodataframe( with fields ``{"href": "a.tif", "title", None}``. pyarrow will infer that the struct field ``asset.title`` is nullable. - datetime_precision: str, default "us" + datetime_precision: str, default "ns" The precision to use for the datetime columns. For example, "us" is microsecond and "ns" is nanosecond. 
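
Taken together, the series leaves ``to_geodataframe`` with the signature the
tests below exercise. A hedged usage sketch (``items`` is a placeholder for any
sequence of STAC item dicts):

    import stac_geoparquet

    gdf = stac_geoparquet.to_geodataframe(
        items,
        add_self_link=True,        # adds a "self_link" column with the item's self href
        dtype_backend="pyarrow",   # opt in to Arrow-backed columns; no FutureWarning
        datetime_precision="us",   # override the "ns" default set in this commit
    )
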
diff --git a/tests/test_stac_geoparquet.py b/tests/test_stac_geoparquet.py index b9e18b7..db6b978 100644 --- a/tests/test_stac_geoparquet.py +++ b/tests/test_stac_geoparquet.py @@ -227,7 +227,7 @@ def test_assert_equal(): ], "collection": ["naip"], "gsd": [0.6], - "datetime": pd.to_datetime(["2019-08-28 00:00:00+0000"]).as_unit("us"), + "datetime": pd.to_datetime(["2019-08-28 00:00:00+0000"]).as_unit("ns"), "naip:year": ["2019"], "proj:bbox": [[592596.0, 4663966.8, 598495.8, 4671633.0]], "proj:epsg": [26915], @@ -404,3 +404,18 @@ def test_mixed_date_format(): ] assert result["datetime"].tolist() == expected + + +@pytest.mark.parametrize("datetime_precision", ["us", "ns"]) +def test_datetime_precision(datetime_precision): + item = json.loads((HERE / "sentinel-2-item.json").read_text()) + item["properties"]["datetime"] = "2000-12-10T22:00:00.123456Z" + df = stac_geoparquet.to_geodataframe( + [item], dtype_backend="pyarrow", datetime_precision=datetime_precision + ) + result = df["datetime"].iloc[0] + expected = pd.Timestamp("2000-12-10 22:00:00.123456+0000", tz="UTC").as_unit( + datetime_precision + ) + assert result == expected + assert result.unit == datetime_precision
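
The new tests also pin down the round trip back to STAC items. A short sketch
under the same assumptions (``items`` as defined in ``test_to_dict_optional_asset``
earlier in the series):

    import stac_geoparquet

    df = stac_geoparquet.to_geodataframe(items, dtype_backend="pyarrow")
    ic = stac_geoparquet.to_item_collection(df)

    ic[0].assets["a"].to_dict()   # {"href": "a.txt"}, matching the input item
    "b" in ic[1].assets           # False -- the all-null asset struct is dropped by to_dict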