From a75cc149b0cdc842dc1310d18c4c72ed10426795 Mon Sep 17 00:00:00 2001 From: Chris Holmes Date: Tue, 26 Sep 2023 13:45:52 -0700 Subject: [PATCH 01/19] initial spec skeleton --- spec/stac-geoparquet-spec.md | 39 ++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 spec/stac-geoparquet-spec.md diff --git a/spec/stac-geoparquet-spec.md b/spec/stac-geoparquet-spec.md new file mode 100644 index 0000000..2a34301 --- /dev/null +++ b/spec/stac-geoparquet-spec.md @@ -0,0 +1,39 @@ +# STAC GeoParquet Specification + +## Overview + +This document specifies how to map a set of [STAC Items](https://github.com/radiantearth/stac-spec/tree/v1.0.0/item-spec) into +[GeoParquet](https://geoparquet.org). It is directly inspired by the [STAC GeoParquet](https://github.com/stac-utils/stac-geoparquet) +library, but aims to provide guidance for anyone putting STAC data into GeoParquet. + +## Guidelines + +Generally most all the fields in a STAC Item should be mapped to a row in GeoParquet. We embrace Parquet structures where possible, mapping +from JSON into nested structures. We do pull the properties to the top level, so that it is easier to query and use them. The names of the +most of the fields should be the same in STAC and in GeoParquet. + +| Field | GeoParquet Type | Required | Details | +| --------------- | ---------------- | ---------|--------------------------------------------------- | +| type | String | Optional | This is just needed for GeoJSON, so it is optional and not recommended to include in GeoParquet | +| stac_extensions | List of Strings | Required | This column is required, but can be blank if no STAC extensions were used | +| id | String | Required | Required, should be unique | +| geometry | Binary (WKB) | Required | For GeoParquet 1.0 this must be well-known Binary. | +| bbox | List of Decimals | Required | Can be 4 or 6 decimals, so won't be a fixed size list. | +| properties | per field | Required | Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field | +| links | List of structs | Required | Each struct in the array should have Strings of `href`, `rel` and `type` | +| assets | A struct of assets | Required | Each struct has each full asset key and object as a sub-struct, it's a direct mapping from the JSON to Parquet | +| collection | String | Required | The ID of the collection this Item is a part of | + + +* Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible. +* Strongly recommend to only have one GeoParquet per STAC 'Collection'. Not doing this will lead to an expanded GeoParquet schema (the union of all the schemas of the collection) with lots of empty data +* Any field in 'properties' should be moved up to be a top-level field in the GeoParquet. + +## Mapping to other geospatial data formats + +The principles here can likely be used to map into other geospatial data formats (GeoPackage, FlatGeobuf, etc), but we embrace Parquet's nested 'structs' for some of the mappings, so other formats will need to do something different. The obvious thing to do is to dump JSON into those fields, but that's outside the scope of this document, and we recommend creating a general document for that. + +## Use cases + +* Provide a STAC GeoParquet that mirrors a static Collection as a way to query the whole dataset instead of reading every specific GeoJSON file. 
+* As an output format for STAC API responses that is more efficient than paging through thousands of pages of GeoJSON. \ No newline at end of file From 1f9efaa6fa525e44072610b72b3f907ba9c48f34 Mon Sep 17 00:00:00 2001 From: Chris Holmes Date: Tue, 26 Sep 2023 13:48:57 -0700 Subject: [PATCH 02/19] spacing fixes --- spec/stac-geoparquet-spec.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spec/stac-geoparquet-spec.md b/spec/stac-geoparquet-spec.md index 2a34301..74087dd 100644 --- a/spec/stac-geoparquet-spec.md +++ b/spec/stac-geoparquet-spec.md @@ -14,15 +14,15 @@ most of the fields should be the same in STAC and in GeoParquet. | Field | GeoParquet Type | Required | Details | | --------------- | ---------------- | ---------|--------------------------------------------------- | -| type | String | Optional | This is just needed for GeoJSON, so it is optional and not recommended to include in GeoParquet | +| type | String | Optional | This is just needed for GeoJSON, so it is optional and not recommended to include in GeoParquet | | stac_extensions | List of Strings | Required | This column is required, but can be blank if no STAC extensions were used | | id | String | Required | Required, should be unique | -| geometry | Binary (WKB) | Required | For GeoParquet 1.0 this must be well-known Binary. | -| bbox | List of Decimals | Required | Can be 4 or 6 decimals, so won't be a fixed size list. | +| geometry | Binary (WKB) | Required | For GeoParquet 1.0 this must be well-known Binary. | +| bbox | List of Decimals | Required | Can be 4 or 6 decimals, so won't be a fixed size list. | | properties | per field | Required | Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field | | links | List of structs | Required | Each struct in the array should have Strings of `href`, `rel` and `type` | -| assets | A struct of assets | Required | Each struct has each full asset key and object as a sub-struct, it's a direct mapping from the JSON to Parquet | -| collection | String | Required | The ID of the collection this Item is a part of | +| assets | A struct of assets | Required | Each struct has each full asset key and object as a sub-struct, it's a direct mapping from the JSON to Parquet | +| collection | String | Required | The ID of the collection this Item is a part of | * Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible. From a834fe5bf2d5ff0e5bef00052b4c62be10ea0b0b Mon Sep 17 00:00:00 2001 From: Chris Holmes Date: Tue, 26 Sep 2023 13:50:42 -0700 Subject: [PATCH 03/19] more spacing fixes --- spec/stac-geoparquet-spec.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spec/stac-geoparquet-spec.md b/spec/stac-geoparquet-spec.md index 74087dd..dc3cb32 100644 --- a/spec/stac-geoparquet-spec.md +++ b/spec/stac-geoparquet-spec.md @@ -13,12 +13,12 @@ from JSON into nested structures. We do pull the properties to the top level, so most of the fields should be the same in STAC and in GeoParquet. 
| Field | GeoParquet Type | Required | Details | -| --------------- | ---------------- | ---------|--------------------------------------------------- | +| --------------- | ------------------ | ---------|--------------------------------------------------- | | type | String | Optional | This is just needed for GeoJSON, so it is optional and not recommended to include in GeoParquet | | stac_extensions | List of Strings | Required | This column is required, but can be blank if no STAC extensions were used | | id | String | Required | Required, should be unique | | geometry | Binary (WKB) | Required | For GeoParquet 1.0 this must be well-known Binary. | -| bbox | List of Decimals | Required | Can be 4 or 6 decimals, so won't be a fixed size list. | +| bbox | List of Decimals | Required | Can be 4 or 6 decimals, so won't be a fixed size list. | | properties | per field | Required | Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field | | links | List of structs | Required | Each struct in the array should have Strings of `href`, `rel` and `type` | | assets | A struct of assets | Required | Each struct has each full asset key and object as a sub-struct, it's a direct mapping from the JSON to Parquet | From ae9d966dbc86e207b12ae9bf6161808790c09d9e Mon Sep 17 00:00:00 2001 From: Chris Holmes Date: Wed, 27 Sep 2023 10:28:51 -0700 Subject: [PATCH 04/19] Update spec/stac-geoparquet-spec.md Co-authored-by: Tom Augspurger --- spec/stac-geoparquet-spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/stac-geoparquet-spec.md b/spec/stac-geoparquet-spec.md index dc3cb32..7657d84 100644 --- a/spec/stac-geoparquet-spec.md +++ b/spec/stac-geoparquet-spec.md @@ -8,7 +8,7 @@ library, but aims to provide guidance for anyone putting STAC data into GeoParqu ## Guidelines -Generally most all the fields in a STAC Item should be mapped to a row in GeoParquet. We embrace Parquet structures where possible, mapping +Generally most all the fields in a STAC Item should be mapped to a column in GeoParquet. We embrace Parquet structures where possible, mapping from JSON into nested structures. We do pull the properties to the top level, so that it is easier to query and use them. The names of the most of the fields should be the same in STAC and in GeoParquet. From 0af604b7896e0689a7c1646e66f81da25f8e1cf8 Mon Sep 17 00:00:00 2001 From: Chris Holmes Date: Wed, 27 Sep 2023 10:29:01 -0700 Subject: [PATCH 05/19] Update spec/stac-geoparquet-spec.md Co-authored-by: Tom Augspurger --- spec/stac-geoparquet-spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/stac-geoparquet-spec.md b/spec/stac-geoparquet-spec.md index 7657d84..35e4f86 100644 --- a/spec/stac-geoparquet-spec.md +++ b/spec/stac-geoparquet-spec.md @@ -9,7 +9,7 @@ library, but aims to provide guidance for anyone putting STAC data into GeoParqu ## Guidelines Generally most all the fields in a STAC Item should be mapped to a column in GeoParquet. We embrace Parquet structures where possible, mapping -from JSON into nested structures. We do pull the properties to the top level, so that it is easier to query and use them. The names of the +from JSON into nested structures. We do pull the properties to the top level, so that it is easier to query and use them. The names of most of the fields should be the same in STAC and in GeoParquet. 
| Field | GeoParquet Type | Required | Details | From 1121145853751a1686a01bbb198283cc480f7c4f Mon Sep 17 00:00:00 2001 From: Chris Holmes Date: Wed, 27 Sep 2023 10:33:51 -0700 Subject: [PATCH 06/19] Update spec/stac-geoparquet-spec.md Co-authored-by: Tom Augspurger --- spec/stac-geoparquet-spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/stac-geoparquet-spec.md b/spec/stac-geoparquet-spec.md index 35e4f86..473f5fd 100644 --- a/spec/stac-geoparquet-spec.md +++ b/spec/stac-geoparquet-spec.md @@ -15,7 +15,7 @@ most of the fields should be the same in STAC and in GeoParquet. | Field | GeoParquet Type | Required | Details | | --------------- | ------------------ | ---------|--------------------------------------------------- | | type | String | Optional | This is just needed for GeoJSON, so it is optional and not recommended to include in GeoParquet | -| stac_extensions | List of Strings | Required | This column is required, but can be blank if no STAC extensions were used | +| stac_extensions | List of Strings | Required | This column is required, but can be empty if no STAC extensions were used | | id | String | Required | Required, should be unique | | geometry | Binary (WKB) | Required | For GeoParquet 1.0 this must be well-known Binary. | | bbox | List of Decimals | Required | Can be 4 or 6 decimals, so won't be a fixed size list. | From db63c0386a1012bd7ddc58223d1f6fd0935dd6bd Mon Sep 17 00:00:00 2001 From: Chris Holmes Date: Thu, 28 Sep 2023 05:33:47 -0700 Subject: [PATCH 07/19] updates based on PR conversations --- spec/stac-geoparquet-spec.md | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/spec/stac-geoparquet-spec.md b/spec/stac-geoparquet-spec.md index 473f5fd..1e70e9d 100644 --- a/spec/stac-geoparquet-spec.md +++ b/spec/stac-geoparquet-spec.md @@ -16,18 +16,39 @@ most of the fields should be the same in STAC and in GeoParquet. | --------------- | ------------------ | ---------|--------------------------------------------------- | | type | String | Optional | This is just needed for GeoJSON, so it is optional and not recommended to include in GeoParquet | | stac_extensions | List of Strings | Required | This column is required, but can be empty if no STAC extensions were used | -| id | String | Required | Required, should be unique | +| id | String | Required | Required, should be unique within each collection | | geometry | Binary (WKB) | Required | For GeoParquet 1.0 this must be well-known Binary. | -| bbox | List of Decimals | Required | Can be 4 or 6 decimals, so won't be a fixed size list. | +| bbox | Struct of Floats | Required | Can be a 4 or 6 value struct, depending on dimension of the data | | properties | per field | Required | Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field | -| links | List of structs | Required | Each struct in the array should have Strings of `href`, `rel` and `type` | -| assets | A struct of assets | Required | Each struct has each full asset key and object as a sub-struct, it's a direct mapping from the JSON to Parquet | +| links | List of Link structs | Required | See [Link Struct](#link-struct) for more info | +| assets | An Assets struct | Required | See [Asset Struct](#asset-struct) for more info | | collection | String | Required | The ID of the collection this Item is a part of | * Must be valid GeoParquet, with proper metadata. 
Ideally the geometry types are defined and as narrow as possible. * Strongly recommend to only have one GeoParquet per STAC 'Collection'. Not doing this will lead to an expanded GeoParquet schema (the union of all the schemas of the collection) with lots of empty data -* Any field in 'properties' should be moved up to be a top-level field in the GeoParquet. +* Any field in 'properties' should be moved up to be a top-level field in the GeoParquet. +* STAC GeoParquet does not support properties that are named such that they collide with a top-level key. +* datetime columns should be stored as a native timestamp, not as a string +* The Collection JSON should be included in the Parquet metadata (TODO: flesh this out more) + +### Link Struct + +Each Link Struct has 2 required fields and 2 optional ones: + +| Field Name | Type | Description | +| ---------- | ------ | ----------- | +| href | string | **REQUIRED.** The actual link in the format of an URL. Relative and absolute links are both allowed. | +| rel | string | **REQUIRED.** Relationship between the current document and the linked document. See chapter "Relation types" for more information. | +| type | string | [Media type](../catalog-spec/catalog-spec.md#media-types) of the referenced entity. | +| title | string | A human readable title to be used in rendered displays of the link. | + + +### Asset Struct + +TODO: Explain this more, and how it works best if it's just one collection. + +Each struct has each full asset key and object as a sub-struct, it's a direct mapping from the JSON to Parquet ## Mapping to other geospatial data formats From 9f3cffff7ddb782ec7954292a4860e755e6dbf9a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 7 Oct 2023 09:07:30 -0500 Subject: [PATCH 08/19] Fixups * wording * links to STAC spec * Added Asset Structoo * formatting --- spec/stac-geoparquet-spec.md | 65 +++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/spec/stac-geoparquet-spec.md b/spec/stac-geoparquet-spec.md index 1e70e9d..c1d04b6 100644 --- a/spec/stac-geoparquet-spec.md +++ b/spec/stac-geoparquet-spec.md @@ -8,48 +8,60 @@ library, but aims to provide guidance for anyone putting STAC data into GeoParqu ## Guidelines -Generally most all the fields in a STAC Item should be mapped to a column in GeoParquet. We embrace Parquet structures where possible, mapping +Each row in the Parquet Dataset represents a single STAC item. Most all the fields in a STAC Item should be mapped to a column in GeoParquet. We embrace Parquet structures where possible, mapping from JSON into nested structures. We do pull the properties to the top level, so that it is easier to query and use them. The names of most of the fields should be the same in STAC and in GeoParquet. -| Field | GeoParquet Type | Required | Details | -| --------------- | ------------------ | ---------|--------------------------------------------------- | -| type | String | Optional | This is just needed for GeoJSON, so it is optional and not recommended to include in GeoParquet | -| stac_extensions | List of Strings | Required | This column is required, but can be empty if no STAC extensions were used | -| id | String | Required | Required, should be unique within each collection | -| geometry | Binary (WKB) | Required | For GeoParquet 1.0 this must be well-known Binary. 
| -| bbox | Struct of Floats | Required | Can be a 4 or 6 value struct, depending on dimension of the data | -| properties | per field | Required | Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field | -| links | List of Link structs | Required | See [Link Struct](#link-struct) for more info | -| assets | An Assets struct | Required | See [Asset Struct](#asset-struct) for more info | -| collection | String | Required | The ID of the collection this Item is a part of | - +| Field | GeoParquet Type | Required | Details | +|--------------------|----------------------|----------|--------------------------------------------------------------------------------------------------------------------------------| +| type | String | Optional | This is just needed for GeoJSON, so it is optional and not recommended to include in GeoParquet | +| stac_extensions | List of Strings | Required | This column is required, but can be empty if no STAC extensions were used | +| id | String | Required | Required, should be unique within each collection | +| geometry | Binary (WKB) | Required | For GeoParquet 1.0 this must be well-known Binary | +| bbox | Struct of Floats | Required | Can be a 4 or 6 value struct, depending on dimension of the data | +| links | List of Link structs | Required | See [Link Struct](#link-struct) for more info | +| assets | An Assets struct | Required | See [Asset Struct](#asset-struct) for more info | +| collection | String | Required | The ID of the collection this Item is a part of | +| *property columns* | *varies* | - | Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field | * Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible. * Strongly recommend to only have one GeoParquet per STAC 'Collection'. Not doing this will lead to an expanded GeoParquet schema (the union of all the schemas of the collection) with lots of empty data -* Any field in 'properties' should be moved up to be a top-level field in the GeoParquet. +* Any field in 'properties' of the STAC item should be moved up to be a top-level field in the GeoParquet. * STAC GeoParquet does not support properties that are named such that they collide with a top-level key. -* datetime columns should be stored as a native timestamp, not as a string +* datetime columns should be stored as a [native timestamp][timestamp], not as a string * The Collection JSON should be included in the Parquet metadata (TODO: flesh this out more) ### Link Struct -Each Link Struct has 2 required fields and 2 optional ones: +The GeoParquet dataset can contain zero or more Link Structs. Each Link Struct has 2 required fields and 2 optional ones: -| Field Name | Type | Description | -| ---------- | ------ | ----------- | -| href | string | **REQUIRED.** The actual link in the format of an URL. Relative and absolute links are both allowed. | +| Field Name | Type | Description | +|------------|--------|-------------------------------------------------------------------------------------------------------------------------------------| +| href | string | **REQUIRED.** The actual link in the format of an URL. Relative and absolute links are both allowed. | | rel | string | **REQUIRED.** Relationship between the current document and the linked document. See chapter "Relation types" for more information. 
| -| type | string | [Media type](../catalog-spec/catalog-spec.md#media-types) of the referenced entity. | -| title | string | A human readable title to be used in rendered displays of the link. | +| type | string | [Media type][media-type] of the referenced entity. | +| title | string | A human readable title to be used in rendered displays of the link. | +See [Link Object][link] for more. ### Asset Struct -TODO: Explain this more, and how it works best if it's just one collection. +The GeoParquet dataset can contain zero or more Asset Structs. Each Asset Struct can have the following fields: + +| Field Name | Type | Description | +|-------------|-----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| href | string | **REQUIRED.** URI to the asset object. Relative and absolute URI are both allowed. | +| title | string | The displayed title for clients and users. | +| description | string | A description of the Asset providing additional details, such as how it was processed or created. [CommonMark 0.29](http://commonmark.org/) syntax MAY be used for rich text representation. | +| type | string | [Media type][media-type] of the asset. See the [common media types][common-media-types] in the best practice doc for commonly used asset types. | +| roles | \[string] | The [semantic roles][asset-roles] of the asset, similar to the use of `rel` in links. | Each struct has each full asset key and object as a sub-struct, it's a direct mapping from the JSON to Parquet +To take advantage of Parquet's columnar nature and compression, the assets should be uniform so they can be represented by a simple schema, which in turn means every item should probably come from the same STAC collection. + +See [Asset Object][asset] for more. + ## Mapping to other geospatial data formats The principles here can likely be used to map into other geospatial data formats (GeoPackage, FlatGeobuf, etc), but we embrace Parquet's nested 'structs' for some of the mappings, so other formats will need to do something different. The obvious thing to do is to dump JSON into those fields, but that's outside the scope of this document, and we recommend creating a general document for that. @@ -57,4 +69,11 @@ The principles here can likely be used to map into other geospatial data formats ## Use cases * Provide a STAC GeoParquet that mirrors a static Collection as a way to query the whole dataset instead of reading every specific GeoJSON file. -* As an output format for STAC API responses that is more efficient than paging through thousands of pages of GeoJSON. \ No newline at end of file +* As an output format for STAC API responses that is more efficient than paging through thousands of pages of GeoJSON. 
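
As an illustration of these use cases, the sketch below reads a stac-geoparquet mirror of a collection with GeoPandas and filters it in one pass, instead of fetching thousands of individual GeoJSON Items. This is a minimal sketch only; the file name and column names are assumptions for the example, not part of this spec.

```python
import pandas as pd
import geopandas

# Read the collection mirror (optionally just the columns of interest) in one call.
# "naip.parquet" and the column names below are placeholders for a real collection.
gdf = geopandas.read_parquet(
    "naip.parquet",
    columns=["id", "geometry", "datetime", "gsd"],
)

# Filter across the full dataset without touching any individual GeoJSON file.
recent = gdf[gdf["datetime"] >= pd.Timestamp("2019-01-01", tz="UTC")]
print(recent[["id", "datetime"]].head())
```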
+ +[media-type]: https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#asset-media-type +[asset]: https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#asset-object +[asset-roles]: https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#asset-roles +[link]: https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#link-object +[common-media-types]: https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#common-media-types-in-stac +[timestamp]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp \ No newline at end of file From b0d9a6a8cdb73cdba61df53c7eda9adb468fbda4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 15 Oct 2023 06:19:57 -0500 Subject: [PATCH 09/19] Updates * Move Use Cases to the Top * Added section on Collection JSON * Added note on accessing fields --- spec/stac-geoparquet-spec.md | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/spec/stac-geoparquet-spec.md b/spec/stac-geoparquet-spec.md index c1d04b6..48d235f 100644 --- a/spec/stac-geoparquet-spec.md +++ b/spec/stac-geoparquet-spec.md @@ -6,6 +6,12 @@ This document specifies how to map a set of [STAC Items](https://github.com/radi [GeoParquet](https://geoparquet.org). It is directly inspired by the [STAC GeoParquet](https://github.com/stac-utils/stac-geoparquet) library, but aims to provide guidance for anyone putting STAC data into GeoParquet. +## Use cases + +* Provide a STAC GeoParquet that mirrors a static Collection as a way to query the whole dataset instead of reading every specific GeoJSON file. +* As an output format for STAC API responses that is more efficient than paging through thousands of pages of GeoJSON. +* Provide efficient access to specific fields of a STAC item, thanks to Parquet's columnar format. + ## Guidelines Each row in the Parquet Dataset represents a single STAC item. Most all the fields in a STAC Item should be mapped to a column in GeoParquet. We embrace Parquet structures where possible, mapping @@ -29,7 +35,7 @@ most of the fields should be the same in STAC and in GeoParquet. * Any field in 'properties' of the STAC item should be moved up to be a top-level field in the GeoParquet. * STAC GeoParquet does not support properties that are named such that they collide with a top-level key. * datetime columns should be stored as a [native timestamp][timestamp], not as a string -* The Collection JSON should be included in the Parquet metadata (TODO: flesh this out more) +* The Collection JSON should be included in the Parquet metadata. See [Collection JSON](#collection-json) below. ### Link Struct @@ -62,18 +68,21 @@ To take advantage of Parquet's columnar nature and compression, the assets shoul See [Asset Object][asset] for more. -## Mapping to other geospatial data formats +## Collection JSON -The principles here can likely be used to map into other geospatial data formats (GeoPackage, FlatGeobuf, etc), but we embrace Parquet's nested 'structs' for some of the mappings, so other formats will need to do something different. The obvious thing to do is to dump JSON into those fields, but that's outside the scope of this document, and we recommend creating a general document for that. +To make a stac-geoparquet file a fully self-contained representation, you can +include the Collection JSON in the Parquet metadata. 
If present in the [Parquet +file metadata][parquet-metadata], the key must be `stac:collection` and the +value must be a JSON string with the Collection JSON. -## Use cases +## Mapping to other geospatial data formats -* Provide a STAC GeoParquet that mirrors a static Collection as a way to query the whole dataset instead of reading every specific GeoJSON file. -* As an output format for STAC API responses that is more efficient than paging through thousands of pages of GeoJSON. +The principles here can likely be used to map into other geospatial data formats (GeoPackage, FlatGeobuf, etc), but we embrace Parquet's nested 'structs' for some of the mappings, so other formats will need to do something different. The obvious thing to do is to dump JSON into those fields, but that's outside the scope of this document, and we recommend creating a general document for that. [media-type]: https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#asset-media-type [asset]: https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#asset-object [asset-roles]: https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#asset-roles [link]: https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#link-object [common-media-types]: https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#common-media-types-in-stac -[timestamp]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp \ No newline at end of file +[timestamp]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp +[parquet-metadata]: https://github.com/apache/parquet-format#metadata \ No newline at end of file From e3471ec86acba97a6c4736d0c790f2ce17031ea6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 9 Nov 2023 21:27:31 -0600 Subject: [PATCH 10/19] Update note on collection, removed type --- spec/stac-geoparquet-spec.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spec/stac-geoparquet-spec.md b/spec/stac-geoparquet-spec.md index 48d235f..3179011 100644 --- a/spec/stac-geoparquet-spec.md +++ b/spec/stac-geoparquet-spec.md @@ -27,7 +27,7 @@ most of the fields should be the same in STAC and in GeoParquet. | bbox | Struct of Floats | Required | Can be a 4 or 6 value struct, depending on dimension of the data | | links | List of Link structs | Required | See [Link Struct](#link-struct) for more info | | assets | An Assets struct | Required | See [Asset Struct](#asset-struct) for more info | -| collection | String | Required | The ID of the collection this Item is a part of | +| collection | String | Optional | The ID of the collection this Item is a part of. See notes below on 'Collection' and 'Collection JSON' in the Parquet metadata | | *property columns* | *varies* | - | Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field | * Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible. 
@@ -85,4 +85,4 @@ The principles here can likely be used to map into other geospatial data formats [link]: https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#link-object [common-media-types]: https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#common-media-types-in-stac [timestamp]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp -[parquet-metadata]: https://github.com/apache/parquet-format#metadata \ No newline at end of file +[parquet-metadata]: https://github.com/apache/parquet-format#metadata From ed2c5589ac73a41817db92aafd7a94499aca50be Mon Sep 17 00:00:00 2001 From: Ryan Avery Date: Tue, 30 Jan 2024 15:16:22 -0800 Subject: [PATCH 11/19] guidance on referencing geoparquet in a STAC Collection JSON --- spec/stac-geoparquet-spec.md | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/spec/stac-geoparquet-spec.md b/spec/stac-geoparquet-spec.md index 3179011..91bc5cf 100644 --- a/spec/stac-geoparquet-spec.md +++ b/spec/stac-geoparquet-spec.md @@ -68,13 +68,32 @@ To take advantage of Parquet's columnar nature and compression, the assets shoul See [Asset Object][asset] for more. -## Collection JSON +## Including a STAC Collection JSON in a STAC Geoparquet Collection To make a stac-geoparquet file a fully self-contained representation, you can include the Collection JSON in the Parquet metadata. If present in the [Parquet file metadata][parquet-metadata], the key must be `stac:collection` and the value must be a JSON string with the Collection JSON. +## Referencing a STAC Geoparquet Collections in a STAC Collection JSON + +A common use case of stac-geoparquet is to create a mirror of a STAC collection. To refer to this mirror in the original collection, use an [Asset Object](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md#asset-object) at the collection level of the STAC JSON that includes the `application/vnd.apache.parquet` MIME type and `collection-mirror` Role type to describe the function of the Geoparquet STAC Collection Asset. + +For example: + +| Field Name | Type | Value | +| ----------- | --------- | ----------- | +| href | string | s3://example/uti/to/file.geoparquet | +| title | string | An example STAC geoparquet. | +| description | string | Example description. | +| type | string | application/vnd.apache.parquet | +| roles | \[string] | [collection-mirror]* | + +*Note the IANA has not approved the new MIME type `application/vnd.apache.parquet` yet, it's been (submitted for approval)[https://issues.apache.org/jira/browse/PARQUET-1889]. + +The description should ideally include details about the spatial partitioning method. + + ## Mapping to other geospatial data formats The principles here can likely be used to map into other geospatial data formats (GeoPackage, FlatGeobuf, etc), but we embrace Parquet's nested 'structs' for some of the mappings, so other formats will need to do something different. The obvious thing to do is to dump JSON into those fields, but that's outside the scope of this document, and we recommend creating a general document for that. 
From 19a2141057476c46cde24c633cdaa7b61964a979 Mon Sep 17 00:00:00 2001 From: Ryan Avery Date: Tue, 30 Jan 2024 16:09:58 -0800 Subject: [PATCH 12/19] format table --- spec/stac-geoparquet-spec.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spec/stac-geoparquet-spec.md b/spec/stac-geoparquet-spec.md index 91bc5cf..9b054bb 100644 --- a/spec/stac-geoparquet-spec.md +++ b/spec/stac-geoparquet-spec.md @@ -81,13 +81,13 @@ A common use case of stac-geoparquet is to create a mirror of a STAC collection. For example: -| Field Name | Type | Value | -| ----------- | --------- | ----------- | +| Field Name | Type | Value | +|-------------|-----------|-------------------------------------| | href | string | s3://example/uti/to/file.geoparquet | -| title | string | An example STAC geoparquet. | -| description | string | Example description. | -| type | string | application/vnd.apache.parquet | -| roles | \[string] | [collection-mirror]* | +| title | string | An example STAC geoparquet. | +| description | string | Example description. | +| type | string | application/vnd.apache.parquet | +| roles | \[string] | [collection-mirror]* | *Note the IANA has not approved the new MIME type `application/vnd.apache.parquet` yet, it's been (submitted for approval)[https://issues.apache.org/jira/browse/PARQUET-1889]. From 7cac0b08c06bff8773a49f7d4dd420ea777d965a Mon Sep 17 00:00:00 2001 From: Ryan Avery Date: Tue, 6 Feb 2024 10:36:20 -0800 Subject: [PATCH 13/19] MIME -> Media --- spec/stac-geoparquet-spec.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spec/stac-geoparquet-spec.md b/spec/stac-geoparquet-spec.md index 9b054bb..5ae7951 100644 --- a/spec/stac-geoparquet-spec.md +++ b/spec/stac-geoparquet-spec.md @@ -4,7 +4,7 @@ This document specifies how to map a set of [STAC Items](https://github.com/radiantearth/stac-spec/tree/v1.0.0/item-spec) into [GeoParquet](https://geoparquet.org). It is directly inspired by the [STAC GeoParquet](https://github.com/stac-utils/stac-geoparquet) -library, but aims to provide guidance for anyone putting STAC data into GeoParquet. +library, but aims to provide guidance for anyone putting STAC data into GeoParquet. ## Use cases @@ -32,7 +32,7 @@ most of the fields should be the same in STAC and in GeoParquet. * Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible. * Strongly recommend to only have one GeoParquet per STAC 'Collection'. Not doing this will lead to an expanded GeoParquet schema (the union of all the schemas of the collection) with lots of empty data -* Any field in 'properties' of the STAC item should be moved up to be a top-level field in the GeoParquet. +* Any field in 'properties' of the STAC item should be moved up to be a top-level field in the GeoParquet. * STAC GeoParquet does not support properties that are named such that they collide with a top-level key. * datetime columns should be stored as a [native timestamp][timestamp], not as a string * The Collection JSON should be included in the Parquet metadata. See [Collection JSON](#collection-json) below. @@ -77,7 +77,7 @@ value must be a JSON string with the Collection JSON. ## Referencing a STAC Geoparquet Collections in a STAC Collection JSON -A common use case of stac-geoparquet is to create a mirror of a STAC collection. 
To refer to this mirror in the original collection, use an [Asset Object](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md#asset-object) at the collection level of the STAC JSON that includes the `application/vnd.apache.parquet` MIME type and `collection-mirror` Role type to describe the function of the Geoparquet STAC Collection Asset. +A common use case of stac-geoparquet is to create a mirror of a STAC collection. To refer to this mirror in the original collection, use an [Asset Object](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md#asset-object) at the collection level of the STAC JSON that includes the `application/vnd.apache.parquet` Media type and `collection-mirror` Role type to describe the function of the Geoparquet STAC Collection Asset. For example: @@ -89,7 +89,7 @@ For example: | type | string | application/vnd.apache.parquet | | roles | \[string] | [collection-mirror]* | -*Note the IANA has not approved the new MIME type `application/vnd.apache.parquet` yet, it's been (submitted for approval)[https://issues.apache.org/jira/browse/PARQUET-1889]. +*Note the IANA has not approved the new Media type `application/vnd.apache.parquet` yet, it's been (submitted for approval)[https://issues.apache.org/jira/browse/PARQUET-1889]. The description should ideally include details about the spatial partitioning method. From fb798f4cac590bd4520b513e170963d40dc952f4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 3 Mar 2024 11:22:07 -0600 Subject: [PATCH 14/19] Optionally use pyarrow types in to_geodataframe This updates to_geodataframe to optionally use pyarrow types, rather than NumPy. These types let us faithfully represent the actual nested types, rather than casting everything to `object`. --- stac_geoparquet/stac_geoparquet.py | 147 ++++++++++++---- stac_geoparquet/utils.py | 44 ++++- tests/test_pgstac_reader.py | 4 +- tests/test_stac_geoparquet.py | 260 ++++++++++++++++------------- tests/test_to_dict.py | 31 ++++ 5 files changed, 327 insertions(+), 159 deletions(-) diff --git a/stac_geoparquet/stac_geoparquet.py b/stac_geoparquet/stac_geoparquet.py index b42cce4..0974547 100644 --- a/stac_geoparquet/stac_geoparquet.py +++ b/stac_geoparquet/stac_geoparquet.py @@ -1,13 +1,17 @@ """ Generate geoparquet from a sequence of STAC items. """ + from __future__ import annotations +import collections -from typing import Sequence, Any +from typing import Sequence, Any, Literal +import warnings import pystac import geopandas import pandas as pd +import pyarrow as pa import numpy as np import shapely.geometry @@ -16,7 +20,7 @@ from stac_geoparquet.utils import fix_empty_multipolygon STAC_ITEM_TYPES = ["application/json", "application/geo+json"] - +DTYPE_BACKEND = Literal["numpy_nullable", "pyarrow"] SELF_LINK_COLUMN = "self_link" @@ -31,7 +35,9 @@ def _fix_array(v): def to_geodataframe( - items: Sequence[dict[str, Any]], add_self_link: bool = False + items: Sequence[dict[str, Any]], + add_self_link: bool = False, + dtype_backend: DTYPE_BACKEND | None = None, ) -> geopandas.GeoDataFrame: """ Convert a sequence of STAC items to a :class:`geopandas.GeoDataFrame`. @@ -42,19 +48,68 @@ def to_geodataframe( Parameters ---------- items: A sequence of STAC items. 
- add_self_link: Add the absolute link (if available) to the source STAC Item as a separate column named "self_link" + add_self_link: bool, default False + Add the absolute link (if available) to the source STAC Item + as a separate column named "self_link" + dtype_backend: {'pyarrow', 'numpy_nullable'}, optional + The dtype backend to use for storing arrays. + + By default, this will use 'numpy_nullable' and emit a + FutureWarning that the default will change to 'pyarrow' in + the next release. + + Set to 'numpy_nullable' to silence the warning and accept the + old behavior. + + Set to 'pyarrow' to silence the warning and accept the new behavior. + + There are some difference in the output as well: with + ``dtype_backend="pyarrow"``, struct-like fields will explicitly + contain null values for fields that appear in only some of the + records. For example, given an ``assets`` like:: + + { + "a": { + "href": "a.tif", + }, + "b": { + "href": "b.tif", + "title": "B", + } + } + + The ``assets`` field of the output for the first row with + ``dtype_backend="numpy_nullable"`` will be a Python dictionary with + just ``{"href": "a.tiff"}``. + + With ``dtype_backend="pyarrow"``, this will be a pyarrow struct + with fields ``{"href": "a.tif", "title", None}``. pyarrow will + infer that the struct field ``asset.title`` is nullable. Returns ------- The converted GeoDataFrame. """ - items2 = [] + items2 = collections.defaultdict(list) + for item in items: - item2 = {k: v for k, v in item.items() if k != "properties"} + keys = set(item) - {"properties", "geometry"} + + for k in keys: + items2[k].append(item[k]) + + item_geometry = item["geometry"] + if item_geometry: + item_geometry = fix_empty_multipolygon(item_geometry) + + items2["geometry"].append(item_geometry) + for k, v in item["properties"].items(): - if k in item2: - raise ValueError("k", k) - item2[k] = v + if k in item: + msg = f"Key '{k}' appears in both 'properties' and the top level." + raise ValueError(msg) + items2[k].append(v) + if add_self_link: self_href = None for link in item["links"]: @@ -65,23 +120,11 @@ def to_geodataframe( ): self_href = link["href"] break - item2[SELF_LINK_COLUMN] = self_href - items2.append(item2) - - # Filter out missing geoms in MultiPolygons - # https://github.com/shapely/shapely/issues/1407 - # geometry = [shapely.geometry.shape(x["geometry"]) for x in items2] - - geometry = [] - for item2 in items2: - item_geometry = item2["geometry"] - if item_geometry: - item_geometry = fix_empty_multipolygon(item_geometry) # type: ignore - geometry.append(item_geometry) - - gdf = geopandas.GeoDataFrame(items2, geometry=geometry, crs="WGS84") + items2[SELF_LINK_COLUMN].append(self_href) - for column in [ + # TODO: Ideally we wouldn't have to hard-code this list. + # Could we get it from the JSON schema. + DATETIME_COLUMNS = { "datetime", # common metadata "start_datetime", "end_datetime", @@ -90,9 +133,42 @@ def to_geodataframe( "expires", # timestamps extension "published", "unpublished", - ]: - if column in gdf.columns: - gdf[column] = pd.to_datetime(gdf[column], format="ISO8601") + } + + items2["geometry"] = geopandas.array.from_shapely(items2["geometry"]) + + if dtype_backend is None: + msg = ( + "The default argument for 'dtype_backend' will change from " + "'numpy_nullable' to 'pyarrow'. To keep the previous default " + "specify ``dtype_backend='numpy_nullable'``. To accept the future " + "behavior specify ``dtype_backend='pyarrow'." 
+ ) + warnings.warn(FutureWarning(msg)) + dtype_backend = "numpy_nullable" + + if dtype_backend == "pyarrow": + for k, v in items2.items(): + if k in DATETIME_COLUMNS: + items2[k] = pd.arrays.ArrowExtensionArray( + pa.array(pd.to_datetime(v, format="ISO8601")) + ) + + elif k != "geometry": + items2[k] = pd.arrays.ArrowExtensionArray(pa.array(v)) + + elif dtype_backend == "numpy_nullable": + for k, v in items2.items(): + if k in DATETIME_COLUMNS: + items2[k] = pd.to_datetime(v, format="ISO8601") + + if k in {"type", "stac_version", "id", "collection", SELF_LINK_COLUMN}: + items2[k] = pd.array(v, dtype="string") + else: + msg = f"Invalid 'dtype_backend={dtype_backend}'." + raise TypeError(msg) + + gdf = geopandas.GeoDataFrame(items2, geometry="geometry", crs="WGS84") columns = [ "type", @@ -111,10 +187,6 @@ def to_geodataframe( columns.remove(col) gdf = pd.concat([gdf[columns], gdf.drop(columns=columns)], axis="columns") - for k in ["type", "stac_version", "id", "collection", SELF_LINK_COLUMN]: - if k in gdf: - gdf[k] = gdf[k].astype("string") - return gdf @@ -144,12 +216,16 @@ def to_dict(record: dict) -> dict: if k == SELF_LINK_COLUMN: continue + elif k == "assets": + item[k] = {k2: v2 for k2, v2 in v.items() if v2 is not None} elif k in top_level_keys: item[k] = v else: properties[k] = v - item["geometry"] = shapely.geometry.mapping(item["geometry"]) + if item["geometry"]: + item["geometry"] = shapely.geometry.mapping(item["geometry"]) + item["properties"] = properties return item @@ -175,6 +251,11 @@ def to_item_collection(df: geopandas.GeoDataFrame) -> pystac.ItemCollection: include=["datetime64[ns, UTC]", "datetime64[ns]"] ).columns for k in datelike: + # %f isn't implemented in pyarrow + # https://github.com/apache/arrow/issues/20146 + if isinstance(df2[k].dtype, pd.ArrowDtype): + df2[k] = df2[k].astype("datetime64[ns, utc]") + df2[k] = ( df2[k].dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ").fillna("").replace({"": None}) ) diff --git a/stac_geoparquet/utils.py b/stac_geoparquet/utils.py index 6c912b1..68e4eba 100644 --- a/stac_geoparquet/utils.py +++ b/stac_geoparquet/utils.py @@ -8,23 +8,27 @@ @functools.singledispatch -def assert_equal(result: Any, expected: Any) -> bool: +def assert_equal(result: Any, expected: Any, ignore_none: bool = False) -> bool: raise TypeError(f"Invalid type {type(result)}") @assert_equal.register(pystac.ItemCollection) def assert_equal_ic( - result: pystac.ItemCollection, expected: pystac.ItemCollection + result: pystac.ItemCollection, + expected: pystac.ItemCollection, + ignore_none: bool = False, ) -> None: assert type(result) == type(expected) assert len(result) == len(expected) assert result.extra_fields == expected.extra_fields for a, b in zip(result.items, expected.items): - assert_equal(a, b) + assert_equal(a, b, ignore_none=ignore_none) @assert_equal.register(pystac.Item) -def assert_equal_item(result: pystac.Item, expected: pystac.Item) -> None: +def assert_equal_item( + result: pystac.Item, expected: pystac.Item, ignore_none: bool = False +) -> None: assert type(result) == type(expected) assert result.id == expected.id assert shapely.geometry.shape(result.geometry) == shapely.geometry.shape( @@ -41,20 +45,44 @@ def assert_equal_item(result: pystac.Item, expected: pystac.Item) -> None: expected_links = sorted(expected.links, key=lambda x: x.href) assert len(result_links) == len(expected_links) for a, b in zip(result_links, expected_links): - assert_equal(a, b) + assert_equal(a, b, ignore_none=ignore_none) assert set(result.assets) == set(expected.assets) 
for k in result.assets: - assert_equal(result.assets[k], expected.assets[k]) + assert_equal(result.assets[k], expected.assets[k], ignore_none=ignore_none) @assert_equal.register(pystac.Link) @assert_equal.register(pystac.Asset) def assert_link_equal( - result: pystac.Link | pystac.Asset, expected: pystac.Link | pystac.Asset + result: pystac.Link | pystac.Asset, + expected: pystac.Link | pystac.Asset, + ignore_none: bool = False, ) -> None: assert type(result) == type(expected) - assert result.to_dict() == expected.to_dict() + resultd = result.to_dict() + expectedd = expected.to_dict() + + left = {} + + if ignore_none: + for k, v in resultd.items(): + if v is None and k not in expectedd: + pass + elif isinstance(v, list) and k in expectedd: + out = [] + for val in v: + if isinstance(val, dict): + out.append({k: v2 for k, v2 in val.items() if v2 is not None}) + else: + out.append(val) + left[k] = out + else: + left[k] = v + else: + left = resultd + + assert left == expectedd def fix_empty_multipolygon( diff --git a/tests/test_pgstac_reader.py b/tests/test_pgstac_reader.py index 9cdc8cc..d921a6e 100644 --- a/tests/test_pgstac_reader.py +++ b/tests/test_pgstac_reader.py @@ -113,7 +113,7 @@ def test_naip_item(): expected.remove_links(rel=pystac.RelType.SELF) result.remove_links(rel=pystac.RelType.SELF) - assert_equal(result, expected) + assert_equal(result, expected, ignore_none=True) def test_sentinel2_l2a(): @@ -139,7 +139,7 @@ def test_sentinel2_l2a(): result.remove_links(rel=pystac.RelType.SELF) expected.remove_links(rel=pystac.RelType.LICENSE) - assert_equal(result, expected) + assert_equal(result, expected, ignore_none=True) def test_generate_endpoints(): diff --git a/tests/test_stac_geoparquet.py b/tests/test_stac_geoparquet.py index 10c484c..3ddca61 100644 --- a/tests/test_stac_geoparquet.py +++ b/tests/test_stac_geoparquet.py @@ -5,6 +5,7 @@ import shapely.geometry import pandas as pd import pandas.testing +import pyarrow as pa import pystac import geopandas import requests @@ -63,9 +64,9 @@ def test_assert_equal(): "roles": ["data"], "title": "RGBIR COG tile", "eo:bands": [ - {"name": "Red", "common_name": "red"}, - {"name": "Green", "common_name": "green"}, - {"name": "Blue", "common_name": "blue"}, + {"name": "Red", "common_name": "red", "description": "Red"}, + {"name": "Green", "common_name": "green", "description": "Green"}, + {"name": "Blue", "common_name": "blue", "description": "Blue"}, {"name": "NIR", "common_name": "nir", "description": "near-infrared"}, ], }, @@ -125,111 +126,131 @@ def test_assert_equal(): "stac_version": "1.0.0", } -EXPECTED_GDF = { - "type": {0: "Feature"}, - "stac_version": {0: "1.0.0"}, - "stac_extensions": { - 0: [ - "https://stac-extensions.github.io/eo/v1.0.0/schema.json", - "https://stac-extensions.github.io/projection/v1.0.0/schema.json", - ] - }, - "id": {0: "ia_m_4209150_sw_15_060_20190828_20191105"}, - "geometry": {0: shapely.geometry.shape(ITEM["geometry"])}, - "bbox": {0: [-91.879788, 42.121621, -91.807132, 42.191372]}, - "links": { - 0: [ - { - "rel": "collection", - "type": "application/json", - "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", - }, - { - "rel": "parent", - "type": "application/json", - "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", - }, - { - "rel": "root", - "type": "application/json", - "href": "https://planetarycomputer.microsoft.com/api/stac/v1/", - }, - { - "rel": "self", - "type": "application/geo+json", - "href": ITEM_SELF_HREF, - }, + +EXPECTED_GDF = 
geopandas.GeoDataFrame( + { + "type": ["Feature"], + "stac_version": ["1.0.0"], + "stac_extensions": [ + [ + "https://stac-extensions.github.io/eo/v1.0.0/schema.json", + "https://stac-extensions.github.io/projection/v1.0.0/schema.json", + ] + ], + "id": ["ia_m_4209150_sw_15_060_20190828_20191105"], + "geometry": geopandas.array.from_shapely( + [shapely.geometry.shape(ITEM["geometry"])] + ), + "bbox": [[-91.879788, 42.121621, -91.807132, 42.191372]], + "links": [ + [ + { + "rel": "collection", + "type": "application/json", + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", + }, + { + "rel": "parent", + "type": "application/json", + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", + }, + { + "rel": "root", + "type": "application/json", + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/", + }, + { + "rel": "self", + "type": "application/geo+json", + "href": ITEM_SELF_HREF, + }, + { + "rel": "preview", + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/map?collection=naip&item=ia_m_4209150_sw_15_060_20190828_20191105", # noqa: E501 + "title": "Map of item", + "type": "text/html", + }, + ] + ], + "assets": [ { - "rel": "preview", - "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/map?collection=naip&item=ia_m_4209150_sw_15_060_20190828_20191105", # noqa: E501 - "title": "Map of item", - "type": "text/html", - }, - ] - }, - "assets": { - 0: { - "image": { - "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ia/2019/ia_60cm_2019/42091/m_4209150_sw_15_060_20190828.tif", # noqa: E501 - "type": "image/tiff; application=geotiff; profile=cloud-optimized", - "roles": ["data"], - "title": "RGBIR COG tile", - "eo:bands": [ - {"name": "Red", "common_name": "red"}, - {"name": "Green", "common_name": "green"}, - {"name": "Blue", "common_name": "blue"}, - { - "name": "NIR", - "common_name": "nir", - "description": "near-infrared", - }, - ], - }, - "metadata": { - "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ia/2019/ia_fgdc_2019/42091/m_4209150_sw_15_060_20190828.txt", # noqa: E501 - "type": "text/plain", - "roles": ["metadata"], - "title": "FGDC Metdata", - }, - "thumbnail": { - "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ia/2019/ia_60cm_2019/42091/m_4209150_sw_15_060_20190828.200.jpg", # noqa: E501 - "type": "image/jpeg", - "roles": ["thumbnail"], - "title": "Thumbnail", - }, - "tilejson": { - "title": "TileJSON with default rendering", - "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/tilejson.json?collection=naip&item=ia_m_4209150_sw_15_060_20190828_20191105&assets=image&asset_bidx=image%7C1%2C2%2C3", # noqa: E501 - "type": "application/json", - "roles": ["tiles"], - }, - "rendered_preview": { - "title": "Rendered preview", - "rel": "preview", - "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/preview.png?collection=naip&item=ia_m_4209150_sw_15_060_20190828_20191105&assets=image&asset_bidx=image%7C1%2C2%2C3", # noqa: E501 - "roles": ["overview"], - "type": "image/png", - }, - } - }, - "collection": {0: "naip"}, - "gsd": {0: 0.6}, - "datetime": {0: pd.Timestamp("2019-08-28 00:00:00+0000", tz="UTC")}, - "naip:year": {0: "2019"}, - "proj:bbox": {0: [592596.0, 4663966.8, 598495.8, 4671633.0]}, - "proj:epsg": {0: 26915}, - "naip:state": {0: "ia"}, - "proj:shape": {0: [12777, 9833]}, - "proj:transform": {0: [0.6, 0.0, 592596.0, 0.0, -0.6, 4671633.0, 0.0, 0.0, 1.0]}, -} + "image": { + "href": 
"https://naipeuwest.blob.core.windows.net/naip/v002/ia/2019/ia_60cm_2019/42091/m_4209150_sw_15_060_20190828.tif", # noqa: E501 + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "roles": ["data"], + "title": "RGBIR COG tile", + "eo:bands": [ + {"name": "Red", "common_name": "red", "description": "Red"}, + { + "name": "Green", + "common_name": "green", + "description": "Green", + }, + { + "name": "Blue", + "common_name": "blue", + "description": "Blue", + }, + { + "name": "NIR", + "common_name": "nir", + "description": "near-infrared", + }, + ], + }, + "metadata": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ia/2019/ia_fgdc_2019/42091/m_4209150_sw_15_060_20190828.txt", # noqa: E501 + "type": "text/plain", + "roles": ["metadata"], + "title": "FGDC Metdata", + }, + "thumbnail": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ia/2019/ia_60cm_2019/42091/m_4209150_sw_15_060_20190828.200.jpg", # noqa: E501 + "type": "image/jpeg", + "roles": ["thumbnail"], + "title": "Thumbnail", + }, + "tilejson": { + "title": "TileJSON with default rendering", + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/tilejson.json?collection=naip&item=ia_m_4209150_sw_15_060_20190828_20191105&assets=image&asset_bidx=image%7C1%2C2%2C3", # noqa: E501 + "type": "application/json", + "roles": ["tiles"], + }, + "rendered_preview": { + "title": "Rendered preview", + "rel": "preview", + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/preview.png?collection=naip&item=ia_m_4209150_sw_15_060_20190828_20191105&assets=image&asset_bidx=image%7C1%2C2%2C3", # noqa: E501 + "roles": ["overview"], + "type": "image/png", + }, + } + ], + "collection": ["naip"], + "gsd": [0.6], + "datetime": pd.to_datetime(["2019-08-28 00:00:00+0000"]), + "naip:year": ["2019"], + "proj:bbox": [[592596.0, 4663966.8, 598495.8, 4671633.0]], + "proj:epsg": [26915], + "naip:state": ["ia"], + "proj:shape": [[12777, 9833]], + "proj:transform": [[0.6, 0.0, 592596.0, 0.0, -0.6, 4671633.0, 0.0, 0.0, 1.0]], + } +) -def test_to_geodataframe(): - result = stac_geoparquet.to_geodataframe([ITEM]) - expected = geopandas.GeoDataFrame(EXPECTED_GDF) - for k in ["type", "stac_version", "id", "collection"]: - if k in expected: - expected[k] = expected[k].astype("string") +@pytest.mark.parametrize("dtype_backend", ["numpy_nullable", "pyarrow"]) +def test_to_geodataframe(dtype_backend): + result = stac_geoparquet.to_geodataframe([ITEM], dtype_backend=dtype_backend) + expected = EXPECTED_GDF.copy() + + if dtype_backend == "numpy_nullable": + for k in ["type", "stac_version", "id", "collection"]: + expected[k] = expected[k].astype(pd.StringDtype()) + + else: + for k, v in EXPECTED_GDF.items(): + if k != "geometry": + expected[k] = pd.arrays.ArrowExtensionArray(pa.array(v)) pandas.testing.assert_frame_equal(result, expected) @@ -238,14 +259,21 @@ def test_to_geodataframe(): assert_equal(ic1, ic2) +def test_dtype_backend_warns(): + with pytest.warns(FutureWarning, match="dtype_backend"): + stac_geoparquet.to_geodataframe([ITEM]) + + def test_to_geodataframe_with_self_link(): - result = stac_geoparquet.to_geodataframe([ITEM], add_self_link=True) - gdf = EXPECTED_GDF.copy() - gdf["self_link"] = {0: ITEM_SELF_HREF} - expected = geopandas.GeoDataFrame(gdf) - for k in ["type", "stac_version", "id", "collection", "self_link"]: - if k in expected: - expected[k] = expected[k].astype("string") + result = stac_geoparquet.to_geodataframe( + [ITEM], add_self_link=True, dtype_backend="pyarrow" + ) + 
+    expected = EXPECTED_GDF.copy()
+    expected["self_link"] = pd.arrays.ArrowExtensionArray(pa.array([ITEM_SELF_HREF]))
+
+    for k, v in EXPECTED_GDF.items():
+        if k != "geometry":
+            expected[k] = pd.arrays.ArrowExtensionArray(pa.array(v))
 
     pandas.testing.assert_frame_equal(result, expected)
 
@@ -269,7 +297,7 @@ def test_s1_grd():
             item["stac_extensions"][i] = EO_V11
     item["geometry"] = fix_empty_multipolygon(item["geometry"]).__geo_interface__
 
-    df = stac_geoparquet.to_geodataframe([item])
+    df = stac_geoparquet.to_geodataframe([item], dtype_backend="pyarrow")
     result = to_item_collection(df)[0]
     assert_equal(result, pystac.read_dict(item))
 
@@ -354,11 +382,11 @@ def test_smoke(collection_id):
     )
     r.raise_for_status()
     items = r.json()["features"]
-    df = stac_geoparquet.to_geodataframe(items)
+    df = stac_geoparquet.to_geodataframe(items, dtype_backend="pyarrow")
     result = to_item_collection(df)
 
     expected = pystac.ItemCollection(items)
-    assert_equal(result, expected)
+    assert_equal(result, expected, ignore_none=True)
 
 
 def test_mixed_date_format():
@@ -369,7 +397,7 @@
     a["geometry"] = {"type": "Point", "coordinates": [0, 0]}
     b["geometry"] = {"type": "Point", "coordinates": [0, 0]}
 
-    result = stac_geoparquet.to_geodataframe([a, b])
+    result = stac_geoparquet.to_geodataframe([a, b], dtype_backend="pyarrow")
     expected = [
         pd.Timestamp("2000-12-10 22:04:58+0000", tz="UTC"),
         pd.Timestamp("2000-12-10 22:04:57.998000+0000", tz="UTC"),

diff --git a/tests/test_to_dict.py b/tests/test_to_dict.py
index 6df2fd8..9b5b336 100644
--- a/tests/test_to_dict.py
+++ b/tests/test_to_dict.py
@@ -116,3 +116,34 @@ def test_to_dict(naip):
         "type": "Feature",
     }
     assert result[0].to_dict() == expected
+
+
+def test_to_dict_optional_asset():
+    items = [
+        {
+            "id": "a",
+            "geometry": None,
+            "bbox": None,
+            "links": [],
+            "type": "Feature",
+            "stac_version": "1.0.0",
+            "properties": {"datetime": "2021-01-01T00:00:00Z"},
+            "assets": {"a": {"href": "a.txt"}, "b": {"href": "b.txt"}},
+        },
+        {
+            "id": "b",
+            "geometry": None,
+            "bbox": None,
+            "links": [],
+            "type": "Feature",
+            "stac_version": "1.0.0",
+            "properties": {"datetime": "2021-01-01T00:00:00Z"},
+            "assets": {"a": {"href": "a.txt"}},
+        },
+    ]
+    df = stac_geoparquet.to_geodataframe(items, dtype_backend="pyarrow")
+    result = stac_geoparquet.to_item_collection(df)
+    assert result[0].assets["a"].to_dict() == {"href": "a.txt"}
+    assert result[0].assets["b"].to_dict() == {"href": "b.txt"}
+    assert result[1].assets["a"].to_dict() == {"href": "a.txt"}
+    assert "b" not in result[1].assets

From 5c646cba236a1106b8c890bd826bc71504500958 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sun, 24 Mar 2024 15:19:11 -0500
Subject: [PATCH 15/19] ts resolution

---
 stac_geoparquet/stac_geoparquet.py | 14 ++++++++++----
 tests/test_stac_geoparquet.py      |  2 +-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/stac_geoparquet/stac_geoparquet.py b/stac_geoparquet/stac_geoparquet.py
index 0974547..2efc299 100644
--- a/stac_geoparquet/stac_geoparquet.py
+++ b/stac_geoparquet/stac_geoparquet.py
@@ -38,6 +38,7 @@ def to_geodataframe(
     items: Sequence[dict[str, Any]],
     add_self_link: bool = False,
     dtype_backend: DTYPE_BACKEND | None = None,
+    datetime_precision: str = "us",
 ) -> geopandas.GeoDataFrame:
     """
     Convert a sequence of STAC items to a :class:`geopandas.GeoDataFrame`.
@@ -86,6 +87,10 @@ def to_geodataframe(
         with fields ``{"href": "a.tif", "title", None}``. pyarrow will infer that
         the struct field ``asset.title`` is nullable.
+    datetime_precision: str, default "us"
+        The precision to use for the datetime columns. For example,
+        "us" is microsecond and "ns" is nanosecond.
+
     Returns
     -------
     The converted GeoDataFrame.
@@ -150,9 +155,8 @@ def to_geodataframe(
     if dtype_backend == "pyarrow":
         for k, v in items2.items():
             if k in DATETIME_COLUMNS:
-                items2[k] = pd.arrays.ArrowExtensionArray(
-                    pa.array(pd.to_datetime(v, format="ISO8601"))
-                )
+                dt = pd.to_datetime(v, format="ISO8601").as_unit(datetime_precision)
+                items2[k] = pd.arrays.ArrowExtensionArray(pa.array(dt))
             elif k != "geometry":
                 items2[k] = pd.arrays.ArrowExtensionArray(pa.array(v))
 
@@ -160,7 +164,9 @@
     elif dtype_backend == "numpy_nullable":
         for k, v in items2.items():
             if k in DATETIME_COLUMNS:
-                items2[k] = pd.to_datetime(v, format="ISO8601")
+                items2[k] = pd.to_datetime(v, format="ISO8601").as_unit(
+                    datetime_precision
+                )
             if k in {"type", "stac_version", "id", "collection", SELF_LINK_COLUMN}:
                 items2[k] = pd.array(v, dtype="string")

diff --git a/tests/test_stac_geoparquet.py b/tests/test_stac_geoparquet.py
index 3ddca61..b9e18b7 100644
--- a/tests/test_stac_geoparquet.py
+++ b/tests/test_stac_geoparquet.py
@@ -227,7 +227,7 @@ def test_assert_equal():
         ],
         "collection": ["naip"],
         "gsd": [0.6],
-        "datetime": pd.to_datetime(["2019-08-28 00:00:00+0000"]),
+        "datetime": pd.to_datetime(["2019-08-28 00:00:00+0000"]).as_unit("us"),
         "naip:year": ["2019"],
         "proj:bbox": [[592596.0, 4663966.8, 598495.8, 4671633.0]],
         "proj:epsg": [26915],

From 9c602199e1d990c02a467bb35f3a89f89b4b5800 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 29 Mar 2024 12:42:11 -0500
Subject: [PATCH 16/19] parameter for datetime precision

---
 stac_geoparquet/stac_geoparquet.py |  4 ++--
 tests/test_stac_geoparquet.py      | 17 ++++++++++++++++-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/stac_geoparquet/stac_geoparquet.py b/stac_geoparquet/stac_geoparquet.py
index 2efc299..74f1872 100644
--- a/stac_geoparquet/stac_geoparquet.py
+++ b/stac_geoparquet/stac_geoparquet.py
@@ -38,7 +38,7 @@ def to_geodataframe(
     items: Sequence[dict[str, Any]],
     add_self_link: bool = False,
     dtype_backend: DTYPE_BACKEND | None = None,
-    datetime_precision: str = "us",
+    datetime_precision: str = "ns",
 ) -> geopandas.GeoDataFrame:
     """
     Convert a sequence of STAC items to a :class:`geopandas.GeoDataFrame`.
@@ -87,7 +87,7 @@ def to_geodataframe(
         with fields ``{"href": "a.tif", "title", None}``. pyarrow will infer that
         the struct field ``asset.title`` is nullable.
-    datetime_precision: str, default "us"
+    datetime_precision: str, default "ns"
         The precision to use for the datetime columns. For example,
         "us" is microsecond and "ns" is nanosecond.
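
Aside (not part of the patch): with the two parameters above in place, they would be used together roughly as in the sketch below. It assumes a build of stac-geoparquet that includes these commits and uses the Planetary Computer STAC API that appears in the test fixtures; any list of STAC Item dicts would work just as well.

    import requests
    import stac_geoparquet

    # Fetch a small page of NAIP items (assumed endpoint; the root URL is the
    # one that appears in the item fixture's links above).
    r = requests.get(
        "https://planetarycomputer.microsoft.com/api/stac/v1/search",
        params={"collections": "naip", "limit": 10},
    )
    r.raise_for_status()
    items = r.json()["features"]

    # pyarrow-backed extension dtypes plus microsecond timestamps. Passing
    # dtype_backend explicitly also avoids the FutureWarning raised when the
    # argument is omitted (see test_dtype_backend_warns above).
    df = stac_geoparquet.to_geodataframe(
        items, dtype_backend="pyarrow", datetime_precision="us"
    )

    # The result is a geopandas.GeoDataFrame, so df.to_parquet("naip.parquet")
    # would write it out as (Geo)Parquet.
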
diff --git a/tests/test_stac_geoparquet.py b/tests/test_stac_geoparquet.py
index b9e18b7..db6b978 100644
--- a/tests/test_stac_geoparquet.py
+++ b/tests/test_stac_geoparquet.py
@@ -227,7 +227,7 @@ def test_assert_equal():
         ],
         "collection": ["naip"],
         "gsd": [0.6],
-        "datetime": pd.to_datetime(["2019-08-28 00:00:00+0000"]).as_unit("us"),
+        "datetime": pd.to_datetime(["2019-08-28 00:00:00+0000"]).as_unit("ns"),
         "naip:year": ["2019"],
         "proj:bbox": [[592596.0, 4663966.8, 598495.8, 4671633.0]],
         "proj:epsg": [26915],
@@ -404,3 +404,18 @@ def test_mixed_date_format():
     ]
 
     assert result["datetime"].tolist() == expected
+
+
+@pytest.mark.parametrize("datetime_precision", ["us", "ns"])
+def test_datetime_precision(datetime_precision):
+    item = json.loads((HERE / "sentinel-2-item.json").read_text())
+    item["properties"]["datetime"] = "2000-12-10T22:00:00.123456Z"
+    df = stac_geoparquet.to_geodataframe(
+        [item], dtype_backend="pyarrow", datetime_precision=datetime_precision
+    )
+    result = df["datetime"].iloc[0]
+    expected = pd.Timestamp("2000-12-10 22:00:00.123456+0000", tz="UTC").as_unit(
+        datetime_precision
+    )
+    assert result == expected
+    assert result.unit == datetime_precision

From 2cc57977de340b0c2cc46c5d84726ae3c94bce87 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 29 Mar 2024 16:00:21 -0500
Subject: [PATCH 17/19] Bump package version

---
 stac_geoparquet/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stac_geoparquet/__init__.py b/stac_geoparquet/__init__.py
index cbd10fa..26ee6e5 100644
--- a/stac_geoparquet/__init__.py
+++ b/stac_geoparquet/__init__.py
@@ -1,5 +1,5 @@
 """stac-geoparquet"""
-__version__ = "0.3.2"
+__version__ = "0.4.1"
 
 from .stac_geoparquet import to_geodataframe, to_dict, to_item_collection

From 3d073bb1b74981c971d149dbdc6e6b3a99bad68d Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sat, 30 Mar 2024 10:40:51 -0500
Subject: [PATCH 18/19] PKG: Use hatch, automatic versioning

This updates the packaging to use Hatch, primarily so that we can use
hatch-vcs to automatically get the version from the git tag.
---
 .gitignore                  |  3 ++-
 pyproject.toml              | 10 ++++++++--
 stac_geoparquet/__init__.py | 11 ++++++++---
 stac_geoparquet/_version.py | 17 +++++++++++++++++
 4 files changed, 35 insertions(+), 6 deletions(-)
 create mode 100644 stac_geoparquet/_version.py

diff --git a/.gitignore b/.gitignore
index 608c2d6..1d83202 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 __pycache__
 .venv
-dist
\ No newline at end of file
+dist
+.direnv

diff --git a/pyproject.toml b/pyproject.toml
index 8c96044..392abb5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [build-system]
-requires = ["flit_core >=3.2,<4"]
-build-backend = "flit_core.buildapi"
+requires = ["hatchling>=1.22.2", "hatch-vcs>=0.3.0"]
+build-backend = "hatchling.build"
 
 [project]
 name = "stac_geoparquet"
@@ -19,6 +19,12 @@ dependencies = [
     "packaging",
 ]
 
+[tool.hatch.version]
+source = "vcs"
+
+[tool.hatch.build.hooks.vcs]
+version-file = "stac_geoparquet/_version.py"
+
 [project.optional-dependencies]
 pgstac = [
     "fsspec",

diff --git a/stac_geoparquet/__init__.py b/stac_geoparquet/__init__.py
index 26ee6e5..e697314 100644
--- a/stac_geoparquet/__init__.py
+++ b/stac_geoparquet/__init__.py
@@ -1,7 +1,12 @@
 """stac-geoparquet"""
-__version__ = "0.4.1"
-
 from .stac_geoparquet import to_geodataframe, to_dict, to_item_collection
+from ._version import __version__
 
-__all__ = ["__version__", "to_geodataframe", "to_dict", "to_item_collection"]
+__all__ = [
+    "__version__",
+    "to_geodataframe",
+    "to_dict",
+    "to_item_collection",
+    "__version__",
+]

diff --git a/stac_geoparquet/_version.py b/stac_geoparquet/_version.py
new file mode 100644
index 0000000..eb64310
--- /dev/null
+++ b/stac_geoparquet/_version.py
@@ -0,0 +1,17 @@
+# file generated by setuptools_scm
+# don't change, don't track in version control
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple, Union
+
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+else:
+    VERSION_TUPLE = object
+
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+
+__version__ = version = "0.4.2.dev0+g7dc20b6.d20240330"
+__version_tuple__ = version_tuple = (0, 4, 2, "dev0", "g7dc20b6.d20240330")

From 2001e3f3c605fc5503630039cdc1c40f40cb25e5 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sat, 30 Mar 2024 11:07:46 -0500
Subject: [PATCH 19/19] ignore version

---
 .gitignore                  |  1 +
 stac_geoparquet/_version.py | 17 -----------------
 2 files changed, 1 insertion(+), 17 deletions(-)
 delete mode 100644 stac_geoparquet/_version.py

diff --git a/.gitignore b/.gitignore
index 1d83202..53e55b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ __pycache__
 .venv
 dist
 .direnv
+stac_geoparquet/_version.py

diff --git a/stac_geoparquet/_version.py b/stac_geoparquet/_version.py
deleted file mode 100644
index eb64310..0000000
--- a/stac_geoparquet/_version.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# file generated by setuptools_scm
-# don't change, don't track in version control
-TYPE_CHECKING = False
-if TYPE_CHECKING:
-    from typing import Tuple, Union
-
-    VERSION_TUPLE = Tuple[Union[int, str], ...]
-else:
-    VERSION_TUPLE = object
-
-version: str
-__version__: str
-__version_tuple__: VERSION_TUPLE
-version_tuple: VERSION_TUPLE
-
-__version__ = version = "0.4.2.dev0+g7dc20b6.d20240330"
-__version_tuple__ = version_tuple = (0, 4, 2, "dev0", "g7dc20b6.d20240330")
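
Aside (not part of the patches): after these last two commits the package version is no longer a hand-maintained string. hatchling and hatch-vcs derive it from the most recent git tag at build or install time and write it into stac_geoparquet/_version.py, which is now git-ignored rather than committed, and which __init__.py re-exports. A quick check of the result — a sketch that assumes the package was installed (for example with `pip install -e .`) from a clone containing these commits:

    import stac_geoparquet

    # __version__ comes from the generated stac_geoparquet/_version.py, so it
    # tracks the latest git tag; a checkout ahead of a tag reports a dev
    # version such as "0.4.2.dev0+g7dc20b6.d20240330".
    print(stac_geoparquet.__version__)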