From 0882e85223e58d62def9bd375c86d6bd1707e1ca Mon Sep 17 00:00:00 2001 From: Jacob Wasserman Date: Mon, 27 Nov 2023 14:07:15 -0500 Subject: [PATCH] Add geometry_bbox proposal definition * Add documentation to the top-level GeoParquet description and definition. * Add the geometry_bbox definition to the json schema * Add a few tests. Verify with `pytest test_json_schema.py` --- format-specs/geoparquet.md | 16 ++++++++++++++++ format-specs/schema.json | 10 ++++++++++ scripts/test_json_schema.py | 15 +++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/format-specs/geoparquet.md b/format-specs/geoparquet.md index 4782e57..81bcaf4 100644 --- a/format-specs/geoparquet.md +++ b/format-specs/geoparquet.md @@ -58,6 +58,8 @@ Each geometry column in the dataset MUST be included in the `columns` field abov | edges | string | Name of the coordinate system for the edges. Must be one of `"planar"` or `"spherical"`. The default value is `"planar"`. | | bbox | \[number] | Bounding Box of the geometries in the file, formatted according to [RFC 7946, section 5](https://tools.ietf.org/html/rfc7946#section-5). | | epoch | number | Coordinate epoch in case of a dynamic CRS, expressed as a decimal year. | +| geometry_bbox | object | Object specifying a column name of a [Bounding Box Column](#bounding-box-columns). | + #### crs @@ -134,6 +136,20 @@ For non-geographic coordinate reference systems, the items in the bbox are minim The bbox values are in the same coordinate reference system as the geometry. +#### geometry_bbox + +Including a per-row bounding box can be useful for accelerating spatial queries by allowing consumers to inspect row group bounding box summary statistics. Furthermore a bounding box may be used to avoid complex spatial operations by first checking for bounding box overlaps. This field captures the name of a column containing the bounding box of the geometry for every row. 
+ +The format of `geometry_bbox` is `{"column": "column_name"}` where `column_name` MUST exist in the Parquet file and meet the criteria in the [Bounding Box Column](#bounding-box-columns) definition. + +Note: the value specified in this field should not be confused with the [`bbox`](#bbox) field, which contains the single bounding box of this geometry over the whole GeoParquet file. + +### Bounding Box Columns + +A bounding box column MUST be a Parquet struct with required fields `xmin`, `xmax`, `ymin`, and `ymax`. For three dimensions the additional fields `zmin` and `zmax` MUST be present. The fields MUST be of Parquet type `FLOAT` or `DOUBLE`. The repetition of a bounding box column MUST match the geometry column's [repetition](#repetition). A row MUST contain a bounding box value if and only if the row contains a geometry value. In cases where the geometry is optional and a row does not contain a geometry value, the row MUST NOT contain a bounding box value. + +The bounding box column MUST be at the root of the schema. The bounding box column MUST NOT be nested in a group. 
+ ### Additional information #### Feature identifiers diff --git a/format-specs/schema.json b/format-specs/schema.json index ae31ee0..6f27af6 100644 --- a/format-specs/schema.json +++ b/format-specs/schema.json @@ -71,6 +71,16 @@ }, "epoch": { "type": "number" + }, + "geometry_bbox": { + "type": "object", + "required": ["column"], + "properties": { + "column": { + "type": "string", + "minLength": 1 + } + } } } } diff --git a/scripts/test_json_schema.py b/scripts/test_json_schema.py index cda7fb1..dfafbce 100644 --- a/scripts/test_json_schema.py +++ b/scripts/test_json_schema.py @@ -41,6 +41,9 @@ def get_version() -> str: "geometry": { "encoding": "WKB", "geometry_types": [], + "geometry_bbox": { + "column": "bbox", + }, }, }, } @@ -210,6 +213,18 @@ def get_version() -> str: metadata["columns"]["geometry"]["epoch"] = "2015.1" invalid_cases["epoch_string"] = metadata +# Geometry Bbox + +metadata = copy.deepcopy(metadata_template) +metadata["columns"]["geometry"]["geometry_bbox"].pop("column") +invalid_cases["empty_geometry_bbox"] = metadata + + +metadata = copy.deepcopy(metadata_template) +metadata["columns"]["geometry"]["geometry_bbox"]["column"] = "" +invalid_cases["empty_geometry_bbox_column"] = metadata + + # # Tests