diff --git a/README.md b/README.md index f011d2c..dd310ce 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ This repository defines a [specification](https://geoparquet.org/releases/) for how to store geospatial [vector data](https://gisgeography.com/spatial-data-types-vector-raster/) (point, lines, polygons) in [Apache Parquet](https://parquet.apache.org/), a popular columnar storage format for tabular data - see [this vendor explanation](https://databricks.com/glossary/what-is-parquet) for more on what that means. Our goal is to standardize how geospatial data is represented in Parquet to further geospatial interoperability among tools using Parquet today, and hopefully help push forward what's possible with 'cloud-native geospatial' workflows. There are now more than 20 different tools and libraries in 6 different languages that support GeoParquet, you can learn more at [geoparquet.org](https://geoparquet.org). -Early contributors include developers from GeoPandas, GeoTrellis, OpenLayers, Vis.gl, Voltron Data, Microsoft, Carto, Azavea, Planet & Unfolded. +Early contributors include developers from GeoPandas, GeoTrellis, OpenLayers, Vis.gl, Voltron Data, Microsoft, CARTO, Azavea, Planet & Unfolded. Anyone is welcome to join the project, by building implementations, trying it out, giving feedback through issues and contributing to the spec via pull requests. Initial work started in the [geo-arrow-spec](https://github.com/geoarrow/geoarrow) GeoPandas repository, and that will continue on Arrow work in a compatible way, with this specification focused solely on Parquet. 
We are in the process of becoming an [OGC](https://ogc.org) official @@ -12,6 +12,8 @@ Arrow work in a compatible way, with this specification focused solely on Parque **The latest [stable specification](https://geoparquet.org/releases/v1.0.0/) and [JSON schema](https://geoparquet.org/releases/v1.0.0/schema.json) are published at [geoparquet.org/releases/](https://geoparquet.org/releases/).** +**The community has agreed on this release, but it is still pending OGC approval.** We are currently working on the process to get it officially OGC approved as soon as possible. The OGC candidate Standard is at [https://docs.ogc.org/DRAFTS/24-013.html](https://docs.ogc.org/DRAFTS/24-013.html). The candidate Standard remains in draft form until it is approved as a Standard by the OGC Membership. Released versions of GeoParquet will not be changed, so if changes are needed for OGC approval, they will be released under a new version number. + The 'dev' versions of the spec are available in this repo: - [**Specification**](format-specs/geoparquet.md) (dev version - not stable, go to the [stable specification](https://geoparquet.org/releases/v1.0.0/) instead) diff --git a/format-specs/geoparquet.md b/format-specs/geoparquet.md index 86ae521..d476139 100644 --- a/format-specs/geoparquet.md +++ b/format-specs/geoparquet.md @@ -289,3 +289,11 @@ The CRS is likely equivalent to OGC:CRS84 for a GeoParquet file if the `id` elem It is reasonable for implementations to require that one of the above `id` elements are present and skip further tests to determine if the CRS is functionally equivalent with OGC:CRS84. Note: EPSG:4326 and OGC:CRS84 are equivalent with respect to this specification because this specification specifically overrides the coordinate axis order in the `crs` to be longitude-latitude. + +## File Extension + +It is RECOMMENDED to use `.parquet` as the file extension for a GeoParquet file. This provides the best interoperability with existing Parquet tools. 
The file extension `.geoparquet` SHOULD NOT be used. + +## Media Type + +If a [media type](https://en.wikipedia.org/wiki/Media_type) (formerly: MIME type) is used, a GeoParquet file MUST use [application/vnd.apache.parquet](https://www.iana.org/assignments/media-types/application/vnd.apache.parquet) as the media type. diff --git a/format-specs/schema.json b/format-specs/schema.json index 241d68e..26cddec 100644 --- a/format-specs/schema.json +++ b/format-specs/schema.json @@ -74,7 +74,9 @@ }, "covering": { "type": "object", - "minProperties": 1, + "required": [ + "bbox" + ], "properties": { "bbox": { "type": "object", @@ -83,34 +85,38 @@ "xmin": { "type": "array", "items": [ - { "type": "string" }, + { "type": "string", "minLength": 1 }, { "const": "xmin" } ], - "additionalItems": false + "minItems": 2, + "maxItems": 2 }, "xmax": { "type": "array", "items": [ - { "type": "string" }, + { "type": "string", "minLength": 1 }, { "const": "xmax" } ], - "additionalItems": false + "minItems": 2, + "maxItems": 2 }, "ymin": { "type": "array", "items": [ - { "type": "string" }, + { "type": "string", "minLength": 1 }, { "const": "ymin" } ], - "additionalItems": false + "minItems": 2, + "maxItems": 2 }, "ymax": { "type": "array", "items": [ - { "type": "string" }, + { "type": "string", "minLength": 1 }, { "const": "ymax" } ], - "additionalItems": false + "minItems": 2, + "maxItems": 2 } } } diff --git a/scripts/test_json_schema.py b/scripts/test_json_schema.py index f0217c1..61a1b4b 100644 --- a/scripts/test_json_schema.py +++ b/scripts/test_json_schema.py @@ -221,7 +221,6 @@ def get_version() -> str: }, } - # Allow "any_column.xmin" etc. 
metadata = copy.deepcopy(metadata_covering_template) valid_cases["valid_default_bbox"] = metadata @@ -235,6 +234,35 @@ def get_version() -> str: } valid_cases["valid_but_not_bbox_struct_name"] = metadata +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"] = { + "xmin": ["", "xmin"], + "ymin": ["", "ymin"], + "xmax": ["", "xmax"], + "ymax": ["", "ymax"], +} +invalid_cases["empty_column_name"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["xmin"] = [] +invalid_cases["xmin_array_length_must_be_2_is_0"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["ymax"] = [] +invalid_cases["ymax_array_length_must_be_2_is_0"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["ymin"] = ["column"] +invalid_cases["ymin_array_length_must_be_2_is_1"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["xmax"] = ["column"] +invalid_cases["xmax_array_length_must_be_2_is_1"] = metadata + +metadata = copy.deepcopy(metadata_covering_template) +metadata["columns"]["geometry"]["covering"]["bbox"]["xmin"] = ["xmin", "xmin", "xmin"] +invalid_cases["xmin_array_length_must_be_2_is_3"] = metadata + metadata = copy.deepcopy(metadata_covering_template) metadata["columns"]["geometry"]["covering"].pop("bbox") invalid_cases["empty_geometry_bbox"] = metadata diff --git a/test_data/data-linestring-encoding_native.parquet b/test_data/data-linestring-encoding_native.parquet new file mode 100644 index 0000000..264705d Binary files /dev/null and b/test_data/data-linestring-encoding_native.parquet differ diff --git a/test_data/data-linestring-encoding_wkb.parquet b/test_data/data-linestring-encoding_wkb.parquet new file mode 100644 index 0000000..fd8b1d4 Binary files /dev/null 
and b/test_data/data-linestring-encoding_wkb.parquet differ diff --git a/test_data/data-linestring-wkt.csv b/test_data/data-linestring-wkt.csv new file mode 100644 index 0000000..a3a320d --- /dev/null +++ b/test_data/data-linestring-wkt.csv @@ -0,0 +1,4 @@ +"col","geometry" +0,"LINESTRING (30 10, 10 30, 40 40)" +1,"LINESTRING EMPTY" +2, diff --git a/test_data/data-multilinestring-encoding_native.parquet b/test_data/data-multilinestring-encoding_native.parquet new file mode 100644 index 0000000..2bb5822 Binary files /dev/null and b/test_data/data-multilinestring-encoding_native.parquet differ diff --git a/test_data/data-multilinestring-encoding_wkb.parquet b/test_data/data-multilinestring-encoding_wkb.parquet new file mode 100644 index 0000000..08f9f0d Binary files /dev/null and b/test_data/data-multilinestring-encoding_wkb.parquet differ diff --git a/test_data/data-multilinestring-wkt.csv b/test_data/data-multilinestring-wkt.csv new file mode 100644 index 0000000..1c5f1ea --- /dev/null +++ b/test_data/data-multilinestring-wkt.csv @@ -0,0 +1,5 @@ +"col","geometry" +0,"MULTILINESTRING ((30 10, 10 30, 40 40))" +1,"MULTILINESTRING ((10 10, 20 20, 10 40), (40 40, 30 30, 40 20, 30 10))" +2,"MULTILINESTRING EMPTY" +3, diff --git a/test_data/data-multipoint-encoding_native.parquet b/test_data/data-multipoint-encoding_native.parquet new file mode 100644 index 0000000..b0435ef Binary files /dev/null and b/test_data/data-multipoint-encoding_native.parquet differ diff --git a/test_data/data-multipoint-encoding_wkb.parquet b/test_data/data-multipoint-encoding_wkb.parquet new file mode 100644 index 0000000..8454f25 Binary files /dev/null and b/test_data/data-multipoint-encoding_wkb.parquet differ diff --git a/test_data/data-multipoint-wkt.csv b/test_data/data-multipoint-wkt.csv new file mode 100644 index 0000000..00d926f --- /dev/null +++ b/test_data/data-multipoint-wkt.csv @@ -0,0 +1,5 @@ +"col","geometry" +0,"MULTIPOINT ((30 10))" +1,"MULTIPOINT ((10 40), (40 30), (20 20), (30 
10))" +2,"MULTIPOINT EMPTY" +3, diff --git a/test_data/data-multipolygon-encoding_native.parquet b/test_data/data-multipolygon-encoding_native.parquet new file mode 100644 index 0000000..43a8d2f Binary files /dev/null and b/test_data/data-multipolygon-encoding_native.parquet differ diff --git a/test_data/data-multipolygon-encoding_wkb.parquet b/test_data/data-multipolygon-encoding_wkb.parquet new file mode 100644 index 0000000..538b8f4 Binary files /dev/null and b/test_data/data-multipolygon-encoding_wkb.parquet differ diff --git a/test_data/data-multipolygon-wkt.csv b/test_data/data-multipolygon-wkt.csv new file mode 100644 index 0000000..211a681 --- /dev/null +++ b/test_data/data-multipolygon-wkt.csv @@ -0,0 +1,6 @@ +"col","geometry" +0,"MULTIPOLYGON (((30 10, 40 40, 20 40, 10 20, 30 10)))" +1,"MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))" +2,"MULTIPOLYGON (((40 40, 20 45, 45 30, 40 40)), ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (30 20, 20 15, 20 25, 30 20)))" +3,"MULTIPOLYGON EMPTY" +4, diff --git a/test_data/data-point-encoding_native.parquet b/test_data/data-point-encoding_native.parquet new file mode 100644 index 0000000..4e6489a Binary files /dev/null and b/test_data/data-point-encoding_native.parquet differ diff --git a/test_data/data-point-encoding_wkb.parquet b/test_data/data-point-encoding_wkb.parquet new file mode 100644 index 0000000..94b87d1 Binary files /dev/null and b/test_data/data-point-encoding_wkb.parquet differ diff --git a/test_data/data-point-wkt.csv b/test_data/data-point-wkt.csv new file mode 100644 index 0000000..8164a4b --- /dev/null +++ b/test_data/data-point-wkt.csv @@ -0,0 +1,5 @@ +"col","geometry" +0,"POINT (30 10)" +1,"POINT EMPTY" +2, +3,"POINT (40 40)" diff --git a/test_data/data-polygon-encoding_native.parquet b/test_data/data-polygon-encoding_native.parquet new file mode 100644 index 0000000..68d1aa6 Binary files /dev/null and b/test_data/data-polygon-encoding_native.parquet differ diff 
--git a/test_data/data-polygon-encoding_wkb.parquet b/test_data/data-polygon-encoding_wkb.parquet new file mode 100644 index 0000000..cce77ba Binary files /dev/null and b/test_data/data-polygon-encoding_wkb.parquet differ diff --git a/test_data/data-polygon-wkt.csv b/test_data/data-polygon-wkt.csv new file mode 100644 index 0000000..0574892 --- /dev/null +++ b/test_data/data-polygon-wkt.csv @@ -0,0 +1,5 @@ +"col","geometry" +0,"POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))" +1,"POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))" +2,"POLYGON EMPTY" +3, diff --git a/test_data/generate_test_data.py b/test_data/generate_test_data.py new file mode 100644 index 0000000..0e50a14 --- /dev/null +++ b/test_data/generate_test_data.py @@ -0,0 +1,218 @@ +""" +Generates example data using pyarrow by running `python generate_test_data.py`. + +You can print the metadata with: + +.. code-block:: python + + >>> import json, pprint, pyarrow.parquet as pq + >>> pprint.pprint(json.loads(pq.read_schema("example.parquet").metadata[b"geo"])) +""" +import json +import pathlib +import copy + +import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq +from pyarrow.csv import write_csv + +from shapely import from_wkt, to_wkb + + +HERE = pathlib.Path(__file__).parent + + +metadata_template = { + "version": "1.1.0", + "primary_column": "geometry", + "columns": { + "geometry": { + "encoding": "WKB", + "geometry_types": [], + }, + }, +} + + +## Various geometry types with WKB and native (GeoArrow-based) encodings + +def write_encoding_files(geometries_wkt, geometries_geoarrow, geometry_type): + + table = pa.table({"col": range(len(geometries_wkt)), "geometry": geometries_wkt}) + write_csv(table, HERE / f"data-{geometry_type.lower()}-wkt.csv") + + # WKB encoding + table = pa.table( + {"col": range(len(geometries_wkt)), "geometry": to_wkb(from_wkt(geometries_wkt))} + ) + metadata = copy.deepcopy(metadata_template) + 
metadata["columns"]["geometry"]["geometry_types"] = [geometry_type] + table = table.replace_schema_metadata({"geo": json.dumps(metadata)}) + pq.write_table(table, HERE / f"data-{geometry_type.lower()}-encoding_wkb.parquet") + + # native (geoarrow) encoding + table = pa.table( + {"col": range(len(geometries_wkt)), "geometry": geometries_geoarrow} + ) + metadata["columns"]["geometry"]["encoding"] = geometry_type.lower() + table = table.replace_schema_metadata({"geo": json.dumps(metadata)}) + pq.write_table(table, HERE / f"data-{geometry_type.lower()}-encoding_native.parquet") + + +# point + +geometries_wkt = [ + "POINT (30 10)", + "POINT EMPTY", + None, + "POINT (40 40)", +] + +point_type = pa.struct( + [ + pa.field("x", pa.float64(), nullable=False), + pa.field("y", pa.float64(), nullable=False) + ] +) +geometries = pa.array( + [(30, 10), (float("nan"), float("nan")), (float("nan"), float("nan")), (40, 40)], + mask=np.array([False, False, True, False]), + type=point_type +) + +write_encoding_files( + geometries_wkt, geometries, geometry_type="Point" +) + +# linestring + +geometries_wkt = [ + "LINESTRING (30 10, 10 30, 40 40)", + "LINESTRING EMPTY", + None +] + +linestring_type = pa.list_(pa.field("vertices", point_type, nullable=False)) +geometries = pa.array( + [[(30, 10), (10, 30), (40, 40)], [], []], + mask=np.array([False, False, True]), + type=linestring_type +) + +write_encoding_files( + geometries_wkt, geometries, geometry_type="LineString" +) + +# polygon + +geometries_wkt = [ + "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))", + "POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))", + "POLYGON EMPTY", + None, +] + +polygon_type = pa.list_( + pa.field("rings", pa.list_( + pa.field("vertices", point_type, nullable=False) + ), nullable=False) +) +geometries = pa.array( + [ + [[(30, 10), (40, 40), (20, 40), (10, 20), (30, 10)]], + [[(35, 10), (45, 45), (15, 40), (10, 20), (35, 10)], + [(20, 30), (35, 35), (30, 20), (20, 30)]], + [], + [], 
+ ], + mask=np.array([False, False, False, True]), + type=polygon_type +) + +write_encoding_files( + geometries_wkt, geometries, geometry_type="Polygon" +) + +# multipoint + +geometries_wkt = [ + "MULTIPOINT ((30 10))", + "MULTIPOINT ((10 40), (40 30), (20 20), (30 10))", + "MULTIPOINT EMPTY", + None, +] + +multipoint_type = pa.list_(pa.field("points", point_type, nullable=False)) +geometries = pa.array( + [ + [(30, 10)], + [(10, 40), (40, 30), (20, 20), (30, 10)], + [], + [], + ], + mask=np.array([False, False, False, True]), + type=multipoint_type +) + +write_encoding_files( + geometries_wkt, geometries, geometry_type="MultiPoint" +) + +# multilinestring + +geometries_wkt = [ + "MULTILINESTRING ((30 10, 10 30, 40 40))", + "MULTILINESTRING ((10 10, 20 20, 10 40), (40 40, 30 30, 40 20, 30 10))", + "MULTILINESTRING EMPTY", + None, +] + +multilinestring_type = pa.list_( + pa.field("linestrings", linestring_type, nullable=False) +) +geometries = pa.array( + [ + [[(30, 10), (10, 30), (40, 40)]], + [[(10, 10), (20, 20), (10, 40)], + [(40, 40), (30, 30), (40, 20), (30, 10)]], + [], + [], + ], + mask=np.array([False, False, False, True]), + type=multilinestring_type +) + +write_encoding_files( + geometries_wkt, geometries, geometry_type="MultiLineString" +) + +# multipolygon + +geometries_wkt = [ + "MULTIPOLYGON (((30 10, 40 40, 20 40, 10 20, 30 10)))", + "MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))", + "MULTIPOLYGON (((40 40, 20 45, 45 30, 40 40)), ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (30 20, 20 15, 20 25, 30 20)))", + "MULTIPOLYGON EMPTY", + None, +] + +multipolygon_type = pa.list_(pa.field("polygons", polygon_type, nullable=False)) +geometries = pa.array( + [ + [[[(30, 10), (40, 40), (20, 40), (10, 20), (30, 10)]]], + [[[(30, 20), (45, 40), (10, 40), (30, 20)]], + [[(15, 5), (40, 10), (10, 20), (5, 10), (15, 5)]]], + [[[(40, 40), (20, 45), (45, 30), (40, 40)]], + [[(20, 35), (10, 30), (10, 10), (30, 5), (45, 20), (20, 35)], 
+ [(30, 20), (20, 15), (20, 25), (30, 20)]]], + [], + [], + ], + mask=np.array([False, False, False, False, True]), + type=multipolygon_type +) + +write_encoding_files( + geometries_wkt, geometries, geometry_type="MultiPolygon" +)