diff --git a/test_data/data-linestring-encoding_native.parquet b/test_data/data-linestring-encoding_native.parquet new file mode 100644 index 0000000..264705d Binary files /dev/null and b/test_data/data-linestring-encoding_native.parquet differ diff --git a/test_data/data-linestring-encoding_wkb.parquet b/test_data/data-linestring-encoding_wkb.parquet new file mode 100644 index 0000000..fd8b1d4 Binary files /dev/null and b/test_data/data-linestring-encoding_wkb.parquet differ diff --git a/test_data/data-linestring-wkt.csv b/test_data/data-linestring-wkt.csv new file mode 100644 index 0000000..a3a320d --- /dev/null +++ b/test_data/data-linestring-wkt.csv @@ -0,0 +1,4 @@ +"col","geometry" +0,"LINESTRING (30 10, 10 30, 40 40)" +1,"LINESTRING EMPTY" +2, diff --git a/test_data/data-multilinestring-encoding_native.parquet b/test_data/data-multilinestring-encoding_native.parquet new file mode 100644 index 0000000..2bb5822 Binary files /dev/null and b/test_data/data-multilinestring-encoding_native.parquet differ diff --git a/test_data/data-multilinestring-encoding_wkb.parquet b/test_data/data-multilinestring-encoding_wkb.parquet new file mode 100644 index 0000000..08f9f0d Binary files /dev/null and b/test_data/data-multilinestring-encoding_wkb.parquet differ diff --git a/test_data/data-multilinestring-wkt.csv b/test_data/data-multilinestring-wkt.csv new file mode 100644 index 0000000..1c5f1ea --- /dev/null +++ b/test_data/data-multilinestring-wkt.csv @@ -0,0 +1,5 @@ +"col","geometry" +0,"MULTILINESTRING ((30 10, 10 30, 40 40))" +1,"MULTILINESTRING ((10 10, 20 20, 10 40), (40 40, 30 30, 40 20, 30 10))" +2,"MULTILINESTRING EMPTY" +3, diff --git a/test_data/data-multipoint-encoding_native.parquet b/test_data/data-multipoint-encoding_native.parquet new file mode 100644 index 0000000..b0435ef Binary files /dev/null and b/test_data/data-multipoint-encoding_native.parquet differ diff --git a/test_data/data-multipoint-encoding_wkb.parquet b/test_data/data-multipoint-encoding_wkb.parquet new file mode 100644 index 0000000..8454f25 Binary files /dev/null and b/test_data/data-multipoint-encoding_wkb.parquet differ diff --git a/test_data/data-multipoint-wkt.csv b/test_data/data-multipoint-wkt.csv new file mode 100644 index 0000000..00d926f --- /dev/null +++ b/test_data/data-multipoint-wkt.csv @@ -0,0 +1,5 @@ +"col","geometry" +0,"MULTIPOINT ((30 10))" +1,"MULTIPOINT ((10 40), (40 30), (20 20), (30 10))" +2,"MULTIPOINT EMPTY" +3, diff --git a/test_data/data-multipolygon-encoding_native.parquet b/test_data/data-multipolygon-encoding_native.parquet new file mode 100644 index 0000000..43a8d2f Binary files /dev/null and b/test_data/data-multipolygon-encoding_native.parquet differ diff --git a/test_data/data-multipolygon-encoding_wkb.parquet b/test_data/data-multipolygon-encoding_wkb.parquet new file mode 100644 index 0000000..538b8f4 Binary files /dev/null and b/test_data/data-multipolygon-encoding_wkb.parquet differ diff --git a/test_data/data-multipolygon-wkt.csv b/test_data/data-multipolygon-wkt.csv new file mode 100644 index 0000000..211a681 --- /dev/null +++ b/test_data/data-multipolygon-wkt.csv @@ -0,0 +1,6 @@ +"col","geometry" +0,"MULTIPOLYGON (((30 10, 40 40, 20 40, 10 20, 30 10)))" +1,"MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))" +2,"MULTIPOLYGON (((40 40, 20 45, 45 30, 40 40)), ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (30 20, 20 15, 20 25, 30 20)))" +3,"MULTIPOLYGON EMPTY" +4, diff --git a/test_data/data-point-encoding_native.parquet b/test_data/data-point-encoding_native.parquet new file mode 100644 index 0000000..4e6489a Binary files /dev/null and b/test_data/data-point-encoding_native.parquet differ diff --git a/test_data/data-point-encoding_wkb.parquet b/test_data/data-point-encoding_wkb.parquet new file mode 100644 index 0000000..94b87d1 Binary files /dev/null and b/test_data/data-point-encoding_wkb.parquet differ diff --git a/test_data/data-point-wkt.csv b/test_data/data-point-wkt.csv new file mode 100644 index 0000000..8164a4b --- /dev/null +++ b/test_data/data-point-wkt.csv @@ -0,0 +1,5 @@ +"col","geometry" +0,"POINT (30 10)" +1,"POINT EMPTY" +2, +3,"POINT (40 40)" diff --git a/test_data/data-polygon-encoding_native.parquet b/test_data/data-polygon-encoding_native.parquet new file mode 100644 index 0000000..68d1aa6 Binary files /dev/null and b/test_data/data-polygon-encoding_native.parquet differ diff --git a/test_data/data-polygon-encoding_wkb.parquet b/test_data/data-polygon-encoding_wkb.parquet new file mode 100644 index 0000000..cce77ba Binary files /dev/null and b/test_data/data-polygon-encoding_wkb.parquet differ diff --git a/test_data/data-polygon-wkt.csv b/test_data/data-polygon-wkt.csv new file mode 100644 index 0000000..0574892 --- /dev/null +++ b/test_data/data-polygon-wkt.csv @@ -0,0 +1,5 @@ +"col","geometry" +0,"POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))" +1,"POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))" +2,"POLYGON EMPTY" +3, diff --git a/test_data/generate_test_data.py b/test_data/generate_test_data.py new file mode 100644 index 0000000..0e50a14 --- /dev/null +++ b/test_data/generate_test_data.py @@ -0,0 +1,218 @@ +""" +Generates example data using pyarrow by running `python generate_test_data.py`. + +You can print the metadata with: + +.. code-block:: python + + >>> import json, pprint, pyarrow.parquet as pq + >>> pprint.pprint(json.loads(pq.read_schema("example.parquet").metadata[b"geo"])) +""" +import json +import pathlib +import copy + +import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq +from pyarrow.csv import write_csv + +from shapely import from_wkt, to_wkb + + +HERE = pathlib.Path(__file__).parent + + +metadata_template = { + "version": "1.1.0", + "primary_column": "geometry", + "columns": { + "geometry": { + "encoding": "WKB", + "geometry_types": [], + }, + }, +} + + +## Various geometry types with WKB and native (GeoArrow-based) encodings + +def write_encoding_files(geometries_wkt, geometries_geoarrow, geometry_type): + + table = pa.table({"col": range(len(geometries_wkt)), "geometry": geometries_wkt}) + write_csv(table, HERE / f"data-{geometry_type.lower()}-wkt.csv") + + # WKB encoding + table = pa.table( + {"col": range(len(geometries_wkt)), "geometry": to_wkb(from_wkt(geometries_wkt))} + ) + metadata = copy.deepcopy(metadata_template) + metadata["columns"]["geometry"]["geometry_types"] = [geometry_type] + table = table.replace_schema_metadata({"geo": json.dumps(metadata)}) + pq.write_table(table, HERE / f"data-{geometry_type.lower()}-encoding_wkb.parquet") + + # native (geoarrow) encoding + table = pa.table( + {"col": range(len(geometries_wkt)), "geometry": geometries_geoarrow} + ) + metadata["columns"]["geometry"]["encoding"] = geometry_type.lower() + table = table.replace_schema_metadata({"geo": json.dumps(metadata)}) + pq.write_table(table, HERE / f"data-{geometry_type.lower()}-encoding_native.parquet") + + +# point + +geometries_wkt = [ + "POINT (30 10)", + "POINT EMPTY", + None, + "POINT (40 40)", +] + +point_type = pa.struct( + [ + pa.field("x", pa.float64(), nullable=False), + pa.field("y", pa.float64(), nullable=False) + ] +) +geometries = pa.array( + [(30, 10), (float("nan"), float("nan")), (float("nan"), float("nan")), (40, 40)], + mask=np.array([False, False, True, False]), + type=point_type +) + +write_encoding_files( + geometries_wkt, geometries, geometry_type="Point" +) + +# linestring + +geometries_wkt = [ + "LINESTRING (30 10, 10 30, 40 40)", + "LINESTRING EMPTY", + None +] + +linestring_type = pa.list_(pa.field("vertices", point_type, nullable=False)) +geometries = pa.array( + [[(30, 10), (10, 30), (40, 40)], [], []], + mask=np.array([False, False, True]), + type=linestring_type +) + +write_encoding_files( + geometries_wkt, geometries, geometry_type="LineString" +) + +# polygon + +geometries_wkt = [ + "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))", + "POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))", + "POLYGON EMPTY", + None, +] + +polygon_type = pa.list_( + pa.field("rings", pa.list_( + pa.field("vertices", point_type, nullable=False) + ), nullable=False) +) +geometries = pa.array( + [ + [[(30, 10), (40, 40), (20, 40), (10, 20), (30, 10)]], + [[(35, 10), (45, 45), (15, 40), (10, 20), (35, 10)], + [(20, 30), (35, 35), (30, 20), (20, 30)]], + [], + [], + ], + mask=np.array([False, False, False, True]), + type=polygon_type +) + +write_encoding_files( + geometries_wkt, geometries, geometry_type="Polygon" +) + +# multipoint + +geometries_wkt = [ + "MULTIPOINT ((30 10))", + "MULTIPOINT ((10 40), (40 30), (20 20), (30 10))", + "MULTIPOINT EMPTY", + None, +] + +multipoint_type = pa.list_(pa.field("points", point_type, nullable=False)) +geometries = pa.array( + [ + [(30, 10)], + [(10, 40), (40, 30), (20, 20), (30, 10)], + [], + [], + ], + mask=np.array([False, False, False, True]), + type=multipoint_type +) + +write_encoding_files( + geometries_wkt, geometries, geometry_type="MultiPoint" +) + +# multilinestring + +geometries_wkt = [ + "MULTILINESTRING ((30 10, 10 30, 40 40))", + "MULTILINESTRING ((10 10, 20 20, 10 40), (40 40, 30 30, 40 20, 30 10))", + "MULTILINESTRING EMPTY", + None, +] + +multilinestring_type = pa.list_( + pa.field("linestrings", linestring_type, nullable=False) +) +geometries = pa.array( + [ + [[(30, 10), (10, 30), (40, 40)]], + [[(10, 10), (20, 20), (10, 40)], + [(40, 40), (30, 30), (40, 20), (30, 10)]], + [], + [], + ], + mask=np.array([False, False, False, True]), + type=multilinestring_type +) + +write_encoding_files( + geometries_wkt, geometries, geometry_type="MultiLineString" +) + +# multipolygon + +geometries_wkt = [ + "MULTIPOLYGON (((30 10, 40 40, 20 40, 10 20, 30 10)))", + "MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))", + "MULTIPOLYGON (((40 40, 20 45, 45 30, 40 40)), ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (30 20, 20 15, 20 25, 30 20)))", + "MULTIPOLYGON EMPTY", + None, +] + +multipolygon_type = pa.list_(pa.field("polygons", polygon_type, nullable=False)) +geometries = pa.array( + [ + [[[(30, 10), (40, 40), (20, 40), (10, 20), (30, 10)]]], + [[[(30, 20), (45, 40), (10, 40), (30, 20)]], + [[(15, 5), (40, 10), (10, 20), (5, 10), (15, 5)]]], + [[[(40, 40), (20, 45), (45, 30), (40, 40)]], + [[(20, 35), (10, 30), (10, 10), (30, 5), (45, 20), (20, 35)], + [(30, 20), (20, 15), (20, 25), (30, 20)]]], + [], + [], + ], + mask=np.array([False, False, False, False, True]), + type=multipolygon_type +) + +write_encoding_files( + geometries_wkt, geometries, geometry_type="MultiPolygon" +)