Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add test data covering different native (geoarrow-based) encodings #204

Merged
merged 9 commits into from
May 28, 2024
Binary file added test_data/data-linestring-encoding_native.parquet
Binary file not shown.
Binary file added test_data/data-linestring-encoding_wkb.parquet
Binary file not shown.
4 changes: 4 additions & 0 deletions test_data/data-linestring-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"col","geometry"
0,"LINESTRING (30 10, 10 30, 40 40)"
1,"LINESTRING EMPTY"
2,
Binary file not shown.
Binary file not shown.
5 changes: 5 additions & 0 deletions test_data/data-multilinestring-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"col","geometry"
0,"MULTILINESTRING ((30 10, 10 30, 40 40))"
1,"MULTILINESTRING ((10 10, 20 20, 10 40), (40 40, 30 30, 40 20, 30 10))"
2,"MULTILINESTRING EMPTY"
3,
Binary file added test_data/data-multipoint-encoding_native.parquet
Binary file not shown.
Binary file added test_data/data-multipoint-encoding_wkb.parquet
Binary file not shown.
5 changes: 5 additions & 0 deletions test_data/data-multipoint-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"col","geometry"
0,"MULTIPOINT ((30 10))"
1,"MULTIPOINT ((10 40), (40 30), (20 20), (30 10))"
2,"MULTIPOINT EMPTY"
3,
Binary file not shown.
Binary file added test_data/data-multipolygon-encoding_wkb.parquet
Binary file not shown.
6 changes: 6 additions & 0 deletions test_data/data-multipolygon-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"col","geometry"
0,"MULTIPOLYGON (((30 10, 40 40, 20 40, 10 20, 30 10)))"
1,"MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))"
2,"MULTIPOLYGON (((40 40, 20 45, 45 30, 40 40)), ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (30 20, 20 15, 20 25, 30 20)))"
3,"MULTIPOLYGON EMPTY"
4,
Binary file added test_data/data-point-encoding_native.parquet
Binary file not shown.
Binary file added test_data/data-point-encoding_wkb.parquet
Binary file not shown.
5 changes: 5 additions & 0 deletions test_data/data-point-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"col","geometry"
0,"POINT (30 10)"
1,"POINT EMPTY"
2,
3,"POINT (40 40)"
Binary file added test_data/data-polygon-encoding_native.parquet
Binary file not shown.
Binary file added test_data/data-polygon-encoding_wkb.parquet
Binary file not shown.
5 changes: 5 additions & 0 deletions test_data/data-polygon-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"col","geometry"
0,"POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"
1,"POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))"
2,"POLYGON EMPTY"
3,
218 changes: 218 additions & 0 deletions test_data/generate_test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
"""
Generates example data using pyarrow by running `python generate_test_data.py`.

You can print the metadata with:

.. code-block:: python

>>> import json, pprint, pyarrow.parquet as pq
>>> pprint.pprint(json.loads(pq.read_schema("example.parquet").metadata[b"geo"]))
"""
import json
import pathlib
import copy

import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.csv import write_csv

from shapely import from_wkt, to_wkb


# Directory containing this script; every generated file is written into it.
HERE = pathlib.Path(__file__).parent


# Base for the JSON document stored under the "geo" schema-metadata key.
# It is deep-copied per geometry type before "geometry_types" (and, for the
# native files, "encoding") are filled in.
metadata_template = {
    "version": "1.1.0",
    "primary_column": "geometry",
    "columns": {
        "geometry": {"encoding": "WKB", "geometry_types": []},
    },
}


## Various geometry types with WKB and native (GeoArrow-based) encodings

def write_encoding_files(geometries_wkt, geometries_geoarrow, geometry_type):
    """Write the WKT, WKB and native test files for a single geometry type.

    Three files are created next to this script, each with an integer
    ``col`` column (0..n-1) and a ``geometry`` column:

    * ``data-<type>-wkt.csv`` -- the WKT strings as given.
    * ``data-<type>-encoding_wkb.parquet`` -- the WKT parsed with shapely
      and serialised to WKB, with "geo" metadata using the "WKB" encoding.
    * ``data-<type>-encoding_native.parquet`` -- ``geometries_geoarrow``
      stored as-is, with the "geo" metadata encoding set to the lower-cased
      geometry type.

    Parameters
    ----------
    geometries_wkt
        Sequence of WKT strings; the callers in this script also pass
        ``None`` entries for missing geometries.
    geometries_geoarrow
        Array holding the same rows in the native (GeoArrow-based) layout.
    geometry_type
        Geometry type name such as ``"Point"``; recorded verbatim in
        ``geometry_types`` and lower-cased for file names and the native
        encoding value.
    """
    type_slug = geometry_type.lower()
    row_ids = range(len(geometries_wkt))

    # Plain WKT as CSV.
    wkt_table = pa.table({"col": row_ids, "geometry": geometries_wkt})
    write_csv(wkt_table, HERE / f"data-{type_slug}-wkt.csv")

    # WKB encoding
    wkb_values = to_wkb(from_wkt(geometries_wkt))
    wkb_table = pa.table({"col": row_ids, "geometry": wkb_values})
    geo_metadata = copy.deepcopy(metadata_template)
    geo_metadata["columns"]["geometry"]["geometry_types"] = [geometry_type]
    wkb_table = wkb_table.replace_schema_metadata(
        {"geo": json.dumps(geo_metadata)}
    )
    pq.write_table(wkb_table, HERE / f"data-{type_slug}-encoding_wkb.parquet")

    # native (geoarrow) encoding: same metadata, only the encoding differs
    native_table = pa.table({"col": row_ids, "geometry": geometries_geoarrow})
    geo_metadata["columns"]["geometry"]["encoding"] = type_slug
    native_table = native_table.replace_schema_metadata(
        {"geo": json.dumps(geo_metadata)}
    )
    pq.write_table(
        native_table, HERE / f"data-{type_slug}-encoding_native.parquet"
    )


# point: a regular point, an empty point, a missing value, another point

geometries_wkt = ["POINT (30 10)", "POINT EMPTY", None, "POINT (40 40)"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should there also be a version of these that contain NULLs or a version that contains Z values?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added some null values!


# Native point layout: a struct of two non-nullable float64 fields, x and y.
point_type = pa.struct(
    [pa.field(axis, pa.float64(), nullable=False) for axis in ("x", "y")]
)

# Rows line up one-to-one with geometries_wkt.
empty_point = (float("nan"), float("nan"))
point_coords = [
    (30, 10),
    empty_point,  # POINT EMPTY is stored as NaN coordinates
    empty_point,  # filler for the missing row; marked null by the mask
    (40, 40),
]
point_is_null = np.array([False, False, True, False])
geometries = pa.array(point_coords, mask=point_is_null, type=point_type)

write_encoding_files(geometries_wkt, geometries, geometry_type="Point")

# linestring: a regular line, an empty line, a missing value

geometries_wkt = ["LINESTRING (30 10, 10 30, 40 40)", "LINESTRING EMPTY", None]

# A linestring is a list of non-nullable point structs.
linestring_type = pa.list_(pa.field("vertices", point_type, nullable=False))

linestring_coords = [
    [(30, 10), (10, 30), (40, 40)],
    [],  # LINESTRING EMPTY
    [],  # filler for the missing row; marked null by the mask
]
linestring_is_null = np.array([False, False, True])
geometries = pa.array(
    linestring_coords, mask=linestring_is_null, type=linestring_type
)

write_encoding_files(geometries_wkt, geometries, geometry_type="LineString")

# polygon: a simple polygon, a polygon with a hole, an empty polygon, a
# missing value

geometries_wkt = [
    "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))",
    "POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))",
    "POLYGON EMPTY",
    None,
]

# A polygon is a list of rings; each ring is a list of point structs.
ring_type = pa.list_(pa.field("vertices", point_type, nullable=False))
polygon_type = pa.list_(pa.field("rings", ring_type, nullable=False))

polygon_coords = [
    [
        [(30, 10), (40, 40), (20, 40), (10, 20), (30, 10)],
    ],
    [
        [(35, 10), (45, 45), (15, 40), (10, 20), (35, 10)],
        [(20, 30), (35, 35), (30, 20), (20, 30)],
    ],
    [],  # POLYGON EMPTY
    [],  # filler for the missing row; marked null by the mask
]
polygon_is_null = np.array([False, False, False, True])
geometries = pa.array(polygon_coords, mask=polygon_is_null, type=polygon_type)

write_encoding_files(geometries_wkt, geometries, geometry_type="Polygon")

# multipoint: a single point, several points, an empty multipoint, a
# missing value

geometries_wkt = [
    "MULTIPOINT ((30 10))",
    "MULTIPOINT ((10 40), (40 30), (20 20), (30 10))",
    "MULTIPOINT EMPTY",
    None,
]

# A multipoint is a list of non-nullable point structs.
multipoint_type = pa.list_(pa.field("points", point_type, nullable=False))

multipoint_coords = [
    [(30, 10)],
    [(10, 40), (40, 30), (20, 20), (30, 10)],
    [],  # MULTIPOINT EMPTY
    [],  # filler for the missing row; marked null by the mask
]
multipoint_is_null = np.array([False, False, False, True])
geometries = pa.array(
    multipoint_coords, mask=multipoint_is_null, type=multipoint_type
)

write_encoding_files(geometries_wkt, geometries, geometry_type="MultiPoint")

# multilinestring: one line, two lines, an empty multilinestring, a
# missing value

geometries_wkt = [
    "MULTILINESTRING ((30 10, 10 30, 40 40))",
    "MULTILINESTRING ((10 10, 20 20, 10 40), (40 40, 30 30, 40 20, 30 10))",
    "MULTILINESTRING EMPTY",
    None,
]

# A multilinestring is a list of non-nullable linestrings.
linestrings_field = pa.field("linestrings", linestring_type, nullable=False)
multilinestring_type = pa.list_(linestrings_field)

multilinestring_coords = [
    [
        [(30, 10), (10, 30), (40, 40)],
    ],
    [
        [(10, 10), (20, 20), (10, 40)],
        [(40, 40), (30, 30), (40, 20), (30, 10)],
    ],
    [],  # MULTILINESTRING EMPTY
    [],  # filler for the missing row; marked null by the mask
]
multilinestring_is_null = np.array([False, False, False, True])
geometries = pa.array(
    multilinestring_coords,
    mask=multilinestring_is_null,
    type=multilinestring_type,
)

write_encoding_files(
    geometries_wkt, geometries, geometry_type="MultiLineString"
)

# multipolygon: one polygon, two polygons, two polygons where the second
# has a hole, an empty multipolygon, a missing value

geometries_wkt = [
    "MULTIPOLYGON (((30 10, 40 40, 20 40, 10 20, 30 10)))",
    "MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))",
    "MULTIPOLYGON (((40 40, 20 45, 45 30, 40 40)), ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (30 20, 20 15, 20 25, 30 20)))",
    "MULTIPOLYGON EMPTY",
    None,
]

# A multipolygon is a list of non-nullable polygons.
polygons_field = pa.field("polygons", polygon_type, nullable=False)
multipolygon_type = pa.list_(polygons_field)

multipolygon_coords = [
    [
        [
            [(30, 10), (40, 40), (20, 40), (10, 20), (30, 10)],
        ],
    ],
    [
        [
            [(30, 20), (45, 40), (10, 40), (30, 20)],
        ],
        [
            [(15, 5), (40, 10), (10, 20), (5, 10), (15, 5)],
        ],
    ],
    [
        [
            [(40, 40), (20, 45), (45, 30), (40, 40)],
        ],
        [
            [(20, 35), (10, 30), (10, 10), (30, 5), (45, 20), (20, 35)],
            [(30, 20), (20, 15), (20, 25), (30, 20)],
        ],
    ],
    [],  # MULTIPOLYGON EMPTY
    [],  # filler for the missing row; marked null by the mask
]
multipolygon_is_null = np.array([False, False, False, False, True])
geometries = pa.array(
    multipolygon_coords, mask=multipolygon_is_null, type=multipolygon_type
)

write_encoding_files(geometries_wkt, geometries, geometry_type="MultiPolygon")
Loading