-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
2e7cef7
commit 5406f96
Showing
4 changed files
with
208 additions
and
68 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import json | ||
from pathlib import Path | ||
from typing import Any, Dict, Iterable, List, Union | ||
|
||
import pyarrow as pa | ||
|
||
from stac_geoparquet.arrow._util import stac_items_to_arrow | ||
|
||
|
||
class InferredSchema:
    """
    A schema representing the original STAC JSON with absolutely minimal modifications.

    The only modification from the data is converting any geometry fields from GeoJSON
    to WKB.
    """

    inner: pa.Schema
    """The underlying Arrow schema."""

    count: int
    """The total number of items scanned."""

    def __init__(self) -> None:
        self.inner = pa.schema([])
        self.count = 0

    def update_from_ndjson(
        self,
        path: Union[Union[str, Path], Iterable[Union[str, Path]]],
        *,
        chunk_size: int = 10000,
    ) -> None:
        """Update this inferred schema from one or more newline-delimited JSON files.

        Args:
            path: A path to a newline-delimited JSON file of STAC Items, or an
                iterable of such paths.

        Keyword Args:
            chunk_size: The number of items to parse and unify per batch.
        """
        # Handle multi-path input by recursing once per path.
        if not isinstance(path, (str, Path)):
            for p in path:
                # Bug fix: forward chunk_size to the recursive call; previously the
                # default was always used for multi-path input.
                self.update_from_ndjson(p, chunk_size=chunk_size)

            return

        # Handle single-path input
        with open(path) as f:
            items = []
            for line in f:
                items.append(json.loads(line))

                # Unify the schema in chunks to bound peak memory use.
                if len(items) >= chunk_size:
                    self.update_from_items(items)
                    items = []

            # Handle any remainder smaller than chunk_size
            if items:
                self.update_from_items(items)

    def update_from_items(self, items: List[Dict[str, Any]]) -> None:
        """Unify this schema with the schema inferred from a batch of STAC Items."""
        self.count += len(items)
        current_schema = stac_items_to_arrow(items, schema=None).schema
        # Permissive promotion lets types widen across chunks (e.g. int -> float).
        new_schema = pa.unify_schemas(
            [self.inner, current_schema], promote_options="permissive"
        )
        self.inner = new_schema
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
from copy import deepcopy | ||
from typing import Any, Dict, Optional, Sequence | ||
|
||
import pyarrow as pa | ||
import shapely | ||
import shapely.geometry | ||
|
||
|
||
def stac_items_to_arrow(
    items: Sequence[Dict[str, Any]], *, schema: Optional[pa.Schema] = None
) -> pa.RecordBatch:
    """Convert dicts representing STAC Items to Arrow

    This converts GeoJSON geometries to WKB before Arrow conversion to allow multiple
    geometry types.

    All items will be parsed into a single RecordBatch, meaning that each internal array
    is fully contiguous in memory for the length of `items`.

    Args:
        items: STAC Items to convert to Arrow

    Kwargs:
        schema: An optional schema that describes the format of the data. Note that this
            must represent the geometry column as binary type.

    Returns:
        Arrow RecordBatch with items in Arrow
    """
    # Preprocess GeoJSON to WKB in each STAC item
    # Otherwise, pyarrow will try to parse coordinates into a native geometry type and
    # if you have multiple geometry types pyarrow will error with
    # `ArrowInvalid: cannot mix list and non-list, non-null values`
    wkb_items = []
    for item in items:
        # Deep copy so the caller's items are never mutated.
        wkb_item = deepcopy(item)
        wkb_item["geometry"] = shapely.to_wkb(
            shapely.geometry.shape(wkb_item["geometry"]), flavor="iso"
        )

        # If a proj:geometry key exists in top-level properties, convert that to WKB.
        # Bug fix: read and write properties["proj:geometry"]; previously this
        # branch tested the properties dict but accessed the (likely absent)
        # top-level key, raising KeyError.
        if "proj:geometry" in wkb_item["properties"]:
            wkb_item["properties"]["proj:geometry"] = shapely.to_wkb(
                shapely.geometry.shape(wkb_item["properties"]["proj:geometry"]),
                flavor="iso",
            )

        # If a proj:geometry key exists in any asset properties, convert that to WKB
        for asset_value in wkb_item["assets"].values():
            if "proj:geometry" in asset_value:
                asset_value["proj:geometry"] = shapely.to_wkb(
                    shapely.geometry.shape(asset_value["proj:geometry"]),
                    flavor="iso",
                )

        wkb_items.append(wkb_item)

    if schema is not None:
        # A pa.Schema iterates over its fields, so it can seed a struct type.
        array = pa.array(wkb_items, type=pa.struct(schema))
    else:
        array = pa.array(wkb_items)

    return pa.RecordBatch.from_struct_array(array)