Let pyarrow cast strings to dates #80

Merged · 2 commits · Oct 24, 2024

Changes from 1 commit
5 changes: 1 addition & 4 deletions stac_geoparquet/arrow/_to_arrow.py
@@ -1,6 +1,5 @@
"""Convert STAC data into Arrow tables"""

-import ciso8601
scottyhq (Contributor, Author) · Oct 24, 2024:

Since pandas is already a dependency, maybe ciso8601 could be dropped and pandas or pyarrow used for date handling instead? See also #31 (comment)

Collaborator:

Definitely don't want to use pandas here, but using pyarrow natively for the cast is fine.

scottyhq (Contributor, Author):

Looks like the only other place ciso8601 is currently used is in testing, here:

    result_datetime = parse_rfc3339(result)
    expected_datetime = parse_rfc3339(expected)

Should I switch that to use pandas, or pa.scalar(timestamp_str, type=pa.string()).cast(pa.timestamp('us', tz='UTC')).as_py()?

Collaborator:

I think it's fine for ciso8601 to still be used in the tests.
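As an aside, a minimal sketch (not part of this PR) of the comparison discussed above: it assumes ciso8601 remains installed and checks that the pa.scalar cast suggested in the thread agrees with parse_rfc3339 for a timezone-aware RFC 3339 string.

    import ciso8601
    import pyarrow as pa

    ts = "2024-09-10T03:32:23+00:00"

    # ciso8601 path (what the tests keep using)
    via_ciso = ciso8601.parse_rfc3339(ts)

    # pure-pyarrow path suggested in the comment above
    via_pa = pa.scalar(ts, type=pa.string()).cast(pa.timestamp("us", tz="UTC")).as_py()

    # Both are timezone-aware datetimes denoting the same instant
    assert via_ciso == via_pa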

import numpy as np
import orjson
import pyarrow as pa
@@ -78,9 +77,7 @@ def convert_timestamp_columns(

def _convert_single_timestamp_column(column: pa.Array) -> pa.TimestampArray:
"""Convert an individual timestamp column from string to a Timestamp type"""
-    return pa.array(
-        [ciso8601.parse_rfc3339(str(t)) for t in column], pa.timestamp("us", tz="UTC")
-    )
+    return pa.array(column, pa.timestamp("us", tz="UTC"))


def _is_bbox_3d(bbox_col: pa.Array) -> bool:
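For context, a small sketch (not from the diff) of the native cast the new one-liner relies on: pyarrow parses RFC 3339 strings, including zone offsets, when converting a string array to a timezone-aware timestamp type. The sample values are taken from the umbra-sar.json fixture added below.

    import pyarrow as pa

    strings = pa.array(
        ["2024-09-10T03:32:23+00:00", "2024-09-10T10:00:00.425293+00:00"]
    )

    # Same call as the new _convert_single_timestamp_column body
    timestamps = pa.array(strings, pa.timestamp("us", tz="UTC"))

    # An explicit cast is equivalent
    assert timestamps.equals(strings.cast(pa.timestamp("us", tz="UTC")))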
224 changes: 224 additions & 0 deletions tests/data/umbra-sar.json
@@ -0,0 +1,224 @@
[
{
"type": "Feature",
"stac_version": "1.0.0",
"stac_extensions": [
"https://stac-extensions.github.io/view/v1.0.0/schema.json",
"https://stac-extensions.github.io/sar/v1.0.0/schema.json"
],
"id": "52f2317f-091b-4f90-b385-08c93655e089",
"geometry": {
"type": "Polygon",
"coordinates": [
[
[
-79.54925151958344,
8.974791994120121,
14.312313693418984
],
[
-79.58251694872273,
9.005616405406611,
14.318863303708582
],
[
-79.61351789024071,
8.972540909838038,
14.312558707495423
],
[
-79.58025285722145,
8.941719234834665,
14.318864497230194
],
[
-79.54925151958344,
8.974791994120121,
14.312313693418984
]
]
]
},
"bbox": [
-79.61351789024071,
8.941719234834665,
-79.54925151958344,
9.005616405406611
],
"properties": {
"created": "2024-09-10T10:00:00.425293+00:00",
"updated": "2024-09-10T10:00:00.425300+00:00",
Comment on lines +49 to +50

scottyhq (Contributor, Author):

Only this first item has these properties set.

"platform": "Umbra-08",
"end_datetime": "2024-09-10T03:32:32.903484+00:00",
"umbra:task_id": "ba1ca3b0-f458-4cd9-8e99-52d2d899d5dd",
"start_datetime": "2024-09-10T03:32:23+00:00",
"sar:product_type": "GEC",
"sar:looks_azimuth": 2,
"sar:polarizations": [
"VV"
],
"umbra:collect_ids": [
"7cfa17f0-9b69-4686-949e-5604d24beb3c"
],
"sar:frequency_band": "X",
"sar:instrument_mode": "SPOTLIGHT",
"sar:resolution_range": 0.5,
"view:incidence_angle": 23.750572204589844,
"sar:resolution_azimuth": 0.25,
"umbra:open-data-catalog": true,
"umbra:squint_angle_degrees": 120.06241607666016,
"umbra:grazing_angle_degrees": 66.24942779541016,
"umbra:slant_range_kilometers": 567.7435913085938,
"umbra:target_azimuth_angle_degrees": 138.5594940185547,
"umbra:squint_angle_engineering_degrees": -30.062416076660156,
"umbra:squint_angle_exploitation_degrees": 59.937583923339844,
"umbra:squint_angle_degrees_off_broadside": 59.937583923339844,
"datetime": null
},
"links": [
{
"rel": "collection",
"href": "https://api.canopy.umbra.space/archive/collections/umbra-sar",
"type": "application/json"
},
{
"rel": "parent",
"href": "https://api.canopy.umbra.space/archive/collections/umbra-sar",
"type": "application/json"
},
{
"rel": "root",
"href": "https://api.canopy.umbra.space/archive/",
"type": "application/json",
"title": "stac-fastapi"
},
{
"rel": "self",
"href": "https://api.canopy.umbra.space/archive/collections/umbra-sar/items/52f2317f-091b-4f90-b385-08c93655e089",
"type": "application/geo+json"
}
],
"assets": {
"thumbnail": {
"href": "https://api.canopy.umbra.space/archive/thumbnail/52f2317f-091b-4f90-b385-08c93655e089",
"type": "image/png",
"title": "Thumbnail for 52f2317f-091b-4f90-b385-08c93655e089",
"description": "Low-resolution preview PNG thumbnail for 52f2317f-091b-4f90-b385-08c93655e089",
"roles": [
"thumbnail"
]
}
},
"collection": "umbra-sar"
},
{
"type": "Feature",
"stac_version": "1.0.0",
"stac_extensions": [
"https://stac-extensions.github.io/view/v1.0.0/schema.json",
"https://stac-extensions.github.io/sar/v1.0.0/schema.json"
],
"id": "192f767c-20f8-4b42-8ea2-d1f60fdaace1",
"geometry": {
"type": "Polygon",
"coordinates": [
[
[
-79.55942755490392,
8.960346708857864,
0
],
[
-79.56798414722924,
8.995497807881366,
0
],
[
-79.60333954966426,
8.986988377465938,
0
],
[
-79.59477975406185,
8.951838081249301,
0
],
[
-79.55942755490392,
8.960346708857864,
0
]
]
]
},
"bbox": [
-79.60333954966426,
8.951838081249301,
-79.55942755490392,
8.995497807881366
],
"properties": {
"platform": "Umbra-05",
"end_datetime": "2023-02-01T02:17:12.594980+00:00",
"umbra:task_id": "fa8af008-6dc6-4382-8f5a-205f5a6af209",
"start_datetime": "2023-02-01T02:17:08.851006+00:00",
"sar:product_type": "GEC",
"sar:looks_azimuth": 1,
"sar:polarizations": [
"VV"
],
"umbra:collect_ids": [
"1b454eab-7958-4755-bb6b-797bde214e8d"
],
"sar:frequency_band": "X",
"sar:instrument_mode": "SPOTLIGHT",
"sar:resolution_range": 0.5,
"view:incidence_angle": 59.1085205078125,
"sar:resolution_azimuth": 0.5,
"umbra:open-data-catalog": true,
"umbra:squint_angle_degrees": 180.2089385986328,
"umbra:grazing_angle_degrees": 30.8914794921875,
"umbra:slant_range_kilometers": 939.5191650390625,
"umbra:target_azimuth_angle_degrees": 77.42530059814453,
"umbra:squint_angle_engineering_degrees": -90.20893859863281,
"umbra:squint_angle_exploitation_degrees": -0.2089385986328125,
"umbra:squint_angle_degrees_off_broadside": 0.2089385986328125,
"datetime": null
},
"links": [
{
"rel": "collection",
"href": "https://api.canopy.umbra.space/archive/collections/umbra-sar",
"type": "application/json"
},
{
"rel": "parent",
"href": "https://api.canopy.umbra.space/archive/collections/umbra-sar",
"type": "application/json"
},
{
"rel": "root",
"href": "https://api.canopy.umbra.space/archive/",
"type": "application/json",
"title": "stac-fastapi"
},
{
"rel": "self",
"href": "https://api.canopy.umbra.space/archive/collections/umbra-sar/items/192f767c-20f8-4b42-8ea2-d1f60fdaace1",
"type": "application/geo+json"
}
],
"assets": {
"thumbnail": {
"href": "https://api.canopy.umbra.space/archive/thumbnail/192f767c-20f8-4b42-8ea2-d1f60fdaace1",
"type": "image/png",
"title": "Thumbnail for 192f767c-20f8-4b42-8ea2-d1f60fdaace1",
"description": "512x512 PNG thumbnail for 192f767c-20f8-4b42-8ea2-d1f60fdaace1",
"roles": [
"thumbnail"
]
}
},
"collection": "umbra-sar"
}
]
37 changes: 20 additions & 17 deletions tests/test_arrow.py
@@ -21,18 +21,21 @@
HERE = Path(__file__).parent

TEST_COLLECTIONS = [
"3dep-lidar-copc",
"3dep-lidar-dsm",
"cop-dem-glo-30",
"io-lulc-annual-v02",
"io-lulc",
"landsat-c2-l1",
"landsat-c2-l2",
"naip",
"planet-nicfi-analytic",
"sentinel-1-rtc",
"sentinel-2-l2a",
"us-census",
# Microsoft Planetary Computer
"3dep-lidar-copc-pc",
"3dep-lidar-dsm-pc",
"cop-dem-glo-30-pc",
"io-lulc-annual-v02-pc",
"io-lulc-pc",
"landsat-c2-l1-pc",
"landsat-c2-l2-pc",
"naip-pc",
"planet-nicfi-analytic-pc",
"sentinel-1-rtc-pc",
"sentinel-2-l2a-pc",
"us-census-pc",
# Other
"umbra-sar",
]

CHUNK_SIZES = [2, DEFAULT_JSON_CHUNK_SIZE]
@@ -42,7 +45,7 @@
"collection_id,chunk_size", itertools.product(TEST_COLLECTIONS, CHUNK_SIZES)
)
def test_round_trip_read_write(collection_id: str, chunk_size: int):
-    with open(HERE / "data" / f"{collection_id}-pc.json") as f:
+    with open(HERE / "data" / f"{collection_id}.json") as f:
items = json.load(f)

table = parse_stac_items_to_arrow(items, chunk_size=chunk_size).read_all()
@@ -59,7 +62,7 @@ def test_round_trip_write_read_ndjson(
collection_id: str, chunk_size: int, tmp_path: Path
):
# First load into a STAC-GeoParquet table
-    path = HERE / "data" / f"{collection_id}-pc.json"
+    path = HERE / "data" / f"{collection_id}.json"
table = parse_stac_ndjson_to_arrow(path, chunk_size=chunk_size).read_all()

# Then write to disk
@@ -79,7 +82,7 @@ def test_round_trip_write_read_ndjson(

def test_table_contains_geoarrow_metadata():
collection_id = "naip"
-    with open(HERE / "data" / f"{collection_id}-pc.json") as f:
+    with open(HERE / "data" / f"{collection_id}.json") as f:
items = json.load(f)

table = parse_stac_items_to_arrow(items).read_all()
@@ -93,11 +96,11 @@ def test_table_contains_geoarrow_metadata():

@pytest.mark.parametrize("collection_id", TEST_COLLECTIONS)
def test_parse_json_to_arrow(collection_id: str):
-    path = HERE / "data" / f"{collection_id}-pc.json"
+    path = HERE / "data" / f"{collection_id}.json"
table = pa.Table.from_batches(parse_stac_ndjson_to_arrow(path))
items_result = list(stac_table_to_items(table))

-    with open(HERE / "data" / f"{collection_id}-pc.json") as f:
+    with open(HERE / "data" / f"{collection_id}.json") as f:
items = json.load(f)

for result, expected in zip(items_result, items):
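For reference, a sketch of the round trip these tests exercise against the new fixture; it assumes parse_stac_items_to_arrow and stac_table_to_items are importable from stac_geoparquet.arrow, as in this test module.

    import json
    from pathlib import Path

    from stac_geoparquet.arrow import parse_stac_items_to_arrow, stac_table_to_items

    path = Path("tests/data/umbra-sar.json")
    items = json.loads(path.read_text())

    # String datetimes are cast to timestamp("us", tz="UTC") on the way in...
    table = parse_stac_items_to_arrow(items).read_all()

    # ...and converted back to items on the way out
    for result, expected in zip(stac_table_to_items(table), items):
        assert result["id"] == expected["id"]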