Skip to content

Commit

Permalink
Merge pull request #131 from nansencenter/optimize_netcdf_shape_retri…
Browse files Browse the repository at this point in the history
…eval

Optimize netcdf shape retrieval
  • Loading branch information
aperrin66 authored Dec 21, 2023
2 parents ef4a625 + 72a6459 commit eb8b4e0
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 47 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
env:
BASE_IMAGE: "${{ vars.DOCKER_ORG }}/geospaas:2.5.2-python${{ matrix.python_version }}"
IMAGE_NAME: "${{ vars.DOCKER_ORG }}/geospaas_harvesting"
METANORM_VERSION: '4.1.0'
METANORM_VERSION: '4.2.0'
GEOSPAAS_DB_HOST: 'db'
GEOSPAAS_DB_USER: 'test'
GEOSPAAS_DB_PASSWORD: "${{ secrets.GEOSPAAS_DB_PASSWORD }}"
Expand Down
6 changes: 5 additions & 1 deletion geospaas_harvesting/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,12 @@ providers:
latitude_attribute: 'lat'
nansat:
type: 'nansat'
nextsim:
netcdf_l:
type: 'netcdf'
longitude_attribute: 'longitude'
latitude_attribute: 'latitude'
netcdf_L:
type: 'netcdf'
longitude_attribute: 'LONGITUDE'
latitude_attribute: 'LATITUDE'
...
56 changes: 20 additions & 36 deletions geospaas_harvesting/providers/local.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Code for searching local files"""
import itertools
import json
import logging
import uuid
Expand All @@ -7,9 +8,9 @@
import dateutil.parser
import netCDF4
import numpy as np
import shapely.wkt
from dateutil.tz import tzutc
from django.contrib.gis.geos import GEOSGeometry, LineString, MultiPoint
from django.contrib.gis.geos.point import Point
from shapely.geometry import MultiPoint

import geospaas.catalog.managers as catalog_managers
import pythesint as pti
Expand Down Expand Up @@ -117,8 +118,8 @@ def get_normalized_attributes(self, dataset_info, **kwargs):
# Find coverage to set number of points in the geolocation
if nansat_object.vrt.dataset.GetGCPs():
nansat_object.reproject_gcps()
normalized_attributes['location_geometry'] = GEOSGeometry(
nansat_object.get_border_wkt(n_points=n_points), srid=4326)
normalized_attributes['location_geometry'] = shapely.wkt.loads(
nansat_object.get_border_wkt(n_points=n_points))

json_dumped_dataset_parameters = n_metadata.get('dataset_parameters', None)
if json_dumped_dataset_parameters:
Expand Down Expand Up @@ -150,8 +151,8 @@ def __init__(self, *args, **kwargs):

# --------- get metadata ---------
def _get_geometry_wkt(self, dataset):
longitudes = dataset.variables[self.longitude_attribute]
latitudes = dataset.variables[self.latitude_attribute]
longitudes = dataset.variables[self.longitude_attribute][:]
latitudes = dataset.variables[self.latitude_attribute][:]

lonlat_dependent_data = False
for nc_variable_name, nc_variable_value in dataset.variables.items():
Expand All @@ -165,43 +166,26 @@ def _get_geometry_wkt(self, dataset):
# longitude, the longitude and latitude arrays are combined to
# find all the data points
if lonlat_dependent_data:
points = []
for lon in longitudes:
for lat in latitudes:
points.append(Point(float(lon), float(lat), srid=4326))
geometry = MultiPoint(points, srid=4326).convex_hull
valid_lon = longitudes.compressed() if np.ma.isMaskedArray(longitudes) else longitudes
valid_lat = latitudes.compressed() if np.ma.isMaskedArray(latitudes) else latitudes
points = list(itertools.product(valid_lon, valid_lat))
# If the longitude and latitude variables have the same shape,
# we assume that they contain the coordinates for each data
# point
elif longitudes.shape == latitudes.shape:
points = []
lat_fil_value = latitudes[:].fill_value if np.ma.isMaskedArray(latitudes[:]) else None
lon_fil_value = longitudes[:].fill_value if np.ma.isMaskedArray(longitudes[:]) else None
# In this case numpy.nditer() works like zip() for
# multi-dimensional arrays.
# It does not keep the masking information
for lon, lat in np.nditer((longitudes, latitudes), flags=['buffered']):
# Skip points containing masked values
# It needs to be excluded from coverage because of a
# Python optimization which causes the "continue" line to
# never be executed
if lon_fil_value == lon or lat_fil_value == lat:
continue # pragma: no cover
new_point = Point(float(lon), float(lat), srid=4326)
# Don't add duplicate points in trajectories
if not points or new_point != points[-1]:
points.append(new_point)

if len(longitudes.shape) == 1:
if len(points) == 1:
geometry = points[0]
masks = []
for l in (longitudes, latitudes):
if np.ma.isMaskedArray(l):
masks.append(l.mask)
else:
geometry = LineString(points, srid=4326)
else:
geometry = MultiPoint(points, srid=4326).convex_hull
masks.append(np.full(l.shape, False))
combined_mask = np.logical_or(*masks)
points = np.array(np.nditer((longitudes[~combined_mask],
latitudes[~combined_mask]),
flags=['buffered']))
else:
raise ValueError("Could not determine the spatial coverage")

geometry = MultiPoint(points).convex_hull
return geometry.wkt

def _get_raw_attributes(self, dataset_path):
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
'PyYAML',
'requests_oauthlib',
'requests',
'shapely',
],
package_data={'': ['*.yml']},
)
17 changes: 8 additions & 9 deletions tests/providers/test_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from datetime import datetime, timezone

import numpy as np
from django.contrib.gis.geos import GEOSGeometry
import shapely.wkt
from geospaas.catalog.managers import (FILE_SERVICE_NAME,
LOCAL_FILE_SERVICE,
DAP_SERVICE_NAME,
Expand Down Expand Up @@ -121,8 +121,8 @@ def test_normalize_netcdf_attributes_with_nansat(self):
self.assertEqual(normalized_attributes['platform']['Category'], 'Models/Analyses')
self.assertEqual(normalized_attributes['platform']['Series_Entity'], '')

expected_geometry = GEOSGeometry((
'POLYGON((24.88 68.08,22.46 68.71,19.96 69.31,17.39 69.87,24.88 68.08))'), srid=4326)
expected_geometry = shapely.wkt.loads(
'POLYGON((24.88 68.08,22.46 68.71,19.96 69.31,17.39 69.87,24.88 68.08))')

# This fails, which is why string representations are compared. Any explanation is welcome.
# self.assertTrue(normalized_attributes['location_geometry'].equals(expected_geometry))
Expand Down Expand Up @@ -415,7 +415,7 @@ def test_get_trajectory(self):
}
self.assertEqual(
self.crawler._get_geometry_wkt(mock_dataset),
'LINESTRING (1 2, 3 4, 5 6)')
'LINESTRING (1 2, 5 6)')

def test_get_point(self):
"""Test getting a WKT point when the shape of the latitude and
Expand Down Expand Up @@ -493,17 +493,16 @@ def test_get_polygon_from_coordinates_lists_with_masked_array(self, mock_isMaske
@mock.patch('geospaas_harvesting.providers.local.np.ma.isMaskedArray', return_value=True)
def test_get_polygon_from_coordinates_lists_with_masked_array_1d_case(self, mock_isMaskedArray):
"""Test getting a polygonal coverage from a dataset when the
latitude and longitude are 1d masked_array as an abstracted version of 2d lon and lat values
latitude and longitude are 1d masked_array as an abstracted
version of 2d lon and lat values
"""
mock_dataset = mock.Mock()
mock_dataset.dimensions = {}
mock_dataset.variables = {
'LONGITUDE': self.MaskedMockVariable(
(1,1e10, 1e10, 2, 0, 3, 1),dimensions=['LONGITUDE','LATITUDE']
),
(1, 1e10, 1e10, 2, 0, 3, 1), dimensions=['LONGITUDE','LATITUDE']),
'LATITUDE': self.MaskedMockVariable(
(1, 1e10, 1e10, 4, 0, 4, 1),dimensions=['LONGITUDE','LATITUDE']
),
(1, 1e10, 1e10, 4, 0, 4, 1), dimensions=['LONGITUDE','LATITUDE']),
}
self.assertEqual(
self.crawler._get_geometry_wkt(mock_dataset),
Expand Down

0 comments on commit eb8b4e0

Please sign in to comment.