add read-in functionality for deeply nested variables #281

Merged · 13 commits · Mar 17, 2022
2 changes: 1 addition & 1 deletion doc/source/example_notebooks/IS2_data_read-in.ipynb
@@ -426,7 +426,7 @@
"\n",
"***ATTENTION: icepyx loads your data by creating an Xarray DataSet for each input granule and then merging them. In some cases, the automatic merge fails and needs to be handled manually. In these cases, icepyx will return a warning with the error message from the failed Xarray merge and a list of per-granule DataSets***\n",
"\n",
"This can happen if you unintentionally provide the same granule multiple times with different filenames."
"This can happen if you unintentionally provide the same granule multiple times with different filenames or in segmented products where the rgt+cycle automatically generated `gran_idx` values match. In this latter case, you can simply provide unique `gran_idx` values for each DataSet in `ds` and run `import xarray as xr` and `ds_merged = xr.merge(ds)` to create one merged DataSet."
]
},
{
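A minimal sketch of the manual-merge workaround described in the notebook cell above, assuming `ds` is the list of per-granule Datasets that icepyx returns alongside the merge warning (the integer `gran_idx` values are arbitrary; they just need to be unique):

```python
import xarray as xr

# `ds` is assumed to be the list of per-granule Datasets from the warning.
# Give each Dataset a unique gran_idx so the coordinates no longer collide...
for i, gran_ds in enumerate(ds):
    gran_ds["gran_idx"] = [i]

# ...then the merge that failed automatically can be run manually.
ds_merged = xr.merge(ds)
```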
2 changes: 1 addition & 1 deletion icepyx/core/is2ref.py
@@ -259,7 +259,7 @@ def _default_varlists(product):

else:
print(
"THE REQUESTED PRODUCT DOES NOT YET HAVE A DEFAULT LIST SET UP. ONLY DELTA_TIME, LATITUTDE, AND LONGITUDE WILL BE RETURNED"
"THE REQUESTED PRODUCT DOES NOT YET HAVE A DEFAULT LIST SET UP. ONLY DELTA_TIME, LATITUDE, AND LONGITUDE WILL BE RETURNED"
)
return common_list

2 changes: 1 addition & 1 deletion icepyx/core/query.py
@@ -1017,7 +1017,7 @@ def download_granules(
by default when subset=True, but additional subsetting options are available.
Spatial subsetting returns all data that are within the area of interest (but not complete
granules; this eliminates false-positive granules returned by the metadata-level search).
restart: boolean, default false
restart : boolean, default False
If a previous download was terminated unexpectedly, run again with restart set to True to continue.
**kwargs : key-value pairs
Additional parameters to be passed to the subsetter.
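A hypothetical usage sketch of the `restart` flag documented above (`reg_a` is an existing icepyx `Query` object and `./download` is an assumed output directory):

```python
# Resume an order download that was terminated unexpectedly.
reg_a.download_granules("./download")                # interrupted partway through
reg_a.download_granules("./download", restart=True)  # continue the same download
```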
88 changes: 71 additions & 17 deletions icepyx/core/read.py
@@ -311,9 +311,9 @@ def _check_source_for_pattern(source, filename_pattern):
return False, None

@staticmethod
def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
def _add_vars_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
"""
Add the new variable group to the dataset template.
Add the new variables in the group to the dataset template.

Parameters
----------
@@ -336,11 +336,9 @@ def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
Xarray Dataset with variables from the ds variable group added.
"""

wanted_vars = list(wanted_dict.keys())

if grp_path in ["orbit_info", "ancillary_data"]:
grp_spec_vars = [
wanted_vars[i]
wanted_groups_tiered[-1][i]
for i, x in enumerate(wanted_groups_tiered[0])
if x == grp_path
]
@@ -389,9 +387,10 @@ def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
# add a test for the new function (called here)!

grp_spec_vars = [
k for k, v in wanted_dict.items() if any(grp_path in x for x in v)
k
for k, v in wanted_dict.items()
if any(f"{grp_path}/{k}" in x for x in v)
]
# print(grp_spec_vars)

ds = (
ds.reset_coords(drop=False)
@@ -400,17 +399,57 @@ def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
.assign(gt=(("gran_idx", "spot"), [[gt_str]]))
)

# print(ds)
grp_spec_vars.append("gt")
is2ds = is2ds.merge(
ds[grp_spec_vars], join="outer", combine_attrs="no_conflicts"
)
# print(is2ds)

# re-cast some dtypes to make array smaller
is2ds["gt"] = is2ds.gt.astype(str)
is2ds["spot"] = is2ds.spot.astype(np.uint8)

return is2ds, ds[grp_spec_vars]

@staticmethod
def _combine_nested_vars(is2ds, ds, grp_path, wanted_dict):
"""
Add the deeply nested variables to a dataset with appropriate coordinate information.

Parameters
----------
is2ds : Xarray dataset
Dataset to add deeply nested variables to.
ds : Xarray dataset
Dataset containing proper dimensions for the variables being added
grp_path : str
hdf5 group path read into ds
wanted_dict : dict
Dictionary with variable names as keys and a list of group + variable paths containing those variables as values.

Returns
-------
Xarray Dataset with variables from the ds variable group added.
"""

grp_spec_vars = [
k for k, v in wanted_dict.items() if any(f"{grp_path}/{k}" in x for x in v)
]

# # Use this to handle issues specific to group paths that are more nested
# tiers = len(wanted_groups_tiered)
# if tiers > 3 and grp_path.count("/") == tiers - 2:
# # Handle attribute conflicts that arose from data descriptions during merging
# for var in grp_spec_vars:
# ds[var].attrs = ds.attrs
# for k in ds[var].attrs.keys():
# ds.attrs.pop(k)
# # warnings.warn(
# # "Due to the number of layers of variable group paths, some attributes have been dropped from your DataSet during merging",
# # UserWarning,
# # )

is2ds = is2ds.assign(ds[grp_spec_vars])

return is2ds
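A self-contained toy example (hypothetical `wanted_dict` paths, in the spirit of ATL06) of how the `grp_spec_vars` comprehensions in `_add_vars_to_ds` and `_combine_nested_vars` select only the variables that live directly under `grp_path`, rather than any variable whose path merely contains the group name:

```python
# Hypothetical wanted_dict: variable name -> list of full group/variable paths.
wanted_dict = {
    "h_li": ["gt1l/land_ice_segments/h_li"],
    "sigma_geo_h": ["gt1l/land_ice_segments/sigma_geo_h"],
    "dac": ["gt1l/land_ice_segments/geophysical/dac"],
}

grp_path = "gt1l/land_ice_segments"
grp_spec_vars = [
    k for k, v in wanted_dict.items() if any(f"{grp_path}/{k}" in x for x in v)
]
print(grp_spec_vars)  # ['h_li', 'sigma_geo_h']
# 'dac' is only selected when grp_path == 'gt1l/land_ice_segments/geophysical',
# which is what lets the deeply nested group be handled separately.
```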

def load(self):
@@ -485,7 +524,7 @@ def _build_dataset_template(self, file):
)
return is2ds

def _read_single_var(self, file, grp_path):
def _read_single_grp(self, file, grp_path):
"""
For a given file and variable group path, construct an Intake catalog and use it to read in the data.

@@ -519,12 +558,10 @@ def _read_single_var(self, file, grp_path):
grp_paths=grp_path,
extra_engine_kwargs={"phony_dims": "access"},
)

ds = grpcat[self._source_type].read()

return ds

# NOTE: for non-gridded datasets only
def _build_single_file_dataset(self, file, groups_list):
"""
Create a single xarray dataset with all of the wanted variables/groups from the wanted var list for a single data file/url.
@@ -544,7 +581,7 @@ def _build_single_file_dataset(self, file, groups_list):
Xarray Dataset
"""

file_product = self._read_single_var(file, "/").attrs["identifier_product_type"]
file_product = self._read_single_grp(file, "/").attrs["identifier_product_type"]
assert (
file_product == self._prod
), "Your product specification does not match the product specification within your files."
@@ -577,13 +614,30 @@ def _build_single_file_dataset(self, file, groups_list):
wanted_groups_set = set(wanted_groups)
# orbit_info is used automatically as the first group path so the info is available for the rest of the groups
wanted_groups_set.remove("orbit_info")
# Note: the sorting is critical for datasets with highly nested groups
wanted_groups_list = ["orbit_info"] + sorted(wanted_groups_set)
# returns the wanted groups as a list of lists with group path string elements separated
_, wanted_groups_tiered = Variables.parse_var_list(groups_list, tiered=True)
_, wanted_groups_tiered = Variables.parse_var_list(
groups_list, tiered=True, tiered_vars=True
)

for grp_path in ["orbit_info"] + list(wanted_groups_set):
ds = self._read_single_var(file, grp_path)
is2ds = Read._add_var_to_ds(
while wanted_groups_list:
grp_path = wanted_groups_list[0]
wanted_groups_list = wanted_groups_list[1:]
ds = self._read_single_grp(file, grp_path)
is2ds, ds = Read._add_vars_to_ds(
is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict
)

# if there are any deeper nested variables, get those so they have actual coordinates and add them
if any(grp_path in grp_path2 for grp_path2 in wanted_groups_list):
for grp_path2 in wanted_groups_list[:]:  # iterate over a copy; removing items below would otherwise skip entries
if grp_path in grp_path2:
sub_ds = self._read_single_grp(file, grp_path2)
ds = Read._combine_nested_vars(
ds, sub_ds, grp_path2, wanted_dict
)
wanted_groups_list.remove(grp_path2)
is2ds = is2ds.merge(ds, join="outer", combine_attrs="no_conflicts")

return is2ds
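A stripped-down sketch (no file I/O; hypothetical group paths) of the traversal implemented in `_build_single_file_dataset` above: sorting puts parent groups before their nested children, each parent group is read and its variables added, and any deeper-nested child groups are folded into the parent's Dataset before the outer merge:

```python
wanted_groups_list = ["orbit_info"] + sorted(
    {"gt1l/land_ice_segments", "gt1l/land_ice_segments/geophysical"}
)

while wanted_groups_list:
    grp_path = wanted_groups_list.pop(0)
    print("read group and add its variables:", grp_path)
    # iterate over a copy so that removing items doesn't skip entries
    for grp_path2 in wanted_groups_list[:]:
        if grp_path in grp_path2:
            print("  fold nested group into the same Dataset:", grp_path2)
            wanted_groups_list.remove(grp_path2)
```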
14 changes: 12 additions & 2 deletions icepyx/core/variables.py
@@ -135,7 +135,7 @@ def visitor_func(name, node):
return self._avail

@staticmethod
def parse_var_list(varlist, tiered=True):
def parse_var_list(varlist, tiered=True, tiered_vars=False):
"""
Parse a list of path strings into tiered lists and names of variables

@@ -149,6 +149,11 @@ def parse_var_list(varlist, tiered=True):
(e.g. [['orbit_info', 'ancillary_data', 'gt1l'],['none','none','land_ice_segments']])
or a single list of path strings (e.g. ['orbit_info','ancillary_data','gt1l/land_ice_segments'])

tiered_vars : boolean, default False
Whether or not to append a list of the variable names to the nested list of component strings
(e.g. [['orbit_info', 'ancillary_data', 'gt1l'],['none','none','land_ice_segments'],
['sc_orient','atlas_sdp_gps_epoch','h_li']])

Examples
--------
>>> reg_a = ipx.Query('ATL06',[-55, 68, -48, 71],['2019-02-20','2019-02-28'], version='1') # doctest: +SKIP
@@ -215,7 +220,10 @@ def parse_var_list(varlist, tiered=True):
else:
num = np.max([v.count("/") for v in varlist])
# print('max needed: ' + str(num))
paths = [[] for i in range(num)]
if tiered_vars == True:
paths = [[] for i in range(num + 1)]
else:
paths = [[] for i in range(num)]

# print(self._cust_options['variables'])
for vn in varlist:
@@ -237,6 +245,8 @@ def parse_var_list(varlist, tiered=True):
for i in range(j, num):
paths[i].append("none")
i = i + 1
if tiered_vars == True:
paths[num].append(vkey)

return vgrp, paths
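A small, self-contained illustration (a toy re-implementation, not the icepyx code) of the output shape that `tiered_vars=True` adds, reproducing the docstring example above:

```python
varlist = [
    "orbit_info/sc_orient",
    "ancillary_data/atlas_sdp_gps_epoch",
    "gt1l/land_ice_segments/h_li",
]

num = max(v.count("/") for v in varlist)  # deepest level of group nesting
paths = [[] for _ in range(num + 1)]      # extra row holds the variable names

for vn in varlist:
    *groups, vkey = vn.split("/")
    groups += ["none"] * (num - len(groups))  # pad shallower paths
    for i, grp in enumerate(groups):
        paths[i].append(grp)
    paths[num].append(vkey)                   # the tiered_vars row

print(paths)
# [['orbit_info', 'ancillary_data', 'gt1l'],
#  ['none', 'none', 'land_ice_segments'],
#  ['sc_orient', 'atlas_sdp_gps_epoch', 'h_li']]
```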

Expand Down