Commit 20dd501: Merge pull request #296 from NeurodataWithoutBorders/extend_h5py_dataset_type_check_on_caching

Debug lazy dataset class types and timestamps loading
CodyCBakerPhD authored Nov 11, 2022
2 parents ca878e0 + 13c4211 commit 20dd501
Showing 5 changed files with 49 additions and 26 deletions.
30 changes: 23 additions & 7 deletions src/nwbinspector/checks/time_series.py
@@ -1,9 +1,10 @@
"""Check functions that can apply to any descendant of TimeSeries."""
import numpy as np

import h5py
from pynwb import TimeSeries
from hdmf.backends.hdf5.h5_utils import H5Dataset

# from ..tools import all_of_type
from ..register_checks import register_check, Importance, Severity, InspectorMessage
from ..utils import is_regular_series, is_ascending_series

@@ -53,15 +54,30 @@ def check_timestamps_match_first_dimension(time_series: TimeSeries):
Best Practice: :ref:`best_practice_data_orientation`
"""
if (
time_series.data is not None
and time_series.timestamps is not None
and np.array(time_series.data).shape[:1] != np.array(time_series.timestamps).shape
):
if time_series.data is None or time_series.timestamps is None:
return

is_lazy_type = any(isinstance(time_series.data, lazy_type) for lazy_type in [np.memmap, h5py.Dataset, H5Dataset])
if is_lazy_type and time_series.data.shape[0] != time_series.timestamps.shape[0]:
return InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message=(
f"The length of the first dimension of data ({time_series.data.shape[0]}) "
f"does not match the length of timestamps ({time_series.timestamps.shape[0]})."
)
)

# object data is already loaded into memory - cast as numpy array to infer shaping
if not is_lazy_type:
data_shape = np.array(time_series.data).shape[0]
timestamps_shape = np.array(time_series.timestamps).shape[0]
if data_shape != timestamps_shape:
return InspectorMessage(
message=(
f"The length of the first dimension of data ({data_shape}) "
f"does not match the length of timestamps ({timestamps_shape})."
)
)


@register_check(importance=Importance.BEST_PRACTICE_VIOLATION, neurodata_type=TimeSeries)
def check_timestamps_ascending(time_series: TimeSeries, nelems=200):
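For illustration, a minimal standalone sketch (not the library code; the helper name, LAZY_TYPES tuple, and the in-memory HDF5 setup are invented for this example) of the dispatch the updated check performs: lazy dataset types such as np.memmap and h5py.Dataset expose .shape without reading any data, while in-memory objects are cast to a numpy array first. The real check also treats hdmf's H5Dataset wrapper as lazy.

import numpy as np
import h5py

# Lazy dataset types expose .shape without loading their contents into memory.
LAZY_TYPES = (np.memmap, h5py.Dataset)


def first_dimension_mismatch(data, timestamps) -> bool:
    """Return True when the length of data along axis 0 differs from the length of timestamps."""
    timestamps_length = np.asarray(timestamps).shape[0]
    if isinstance(data, LAZY_TYPES):
        return data.shape[0] != timestamps_length  # shape lookup only; no data is read
    return np.array(data).shape[0] != timestamps_length  # in-memory fallback copies the data


# Usage with an in-memory HDF5 file (driver="core" with backing_store=False avoids disk I/O)
with h5py.File("example.h5", "w", driver="core", backing_store=False) as file:
    dataset = file.create_dataset("data", data=np.empty(shape=(4, 2)))
    print(first_dimension_mismatch(dataset, timestamps=[1.0, 2.0, 3.0]))  # True: 4 != 3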
19 changes: 12 additions & 7 deletions src/nwbinspector/utils.py
@@ -12,6 +12,7 @@
import h5py
import numpy as np
from numpy.typing import ArrayLike
from hdmf.backends.hdf5.h5_utils import H5Dataset


PathType = TypeVar("PathType", str, Path) # For types that can be either files or folders
@@ -33,15 +34,19 @@ def _cache_data_retrieval_command(

def _cache_data_selection(data: Union[h5py.Dataset, ArrayLike], selection: Union[slice, Tuple[slice]]) -> np.ndarray:
"""Extract the selection lazily from the data object for efficient caching (most beneficial during streaming)."""
if isinstance(data, np.memmap): # Technically np.memmap should be able to support this type of behavior as well
return data[selection] # But they aren't natively hashable either...
if not isinstance(data, h5py.Dataset): # No need to attempt to cache if already an in-memory object
if isinstance(data, np.memmap): # np.memmap objects are not hashable - simply return the selection lazily
return data[selection]
if not (
isinstance(data, h5py.Dataset) or isinstance(data, H5Dataset)
): # No need to attempt to cache if data is already in-memory
# Cast as numpy array for efficient fancy indexing
# Note that this technically copies the entire data, so could use more than 2x RAM for that object
return np.array(data)[selection]

# slices also aren't hashable, but their reduced representation is
if isinstance(selection, slice): # If a single slice
reduced_selection = tuple([selection.__reduce__()[1]]) # if a single slice
else:
# Slices aren't hashable, but their reduced representation is
if isinstance(selection, slice): # A single slice
reduced_selection = tuple([selection.__reduce__()[1]])
else: # Iterable of slices
reduced_selection = tuple([selection_slice.__reduce__()[1] for selection_slice in selection])
return _cache_data_retrieval_command(data=data, reduced_selection=reduced_selection)

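A minimal standalone sketch (illustrative function name; not the library's _cache_data_retrieval_command) of why the selection is reduced before caching: slice objects are not hashable, but the (start, stop, step) tuple produced by slice.__reduce__() is, so it can serve as part of an lru_cache key.

from functools import lru_cache


@lru_cache(maxsize=None)
def _cached_selection(reduced_selection: tuple) -> tuple:
    # Rebuild the slice from its hashable (start, stop, step) form and apply it to a
    # stand-in data source; the real helper reads from the h5py/H5Dataset object instead.
    start, stop, step = reduced_selection[0]
    return tuple(range(100))[slice(start, stop, step)]


selection = slice(2, 6)
reduced_selection = tuple([selection.__reduce__()[1]])  # ((2, 6, None),) - hashable
print(_cached_selection(reduced_selection))  # (2, 3, 4, 5); repeated calls hit the cache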
14 changes: 8 additions & 6 deletions tests/test_inspector.py
@@ -181,7 +181,7 @@ def test_inspect_all(self):
file_path=self.nwbfile_paths[0],
),
InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
importance=Importance.CRITICAL,
severity=Severity.LOW,
check_function_name="check_timestamps_match_first_dimension",
@@ -248,7 +248,9 @@ def test_inspect_all_parallel(self):
file_path=self.nwbfile_paths[0],
),
InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message=(
"The length of the first dimension of data (4) does not match the length of timestamps (3)."
),
importance=Importance.CRITICAL,
severity=Severity.LOW,
check_function_name="check_timestamps_match_first_dimension",
@@ -312,7 +314,7 @@ def test_inspect_nwb(self):
file_path=self.nwbfile_paths[0],
),
InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
importance=Importance.CRITICAL,
check_function_name="check_timestamps_match_first_dimension",
object_type="TimeSeries",
@@ -343,7 +345,7 @@ def test_inspect_nwb_importance_threshold_as_importance(self):
file_path=self.nwbfile_paths[0],
),
InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
importance=Importance.CRITICAL,
check_function_name="check_timestamps_match_first_dimension",
object_type="TimeSeries",
@@ -372,7 +374,7 @@ def test_inspect_nwb_importance_threshold_as_string(self):
file_path=self.nwbfile_paths[0],
),
InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
importance=Importance.CRITICAL,
check_function_name="check_timestamps_match_first_dimension",
object_type="TimeSeries",
@@ -548,7 +550,7 @@ def test_inspect_nwb_dandi_config(self):
file_path=self.nwbfile_paths[0],
),
InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
importance=Importance.CRITICAL,
check_function_name="check_timestamps_match_first_dimension",
object_type="TimeSeries",
2 changes: 1 addition & 1 deletion tests/true_nwbinspector_default_report.txt
@@ -19,7 +19,7 @@ Found 5 issues over 2 files:
Message: Data may be in the wrong orientation. Time should be in the first dimension, and is usually the longest dimension. Here, another dimension is longer.

0.1 ./testing0.nwb: check_timestamps_match_first_dimension - 'TimeSeries' object at location '/acquisition/test_time_series_3'
Message: The length of the first dimension of data does not match the length of timestamps.
Message: The length of the first dimension of data (4) does not match the length of timestamps (3).

1 BEST_PRACTICE_VIOLATION
==========================
10 changes: 5 additions & 5 deletions tests/unit_tests/test_time_series.py
@@ -82,11 +82,11 @@ def test_check_timestamps():
time_series=pynwb.TimeSeries(
name="test_time_series",
unit="test_units",
data=np.zeros(shape=4),
data=np.empty(shape=4),
timestamps=[1.0, 2.0, 3.0],
)
) == InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
importance=Importance.CRITICAL,
check_function_name="check_timestamps_match_first_dimension",
object_type="TimeSeries",
@@ -99,7 +99,7 @@ def test_check_timestamps_empty_data():
assert check_timestamps_match_first_dimension(
time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=[], timestamps=[1.0, 2.0, 3.0])
) == InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (0) does not match the length of timestamps (3).",
importance=Importance.CRITICAL,
check_function_name="check_timestamps_match_first_dimension",
object_type="TimeSeries",
@@ -110,9 +110,9 @@

def test_check_timestamps_empty_timestamps():
assert check_timestamps_match_first_dimension(
time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=np.zeros(shape=4), timestamps=[])
time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=np.empty(shape=4), timestamps=[])
) == InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (4) does not match the length of timestamps (0).",
importance=Importance.CRITICAL,
check_function_name="check_timestamps_match_first_dimension",
object_type="TimeSeries",
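A hypothetical extra unit test, not part of this diff, sketching the lazy branch the commit adds: with h5py.Dataset-backed data (and array timestamps, since that branch reads .shape from both), the check compares shape[0] values directly rather than casting to a numpy array. Import paths mirror the changed files; the in-memory HDF5 driver and the test name are assumptions made to keep the sketch self-contained.

import h5py
import numpy as np
import pynwb

from nwbinspector.checks.time_series import check_timestamps_match_first_dimension


def test_check_timestamps_h5py_dataset_data():
    # driver="core" with backing_store=False keeps the HDF5 file entirely in memory
    with h5py.File("test_lazy.h5", "w", driver="core", backing_store=False) as file:
        dataset = file.create_dataset("data", data=np.empty(shape=4))
        time_series = pynwb.TimeSeries(
            name="test_time_series",
            unit="test_units",
            data=dataset,
            timestamps=np.array([1.0, 2.0, 3.0]),
        )
        result = check_timestamps_match_first_dimension(time_series=time_series)
        assert result.message == (
            "The length of the first dimension of data (4) does not match the length of timestamps (3)."
        )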
