diff --git a/src/nwbinspector/checks/time_series.py b/src/nwbinspector/checks/time_series.py index ce3366f2c..eb255f631 100644 --- a/src/nwbinspector/checks/time_series.py +++ b/src/nwbinspector/checks/time_series.py @@ -1,9 +1,10 @@ """Check functions that can apply to any descendant of TimeSeries.""" import numpy as np +import h5py from pynwb import TimeSeries +from hdmf.backends.hdf5.h5_utils import H5Dataset -# from ..tools import all_of_type from ..register_checks import register_check, Importance, Severity, InspectorMessage from ..utils import is_regular_series, is_ascending_series @@ -53,15 +54,30 @@ def check_timestamps_match_first_dimension(time_series: TimeSeries): Best Practice: :ref:`best_practice_data_orientation` """ - if ( - time_series.data is not None - and time_series.timestamps is not None - and np.array(time_series.data).shape[:1] != np.array(time_series.timestamps).shape - ): + if time_series.data is None or time_series.timestamps is None: + return + + is_lazy_type = any(isinstance(time_series.data, lazy_type) for lazy_type in [np.memmap, h5py.Dataset, H5Dataset]) + if is_lazy_type and time_series.data.shape[0] != time_series.timestamps.shape[0]: return InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message=( + f"The length of the first dimension of data ({time_series.data.shape[0]}) " + f"does not match the length of timestamps ({time_series.timestamps.shape[0]})." + ) ) + # object data is already loaded into memory - cast as numpy array to infer shaping + if not is_lazy_type: + data_shape = np.array(time_series.data).shape[0] + timestamps_shape = np.array(time_series.timestamps).shape[0] + if data_shape != timestamps_shape: + return InspectorMessage( + message=( + f"The length of the first dimension of data ({data_shape}) " + f"does not match the length of timestamps ({timestamps_shape})." + ) + ) + @register_check(importance=Importance.BEST_PRACTICE_VIOLATION, neurodata_type=TimeSeries) def check_timestamps_ascending(time_series: TimeSeries, nelems=200): diff --git a/src/nwbinspector/utils.py b/src/nwbinspector/utils.py index a0542d55e..4e2693171 100644 --- a/src/nwbinspector/utils.py +++ b/src/nwbinspector/utils.py @@ -12,6 +12,7 @@ import h5py import numpy as np from numpy.typing import ArrayLike +from hdmf.backends.hdf5.h5_utils import H5Dataset PathType = TypeVar("PathType", str, Path) # For types that can be either files or folders @@ -33,15 +34,19 @@ def _cache_data_retrieval_command( def _cache_data_selection(data: Union[h5py.Dataset, ArrayLike], selection: Union[slice, Tuple[slice]]) -> np.ndarray: """Extract the selection lazily from the data object for efficient caching (most beneficial during streaming).""" - if isinstance(data, np.memmap): # Technically np.memmap should be able to support this type of behavior as well - return data[selection] # But they aren't natively hashable either... - if not isinstance(data, h5py.Dataset): # No need to attempt to cache if already an in-memory object + if isinstance(data, np.memmap): # np.memmap objects are not hashable - simply return the selection lazily + return data[selection] + if not ( + isinstance(data, h5py.Dataset) or isinstance(data, H5Dataset) + ): # No need to attempt to cache if data is already in-memory + # Cast as numpy array for efficient fancy indexing + # Note that this technically copies the entire data, so could use more than 2x RAM for that object return np.array(data)[selection] - # slices also aren't hashable, but their reduced representation is - if isinstance(selection, slice): # If a single slice - reduced_selection = tuple([selection.__reduce__()[1]]) # if a single slice - else: + # Slices aren't hashable, but their reduced representation is + if isinstance(selection, slice): # A single slice + reduced_selection = tuple([selection.__reduce__()[1]]) + else: # Iterable of slices reduced_selection = tuple([selection_slice.__reduce__()[1] for selection_slice in selection]) return _cache_data_retrieval_command(data=data, reduced_selection=reduced_selection) diff --git a/tests/test_inspector.py b/tests/test_inspector.py index 90e86594a..c0dad5950 100644 --- a/tests/test_inspector.py +++ b/tests/test_inspector.py @@ -181,7 +181,7 @@ def test_inspect_all(self): file_path=self.nwbfile_paths[0], ), InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (4) does not match the length of timestamps (3).", importance=Importance.CRITICAL, severity=Severity.LOW, check_function_name="check_timestamps_match_first_dimension", @@ -248,7 +248,9 @@ def test_inspect_all_parallel(self): file_path=self.nwbfile_paths[0], ), InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message=( + "The length of the first dimension of data (4) does not match the length of timestamps (3)." + ), importance=Importance.CRITICAL, severity=Severity.LOW, check_function_name="check_timestamps_match_first_dimension", @@ -312,7 +314,7 @@ def test_inspect_nwb(self): file_path=self.nwbfile_paths[0], ), InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (4) does not match the length of timestamps (3).", importance=Importance.CRITICAL, check_function_name="check_timestamps_match_first_dimension", object_type="TimeSeries", @@ -343,7 +345,7 @@ def test_inspect_nwb_importance_threshold_as_importance(self): file_path=self.nwbfile_paths[0], ), InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (4) does not match the length of timestamps (3).", importance=Importance.CRITICAL, check_function_name="check_timestamps_match_first_dimension", object_type="TimeSeries", @@ -372,7 +374,7 @@ def test_inspect_nwb_importance_threshold_as_string(self): file_path=self.nwbfile_paths[0], ), InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (4) does not match the length of timestamps (3).", importance=Importance.CRITICAL, check_function_name="check_timestamps_match_first_dimension", object_type="TimeSeries", @@ -548,7 +550,7 @@ def test_inspect_nwb_dandi_config(self): file_path=self.nwbfile_paths[0], ), InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (4) does not match the length of timestamps (3).", importance=Importance.CRITICAL, check_function_name="check_timestamps_match_first_dimension", object_type="TimeSeries", diff --git a/tests/true_nwbinspector_default_report.txt b/tests/true_nwbinspector_default_report.txt index 11458303a..846255249 100644 --- a/tests/true_nwbinspector_default_report.txt +++ b/tests/true_nwbinspector_default_report.txt @@ -19,7 +19,7 @@ Found 5 issues over 2 files: Message: Data may be in the wrong orientation. Time should be in the first dimension, and is usually the longest dimension. Here, another dimension is longer. 0.1 ./testing0.nwb: check_timestamps_match_first_dimension - 'TimeSeries' object at location '/acquisition/test_time_series_3' - Message: The length of the first dimension of data does not match the length of timestamps. + Message: The length of the first dimension of data (4) does not match the length of timestamps (3). 1 BEST_PRACTICE_VIOLATION ========================== diff --git a/tests/unit_tests/test_time_series.py b/tests/unit_tests/test_time_series.py index b1dc81930..ea5dacd39 100644 --- a/tests/unit_tests/test_time_series.py +++ b/tests/unit_tests/test_time_series.py @@ -82,11 +82,11 @@ def test_check_timestamps(): time_series=pynwb.TimeSeries( name="test_time_series", unit="test_units", - data=np.zeros(shape=4), + data=np.empty(shape=4), timestamps=[1.0, 2.0, 3.0], ) ) == InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (4) does not match the length of timestamps (3).", importance=Importance.CRITICAL, check_function_name="check_timestamps_match_first_dimension", object_type="TimeSeries", @@ -99,7 +99,7 @@ def test_check_timestamps_empty_data(): assert check_timestamps_match_first_dimension( time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=[], timestamps=[1.0, 2.0, 3.0]) ) == InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (0) does not match the length of timestamps (3).", importance=Importance.CRITICAL, check_function_name="check_timestamps_match_first_dimension", object_type="TimeSeries", @@ -110,9 +110,9 @@ def test_check_timestamps_empty_data(): def test_check_timestamps_empty_timestamps(): assert check_timestamps_match_first_dimension( - time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=np.zeros(shape=4), timestamps=[]) + time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=np.empty(shape=4), timestamps=[]) ) == InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (4) does not match the length of timestamps (0).", importance=Importance.CRITICAL, check_function_name="check_timestamps_match_first_dimension", object_type="TimeSeries",