Commit 20dd501: Merge pull request #296 from NeurodataWithoutBorders/extend_h5py_dataset_type_check_on_caching

Debug lazy dataset class types and timestamps loading
CodyCBakerPhD authored Nov 11, 2022
2 parents ca878e0 + 13c4211 commit 20dd501
Showing 5 changed files with 49 additions and 26 deletions.
30 changes: 23 additions & 7 deletions src/nwbinspector/checks/time_series.py
@@ -1,9 +1,10 @@
"""Check functions that can apply to any descendant of TimeSeries."""
import numpy as np

import h5py
from pynwb import TimeSeries
from hdmf.backends.hdf5.h5_utils import H5Dataset

# from ..tools import all_of_type
from ..register_checks import register_check, Importance, Severity, InspectorMessage
from ..utils import is_regular_series, is_ascending_series

@@ -53,15 +54,30 @@ def check_timestamps_match_first_dimension(time_series: TimeSeries):
Best Practice: :ref:`best_practice_data_orientation`
"""
if (
time_series.data is not None
and time_series.timestamps is not None
and np.array(time_series.data).shape[:1] != np.array(time_series.timestamps).shape
):
if time_series.data is None or time_series.timestamps is None:
return

is_lazy_type = any(isinstance(time_series.data, lazy_type) for lazy_type in [np.memmap, h5py.Dataset, H5Dataset])
if is_lazy_type and time_series.data.shape[0] != time_series.timestamps.shape[0]:
return InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message=(
f"The length of the first dimension of data ({time_series.data.shape[0]}) "
f"does not match the length of timestamps ({time_series.timestamps.shape[0]})."
)
)

# object data is already loaded into memory - cast as numpy array to infer shaping
if not is_lazy_type:
data_shape = np.array(time_series.data).shape[0]
timestamps_shape = np.array(time_series.timestamps).shape[0]
if data_shape != timestamps_shape:
return InspectorMessage(
message=(
f"The length of the first dimension of data ({data_shape}) "
f"does not match the length of timestamps ({timestamps_shape})."
)
)


@register_check(importance=Importance.BEST_PRACTICE_VIOLATION, neurodata_type=TimeSeries)
def check_timestamps_ascending(time_series: TimeSeries, nelems=200):
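For illustration, a minimal standalone sketch (not the library code; the helper name, LAZY_TYPES tuple, and the in-memory HDF5 setup are invented for this example) of the dispatch the updated check performs: lazy dataset types such as np.memmap and h5py.Dataset expose .shape without reading any data, while in-memory objects are cast to a numpy array first. The real check also treats hdmf's H5Dataset wrapper as lazy.

import numpy as np
import h5py

# Lazy dataset types expose .shape without loading their contents into memory.
LAZY_TYPES = (np.memmap, h5py.Dataset)


def first_dimension_mismatch(data, timestamps) -> bool:
    """Return True when the length of data along axis 0 differs from the length of timestamps."""
    timestamps_length = np.asarray(timestamps).shape[0]
    if isinstance(data, LAZY_TYPES):
        return data.shape[0] != timestamps_length  # shape lookup only; no data is read
    return np.array(data).shape[0] != timestamps_length  # in-memory fallback copies the data


# Usage with an in-memory HDF5 file (driver="core" with backing_store=False avoids disk I/O)
with h5py.File("example.h5", "w", driver="core", backing_store=False) as file:
    dataset = file.create_dataset("data", data=np.empty(shape=(4, 2)))
    print(first_dimension_mismatch(dataset, timestamps=[1.0, 2.0, 3.0]))  # True: 4 != 3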
19 changes: 12 additions & 7 deletions src/nwbinspector/utils.py
@@ -12,6 +12,7 @@
import h5py
import numpy as np
from numpy.typing import ArrayLike
from hdmf.backends.hdf5.h5_utils import H5Dataset


PathType = TypeVar("PathType", str, Path) # For types that can be either files or folders
@@ -33,15 +34,19 @@ def _cache_data_retrieval_command(

def _cache_data_selection(data: Union[h5py.Dataset, ArrayLike], selection: Union[slice, Tuple[slice]]) -> np.ndarray:
"""Extract the selection lazily from the data object for efficient caching (most beneficial during streaming)."""
if isinstance(data, np.memmap): # Technically np.memmap should be able to support this type of behavior as well
return data[selection] # But they aren't natively hashable either...
if not isinstance(data, h5py.Dataset): # No need to attempt to cache if already an in-memory object
if isinstance(data, np.memmap): # np.memmap objects are not hashable - simply return the selection lazily
return data[selection]
if not (
isinstance(data, h5py.Dataset) or isinstance(data, H5Dataset)
): # No need to attempt to cache if data is already in-memory
# Cast as numpy array for efficient fancy indexing
# Note that this technically copies the entire data, so could use more than 2x RAM for that object
return np.array(data)[selection]

# slices also aren't hashable, but their reduced representation is
if isinstance(selection, slice): # If a single slice
reduced_selection = tuple([selection.__reduce__()[1]]) # if a single slice
else:
# Slices aren't hashable, but their reduced representation is
if isinstance(selection, slice): # A single slice
reduced_selection = tuple([selection.__reduce__()[1]])
else: # Iterable of slices
reduced_selection = tuple([selection_slice.__reduce__()[1] for selection_slice in selection])
return _cache_data_retrieval_command(data=data, reduced_selection=reduced_selection)

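A minimal standalone sketch (illustrative function name; not the library's _cache_data_retrieval_command) of why the selection is reduced before caching: slice objects are not hashable, but the (start, stop, step) tuple produced by slice.__reduce__() is, so it can serve as part of an lru_cache key.

from functools import lru_cache


@lru_cache(maxsize=None)
def _cached_selection(reduced_selection: tuple) -> tuple:
    # Rebuild the slice from its hashable (start, stop, step) form and apply it to a
    # stand-in data source; the real helper reads from the h5py/H5Dataset object instead.
    start, stop, step = reduced_selection[0]
    return tuple(range(100))[slice(start, stop, step)]


selection = slice(2, 6)
reduced_selection = tuple([selection.__reduce__()[1]])  # ((2, 6, None),) - hashable
print(_cached_selection(reduced_selection))  # (2, 3, 4, 5); repeated calls hit the cache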
14 changes: 8 additions & 6 deletions tests/test_inspector.py
@@ -181,7 +181,7 @@ def test_inspect_all(self):
file_path=self.nwbfile_paths[0],
),
InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
importance=Importance.CRITICAL,
severity=Severity.LOW,
check_function_name="check_timestamps_match_first_dimension",
@@ -248,7 +248,9 @@ def test_inspect_all_parallel(self):
file_path=self.nwbfile_paths[0],
),
InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message=(
"The length of the first dimension of data (4) does not match the length of timestamps (3)."
),
importance=Importance.CRITICAL,
severity=Severity.LOW,
check_function_name="check_timestamps_match_first_dimension",
@@ -312,7 +314,7 @@ def test_inspect_nwb(self):
file_path=self.nwbfile_paths[0],
),
InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
importance=Importance.CRITICAL,
check_function_name="check_timestamps_match_first_dimension",
object_type="TimeSeries",
@@ -343,7 +345,7 @@ def test_inspect_nwb_importance_threshold_as_importance(self):
file_path=self.nwbfile_paths[0],
),
InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
importance=Importance.CRITICAL,
check_function_name="check_timestamps_match_first_dimension",
object_type="TimeSeries",
@@ -372,7 +374,7 @@ def test_inspect_nwb_importance_threshold_as_string(self):
file_path=self.nwbfile_paths[0],
),
InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
importance=Importance.CRITICAL,
check_function_name="check_timestamps_match_first_dimension",
object_type="TimeSeries",
@@ -548,7 +550,7 @@ def test_inspect_nwb_dandi_config(self):
file_path=self.nwbfile_paths[0],
),
InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
importance=Importance.CRITICAL,
check_function_name="check_timestamps_match_first_dimension",
object_type="TimeSeries",
2 changes: 1 addition & 1 deletion tests/true_nwbinspector_default_report.txt
@@ -19,7 +19,7 @@ Found 5 issues over 2 files:
Message: Data may be in the wrong orientation. Time should be in the first dimension, and is usually the longest dimension. Here, another dimension is longer.

0.1 ./testing0.nwb: check_timestamps_match_first_dimension - 'TimeSeries' object at location '/acquisition/test_time_series_3'
Message: The length of the first dimension of data does not match the length of timestamps.
Message: The length of the first dimension of data (4) does not match the length of timestamps (3).

1 BEST_PRACTICE_VIOLATION
==========================
10 changes: 5 additions & 5 deletions tests/unit_tests/test_time_series.py
@@ -82,11 +82,11 @@ def test_check_timestamps():
time_series=pynwb.TimeSeries(
name="test_time_series",
unit="test_units",
data=np.zeros(shape=4),
data=np.empty(shape=4),
timestamps=[1.0, 2.0, 3.0],
)
) == InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
importance=Importance.CRITICAL,
check_function_name="check_timestamps_match_first_dimension",
object_type="TimeSeries",
@@ -99,7 +99,7 @@ def test_check_timestamps_empty_data():
assert check_timestamps_match_first_dimension(
time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=[], timestamps=[1.0, 2.0, 3.0])
) == InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (0) does not match the length of timestamps (3).",
importance=Importance.CRITICAL,
check_function_name="check_timestamps_match_first_dimension",
object_type="TimeSeries",
@@ -110,9 +110,9 @@

def test_check_timestamps_empty_timestamps():
assert check_timestamps_match_first_dimension(
time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=np.zeros(shape=4), timestamps=[])
time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=np.empty(shape=4), timestamps=[])
) == InspectorMessage(
message="The length of the first dimension of data does not match the length of timestamps.",
message="The length of the first dimension of data (4) does not match the length of timestamps (0).",
importance=Importance.CRITICAL,
check_function_name="check_timestamps_match_first_dimension",
object_type="TimeSeries",
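A hypothetical extra unit test, not part of this diff, sketching the lazy branch the commit adds: with h5py.Dataset-backed data (and array timestamps, since that branch reads .shape from both), the check compares shape[0] values directly rather than casting to a numpy array. Import paths mirror the changed files; the in-memory HDF5 driver and the test name are assumptions made to keep the sketch self-contained.

import h5py
import numpy as np
import pynwb

from nwbinspector.checks.time_series import check_timestamps_match_first_dimension


def test_check_timestamps_h5py_dataset_data():
    # driver="core" with backing_store=False keeps the HDF5 file entirely in memory
    with h5py.File("test_lazy.h5", "w", driver="core", backing_store=False) as file:
        dataset = file.create_dataset("data", data=np.empty(shape=4))
        time_series = pynwb.TimeSeries(
            name="test_time_series",
            unit="test_units",
            data=dataset,
            timestamps=np.array([1.0, 2.0, 3.0]),
        )
        result = check_timestamps_match_first_dimension(time_series=time_series)
        assert result.message == (
            "The length of the first dimension of data (4) does not match the length of timestamps (3)."
        )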
