From 8a44291a99ebeced8644ed3cbcb7cf5287853f84 Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD <cody.c.baker.phd@gmail.com>
Date: Sun, 30 Oct 2022 23:11:06 +0000
Subject: [PATCH 1/7] debugging

---
 src/nwbinspector/checks/tables.py      | 2 ++
 src/nwbinspector/checks/time_series.py | 5 ++---
 src/nwbinspector/nwbinspector.py       | 1 +
 src/nwbinspector/utils.py              | 6 +++++-
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/nwbinspector/checks/tables.py b/src/nwbinspector/checks/tables.py
index 613156d10..55426f8da 100644
--- a/src/nwbinspector/checks/tables.py
+++ b/src/nwbinspector/checks/tables.py
@@ -113,6 +113,8 @@ def check_column_binary_capability(table: DynamicTable, nelems: int = 200):
             if np.asarray(column.data[0]).itemsize == 1:
                 continue  # already boolean, int8, or uint8
             try:
+                print(column.name)
+                print(column.data.shape)
                 unique_values = np.unique(_cache_data_selection(data=column.data, selection=slice(nelems)))
             except TypeError:  # some contained objects are unhashable or have no comparison defined
                 continue
diff --git a/src/nwbinspector/checks/time_series.py b/src/nwbinspector/checks/time_series.py
index ce3366f2c..7aa925c9e 100644
--- a/src/nwbinspector/checks/time_series.py
+++ b/src/nwbinspector/checks/time_series.py
@@ -3,7 +3,6 @@
 
 from pynwb import TimeSeries
 
-# from ..tools import all_of_type
 from ..register_checks import register_check, Importance, Severity, InspectorMessage
 from ..utils import is_regular_series, is_ascending_series
 
@@ -56,10 +55,10 @@ def check_timestamps_match_first_dimension(time_series: TimeSeries):
     if (
         time_series.data is not None
         and time_series.timestamps is not None
-        and np.array(time_series.data).shape[:1] != np.array(time_series.timestamps).shape
+        and time_series.data.shape[:1] != time_series.timestamps.shape
     ):
         return InspectorMessage(
-            message="The length of the first dimension of data does not match the length of timestamps.",
+            message=f"The length of the first dimension of data ({time_series.data.shape[:1]}) does not match the length of timestamps ({time_series.timestamps.shape}).",
         )
 
 
diff --git a/src/nwbinspector/nwbinspector.py b/src/nwbinspector/nwbinspector.py
index 4afb45101..5ce42fbf3 100644
--- a/src/nwbinspector/nwbinspector.py
+++ b/src/nwbinspector/nwbinspector.py
@@ -554,6 +554,7 @@ def run_checks(nwbfile: pynwb.NWBFile, checks: list):
     checks : list
     """
     for check_function in checks:
+        print(check_function)
         for nwbfile_object in nwbfile.objects.values():
             if check_function.neurodata_type is None or issubclass(type(nwbfile_object), check_function.neurodata_type):
                 try:
diff --git a/src/nwbinspector/utils.py b/src/nwbinspector/utils.py
index a0542d55e..40ed200cd 100644
--- a/src/nwbinspector/utils.py
+++ b/src/nwbinspector/utils.py
@@ -12,6 +12,7 @@
 import h5py
 import numpy as np
 from numpy.typing import ArrayLike
+from hdmf.backends.hdf5.h5_utils import H5Dataset
 
 
 PathType = TypeVar("PathType", str, Path)  # For types that can be either files or folders
@@ -35,7 +36,10 @@ def _cache_data_selection(data: Union[h5py.Dataset, ArrayLike], selection: Union
     """Extract the selection lazily from the data object for efficient caching (most beneficial during streaming)."""
     if isinstance(data, np.memmap):  # Technically np.memmap should be able to support this type of behavior as well
         return data[selection]  # But they aren't natively hashable either...
-    if not isinstance(data, h5py.Dataset):  # No need to attempt to cache if already an in-memory object
+    print(type(data))
+    print(isinstance(data, h5py.Dataset))
+    print(isinstance(data, H5Dataset))
+    if not (isinstance(data, h5py.Dataset) or isinstance(data, H5Dataset)):  # No need to attempt to cache if already an in-memory object
         return np.array(data)[selection]
 
     # slices also aren't hashable, but their reduced representation is

From 56167d2991da6449b2ab3c7553b2e20b8d9053a5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 31 Oct 2022 17:25:27 +0000
Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/nwbinspector/utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/nwbinspector/utils.py b/src/nwbinspector/utils.py
index 40ed200cd..15ca2309d 100644
--- a/src/nwbinspector/utils.py
+++ b/src/nwbinspector/utils.py
@@ -39,7 +39,9 @@ def _cache_data_selection(data: Union[h5py.Dataset, ArrayLike], selection: Union
     print(type(data))
     print(isinstance(data, h5py.Dataset))
     print(isinstance(data, H5Dataset))
-    if not (isinstance(data, h5py.Dataset) or isinstance(data, H5Dataset)):  # No need to attempt to cache if already an in-memory object
+    if not (
+        isinstance(data, h5py.Dataset) or isinstance(data, H5Dataset)
+    ):  # No need to attempt to cache if already an in-memory object
         return np.array(data)[selection]
 
     # slices also aren't hashable, but their reduced representation is

From 60ff9f6bebc7ddcd442ad9736dd5b60ae9b75939 Mon Sep 17 00:00:00 2001
From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Date: Mon, 31 Oct 2022 13:25:58 -0400
Subject: [PATCH 3/7] Apply suggestions from code review

---
 src/nwbinspector/checks/tables.py | 2 --
 src/nwbinspector/nwbinspector.py  | 1 -
 src/nwbinspector/utils.py         | 3 ---
 3 files changed, 6 deletions(-)

diff --git a/src/nwbinspector/checks/tables.py b/src/nwbinspector/checks/tables.py
index 0347f6a18..c67a11025 100644
--- a/src/nwbinspector/checks/tables.py
+++ b/src/nwbinspector/checks/tables.py
@@ -113,8 +113,6 @@ def check_column_binary_capability(table: DynamicTable, nelems: int = 200):
             if np.asarray(column.data[0]).itemsize == 1:
                 continue  # already boolean, int8, or uint8
             try:
-                print(column.name)
-                print(column.data.shape)
                 unique_values = np.unique(_cache_data_selection(data=column.data, selection=slice(nelems)))
             except TypeError:  # some contained objects are unhashable or have no comparison defined
                 continue
diff --git a/src/nwbinspector/nwbinspector.py b/src/nwbinspector/nwbinspector.py
index 5ce42fbf3..4afb45101 100644
--- a/src/nwbinspector/nwbinspector.py
+++ b/src/nwbinspector/nwbinspector.py
@@ -554,7 +554,6 @@ def run_checks(nwbfile: pynwb.NWBFile, checks: list):
     checks : list
     """
     for check_function in checks:
-        print(check_function)
         for nwbfile_object in nwbfile.objects.values():
             if check_function.neurodata_type is None or issubclass(type(nwbfile_object), check_function.neurodata_type):
                 try:
diff --git a/src/nwbinspector/utils.py b/src/nwbinspector/utils.py
index 15ca2309d..b4abb423c 100644
--- a/src/nwbinspector/utils.py
+++ b/src/nwbinspector/utils.py
@@ -36,9 +36,6 @@ def _cache_data_selection(data: Union[h5py.Dataset, ArrayLike], selection: Union
     """Extract the selection lazily from the data object for efficient caching (most beneficial during streaming)."""
     if isinstance(data, np.memmap):  # Technically np.memmap should be able to support this type of behavior as well
         return data[selection]  # But they aren't natively hashable either...
-    print(type(data))
-    print(isinstance(data, h5py.Dataset))
-    print(isinstance(data, H5Dataset))
     if not (
         isinstance(data, h5py.Dataset) or isinstance(data, H5Dataset)
     ):  # No need to attempt to cache if already an in-memory object

From 959372de87bdc60cd45bea6a08c99694e4b7bb3d Mon Sep 17 00:00:00 2001
From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Date: Mon, 31 Oct 2022 13:26:52 -0400
Subject: [PATCH 4/7] Update test_tables.py

---
 tests/unit_tests/test_tables.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit_tests/test_tables.py b/tests/unit_tests/test_tables.py
index dc711c231..a77a379c3 100644
--- a/tests/unit_tests/test_tables.py
+++ b/tests/unit_tests/test_tables.py
@@ -84,7 +84,7 @@ def test_check_time_interval_time_columns():
 
     assert check_time_interval_time_columns(time_intervals) == InspectorMessage(
         message=(
-            "['start_time', 'stop_time'] are time columns but the values are not in ascending order."
+            "['start_time', 'stop_time'] are time columns but the values are not in ascending order. "
             "All times should be in seconds with respect to the session start time."
         ),
         importance=Importance.BEST_PRACTICE_VIOLATION,

From b400f7123663dd7a6d0005c763d59fc8a79ee54d Mon Sep 17 00:00:00 2001
From: Cody Baker <cbaker9@nd.edu>
Date: Thu, 10 Nov 2022 13:07:02 -0500
Subject: [PATCH 5/7] generalized and debugged

---
 src/nwbinspector/checks/time_series.py     | 25 ++++++++++++++++++----
 src/nwbinspector/utils.py                  | 16 ++++++++------
 tests/test_inspector.py                    | 14 ++++++------
 tests/true_nwbinspector_default_report.txt |  2 +-
 tests/unit_tests/test_time_series.py       | 10 ++++-----
 5 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/src/nwbinspector/checks/time_series.py b/src/nwbinspector/checks/time_series.py
index 7aa925c9e..3488b85b0 100644
--- a/src/nwbinspector/checks/time_series.py
+++ b/src/nwbinspector/checks/time_series.py
@@ -1,7 +1,9 @@
 """Check functions that can apply to any descendant of TimeSeries."""
 import numpy as np
 
+import h5py
 from pynwb import TimeSeries
+from hdmf.backends.hdf5.h5_utils import H5Dataset
 
 from ..register_checks import register_check, Importance, Severity, InspectorMessage
 from ..utils import is_regular_series, is_ascending_series
@@ -52,13 +54,28 @@ def check_timestamps_match_first_dimension(time_series: TimeSeries):
 
     Best Practice: :ref:`best_practice_data_orientation`
     """
+    if time_series.data is None or time_series.timestamps is None:
+        return
+
     if (
-        time_series.data is not None
-        and time_series.timestamps is not None
-        and time_series.data.shape[:1] != time_series.timestamps.shape
+            any(isinstance(time_series.data, lazy_type) for lazy_type in [np.memmap, h5py.Dataset, H5Dataset]) and  time_series.data.shape[0] != time_series.timestamps.shape[0]
     ):
         return InspectorMessage(
-            message=f"The length of the first dimension of data ({time_series.data.shape[:1]}) does not match the length of timestamps ({time_series.timestamps.shape}).",
+            message=(
+                f"The length of the first dimension of data ({time_series.data.shape[0]}) "
+                f"does not match the length of timestamps ({time_series.timestamps.shape[0]})."
+            )
+        )
+
+    # object data is already loaded into memory - cast as numpy array to infer shaping
+    data_shape = np.array(time_series.data).shape[0]
+    timestamps_shape = np.array(time_series.timestamps).shape[0]
+    if data_shape != timestamps_shape:
+        return InspectorMessage(
+            message=(
+                f"The length of the first dimension of data ({data_shape}) "
+                f"does not match the length of timestamps ({timestamps_shape})."
+            )
         )
 
 
diff --git a/src/nwbinspector/utils.py b/src/nwbinspector/utils.py
index b4abb423c..4e2693171 100644
--- a/src/nwbinspector/utils.py
+++ b/src/nwbinspector/utils.py
@@ -34,17 +34,19 @@ def _cache_data_retrieval_command(
 
 def _cache_data_selection(data: Union[h5py.Dataset, ArrayLike], selection: Union[slice, Tuple[slice]]) -> np.ndarray:
     """Extract the selection lazily from the data object for efficient caching (most beneficial during streaming)."""
-    if isinstance(data, np.memmap):  # Technically np.memmap should be able to support this type of behavior as well
-        return data[selection]  # But they aren't natively hashable either...
+    if isinstance(data, np.memmap):  # np.memmap objects are not hashable - simply return the selection lazily
+        return data[selection]
     if not (
         isinstance(data, h5py.Dataset) or isinstance(data, H5Dataset)
-    ):  # No need to attempt to cache if already an in-memory object
+    ):  # No need to attempt to cache if data is already in-memory
+        # Cast as numpy array for efficient fancy indexing
+        # Note that this technically copies the entire data, so could use more than 2x RAM for that object
         return np.array(data)[selection]
 
-    # slices also aren't hashable, but their reduced representation is
-    if isinstance(selection, slice):  # If a single slice
-        reduced_selection = tuple([selection.__reduce__()[1]])  # if a single slice
-    else:
+    # Slices aren't hashable, but their reduced representation is
+    if isinstance(selection, slice):  # A single slice
+        reduced_selection = tuple([selection.__reduce__()[1]])
+    else:  # Iterable of slices
         reduced_selection = tuple([selection_slice.__reduce__()[1] for selection_slice in selection])
     return _cache_data_retrieval_command(data=data, reduced_selection=reduced_selection)
 
diff --git a/tests/test_inspector.py b/tests/test_inspector.py
index 90e86594a..c0dad5950 100644
--- a/tests/test_inspector.py
+++ b/tests/test_inspector.py
@@ -181,7 +181,7 @@ def test_inspect_all(self):
                 file_path=self.nwbfile_paths[0],
             ),
             InspectorMessage(
-                message="The length of the first dimension of data does not match the length of timestamps.",
+                message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
                 importance=Importance.CRITICAL,
                 severity=Severity.LOW,
                 check_function_name="check_timestamps_match_first_dimension",
@@ -248,7 +248,9 @@ def test_inspect_all_parallel(self):
                     file_path=self.nwbfile_paths[0],
                 ),
                 InspectorMessage(
-                    message="The length of the first dimension of data does not match the length of timestamps.",
+                    message=(
+                        "The length of the first dimension of data (4) does not match the length of timestamps (3)."
+                    ),
                     importance=Importance.CRITICAL,
                     severity=Severity.LOW,
                     check_function_name="check_timestamps_match_first_dimension",
@@ -312,7 +314,7 @@ def test_inspect_nwb(self):
                 file_path=self.nwbfile_paths[0],
             ),
             InspectorMessage(
-                message="The length of the first dimension of data does not match the length of timestamps.",
+                message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
                 importance=Importance.CRITICAL,
                 check_function_name="check_timestamps_match_first_dimension",
                 object_type="TimeSeries",
@@ -343,7 +345,7 @@ def test_inspect_nwb_importance_threshold_as_importance(self):
                 file_path=self.nwbfile_paths[0],
             ),
             InspectorMessage(
-                message="The length of the first dimension of data does not match the length of timestamps.",
+                message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
                 importance=Importance.CRITICAL,
                 check_function_name="check_timestamps_match_first_dimension",
                 object_type="TimeSeries",
@@ -372,7 +374,7 @@ def test_inspect_nwb_importance_threshold_as_string(self):
                 file_path=self.nwbfile_paths[0],
             ),
             InspectorMessage(
-                message="The length of the first dimension of data does not match the length of timestamps.",
+                message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
                 importance=Importance.CRITICAL,
                 check_function_name="check_timestamps_match_first_dimension",
                 object_type="TimeSeries",
@@ -548,7 +550,7 @@ def test_inspect_nwb_dandi_config(self):
                 file_path=self.nwbfile_paths[0],
             ),
             InspectorMessage(
-                message="The length of the first dimension of data does not match the length of timestamps.",
+                message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
                 importance=Importance.CRITICAL,
                 check_function_name="check_timestamps_match_first_dimension",
                 object_type="TimeSeries",
diff --git a/tests/true_nwbinspector_default_report.txt b/tests/true_nwbinspector_default_report.txt
index 11458303a..846255249 100644
--- a/tests/true_nwbinspector_default_report.txt
+++ b/tests/true_nwbinspector_default_report.txt
@@ -19,7 +19,7 @@ Found 5 issues over 2 files:
        Message: Data may be in the wrong orientation. Time should be in the first dimension, and is usually the longest dimension. Here, another dimension is longer.
 
 0.1  ./testing0.nwb: check_timestamps_match_first_dimension - 'TimeSeries' object at location '/acquisition/test_time_series_3'
-       Message: The length of the first dimension of data does not match the length of timestamps.
+       Message: The length of the first dimension of data (4) does not match the length of timestamps (3).
 
 1  BEST_PRACTICE_VIOLATION
 ==========================
diff --git a/tests/unit_tests/test_time_series.py b/tests/unit_tests/test_time_series.py
index b1dc81930..ea5dacd39 100644
--- a/tests/unit_tests/test_time_series.py
+++ b/tests/unit_tests/test_time_series.py
@@ -82,11 +82,11 @@ def test_check_timestamps():
         time_series=pynwb.TimeSeries(
             name="test_time_series",
             unit="test_units",
-            data=np.zeros(shape=4),
+            data=np.empty(shape=4),
             timestamps=[1.0, 2.0, 3.0],
         )
     ) == InspectorMessage(
-        message="The length of the first dimension of data does not match the length of timestamps.",
+        message="The length of the first dimension of data (4) does not match the length of timestamps (3).",
         importance=Importance.CRITICAL,
         check_function_name="check_timestamps_match_first_dimension",
         object_type="TimeSeries",
@@ -99,7 +99,7 @@ def test_check_timestamps_empty_data():
     assert check_timestamps_match_first_dimension(
         time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=[], timestamps=[1.0, 2.0, 3.0])
     ) == InspectorMessage(
-        message="The length of the first dimension of data does not match the length of timestamps.",
+        message="The length of the first dimension of data (0) does not match the length of timestamps (3).",
         importance=Importance.CRITICAL,
         check_function_name="check_timestamps_match_first_dimension",
         object_type="TimeSeries",
@@ -110,9 +110,9 @@ def test_check_timestamps_empty_data():
 
 def test_check_timestamps_empty_timestamps():
     assert check_timestamps_match_first_dimension(
-        time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=np.zeros(shape=4), timestamps=[])
+        time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=np.empty(shape=4), timestamps=[])
     ) == InspectorMessage(
-        message="The length of the first dimension of data does not match the length of timestamps.",
+        message="The length of the first dimension of data (4) does not match the length of timestamps (0).",
         importance=Importance.CRITICAL,
         check_function_name="check_timestamps_match_first_dimension",
         object_type="TimeSeries",

From d94d373dc41ae513de540b317fda27e61607743e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 10 Nov 2022 18:07:15 +0000
Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/nwbinspector/checks/time_series.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/nwbinspector/checks/time_series.py b/src/nwbinspector/checks/time_series.py
index 3488b85b0..90af16617 100644
--- a/src/nwbinspector/checks/time_series.py
+++ b/src/nwbinspector/checks/time_series.py
@@ -58,7 +58,8 @@ def check_timestamps_match_first_dimension(time_series: TimeSeries):
         return
 
     if (
-            any(isinstance(time_series.data, lazy_type) for lazy_type in [np.memmap, h5py.Dataset, H5Dataset]) and  time_series.data.shape[0] != time_series.timestamps.shape[0]
+        any(isinstance(time_series.data, lazy_type) for lazy_type in [np.memmap, h5py.Dataset, H5Dataset])
+        and time_series.data.shape[0] != time_series.timestamps.shape[0]
     ):
         return InspectorMessage(
             message=(

From 13c421198d935dc6cd8b5f5db86a9957bff424c3 Mon Sep 17 00:00:00 2001
From: Cody Baker <cbaker9@nd.edu>
Date: Thu, 10 Nov 2022 13:30:12 -0500
Subject: [PATCH 7/7] fixed

---
 src/nwbinspector/checks/time_series.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/nwbinspector/checks/time_series.py b/src/nwbinspector/checks/time_series.py
index 90af16617..eb255f631 100644
--- a/src/nwbinspector/checks/time_series.py
+++ b/src/nwbinspector/checks/time_series.py
@@ -57,10 +57,8 @@ def check_timestamps_match_first_dimension(time_series: TimeSeries):
     if time_series.data is None or time_series.timestamps is None:
         return
 
-    if (
-        any(isinstance(time_series.data, lazy_type) for lazy_type in [np.memmap, h5py.Dataset, H5Dataset])
-        and time_series.data.shape[0] != time_series.timestamps.shape[0]
-    ):
+    is_lazy_type = any(isinstance(time_series.data, lazy_type) for lazy_type in [np.memmap, h5py.Dataset, H5Dataset])
+    if is_lazy_type and time_series.data.shape[0] != time_series.timestamps.shape[0]:
         return InspectorMessage(
             message=(
                 f"The length of the first dimension of data ({time_series.data.shape[0]}) "
@@ -69,15 +67,16 @@ def check_timestamps_match_first_dimension(time_series: TimeSeries):
         )
 
     # object data is already loaded into memory - cast as numpy array to infer shaping
-    data_shape = np.array(time_series.data).shape[0]
-    timestamps_shape = np.array(time_series.timestamps).shape[0]
-    if data_shape != timestamps_shape:
-        return InspectorMessage(
-            message=(
-                f"The length of the first dimension of data ({data_shape}) "
-                f"does not match the length of timestamps ({timestamps_shape})."
+    if not is_lazy_type:
+        data_shape = np.array(time_series.data).shape[0]
+        timestamps_shape = np.array(time_series.timestamps).shape[0]
+        if data_shape != timestamps_shape:
+            return InspectorMessage(
+                message=(
+                    f"The length of the first dimension of data ({data_shape}) "
+                    f"does not match the length of timestamps ({timestamps_shape})."
+                )
             )
-        )
 
 
 @register_check(importance=Importance.BEST_PRACTICE_VIOLATION, neurodata_type=TimeSeries)