From 8a44291a99ebeced8644ed3cbcb7cf5287853f84 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Sun, 30 Oct 2022 23:11:06 +0000 Subject: [PATCH 1/7] debugging --- src/nwbinspector/checks/tables.py | 2 ++ src/nwbinspector/checks/time_series.py | 5 ++--- src/nwbinspector/nwbinspector.py | 1 + src/nwbinspector/utils.py | 6 +++++- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/nwbinspector/checks/tables.py b/src/nwbinspector/checks/tables.py index 613156d10..55426f8da 100644 --- a/src/nwbinspector/checks/tables.py +++ b/src/nwbinspector/checks/tables.py @@ -113,6 +113,8 @@ def check_column_binary_capability(table: DynamicTable, nelems: int = 200): if np.asarray(column.data[0]).itemsize == 1: continue # already boolean, int8, or uint8 try: + print(column.name) + print(column.data.shape) unique_values = np.unique(_cache_data_selection(data=column.data, selection=slice(nelems))) except TypeError: # some contained objects are unhashable or have no comparison defined continue diff --git a/src/nwbinspector/checks/time_series.py b/src/nwbinspector/checks/time_series.py index ce3366f2c..7aa925c9e 100644 --- a/src/nwbinspector/checks/time_series.py +++ b/src/nwbinspector/checks/time_series.py @@ -3,7 +3,6 @@ from pynwb import TimeSeries -# from ..tools import all_of_type from ..register_checks import register_check, Importance, Severity, InspectorMessage from ..utils import is_regular_series, is_ascending_series @@ -56,10 +55,10 @@ def check_timestamps_match_first_dimension(time_series: TimeSeries): if ( time_series.data is not None and time_series.timestamps is not None - and np.array(time_series.data).shape[:1] != np.array(time_series.timestamps).shape + and time_series.data.shape[:1] != time_series.timestamps.shape ): return InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message=f"The length of the first dimension of data ({time_series.data.shape[:1]}) does not match the length of timestamps ({time_series.timestamps.shape}).", ) diff --git a/src/nwbinspector/nwbinspector.py b/src/nwbinspector/nwbinspector.py index 4afb45101..5ce42fbf3 100644 --- a/src/nwbinspector/nwbinspector.py +++ b/src/nwbinspector/nwbinspector.py @@ -554,6 +554,7 @@ def run_checks(nwbfile: pynwb.NWBFile, checks: list): checks : list """ for check_function in checks: + print(check_function) for nwbfile_object in nwbfile.objects.values(): if check_function.neurodata_type is None or issubclass(type(nwbfile_object), check_function.neurodata_type): try: diff --git a/src/nwbinspector/utils.py b/src/nwbinspector/utils.py index a0542d55e..40ed200cd 100644 --- a/src/nwbinspector/utils.py +++ b/src/nwbinspector/utils.py @@ -12,6 +12,7 @@ import h5py import numpy as np from numpy.typing import ArrayLike +from hdmf.backends.hdf5.h5_utils import H5Dataset PathType = TypeVar("PathType", str, Path) # For types that can be either files or folders @@ -35,7 +36,10 @@ def _cache_data_selection(data: Union[h5py.Dataset, ArrayLike], selection: Union """Extract the selection lazily from the data object for efficient caching (most beneficial during streaming).""" if isinstance(data, np.memmap): # Technically np.memmap should be able to support this type of behavior as well return data[selection] # But they aren't natively hashable either... - if not isinstance(data, h5py.Dataset): # No need to attempt to cache if already an in-memory object + print(type(data)) + print(isinstance(data, h5py.Dataset)) + print(isinstance(data, H5Dataset)) + if not (isinstance(data, h5py.Dataset) or isinstance(data, H5Dataset)): # No need to attempt to cache if already an in-memory object return np.array(data)[selection] # slices also aren't hashable, but their reduced representation is From 56167d2991da6449b2ab3c7553b2e20b8d9053a5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 31 Oct 2022 17:25:27 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/nwbinspector/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/nwbinspector/utils.py b/src/nwbinspector/utils.py index 40ed200cd..15ca2309d 100644 --- a/src/nwbinspector/utils.py +++ b/src/nwbinspector/utils.py @@ -39,7 +39,9 @@ def _cache_data_selection(data: Union[h5py.Dataset, ArrayLike], selection: Union print(type(data)) print(isinstance(data, h5py.Dataset)) print(isinstance(data, H5Dataset)) - if not (isinstance(data, h5py.Dataset) or isinstance(data, H5Dataset)): # No need to attempt to cache if already an in-memory object + if not ( + isinstance(data, h5py.Dataset) or isinstance(data, H5Dataset) + ): # No need to attempt to cache if already an in-memory object return np.array(data)[selection] # slices also aren't hashable, but their reduced representation is From 60ff9f6bebc7ddcd442ad9736dd5b60ae9b75939 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 31 Oct 2022 13:25:58 -0400 Subject: [PATCH 3/7] Apply suggestions from code review --- src/nwbinspector/checks/tables.py | 2 -- src/nwbinspector/nwbinspector.py | 1 - src/nwbinspector/utils.py | 3 --- 3 files changed, 6 deletions(-) diff --git a/src/nwbinspector/checks/tables.py b/src/nwbinspector/checks/tables.py index 0347f6a18..c67a11025 100644 --- a/src/nwbinspector/checks/tables.py +++ b/src/nwbinspector/checks/tables.py @@ -113,8 +113,6 @@ def check_column_binary_capability(table: DynamicTable, nelems: int = 200): if np.asarray(column.data[0]).itemsize == 1: continue # already boolean, int8, or uint8 try: - print(column.name) - print(column.data.shape) unique_values = np.unique(_cache_data_selection(data=column.data, selection=slice(nelems))) except TypeError: # some contained objects are unhashable or have no comparison defined continue diff --git a/src/nwbinspector/nwbinspector.py b/src/nwbinspector/nwbinspector.py index 5ce42fbf3..4afb45101 100644 --- a/src/nwbinspector/nwbinspector.py +++ b/src/nwbinspector/nwbinspector.py @@ -554,7 +554,6 @@ def run_checks(nwbfile: pynwb.NWBFile, checks: list): checks : list """ for check_function in checks: - print(check_function) for nwbfile_object in nwbfile.objects.values(): if check_function.neurodata_type is None or issubclass(type(nwbfile_object), check_function.neurodata_type): try: diff --git a/src/nwbinspector/utils.py b/src/nwbinspector/utils.py index 15ca2309d..b4abb423c 100644 --- a/src/nwbinspector/utils.py +++ b/src/nwbinspector/utils.py @@ -36,9 +36,6 @@ def _cache_data_selection(data: Union[h5py.Dataset, ArrayLike], selection: Union """Extract the selection lazily from the data object for efficient caching (most beneficial during streaming).""" if isinstance(data, np.memmap): # Technically np.memmap should be able to support this type of behavior as well return data[selection] # But they aren't natively hashable either... - print(type(data)) - print(isinstance(data, h5py.Dataset)) - print(isinstance(data, H5Dataset)) if not ( isinstance(data, h5py.Dataset) or isinstance(data, H5Dataset) ): # No need to attempt to cache if already an in-memory object From 959372de87bdc60cd45bea6a08c99694e4b7bb3d Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 31 Oct 2022 13:26:52 -0400 Subject: [PATCH 4/7] Update test_tables.py --- tests/unit_tests/test_tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit_tests/test_tables.py b/tests/unit_tests/test_tables.py index dc711c231..a77a379c3 100644 --- a/tests/unit_tests/test_tables.py +++ b/tests/unit_tests/test_tables.py @@ -84,7 +84,7 @@ def test_check_time_interval_time_columns(): assert check_time_interval_time_columns(time_intervals) == InspectorMessage( message=( - "['start_time', 'stop_time'] are time columns but the values are not in ascending order." + "['start_time', 'stop_time'] are time columns but the values are not in ascending order. " "All times should be in seconds with respect to the session start time." ), importance=Importance.BEST_PRACTICE_VIOLATION, From b400f7123663dd7a6d0005c763d59fc8a79ee54d Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Thu, 10 Nov 2022 13:07:02 -0500 Subject: [PATCH 5/7] generalized and debugged --- src/nwbinspector/checks/time_series.py | 25 ++++++++++++++++++---- src/nwbinspector/utils.py | 16 ++++++++------ tests/test_inspector.py | 14 ++++++------ tests/true_nwbinspector_default_report.txt | 2 +- tests/unit_tests/test_time_series.py | 10 ++++----- 5 files changed, 44 insertions(+), 23 deletions(-) diff --git a/src/nwbinspector/checks/time_series.py b/src/nwbinspector/checks/time_series.py index 7aa925c9e..3488b85b0 100644 --- a/src/nwbinspector/checks/time_series.py +++ b/src/nwbinspector/checks/time_series.py @@ -1,7 +1,9 @@ """Check functions that can apply to any descendant of TimeSeries.""" import numpy as np +import h5py from pynwb import TimeSeries +from hdmf.backends.hdf5.h5_utils import H5Dataset from ..register_checks import register_check, Importance, Severity, InspectorMessage from ..utils import is_regular_series, is_ascending_series @@ -52,13 +54,28 @@ def check_timestamps_match_first_dimension(time_series: TimeSeries): Best Practice: :ref:`best_practice_data_orientation` """ + if time_series.data is None or time_series.timestamps is None: + return + if ( - time_series.data is not None - and time_series.timestamps is not None - and time_series.data.shape[:1] != time_series.timestamps.shape + any(isinstance(time_series.data, lazy_type) for lazy_type in [np.memmap, h5py.Dataset, H5Dataset]) and time_series.data.shape[0] != time_series.timestamps.shape[0] ): return InspectorMessage( - message=f"The length of the first dimension of data ({time_series.data.shape[:1]}) does not match the length of timestamps ({time_series.timestamps.shape}).", + message=( + f"The length of the first dimension of data ({time_series.data.shape[0]}) " + f"does not match the length of timestamps ({time_series.timestamps.shape[0]})." + ) + ) + + # object data is already loaded into memory - cast as numpy array to infer shaping + data_shape = np.array(time_series.data).shape[0] + timestamps_shape = np.array(time_series.timestamps).shape[0] + if data_shape != timestamps_shape: + return InspectorMessage( + message=( + f"The length of the first dimension of data ({data_shape}) " + f"does not match the length of timestamps ({timestamps_shape})." + ) ) diff --git a/src/nwbinspector/utils.py b/src/nwbinspector/utils.py index b4abb423c..4e2693171 100644 --- a/src/nwbinspector/utils.py +++ b/src/nwbinspector/utils.py @@ -34,17 +34,19 @@ def _cache_data_retrieval_command( def _cache_data_selection(data: Union[h5py.Dataset, ArrayLike], selection: Union[slice, Tuple[slice]]) -> np.ndarray: """Extract the selection lazily from the data object for efficient caching (most beneficial during streaming).""" - if isinstance(data, np.memmap): # Technically np.memmap should be able to support this type of behavior as well - return data[selection] # But they aren't natively hashable either... + if isinstance(data, np.memmap): # np.memmap objects are not hashable - simply return the selection lazily + return data[selection] if not ( isinstance(data, h5py.Dataset) or isinstance(data, H5Dataset) - ): # No need to attempt to cache if already an in-memory object + ): # No need to attempt to cache if data is already in-memory + # Cast as numpy array for efficient fancy indexing + # Note that this technically copies the entire data, so could use more than 2x RAM for that object return np.array(data)[selection] - # slices also aren't hashable, but their reduced representation is - if isinstance(selection, slice): # If a single slice - reduced_selection = tuple([selection.__reduce__()[1]]) # if a single slice - else: + # Slices aren't hashable, but their reduced representation is + if isinstance(selection, slice): # A single slice + reduced_selection = tuple([selection.__reduce__()[1]]) + else: # Iterable of slices reduced_selection = tuple([selection_slice.__reduce__()[1] for selection_slice in selection]) return _cache_data_retrieval_command(data=data, reduced_selection=reduced_selection) diff --git a/tests/test_inspector.py b/tests/test_inspector.py index 90e86594a..c0dad5950 100644 --- a/tests/test_inspector.py +++ b/tests/test_inspector.py @@ -181,7 +181,7 @@ def test_inspect_all(self): file_path=self.nwbfile_paths[0], ), InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (4) does not match the length of timestamps (3).", importance=Importance.CRITICAL, severity=Severity.LOW, check_function_name="check_timestamps_match_first_dimension", @@ -248,7 +248,9 @@ def test_inspect_all_parallel(self): file_path=self.nwbfile_paths[0], ), InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message=( + "The length of the first dimension of data (4) does not match the length of timestamps (3)." + ), importance=Importance.CRITICAL, severity=Severity.LOW, check_function_name="check_timestamps_match_first_dimension", @@ -312,7 +314,7 @@ def test_inspect_nwb(self): file_path=self.nwbfile_paths[0], ), InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (4) does not match the length of timestamps (3).", importance=Importance.CRITICAL, check_function_name="check_timestamps_match_first_dimension", object_type="TimeSeries", @@ -343,7 +345,7 @@ def test_inspect_nwb_importance_threshold_as_importance(self): file_path=self.nwbfile_paths[0], ), InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (4) does not match the length of timestamps (3).", importance=Importance.CRITICAL, check_function_name="check_timestamps_match_first_dimension", object_type="TimeSeries", @@ -372,7 +374,7 @@ def test_inspect_nwb_importance_threshold_as_string(self): file_path=self.nwbfile_paths[0], ), InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (4) does not match the length of timestamps (3).", importance=Importance.CRITICAL, check_function_name="check_timestamps_match_first_dimension", object_type="TimeSeries", @@ -548,7 +550,7 @@ def test_inspect_nwb_dandi_config(self): file_path=self.nwbfile_paths[0], ), InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (4) does not match the length of timestamps (3).", importance=Importance.CRITICAL, check_function_name="check_timestamps_match_first_dimension", object_type="TimeSeries", diff --git a/tests/true_nwbinspector_default_report.txt b/tests/true_nwbinspector_default_report.txt index 11458303a..846255249 100644 --- a/tests/true_nwbinspector_default_report.txt +++ b/tests/true_nwbinspector_default_report.txt @@ -19,7 +19,7 @@ Found 5 issues over 2 files: Message: Data may be in the wrong orientation. Time should be in the first dimension, and is usually the longest dimension. Here, another dimension is longer. 0.1 ./testing0.nwb: check_timestamps_match_first_dimension - 'TimeSeries' object at location '/acquisition/test_time_series_3' - Message: The length of the first dimension of data does not match the length of timestamps. + Message: The length of the first dimension of data (4) does not match the length of timestamps (3). 1 BEST_PRACTICE_VIOLATION ========================== diff --git a/tests/unit_tests/test_time_series.py b/tests/unit_tests/test_time_series.py index b1dc81930..ea5dacd39 100644 --- a/tests/unit_tests/test_time_series.py +++ b/tests/unit_tests/test_time_series.py @@ -82,11 +82,11 @@ def test_check_timestamps(): time_series=pynwb.TimeSeries( name="test_time_series", unit="test_units", - data=np.zeros(shape=4), + data=np.empty(shape=4), timestamps=[1.0, 2.0, 3.0], ) ) == InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (4) does not match the length of timestamps (3).", importance=Importance.CRITICAL, check_function_name="check_timestamps_match_first_dimension", object_type="TimeSeries", @@ -99,7 +99,7 @@ def test_check_timestamps_empty_data(): assert check_timestamps_match_first_dimension( time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=[], timestamps=[1.0, 2.0, 3.0]) ) == InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (0) does not match the length of timestamps (3).", importance=Importance.CRITICAL, check_function_name="check_timestamps_match_first_dimension", object_type="TimeSeries", @@ -110,9 +110,9 @@ def test_check_timestamps_empty_data(): def test_check_timestamps_empty_timestamps(): assert check_timestamps_match_first_dimension( - time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=np.zeros(shape=4), timestamps=[]) + time_series=pynwb.TimeSeries(name="test_time_series", unit="test_units", data=np.empty(shape=4), timestamps=[]) ) == InspectorMessage( - message="The length of the first dimension of data does not match the length of timestamps.", + message="The length of the first dimension of data (4) does not match the length of timestamps (0).", importance=Importance.CRITICAL, check_function_name="check_timestamps_match_first_dimension", object_type="TimeSeries", From d94d373dc41ae513de540b317fda27e61607743e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Nov 2022 18:07:15 +0000 Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/nwbinspector/checks/time_series.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nwbinspector/checks/time_series.py b/src/nwbinspector/checks/time_series.py index 3488b85b0..90af16617 100644 --- a/src/nwbinspector/checks/time_series.py +++ b/src/nwbinspector/checks/time_series.py @@ -58,7 +58,8 @@ def check_timestamps_match_first_dimension(time_series: TimeSeries): return if ( - any(isinstance(time_series.data, lazy_type) for lazy_type in [np.memmap, h5py.Dataset, H5Dataset]) and time_series.data.shape[0] != time_series.timestamps.shape[0] + any(isinstance(time_series.data, lazy_type) for lazy_type in [np.memmap, h5py.Dataset, H5Dataset]) + and time_series.data.shape[0] != time_series.timestamps.shape[0] ): return InspectorMessage( message=( From 13c421198d935dc6cd8b5f5db86a9957bff424c3 Mon Sep 17 00:00:00 2001 From: Cody Baker Date: Thu, 10 Nov 2022 13:30:12 -0500 Subject: [PATCH 7/7] fixed --- src/nwbinspector/checks/time_series.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/nwbinspector/checks/time_series.py b/src/nwbinspector/checks/time_series.py index 90af16617..eb255f631 100644 --- a/src/nwbinspector/checks/time_series.py +++ b/src/nwbinspector/checks/time_series.py @@ -57,10 +57,8 @@ def check_timestamps_match_first_dimension(time_series: TimeSeries): if time_series.data is None or time_series.timestamps is None: return - if ( - any(isinstance(time_series.data, lazy_type) for lazy_type in [np.memmap, h5py.Dataset, H5Dataset]) - and time_series.data.shape[0] != time_series.timestamps.shape[0] - ): + is_lazy_type = any(isinstance(time_series.data, lazy_type) for lazy_type in [np.memmap, h5py.Dataset, H5Dataset]) + if is_lazy_type and time_series.data.shape[0] != time_series.timestamps.shape[0]: return InspectorMessage( message=( f"The length of the first dimension of data ({time_series.data.shape[0]}) " @@ -69,15 +67,16 @@ def check_timestamps_match_first_dimension(time_series: TimeSeries): ) # object data is already loaded into memory - cast as numpy array to infer shaping - data_shape = np.array(time_series.data).shape[0] - timestamps_shape = np.array(time_series.timestamps).shape[0] - if data_shape != timestamps_shape: - return InspectorMessage( - message=( - f"The length of the first dimension of data ({data_shape}) " - f"does not match the length of timestamps ({timestamps_shape})." + if not is_lazy_type: + data_shape = np.array(time_series.data).shape[0] + timestamps_shape = np.array(time_series.timestamps).shape[0] + if data_shape != timestamps_shape: + return InspectorMessage( + message=( + f"The length of the first dimension of data ({data_shape}) " + f"does not match the length of timestamps ({timestamps_shape})." + ) ) - ) @register_check(importance=Importance.BEST_PRACTICE_VIOLATION, neurodata_type=TimeSeries)