diff --git a/CHANGELOG.md b/CHANGELOG.md index 436abbf86..a47f62857 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Upcoming +### New Checks +* Add check for unique ids for DynamicTables. [PR #316](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/316) + + ### Fixes * Fix `check_subject_proper_age_range` to parse years. [PR #314](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/314) * Write a custom `get_data_shape` method that does not return `maxshape`, which fixes errors in parsing shape. [PR #315](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/315) diff --git a/docs/best_practices/tables.rst b/docs/best_practices/tables.rst index 23b22045a..1937879ec 100644 --- a/docs/best_practices/tables.rst +++ b/docs/best_practices/tables.rst @@ -119,3 +119,14 @@ Timing Columns Times are always stored in seconds in NWB. In :ref:`nwb-schema:sec-TimeIntervals` tables such as the ``TrialsTable`` and :ref:`EpochsTable `, ``start_time`` and ``stop_time`` should both be in seconds with respect to the ``timestamps_reference_time`` of the :ref:`nwb-schema:sec-NWBFile` (which by default is the ``session_start_time``, see :ref:`best_practice_global_time_reference` for more details). Additional time columns in :ref:`nwb-schema:sec-TimeIntervals` tables, such as the ``TrialsTable`` should have ``_time`` as a suffix to the name. *E.g.*, if you add more times in ``TrialsTable``, such as a subject response time, name it ``response_time`` and store the time values in seconds from the ``timestamps_reference_time`` of the :ref:`nwb-schema:sec-NWBFile`, just like ``start_time`` and ``stop_time``. This convention is used by downstream processing tools. For instance, NWBWidgets uses these times to create peri-stimulus time histograms relating spiking activity to trial events. See :ref:`best_practice_global_time_reference` for more details. + +.. _best_practice_unique_dynamic_table_ids: + +Unique ids +~~~~~~~~~~~ + +The values of the ``id`` attribute of any :ref:`hdmf-schema:sec-dynamictable` should be unique. This includes +descendants of :ref:`hdmf-schema:sec-dynamictable` such as :ref:`nwb-schema:sec-TimeIntervals` and +``ElectrodesTable``. In PyNWB, rows of :ref:`hdmf-schema:sec-dynamictable` increment as you add rows, so this +variable is unique by default. If you would like to make values of ``id`` non-unique, a better +solution would be to store these values as a custom column and use the default ``id`` values. diff --git a/src/nwbinspector/checks/ecephys.py b/src/nwbinspector/checks/ecephys.py index 5044bd8d1..45c67113e 100644 --- a/src/nwbinspector/checks/ecephys.py +++ b/src/nwbinspector/checks/ecephys.py @@ -3,6 +3,7 @@ from pynwb.misc import Units from pynwb.ecephys import ElectricalSeries +from pynwb.file import ElectrodeTable from ..utils import get_data_shape diff --git a/src/nwbinspector/checks/tables.py b/src/nwbinspector/checks/tables.py index 5309247ba..c8a69f166 100644 --- a/src/nwbinspector/checks/tables.py +++ b/src/nwbinspector/checks/tables.py @@ -17,8 +17,11 @@ ) +NELEMS = 200 + + @register_check(importance=Importance.CRITICAL, neurodata_type=DynamicTableRegion) -def check_dynamic_table_region_data_validity(dynamic_table_region: DynamicTableRegion, nelems=200): +def check_dynamic_table_region_data_validity(dynamic_table_region: DynamicTableRegion, nelems: Optional[int] = NELEMS): """Check if a DynamicTableRegion is valid.""" if np.any(np.asarray(dynamic_table_region.data[:nelems]) > len(dynamic_table_region.table)): return InspectorMessage( @@ -41,14 +44,14 @@ def check_empty_table(table: DynamicTable): @register_check(importance=Importance.BEST_PRACTICE_VIOLATION, neurodata_type=TimeIntervals) -def check_time_interval_time_columns(time_intervals: TimeIntervals, nelems: int = 200): +def check_time_interval_time_columns(time_intervals: TimeIntervals, nelems: Optional[int] = NELEMS): """ Check that time columns are in ascending order. Parameters ---------- time_intervals: TimeIntervals - nelems: int + nelems: int, optional Only check the first {nelems} elements. This is useful in case there columns are very long so you don't need to load the entire array into memory. Use None to load the entire arrays. @@ -68,7 +71,7 @@ def check_time_interval_time_columns(time_intervals: TimeIntervals, nelems: int @register_check(importance=Importance.BEST_PRACTICE_VIOLATION, neurodata_type=TimeIntervals) -def check_time_intervals_stop_after_start(time_intervals: TimeIntervals, nelems: int = 200): +def check_time_intervals_stop_after_start(time_intervals: TimeIntervals, nelems: Optional[int] = NELEMS): """ Check that all stop times on a TimeInterval object occur after their corresponding start times. @@ -77,7 +80,7 @@ def check_time_intervals_stop_after_start(time_intervals: TimeIntervals, nelems: Parameters ---------- time_intervals: TimeIntervals - nelems: int + nelems: int, optional Only check the first {nelems} elements. This is useful in case there columns are very long so you don't need to load the entire array into memory. Use None to load the entire arrays. @@ -96,14 +99,14 @@ def check_time_intervals_stop_after_start(time_intervals: TimeIntervals, nelems: @register_check(importance=Importance.BEST_PRACTICE_SUGGESTION, neurodata_type=DynamicTable) -def check_column_binary_capability(table: DynamicTable, nelems: int = 200): +def check_column_binary_capability(table: DynamicTable, nelems: Optional[int] = NELEMS): """ Check each column of a table to see if the data could be set as a boolean dtype. Parameters ---------- time_intervals: DynamicTable - nelems: int + nelems: int, optional Only check the first {nelems} elements. This is useful in case there columns are very long so you don't need to load the entire array into memory. Use None to load the entire arrays. @@ -176,7 +179,7 @@ def check_single_row( @register_check(importance=Importance.BEST_PRACTICE_VIOLATION, neurodata_type=DynamicTable) -def check_table_values_for_dict(table: DynamicTable, nelems: int = 200): +def check_table_values_for_dict(table: DynamicTable, nelems: Optional[int] = NELEMS): """Check if any values in a row or column of a table contain a string casting of a Python dictionary.""" for column in table.columns: if not hasattr(column, "data") or isinstance(column, VectorIndex) or not isinstance(column.data[0], str): @@ -193,7 +196,7 @@ def check_table_values_for_dict(table: DynamicTable, nelems: int = 200): @register_check(importance=Importance.BEST_PRACTICE_SUGGESTION, neurodata_type=DynamicTable) -def check_col_not_nan(table: DynamicTable, nelems: Optional[int] = 200): +def check_col_not_nan(table: DynamicTable, nelems: Optional[int] = NELEMS): """Check if all of the values in a single column of a table are NaN.""" for column in table.columns: if ( @@ -210,3 +213,25 @@ def check_col_not_nan(table: DynamicTable, nelems: Optional[int] = 200): message = message.replace("might have", "has") if nelems is None or slice_by == 1 else message if all(np.isnan(column[slice(0, None, slice_by)]).flatten()): yield InspectorMessage(message=message) + + +@register_check(importance=Importance.CRITICAL, neurodata_type=DynamicTable) +def check_ids_unique(table: DynamicTable, nelems: Optional[int] = NELEMS): + """ + Ensure that the values of the id attribute of a DynamicTable are unique. + + Best Practice: :ref:`best_practice_unique_dynamic_table_ids` + + Parameters + ---------- + table: DynamicTable + nelems: int, optional + Only inspect the first {nelems} elements + + Returns + ------- + + """ + data = table.id[:nelems] + if len(set(data)) != len(data): + return InspectorMessage(message="This table has ids that are not unique.") diff --git a/src/nwbinspector/utils.py b/src/nwbinspector/utils.py index 945c93225..386e44e16 100644 --- a/src/nwbinspector/utils.py +++ b/src/nwbinspector/utils.py @@ -87,7 +87,7 @@ def is_regular_series(series: np.ndarray, tolerance_decimals: int = 9): return len(uniq_diff_ts) == 1 -def is_ascending_series(series: Union[h5py.Dataset, ArrayLike], nelems=None): +def is_ascending_series(series: Union[h5py.Dataset, ArrayLike], nelems: Optional[int] = None): """General purpose function for determining if a series is monotonic increasing.""" if isinstance(series, h5py.Dataset): return np.all(np.diff(_cache_data_selection(data=series, selection=slice(nelems))) > 0) diff --git a/tests/unit_tests/test_tables.py b/tests/unit_tests/test_tables.py index a77a379c3..291f5e828 100644 --- a/tests/unit_tests/test_tables.py +++ b/tests/unit_tests/test_tables.py @@ -18,6 +18,7 @@ check_single_row, check_table_values_for_dict, check_col_not_nan, + check_ids_unique, ) from nwbinspector.utils import get_package_version @@ -397,3 +398,21 @@ def test_check_col_not_nan_fail_span_all_data(): file_path=None, ), ] + + +def test_fail_check_ids_unique(): + dt = DynamicTable(name="test_table", description="test", id=[0, 0, 1, 1]) + assert check_ids_unique(dt) == InspectorMessage( + message="This table has ids that are not unique.", + importance=Importance.CRITICAL, + check_function_name="check_ids_unique", + object_type="DynamicTable", + object_name="test_table", + location="/", + file_path=None, + ) + + +def test_pass_check_ids_unique(): + dt = DynamicTable(name="test_table", description="test", id=[0, 1]) + assert check_ids_unique(dt) is None