Skip to content

Commit

Permalink
Dynamic table unique ids (#316)
Browse files Browse the repository at this point in the history
* create check and test for dynamic table unique ids

* add test pass for dyn tab unique ids

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update CHANGELOG.md

* add best practice section

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix CHANGELOG.md

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
bendichter and pre-commit-ci[bot] authored Nov 30, 2022
1 parent 733e711 commit 4bd63f9
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 10 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Upcoming

### New Checks
* Add check for unique ids for DynamicTables. [PR #316](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/316)


### Fixes
* Fix `check_subject_proper_age_range` to parse years. [PR #314](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/314)
* Write a custom `get_data_shape` method that does not return `maxshape`, which fixes errors in parsing shape. [PR #315](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/315)
Expand Down
11 changes: 11 additions & 0 deletions docs/best_practices/tables.rst
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,14 @@ Timing Columns
Times are always stored in seconds in NWB. In :ref:`nwb-schema:sec-TimeIntervals` tables such as the ``TrialsTable`` and :ref:`EpochsTable <nwb-schema:epochs>`, ``start_time`` and ``stop_time`` should both be in seconds with respect to the ``timestamps_reference_time`` of the :ref:`nwb-schema:sec-NWBFile` (which by default is the ``session_start_time``, see :ref:`best_practice_global_time_reference` for more details).

Additional time columns in :ref:`nwb-schema:sec-TimeIntervals` tables, such as the ``TrialsTable`` should have ``_time`` as a suffix to the name. *E.g.*, if you add more times in ``TrialsTable``, such as a subject response time, name it ``response_time`` and store the time values in seconds from the ``timestamps_reference_time`` of the :ref:`nwb-schema:sec-NWBFile`, just like ``start_time`` and ``stop_time``. This convention is used by downstream processing tools. For instance, NWBWidgets uses these times to create peri-stimulus time histograms relating spiking activity to trial events. See :ref:`best_practice_global_time_reference` for more details.

.. _best_practice_unique_dynamic_table_ids:

Unique ids
~~~~~~~~~~~

The values of the ``id`` attribute of any :ref:`hdmf-schema:sec-dynamictable` should be unique. This includes
descendants of :ref:`hdmf-schema:sec-dynamictable` such as :ref:`nwb-schema:sec-TimeIntervals` and
``ElectrodesTable``. In PyNWB, rows of :ref:`hdmf-schema:sec-dynamictable` increment as you add rows, so this
variable is unique by default. If you would like to make values of ``id`` non-unique, a better
solution would be to store these values as a custom column and use the default ``id`` values.
1 change: 1 addition & 0 deletions src/nwbinspector/checks/ecephys.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from pynwb.misc import Units
from pynwb.ecephys import ElectricalSeries
from pynwb.file import ElectrodeTable

from ..utils import get_data_shape

Expand Down
43 changes: 34 additions & 9 deletions src/nwbinspector/checks/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,11 @@
)


NELEMS = 200


@register_check(importance=Importance.CRITICAL, neurodata_type=DynamicTableRegion)
def check_dynamic_table_region_data_validity(dynamic_table_region: DynamicTableRegion, nelems=200):
def check_dynamic_table_region_data_validity(dynamic_table_region: DynamicTableRegion, nelems: Optional[int] = NELEMS):
"""Check if a DynamicTableRegion is valid."""
if np.any(np.asarray(dynamic_table_region.data[:nelems]) > len(dynamic_table_region.table)):
return InspectorMessage(
Expand All @@ -41,14 +44,14 @@ def check_empty_table(table: DynamicTable):


@register_check(importance=Importance.BEST_PRACTICE_VIOLATION, neurodata_type=TimeIntervals)
def check_time_interval_time_columns(time_intervals: TimeIntervals, nelems: int = 200):
def check_time_interval_time_columns(time_intervals: TimeIntervals, nelems: Optional[int] = NELEMS):
"""
Check that time columns are in ascending order.
Parameters
----------
time_intervals: TimeIntervals
nelems: int
nelems: int, optional
Only check the first {nelems} elements. This is useful in case there columns are
very long so you don't need to load the entire array into memory. Use None to
load the entire arrays.
Expand All @@ -68,7 +71,7 @@ def check_time_interval_time_columns(time_intervals: TimeIntervals, nelems: int


@register_check(importance=Importance.BEST_PRACTICE_VIOLATION, neurodata_type=TimeIntervals)
def check_time_intervals_stop_after_start(time_intervals: TimeIntervals, nelems: int = 200):
def check_time_intervals_stop_after_start(time_intervals: TimeIntervals, nelems: Optional[int] = NELEMS):
"""
Check that all stop times on a TimeInterval object occur after their corresponding start times.
Expand All @@ -77,7 +80,7 @@ def check_time_intervals_stop_after_start(time_intervals: TimeIntervals, nelems:
Parameters
----------
time_intervals: TimeIntervals
nelems: int
nelems: int, optional
Only check the first {nelems} elements. This is useful in case there columns are
very long so you don't need to load the entire array into memory. Use None to
load the entire arrays.
Expand All @@ -96,14 +99,14 @@ def check_time_intervals_stop_after_start(time_intervals: TimeIntervals, nelems:


@register_check(importance=Importance.BEST_PRACTICE_SUGGESTION, neurodata_type=DynamicTable)
def check_column_binary_capability(table: DynamicTable, nelems: int = 200):
def check_column_binary_capability(table: DynamicTable, nelems: Optional[int] = NELEMS):
"""
Check each column of a table to see if the data could be set as a boolean dtype.
Parameters
----------
time_intervals: DynamicTable
nelems: int
nelems: int, optional
Only check the first {nelems} elements. This is useful in case there columns are
very long so you don't need to load the entire array into memory. Use None to
load the entire arrays.
Expand Down Expand Up @@ -176,7 +179,7 @@ def check_single_row(


@register_check(importance=Importance.BEST_PRACTICE_VIOLATION, neurodata_type=DynamicTable)
def check_table_values_for_dict(table: DynamicTable, nelems: int = 200):
def check_table_values_for_dict(table: DynamicTable, nelems: Optional[int] = NELEMS):
"""Check if any values in a row or column of a table contain a string casting of a Python dictionary."""
for column in table.columns:
if not hasattr(column, "data") or isinstance(column, VectorIndex) or not isinstance(column.data[0], str):
Expand All @@ -193,7 +196,7 @@ def check_table_values_for_dict(table: DynamicTable, nelems: int = 200):


@register_check(importance=Importance.BEST_PRACTICE_SUGGESTION, neurodata_type=DynamicTable)
def check_col_not_nan(table: DynamicTable, nelems: Optional[int] = 200):
def check_col_not_nan(table: DynamicTable, nelems: Optional[int] = NELEMS):
"""Check if all of the values in a single column of a table are NaN."""
for column in table.columns:
if (
Expand All @@ -210,3 +213,25 @@ def check_col_not_nan(table: DynamicTable, nelems: Optional[int] = 200):
message = message.replace("might have", "has") if nelems is None or slice_by == 1 else message
if all(np.isnan(column[slice(0, None, slice_by)]).flatten()):
yield InspectorMessage(message=message)


@register_check(importance=Importance.CRITICAL, neurodata_type=DynamicTable)
def check_ids_unique(table: DynamicTable, nelems: Optional[int] = NELEMS):
"""
Ensure that the values of the id attribute of a DynamicTable are unique.
Best Practice: :ref:`best_practice_unique_dynamic_table_ids`
Parameters
----------
table: DynamicTable
nelems: int, optional
Only inspect the first {nelems} elements
Returns
-------
"""
data = table.id[:nelems]
if len(set(data)) != len(data):
return InspectorMessage(message="This table has ids that are not unique.")
2 changes: 1 addition & 1 deletion src/nwbinspector/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def is_regular_series(series: np.ndarray, tolerance_decimals: int = 9):
return len(uniq_diff_ts) == 1


def is_ascending_series(series: Union[h5py.Dataset, ArrayLike], nelems=None):
def is_ascending_series(series: Union[h5py.Dataset, ArrayLike], nelems: Optional[int] = None):
"""General purpose function for determining if a series is monotonic increasing."""
if isinstance(series, h5py.Dataset):
return np.all(np.diff(_cache_data_selection(data=series, selection=slice(nelems))) > 0)
Expand Down
19 changes: 19 additions & 0 deletions tests/unit_tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
check_single_row,
check_table_values_for_dict,
check_col_not_nan,
check_ids_unique,
)
from nwbinspector.utils import get_package_version

Expand Down Expand Up @@ -397,3 +398,21 @@ def test_check_col_not_nan_fail_span_all_data():
file_path=None,
),
]


def test_fail_check_ids_unique():
dt = DynamicTable(name="test_table", description="test", id=[0, 0, 1, 1])
assert check_ids_unique(dt) == InspectorMessage(
message="This table has ids that are not unique.",
importance=Importance.CRITICAL,
check_function_name="check_ids_unique",
object_type="DynamicTable",
object_name="test_table",
location="/",
file_path=None,
)


def test_pass_check_ids_unique():
dt = DynamicTable(name="test_table", description="test", id=[0, 1])
assert check_ids_unique(dt) is None

0 comments on commit 4bd63f9

Please sign in to comment.