From 9fe3f9de812ec904c644aca86e98ec8e55b4ce5c Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Fri, 1 Sep 2023 10:00:06 -0700 Subject: [PATCH] Fix DynamicTable with all DataChunkIterator columns bug (#953) * Fix 952 Raise error in DynamicTable __init__ if all columns are specified via AbstractDataChunkIterator but no id's are set * Updated changelog --- CHANGELOG.md | 5 +++++ src/hdmf/common/table.py | 24 +++++++++++++++++------- tests/unit/common/test_table.py | 19 +++++++++++++++++-- 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d496610c5..a90e1873c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # HDMF Changelog +## HDMF 3.9.1 (Upcoming) + +### Bug fixes +- Fixed bug allowing `DynamicTable` to be constructed with empty `id` column when initializing all columns via `AbstractDataChunkIterator` objects. @oruebel [#953](https://github.com/hdmf-dev/hdmf/pull/953) + ## HDMF 3.9.0 (August 25, 2023) diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py index cafd8ff16..08901a022 100644 --- a/src/hdmf/common/table.py +++ b/src/hdmf/common/table.py @@ -314,7 +314,8 @@ def __init__(self, **kwargs): # noqa: C901 # All tables must have ElementIdentifiers (i.e. a primary key column) # Here, we figure out what to do for that - if id is not None: + user_provided_ids = (id is not None) + if user_provided_ids: if not isinstance(id, ElementIdentifiers): id = ElementIdentifiers(name='id', data=id) else: @@ -357,13 +358,22 @@ def __init__(self, **kwargs): # noqa: C901 if isinstance(_data, AbstractDataChunkIterator): colset.pop(c.name, None) lens = [len(c) for c in colset.values()] + all_columns_are_iterators = (len(lens) == 0) + if not all(i == lens[0] for i in lens): - raise ValueError("columns must be the same length") - if len(lens) > 0 and lens[0] != len(id): - # the first part of this conditional is needed in the - # event that all columns are AbstractDataChunkIterators - if len(id) > 0: - raise ValueError("must provide same number of ids as length of columns") + raise ValueError("Columns must be the same length") + # If we have columns given, but all columns are AbstractDataChunkIterator's, then we + # cannot determine how many elements the id column will need. I.e., in this case the + # user needs to provide the id's as otherwise we may create an invalid table with an + # empty Id column but data in the rows. See: https://github.com/hdmf-dev/hdmf/issues/952 + if all_columns_are_iterators and not user_provided_ids: + raise ValueError("Cannot determine row id's for table. Must provide ids with same length " + "as the columns when all columns are specified via DataChunkIterator objects.") + # If we have columns with a known length but the length (i.e., number of rows) + # does not match the number of id's then initialize the id's + if not all_columns_are_iterators and lens[0] != len(id): + if user_provided_ids and len(id) > 0: + raise ValueError("Must provide same number of ids as length of columns") else: # set ids to: 0 to length of columns - 1 id.data.extend(range(lens[0])) diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py index 311e01f8b..a6048ce88 100644 --- a/tests/unit/common/test_table.py +++ b/tests/unit/common/test_table.py @@ -13,6 +13,7 @@ DynamicTableRegion, get_manager, SimpleMultiContainer) from hdmf.testing import TestCase, H5RoundTripMixin, remove_test_file from hdmf.utils import StrDataset +from hdmf.data_utils import DataChunkIterator from tests.unit.helpers.utils import get_temp_filepath @@ -99,10 +100,24 @@ def test_constructor_ElementIdentifier_ids(self): def test_constructor_ids_bad_ids(self): columns = [VectorData(name=s['name'], description=s['description'], data=d) for s, d in zip(self.spec, self.data)] - msg = "must provide same number of ids as length of columns" + msg = "Must provide same number of ids as length of columns" with self.assertRaisesWith(ValueError, msg): DynamicTable(name="with_columns", description='a test table', id=[0, 1], columns=columns) + def test_constructor_all_columns_are_iterators(self): + """ + All columns are specified via AbstractDataChunkIterator but no id's are given. + Test that an error is being raised because we can't determine the id's. + """ + data = np.array([1., 2., 3.]) + column = VectorData(name="TestColumn", description="", data=DataChunkIterator(data)) + msg = ("Cannot determine row id's for table. Must provide ids with same length " + "as the columns when all columns are specified via DataChunkIterator objects.") + with self.assertRaisesWith(ValueError, msg): + _ = DynamicTable(name="TestTable", description="", columns=[column]) + # now test that when we supply id's that the error goes away + _ = DynamicTable(name="TestTable", description="", columns=[column], id=list(range(3))) + @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") def test_add_col_validate(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') @@ -211,7 +226,7 @@ def test_constructor_bad_columns(self): def test_constructor_unequal_length_columns(self): columns = [VectorData(name='col1', description='desc', data=[1, 2, 3]), VectorData(name='col2', description='desc', data=[1, 2])] - msg = "columns must be the same length" + msg = "Columns must be the same length" with self.assertRaisesWith(ValueError, msg): DynamicTable(name="with_columns", description='a test table', columns=columns)