From 40caf7aa22dc17420c2381ef45b83c807c802cbd Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Tue, 5 Mar 2024 08:01:34 -0800 Subject: [PATCH 1/7] add detection of ragged array inputs to table --- src/hdmf/common/table.py | 9 ++++++++- src/hdmf/utils.py | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py index 5eeedcd86..d7948e39b 100644 --- a/src/hdmf/common/table.py +++ b/src/hdmf/common/table.py @@ -15,7 +15,7 @@ from . import register_class, EXP_NAMESPACE from ..container import Container, Data from ..data_utils import DataIO, AbstractDataChunkIterator -from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type +from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged from ..term_set import TermSetWrapper @@ -709,6 +709,8 @@ def add_row(self, **kwargs): c.add_vector(data[colname]) else: c.add_row(data[colname]) + if is_ragged(c.data): + raise ValueError("Data is ragged. Use the 'index' argument when creating a column that will have ragged data.") def __eq__(self, other): """Compare if the two DynamicTables contain the same data. @@ -823,6 +825,11 @@ def add_column(self, **kwargs): # noqa: C901 # once we have created the column create_vector_index = None if ckwargs.get('data', None) is not None: + + # if no index was provided, check that data is not ragged + if index is False and is_ragged(data): + raise ValueError("Data is ragged. Use the 'index' argument when adding a column with ragged data.") + # Check that we are asked to create an index if (isinstance(index, bool) or isinstance(index, int)) and index > 0 and len(data) > 0: # Iteratively flatten the data we use for the column based on the depth of the index to generate. diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py index 57a4bb465..3fba7a2ab 100644 --- a/src/hdmf/utils.py +++ b/src/hdmf/utils.py @@ -954,6 +954,21 @@ def to_uint_array(arr): raise ValueError('Cannot convert array of dtype %s to uint.' % arr.dtype) +def is_ragged(data): + """ + Test whether a list of lists or array is ragged / jagged + """ + if hasattr(data, '__len__') and not isinstance(data, str): + lengths = [len(sub_data) if hasattr(sub_data, '__len__') and not isinstance(sub_data, str) else 1 + for sub_data in data] + if len(set(lengths)) > 1: + return True # ragged at this level + + return any(is_ragged(sub_data) for sub_data in data) # check next level + + return False + + class LabelledDict(dict): """A dict wrapper that allows querying by an attribute of the values and running a callable on removed items. From a147f3bddb40bb4e94f4d24c3f5943119dba256f Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Tue, 5 Mar 2024 08:04:56 -0800 Subject: [PATCH 2/7] add tests for ragged array inputs to table --- tests/unit/common/test_table.py | 38 +++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py index 7246a8ba8..babdff45a 100644 --- a/tests/unit/common/test_table.py +++ b/tests/unit/common/test_table.py @@ -354,6 +354,44 @@ def test_add_column_multi_index(self): ] ) + def test_add_column_without_required_index(self): + """ + Add a column as a ragged list of lists without specifying an index parameter + """ + table = self.with_spec() + table.add_row(foo=5, bar=50.0, baz='lizard') + table.add_row(foo=5, bar=50.0, baz='lizard') + + # testing adding column without a necessary index parameter + msg = "Data is ragged. Use the 'index' argument when adding a column with ragged data." + lol_data = [[1, 2, 3], [1, 2, 3, 4]] + str_data = [['a', 'b'], ['a', 'b', 'c']] + empty_data = [[1, 2], []] + multi_nested_data = [[[1, 2, 3], [1, 2, 3, 4]], [1, 2]] + arr_data = np.array([[1, 2, 3], [1, 2, 3, 4]], dtype='object') + + with self.assertRaisesWith(ValueError, msg): + table.add_column(name='qux', description='qux column', data=lol_data,) + with self.assertRaisesWith(ValueError, msg): + table.add_column(name='qux', description='qux column', data=str_data,) + with self.assertRaisesWith(ValueError, msg): + table.add_column(name='qux', description='qux column', data=empty_data,) + with self.assertRaisesWith(ValueError, msg): + table.add_column(name='qux', description='qux column', data=multi_nested_data,) + with self.assertRaisesWith(ValueError, msg): + table.add_column(name='qux', description='qux column', data=arr_data,) + + def test_add_row_without_required_index(self): + """ + Add a column as a ragged list of lists without specifying an index parameter + """ + table = self.with_spec() + table.add_column(name='qux', description='qux column') + table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3]) + msg = "Data is ragged. Use the 'index' argument when creating a column that will have ragged data." + with self.assertRaisesWith(ValueError, msg): + table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3 ,4]) + def test_add_column_auto_index_int(self): """ Add a column as a list of lists after we have already added data so that we need to create a single VectorIndex From 9f7011f3fe5a9bbe6ebc0974186faea0e26c2041 Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Tue, 5 Mar 2024 16:28:46 -0800 Subject: [PATCH 3/7] add warnings for ragged inputs to table --- src/hdmf/common/table.py | 6 ++++-- tests/unit/common/test_table.py | 22 +++++++++++----------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py index d7948e39b..588c871e2 100644 --- a/src/hdmf/common/table.py +++ b/src/hdmf/common/table.py @@ -710,7 +710,8 @@ def add_row(self, **kwargs): else: c.add_row(data[colname]) if is_ragged(c.data): - raise ValueError("Data is ragged. Use the 'index' argument when creating a column that will have ragged data.") + warn("Data is ragged. Use the 'index' argument when creating a column that will have ragged data.", + stacklevel=2) def __eq__(self, other): """Compare if the two DynamicTables contain the same data. @@ -828,7 +829,8 @@ def add_column(self, **kwargs): # noqa: C901 # if no index was provided, check that data is not ragged if index is False and is_ragged(data): - raise ValueError("Data is ragged. Use the 'index' argument when adding a column with ragged data.") + warn("Data is ragged. Use the 'index' argument when adding a column with ragged data.", + stacklevel=2) # Check that we are asked to create an index if (isinstance(index, bool) or isinstance(index, int)) and index > 0 and len(data) > 0: diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py index babdff45a..a5f38d713 100644 --- a/tests/unit/common/test_table.py +++ b/tests/unit/common/test_table.py @@ -370,16 +370,16 @@ def test_add_column_without_required_index(self): multi_nested_data = [[[1, 2, 3], [1, 2, 3, 4]], [1, 2]] arr_data = np.array([[1, 2, 3], [1, 2, 3, 4]], dtype='object') - with self.assertRaisesWith(ValueError, msg): - table.add_column(name='qux', description='qux column', data=lol_data,) - with self.assertRaisesWith(ValueError, msg): - table.add_column(name='qux', description='qux column', data=str_data,) - with self.assertRaisesWith(ValueError, msg): - table.add_column(name='qux', description='qux column', data=empty_data,) - with self.assertRaisesWith(ValueError, msg): - table.add_column(name='qux', description='qux column', data=multi_nested_data,) - with self.assertRaisesWith(ValueError, msg): - table.add_column(name='qux', description='qux column', data=arr_data,) + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col1', description='', data=lol_data,) + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col2', description='', data=str_data,) + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col3', description='', data=empty_data,) + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col4', description='', data=multi_nested_data,) + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col5', description='', data=arr_data,) def test_add_row_without_required_index(self): """ @@ -389,7 +389,7 @@ def test_add_row_without_required_index(self): table.add_column(name='qux', description='qux column') table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3]) msg = "Data is ragged. Use the 'index' argument when creating a column that will have ragged data." - with self.assertRaisesWith(ValueError, msg): + with self.assertWarnsWith(UserWarning, msg): table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3 ,4]) def test_add_column_auto_index_int(self): From 1f940fe500aa51a44ee7c7571b4ed3d8932db461 Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Tue, 5 Mar 2024 16:46:14 -0800 Subject: [PATCH 4/7] update CHANGELOG.md --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1294aee02..0d1e6241f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # HDMF Changelog +## HDMF 3.13.0 (Upcoming) + +### Enhancements +- Added warning when using `add_row` or `add_column` to add a ragged array to `DynamicTable` without an index parameter. @stephprince [#1066](https://github.com/hdmf-dev/hdmf/pull/1066) + ## HDMF 3.12.2 (February 9, 2024) ### Bug fixes From a4b20eca967545bfa4ab7aeb4f71092f7ef89ae2 Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Tue, 12 Mar 2024 14:26:47 -0700 Subject: [PATCH 5/7] check only lists and tuples for raggedness --- src/hdmf/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py index 3fba7a2ab..5e0b61539 100644 --- a/src/hdmf/utils.py +++ b/src/hdmf/utils.py @@ -958,9 +958,8 @@ def is_ragged(data): """ Test whether a list of lists or array is ragged / jagged """ - if hasattr(data, '__len__') and not isinstance(data, str): - lengths = [len(sub_data) if hasattr(sub_data, '__len__') and not isinstance(sub_data, str) else 1 - for sub_data in data] + if isinstance(data, (list, tuple)): + lengths = [len(sub_data) if isinstance(sub_data, (list, tuple)) else 1 for sub_data in data] if len(set(lengths)) > 1: return True # ragged at this level From d8ab642ba4bb104c65bfeadbda8d931fd4ad9cb2 Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Tue, 12 Mar 2024 14:37:58 -0700 Subject: [PATCH 6/7] add flag to turn off ragged data checks --- src/hdmf/common/table.py | 23 +++++++++++++----- tests/unit/common/test_table.py | 42 ++++++++++++++++++++++++++++----- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py index 588c871e2..3b67ff19d 100644 --- a/src/hdmf/common/table.py +++ b/src/hdmf/common/table.py @@ -639,12 +639,16 @@ def __len__(self): {'name': 'id', 'type': int, 'doc': 'the ID for the row', 'default': None}, {'name': 'enforce_unique_id', 'type': bool, 'doc': 'enforce that the id in the table must be unique', 'default': False}, + {'name': 'check_ragged', 'type': bool, 'default': True, + 'doc': ('whether or not to check for ragged arrays when adding data to the table. ' + 'Set to False to avoid checking every element if performance issues occur.')}, allow_extra=True) def add_row(self, **kwargs): """ Add a row to the table. If *id* is not provided, it will auto-increment. """ - data, row_id, enforce_unique_id = popargs('data', 'id', 'enforce_unique_id', kwargs) + data, row_id, enforce_unique_id, check_ragged = popargs('data', 'id', 'enforce_unique_id', 'check_ragged', + kwargs) data = data if data is not None else kwargs bad_data = [] @@ -709,8 +713,10 @@ def add_row(self, **kwargs): c.add_vector(data[colname]) else: c.add_row(data[colname]) - if is_ragged(c.data): - warn("Data is ragged. Use the 'index' argument when creating a column that will have ragged data.", + if check_ragged and is_ragged(c.data): + warn(("Data has elements with different lengths and therefore cannot be coerced into an " + "N-dimensional array. Use the 'index' argument when creating a column to add rows " + "with different lengths."), stacklevel=2) def __eq__(self, other): @@ -751,6 +757,9 @@ def __eq__(self, other): 'doc': ('class to use to represent the column data. If table=True, this field is ignored and a ' 'DynamicTableRegion object is used. If enum=True, this field is ignored and a EnumData ' 'object is used.')}, + {'name': 'check_ragged', 'type': bool, 'default': True, + 'doc': ('whether or not to check for ragged arrays when adding data to the table. ' + 'Set to False to avoid checking every element if performance issues occur.')}, allow_extra=True) def add_column(self, **kwargs): # noqa: C901 """ @@ -763,7 +772,7 @@ def add_column(self, **kwargs): # noqa: C901 :raises ValueError: if the column has already been added to the table """ name, data = getargs('name', 'data', kwargs) - index, table, enum, col_cls= popargs('index', 'table', 'enum', 'col_cls', kwargs) + index, table, enum, col_cls, check_ragged = popargs('index', 'table', 'enum', 'col_cls', 'check_ragged', kwargs) if isinstance(index, VectorIndex): warn("Passing a VectorIndex in for index may lead to unexpected behavior. This functionality will be " @@ -828,8 +837,10 @@ def add_column(self, **kwargs): # noqa: C901 if ckwargs.get('data', None) is not None: # if no index was provided, check that data is not ragged - if index is False and is_ragged(data): - warn("Data is ragged. Use the 'index' argument when adding a column with ragged data.", + if index is False and check_ragged and is_ragged(data): + warn(("Data has elements with different lengths and therefore cannot be coerced into an " + "N-dimensional array. Use the 'index' argument when adding a column of data with " + "different lengths."), stacklevel=2) # Check that we are asked to create an index diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py index a5f38d713..c14ec75f0 100644 --- a/tests/unit/common/test_table.py +++ b/tests/unit/common/test_table.py @@ -356,20 +356,21 @@ def test_add_column_multi_index(self): def test_add_column_without_required_index(self): """ - Add a column as a ragged list of lists without specifying an index parameter + Add a column with different element lengths without specifying an index parameter """ table = self.with_spec() table.add_row(foo=5, bar=50.0, baz='lizard') table.add_row(foo=5, bar=50.0, baz='lizard') # testing adding column without a necessary index parameter - msg = "Data is ragged. Use the 'index' argument when adding a column with ragged data." lol_data = [[1, 2, 3], [1, 2, 3, 4]] str_data = [['a', 'b'], ['a', 'b', 'c']] empty_data = [[1, 2], []] multi_nested_data = [[[1, 2, 3], [1, 2, 3, 4]], [1, 2]] - arr_data = np.array([[1, 2, 3], [1, 2, 3, 4]], dtype='object') + tuple_data = ((1, 2, 3), (1, 2, 3, 4)) + msg = ("Data has elements with different lengths and therefore cannot be coerced into an N-dimensional " + "array. Use the 'index' argument when adding a column of data with different lengths.") with self.assertWarnsWith(UserWarning, msg): table.add_column(name='col1', description='', data=lol_data,) with self.assertWarnsWith(UserWarning, msg): @@ -379,18 +380,47 @@ def test_add_column_without_required_index(self): with self.assertWarnsWith(UserWarning, msg): table.add_column(name='col4', description='', data=multi_nested_data,) with self.assertWarnsWith(UserWarning, msg): - table.add_column(name='col5', description='', data=arr_data,) + table.add_column(name='col5', description='', data=tuple_data,) + + def test_add_column_without_required_index_and_no_ragged_check(self): + """ + Add a column with different element lengths without checking for raggedness + """ + lol_data = [[1, 2, 3], [1, 2, 3, 4]] + table = self.with_spec() + table.add_row(foo=5, bar=50.0, baz='lizard') + table.add_row(foo=5, bar=50.0, baz='lizard') + table.add_column(name='col1', description='', data=lol_data, check_ragged=False) def test_add_row_without_required_index(self): """ - Add a column as a ragged list of lists without specifying an index parameter + Add rows with different element lengths without specifying an index parameter """ + + # test adding row of list data with different lengths without index parameter + msg = ("Data has elements with different lengths and therefore cannot be coerced into an N-dimensional " + "array. Use the 'index' argument when creating a column to add rows with different lengths.") table = self.with_spec() table.add_column(name='qux', description='qux column') table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3]) - msg = "Data is ragged. Use the 'index' argument when creating a column that will have ragged data." with self.assertWarnsWith(UserWarning, msg): table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3 ,4]) + + # test adding row of tuple/str data with different lengths without index parameter + table = self.with_spec() + table.add_column(name='qux', description='qux column') + table.add_row(foo=5, bar=50.0, baz='lizard', qux=('a', 'b')) + with self.assertWarnsWith(UserWarning, msg): + table.add_row(foo=5, bar=50.0, baz='lizard', qux=('a', 'b', 'c')) + + def test_add_row_without_required_index_and_no_ragged_check(self): + """ + Add rows with different element lengths without checking for raggedness + """ + table = self.with_spec() + table.add_column(name='qux', description='qux column') + table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3]) + table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3 ,4], check_ragged=False) def test_add_column_auto_index_int(self): """ From a44206d40be22725d46964d5ba52b24dddf98642 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 12 Mar 2024 22:22:13 +0000 Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/unit/common/test_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py index c14ec75f0..d98add060 100644 --- a/tests/unit/common/test_table.py +++ b/tests/unit/common/test_table.py @@ -405,7 +405,7 @@ def test_add_row_without_required_index(self): table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3]) with self.assertWarnsWith(UserWarning, msg): table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3 ,4]) - + # test adding row of tuple/str data with different lengths without index parameter table = self.with_spec() table.add_column(name='qux', description='qux column')