Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Warn when adding ragged arrays to DynamicTable without index argument #1066

Merged
merged 8 commits into from
Mar 14, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### Enhancements
- Added docs page that lists limitations of support for the HDMF specification language. @rly [#1069](https://github.com/hdmf-dev/hdmf/pull/1069)
- Added warning when using `add_row` or `add_column` to add a ragged array to `DynamicTable` without an index parameter. @stephprince [#1066](https://github.com/hdmf-dev/hdmf/pull/1066)

## HDMF 3.12.2 (February 9, 2024)

Expand Down
26 changes: 23 additions & 3 deletions src/hdmf/common/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from . import register_class, EXP_NAMESPACE
from ..container import Container, Data
from ..data_utils import DataIO, AbstractDataChunkIterator
from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type
from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged
from ..term_set import TermSetWrapper


Expand Down Expand Up @@ -639,12 +639,16 @@ def __len__(self):
{'name': 'id', 'type': int, 'doc': 'the ID for the row', 'default': None},
{'name': 'enforce_unique_id', 'type': bool, 'doc': 'enforce that the id in the table must be unique',
'default': False},
{'name': 'check_ragged', 'type': bool, 'default': True,
'doc': ('whether or not to check for ragged arrays when adding data to the table. '
'Set to False to avoid checking every element if performance issues occur.')},
allow_extra=True)
def add_row(self, **kwargs):
"""
Add a row to the table. If *id* is not provided, it will auto-increment.
"""
data, row_id, enforce_unique_id = popargs('data', 'id', 'enforce_unique_id', kwargs)
data, row_id, enforce_unique_id, check_ragged = popargs('data', 'id', 'enforce_unique_id', 'check_ragged',
kwargs)
data = data if data is not None else kwargs

bad_data = []
Expand Down Expand Up @@ -709,6 +713,11 @@ def add_row(self, **kwargs):
c.add_vector(data[colname])
else:
c.add_row(data[colname])
if check_ragged and is_ragged(c.data):
warn(("Data has elements with different lengths and therefore cannot be coerced into an "
"N-dimensional array. Use the 'index' argument when creating a column to add rows "
"with different lengths."),
stacklevel=2)

def __eq__(self, other):
"""Compare if the two DynamicTables contain the same data.
Expand Down Expand Up @@ -748,6 +757,9 @@ def __eq__(self, other):
'doc': ('class to use to represent the column data. If table=True, this field is ignored and a '
'DynamicTableRegion object is used. If enum=True, this field is ignored and a EnumData '
'object is used.')},
{'name': 'check_ragged', 'type': bool, 'default': True,
'doc': ('whether or not to check for ragged arrays when adding data to the table. '
'Set to False to avoid checking every element if performance issues occur.')},
allow_extra=True)
def add_column(self, **kwargs): # noqa: C901
"""
Expand All @@ -760,7 +772,7 @@ def add_column(self, **kwargs): # noqa: C901
:raises ValueError: if the column has already been added to the table
"""
name, data = getargs('name', 'data', kwargs)
index, table, enum, col_cls= popargs('index', 'table', 'enum', 'col_cls', kwargs)
index, table, enum, col_cls, check_ragged = popargs('index', 'table', 'enum', 'col_cls', 'check_ragged', kwargs)

if isinstance(index, VectorIndex):
warn("Passing a VectorIndex in for index may lead to unexpected behavior. This functionality will be "
Expand Down Expand Up @@ -823,6 +835,14 @@ def add_column(self, **kwargs): # noqa: C901
# once we have created the column
create_vector_index = None
if ckwargs.get('data', None) is not None:

# if no index was provided, check that data is not ragged
if index is False and check_ragged and is_ragged(data):
warn(("Data has elements with different lengths and therefore cannot be coerced into an "
"N-dimensional array. Use the 'index' argument when adding a column of data with "
"different lengths."),
stacklevel=2)

# Check that we are asked to create an index
if (isinstance(index, bool) or isinstance(index, int)) and index > 0 and len(data) > 0:
# Iteratively flatten the data we use for the column based on the depth of the index to generate.
Expand Down
14 changes: 14 additions & 0 deletions src/hdmf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,20 @@ def to_uint_array(arr):
raise ValueError('Cannot convert array of dtype %s to uint.' % arr.dtype)


def is_ragged(data):
"""
Test whether a list of lists or array is ragged / jagged
"""
if isinstance(data, (list, tuple)):
lengths = [len(sub_data) if isinstance(sub_data, (list, tuple)) else 1 for sub_data in data]
if len(set(lengths)) > 1:
return True # ragged at this level

return any(is_ragged(sub_data) for sub_data in data) # check next level

return False


class LabelledDict(dict):
"""A dict wrapper that allows querying by an attribute of the values and running a callable on removed items.

Expand Down
68 changes: 68 additions & 0 deletions tests/unit/common/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,74 @@ def test_add_column_multi_index(self):
]
)

def test_add_column_without_required_index(self):
"""
Add a column with different element lengths without specifying an index parameter
"""
table = self.with_spec()
table.add_row(foo=5, bar=50.0, baz='lizard')
table.add_row(foo=5, bar=50.0, baz='lizard')

# testing adding column without a necessary index parameter
lol_data = [[1, 2, 3], [1, 2, 3, 4]]
str_data = [['a', 'b'], ['a', 'b', 'c']]
empty_data = [[1, 2], []]
multi_nested_data = [[[1, 2, 3], [1, 2, 3, 4]], [1, 2]]
tuple_data = ((1, 2, 3), (1, 2, 3, 4))

msg = ("Data has elements with different lengths and therefore cannot be coerced into an N-dimensional "
"array. Use the 'index' argument when adding a column of data with different lengths.")
with self.assertWarnsWith(UserWarning, msg):
table.add_column(name='col1', description='', data=lol_data,)
with self.assertWarnsWith(UserWarning, msg):
table.add_column(name='col2', description='', data=str_data,)
with self.assertWarnsWith(UserWarning, msg):
table.add_column(name='col3', description='', data=empty_data,)
with self.assertWarnsWith(UserWarning, msg):
table.add_column(name='col4', description='', data=multi_nested_data,)
with self.assertWarnsWith(UserWarning, msg):
table.add_column(name='col5', description='', data=tuple_data,)

def test_add_column_without_required_index_and_no_ragged_check(self):
"""
Add a column with different element lengths without checking for raggedness
"""
lol_data = [[1, 2, 3], [1, 2, 3, 4]]
table = self.with_spec()
table.add_row(foo=5, bar=50.0, baz='lizard')
table.add_row(foo=5, bar=50.0, baz='lizard')
table.add_column(name='col1', description='', data=lol_data, check_ragged=False)

def test_add_row_without_required_index(self):
"""
Add rows with different element lengths without specifying an index parameter
"""

# test adding row of list data with different lengths without index parameter
msg = ("Data has elements with different lengths and therefore cannot be coerced into an N-dimensional "
"array. Use the 'index' argument when creating a column to add rows with different lengths.")
table = self.with_spec()
table.add_column(name='qux', description='qux column')
table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3])
with self.assertWarnsWith(UserWarning, msg):
table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3 ,4])

# test adding row of tuple/str data with different lengths without index parameter
table = self.with_spec()
table.add_column(name='qux', description='qux column')
table.add_row(foo=5, bar=50.0, baz='lizard', qux=('a', 'b'))
with self.assertWarnsWith(UserWarning, msg):
table.add_row(foo=5, bar=50.0, baz='lizard', qux=('a', 'b', 'c'))

def test_add_row_without_required_index_and_no_ragged_check(self):
"""
Add rows with different element lengths without checking for raggedness
"""
table = self.with_spec()
table.add_column(name='qux', description='qux column')
table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3])
table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3 ,4], check_ragged=False)

def test_add_column_auto_index_int(self):
"""
Add a column as a list of lists after we have already added data so that we need to create a single VectorIndex
Expand Down
Loading