Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix to_hierarchical_dataframe with VectorIndex data columns #666

Merged
merged 8 commits into from
Nov 23, 2021
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
- Fixed broken LaTeX PDF build of the docs. @oruebel (#669)
- Fixed adding containers as a child to a parent container sometimes not marking the parent container as modified. @rly
(#683)
- Fixed `to_hierarchical_dataframe` failing when a table contains a `VectorIndex` column as a regular data column.
@oruebel (#666)


## HDMF 3.1.1 (July 29, 2021)
Expand Down
20 changes: 19 additions & 1 deletion src/hdmf/common/hierarchicaltable.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def to_hierarchical_dataframe(dynamic_table):
names=('source_table', 'label'))

# Case 2: Our DynamicTableRegion columns points to another table with a DynamicTableRegion, i.e.,
# we need to recursively resolve more levels of the table hieararchy
# we need to recursively resolve more levels of the table hierarchy
else:
# First we need to recursively flatten the hierarchy by calling 'to_hierarchical_dataframe()'
# (i.e., this function) on the target of our hierarchical column
Expand Down Expand Up @@ -134,6 +134,24 @@ def to_hierarchical_dataframe(dynamic_table):
hcol_hdf.index.names)
columns = hcol_hdf.columns

# Check if the index contains any unhashable types. If a table contains a VectorIndex column
# (other than the DynamicTableRegion column) then "TypeError: unhashable type: 'list'" will
# occur when converting the index to pd.MultiIndex. To avoid this error, we next check if any
# of the columns in our index are of type list or np.ndarray
unhashable_index_cols = []
if len(index) > 0:
unhashable_index_cols = [i for i, v in enumerate(index[0]) if isinstance(v, (list, np.ndarray))]

# If we have any unhashable list or np.array objects in the index then update them to tuples.
# Ideally we would detect this case when constructing the index, but it is easier to do this
# here and it should not be much more expensive, but it requires iterating over all rows again
if len(unhashable_index_cols) > 0:
for i, v in enumerate(index):
temp = list(v)
for ci in unhashable_index_cols:
temp[ci] = tuple(temp[ci])
index[i] = tuple(temp)

# Construct the pandas dataframe with the hierarchical multi-index
multi_index = pd.MultiIndex.from_tuples(index, names=index_names)
out_df = pd.DataFrame(data=data, index=multi_index, columns=columns)
Expand Down
52 changes: 52 additions & 0 deletions tests/unit/common/test_linkedtables.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,58 @@ def test_to_hierarchical_dataframe_indexed_dtr_on_last_level(self):
self.assertListEqual(hier_df[('aligned_table', ('level0_0', 'tags'))].to_list(),
[['tag1'], ['tag2'], ['tag2', 'tag1']])

def test_to_hierarchical_dataframe_indexed_data_nparray(self):
    # Verify that a table can be converted when it carries a VectorIndex column as regular
    # data, i.e., the indexed column is an ordinary data column rather than the
    # DynamicTableRegion column. Here the indexed data is a numpy ndarray, so an ndarray
    # ends up in the MultiIndex of the table. Because a numpy array is not hashable, this
    # would normally raise an error when the MultiIndex is created.

    # Parent table: one plain data column plus an indexed region into self.aligned_table
    parent_region = DynamicTableRegion(name='l1', description='l1', data=np.arange(4),
                                       table=self.aligned_table)
    parent_region_index = VectorIndex(name='sl1_index', data=[1, 2, 3], target=parent_region)
    parent_values = VectorData(name='p1', description='p1', data=np.arange(3))
    parent_table = DynamicTable(name='parent_table', description='parent_table',
                                columns=[parent_values, parent_region, parent_region_index])

    # Super-parent table: one plain data column plus a (non-indexed) region into the parent
    super_region = DynamicTableRegion(name='sl1', description='sl1', data=np.arange(3),
                                      table=parent_table)
    super_values = VectorData(name='sp1', description='sp1', data=np.arange(3))
    super_table = DynamicTable(name='super_parent_table', description='super_parent_table',
                               columns=[super_values, super_region])
    # The indexed regular data column under test, backed by numpy data
    super_table.add_column(name='vic', description='vic', data=np.arange(9), index=[2, 4, 6])

    flattened = to_hierarchical_dataframe(super_table).reset_index()

    # Expected columns, grouped by the table they originate from
    expected_columns = (
        [('super_parent_table', label) for label in ('id', 'sp1', 'vic')]
        + [('parent_table', label) for label in ('id', 'p1')]
        + [('aligned_table', 'id'), ('aligned_table', ('aligned_table', 'a1'))]
        + [('aligned_table', ('level0_0', label)) for label in ('id', 'tags', 'myid')]
    )
    # Make sure we have the right columns
    self.assertListEqual(flattened.columns.to_list(), expected_columns)
    tags_key = ('aligned_table', ('level0_0', 'tags'))
    self.assertListEqual(flattened[tags_key].to_list(),
                         [['tag1'], ['tag2'], ['tag2', 'tag1']])

def test_to_hierarchical_dataframe_indexed_data_list(self):
    # Verify that a table can be converted when it carries a VectorIndex column as regular
    # data, i.e., the indexed column is an ordinary data column rather than the
    # DynamicTableRegion column. Here the indexed data is a plain list, so a list ends up
    # in the MultiIndex of the table. Because a list is not hashable, this would normally
    # raise an error when the MultiIndex is created.

    # Parent table: one plain data column plus an indexed region into self.aligned_table
    parent_region = DynamicTableRegion(name='l1', description='l1', data=np.arange(4),
                                       table=self.aligned_table)
    parent_region_index = VectorIndex(name='sl1_index', data=[1, 2, 3], target=parent_region)
    parent_values = VectorData(name='p1', description='p1', data=np.arange(3))
    parent_table = DynamicTable(name='parent_table', description='parent_table',
                                columns=[parent_values, parent_region, parent_region_index])

    # Super-parent table: one plain data column plus a (non-indexed) region into the parent
    super_region = DynamicTableRegion(name='sl1', description='sl1', data=np.arange(3),
                                      table=parent_table)
    super_values = VectorData(name='sp1', description='sp1', data=np.arange(3))
    super_table = DynamicTable(name='super_parent_table', description='super_parent_table',
                               columns=[super_values, super_region])
    # The indexed regular data column under test, backed by plain Python lists
    super_table.add_column(name='vic', description='vic', data=list(range(9)), index=[2, 4, 6])

    flattened = to_hierarchical_dataframe(super_table).reset_index()

    # Expected columns, grouped by the table they originate from
    expected_columns = (
        [('super_parent_table', label) for label in ('id', 'sp1', 'vic')]
        + [('parent_table', label) for label in ('id', 'p1')]
        + [('aligned_table', 'id'), ('aligned_table', ('aligned_table', 'a1'))]
        + [('aligned_table', ('level0_0', label)) for label in ('id', 'tags', 'myid')]
    )
    # Make sure we have the right columns
    self.assertListEqual(flattened.columns.to_list(), expected_columns)
    tags_key = ('aligned_table', ('level0_0', 'tags'))
    self.assertListEqual(flattened[tags_key].to_list(),
                         [['tag1'], ['tag2'], ['tag2', 'tag1']])

def test_to_hierarchical_dataframe_empty_tables(self):
# Setup empty tables with the following hierarchy
# super_parent_table ---> parent_table ---> child_table
Expand Down