From 8618aac63b492a15616c24aded5915ec10ecc9a2 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Sun, 22 Aug 2021 23:25:39 -0700 Subject: [PATCH 1/6] Fix #665 TypeError in to_hierarchical_dataframe when index contains lists due to VectorIndex columns --- src/hdmf/common/hierarchicaltable.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/hdmf/common/hierarchicaltable.py b/src/hdmf/common/hierarchicaltable.py index b5ca4db08..3cffc8bd0 100644 --- a/src/hdmf/common/hierarchicaltable.py +++ b/src/hdmf/common/hierarchicaltable.py @@ -101,7 +101,7 @@ def to_hierarchical_dataframe(dynamic_table): names=('source_table', 'label')) # Case 2: Our DynamicTableRegion columns points to another table with a DynamicTableRegion, i.e., - # we need to recursively resolve more levels of the table hieararchy + # we need to recursively resolve more levels of the table hierarchy else: # First we need to recursively flatten the hierarchy by calling 'to_hierarchical_dataframe()' # (i.e., this function) on the target of our hierarchical column @@ -135,7 +135,23 @@ def to_hierarchical_dataframe(dynamic_table): columns = hcol_hdf.columns # Construct the pandas dataframe with the hierarchical multi-index - multi_index = pd.MultiIndex.from_tuples(index, names=index_names) + try: + multi_index = pd.MultiIndex.from_tuples(index, names=index_names) + except TypeError as e: + # If our table contains a VectorIndex column then "TypeError: unhashable type: 'list'" will + # occur when converting the index to pd.MultiIndex. If this is the case, then we need to + # update the lists to tuples. Ideally we would detect this case when constructing the index + # but it is easier to do this here and it should not be much more expensive + if len(index) > 0: # This should always be true + list_type_index_cols = [i for i, v in enumerate(index[0]) if isinstance(v, list)] + for i, v in enumerate(index): + temp = list(v) + for ci in list_type_index_cols: + temp[ci] = tuple(temp[ci]) + index[i] = tuple(temp) + multi_index = pd.MultiIndex.from_tuples(index, names=index_names) + else: # pragma: no cover + raise e out_df = pd.DataFrame(data=data, index=multi_index, columns=columns) return out_df From 675a48bf7c3bfdb19e99f254448d7614a9195017 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Mon, 23 Aug 2021 00:08:01 -0700 Subject: [PATCH 2/6] Also cover the case where an np.ndarray is injected into the MulitIndex --- src/hdmf/common/hierarchicaltable.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hdmf/common/hierarchicaltable.py b/src/hdmf/common/hierarchicaltable.py index 3cffc8bd0..9430fceb7 100644 --- a/src/hdmf/common/hierarchicaltable.py +++ b/src/hdmf/common/hierarchicaltable.py @@ -142,15 +142,15 @@ def to_hierarchical_dataframe(dynamic_table): # occur when converting the index to pd.MultiIndex. If this is the case, then we need to # update the lists to tuples. Ideally we would detect this case when constructing the index # but it is easier to do this here and it should not be much more expensive - if len(index) > 0: # This should always be true - list_type_index_cols = [i for i, v in enumerate(index[0]) if isinstance(v, list)] + if len(index) > 0: # This should always be true as otherwise not TypeError should have been raised + list_type_index_cols = [i for i, v in enumerate(index[0]) if isinstance(v, (list, np.ndarray))] for i, v in enumerate(index): temp = list(v) for ci in list_type_index_cols: - temp[ci] = tuple(temp[ci]) + temp[ci] = tuple(temp[ci]) index[i] = tuple(temp) multi_index = pd.MultiIndex.from_tuples(index, names=index_names) - else: # pragma: no cover + else: # pragma: no cover raise e out_df = pd.DataFrame(data=data, index=multi_index, columns=columns) return out_df From a554f7e405e6a9a6057898f6d504b74baf01fffe Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Mon, 23 Aug 2021 00:27:04 -0700 Subject: [PATCH 3/6] Update error check to explicitly check for bad index values rather than to process only on TypeError --- src/hdmf/common/hierarchicaltable.py | 36 +++++++++++++++------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/hdmf/common/hierarchicaltable.py b/src/hdmf/common/hierarchicaltable.py index 9430fceb7..58ba0e875 100644 --- a/src/hdmf/common/hierarchicaltable.py +++ b/src/hdmf/common/hierarchicaltable.py @@ -134,24 +134,26 @@ def to_hierarchical_dataframe(dynamic_table): hcol_hdf.index.names) columns = hcol_hdf.columns + # Check if the index contains any unhashable types. If a table contains a VectorIndex column + # (other than the DynamicTableRegion column) then "TypeError: unhashable type: 'list'" will + # occur when converting the index to pd.MultiIndex. To avoid this error, we next check if any + # of the columns in our index are of type list of np.ndarray + unhashable_index_cols = [] + if len(index) > 0: + unhashable_index_cols = [i for i, v in enumerate(index[0]) if isinstance(v, (list, np.ndarray))] + + # If we have any unhashable list or np.array objects in the index then update them to tuples. + # Ideally we would detect this case when constructing the index, but it is easier to do this + # here and it should not be much more expensive, but it requires iterating over all rows again + if len(unhashable_index_cols) > 0: + for i, v in enumerate(index): + temp = list(v) + for ci in unhashable_index_cols: + temp[ci] = tuple(temp[ci]) + index[i] = tuple(temp) + # Construct the pandas dataframe with the hierarchical multi-index - try: - multi_index = pd.MultiIndex.from_tuples(index, names=index_names) - except TypeError as e: - # If our table contains a VectorIndex column then "TypeError: unhashable type: 'list'" will - # occur when converting the index to pd.MultiIndex. If this is the case, then we need to - # update the lists to tuples. Ideally we would detect this case when constructing the index - # but it is easier to do this here and it should not be much more expensive - if len(index) > 0: # This should always be true as otherwise not TypeError should have been raised - list_type_index_cols = [i for i, v in enumerate(index[0]) if isinstance(v, (list, np.ndarray))] - for i, v in enumerate(index): - temp = list(v) - for ci in list_type_index_cols: - temp[ci] = tuple(temp[ci]) - index[i] = tuple(temp) - multi_index = pd.MultiIndex.from_tuples(index, names=index_names) - else: # pragma: no cover - raise e + multi_index = pd.MultiIndex.from_tuples(index, names=index_names) out_df = pd.DataFrame(data=data, index=multi_index, columns=columns) return out_df From 720704f3468867c119372fd6e11e8d1ab4d147c9 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Mon, 23 Aug 2021 00:27:49 -0700 Subject: [PATCH 4/6] Add unit test cases to check for to_hierarchical_dataframe with indexed regular data columns --- tests/unit/common/test_linkedtables.py | 52 ++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/unit/common/test_linkedtables.py b/tests/unit/common/test_linkedtables.py index 48a9fd6a0..09680d4c2 100644 --- a/tests/unit/common/test_linkedtables.py +++ b/tests/unit/common/test_linkedtables.py @@ -438,6 +438,58 @@ def test_to_hierarchical_dataframe_indexed_dtr_on_last_level(self): self.assertListEqual(hier_df[('aligned_table', ('level0_0', 'tags'))].to_list(), [['tag1'], ['tag2'], ['tag2', 'tag1']]) + def test_to_hierarchical_dataframe_indexed_data_nparray(self): + # Test that we can convert a table that contains a VectorIndex column as regular data, + # i.e., it is not our DynamicTableRegion column that is index but a regular data column. + # In this test the data is defined as an numpy nd.array so that an nd.array is injected + # into the MultiIndex of the table. As a numpy array is not hashable this would normally + # create an error when creating the MultiIndex + # Parent table + dtr_p1 = DynamicTableRegion(name='l1', description='l1', data=np.arange(4), table=self.aligned_table) + vi_dtr_p1 = VectorIndex(name='sl1_index', data=[1, 2, 3], target=dtr_p1) + p1 = DynamicTable(name='parent_table', description='parent_table', + columns=[VectorData(name='p1', description='p1', data=np.arange(3)), dtr_p1, vi_dtr_p1]) + # Super-parent table + dtr_sp = DynamicTableRegion(name='sl1', description='sl1', data=np.arange(3), table=p1) + spt = DynamicTable(name='super_parent_table', description='super_parent_table', + columns=[VectorData(name='sp1', description='sp1', data=np.arange(3)), dtr_sp]) + spt.add_column(name='vic', description='vic', data=np.arange(9), index=[2, 4, 6]) + hier_df = to_hierarchical_dataframe(spt).reset_index() + expected_columns = [('super_parent_table', 'id'), ('super_parent_table', 'sp1'), ('super_parent_table', 'vic'), + ('parent_table', 'id'), ('parent_table', 'p1'), + ('aligned_table', 'id'), + ('aligned_table', ('aligned_table', 'a1')), ('aligned_table', ('level0_0', 'id')), + ('aligned_table', ('level0_0', 'tags')), ('aligned_table', ('level0_0', 'myid'))] + self.assertListEqual(hier_df.columns.to_list(), expected_columns) # make sure we have the right columns + self.assertListEqual(hier_df[('aligned_table', ('level0_0', 'tags'))].to_list(), + [['tag1'], ['tag2'], ['tag2', 'tag1']]) + + def test_to_hierarchical_dataframe_indexed_data_list(self): + # Test that we can convert a table that contains a VectorIndex column as regular data, + # i.e., it is not our DynamicTableRegion column that is index but a regular data column. + # In this test the data is defined as an list so that a list is injected + # into the MultiIndex of the table. As a list is not hashable this would normally + # create an error when creating the MultiIndex + # Parent table + dtr_p1 = DynamicTableRegion(name='l1', description='l1', data=np.arange(4), table=self.aligned_table) + vi_dtr_p1 = VectorIndex(name='sl1_index', data=[1, 2, 3], target=dtr_p1) + p1 = DynamicTable(name='parent_table', description='parent_table', + columns=[VectorData(name='p1', description='p1', data=np.arange(3)), dtr_p1, vi_dtr_p1]) + # Super-parent table + dtr_sp = DynamicTableRegion(name='sl1', description='sl1', data=np.arange(3), table=p1) + spt = DynamicTable(name='super_parent_table', description='super_parent_table', + columns=[VectorData(name='sp1', description='sp1', data=np.arange(3)), dtr_sp]) + spt.add_column(name='vic', description='vic', data=list(range(9)), index=list([2, 4, 6])) + hier_df = to_hierarchical_dataframe(spt).reset_index() + expected_columns = [('super_parent_table', 'id'), ('super_parent_table', 'sp1'), ('super_parent_table', 'vic'), + ('parent_table', 'id'), ('parent_table', 'p1'), + ('aligned_table', 'id'), + ('aligned_table', ('aligned_table', 'a1')), ('aligned_table', ('level0_0', 'id')), + ('aligned_table', ('level0_0', 'tags')), ('aligned_table', ('level0_0', 'myid'))] + self.assertListEqual(hier_df.columns.to_list(), expected_columns) # make sure we have the right columns + self.assertListEqual(hier_df[('aligned_table', ('level0_0', 'tags'))].to_list(), + [['tag1'], ['tag2'], ['tag2', 'tag1']]) + def test_to_hierarchical_dataframe_empty_tables(self): # Setup empty tables with the following hierarchy # super_parent_table ---> parent_table ---> child_table From f5943275413f47a3deb242a531b990487ddc521a Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Mon, 23 Aug 2021 00:35:12 -0700 Subject: [PATCH 5/6] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a917bb6ad..6d5e97bdc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ ### Fixes - Fixed `setup.py` not being able to import `versioneer` when installing in an embedded Python environment. @rly (#662) +- Fixed `to_hierarchcial_dataframe` failing when a table contains a `VectorIndex` column as a regular data column. + @oruebel (#666) ## HDMF 3.1.1 (July 29, 2021) From 94d5194c997de6abc106a38de829fd5dce0581a5 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Mon, 22 Nov 2021 15:04:01 -0800 Subject: [PATCH 6/6] Fix typo --- src/hdmf/common/hierarchicaltable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf/common/hierarchicaltable.py b/src/hdmf/common/hierarchicaltable.py index 58ba0e875..8322d2d73 100644 --- a/src/hdmf/common/hierarchicaltable.py +++ b/src/hdmf/common/hierarchicaltable.py @@ -137,7 +137,7 @@ def to_hierarchical_dataframe(dynamic_table): # Check if the index contains any unhashable types. If a table contains a VectorIndex column # (other than the DynamicTableRegion column) then "TypeError: unhashable type: 'list'" will # occur when converting the index to pd.MultiIndex. To avoid this error, we next check if any - # of the columns in our index are of type list of np.ndarray + # of the columns in our index are of type list or np.ndarray unhashable_index_cols = [] if len(index) > 0: unhashable_index_cols = [i for i, v in enumerate(index[0]) if isinstance(v, (list, np.ndarray))]