From e55471f5cf72f298910bb9c9a60621ebc45ad258 Mon Sep 17 00:00:00 2001 From: rly Date: Wed, 29 May 2024 07:04:09 -0700 Subject: [PATCH] Fix handling of dsets with shape (0,0,...) --- lindi/LindiH5ZarrStore/LindiH5ZarrStore.py | 24 ++++++++++++++++------ tests/test_core.py | 22 ++++++++++++++++++++ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py index e40fcef..bafbcaf 100644 --- a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py +++ b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py @@ -303,16 +303,22 @@ def _get_zarray_bytes(self, parent_key: str): # dtype, and filters and then copy the .zarray JSON text from it memory_store = MemoryStore() dummy_group = zarr.group(store=memory_store) + chunks = h5_item.chunks + if chunks is None: + # It's important to not have chunks be None here because that would + # let zarr choose an optimal chunking, whereas we need this to reflect + # the actual chunking in the HDF5 file. + chunks = h5_item.shape + if np.prod(chunks) == 0: + # A chunking of (0,) or (0, 0) or (0, 0, 0), etc. is not allowed in Zarr + chunks = [1] * len(chunks) # Importantly, I'm pretty sure this doesn't actually create the # chunks in the memory store. That's important because we just need # to get the .zarray JSON text from the dummy group. dummy_group.create_dataset( name="dummy_array", shape=h5_item.shape, - # It's important to not have chunks be None here because that would - # let zarr choose an optimal chunking, whereas we need this to reflect - # the actual chunking in the HDF5 file. - chunks=h5_item.chunks if h5_item.chunks is not None else h5_item.shape, + chunks=chunks, dtype=h5_item.dtype, compressor=None, order="C", @@ -434,6 +440,10 @@ def _add_chunk_info_to_refs(self, key_parent: str, add_ref: Callable, add_ref_ch h5_item = self._h5f.get('/' + key_parent, None) assert isinstance(h5_item, h5py.Dataset) + # If the shape is (0,), (0, 0), (0, 0, 0), etc., then do not add any chunk references + if np.prod(h5_item.shape) == 0: + return + # For the case of a scalar dataset, we need to check a few things if h5_item.ndim == 0: if h5_item.chunks is not None: @@ -589,8 +599,10 @@ def _add_ref(key: str, content: Union[bytes, None]): ) def _add_ref_chunk(key: str, data: Tuple[str, int, int]): - assert data[1] is not None - assert data[2] is not None + assert data[1] is not None, \ + f"{key} chunk data is invalid. Element at index 1 cannot be None: {data}" + assert data[2] is not None, \ + f"{key} chunk data is invalid. Element at index 2 cannot be None: {data}" ret["refs"][key] = list(data) # downstream expects a list like on read from a JSON file def _process_group(key, item: h5py.Group): diff --git a/tests/test_core.py b/tests/test_core.py index 5063e03..b0532a6 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -575,5 +575,27 @@ def test_numpy_array_of_byte_strings(): assert lists_are_equal(X1[:].tolist(), X2[:].tolist()) # type: ignore +def test_dataset_zero_shape(): + with tempfile.TemporaryDirectory() as tmpdir: + filename = f"{tmpdir}/test.h5" + with h5py.File(filename, "w") as f: + f.create_dataset("X1D", data=np.array([], dtype=np.int32), shape=(0,)) # NOTE this is not a scalar + f.create_dataset("X3D", data=np.array([], dtype=np.int32), shape=(0,0,0)) + h5f = h5py.File(filename, "r") + with LindiH5ZarrStore.from_file(filename, url=filename) as store: + rfs = store.to_reference_file_system() + h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs) + X1 = h5f['X1D'] + assert isinstance(X1, h5py.Dataset) + X2 = h5f_2['X1D'] + assert isinstance(X2, h5py.Dataset) + assert arrays_are_equal(X1[:], X2[:]) + X1 = h5f['X3D'] + assert isinstance(X1, h5py.Dataset) + X2 = h5f_2['X3D'] + assert isinstance(X2, h5py.Dataset) + assert arrays_are_equal(X1[:], X2[:]) + + if __name__ == '__main__': pass