Fix handling of dsets with shape (0,0,...)
rly committed May 29, 2024
1 parent 731fcb5 commit e55471f
Showing 2 changed files with 40 additions and 6 deletions.
24 changes: 18 additions & 6 deletions lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
@@ -303,16 +303,22 @@ def _get_zarray_bytes(self, parent_key: str):
         # dtype, and filters and then copy the .zarray JSON text from it
         memory_store = MemoryStore()
         dummy_group = zarr.group(store=memory_store)
+        chunks = h5_item.chunks
+        if chunks is None:
+            # It's important to not have chunks be None here because that would
+            # let zarr choose an optimal chunking, whereas we need this to reflect
+            # the actual chunking in the HDF5 file.
+            chunks = h5_item.shape
+        if np.prod(chunks) == 0:
+            # A chunking of (0,) or (0, 0) or (0, 0, 0), etc. is not allowed in Zarr
+            chunks = [1] * len(chunks)
         # Importantly, I'm pretty sure this doesn't actually create the
         # chunks in the memory store. That's important because we just need
         # to get the .zarray JSON text from the dummy group.
         dummy_group.create_dataset(
             name="dummy_array",
             shape=h5_item.shape,
-            # It's important to not have chunks be None here because that would
-            # let zarr choose an optimal chunking, whereas we need this to reflect
-            # the actual chunking in the HDF5 file.
-            chunks=h5_item.chunks if h5_item.chunks is not None else h5_item.shape,
+            chunks=chunks,
             dtype=h5_item.dtype,
             compressor=None,
             order="C",
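For intuition, here is a minimal self-contained sketch of the dummy-array trick with the new fallback (assuming zarr 2.x; the shape and dtype are illustrative). Zarr rejects a chunk shape containing 0, so an all-ones chunking stands in for empty datasets; no chunk data is ever written, and only the generated .zarray JSON is used:

```python
import numpy as np
import zarr
from zarr.storage import MemoryStore

shape = (0, 0)   # hypothetical empty HDF5 dataset, stored contiguously
chunks = shape   # what the pre-fix fallback would pass (chunks = h5_item.shape)

# A chunk shape containing 0 is not allowed in Zarr, so fall back to all-ones
if np.prod(chunks) == 0:
    chunks = [1] * len(chunks)

memory_store = MemoryStore()
dummy_group = zarr.group(store=memory_store)
dummy_group.create_dataset(
    name="dummy_array", shape=shape, chunks=chunks, dtype="int32", compressor=None
)
# Only the metadata key is needed; no chunk objects were created in the store
print(memory_store["dummy_array/.zarray"].decode())  # shape [0, 0], chunks [1, 1]
```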
@@ -434,6 +440,10 @@ def _add_chunk_info_to_refs(self, key_parent: str, add_ref: Callable, add_ref_ch
         h5_item = self._h5f.get('/' + key_parent, None)
         assert isinstance(h5_item, h5py.Dataset)
 
+        # If the shape is (0,), (0, 0), (0, 0, 0), etc., then do not add any chunk references
+        if np.prod(h5_item.shape) == 0:
+            return
+
         # For the case of a scalar dataset, we need to check a few things
         if h5_item.ndim == 0:
             if h5_item.chunks is not None:
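A note on this guard: `np.prod` of an empty tuple is 1, so scalar datasets (shape `()`) are not caught by the early return; only shapes that actually contain a 0 produce a zero product. A quick check:

```python
import numpy as np

print(np.prod(()))         # 1.0 -> scalar dataset: chunk refs are still added
print(np.prod((0,)))       # 0   -> empty 1-D dataset: no chunk refs
print(np.prod((0, 0, 0)))  # 0   -> empty 3-D dataset: no chunk refs
print(np.prod((3, 4)))     # 12  -> ordinary dataset: proceed as usual
```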
@@ -589,8 +599,10 @@ def _add_ref(key: str, content: Union[bytes, None]):
             )
 
         def _add_ref_chunk(key: str, data: Tuple[str, int, int]):
-            assert data[1] is not None
-            assert data[2] is not None
+            assert data[1] is not None, \
+                f"{key} chunk data is invalid. Element at index 1 cannot be None: {data}"
+            assert data[2] is not None, \
+                f"{key} chunk data is invalid. Element at index 2 cannot be None: {data}"
             ret["refs"][key] = list(data)  # downstream expects a list like on read from a JSON file
 
         def _process_group(key, item: h5py.Group):
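Each chunk reference stored here appears to be a kerchunk-style `(url, offset, size)` triple, which is why the offset and size must never be `None`. A small self-contained sketch of the improved failure message (the `ret` dict and keys are illustrative, not the store's actual state):

```python
from typing import Tuple

ret = {"refs": {}}  # illustrative stand-in for the reference file system dict

def _add_ref_chunk(key: str, data: Tuple[str, int, int]):
    assert data[1] is not None, \
        f"{key} chunk data is invalid. Element at index 1 cannot be None: {data}"
    assert data[2] is not None, \
        f"{key} chunk data is invalid. Element at index 2 cannot be None: {data}"
    ret["refs"][key] = list(data)  # stored as a list, as if read back from JSON

_add_ref_chunk("X/0.0", ("file:///tmp/test.h5", 2048, 4096))  # ok
try:
    _add_ref_chunk("X/0.1", ("file:///tmp/test.h5", None, None))  # type: ignore
except AssertionError as e:
    print(e)  # X/0.1 chunk data is invalid. Element at index 1 cannot be None: ...
```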
22 changes: 22 additions & 0 deletions tests/test_core.py
@@ -575,5 +575,27 @@ def test_numpy_array_of_byte_strings():
     assert lists_are_equal(X1[:].tolist(), X2[:].tolist())  # type: ignore
 
 
+def test_dataset_zero_shape():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        filename = f"{tmpdir}/test.h5"
+        with h5py.File(filename, "w") as f:
+            f.create_dataset("X1D", data=np.array([], dtype=np.int32), shape=(0,))  # NOTE this is not a scalar
+            f.create_dataset("X3D", data=np.array([], dtype=np.int32), shape=(0, 0, 0))
+        h5f = h5py.File(filename, "r")
+        with LindiH5ZarrStore.from_file(filename, url=filename) as store:
+            rfs = store.to_reference_file_system()
+            h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs)
+            X1 = h5f['X1D']
+            assert isinstance(X1, h5py.Dataset)
+            X2 = h5f_2['X1D']
+            assert isinstance(X2, h5py.Dataset)
+            assert arrays_are_equal(X1[:], X2[:])
+            X1 = h5f['X3D']
+            assert isinstance(X1, h5py.Dataset)
+            X2 = h5f_2['X3D']
+            assert isinstance(X2, h5py.Dataset)
+            assert arrays_are_equal(X1[:], X2[:])
+
+
 if __name__ == '__main__':
     pass
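As a follow-on check (a sketch, assuming the `rfs` from the test above): with the early return in `_add_chunk_info_to_refs`, the reference file system for these empty datasets should carry only metadata keys (`.zgroup`, `.zattrs`, `X1D/.zarray`, ...) and no chunk entries such as `X1D/0` or `X3D/0.0.0`:

```python
# Chunk reference keys have a last path component that does not start with "."
chunk_keys = [
    k for k in rfs["refs"]
    if not k.split("/")[-1].startswith(".")
]
assert chunk_keys == []  # no chunk refs for zero-sized datasets
```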
