Merge pull request #77 from NeurodataWithoutBorders/fix_zero_shape
Fix handling of datasets with shape (0,0,...)
magland authored May 29, 2024
2 parents 731fcb5 + ffea54a commit 93487db
Showing 3 changed files with 43 additions and 7 deletions.
24 changes: 18 additions & 6 deletions lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
@@ -303,16 +303,22 @@ def _get_zarray_bytes(self, parent_key: str):
         # dtype, and filters and then copy the .zarray JSON text from it
         memory_store = MemoryStore()
         dummy_group = zarr.group(store=memory_store)
+        chunks = h5_item.chunks
+        if chunks is None:
+            # It's important to not have chunks be None here because that would
+            # let zarr choose an optimal chunking, whereas we need this to reflect
+            # the actual chunking in the HDF5 file.
+            chunks = h5_item.shape
+        if np.prod(chunks) == 0:
+            # A chunking of (0,) or (0, 0) or (0, 0, 0), etc. is not allowed in Zarr
+            chunks = [1] * len(chunks)
         # Importantly, I'm pretty sure this doesn't actually create the
         # chunks in the memory store. That's important because we just need
         # to get the .zarray JSON text from the dummy group.
         dummy_group.create_dataset(
             name="dummy_array",
             shape=h5_item.shape,
-            # It's important to not have chunks be None here because that would
-            # let zarr choose an optimal chunking, whereas we need this to reflect
-            # the actual chunking in the HDF5 file.
-            chunks=h5_item.chunks if h5_item.chunks is not None else h5_item.shape,
+            chunks=chunks,
             dtype=h5_item.dtype,
             compressor=None,
             order="C",
@@ -434,6 +440,10 @@ def _add_chunk_info_to_refs(self, key_parent: str, add_ref: Callable, add_ref_ch
         h5_item = self._h5f.get('/' + key_parent, None)
         assert isinstance(h5_item, h5py.Dataset)
 
+        # If the shape is (0,), (0, 0), (0, 0, 0), etc., then do not add any chunk references
+        if np.prod(h5_item.shape) == 0:
+            return
+
         # For the case of a scalar dataset, we need to check a few things
         if h5_item.ndim == 0:
             if h5_item.chunks is not None:
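The effect of this early return, as a hedged sketch (import path and key names assumed to match the test file and zarr conventions): an empty dataset keeps its zarr metadata in the reference file system but gets no chunk references.

```python
import h5py
import numpy as np
from lindi import LindiH5ZarrStore

with h5py.File("empty.h5", "w") as f:
    f.create_dataset("X", data=np.array([], dtype=np.int32), shape=(0, 0))

with LindiH5ZarrStore.from_file("empty.h5", url="empty.h5") as store:
    rfs = store.to_reference_file_system()

# Metadata for the dataset is present ...
assert "X/.zarray" in rfs["refs"]
# ... but no chunk references such as "X/0.0" were added.
assert not any(k.startswith("X/0") for k in rfs["refs"])
```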
@@ -589,8 +599,10 @@ def _add_ref(key: str, content: Union[bytes, None]):
             )
 
         def _add_ref_chunk(key: str, data: Tuple[str, int, int]):
-            assert data[1] is not None
-            assert data[2] is not None
+            assert data[1] is not None, \
+                f"{key} chunk data is invalid. Element at index 1 cannot be None: {data}"
+            assert data[2] is not None, \
+                f"{key} chunk data is invalid. Element at index 2 cannot be None: {data}"
             ret["refs"][key] = list(data)  # downstream expects a list like on read from a JSON file
 
         def _process_group(key, item: h5py.Group):
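For context, the triple passed to `_add_ref_chunk` is (url, byte offset, byte length), as the type hint indicates, and it is stored as a list so the in-memory structure matches a JSON round trip. A small illustration with made-up values:

```python
# ret["refs"] maps a chunk key to [url, byte_offset, byte_length]; the
# offset/length values below are invented for illustration only.
ret = {"refs": {}}
key = "X/0.0"
data = ("test.h5", 2048, 400)
ret["refs"][key] = list(data)
assert ret["refs"][key] == ["test.h5", 2048, 400]
```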
4 changes: 3 additions & 1 deletion lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py
@@ -115,7 +115,9 @@ def __init__(self, rfs: dict, *, mode: Literal["r", "r+"] = "r+", local_cache: U
         self.local_cache = local_cache
 
     # These methods are overridden from MutableMapping
-    def __contains__(self, key: str):
+    def __contains__(self, key: object):
+        if not isinstance(key, str):
+            return False
         return key in self.rfs["refs"]
 
     def __getitem__(self, key: str):
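A minimal illustration of the widened `__contains__` signature (the top-level import path is an assumption):

```python
from lindi import LindiReferenceFileSystemStore  # import path assumed

store = LindiReferenceFileSystemStore({"refs": {"X/.zarray": "{}"}})
assert "X/.zarray" in store   # string keys are looked up in rfs["refs"]
assert 0 not in store         # non-str keys return False, matching MutableMapping
assert [1, 2] not in store    # unhashable keys no longer raise TypeError
```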
22 changes: 22 additions & 0 deletions tests/test_core.py
@@ -575,5 +575,27 @@ def test_numpy_array_of_byte_strings():
     assert lists_are_equal(X1[:].tolist(), X2[:].tolist())  # type: ignore
 
 
+def test_dataset_zero_shape():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        filename = f"{tmpdir}/test.h5"
+        with h5py.File(filename, "w") as f:
+            f.create_dataset("X1D", data=np.array([], dtype=np.int32), shape=(0,))  # NOTE this is not a scalar
+            f.create_dataset("X3D", data=np.array([], dtype=np.int32), shape=(0,0,0))
+        h5f = h5py.File(filename, "r")
+        with LindiH5ZarrStore.from_file(filename, url=filename) as store:
+            rfs = store.to_reference_file_system()
+            h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs)
+            X1 = h5f['X1D']
+            assert isinstance(X1, h5py.Dataset)
+            X2 = h5f_2['X1D']
+            assert isinstance(X2, h5py.Dataset)
+            assert arrays_are_equal(X1[:], X2[:])
+            X1 = h5f['X3D']
+            assert isinstance(X1, h5py.Dataset)
+            X2 = h5f_2['X3D']
+            assert isinstance(X2, h5py.Dataset)
+            assert arrays_are_equal(X1[:], X2[:])
+
+
 if __name__ == '__main__':
     pass
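The new test can be run in isolation, e.g. `pytest tests/test_core.py -k test_dataset_zero_shape`, assuming the project's usual pytest workflow.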