Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix handling of datasets with shape (0,0,...) #77

Merged
merged 2 commits into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 18 additions & 6 deletions lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,16 +303,22 @@ def _get_zarray_bytes(self, parent_key: str):
# dtype, and filters and then copy the .zarray JSON text from it
memory_store = MemoryStore()
dummy_group = zarr.group(store=memory_store)
chunks = h5_item.chunks
if chunks is None:
# It's important to not have chunks be None here because that would
# let zarr choose an optimal chunking, whereas we need this to reflect
# the actual chunking in the HDF5 file.
chunks = h5_item.shape
if np.prod(chunks) == 0:
# A chunking of (0,) or (0, 0) or (0, 0, 0), etc. is not allowed in Zarr
chunks = [1] * len(chunks)
# Importantly, I'm pretty sure this doesn't actually create the
# chunks in the memory store. That's important because we just need
# to get the .zarray JSON text from the dummy group.
dummy_group.create_dataset(
name="dummy_array",
shape=h5_item.shape,
# It's important to not have chunks be None here because that would
# let zarr choose an optimal chunking, whereas we need this to reflect
# the actual chunking in the HDF5 file.
chunks=h5_item.chunks if h5_item.chunks is not None else h5_item.shape,
chunks=chunks,
dtype=h5_item.dtype,
compressor=None,
order="C",
Expand Down Expand Up @@ -434,6 +440,10 @@ def _add_chunk_info_to_refs(self, key_parent: str, add_ref: Callable, add_ref_ch
h5_item = self._h5f.get('/' + key_parent, None)
assert isinstance(h5_item, h5py.Dataset)

# If the shape is (0,), (0, 0), (0, 0, 0), etc., then do not add any chunk references
if np.prod(h5_item.shape) == 0:
return

# For the case of a scalar dataset, we need to check a few things
if h5_item.ndim == 0:
if h5_item.chunks is not None:
Expand Down Expand Up @@ -589,8 +599,10 @@ def _add_ref(key: str, content: Union[bytes, None]):
)

def _add_ref_chunk(key: str, data: Tuple[str, int, int]):
assert data[1] is not None
assert data[2] is not None
assert data[1] is not None, \
f"{key} chunk data is invalid. Element at index 1 cannot be None: {data}"
assert data[2] is not None, \
f"{key} chunk data is invalid. Element at index 2 cannot be None: {data}"
ret["refs"][key] = list(data) # downstream expects a list like on read from a JSON file

def _process_group(key, item: h5py.Group):
Expand Down
4 changes: 3 additions & 1 deletion lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,9 @@ def __init__(self, rfs: dict, *, mode: Literal["r", "r+"] = "r+", local_cache: U
self.local_cache = local_cache

# These methods are overridden from MutableMapping
def __contains__(self, key: str):
def __contains__(self, key: object):
if not isinstance(key, str):
return False
return key in self.rfs["refs"]

def __getitem__(self, key: str):
Expand Down
22 changes: 22 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,5 +575,27 @@ def test_numpy_array_of_byte_strings():
assert lists_are_equal(X1[:].tolist(), X2[:].tolist()) # type: ignore


def test_dataset_zero_shape():
    """Round-trip datasets whose shape has a zero extent, e.g. (0,) and (0, 0, 0).

    Verifies that LindiH5ZarrStore can build a reference file system for
    zero-size (but non-scalar) datasets and that reading them back through
    LindiH5pyFile yields arrays equal to the originals.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        filename = f"{tmpdir}/test.h5"
        with h5py.File(filename, "w") as f:
            f.create_dataset("X1D", data=np.array([], dtype=np.int32), shape=(0,))  # NOTE this is not a scalar
            f.create_dataset("X3D", data=np.array([], dtype=np.int32), shape=(0, 0, 0))
        # Use a context manager so the read handle is closed before the
        # temporary directory is cleaned up (open handles can break cleanup,
        # notably on Windows).
        with h5py.File(filename, "r") as h5f:
            with LindiH5ZarrStore.from_file(filename, url=filename) as store:
                rfs = store.to_reference_file_system()
                h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs)
                # Same checks for the 1-D and 3-D zero-size datasets.
                for name in ("X1D", "X3D"):
                    X1 = h5f[name]
                    assert isinstance(X1, h5py.Dataset)
                    X2 = h5f_2[name]
                    assert isinstance(X2, h5py.Dataset)
                    assert arrays_are_equal(X1[:], X2[:])


# Intentional no-op entry point: these tests are run via pytest, not by
# executing this file directly.
if __name__ == '__main__':
    pass