Merge pull request #77 from NeurodataWithoutBorders/fix_zero_shape
Fix handling of datasets with shape (0,0,...)
magland authored May 29, 2024
2 parents 731fcb5 + ffea54a commit 93487db
Showing 3 changed files with 43 additions and 7 deletions.
24 changes: 18 additions & 6 deletions lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
@@ -303,16 +303,22 @@ def _get_zarray_bytes(self, parent_key: str):
         # dtype, and filters and then copy the .zarray JSON text from it
         memory_store = MemoryStore()
         dummy_group = zarr.group(store=memory_store)
+        chunks = h5_item.chunks
+        if chunks is None:
+            # It's important to not have chunks be None here because that would
+            # let zarr choose an optimal chunking, whereas we need this to reflect
+            # the actual chunking in the HDF5 file.
+            chunks = h5_item.shape
+        if np.prod(chunks) == 0:
+            # A chunking of (0,) or (0, 0) or (0, 0, 0), etc. is not allowed in Zarr
+            chunks = [1] * len(chunks)
         # Importantly, I'm pretty sure this doesn't actually create the
         # chunks in the memory store. That's important because we just need
         # to get the .zarray JSON text from the dummy group.
         dummy_group.create_dataset(
             name="dummy_array",
             shape=h5_item.shape,
-            # It's important to not have chunks be None here because that would
-            # let zarr choose an optimal chunking, whereas we need this to reflect
-            # the actual chunking in the HDF5 file.
-            chunks=h5_item.chunks if h5_item.chunks is not None else h5_item.shape,
+            chunks=chunks,
             dtype=h5_item.dtype,
             compressor=None,
             order="C",
@@ -434,6 +440,10 @@ def _add_chunk_info_to_refs(self, key_parent: str, add_ref: Callable, add_ref_ch
         h5_item = self._h5f.get('/' + key_parent, None)
         assert isinstance(h5_item, h5py.Dataset)
 
+        # If the shape is (0,), (0, 0), (0, 0, 0), etc., then do not add any chunk references
+        if np.prod(h5_item.shape) == 0:
+            return
+
         # For the case of a scalar dataset, we need to check a few things
         if h5_item.ndim == 0:
             if h5_item.chunks is not None:
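The effect of this early return, as a hedged sketch (import path and key names assumed to match the test file and zarr conventions): an empty dataset keeps its zarr metadata in the reference file system but gets no chunk references.

```python
import h5py
import numpy as np
from lindi import LindiH5ZarrStore

with h5py.File("empty.h5", "w") as f:
    f.create_dataset("X", data=np.array([], dtype=np.int32), shape=(0, 0))

with LindiH5ZarrStore.from_file("empty.h5", url="empty.h5") as store:
    rfs = store.to_reference_file_system()

# Metadata for the dataset is present ...
assert "X/.zarray" in rfs["refs"]
# ... but no chunk references such as "X/0.0" were added.
assert not any(k.startswith("X/0") for k in rfs["refs"])
```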
@@ -589,8 +599,10 @@ def _add_ref(key: str, content: Union[bytes, None]):
             )
 
         def _add_ref_chunk(key: str, data: Tuple[str, int, int]):
-            assert data[1] is not None
-            assert data[2] is not None
+            assert data[1] is not None, \
+                f"{key} chunk data is invalid. Element at index 1 cannot be None: {data}"
+            assert data[2] is not None, \
+                f"{key} chunk data is invalid. Element at index 2 cannot be None: {data}"
             ret["refs"][key] = list(data)  # downstream expects a list like on read from a JSON file
 
         def _process_group(key, item: h5py.Group):
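For context, the triple passed to `_add_ref_chunk` is (url, byte offset, byte length), as the type hint indicates, and it is stored as a list so the in-memory structure matches a JSON round trip. A small illustration with made-up values:

```python
# ret["refs"] maps a chunk key to [url, byte_offset, byte_length]; the
# offset/length values below are invented for illustration only.
ret = {"refs": {}}
key = "X/0.0"
data = ("test.h5", 2048, 400)
ret["refs"][key] = list(data)
assert ret["refs"][key] == ["test.h5", 2048, 400]
```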
4 changes: 3 additions & 1 deletion lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py
@@ -115,7 +115,9 @@ def __init__(self, rfs: dict, *, mode: Literal["r", "r+"] = "r+", local_cache: U
         self.local_cache = local_cache
 
     # These methods are overridden from MutableMapping
-    def __contains__(self, key: str):
+    def __contains__(self, key: object):
+        if not isinstance(key, str):
+            return False
         return key in self.rfs["refs"]
 
     def __getitem__(self, key: str):
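A minimal illustration of the widened `__contains__` signature (the top-level import path is an assumption):

```python
from lindi import LindiReferenceFileSystemStore  # import path assumed

store = LindiReferenceFileSystemStore({"refs": {"X/.zarray": "{}"}})
assert "X/.zarray" in store   # string keys are looked up in rfs["refs"]
assert 0 not in store         # non-str keys return False, matching MutableMapping
assert [1, 2] not in store    # unhashable keys no longer raise TypeError
```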
22 changes: 22 additions & 0 deletions tests/test_core.py
@@ -575,5 +575,27 @@ def test_numpy_array_of_byte_strings():
     assert lists_are_equal(X1[:].tolist(), X2[:].tolist())  # type: ignore
 
 
+def test_dataset_zero_shape():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        filename = f"{tmpdir}/test.h5"
+        with h5py.File(filename, "w") as f:
+            f.create_dataset("X1D", data=np.array([], dtype=np.int32), shape=(0,))  # NOTE this is not a scalar
+            f.create_dataset("X3D", data=np.array([], dtype=np.int32), shape=(0,0,0))
+        h5f = h5py.File(filename, "r")
+        with LindiH5ZarrStore.from_file(filename, url=filename) as store:
+            rfs = store.to_reference_file_system()
+            h5f_2 = lindi.LindiH5pyFile.from_reference_file_system(rfs)
+            X1 = h5f['X1D']
+            assert isinstance(X1, h5py.Dataset)
+            X2 = h5f_2['X1D']
+            assert isinstance(X2, h5py.Dataset)
+            assert arrays_are_equal(X1[:], X2[:])
+            X1 = h5f['X3D']
+            assert isinstance(X1, h5py.Dataset)
+            X2 = h5f_2['X3D']
+            assert isinstance(X2, h5py.Dataset)
+            assert arrays_are_equal(X1[:], X2[:])
+
+
 if __name__ == '__main__':
     pass
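The new test can be run in isolation, e.g. `pytest tests/test_core.py -k test_dataset_zero_shape`, assuming the project's usual pytest workflow.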