Chunked writing of h5py.Dataset and zarr.Array #1624

Open · wants to merge 5 commits into base: main
44 changes: 42 additions & 2 deletions src/anndata/_io/specs/methods.py
@@ -373,13 +373,12 @@
# It's in the `AnnData.concatenate` docstring, but should we keep it?
@_REGISTRY.register_write(H5Group, views.ArrayView, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.ndarray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, h5py.Dataset, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, views.ArrayView, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.ndarray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, h5py.Dataset, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, H5Array, IOSpec("array", "0.2.0"))
def write_basic(
    f: GroupStorageType,
    k: str,
@@ -392,6 +391,47 @@
    f.create_dataset(k, data=elem, **dataset_kwargs)


def _iter_chunks_for_copy(elem: ArrayStorageType, dest: ArrayStorageType):
    """
    Returns an iterator of tuples of slices for copying chunks from `elem` to `dest`.

    * If `dest` has chunks, it will return the chunks of `dest`.
    * If `dest` is not chunked, we write it in ~100MB chunks or 1000 rows, whichever is larger.
    """
    if dest.chunks:
        return dest.iter_chunks()
    else:
        itemsize = elem.dtype.itemsize
        shape = elem.shape
        entry_chunk_size = 100 * 1024 * 1024 // itemsize  # number of elements to write
        n_rows = max(
            entry_chunk_size // shape[0], 1000
        )  # number of rows that works out to
        return (slice(i, min(i + n_rows, shape[0])) for i in range(0, shape[0], n_rows))
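
As a point of reference (not part of the diff), here is a minimal sketch of what the fallback branch above computes for an unchunked destination, assuming a float64 source of shape (5000, 2000); the concrete numbers are illustrative only.

```python
import numpy as np

# Illustrative stand-in for `elem`; any array-like with .dtype and .shape works here.
elem = np.zeros((5000, 2000), dtype="float64")

itemsize = elem.dtype.itemsize                         # 8 bytes per element
entry_chunk_size = 100 * 1024 * 1024 // itemsize       # 13_107_200 elements fit in ~100MB
n_rows = max(entry_chunk_size // elem.shape[0], 1000)  # 2621 rows per slice for this shape
slices = [
    slice(i, min(i + n_rows, elem.shape[0]))
    for i in range(0, elem.shape[0], n_rows)
]
print(slices)  # [slice(0, 2621, None), slice(2621, 5000, None)]
```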


@_REGISTRY.register_write(H5Group, H5Array, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, ZarrArray, IOSpec("array", "0.2.0"))
def write_chunked_dense_array_to_h5(
    f: GroupStorageType,
    k: str,
    elem: ArrayStorageType,
    *,
    _writer: Writer,
    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
    """Write to an h5py.Dataset in chunks.

    `h5py.Group.create_dataset(..., data: h5py.Dataset)` will load all of `data` into memory
    before writing, so we write in chunks instead to avoid this. We don't need to do this for
    zarr since zarr handles chunked writes automatically.
    """
    dest = f.create_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)

    for chunk in _iter_chunks_for_copy(elem, dest):
        dest[chunk] = elem[chunk]


_REGISTRY.register_write(H5Group, CupyArray, IOSpec("array", "0.2.0"))(
    _to_cpu_mem_wrapper(write_basic)
)
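
For context, a hedged usage sketch of the path this PR adds: copying a large on-disk dataset between HDF5 files through anndata's element writer without materializing it in memory. The file names and sizes are made up, and the import location of `write_elem` may differ between anndata versions (it has lived under `anndata.experimental`).

```python
import h5py
import numpy as np
from anndata.experimental import write_elem  # import path may vary by anndata version

# Create a source file with a dense dataset (illustrative sizes/names).
with h5py.File("source.h5", "w") as f:
    f.create_dataset("X", data=np.random.rand(10_000, 500))

with h5py.File("source.h5", "r") as src, h5py.File("dest.h5", "w") as dst:
    # With this PR, the h5py.Dataset is streamed chunk by chunk
    # (via write_chunked_dense_array_to_h5) instead of being read fully into memory.
    write_elem(dst, "X", src["X"])
```

Before this change, the same call would have gone through `write_basic` and hence `create_dataset(..., data=elem)`, pulling the whole array into memory, as the docstring above notes.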
12 changes: 12 additions & 0 deletions tests/test_io_elementwise.py
@@ -166,6 +166,18 @@ def create_sparse_store(
        pytest.param(
            pd.array([True, False, True, True]), "nullable-boolean", id="pd_arr_bool"
        ),
        pytest.param(
            zarr.ones((100, 100), chunks=(10, 10)),
            "array",
            id="zarr_dense_array",
        ),
        pytest.param(
            create_dense_store(
                h5py.File("test1.h5", mode="w", driver="core", backing_store=False)
            )["X"],
            "array",
            id="h5_dense_array",
        ),
        # pytest.param(bytes, b"some bytes", "bytes", id="py_bytes"), # Does not work for zarr
        # TODO consider how specific encodings should be. Should we be fully describing the written type?
        # Currently the info we add is: "what you wouldn't be able to figure out yourself"
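
One note on the new `h5_dense_array` parameter above: the `core` driver with `backing_store=False` keeps the HDF5 file entirely in memory, so no `test1.h5` is ever written to disk. The `create_dense_store` helper itself is not shown in this hunk; the sketch below is only an assumption about the pattern it follows (a dense array stored under `"X"`).

```python
import h5py
import numpy as np

def create_dense_store(store, shape=(100, 100)):
    # Hypothetical stand-in for the test helper: put a dense array at key "X".
    store["X"] = np.random.rand(*shape)
    return store

# The file lives only in RAM (core driver, no backing store) and vanishes on close.
mem_file = h5py.File("test1.h5", mode="w", driver="core", backing_store=False)
dense_h5 = create_dense_store(mem_file)["X"]  # an h5py.Dataset backed by memory
```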