Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chunked writing of h5py.Dataset and zarr.Array #1624

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
43 changes: 40 additions & 3 deletions src/anndata/_io/specs/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,13 +373,10 @@
# It's in the `AnnData.concatenate` docstring, but should we keep it?
@_REGISTRY.register_write(H5Group, views.ArrayView, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.ndarray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, h5py.Dataset, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, views.ArrayView, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.ndarray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, h5py.Dataset, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0"))
def write_basic(
f: GroupStorageType,
k: str,
Expand All @@ -392,6 +389,46 @@
f.create_dataset(k, data=elem, **dataset_kwargs)


def _iter_chunks_for_copy(elem, dest):
ilan-gold marked this conversation as resolved.
Show resolved Hide resolved
"""
Returns an iterator of tuples of slices for copying chunks from `elem` to `dest`.

* If `elem` has chunks, it will return the chunks of `elem`.
* If `dest` has chunks, it will return the chunks of `dest`.
* If neither is chunked, we write it in ~100MB chunks or 1000 rows, whichever is larger.
"""
if elem.chunks:
return elem.iter_chunks()

Check warning on line 401 in src/anndata/_io/specs/methods.py

View check run for this annotation

Codecov / codecov/patch

src/anndata/_io/specs/methods.py#L401

Added line #L401 was not covered by tests
elif dest.chunks:
return dest.iter_chunks()

Check warning on line 403 in src/anndata/_io/specs/methods.py

View check run for this annotation

Codecov / codecov/patch

src/anndata/_io/specs/methods.py#L403

Added line #L403 was not covered by tests
else:
itemsize = elem.dtype.itemsize
shape = elem.shape
entry_chunk_size = 100 * 1024 * 1024 // itemsize # number of elements to write
n_rows = max(
entry_chunk_size // shape[0], 1000
) # Number of rows that works out to
return (slice(i, min(i + n_rows, shape[0])) for i in range(0, shape[0], n_rows))
ilan-gold marked this conversation as resolved.
Show resolved Hide resolved


@_REGISTRY.register_write(H5Group, H5Array, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, ZarrArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, H5Array, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0"))
def write_chunked_dense_array(
    f: GroupStorageType,
    k: str,
    elem,
    *,
    _writer: Writer,
    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
    """
    Write the array `elem` into group `f` under key `k`, one piece at a time.

    The destination dataset is created with ``f.create_dataset_like`` (with
    `dataset_kwargs` forwarded to it), then filled region by region using the
    selections produced by `_iter_chunks_for_copy`.

    `_writer` is accepted as part of the writer signature but is not used here.
    """
    target = f.create_dataset_like(k, elem, **dataset_kwargs)

    # Copy region by region instead of reading all of `elem` in one go.
    for region in _iter_chunks_for_copy(elem, target):
        piece = elem[region]
        target[region] = piece


_REGISTRY.register_write(H5Group, CupyArray, IOSpec("array", "0.2.0"))(
_to_cpu_mem_wrapper(write_basic)
)
Expand Down
12 changes: 12 additions & 0 deletions tests/test_io_elementwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,18 @@ def create_sparse_store(
pytest.param(
pd.array([True, False, True, True]), "nullable-boolean", id="pd_arr_bool"
),
pytest.param(
zarr.ones((100, 100), chunks=(10, 10)),
"array",
id="zarr_dense_array",
),
pytest.param(
create_dense_store(
h5py.File("test1.h5", mode="w", driver="core", backing_store=False)
)["X"],
"array",
id="h5_dense_array",
),
# pytest.param(bytes, b"some bytes", "bytes", id="py_bytes"), # Does not work for zarr
# TODO consider how specific encodings should be. Should we be fully describing the written type?
# Currently the info we add is: "what you wouldn't be able to figure out yourself"
Expand Down
Loading