Chunked writing of h5py.Dataset and zarr.Array #1624

Open · wants to merge 5 commits into base: main
44 changes: 42 additions & 2 deletions src/anndata/_io/specs/methods.py
@@ -373,13 +373,12 @@
# It's in the `AnnData.concatenate` docstring, but should we keep it?
@_REGISTRY.register_write(H5Group, views.ArrayView, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.ndarray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, h5py.Dataset, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, views.ArrayView, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.ndarray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, h5py.Dataset, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, H5Array, IOSpec("array", "0.2.0"))
def write_basic(
    f: GroupStorageType,
    k: str,
@@ -392,6 +391,47 @@
    f.create_dataset(k, data=elem, **dataset_kwargs)


def _iter_chunks_for_copy(elem: ArrayStorageType, dest: ArrayStorageType):
    """
    Returns an iterator of tuples of slices for copying chunks from `elem` to `dest`.

    * If `dest` has chunks, it will return the chunks of `dest`.
    * If `dest` is not chunked, we write it in ~100MB chunks or 1000 rows, whichever is larger.
    """
    if dest.chunks:
        return dest.iter_chunks()
    else:
        itemsize = elem.dtype.itemsize
        shape = elem.shape
        entry_chunk_size = 100 * 1024 * 1024 // itemsize  # number of elements to write
        n_rows = max(
            entry_chunk_size // shape[0], 1000
        )  # number of rows that works out to
        return (slice(i, min(i + n_rows, shape[0])) for i in range(0, shape[0], n_rows))
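
As a point of reference (not part of the diff), here is a minimal sketch of what the fallback branch above computes for an unchunked destination, assuming a float64 source of shape (5000, 2000); the concrete numbers are illustrative only.

```python
import numpy as np

# Illustrative stand-in for `elem`; any array-like with .dtype and .shape works here.
elem = np.zeros((5000, 2000), dtype="float64")

itemsize = elem.dtype.itemsize                         # 8 bytes per element
entry_chunk_size = 100 * 1024 * 1024 // itemsize       # 13_107_200 elements fit in ~100MB
n_rows = max(entry_chunk_size // elem.shape[0], 1000)  # 2621 rows per slice for this shape
slices = [
    slice(i, min(i + n_rows, elem.shape[0]))
    for i in range(0, elem.shape[0], n_rows)
]
print(slices)  # [slice(0, 2621, None), slice(2621, 5000, None)]
```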


@_REGISTRY.register_write(H5Group, H5Array, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, ZarrArray, IOSpec("array", "0.2.0"))
def write_chunked_dense_array_to_h5(
    f: GroupStorageType,
    k: str,
    elem: ArrayStorageType,
    *,
    _writer: Writer,
    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
    """Write to an h5py.Dataset in chunks.

    `h5py.Group.create_dataset(..., data: h5py.Dataset)` will load all of `data` into memory
    before writing, so we write in chunks instead to avoid this. We don't need to do this for
    zarr since zarr handles chunked writes automatically.
    """
    dest = f.create_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)

    for chunk in _iter_chunks_for_copy(elem, dest):
        dest[chunk] = elem[chunk]


_REGISTRY.register_write(H5Group, CupyArray, IOSpec("array", "0.2.0"))(
    _to_cpu_mem_wrapper(write_basic)
)
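
For context, a hedged usage sketch of the path this PR adds: copying a large on-disk dataset between HDF5 files through anndata's element writer without materializing it in memory. The file names and sizes are made up, and the import location of `write_elem` may differ between anndata versions (it has lived under `anndata.experimental`).

```python
import h5py
import numpy as np
from anndata.experimental import write_elem  # import path may vary by anndata version

# Create a source file with a dense dataset (illustrative sizes/names).
with h5py.File("source.h5", "w") as f:
    f.create_dataset("X", data=np.random.rand(10_000, 500))

with h5py.File("source.h5", "r") as src, h5py.File("dest.h5", "w") as dst:
    # With this PR, the h5py.Dataset is streamed chunk by chunk
    # (via write_chunked_dense_array_to_h5) instead of being read fully into memory.
    write_elem(dst, "X", src["X"])
```

Before this change, the same call would have gone through `write_basic` and hence `create_dataset(..., data=elem)`, pulling the whole array into memory, as the docstring above notes.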
12 changes: 12 additions & 0 deletions tests/test_io_elementwise.py
@@ -166,6 +166,18 @@ def create_sparse_store(
        pytest.param(
            pd.array([True, False, True, True]), "nullable-boolean", id="pd_arr_bool"
        ),
        pytest.param(
            zarr.ones((100, 100), chunks=(10, 10)),
            "array",
            id="zarr_dense_array",
        ),
        pytest.param(
            create_dense_store(
                h5py.File("test1.h5", mode="w", driver="core", backing_store=False)
            )["X"],
            "array",
            id="h5_dense_array",
        ),
        # pytest.param(bytes, b"some bytes", "bytes", id="py_bytes"), # Does not work for zarr
        # TODO consider how specific encodings should be. Should we be fully describing the written type?
        # Currently the info we add is: "what you wouldn't be able to figure out yourself"
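
One note on the new `h5_dense_array` parameter above: the `core` driver with `backing_store=False` keeps the HDF5 file entirely in memory, so no `test1.h5` is ever written to disk. The `create_dense_store` helper itself is not shown in this hunk; the sketch below is only an assumption about the pattern it follows (a dense array stored under `"X"`).

```python
import h5py
import numpy as np

def create_dense_store(store, shape=(100, 100)):
    # Hypothetical stand-in for the test helper: put a dense array at key "X".
    store["X"] = np.random.rand(*shape)
    return store

# The file lives only in RAM (core driver, no backing store) and vanishes on close.
mem_file = h5py.File("test1.h5", mode="w", driver="core", backing_store=False)
dense_h5 = create_dense_store(mem_file)["X"]  # an h5py.Dataset backed by memory
```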