From d60c3aba390e84dfe99431fa3505efeb776b0bb4 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Wed, 28 Aug 2024 16:40:04 -0700
Subject: [PATCH 1/8] Chunked writing of h5py.Dataset and zarr.Array

---
 src/anndata/_io/specs/methods.py | 45 +++++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 3 deletions(-)

diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py
index 794d60437..477d79be8 100644
--- a/src/anndata/_io/specs/methods.py
+++ b/src/anndata/_io/specs/methods.py
@@ -373,13 +373,10 @@ def write_list(
 # It's in the `AnnData.concatenate` docstring, but should we keep it?
 @_REGISTRY.register_write(H5Group, views.ArrayView, IOSpec("array", "0.2.0"))
 @_REGISTRY.register_write(H5Group, np.ndarray, IOSpec("array", "0.2.0"))
-@_REGISTRY.register_write(H5Group, h5py.Dataset, IOSpec("array", "0.2.0"))
 @_REGISTRY.register_write(H5Group, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
 @_REGISTRY.register_write(ZarrGroup, views.ArrayView, IOSpec("array", "0.2.0"))
 @_REGISTRY.register_write(ZarrGroup, np.ndarray, IOSpec("array", "0.2.0"))
-@_REGISTRY.register_write(ZarrGroup, h5py.Dataset, IOSpec("array", "0.2.0"))
 @_REGISTRY.register_write(ZarrGroup, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
-@_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0"))
 def write_basic(
     f: GroupStorageType,
     k: str,
@@ -392,6 +389,48 @@ def write_basic(
     dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
 ):
     """Write methods which underlying library handles natively."""
     f.create_dataset(k, data=elem, **dataset_kwargs)


+def _iter_chunks_for_copy(elem, dest):
+    """
+    Returns an iterator of tuples of slices for copying chunks from `elem` to `dest`.
+
+    * If `elem` has chunks, it will return the chunks of `elem`.
+    * If `dest` has chunks, it will return the chunks of `dest`.
+    * If neither is chunked, we write it in ~100MB chunks or 1000 rows, whichever is larger.
+    """
+    if elem.chunks:
+        return elem.iter_chunks()
+    elif dest.chunks:
+        return dest.iter_chunks()
+    else:
+        itemsize = elem.dtype.itemsize
+        shape = elem.shape
+        entry_chunk_size = (100 * 1024 * 1024 // itemsize)  # number of elements to write
+        n_rows = max(entry_chunk_size // shape[0], 1000)  # Number of rows that works out to
+        return zip(
+            (slice(i, min(i + n_rows, shape[0])) for i in range(0, shape[0], n_rows)),
+            (slice(None) for _ in range(0, shape[0], n_rows)),
+        )
+
+
+@_REGISTRY.register_write(H5Group, H5Array, IOSpec("array", "0.2.0"))
+@_REGISTRY.register_write(H5Group, ZarrArray, IOSpec("array", "0.2.0"))
+@_REGISTRY.register_write(ZarrGroup, H5Array, IOSpec("array", "0.2.0"))
+@_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0"))
+def write_chunked_dense_array(
+    f: GroupStorageType,
+    k: str,
+    elem,
+    *,
+    _writer: Writer,
+    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
+):
+    dest = f.create_dataset_like(k, elem, **dataset_kwargs)
+
+    for chunk in _iter_chunks_for_copy(elem, dest):
+        dest[chunk] = elem[chunk]
+
+
+
 _REGISTRY.register_write(H5Group, CupyArray, IOSpec("array", "0.2.0"))(
     _to_cpu_mem_wrapper(write_basic)
 )

From 232bee467c2ad4d43b7ff2857852fbf52c466038 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 28 Aug 2024 23:44:47 +0000
Subject: [PATCH 2/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/anndata/_io/specs/methods.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py
index 477d79be8..4b735b4c1 100644
--- a/src/anndata/_io/specs/methods.py
+++ b/src/anndata/_io/specs/methods.py
@@ -404,8 +404,10 @@ def _iter_chunks_for_copy(elem, dest):
     else:
         itemsize = elem.dtype.itemsize
         shape = elem.shape
-        entry_chunk_size = (100 * 1024 * 1024 // itemsize)  # number of elements to write
-        n_rows = max(entry_chunk_size // shape[0], 1000)  # Number of rows that works out to
+        entry_chunk_size = 100 * 1024 * 1024 // itemsize  # number of elements to write
+        n_rows = max(
+            entry_chunk_size // shape[0], 1000
+        )  # Number of rows that works out to
         return zip(
             (slice(i, min(i + n_rows, shape[0])) for i in range(0, shape[0], n_rows)),
             (slice(None) for _ in range(0, shape[0], n_rows)),
@@ -430,7 +432,6 @@ def write_chunked_dense_array(
         dest[chunk] = elem[chunk]


-
 _REGISTRY.register_write(H5Group, CupyArray, IOSpec("array", "0.2.0"))(
     _to_cpu_mem_wrapper(write_basic)
 )

From c43c5e2dfb7ff911f6b04071a00cbb770952e3e2 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Wed, 28 Aug 2024 16:52:21 -0700
Subject: [PATCH 3/8] Make n-dimensional

---
 src/anndata/_io/specs/methods.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py
index 4b735b4c1..ce8e46218 100644
--- a/src/anndata/_io/specs/methods.py
+++ b/src/anndata/_io/specs/methods.py
@@ -408,10 +408,7 @@ def _iter_chunks_for_copy(elem, dest):
         n_rows = max(
             entry_chunk_size // shape[0], 1000
         )  # Number of rows that works out to
-        return zip(
-            (slice(i, min(i + n_rows, shape[0])) for i in range(0, shape[0], n_rows)),
-            (slice(None) for _ in range(0, shape[0], n_rows)),
-        )
+        return (slice(i, min(i + n_rows, shape[0])) for i in range(0, shape[0], n_rows))


 @_REGISTRY.register_write(H5Group, H5Array, IOSpec("array", "0.2.0"))

From 749880bbc5dd353d22a0cac14dd6a28b56cecbcc Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Wed, 28 Aug 2024 17:23:11 -0700
Subject: [PATCH 4/8] Add some tests, which fail :(

---
 tests/test_io_elementwise.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index ed89dbe7f..a4d614c6f 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -166,6 +166,18 @@ def create_sparse_store(
     pytest.param(
         pd.array([True, False, True, True]), "nullable-boolean", id="pd_arr_bool"
     ),
+    pytest.param(
+        zarr.ones((100, 100), chunks=(10, 10)),
+        "array",
+        id="zarr_dense_array",
+    ),
+    pytest.param(
+        create_dense_store(
+            h5py.File("test1.h5", mode="w", driver="core", backing_store=False)
+        )["X"],
+        "array",
+        id="h5_dense_array",
+    ),
     # pytest.param(bytes, b"some bytes", "bytes", id="py_bytes"), # Does not work for zarr
     # TODO consider how specific encodings should be. Should we be fully describing the written type?
     # Currently the info we add is: "what you wouldn't be able to figure out yourself"

From 32e008de85edda70fbf3140fb0fef3082efaad3d Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Mon, 30 Sep 2024 15:41:19 -0700
Subject: [PATCH 5/8] Fix up chunking algorithm + add some types

---
 src/anndata/_io/specs/methods.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py
index ce8e46218..2861949ab 100644
--- a/src/anndata/_io/specs/methods.py
+++ b/src/anndata/_io/specs/methods.py
@@ -377,6 +377,8 @@ def write_list(
 @_REGISTRY.register_write(ZarrGroup, views.ArrayView, IOSpec("array", "0.2.0"))
 @_REGISTRY.register_write(ZarrGroup, np.ndarray, IOSpec("array", "0.2.0"))
 @_REGISTRY.register_write(ZarrGroup, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
+@_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0"))
+@_REGISTRY.register_write(ZarrGroup, H5Array, IOSpec("array", "0.2.0"))
 def write_basic(
     f: GroupStorageType,
     k: str,
@@ -389,17 +391,14 @@ def write_basic(
     f.create_dataset(k, data=elem, **dataset_kwargs)


-def _iter_chunks_for_copy(elem, dest):
+def _iter_chunks_for_copy(elem: ArrayStorageType, dest: ArrayStorageType):
     """
     Returns an iterator of tuples of slices for copying chunks from `elem` to `dest`.

-    * If `elem` has chunks, it will return the chunks of `elem`.
     * If `dest` has chunks, it will return the chunks of `dest`.
-    * If neither is chunked, we write it in ~100MB chunks or 1000 rows, whichever is larger.
+    * If `dest` is not chunked, we write it in ~100MB chunks or 1000 rows, whichever is larger.
     """
-    if elem.chunks:
-        return elem.iter_chunks()
-    elif dest.chunks:
+    if dest.chunks:
         return dest.iter_chunks()
     else:
         itemsize = elem.dtype.itemsize
@@ -413,17 +412,21 @@ def _iter_chunks_for_copy(elem: ArrayStorageType, dest: ArrayStorageType):

 @_REGISTRY.register_write(H5Group, H5Array, IOSpec("array", "0.2.0"))
 @_REGISTRY.register_write(H5Group, ZarrArray, IOSpec("array", "0.2.0"))
-@_REGISTRY.register_write(ZarrGroup, H5Array, IOSpec("array", "0.2.0"))
-@_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0"))
-def write_chunked_dense_array(
+def write_chunked_dense_array_to_h5(
     f: GroupStorageType,
     k: str,
-    elem,
+    elem: ArrayStorageType,
     *,
     _writer: Writer,
     dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
 ):
-    dest = f.create_dataset_like(k, elem, **dataset_kwargs)
+    """Write to a h5py.Dataset in chunks.
+
+    `h5py.Group.create_dataset(..., data: h5py.Dataset)` will load all of `data` into memory
+    before writing. Instead, we will write in chunks to avoid this. We don't need to do this for
+    zarr since zarr handles this automatically.
+    """
+    dest = f.create_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)

     for chunk in _iter_chunks_for_copy(elem, dest):
         dest[chunk] = elem[chunk]

From b2192a2148574b6cce4e0a49aef64d0a5e173fc0 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 15 Nov 2024 15:52:31 +0100
Subject: [PATCH 6/8] (chore): remove unneeded check?

---
 tests/test_io_dispatched.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py
index 833b23e83..4dba9b6aa 100644
--- a/tests/test_io_dispatched.py
+++ b/tests/test_io_dispatched.py
@@ -180,7 +180,5 @@ def zarr_reader(func, elem_name: str, elem, iospec):
     write_dispatched(f, "/", adata, callback=zarr_writer)
     _ = read_dispatched(f, zarr_reader)

-    assert h5ad_write_keys == zarr_write_keys
-    assert h5ad_read_keys == zarr_read_keys
-
-    assert sorted(h5ad_write_keys) == sorted(h5ad_read_keys)
+    assert sorted(h5ad_write_keys) == sorted(zarr_write_keys)
+    assert sorted(h5ad_read_keys) == sorted(zarr_read_keys)

From 5938d86f0130265fa126ff3fddaac7e99143e41e Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 15 Nov 2024 15:52:56 +0100
Subject: [PATCH 7/8] (fix): dispatch to chunked writing for dense arrays

---
 src/anndata/_io/specs/methods.py | 38 ++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py
index 2861949ab..6379588d3 100644
--- a/src/anndata/_io/specs/methods.py
+++ b/src/anndata/_io/specs/methods.py
@@ -398,7 +398,7 @@ def _iter_chunks_for_copy(elem: ArrayStorageType, dest: ArrayStorageType):
     * If `dest` has chunks, it will return the chunks of `dest`.
     * If `dest` is not chunked, we write it in ~100MB chunks or 1000 rows, whichever is larger.
     """
-    if dest.chunks:
+    if dest.chunks and hasattr(dest, "iter_chunks"):
         return dest.iter_chunks()
     else:
         itemsize = elem.dtype.itemsize
@@ -412,26 +412,46 @@ def _iter_chunks_for_copy(elem: ArrayStorageType, dest: ArrayStorageType):

 @_REGISTRY.register_write(H5Group, H5Array, IOSpec("array", "0.2.0"))
 @_REGISTRY.register_write(H5Group, ZarrArray, IOSpec("array", "0.2.0"))
-def write_chunked_dense_array_to_h5(
+@_REGISTRY.register_write(ZarrGroup, H5Array, IOSpec("array", "0.2.0"))
+def write_chunked_dense_array_to_group(
     f: GroupStorageType,
     k: str,
     elem: ArrayStorageType,
     *,
     _writer: Writer,
     dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
 ):
     """Write to a h5py.Dataset in chunks.

     `h5py.Group.create_dataset(..., data: h5py.Dataset)` will load all of `data` into memory
     before writing. Instead, we will write in chunks to avoid this. We don't need to do this for
     zarr since zarr handles this automatically.
     """
-    dest = f.create_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
+    dtype = dataset_kwargs.get("dtype", elem.dtype)
+    dest = f.create_dataset(k, shape=elem.shape, **dataset_kwargs, dtype=dtype)

     for chunk in _iter_chunks_for_copy(elem, dest):
         dest[chunk] = elem[chunk]


+@_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0"))
+def write_chunked_dense_array_to_zarr(
+    f: ZarrGroup,
+    k: str,
+    elem: ZarrArray,
+    *,
+    _writer: Writer,
+    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
+):
+    """Write to a h5py.Dataset in chunks.
+    `h5py.Group.create_dataset(..., data: h5py.Dataset)` will load all of `data` into memory
+    before writing. Instead, we will write in chunks to avoid this. We don't need to do this for
+    zarr since zarr handles this automatically.
+    """
+    dtype = dataset_kwargs.get("dtype", elem.dtype)
+    f.create_dataset(k, shape=elem.shape, data=elem, **dataset_kwargs, dtype=dtype)
+
+
 _REGISTRY.register_write(H5Group, CupyArray, IOSpec("array", "0.2.0"))(
     _to_cpu_mem_wrapper(write_basic)
 )
@@ -638,10 +658,14 @@ def write_sparse_compressed(
     # Allow resizing for hdf5
     if isinstance(f, H5Group) and "maxshape" not in dataset_kwargs:
         dataset_kwargs = dict(maxshape=(None,), **dataset_kwargs)
-
-    g.create_dataset("data", data=value.data, **dataset_kwargs)
-    g.create_dataset("indices", data=value.indices, **dataset_kwargs)
-    g.create_dataset("indptr", data=value.indptr, dtype=indptr_dtype, **dataset_kwargs)
+    _writer.write_elem(g, "data", value.data, dataset_kwargs=dataset_kwargs)
+    _writer.write_elem(g, "indices", value.indices, dataset_kwargs=dataset_kwargs)
+    _writer.write_elem(
+        g,
+        "indptr",
+        value.indptr,
+        dataset_kwargs={"dtype": indptr_dtype, **dataset_kwargs},
+    )


 write_csr = partial(write_sparse_compressed, fmt="csr")

From 99d4400bee9f0c75a4321ba9fa18c1ba3149c7d2 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 15 Nov 2024 16:03:23 +0100
Subject: [PATCH 8/8] (chore): remove unnecessary methods

---
 src/anndata/_io/specs/methods.py | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py
index 6379588d3..f6916cba1 100644
--- a/src/anndata/_io/specs/methods.py
+++ b/src/anndata/_io/specs/methods.py
@@ -388,7 +388,8 @@ def write_basic(
     dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
 ):
     """Write methods which underlying library handles natively."""
-    f.create_dataset(k, data=elem, **dataset_kwargs)
+    dtype = dataset_kwargs.get("dtype", elem.dtype)
+    f.create_dataset(k, data=elem, **dataset_kwargs, dtype=dtype)


 def _iter_chunks_for_copy(elem: ArrayStorageType, dest: ArrayStorageType):
@@ -412,7 +413,6 @@ def _iter_chunks_for_copy(elem: ArrayStorageType, dest: ArrayStorageType):

 @_REGISTRY.register_write(H5Group, H5Array, IOSpec("array", "0.2.0"))
 @_REGISTRY.register_write(H5Group, ZarrArray, IOSpec("array", "0.2.0"))
-@_REGISTRY.register_write(ZarrGroup, H5Array, IOSpec("array", "0.2.0"))
 def write_chunked_dense_array_to_group(
     f: GroupStorageType,
     k: str,
@@ -434,24 +434,6 @@ def write_chunked_dense_array_to_group(
         dest[chunk] = elem[chunk]


-@_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0"))
-def write_chunked_dense_array_to_zarr(
-    f: ZarrGroup,
-    k: str,
-    elem: ZarrArray,
-    *,
-    _writer: Writer,
-    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
-):
-    """Write to a h5py.Dataset in chunks.
-    `h5py.Group.create_dataset(..., data: h5py.Dataset)` will load all of `data` into memory
-    before writing. Instead, we will write in chunks to avoid this. We don't need to do this for
-    zarr since zarr handles this automatically.
-    """
-    dtype = dataset_kwargs.get("dtype", elem.dtype)
-    f.create_dataset(k, shape=elem.shape, data=elem, **dataset_kwargs, dtype=dtype)
-
-
 _REGISTRY.register_write(H5Group, CupyArray, IOSpec("array", "0.2.0"))(
     _to_cpu_mem_wrapper(write_basic)
 )