From be4be30f9ad7779179b1bc64b92dce8c8a0e982d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 09:41:29 +0100 Subject: [PATCH 01/22] (fix): lazy chunking respects -1 --- src/anndata/_io/specs/lazy_methods.py | 2 +- src/anndata/_io/specs/registry.py | 6 ++++++ tests/test_io_elementwise.py | 3 +++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 48770be9c..a952de37b 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -105,7 +105,7 @@ def read_sparse_as_dask( if chunks is not None: if len(chunks) != 2: raise ValueError("`chunks` must be a tuple of two integers") - if chunks[minor_dim] != shape[minor_dim]: + if chunks[minor_dim] not in {shape[minor_dim], -1}: raise ValueError( "Only the major axis can be chunked. " f"Try setting chunks to {((-1, _DEFAULT_STRIDE) if is_csc else (_DEFAULT_STRIDE, -1))}" diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 3b43def7c..37484ee3d 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -451,6 +451,12 @@ def read_elem_as_dask( ... g["X"], chunks=(500, adata.shape[1]) ... ) >>> adata.layers["dense"] = ad.experimental.read_elem_as_dask(g["layers/dense"]) + + We also support using -1 as a chunk size to signfiy the reading the whole axis: + + >>> >>> adata.X = ad.experimental.read_elem_as_dask( + ... g["X"], chunks=(500, -1) + ... ) """ return DaskReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index e46cd7d81..b1898926b 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -284,6 +284,7 @@ def test_read_lazy_2d_dask(sparse_format, store): (2, (200, 400)), (1, None), (2, None), + (2, (400, -1)), ], ) def test_read_lazy_subsets_nd_dask(store, n_dims, chunks): @@ -323,6 +324,8 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): ("csr", (10, SIZE * 2)), ("csc", None), ("csr", None), + ("csr", (10, -1)), + ("csc", (-1, 10)), ], ) def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks): From 2115298cc7aa390ddbf79697f2503c8bab0170ff Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 09:42:09 +0100 Subject: [PATCH 02/22] (fix): cache arrays in `BaseCompressedSparseDataset` --- src/anndata/_core/sparse_dataset.py | 29 ++++++++++---- src/anndata/_io/specs/lazy_methods.py | 38 ++++++++++--------- src/anndata/tests/helpers.py | 2 +- tests/test_backed_sparse.py | 54 +++++++++++++++++++-------- 4 files changed, 82 insertions(+), 41 deletions(-) diff --git a/src/anndata/_core/sparse_dataset.py b/src/anndata/_core/sparse_dataset.py index ae6b47c7f..a051f5ea1 100644 --- a/src/anndata/_core/sparse_dataset.py +++ b/src/anndata/_core/sparse_dataset.py @@ -380,7 +380,7 @@ def backend(self) -> Literal["zarr", "hdf5"]: @property def dtype(self) -> np.dtype: """The :class:`numpy.dtype` of the `data` attribute of the sparse matrix.""" - return self.group["data"].dtype + return self._data.dtype @classmethod def _check_group_format(cls, group): @@ -546,8 +546,9 @@ def append(self, sparse_matrix: ss.csr_matrix | ss.csc_matrix | SpArray) -> None sparse_matrix.indptr[1:].astype(np.int64) + indptr_offset ) # Clear cached property - if hasattr(self, "indptr"): - del self._indptr + for attr in ["_indptr", "_indices", "_data"]: + if hasattr(self, attr): + delattr(self, attr) # indices indices = self.group["indices"] @@ -565,11 +566,25 @@ 
def _indptr(self) -> np.ndarray: arr = self.group["indptr"][...] return arr + @cached_property + def _indices(self) -> np.ndarray: + """\ + Cache access to the indices to prevent unnecessary reads of the zarray + """ + return self.group["indices"] + + @cached_property + def _data(self) -> np.ndarray: + """\ + Cache access to the data to prevent unnecessary reads of the zarray + """ + return self.group["data"] + def _to_backed(self) -> BackedSparseMatrix: format_class = get_backed_class(self.format) mtx = format_class(self.shape, dtype=self.dtype) - mtx.data = self.group["data"] - mtx.indices = self.group["indices"] + mtx.data = self._data + mtx.indices = self._indices mtx.indptr = self._indptr return mtx @@ -578,8 +593,8 @@ def to_memory(self) -> ss.csr_matrix | ss.csc_matrix | SpArray: self.format, use_sparray_in_io=settings.use_sparse_array_on_read ) mtx = format_class(self.shape, dtype=self.dtype) - mtx.data = self.group["data"][...] - mtx.indices = self.group["indices"][...] + mtx.data = self._data[...] + mtx.indices = self._indices[...] mtx.indptr = self._indptr return mtx diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index a952de37b..581539b15 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -10,17 +10,17 @@ from scipy import sparse import anndata as ad +from anndata.abc import CSCDataset, CSRDataset from ..._core.file_backing import filename, get_elem_name from ...compat import H5Array, H5Group, ZarrArray, ZarrGroup from .registry import _LAZY_REGISTRY, IOSpec if TYPE_CHECKING: - from collections.abc import Callable, Generator, Mapping, Sequence + from collections.abc import Generator, Mapping, Sequence from typing import Literal, ParamSpec, TypeVar - from ..._core.sparse_dataset import _CSCDataset, _CSRDataset - from ..._types import ArrayStorageType, StorageType + from ..._types import StorageType from ...compat import DaskArray from .registry import DaskReader @@ -35,12 +35,12 @@ @contextmanager def maybe_open_h5( - path_or_group: Path | ZarrGroup, elem_name: str + path_or_dataset: Path | CSRDataset | CSCDataset, elem_name: str ) -> Generator[StorageType, None, None]: - if not isinstance(path_or_group, Path): - yield path_or_group + if not isinstance(path_or_dataset, Path): + yield path_or_dataset return - file = h5py.File(path_or_group, "r") + file = h5py.File(path_or_dataset, "r") try: yield file[elem_name] finally: @@ -61,20 +61,17 @@ def compute_chunk_layout_for_axis_shape( def make_dask_chunk( - path_or_group: Path | ZarrGroup, + path_or_sparse_dataset: Path | CSRDataset | CSCDataset, elem_name: str, block_info: BlockInfo | None = None, - *, - wrap: Callable[[ArrayStorageType], ArrayStorageType] - | Callable[[H5Group | ZarrGroup], _CSRDataset | _CSCDataset] = lambda g: g, ): if block_info is None: msg = "Block info is required" raise ValueError(msg) # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # https://github.com/scverse/anndata/issues/1105 - with maybe_open_h5(path_or_group, elem_name) as f: - mtx = wrap(f) + with maybe_open_h5(path_or_sparse_dataset, elem_name) as f: + mtx = ad.io.sparse_dataset(f) if isinstance(f, H5Group) else f idx = tuple( slice(start, stop) for start, stop in block_info[None]["array-location"] ) @@ -94,10 +91,17 @@ def read_sparse_as_dask( ) -> DaskArray: import dask.array as da - path_or_group = Path(filename(elem)) if isinstance(elem, H5Group) else elem + path_or_sparse_dataset = ( + 
Path(filename(elem)) + if isinstance(elem, H5Group) + else ad.io.sparse_dataset(elem) + ) elem_name = get_elem_name(elem) shape: tuple[int, int] = tuple(elem.attrs["shape"]) - dtype = elem["data"].dtype + if isinstance(path_or_sparse_dataset, CSRDataset | CSCDataset): + dtype = path_or_sparse_dataset.dtype + else: + dtype = elem["data"].dtype is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix" stride: int = _DEFAULT_STRIDE @@ -119,9 +123,7 @@ def read_sparse_as_dask( (chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor) ) memory_format = sparse.csc_matrix if is_csc else sparse.csr_matrix - make_chunk = partial( - make_dask_chunk, path_or_group, elem_name, wrap=ad.io.sparse_dataset - ) + make_chunk = partial(make_dask_chunk, path_or_sparse_dataset, elem_name) da_mtx = da.map_blocks( make_chunk, dtype=dtype, diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 6ed637ed8..8b0226634 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -1057,7 +1057,7 @@ class AccessTrackingStore(DirectoryStore): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._access_count = Counter() - self._accessed_keys = {} + self._accessed_keys = dict() def __getitem__(self, key: str) -> object: for tracked in self._access_count: diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index 2778c76bb..fb97c69c2 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -13,7 +13,8 @@ import anndata as ad from anndata._core.anndata import AnnData from anndata._core.sparse_dataset import sparse_dataset -from anndata.compat import CAN_USE_SPARSE_ARRAY, SpArray +from anndata._io.specs.registry import read_elem_as_dask +from anndata.compat import CAN_USE_SPARSE_ARRAY, DaskArray, SpArray from anndata.experimental import read_dispatched from anndata.tests.helpers import AccessTrackingStore, assert_equal, subset_func @@ -26,6 +27,9 @@ from numpy.typing import ArrayLike, NDArray from pytest_mock import MockerFixture + from anndata.abc import CSCDataset, CSRDataset + from anndata.compat import ZarrGroup + Idx = slice | int | NDArray[np.integer] | NDArray[np.bool_] @@ -354,24 +358,33 @@ def test_dataset_append_disk( @pytest.mark.parametrize("sparse_format", [sparse.csr_matrix, sparse.csc_matrix]) -def test_indptr_cache( +def test_lazy_array_cache( tmp_path: Path, sparse_format: Callable[[ArrayLike], sparse.spmatrix], ): + elems = {"indptr", "indices", "data"} path = tmp_path / "test.zarr" a = sparse_format(sparse.random(10, 10)) f = zarr.open_group(path, "a") ad.io.write_elem(f, "X", a) store = AccessTrackingStore(path) - store.initialize_key_trackers(["X/indptr"]) + for elem in elems: + store.initialize_key_trackers([f"X/{elem}"]) f = zarr.open_group(store, "a") a_disk = sparse_dataset(f["X"]) a_disk[:1] a_disk[3:5] a_disk[6:7] a_disk[8:9] - # one each for .zarray and actual access assert store.get_access_count("X/indptr") == 2 + for elem_not_indptr in elems - {"indptr"}: + assert ( + sum( + ".zarray" in key_accessed + for key_accessed in store.get_accessed_keys(f"X/{elem_not_indptr}") + ) + == 1 + ) Kind = Literal["slice", "int", "array", "mask"] @@ -421,27 +434,38 @@ def width_idx_kinds( ( [0], slice(None, None), - ["X/data/.zarray", "X/data/.zarray", "X/data/0"], + ["X/data/.zarray", "X/data/0"], ), ( [0], slice(None, 3), - ["X/data/.zarray", "X/data/.zarray", "X/data/0"], + ["X/data/.zarray", "X/data/0"], ), ( [3, 4, 5], slice(None, None), - ["X/data/.zarray", "X/data/.zarray", "X/data/3", 
"X/data/4", "X/data/5"], + ["X/data/.zarray", "X/data/3", "X/data/4", "X/data/5"], ), l=10, ), ) +@pytest.mark.parametrize( + "open_func", + [ + sparse_dataset, + lambda x: read_elem_as_dask( + x, chunks=(1, -1) if x.attrs["encoding-type"] == "csr_matrix" else (-1, 1) + ), + ], + ids=["sparse_dataset", "read_elem_as_dask"], +) def test_data_access( tmp_path: Path, sparse_format: Callable[[ArrayLike], sparse.spmatrix], idx_maj: Idx, idx_min: Idx, exp: Sequence[str], + open_func: Callable[[ZarrGroup], CSRDataset | CSCDataset | DaskArray], ): path = tmp_path / "test.zarr" a = sparse_format(np.eye(10, 10)) @@ -454,19 +478,19 @@ def test_data_access( store = AccessTrackingStore(path) store.initialize_key_trackers(["X/data"]) f = zarr.open_group(store) - a_disk = sparse_dataset(f["X"]) - - # Do the slicing with idx - store.reset_key_trackers() - if a_disk.format == "csr": - a_disk[idx_maj, idx_min] + a_disk = AnnData(X=open_func(f["X"])) + if a.format == "csr": + subset = a_disk[idx_maj, idx_min] else: - a_disk[idx_min, idx_maj] + subset = a_disk[idx_min, idx_maj] + if isinstance(subset.X, DaskArray): + subset.X.compute(scheduler="single-threaded") assert store.get_access_count("X/data") == len(exp), store.get_accessed_keys( "X/data" ) - assert store.get_accessed_keys("X/data") == exp + # dask access order is not guaranteed so need to sort + assert sorted(store.get_accessed_keys("X/data")) == sorted(exp) @pytest.mark.parametrize( From 2edabe2d947643da491df75205f7bec167c5080e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 10:13:53 +0100 Subject: [PATCH 03/22] (fix): clean up typing --- src/anndata/_io/specs/lazy_methods.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 581539b15..2c23f2a85 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -3,7 +3,7 @@ from contextlib import contextmanager from functools import partial from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, overload import h5py import numpy as np @@ -18,10 +18,9 @@ if TYPE_CHECKING: from collections.abc import Generator, Mapping, Sequence - from typing import Literal, ParamSpec, TypeVar + from typing import Any, Literal, ParamSpec, TypeVar - from ..._types import StorageType - from ...compat import DaskArray + from ...compat import DaskArray, H5File from .registry import DaskReader BlockInfo = Mapping[ @@ -31,12 +30,21 @@ P = ParamSpec("P") R = TypeVar("R") + D = TypeVar("D", bound=CSCDataset | CSRDataset) +@overload +@contextmanager +def maybe_open_h5(path_or_dataset: D, elem_name: str) -> Generator[D, None, None]: ... +@overload +@contextmanager +def maybe_open_h5( + path_or_dataset: Path, elem_name: str +) -> Generator[H5File, None, None]: ... 
@contextmanager def maybe_open_h5( - path_or_dataset: Path | CSRDataset | CSCDataset, elem_name: str -) -> Generator[StorageType, None, None]: + path_or_dataset: Any, elem_name: str +) -> Generator[H5File | D, None, None]: if not isinstance(path_or_dataset, Path): yield path_or_dataset return From 286011649fb8e962fd152b751c138f0de1aed7f9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 10:14:20 +0100 Subject: [PATCH 04/22] (fix): doctest double >>> --- src/anndata/_io/specs/registry.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 37484ee3d..c863038fc 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -454,9 +454,7 @@ def read_elem_as_dask( We also support using -1 as a chunk size to signfiy the reading the whole axis: - >>> >>> adata.X = ad.experimental.read_elem_as_dask( - ... g["X"], chunks=(500, -1) - ... ) + >>> adata.X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, -1)) """ return DaskReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks) From fa96348d35c4de7188acb13c0d136f262ec1f4d9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 10:14:34 +0100 Subject: [PATCH 05/22] (chore): add tests --- tests/test_io_elementwise.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index b1898926b..65de0469d 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -335,12 +335,21 @@ def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks): else: arr_store = create_sparse_store(arr_type, store) X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) - if chunks is not None: - assert X_dask_from_disk.chunksize == chunks + if arr_type != "dense": + if chunks is not None: + expected_chunks = chunks + if -1 in expected_chunks: + missing_axis = expected_chunks.index(-1) + missing_aixs_size_size = SIZE * (1 + missing_axis) + expected_chunks = list(expected_chunks) + expected_chunks[missing_axis] = missing_aixs_size_size + expected_chunks = tuple(expected_chunks) + assert X_dask_from_disk.chunksize == expected_chunks + # assert that sparse chunks are set correctly by default + minor_index = int(arr_type == "csr") + assert X_dask_from_disk.chunksize[minor_index] == SIZE * (1 + minor_index) else: - minor_index = int(arr_type == "csr") - # assert that sparse chunks are set correctly by default - assert X_dask_from_disk.chunksize[minor_index] == SIZE * (1 + minor_index) + assert X_dask_from_disk.chunksize == chunks X_from_disk = read_elem(arr_store["X"]) assert_equal(X_from_disk, X_dask_from_disk) From a0e2d52477a854198df110c2547b3070b967c8c8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 10:17:07 +0100 Subject: [PATCH 06/22] (fix): more typing updates --- src/anndata/_io/specs/lazy_methods.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 2c23f2a85..e058221f6 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -20,7 +20,7 @@ from collections.abc import Generator, Mapping, Sequence from typing import Any, Literal, ParamSpec, TypeVar - from ...compat import DaskArray, H5File + from ...compat import DaskArray, H5File, SpArray from .registry import DaskReader BlockInfo = Mapping[ @@ -69,10 +69,10 @@ def compute_chunk_layout_for_axis_shape( 
def make_dask_chunk(
-    path_or_sparse_dataset: Path | CSRDataset | CSCDataset,
+    path_or_sparse_dataset: Path | D,
     elem_name: str,
     block_info: BlockInfo | None = None,
-):
+) -> sparse.csr_matrix | sparse.csc_matrix | SpArray:
     if block_info is None:
         msg = "Block info is required"
         raise ValueError(msg)

From 32fbef9f7b80674119047ea42710a839e3a9cdae Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 8 Nov 2024 10:22:24 +0100
Subject: [PATCH 09/22] (fix): spelling

---
 src/anndata/_io/specs/registry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py
index c863038fc..338e4e596 100644
--- a/src/anndata/_io/specs/registry.py
+++ b/src/anndata/_io/specs/registry.py
@@ -452,7 +452,7 @@ def read_elem_as_dask(
     ...
) >>> adata.layers["dense"] = ad.experimental.read_elem_as_dask(g["layers/dense"]) - We also support using -1 as a chunk size to signfiy the reading the whole axis: + We also support using -1 as a chunk size to signify the reading the whole axis: >>> adata.X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, -1)) """ From ceb70b44ef6b5fcdeff996be81ba8f651a008f4f Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 10:30:02 +0100 Subject: [PATCH 10/22] (chore): release note --- docs/release-notes/1743.bugfix.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/release-notes/1743.bugfix.md diff --git a/docs/release-notes/1743.bugfix.md b/docs/release-notes/1743.bugfix.md new file mode 100644 index 000000000..f8f489aff --- /dev/null +++ b/docs/release-notes/1743.bugfix.md @@ -0,0 +1 @@ +Fix chunking with -1 in `chunks` argument of {func}`~anndata.experimental.read_elem_as_dask` {user}`ilan-gold` From fedd82721152e352e541dc6bb7a9bc0772c8970c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 10:32:47 +0100 Subject: [PATCH 11/22] (chore): release note --- docs/release-notes/1744.bugfix.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 docs/release-notes/1744.bugfix.md diff --git a/docs/release-notes/1744.bugfix.md b/docs/release-notes/1744.bugfix.md new file mode 100644 index 000000000..18023bb16 --- /dev/null +++ b/docs/release-notes/1744.bugfix.md @@ -0,0 +1,2 @@ + +Cache accesses to the `data` and `indices` arrays in {class}`~anndata.abc.CSRDataset` and {class}`~anndata.abc.CSCDataset` {user}`ilan-gold` From 5960331a31584807c725d8a1db277673fdfc2b57 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 11:25:43 +0100 Subject: [PATCH 12/22] (fix): support `None` and `-1` --- src/anndata/_io/specs/lazy_methods.py | 12 ++++++-- src/anndata/_io/specs/registry.py | 4 ++- tests/test_io_elementwise.py | 41 +++++++++++---------------- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index a952de37b..089730fa3 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -105,12 +105,16 @@ def read_sparse_as_dask( if chunks is not None: if len(chunks) != 2: raise ValueError("`chunks` must be a tuple of two integers") - if chunks[minor_dim] not in {shape[minor_dim], -1}: + if chunks[minor_dim] not in {shape[minor_dim], -1, None}: raise ValueError( "Only the major axis can be chunked. " f"Try setting chunks to {((-1, _DEFAULT_STRIDE) if is_csc else (_DEFAULT_STRIDE, -1))}" ) - stride = chunks[major_dim] + stride = ( + chunks[major_dim] + if chunks[major_dim] not in {None, -1} + else shape[major_dim] + ) shape_minor, shape_major = shape if is_csc else shape[::-1] chunks_major = compute_chunk_layout_for_axis_shape(stride, shape_major) @@ -142,7 +146,9 @@ def read_h5_array( shape = tuple(elem.shape) dtype = elem.dtype chunks: tuple[int, ...] 
= (
-        chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(shape)
+        tuple(c if c not in {None, -1} else shape[i] for i, c in enumerate(chunks))
+        if chunks is not None
+        else (_DEFAULT_STRIDE,) * len(shape)
     )
 
     chunk_layout = tuple(
diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py
index 338e4e596..ca13f8e59 100644
--- a/src/anndata/_io/specs/registry.py
+++ b/src/anndata/_io/specs/registry.py
@@ -398,6 +398,7 @@ def read_elem_as_dask(
         Defaults to `(1000, adata.shape[1])` for CSR sparse,
         `(adata.shape[0], 1000)` for CSC sparse,
         and the on-disk chunking otherwise for dense.
+        Can use `-1` or `None` to indicate use of the size of the corresponding dimension.
 
     Returns
     -------
@@ -452,9 +453,10 @@ def read_elem_as_dask(
     ... )
     >>> adata.layers["dense"] = ad.experimental.read_elem_as_dask(g["layers/dense"])
 
-    We also support using -1 as a chunk size to signify the reading the whole axis:
+    We also support using -1 and None as a chunk size to signify reading the whole axis:
 
     >>> adata.X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, -1))
+    >>> adata.X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, None))
     """
     return DaskReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks)

diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index 65de0469d..d6dc0033b 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -285,6 +285,7 @@ def test_read_lazy_2d_dask(sparse_format, store):
         (1, None),
         (2, None),
         (2, (400, -1)),
+        (2, (400, None)),
     ],
 )
 def test_read_lazy_subsets_nd_dask(store, n_dims, chunks):
@@ -317,39 +318,31 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path):
 
 
 @pytest.mark.parametrize(
-    ("arr_type", "chunks"),
+    ("arr_type", "chunks", "expected_chunksize"),
     [
-        ("dense", (100, 100)),
-        ("csc", (SIZE, 10)),
-        ("csr", (10, SIZE * 2)),
-        ("csc", None),
-        ("csr", None),
-        ("csr", (10, -1)),
-        ("csc", (-1, 10)),
+        ("dense", (100, 100), (100, 100)),
+        ("csc", (SIZE, 10), (SIZE, 10)),
+        ("csr", (10, SIZE * 2), (10, SIZE * 2)),
+        ("csc", None, (SIZE, 1000)),
+        ("csr", None, (1000, SIZE * 2)),
+        ("csr", (10, -1), (10, SIZE * 2)),
+        ("csc", (-1, 10), (SIZE, 10)),
+        ("csr", (10, None), (10, SIZE * 2)),
+        ("csc", (None, 10), (SIZE, 10)),
+        ("csc", (None, None), (SIZE, SIZE * 2)),
+        ("csr", (None, None), (SIZE, SIZE * 2)),
+        ("csr", (-1, -1), (SIZE, SIZE * 2)),
+        ("csc", (-1, -1), (SIZE, SIZE * 2)),
     ],
 )
-def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks):
+def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks, expected_chunksize):
     if arr_type == "dense":
         arr_store = create_dense_store(store)
         X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks)
     else:
         arr_store = create_sparse_store(arr_type, store)
         X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks)
-    if arr_type != "dense":
-        if chunks is not None:
-            expected_chunks = chunks
-            if -1 in expected_chunks:
-                missing_axis = expected_chunks.index(-1)
-                missing_aixs_size_size = SIZE * (1 + missing_axis)
-                expected_chunks = list(expected_chunks)
-                expected_chunks[missing_axis] = missing_aixs_size_size
-                expected_chunks = tuple(expected_chunks)
-            assert X_dask_from_disk.chunksize == expected_chunks
-        # assert that sparse chunks are set correctly by default
-        minor_index = int(arr_type == "csr")
-        assert X_dask_from_disk.chunksize[minor_index] == SIZE * (1 + minor_index)
-    else:
-        assert X_dask_from_disk.chunksize == chunks
+    assert X_dask_from_disk.chunksize == expected_chunksize
     X_from_disk =
read_elem(arr_store["X"]) assert_equal(X_from_disk, X_dask_from_disk) From e652c449c9c219814cb5554b32969c8d22023633 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 12:24:02 +0100 Subject: [PATCH 13/22] (chore): typing --- tests/test_io_elementwise.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index d6dc0033b..3ca5324b8 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -335,7 +335,12 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): ("csc", (-1, -1), (SIZE, SIZE * 2)), ], ) -def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks, expected_chunksize): +def test_read_lazy_2d_chunk_kwargs( + store: H5Group | ZarrGroup, + arr_type: Literal["csr", "csc", "dense"], + chunks: None | tuple[int | None, int | None], + expected_chunksize: tuple[int, int], +): if arr_type == "dense": arr_store = create_dense_store(store) X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) From 59849a85b77442d0979fbdb15c375eae4edddf5f Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 12:42:30 +0100 Subject: [PATCH 14/22] (chore): add cache bust test --- src/anndata/_core/sparse_dataset.py | 9 +++++---- tests/test_backed_sparse.py | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/anndata/_core/sparse_dataset.py b/src/anndata/_core/sparse_dataset.py index a051f5ea1..15335f21f 100644 --- a/src/anndata/_core/sparse_dataset.py +++ b/src/anndata/_core/sparse_dataset.py @@ -545,10 +545,6 @@ def append(self, sparse_matrix: ss.csr_matrix | ss.csc_matrix | SpArray) -> None indptr[orig_data_size:] = ( sparse_matrix.indptr[1:].astype(np.int64) + indptr_offset ) - # Clear cached property - for attr in ["_indptr", "_indices", "_data"]: - if hasattr(self, attr): - delattr(self, attr) # indices indices = self.group["indices"] @@ -556,6 +552,11 @@ def append(self, sparse_matrix: ss.csr_matrix | ss.csc_matrix | SpArray) -> None indices.resize((orig_data_size + sparse_matrix.indices.shape[0],)) indices[orig_data_size:] = sparse_matrix.indices + # Clear cached property + for attr in ["_indptr", "_indices", "_data"]: + if hasattr(self, attr): + delattr(self, attr) + @cached_property def _indptr(self) -> np.ndarray: """\ diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index fb97c69c2..8d7a32a38 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -285,6 +285,25 @@ def test_dataset_append_memory( assert_equal(fromdisk, frommem) +def test_append_array_cache_bust(tmp_path: Path, diskfmt): + path = tmp_path / f"test.{diskfmt.replace('ad', '')}" + a = sparse.random(100, 100, format="csr") + if diskfmt == "zarr": + f = zarr.open_group(path, "a") + else: + f = h5py.File(path, "a") + ad.io.write_elem(f, "mtx", a) + ad.io.write_elem(f, "mtx_2", a) + diskmtx = sparse_dataset(f["mtx"]) + old_array_shapes = {} + array_names = ["indptr", "indices", "data"] + for name in array_names: + old_array_shapes[name] = getattr(diskmtx, f"_{name}").shape + diskmtx.append(sparse_dataset(f["mtx_2"])) + for name in array_names: + assert old_array_shapes[name] != getattr(diskmtx, f"_{name}").shape + + @pytest.mark.parametrize("sparse_format", [sparse.csr_matrix, sparse.csc_matrix]) @pytest.mark.parametrize( ("subset_func", "subset_func2"), From 41bd62e33e466e0f67a6e41e574e708d597274f2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 12:42:54 +0100 Subject: [PATCH 15/22] (chore): type --- 
tests/test_backed_sparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index 8d7a32a38..1fc7874f8 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -285,7 +285,7 @@ def test_dataset_append_memory( assert_equal(fromdisk, frommem) -def test_append_array_cache_bust(tmp_path: Path, diskfmt): +def test_append_array_cache_bust(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]): path = tmp_path / f"test.{diskfmt.replace('ad', '')}" a = sparse.random(100, 100, format="csr") if diskfmt == "zarr": From 0304d31cf79212153ad7c6d436834e3e5b822dad Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 17:05:23 +0100 Subject: [PATCH 16/22] (chore): types --- src/anndata/_core/sparse_dataset.py | 5 +++-- src/anndata/_io/specs/lazy_methods.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/anndata/_core/sparse_dataset.py b/src/anndata/_core/sparse_dataset.py index 15335f21f..a155352af 100644 --- a/src/anndata/_core/sparse_dataset.py +++ b/src/anndata/_core/sparse_dataset.py @@ -38,6 +38,7 @@ from scipy.sparse._compressed import _cs_matrix from .._types import GroupStorageType + from ..compat import H5Array from .index import Index else: from scipy.sparse import spmatrix as _cs_matrix @@ -568,14 +569,14 @@ def _indptr(self) -> np.ndarray: return arr @cached_property - def _indices(self) -> np.ndarray: + def _indices(self) -> H5Array | ZarrArray: """\ Cache access to the indices to prevent unnecessary reads of the zarray """ return self.group["indices"] @cached_property - def _data(self) -> np.ndarray: + def _data(self) -> H5Array | ZarrArray: """\ Cache access to the data to prevent unnecessary reads of the zarray """ diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index a3b361778..bf7f54522 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -30,7 +30,7 @@ P = ParamSpec("P") R = TypeVar("R") - D = TypeVar("D", bound=CSCDataset | CSRDataset) + D = TypeVar("D") @overload From 76ecda590d7f2c4ffd1a2a36f7e8d100aec382eb Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 17:06:06 +0100 Subject: [PATCH 17/22] (chore): better name --- src/anndata/_io/specs/lazy_methods.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index bf7f54522..1177f19af 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -35,20 +35,20 @@ @overload @contextmanager -def maybe_open_h5(path_or_dataset: D, elem_name: str) -> Generator[D, None, None]: ... +def maybe_open_h5(path_or_other: D, elem_name: str) -> Generator[D, None, None]: ... @overload @contextmanager def maybe_open_h5( - path_or_dataset: Path, elem_name: str + path_or_other: Path, elem_name: str ) -> Generator[H5File, None, None]: ... 
@contextmanager def maybe_open_h5( - path_or_dataset: Any, elem_name: str + path_or_other: Any, elem_name: str ) -> Generator[H5File | D, None, None]: - if not isinstance(path_or_dataset, Path): - yield path_or_dataset + if not isinstance(path_or_other, Path): + yield path_or_other return - file = h5py.File(path_or_dataset, "r") + file = h5py.File(path_or_other, "r") try: yield file[elem_name] finally: From e538a12a3d9b830a4302daac4af360c9d51eddcd Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 17:08:26 +0100 Subject: [PATCH 18/22] (Fix): overload type --- src/anndata/_io/specs/lazy_methods.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 1177f19af..738caf488 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -18,7 +18,7 @@ if TYPE_CHECKING: from collections.abc import Generator, Mapping, Sequence - from typing import Any, Literal, ParamSpec, TypeVar + from typing import Literal, ParamSpec, TypeVar from ...compat import DaskArray, H5File, SpArray from .registry import DaskReader @@ -33,17 +33,17 @@ D = TypeVar("D") -@overload -@contextmanager -def maybe_open_h5(path_or_other: D, elem_name: str) -> Generator[D, None, None]: ... @overload @contextmanager def maybe_open_h5( path_or_other: Path, elem_name: str ) -> Generator[H5File, None, None]: ... +@overload +@contextmanager +def maybe_open_h5(path_or_other: D, elem_name: str) -> Generator[D, None, None]: ... @contextmanager def maybe_open_h5( - path_or_other: Any, elem_name: str + path_or_other: H5File | D, elem_name: str ) -> Generator[H5File | D, None, None]: if not isinstance(path_or_other, Path): yield path_or_other From b51321749831b6dce7f0f0dabf0ee7446fab2d2e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 8 Nov 2024 17:10:12 +0100 Subject: [PATCH 19/22] (chore): bring back test comment --- tests/test_backed_sparse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index 1fc7874f8..03155d0a3 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -395,6 +395,7 @@ def test_lazy_array_cache( a_disk[3:5] a_disk[6:7] a_disk[8:9] + # one each for .zarray and actual access assert store.get_access_count("X/indptr") == 2 for elem_not_indptr in elems - {"indptr"}: assert ( From d2d9f55a433bd6e40dbde49d880774c5a7939feb Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Mon, 11 Nov 2024 17:05:49 +0100 Subject: [PATCH 20/22] Update 1744.bugfix.md --- docs/release-notes/1744.bugfix.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/release-notes/1744.bugfix.md b/docs/release-notes/1744.bugfix.md index 18023bb16..2a425d43c 100644 --- a/docs/release-notes/1744.bugfix.md +++ b/docs/release-notes/1744.bugfix.md @@ -1,2 +1 @@ - Cache accesses to the `data` and `indices` arrays in {class}`~anndata.abc.CSRDataset` and {class}`~anndata.abc.CSCDataset` {user}`ilan-gold` From 3dc0ddd8d5147b4036295297bfb8244856433e89 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Thu, 21 Nov 2024 10:33:35 +0100 Subject: [PATCH 21/22] (fix): revert erroneous change --- src/anndata/tests/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 8b0226634..6ed637ed8 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -1057,7 +1057,7 @@ class AccessTrackingStore(DirectoryStore): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._access_count = Counter() - self._accessed_keys = dict() + self._accessed_keys = {} def __getitem__(self, key: str) -> object: for tracked in self._access_count: From 1dcf7adcaa5bf2be26bf70f8615aecb2692feb0b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 21 Nov 2024 11:10:49 +0100 Subject: [PATCH 22/22] (fix): dont generate coo matrices --- tests/test_concatenate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_concatenate.py b/tests/test_concatenate.py index e034debd2..d9f399dd6 100644 --- a/tests/test_concatenate.py +++ b/tests/test_concatenate.py @@ -1044,7 +1044,9 @@ def gen_list(n): def gen_sparse(n): - return sparse.random(np.random.randint(1, 100), np.random.randint(1, 100)) + return sparse.random( + np.random.randint(1, 100), np.random.randint(1, 100), format="csr" + ) def gen_something(n):
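
Below is a short, self-contained sketch of the chunk-size semantics the series above converges on. `ad.experimental.read_elem_as_dask` and `ad.io.write_elem` are the APIs shown in the diffs; the store path and the `normalize_chunks` helper are illustrative assumptions rather than anndata API, and the helper mirrors only the dense code path — sparse datasets additionally require the minor axis (columns for CSR, rows for CSC) to remain unchunked.

    import anndata as ad
    import zarr

    _DEFAULT_STRIDE = 1000  # default major-axis stride used by the lazy readers

    def normalize_chunks(chunks, shape):
        # Resolve -1/None entries to the full axis length, as patches 01 and 12
        # do for dense arrays; None means "use the default stride everywhere".
        if chunks is None:
            return (_DEFAULT_STRIDE,) * len(shape)
        return tuple(s if c in (None, -1) else c for c, s in zip(chunks, shape))

    # -1 and None both mean "read the whole axis":
    assert normalize_chunks((500, -1), (10_000, 2_000)) == (500, 2_000)
    assert normalize_chunks((500, None), (10_000, 2_000)) == (500, 2_000)
    assert normalize_chunks(None, (10_000, 2_000)) == (1_000, 1_000)

    # Against a store previously written with ad.io.write_elem, e.g. a CSR
    # matrix encoded under "X" (the path here is hypothetical):
    g = zarr.open_group("adata.zarr", mode="r")
    X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, -1))
    assert X.chunksize[1] == X.shape[1]  # -1 resolved to the whole minor axis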