From 686786db35cafd8cbd7ebcc0128d5239fb06bce7 Mon Sep 17 00:00:00 2001
From: Ilan Gold
Date: Thu, 4 Jul 2024 10:54:45 +0200
Subject: [PATCH 1/3] (feat): allow `join=outer` for `concat_on_disk` (#1504)

* (feat): allow `join=outer` for `concat_on_disk`

* (refactor): don't need to redeclare types

* (chore): release note

* (chore): add more rigorous union/diff check

* simplify

* (chore): clean up interleaving naming of axis

* simplify

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: Phil Schaf
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 docs/release-notes/0.10.9.md      |   2 +
 src/anndata/experimental/merge.py |   2 -
 tests/test_concatenate_disk.py    | 140 +++++++++++++-----------------
 3 files changed, 60 insertions(+), 84 deletions(-)

diff --git a/docs/release-notes/0.10.9.md b/docs/release-notes/0.10.9.md
index ddfb0274d..1fc7a1ca7 100644
--- a/docs/release-notes/0.10.9.md
+++ b/docs/release-notes/0.10.9.md
@@ -10,3 +10,5 @@
 #### Documentation

 #### Performance
+
+* Support for `concat_on_disk` outer join {pr}`1504` {user}`ilan-gold`
diff --git a/src/anndata/experimental/merge.py b/src/anndata/experimental/merge.py
index aa6f47e9b..040e7c9dd 100644
--- a/src/anndata/experimental/merge.py
+++ b/src/anndata/experimental/merge.py
@@ -538,8 +538,6 @@ def concat_on_disk(
     # Argument normalization
     if pairwise:
         raise NotImplementedError("pairwise concatenation not yet implemented")
-    if join != "inner":
-        raise NotImplementedError("only inner join is currently supported")
     merge = resolve_merge_strategy(merge)
     uns_merge = resolve_merge_strategy(uns_merge)
diff --git a/tests/test_concatenate_disk.py b/tests/test_concatenate_disk.py
index 640ae446b..216b66e21 100644
--- a/tests/test_concatenate_disk.py
+++ b/tests/test_concatenate_disk.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from collections.abc import Mapping
+from typing import TYPE_CHECKING, Literal

 import numpy as np
 import pandas as pd
@@ -8,6 +9,7 @@
 from scipy import sparse

 from anndata import AnnData, concat
+from anndata._core.merge import _resolve_axis
 from anndata.experimental import read_elem, write_elem
 from anndata.experimental.merge import as_group, concat_on_disk
 from anndata.tests.helpers import (
@@ -16,6 +18,10 @@
 )
 from anndata.utils import asarray

+if TYPE_CHECKING:
+    from pathlib import Path
+
+
 GEN_ADATA_OOC_CONCAT_ARGS = dict(
     obsm_types=(
         sparse.csr_matrix,
@@ -28,30 +34,28 @@


 @pytest.fixture(params=[0, 1])
-def axis(request):
+def axis(request) -> Literal[0, 1]:
     return request.param


-@pytest.fixture(
-    params=["array", "sparse", "sparse_array"],
-)
-def array_type(request):
+@pytest.fixture(params=["array", "sparse", "sparse_array"])
+def array_type(request) -> Literal["array", "sparse", "sparse_array"]:
     return request.param


-@pytest.fixture(params=["inner"])
-def join_type(request):
+@pytest.fixture(params=["inner", "outer"])
+def join_type(request) -> Literal["inner", "outer"]:
     return request.param


 @pytest.fixture(params=["zarr", "h5ad"])
-def file_format(request):
+def file_format(request) -> Literal["zarr", "h5ad"]:
     return request.param


 # trying with 10 should be slow but will guarantee that the feature is being used
 @pytest.fixture(params=[10, 100_000_000])
-def max_loaded_elems(request):
+def max_loaded_elems(request) -> int:
     return request.param


@@ -77,13 +81,18 @@ def _adatas_to_paths(adatas, tmp_path, file_format):


 def assert_eq_concat_on_disk(
-    adatas, tmp_path, file_format, max_loaded_elems=None, *args, **kwargs
+    adatas,
+    tmp_path: Path,
+    file_format: Literal["zarr", "h5ad"],
+    max_loaded_elems: int | None = None,
+    *args,
+    **kwargs,
 ):
     # create one from the concat function
     res1 = concat(adatas, *args, **kwargs)
     # create one from the on disk concat function
     paths = _adatas_to_paths(adatas, tmp_path, file_format)
-    out_name = tmp_path / ("out." + file_format)
+    out_name = tmp_path / f"out.{file_format}"
     if max_loaded_elems is not None:
         kwargs["max_loaded_elems"] = max_loaded_elems
     concat_on_disk(paths, out_name, *args, **kwargs)
@@ -93,84 +102,47 @@


 def get_array_type(array_type, axis):
     if array_type == "sparse":
-        if axis == 0:
-            return sparse.csr_matrix
-        return sparse.csc_matrix
+        return sparse.csr_matrix if axis == 0 else sparse.csc_matrix
     if array_type == "sparse_array":
-        if axis == 0:
-            return sparse.csr_array
-        return sparse.csc_array
+        return sparse.csr_array if axis == 0 else sparse.csc_array
     if array_type == "array":
         return asarray
-    else:
-        raise NotImplementedError(f"array_type {array_type} not implemented")
-
-
-def test_anndatas_without_reindex(
-    axis, array_type, join_type, tmp_path, max_loaded_elems, file_format
+    raise NotImplementedError(f"array_type {array_type} not implemented")
+
+
+@pytest.mark.parametrize("reindex", [True, False], ids=["reindex", "no_reindex"])
+def test_anndatas(
+    axis: Literal[0, 1],
+    array_type: Literal["array", "sparse", "sparse_array"],
+    join_type: Literal["inner", "outer"],
+    tmp_path: Path,
+    max_loaded_elems: int,
+    file_format: Literal["zarr", "h5ad"],
+    reindex: bool,
 ):
-    N = 50
-    M = 50
-    sparse_fmt = "csr"
-    adatas = []
-    for i in range(5):
-        if axis == 0:
-            M = np.random.randint(1, 100)
-        else:
-            N = np.random.randint(1, 100)
-            sparse_fmt = "csc"
-
-        a = gen_adata(
-            (M, N),
-            X_type=get_array_type(array_type, axis),
-            sparse_fmt=sparse_fmt,
-            **GEN_ADATA_OOC_CONCAT_ARGS,
+    _, off_axis_name = _resolve_axis(1 - axis)
+    random_axes = {0, 1} if reindex else {axis}
+    sparse_fmt = "csr" if axis == 0 else "csc"
+    kw = (
+        GEN_ADATA_OOC_CONCAT_ARGS
+        if not reindex
+        else dict(
+            obsm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame),
+            varm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame),
+            layers_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame),
         )
-        if axis == 0:
-            a.obs_names = f"{i}-" + a.obs_names
-        else:
-            a.var_names = f"{i}-" + a.var_names
-        adatas.append(a)
-
-    assert_eq_concat_on_disk(
-        adatas,
-        tmp_path,
-        file_format,
-        max_loaded_elems,
-        axis=axis,
-        join=join_type,
     )
-
-def test_anndatas_with_reindex(
-    axis, array_type, join_type, tmp_path, file_format, max_loaded_elems
-):
-    N = 50
-    M = 50
     adatas = []
-
-    sparse_fmt = "csc"
-    if axis == 0:
-        sparse_fmt = "csr"
-
-    for _ in range(5):
-        M = np.random.randint(1, 100)
-        N = np.random.randint(1, 100)
-
+    for i in range(5):
+        M, N = (np.random.randint(1, 100) if a in random_axes else 50 for a in (0, 1))
         a = gen_adata(
-            (M, N),
-            X_type=get_array_type(array_type, axis),
-            sparse_fmt=sparse_fmt,
-            obsm_types=(
-                get_array_type("sparse", 1 - axis),
-                np.ndarray,
-                pd.DataFrame,
-            ),
-            varm_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame),
-            layers_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame),
+            (M, N), X_type=get_array_type(array_type, axis), sparse_fmt=sparse_fmt, **kw
        )
-        a.layers["sparse"] = get_array_type("sparse", axis)(a.layers["sparse"])
-        a.varm["sparse"] = get_array_type("sparse", 1 - axis)(a.varm["sparse"])
+        # ensure some names overlap, others do not, for the off-axis so that inner/outer is properly tested
+        off_names = getattr(a, f"{off_axis_name}_names").array
+        off_names[1::2] = f"{i}-" + off_names[1::2]
+        setattr(a, f"{off_axis_name}_names", off_names)
         adatas.append(a)

     assert_eq_concat_on_disk(
@@ -208,7 +180,7 @@ def test_concat_ordered_categoricals_retained(tmp_path, file_format):


 @pytest.fixture()
-def obsm_adatas():
+def xxxm_adatas():
     def gen_index(n):
         return [f"cell{i}" for i in range(n)]

@@ -256,8 +228,12 @@ def gen_index(n):
     ]


-def test_concatenate_obsm_inner(obsm_adatas, tmp_path, file_format):
-    assert_eq_concat_on_disk(obsm_adatas, tmp_path, file_format, join="inner")
+def test_concatenate_xxxm(xxxm_adatas, tmp_path, file_format, join_type):
+    if join_type == "outer":
+        for i in range(len(xxxm_adatas)):
+            xxxm_adatas[i] = xxxm_adatas[i].T
+            xxxm_adatas[i].X = sparse.csr_matrix(xxxm_adatas[i].X)
+    assert_eq_concat_on_disk(xxxm_adatas, tmp_path, file_format, join=join_type)


 def test_output_dir_exists(tmp_path):
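A minimal usage sketch of what PATCH 1/3 enables (not part of the patch itself; the file names and toy data below are illustrative):

    import numpy as np
    from anndata import AnnData
    from anndata.experimental import concat_on_disk

    # Two small AnnDatas with partially overlapping var_names.
    a = AnnData(X=np.ones((3, 2)))
    a.obs_names = ["c0", "c1", "c2"]
    a.var_names = ["gene_a", "gene_b"]
    b = AnnData(X=np.zeros((2, 2)))
    b.obs_names = ["c3", "c4"]
    b.var_names = ["gene_b", "gene_c"]
    a.write_h5ad("a.h5ad")
    b.write_h5ad("b.h5ad")

    # join="outer" keeps the union of var_names; before this patch any
    # non-inner join raised NotImplementedError.
    concat_on_disk(["a.h5ad", "b.h5ad"], "joined.h5ad", join="outer")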
From 9664ce43af2300318ed02437d8aaf1bc99e1f714 Mon Sep 17 00:00:00 2001
From: Ilan Gold
Date: Thu, 4 Jul 2024 10:55:23 +0200
Subject: [PATCH 2/3] (fix): add warning for setting `X` on view with repeat indices (#1501)

* (fix): add warning for setting `X` on view with repeat indices

* (chore): release note

* (fix): add check on indices for np array-ness

* (fix): only warn with matrix value setter

* (chore): update message

* (fix): message

* (fix): `ravel` idx before checking `len`

* fmt

* fmt

---------

Co-authored-by: Phil Schaf
---
 docs/release-notes/0.10.9.md |  1 +
 src/anndata/_core/anndata.py | 21 +++++++++++++++++----
 tests/test_x.py              | 11 +++++++++++
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/docs/release-notes/0.10.9.md b/docs/release-notes/0.10.9.md
index 1fc7a1ca7..a0beab5f2 100644
--- a/docs/release-notes/0.10.9.md
+++ b/docs/release-notes/0.10.9.md
@@ -3,6 +3,7 @@

 #### Bugfix

+* Add warning for setting `X` on a view with repeated indices {pr}`1501` {user}`ilan-gold`
 * Coerce {class}`numpy.matrix` classes to arrays when trying to store them in `AnnData` {pr}`1516` {user}`flying-sheep`
 * Fix for setting a dense `X` view with a sparse matrix {pr}`1532` {user}`ilan-gold`
 * Upper bound {mod}`numpy` for `gpu` installation on account of https://github.com/cupy/cupy/issues/8391 {pr}`1540` {user}`ilan-gold`
diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py
index 4dbfcc254..d316405e1 100644
--- a/src/anndata/_core/anndata.py
+++ b/src/anndata/_core/anndata.py
@@ -601,10 +601,23 @@ def X(self, value: np.ndarray | sparse.spmatrix | SpArray | None):
             or (self.n_vars == 1 and self.n_obs == len(value))
             or (self.n_obs == 1 and self.n_vars == len(value))
         ):
-            if not np.isscalar(value) and self.shape != value.shape:
-                # For assigning vector of values to 2d array or matrix
-                # Not necessary for row of 2d array
-                value = value.reshape(self.shape)
+            if not np.isscalar(value):
+                if self.is_view and any(
+                    isinstance(idx, np.ndarray)
+                    and len(np.unique(idx)) != len(idx.ravel())
+                    for idx in [oidx, vidx]
+                ):
+                    msg = (
+                        "You are attempting to set `X` to a matrix on a view which has non-unique indices. "
+                        "The resulting `adata.X` will likely not equal the value to which you set it. "
+                        "To avoid this potential issue, please make a copy of the data first. "
+                        "In the future, this operation will throw an error."
+                    )
+                    warnings.warn(msg, FutureWarning, stacklevel=1)
+                if self.shape != value.shape:
+                    # For assigning vector of values to 2d array or matrix
+                    # Not necessary for row of 2d array
+                    value = value.reshape(self.shape)
             if self.isbacked:
                 if self.is_view:
                     X = self.file["X"]
diff --git a/tests/test_x.py b/tests/test_x.py
index c8d60f249..2b4504158 100644
--- a/tests/test_x.py
+++ b/tests/test_x.py
@@ -41,6 +41,17 @@ def test_setter_singular_dim(shape, orig_array_type, new_array_type):
     assert isinstance(adata.X, type(to_assign))


+def test_repeat_indices_view():
+    adata = gen_adata((10, 10), X_type=np.asarray)
+    subset = adata[[0, 0, 1, 1], :]
+    mat = np.array([np.ones(adata.shape[1]) * i for i in range(4)])
+    with pytest.warns(
+        FutureWarning,
+        match=r"You are attempting to set `X` to a matrix on a view which has non-unique indices",
+    ):
+        subset.X = mat
+
+
 @pytest.mark.parametrize("orig_array_type", UNLABELLED_ARRAY_TYPES)
 @pytest.mark.parametrize("new_array_type", UNLABELLED_ARRAY_TYPES)
 def test_setter_view(orig_array_type, new_array_type):
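A minimal reproduction of the situation PATCH 2/3 warns about, mirroring the added test (shapes and values are illustrative, not taken from the patch):

    import numpy as np
    import pytest
    from anndata import AnnData

    adata = AnnData(X=np.arange(100, dtype="float64").reshape(10, 10))
    subset = adata[[0, 0, 1, 1], :]  # rows 0 and 1 each selected twice

    # Writing a matrix back through a view with non-unique indices is
    # ambiguous, so the setter now emits a FutureWarning first.
    with pytest.warns(FutureWarning, match="non-unique indices"):
        subset.X = np.ones((4, 10))

    # The safe pattern suggested by the warning: copy before assigning.
    safe = adata[[0, 0, 1, 1], :].copy()
    safe.X = np.ones((4, 10))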
From 6d3392b0fb32a77b6f4ddfee1ed397a81f0139b7 Mon Sep 17 00:00:00 2001
From: Philipp A
Date: Thu, 4 Jul 2024 11:00:27 +0200
Subject: [PATCH 3/3] Fix sparse dataset 2D slicing (#1523)

Co-authored-by: Ilan Gold
---
 src/anndata/_core/sparse_dataset.py |  25 ++++---
 tests/test_backed_sparse.py         | 104 +++++++++++++++++++++++-----
 2 files changed, 99 insertions(+), 30 deletions(-)

diff --git a/src/anndata/_core/sparse_dataset.py b/src/anndata/_core/sparse_dataset.py
index b543c11d3..7a1db15b2 100644
--- a/src/anndata/_core/sparse_dataset.py
+++ b/src/anndata/_core/sparse_dataset.py
@@ -175,13 +175,13 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix:
         )
         if out_shape[0] == 1:
             return self._get_intXslice(slice_as_int(row, self.shape[0]), col)
-        elif out_shape[1] == self.shape[1] and out_shape[0] < self.shape[0]:
-            if row.step == 1:
-                return ss.csr_matrix(
-                    self._get_contiguous_compressed_slice(row), shape=out_shape
-                )
+        if row.step != 1:
             return self._get_arrayXslice(np.arange(*row.indices(self.shape[0])), col)
-        return super()._get_sliceXslice(row, col)
+        res = ss.csr_matrix(
+            self._get_contiguous_compressed_slice(row),
+            shape=(out_shape[0], self.shape[1]),
+        )
+        return res if out_shape[1] == self.shape[1] else res[:, col]

     def _get_arrayXslice(self, row: Sequence[int], col: slice) -> ss.csr_matrix:
         idxs = np.asarray(row)
@@ -208,16 +208,15 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csc_matrix:
             slice_len(row, self.shape[0]),
             slice_len(col, self.shape[1]),
         )
-
         if out_shape[1] == 1:
             return self._get_sliceXint(row, slice_as_int(col, self.shape[1]))
-        elif out_shape[0] == self.shape[0] and out_shape[1] < self.shape[1]:
-            if col.step == 1:
-                return ss.csc_matrix(
-                    self._get_contiguous_compressed_slice(col), shape=out_shape
-                )
+        if col.step != 1:
             return self._get_sliceXarray(row, np.arange(*col.indices(self.shape[1])))
-        return super()._get_sliceXslice(row, col)
+        res = ss.csc_matrix(
+            self._get_contiguous_compressed_slice(col),
+            shape=(self.shape[0], out_shape[1]),
+        )
+        return res if out_shape[0] == self.shape[0] else res[row, :]

     def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix:
         idxs = np.asarray(col)
diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py
index cc0468230..65966ea53 100644
--- a/tests/test_backed_sparse.py
+++ b/tests/test_backed_sparse.py
@@ -1,7 +1,8 @@
 from __future__ import annotations

 from functools import partial
-from typing import TYPE_CHECKING, Callable, Literal
+from itertools import product
+from typing import TYPE_CHECKING, Callable, Literal, get_args

 import h5py
 import numpy as np
@@ -17,11 +18,16 @@
 from anndata.tests.helpers import AccessTrackingStore, assert_equal, subset_func

 if TYPE_CHECKING:
+    from collections.abc import Generator, Sequence
     from pathlib import Path

-    from numpy.typing import ArrayLike
+    from _pytest.mark import ParameterSet
+    from numpy.typing import ArrayLike, NDArray
     from pytest_mock import MockerFixture

+    Idx = slice | int | NDArray[np.integer] | NDArray[np.bool_]
+
+
 subset_func2 = subset_func


@@ -302,7 +308,7 @@ def test_indptr_cache(
     tmp_path: Path,
     sparse_format: Callable[[ArrayLike], sparse.spmatrix],
 ):
-    path = tmp_path / "test.zarr"  # diskfmt is either h5ad or zarr
+    path = tmp_path / "test.zarr"
     a = sparse_format(sparse.random(10, 10))
     f = zarr.open_group(path, "a")
     ad._io.specs.write_elem(f, "X", a)
@@ -318,12 +324,76 @@
     assert store.get_access_count("X/indptr") == 2


+Kind = Literal["slice", "int", "array", "mask"]
+
+
+def mk_idx_kind(idx: Sequence[int], *, kind: Kind, l: int) -> Idx | None:
+    """Convert sequence of consecutive integers (e.g. range with step=1) into different kinds of indexing."""
+    if kind == "slice":
+        start = idx[0] if idx[0] > 0 else None
+        if len(idx) == 1:
+            return slice(start, idx[0] + 1)
+        if all(np.diff(idx) == 1):
+            stop = idx[-1] + 1 if idx[-1] < l - 1 else None
+            return slice(start, stop)
+    if kind == "int":
+        if len(idx) == 1:
+            return idx[0]
+    if kind == "array":
+        return np.asarray(idx)
+    if kind == "mask":
+        return np.isin(np.arange(l), idx)
+    return None
+
+
+def idify(x: object) -> str:
+    if isinstance(x, slice):
+        start, stop = ("" if s is None else str(s) for s in (x.start, x.stop))
+        return f"{start}:{stop}" + (f":{x.step}" if x.step not in (1, None) else "")
+    return str(x)
+
+
+def width_idx_kinds(
+    *idxs: tuple[Sequence[int], Idx, Sequence[str]], l: int
+) -> Generator[ParameterSet, None, None]:
+    """Convert major (first) index into various identical kinds of indexing."""
+    for (idx_maj_raw, idx_min, exp), maj_kind in product(idxs, get_args(Kind)):
+        if (idx_maj := mk_idx_kind(idx_maj_raw, kind=maj_kind, l=l)) is None:
+            continue
+        id_ = "-".join(map(idify, [idx_maj_raw, idx_min, maj_kind]))
+        yield pytest.param(idx_maj, idx_min, exp, id=id_)
+
+
 @pytest.mark.parametrize("sparse_format", [sparse.csr_matrix, sparse.csc_matrix])
+@pytest.mark.parametrize(
+    ("idx_maj", "idx_min", "exp"),
+    width_idx_kinds(
+        (
+            [0],
+            slice(None, None),
+            ["X/data/.zarray", "X/data/.zarray", "X/data/0"],
+        ),
+        (
+            [0],
+            slice(None, 3),
+            ["X/data/.zarray", "X/data/.zarray", "X/data/0"],
+        ),
+        (
+            [3, 4, 5],
+            slice(None, None),
+            ["X/data/.zarray", "X/data/.zarray", "X/data/3", "X/data/4", "X/data/5"],
+        ),
+        l=10,
+    ),
+)
 def test_data_access(
     tmp_path: Path,
     sparse_format: Callable[[ArrayLike], sparse.spmatrix],
+    idx_maj: Idx,
+    idx_min: Idx,
+    exp: Sequence[str],
 ):
-    path = tmp_path / "test.zarr"  # diskfmt is either h5ad or zarr
+    path = tmp_path / "test.zarr"
     a = sparse_format(np.eye(10, 10))
     f = zarr.open_group(path, "a")
     ad._io.specs.write_elem(f, "X", a)
@@ -335,18 +405,18 @@
     store.initialize_key_trackers(["X/data"])
     f = zarr.open_group(store)
     a_disk = sparse_dataset(f["X"])
-    for idx in [slice(0, 1), 0, np.array([0]), np.array([True] + [False] * 9)]:
-        store.reset_key_trackers()
-        if a_disk.format == "csr":
-            a_disk[idx, :]
-        else:
-            a_disk[:, idx]
-        assert store.get_access_count("X/data") == 3
-        assert store.get_accessed_keys("X/data") == [
-            "X/data/.zarray",
-            "X/data/.zarray",
-            "X/data/0",
-        ]
+
+    # Do the slicing with idx
+    store.reset_key_trackers()
+    if a_disk.format == "csr":
+        a_disk[idx_maj, idx_min]
+    else:
+        a_disk[idx_min, idx_maj]
+
+    assert store.get_access_count("X/data") == len(exp), store.get_accessed_keys(
+        "X/data"
+    )
+    assert store.get_accessed_keys("X/data") == exp


 @pytest.mark.parametrize(
@@ -382,7 +452,7 @@ def test_wrong_shape(


 def test_reset_group(tmp_path: Path):
-    path = tmp_path / "test.zarr"  # diskfmt is either h5ad or zarr
+    path = tmp_path / "test.zarr"
     base = sparse.random(100, 100, format="csr")

     if diskfmt == "zarr":
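A small sketch of the 2D slicing pattern PATCH 3/3 fixes, using the same public APIs the tests above rely on ("demo.zarr" is an illustrative path, not from the patch):

    import numpy as np
    import zarr
    from scipy import sparse
    from anndata.experimental import sparse_dataset, write_elem

    # Write a small CSR matrix to a zarr group.
    g = zarr.open_group("demo.zarr", "a")
    write_elem(g, "X", sparse.csr_matrix(np.eye(10)))

    # Slice the backed matrix on both axes at once: the contiguous row
    # slice is read from the compressed data, then the column slice is
    # applied on top, so the result has the expected shape.
    X = sparse_dataset(g["X"])
    sub = X[3:6, :4]
    assert sub.shape == (3, 4)
    assert (sub.toarray() == np.eye(10)[3:6, :4]).all()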