From 686786db35cafd8cbd7ebcc0128d5239fb06bce7 Mon Sep 17 00:00:00 2001
From: Ilan Gold
Date: Thu, 4 Jul 2024 10:54:45 +0200
Subject: [PATCH 1/3] (feat): allow `join=outer` for `concat_on_disk` (#1504)

* (feat): allow `join=outer` for `concat_on_disk`

* (refactor): don't need to redeclare types

* (chore): release note

* (chore): add more rigorous union/diff check

* simplify

* (chore): clean up interleaving naming of axis

* simplify

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: Phil Schaf
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 docs/release-notes/0.10.9.md      |   2 +
 src/anndata/experimental/merge.py |   2 -
 tests/test_concatenate_disk.py    | 140 +++++++++++++-----------------
 3 files changed, 60 insertions(+), 84 deletions(-)

diff --git a/docs/release-notes/0.10.9.md b/docs/release-notes/0.10.9.md
index ddfb0274d..1fc7a1ca7 100644
--- a/docs/release-notes/0.10.9.md
+++ b/docs/release-notes/0.10.9.md
@@ -10,3 +10,5 @@
 #### Documentation

 #### Performance
+
+* Support for `concat_on_disk` outer join {pr}`1504` {user}`ilan-gold`
diff --git a/src/anndata/experimental/merge.py b/src/anndata/experimental/merge.py
index aa6f47e9b..040e7c9dd 100644
--- a/src/anndata/experimental/merge.py
+++ b/src/anndata/experimental/merge.py
@@ -538,8 +538,6 @@ def concat_on_disk(
     # Argument normalization
     if pairwise:
         raise NotImplementedError("pairwise concatenation not yet implemented")
-    if join != "inner":
-        raise NotImplementedError("only inner join is currently supported")
     merge = resolve_merge_strategy(merge)
     uns_merge = resolve_merge_strategy(uns_merge)
diff --git a/tests/test_concatenate_disk.py b/tests/test_concatenate_disk.py
index 640ae446b..216b66e21 100644
--- a/tests/test_concatenate_disk.py
+++ b/tests/test_concatenate_disk.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from collections.abc import Mapping
+from typing import TYPE_CHECKING, Literal

 import numpy as np
 import pandas as pd
@@ -8,6 +9,7 @@
 from scipy import sparse

 from anndata import AnnData, concat
+from anndata._core.merge import _resolve_axis
 from anndata.experimental import read_elem, write_elem
 from anndata.experimental.merge import as_group, concat_on_disk
 from anndata.tests.helpers import (
@@ -16,6 +18,10 @@
 )
 from anndata.utils import asarray

+if TYPE_CHECKING:
+    from pathlib import Path
+
+
 GEN_ADATA_OOC_CONCAT_ARGS = dict(
     obsm_types=(
         sparse.csr_matrix,
@@ -28,30 +34,28 @@


 @pytest.fixture(params=[0, 1])
-def axis(request):
+def axis(request) -> Literal[0, 1]:
     return request.param


-@pytest.fixture(
-    params=["array", "sparse", "sparse_array"],
-)
-def array_type(request):
+@pytest.fixture(params=["array", "sparse", "sparse_array"])
+def array_type(request) -> Literal["array", "sparse", "sparse_array"]:
     return request.param


-@pytest.fixture(params=["inner"])
-def join_type(request):
+@pytest.fixture(params=["inner", "outer"])
+def join_type(request) -> Literal["inner", "outer"]:
     return request.param


 @pytest.fixture(params=["zarr", "h5ad"])
-def file_format(request):
+def file_format(request) -> Literal["zarr", "h5ad"]:
     return request.param


 # trying with 10 should be slow but will guarantee that the feature is being used
 @pytest.fixture(params=[10, 100_000_000])
-def max_loaded_elems(request):
+def max_loaded_elems(request) -> int:
     return request.param


@@ -77,13 +81,18 @@ def _adatas_to_paths(adatas, tmp_path, file_format):


 def assert_eq_concat_on_disk(
-    adatas, tmp_path, file_format, max_loaded_elems=None, *args, **kwargs
+    adatas,
+    tmp_path: Path,
+    file_format: Literal["zarr", "h5ad"],
+    max_loaded_elems: int | None = None,
+    *args,
+    **kwargs,
 ):
     # create one from the concat function
     res1 = concat(adatas, *args, **kwargs)
     # create one from the on disk concat function
     paths = _adatas_to_paths(adatas, tmp_path, file_format)
-    out_name = tmp_path / ("out." + file_format)
+    out_name = tmp_path / f"out.{file_format}"
     if max_loaded_elems is not None:
         kwargs["max_loaded_elems"] = max_loaded_elems
     concat_on_disk(paths, out_name, *args, **kwargs)
@@ -93,84 +102,47 @@


 def get_array_type(array_type, axis):
     if array_type == "sparse":
-        if axis == 0:
-            return sparse.csr_matrix
-        return sparse.csc_matrix
+        return sparse.csr_matrix if axis == 0 else sparse.csc_matrix
     if array_type == "sparse_array":
-        if axis == 0:
-            return sparse.csr_array
-        return sparse.csc_array
+        return sparse.csr_array if axis == 0 else sparse.csc_array
     if array_type == "array":
         return asarray
-    else:
-        raise NotImplementedError(f"array_type {array_type} not implemented")
-
-
-def test_anndatas_without_reindex(
-    axis, array_type, join_type, tmp_path, max_loaded_elems, file_format
+    raise NotImplementedError(f"array_type {array_type} not implemented")
+
+
+@pytest.mark.parametrize("reindex", [True, False], ids=["reindex", "no_reindex"])
+def test_anndatas(
+    axis: Literal[0, 1],
+    array_type: Literal["array", "sparse", "sparse_array"],
+    join_type: Literal["inner", "outer"],
+    tmp_path: Path,
+    max_loaded_elems: int,
+    file_format: Literal["zarr", "h5ad"],
+    reindex: bool,
 ):
-    N = 50
-    M = 50
-    sparse_fmt = "csr"
-    adatas = []
-    for i in range(5):
-        if axis == 0:
-            M = np.random.randint(1, 100)
-        else:
-            N = np.random.randint(1, 100)
-            sparse_fmt = "csc"
-
-        a = gen_adata(
-            (M, N),
-            X_type=get_array_type(array_type, axis),
-            sparse_fmt=sparse_fmt,
-            **GEN_ADATA_OOC_CONCAT_ARGS,
+    _, off_axis_name = _resolve_axis(1 - axis)
+    random_axes = {0, 1} if reindex else {axis}
+    sparse_fmt = "csr" if axis == 0 else "csc"
+    kw = (
+        GEN_ADATA_OOC_CONCAT_ARGS
+        if not reindex
+        else dict(
+            obsm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame),
+            varm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame),
+            layers_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame),
         )
-        if axis == 0:
-            a.obs_names = f"{i}-" + a.obs_names
-        else:
-            a.var_names = f"{i}-" + a.var_names
-        adatas.append(a)
-
-    assert_eq_concat_on_disk(
-        adatas,
-        tmp_path,
-        file_format,
-        max_loaded_elems,
-        axis=axis,
-        join=join_type,
     )
-
-def test_anndatas_with_reindex(
-    axis, array_type, join_type, tmp_path, file_format, max_loaded_elems
-):
-    N = 50
-    M = 50
     adatas = []
-
-    sparse_fmt = "csc"
-    if axis == 0:
-        sparse_fmt = "csr"
-
-    for _ in range(5):
-        M = np.random.randint(1, 100)
-        N = np.random.randint(1, 100)
-
+    for i in range(5):
+        M, N = (np.random.randint(1, 100) if a in random_axes else 50 for a in (0, 1))
         a = gen_adata(
-            (M, N),
-            X_type=get_array_type(array_type, axis),
-            sparse_fmt=sparse_fmt,
-            obsm_types=(
-                get_array_type("sparse", 1 - axis),
-                np.ndarray,
-                pd.DataFrame,
-            ),
-            varm_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame),
-            layers_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame),
+            (M, N), X_type=get_array_type(array_type, axis), sparse_fmt=sparse_fmt, **kw
        )
-        a.layers["sparse"] = get_array_type("sparse", axis)(a.layers["sparse"])
-        a.varm["sparse"] = get_array_type("sparse", 1 - axis)(a.varm["sparse"])
+        # ensure some names overlap, others do not, for the off-axis so that inner/outer is properly tested
+        off_names = getattr(a, f"{off_axis_name}_names").array
+        off_names[1::2] = f"{i}-" + off_names[1::2]
+        setattr(a, f"{off_axis_name}_names", off_names)
         adatas.append(a)

     assert_eq_concat_on_disk(
@@ -208,7 +180,7 @@ def test_concat_ordered_categoricals_retained(tmp_path, file_format):


 @pytest.fixture()
-def obsm_adatas():
+def xxxm_adatas():
     def gen_index(n):
         return [f"cell{i}" for i in range(n)]

@@ -256,8 +228,12 @@ def gen_index(n):
     ]


-def test_concatenate_obsm_inner(obsm_adatas, tmp_path, file_format):
-    assert_eq_concat_on_disk(obsm_adatas, tmp_path, file_format, join="inner")
+def test_concatenate_xxxm(xxxm_adatas, tmp_path, file_format, join_type):
+    if join_type == "outer":
+        for i in range(len(xxxm_adatas)):
+            xxxm_adatas[i] = xxxm_adatas[i].T
+            xxxm_adatas[i].X = sparse.csr_matrix(xxxm_adatas[i].X)
+    assert_eq_concat_on_disk(xxxm_adatas, tmp_path, file_format, join=join_type)


 def test_output_dir_exists(tmp_path):
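A minimal usage sketch of what PATCH 1/3 enables (not part of the patch itself; the file names and toy data below are illustrative):

    import numpy as np
    from anndata import AnnData
    from anndata.experimental import concat_on_disk

    # Two small AnnDatas with partially overlapping var_names.
    a = AnnData(X=np.ones((3, 2)))
    a.obs_names = ["c0", "c1", "c2"]
    a.var_names = ["gene_a", "gene_b"]
    b = AnnData(X=np.zeros((2, 2)))
    b.obs_names = ["c3", "c4"]
    b.var_names = ["gene_b", "gene_c"]
    a.write_h5ad("a.h5ad")
    b.write_h5ad("b.h5ad")

    # join="outer" keeps the union of var_names; before this patch any
    # non-inner join raised NotImplementedError.
    concat_on_disk(["a.h5ad", "b.h5ad"], "joined.h5ad", join="outer")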
From 9664ce43af2300318ed02437d8aaf1bc99e1f714 Mon Sep 17 00:00:00 2001
From: Ilan Gold
Date: Thu, 4 Jul 2024 10:55:23 +0200
Subject: [PATCH 2/3] (fix): add warning for setting `X` on view with repeat indices (#1501)

* (fix): add warning for setting `X` on view with repeat indices

* (chore): release note

* (fix): add check on indices for np array-ness

* (fix): only warn with matrix value setter

* (chore): update message

* (fix): message

* (fix): `ravel` idx before checking `len`

* fmt

* fmt

---------

Co-authored-by: Phil Schaf
---
 docs/release-notes/0.10.9.md |  1 +
 src/anndata/_core/anndata.py | 21 +++++++++++++++++----
 tests/test_x.py              | 11 +++++++++++
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/docs/release-notes/0.10.9.md b/docs/release-notes/0.10.9.md
index 1fc7a1ca7..a0beab5f2 100644
--- a/docs/release-notes/0.10.9.md
+++ b/docs/release-notes/0.10.9.md
@@ -3,6 +3,7 @@

 #### Bugfix

+* Add warning for setting `X` on a view with repeated indices {pr}`1501` {user}`ilan-gold`
 * Coerce {class}`numpy.matrix` classes to arrays when trying to store them in `AnnData` {pr}`1516` {user}`flying-sheep`
 * Fix for setting a dense `X` view with a sparse matrix {pr}`1532` {user}`ilan-gold`
 * Upper bound {mod}`numpy` for `gpu` installation on account of https://github.com/cupy/cupy/issues/8391 {pr}`1540` {user}`ilan-gold`
diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py
index 4dbfcc254..d316405e1 100644
--- a/src/anndata/_core/anndata.py
+++ b/src/anndata/_core/anndata.py
@@ -601,10 +601,23 @@ def X(self, value: np.ndarray | sparse.spmatrix | SpArray | None):
             or (self.n_vars == 1 and self.n_obs == len(value))
             or (self.n_obs == 1 and self.n_vars == len(value))
         ):
-            if not np.isscalar(value) and self.shape != value.shape:
-                # For assigning vector of values to 2d array or matrix
-                # Not necessary for row of 2d array
-                value = value.reshape(self.shape)
+            if not np.isscalar(value):
+                if self.is_view and any(
+                    isinstance(idx, np.ndarray)
+                    and len(np.unique(idx)) != len(idx.ravel())
+                    for idx in [oidx, vidx]
+                ):
+                    msg = (
+                        "You are attempting to set `X` to a matrix on a view which has non-unique indices. "
+                        "The resulting `adata.X` will likely not equal the value to which you set it. "
+                        "To avoid this potential issue, please make a copy of the data first. "
+                        "In the future, this operation will throw an error."
+                    )
+                    warnings.warn(msg, FutureWarning, stacklevel=1)
+                if self.shape != value.shape:
+                    # For assigning vector of values to 2d array or matrix
+                    # Not necessary for row of 2d array
+                    value = value.reshape(self.shape)
             if self.isbacked:
                 if self.is_view:
                     X = self.file["X"]
diff --git a/tests/test_x.py b/tests/test_x.py
index c8d60f249..2b4504158 100644
--- a/tests/test_x.py
+++ b/tests/test_x.py
@@ -41,6 +41,17 @@ def test_setter_singular_dim(shape, orig_array_type, new_array_type):
     assert isinstance(adata.X, type(to_assign))


+def test_repeat_indices_view():
+    adata = gen_adata((10, 10), X_type=np.asarray)
+    subset = adata[[0, 0, 1, 1], :]
+    mat = np.array([np.ones(adata.shape[1]) * i for i in range(4)])
+    with pytest.warns(
+        FutureWarning,
+        match=r"You are attempting to set `X` to a matrix on a view which has non-unique indices",
+    ):
+        subset.X = mat
+
+
 @pytest.mark.parametrize("orig_array_type", UNLABELLED_ARRAY_TYPES)
 @pytest.mark.parametrize("new_array_type", UNLABELLED_ARRAY_TYPES)
 def test_setter_view(orig_array_type, new_array_type):
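A minimal reproduction of the situation PATCH 2/3 warns about, mirroring the added test (shapes and values are illustrative, not taken from the patch):

    import numpy as np
    import pytest
    from anndata import AnnData

    adata = AnnData(X=np.arange(100, dtype="float64").reshape(10, 10))
    subset = adata[[0, 0, 1, 1], :]  # rows 0 and 1 each selected twice

    # Writing a matrix back through a view with non-unique indices is
    # ambiguous, so the setter now emits a FutureWarning first.
    with pytest.warns(FutureWarning, match="non-unique indices"):
        subset.X = np.ones((4, 10))

    # The safe pattern suggested by the warning: copy before assigning.
    safe = adata[[0, 0, 1, 1], :].copy()
    safe.X = np.ones((4, 10))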
From 6d3392b0fb32a77b6f4ddfee1ed397a81f0139b7 Mon Sep 17 00:00:00 2001
From: Philipp A
Date: Thu, 4 Jul 2024 11:00:27 +0200
Subject: [PATCH 3/3] Fix sparse dataset 2D slicing (#1523)

Co-authored-by: Ilan Gold
---
 src/anndata/_core/sparse_dataset.py |  25 ++++---
 tests/test_backed_sparse.py         | 104 +++++++++++++++++++++++-----
 2 files changed, 99 insertions(+), 30 deletions(-)

diff --git a/src/anndata/_core/sparse_dataset.py b/src/anndata/_core/sparse_dataset.py
index b543c11d3..7a1db15b2 100644
--- a/src/anndata/_core/sparse_dataset.py
+++ b/src/anndata/_core/sparse_dataset.py
@@ -175,13 +175,13 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix:
         )
         if out_shape[0] == 1:
             return self._get_intXslice(slice_as_int(row, self.shape[0]), col)
-        elif out_shape[1] == self.shape[1] and out_shape[0] < self.shape[0]:
-            if row.step == 1:
-                return ss.csr_matrix(
-                    self._get_contiguous_compressed_slice(row), shape=out_shape
-                )
+        if row.step != 1:
             return self._get_arrayXslice(np.arange(*row.indices(self.shape[0])), col)
-        return super()._get_sliceXslice(row, col)
+        res = ss.csr_matrix(
+            self._get_contiguous_compressed_slice(row),
+            shape=(out_shape[0], self.shape[1]),
+        )
+        return res if out_shape[1] == self.shape[1] else res[:, col]

     def _get_arrayXslice(self, row: Sequence[int], col: slice) -> ss.csr_matrix:
         idxs = np.asarray(row)
@@ -208,16 +208,15 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csc_matrix:
             slice_len(row, self.shape[0]),
             slice_len(col, self.shape[1]),
         )
-
         if out_shape[1] == 1:
             return self._get_sliceXint(row, slice_as_int(col, self.shape[1]))
-        elif out_shape[0] == self.shape[0] and out_shape[1] < self.shape[1]:
-            if col.step == 1:
-                return ss.csc_matrix(
-                    self._get_contiguous_compressed_slice(col), shape=out_shape
-                )
+        if col.step != 1:
             return self._get_sliceXarray(row, np.arange(*col.indices(self.shape[1])))
-        return super()._get_sliceXslice(row, col)
+        res = ss.csc_matrix(
+            self._get_contiguous_compressed_slice(col),
+            shape=(self.shape[0], out_shape[1]),
+        )
+        return res if out_shape[0] == self.shape[0] else res[row, :]

     def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix:
         idxs = np.asarray(col)
diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py
index cc0468230..65966ea53 100644
--- a/tests/test_backed_sparse.py
+++ b/tests/test_backed_sparse.py
@@ -1,7 +1,8 @@
 from __future__ import annotations

 from functools import partial
-from typing import TYPE_CHECKING, Callable, Literal
+from itertools import product
+from typing import TYPE_CHECKING, Callable, Literal, get_args

 import h5py
 import numpy as np
@@ -17,11 +18,16 @@
 from anndata.tests.helpers import AccessTrackingStore, assert_equal, subset_func

 if TYPE_CHECKING:
+    from collections.abc import Generator, Sequence
     from pathlib import Path

-    from numpy.typing import ArrayLike
+    from _pytest.mark import ParameterSet
+    from numpy.typing import ArrayLike, NDArray
     from pytest_mock import MockerFixture

+    Idx = slice | int | NDArray[np.integer] | NDArray[np.bool_]
+
+
 subset_func2 = subset_func


@@ -302,7 +308,7 @@ def test_indptr_cache(
     tmp_path: Path,
     sparse_format: Callable[[ArrayLike], sparse.spmatrix],
 ):
-    path = tmp_path / "test.zarr"  # diskfmt is either h5ad or zarr
+    path = tmp_path / "test.zarr"
     a = sparse_format(sparse.random(10, 10))
     f = zarr.open_group(path, "a")
     ad._io.specs.write_elem(f, "X", a)
@@ -318,12 +324,76 @@
     assert store.get_access_count("X/indptr") == 2


+Kind = Literal["slice", "int", "array", "mask"]
+
+
+def mk_idx_kind(idx: Sequence[int], *, kind: Kind, l: int) -> Idx | None:
+    """Convert sequence of consecutive integers (e.g. range with step=1) into different kinds of indexing."""
+    if kind == "slice":
+        start = idx[0] if idx[0] > 0 else None
+        if len(idx) == 1:
+            return slice(start, idx[0] + 1)
+        if all(np.diff(idx) == 1):
+            stop = idx[-1] + 1 if idx[-1] < l - 1 else None
+            return slice(start, stop)
+    if kind == "int":
+        if len(idx) == 1:
+            return idx[0]
+    if kind == "array":
+        return np.asarray(idx)
+    if kind == "mask":
+        return np.isin(np.arange(l), idx)
+    return None
+
+
+def idify(x: object) -> str:
+    if isinstance(x, slice):
+        start, stop = ("" if s is None else str(s) for s in (x.start, x.stop))
+        return f"{start}:{stop}" + (f":{x.step}" if x.step not in (1, None) else "")
+    return str(x)
+
+
+def width_idx_kinds(
+    *idxs: tuple[Sequence[int], Idx, Sequence[str]], l: int
+) -> Generator[ParameterSet, None, None]:
+    """Convert major (first) index into various identical kinds of indexing."""
+    for (idx_maj_raw, idx_min, exp), maj_kind in product(idxs, get_args(Kind)):
+        if (idx_maj := mk_idx_kind(idx_maj_raw, kind=maj_kind, l=l)) is None:
+            continue
+        id_ = "-".join(map(idify, [idx_maj_raw, idx_min, maj_kind]))
+        yield pytest.param(idx_maj, idx_min, exp, id=id_)
+
+
 @pytest.mark.parametrize("sparse_format", [sparse.csr_matrix, sparse.csc_matrix])
+@pytest.mark.parametrize(
+    ("idx_maj", "idx_min", "exp"),
+    width_idx_kinds(
+        (
+            [0],
+            slice(None, None),
+            ["X/data/.zarray", "X/data/.zarray", "X/data/0"],
+        ),
+        (
+            [0],
+            slice(None, 3),
+            ["X/data/.zarray", "X/data/.zarray", "X/data/0"],
+        ),
+        (
+            [3, 4, 5],
+            slice(None, None),
+            ["X/data/.zarray", "X/data/.zarray", "X/data/3", "X/data/4", "X/data/5"],
+        ),
+        l=10,
+    ),
+)
 def test_data_access(
     tmp_path: Path,
     sparse_format: Callable[[ArrayLike], sparse.spmatrix],
+    idx_maj: Idx,
+    idx_min: Idx,
+    exp: Sequence[str],
 ):
-    path = tmp_path / "test.zarr"  # diskfmt is either h5ad or zarr
+    path = tmp_path / "test.zarr"
     a = sparse_format(np.eye(10, 10))
     f = zarr.open_group(path, "a")
     ad._io.specs.write_elem(f, "X", a)
@@ -335,18 +405,18 @@
     store.initialize_key_trackers(["X/data"])
     f = zarr.open_group(store)
     a_disk = sparse_dataset(f["X"])
-    for idx in [slice(0, 1), 0, np.array([0]), np.array([True] + [False] * 9)]:
-        store.reset_key_trackers()
-        if a_disk.format == "csr":
-            a_disk[idx, :]
-        else:
-            a_disk[:, idx]
-        assert store.get_access_count("X/data") == 3
-        assert store.get_accessed_keys("X/data") == [
-            "X/data/.zarray",
-            "X/data/.zarray",
-            "X/data/0",
-        ]
+
+    # Do the slicing with idx
+    store.reset_key_trackers()
+    if a_disk.format == "csr":
+        a_disk[idx_maj, idx_min]
+    else:
+        a_disk[idx_min, idx_maj]
+
+    assert store.get_access_count("X/data") == len(exp), store.get_accessed_keys(
+        "X/data"
+    )
+    assert store.get_accessed_keys("X/data") == exp


 @pytest.mark.parametrize(
@@ -382,7 +452,7 @@ def test_wrong_shape(


 def test_reset_group(tmp_path: Path):
-    path = tmp_path / "test.zarr"  # diskfmt is either h5ad or zarr
+    path = tmp_path / "test.zarr"
     base = sparse.random(100, 100, format="csr")

     if diskfmt == "zarr":
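A small sketch of the 2D slicing pattern PATCH 3/3 fixes, using the same public APIs the tests above rely on ("demo.zarr" is an illustrative path, not from the patch):

    import numpy as np
    import zarr
    from scipy import sparse
    from anndata.experimental import sparse_dataset, write_elem

    # Write a small CSR matrix to a zarr group.
    g = zarr.open_group("demo.zarr", "a")
    write_elem(g, "X", sparse.csr_matrix(np.eye(10)))

    # Slice the backed matrix on both axes at once: the contiguous row
    # slice is read from the compressed data, then the column slice is
    # applied on top, so the result has the expected shape.
    X = sparse_dataset(g["X"])
    sub = X[3:6, :4]
    assert sub.shape == (3, 4)
    assert (sub.toarray() == np.eye(10)[3:6, :4]).all()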