From 0bc2b396b7c48bd37b51e70233e6ac92cbbbf35d Mon Sep 17 00:00:00 2001
From: Ilan Gold <ilanbassgold@gmail.com>
Date: Mon, 2 Sep 2024 10:20:21 +0200
Subject: [PATCH] (feat): allow reading of `scipy.sparse.cs{c,r}_array` (#1633)

* (chore): export `read_elem` and `write_elem` from the main package

* (chore): pr number

* (fix): agnostic way of importing

* (fix): add `RWAble` to `api.md`

* (fix): `md` file import

* (fix): clarify public backed sparse docstring/api

* (chore): small fixes

* (fix): `format` + `to_memory`

* (chore): remove deprecation tests + `SparseDataset`

* (chore): clean up private/public api

* (fix): `test_append_overflow_check` used `indptr`

* (fix): export `InMemoryElem`

* (chore): release note

* (chore): move `InMemoryElem` to the "extras" section

* Update src/anndata/_core/sparse_dataset.py

* (fix): remove dead tests

* (feat): allow reading of `scipy.sparse.cs{c,r}_array`

* (chore): release note

* (fix): add how to do it to release note

* (fix): make setting dynamic

* (fix): handle min-deps

* (fix): don't know why that didn't commit...

* (chore): better `msg`

* (refactor): change validation to setting object

* (fix): add space

---------

Co-authored-by: Philipp A. <flying-sheep@web.de>
---
 docs/release-notes/1633.feature.md  |  1 +
 src/anndata/_core/sparse_dataset.py | 11 +++++++++--
 src/anndata/_settings.py            | 22 +++++++++++++++++++++-
 tests/test_backed_sparse.py         | 22 ++++++++++++++++++++++
 tests/test_io_elementwise.py        | 22 +++++++++++++++++++++-
 tests/test_settings.py              | 13 +++++++++++++
 6 files changed, 87 insertions(+), 4 deletions(-)
 create mode 100644 docs/release-notes/1633.feature.md

diff --git a/docs/release-notes/1633.feature.md b/docs/release-notes/1633.feature.md
new file mode 100644
index 000000000..13c2399ee
--- /dev/null
+++ b/docs/release-notes/1633.feature.md
@@ -0,0 +1 @@
+Allow reading sparse data (via {func}`~anndata.read_elem` or {func}`~anndata.experimental.sparse_dataset`) into either {class}`scipy.sparse.csr_array` or {class}`scipy.sparse.csc_array` by setting {attr}`anndata.settings.shall_use_sparse_array_on_read` to `True` {user}`ilan-gold`
diff --git a/src/anndata/_core/sparse_dataset.py b/src/anndata/_core/sparse_dataset.py
index 091ad45f0..049884ac2 100644
--- a/src/anndata/_core/sparse_dataset.py
+++ b/src/anndata/_core/sparse_dataset.py
@@ -29,6 +29,7 @@
 from anndata._core.index import _fix_slice_bounds
 from anndata.compat import H5Group, ZarrArray, ZarrGroup
 
+from .._settings import settings
 from ..compat import SpArray, _read_attr
 
 try:
@@ -236,6 +237,8 @@ def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix:
 FORMATS = [
     BackedFormat("csr", backed_csr_matrix, ss.csr_matrix),
     BackedFormat("csc", backed_csc_matrix, ss.csc_matrix),
+    BackedFormat("csr", backed_csr_matrix, ss.csr_array),
+    BackedFormat("csc", backed_csc_matrix, ss.csc_array),
 ]
 
 
@@ -444,7 +447,9 @@ def __getitem__(
         # If indexing is array x array it returns a backed_sparse_matrix
         # Not sure what the performance is on that operation
         if isinstance(sub, BackedSparseMatrix):
-            return get_memory_class(self.format)(sub)
+            return get_memory_class(
+                self.format, use_sparray_in_io=settings.shall_use_sparse_array_on_read
+            )(sub)
         else:
             return sub
 
@@ -582,7 +587,9 @@ def to_memory(self) -> spmatrix | SpArray:
         -------
         The in-memory representation of the sparse matrix.
         """
-        format_class = get_memory_class(self.format)
+        format_class = get_memory_class(
+            self.format, use_sparray_in_io=settings.shall_use_sparse_array_on_read
+        )
         mtx = format_class(self.shape, dtype=self.dtype)
         mtx.data = self.group["data"][...]
         mtx.indices = self.group["indices"][...]
diff --git a/src/anndata/_settings.py b/src/anndata/_settings.py
index f8ba1be3b..4e7d17347 100644
--- a/src/anndata/_settings.py
+++ b/src/anndata/_settings.py
@@ -14,6 +14,7 @@
 from types import GenericAlias
 from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar, cast
 
+from anndata.compat import CAN_USE_SPARSE_ARRAY
 from anndata.compat.exceptiongroups import add_note
 
 if TYPE_CHECKING:
@@ -396,7 +397,7 @@ def __doc__(self):
 ##################################################################################
 
 
-def validate_bool(val) -> None:
+def validate_bool(val: Any) -> None:
     if not isinstance(val, bool):
         msg = f"{val} not valid boolean"
         raise TypeError(msg)
@@ -428,5 +429,24 @@ def validate_bool(val) -> None:
     get_from_env=check_and_get_bool,
 )
 
+
+def validate_sparse_settings(val: Any) -> None:
+    """Raise unless `val` is a bool; reject `True` if `CAN_USE_SPARSE_ARRAY` is false."""
+    validate_bool(val)
+    if CAN_USE_SPARSE_ARRAY or not cast(bool, val):
+        return
+    msg = "scipy.sparse.cs{r,c}array is not available in current scipy version. "
+    msg += "Falling back to scipy.sparse.spmatrix for reading."
+    raise ValueError(msg)
+
+
+settings.register(
+    "shall_use_sparse_array_on_read",
+    default_value=False,
+    description="Whether or not to use `sparse_array` as the default class when reading in data",
+    validate=validate_sparse_settings,
+    get_from_env=check_and_get_bool,
+)
+
 ##################################################################################
 ##################################################################################
diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py
index f2ef62387..918872fe6 100644
--- a/tests/test_backed_sparse.py
+++ b/tests/test_backed_sparse.py
@@ -269,6 +269,28 @@ def test_dataset_append_memory(
     assert_equal(fromdisk, frommem)
 
 
+@pytest.mark.parametrize("sparse_format", [sparse.csr_matrix, sparse.csc_matrix])
+def test_read_array(
+    tmp_path: Path,
+    sparse_format: Callable[[ArrayLike], sparse.spmatrix],
+    diskfmt: Literal["h5ad", "zarr"],
+):
+    """Indexing a backed sparse dataset honours `shall_use_sparse_array_on_read`."""
+    path = tmp_path / f"test.{diskfmt.replace('ad', '')}"
+    a = sparse_format(sparse.random(100, 100))
+    f = zarr.open_group(path, "a") if diskfmt == "zarr" else h5py.File(path, "a")
+    ad.write_elem(f, "mtx", a)
+    diskmtx = sparse_dataset(f["mtx"])
+    if not CAN_USE_SPARSE_ARRAY:
+        pytest.skip("scipy.sparse.cs{r,c}array not available")
+    # `override` restores the previous value on exit, even if the assert fails,
+    # so the global setting cannot leak into other tests.
+    with ad.settings.override(shall_use_sparse_array_on_read=True):
+        assert issubclass(type(diskmtx[...]), SpArray)
+    with ad.settings.override(shall_use_sparse_array_on_read=False):
+        assert issubclass(type(diskmtx[...]), sparse.spmatrix)
+
+
 @pytest.mark.parametrize(
     ("sparse_format", "append_method"),
     [
diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index 91d97d6c9..452139406 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -25,7 +25,7 @@
     write_elem,
 )
 from anndata._io.specs.registry import IORegistryError
-from anndata.compat import ZarrGroup, _read_attr
+from anndata.compat import CAN_USE_SPARSE_ARRAY, SpArray, ZarrGroup, _read_attr
 from anndata.tests.helpers import (
     as_cupy,
     as_cupy_sparse_dask_array,
@@ -35,6 +35,7 @@
 )
 
 if TYPE_CHECKING:
+    from pathlib import Path
     from typing import Literal, TypeVar
 
     from anndata.compat import H5Group
@@ -570,3 +571,22 @@ def test_io_pd_cow(store, copy_on_write):
         write_elem(store, "adata", orig)
         from_store = read_elem(store["adata"])
         assert_equal(orig, from_store)
+
+
+def test_read_sparse_array(
+    tmp_path: Path,
+    sparse_format: Literal["csr", "csc"],
+    diskfmt: Literal["h5ad", "zarr"],
+):
+    """`read_elem` returns a sparse array when the setting is switched on."""
+    path = tmp_path / f"test.{diskfmt.replace('ad', '')}"
+    a = sparse.random(100, 100, format=sparse_format)
+    f = zarr.open_group(path, "a") if diskfmt == "zarr" else h5py.File(path, "a")
+    ad.write_elem(f, "mtx", a)
+    if not CAN_USE_SPARSE_ARRAY:
+        pytest.skip("scipy.sparse.cs{r,c}array not available")
+    # `override` restores the previous value on exit, so enabling the setting
+    # here cannot leak into tests that run afterwards.
+    with ad.settings.override(shall_use_sparse_array_on_read=True):
+        mtx = ad.read_elem(f["mtx"])
+    assert issubclass(type(mtx), SpArray)
diff --git a/tests/test_settings.py b/tests/test_settings.py
index 871141d92..a9901d5a1 100644
--- a/tests/test_settings.py
+++ b/tests/test_settings.py
@@ -243,3 +243,16 @@ class TestEnum(Enum):
 )
 def test_describe(as_rst: bool, expected: str, settings: SettingsManager):
     assert settings.describe("test_var_3", as_rst=as_rst) == expected
+
+
+def test_shall_use_sparse_array_on_read():
+    """Enabling the setting works only when `CAN_USE_SPARSE_ARRAY` is true."""
+    import anndata as ad
+    if ad.compat.CAN_USE_SPARSE_ARRAY:
+        # `override` restores the previous value so later tests see the default.
+        with ad.settings.override(shall_use_sparse_array_on_read=True):
+            assert ad.settings.shall_use_sparse_array_on_read is True
+        return
+    msg = r"scipy\.sparse\.cs\{r,c\}array is not available in current scipy version"
+    with pytest.raises(ValueError, match=msg):
+        ad.settings.shall_use_sparse_array_on_read = True