From 0bc2b396b7c48bd37b51e70233e6ac92cbbbf35d Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Mon, 2 Sep 2024 10:20:21 +0200 Subject: [PATCH] (feat): allow reading of `scipy.sparse.cs{c,r}_array` (#1633) * (chore): export `read_elem` and `write_elem` from the main package * (chore): pr number * (fix): agnostic way of importing * (fix): add `RWAble` to `api.md` * (fix): `md` file import * (fix): clarify public backed sparse docstring/api * (chore): small fixes * (fix): `format` + `to_memory` * (chore): remove deprecation tests + `SparseDataset` * (chore): clean up private/public api * (fix): `test_append_overflow_check` used `indptr` * (fix): export `InMemoryElem` * (chore): release note * (chore): move `InMemoryElem` to the "extras" section * Update src/anndata/_core/sparse_dataset.py * (fix): remove dead tests * (feat): allow reading of `scipy.sparse.cs{c,r}_array` * (chore): release note * (fix): add how to do it to release note * (fix): make setting dynamic * (fix): handle min-deps * (fix): don't know why that didn't commit... * (chore): better `msg` * (refactor): change validation to setting object * (fix): add space --------- Co-authored-by: Philipp A. --- docs/release-notes/1633.feature.md | 1 + src/anndata/_core/sparse_dataset.py | 11 +++++++++-- src/anndata/_settings.py | 22 +++++++++++++++++++++- tests/test_backed_sparse.py | 22 ++++++++++++++++++++++ tests/test_io_elementwise.py | 22 +++++++++++++++++++++- tests/test_settings.py | 13 +++++++++++++ 6 files changed, 87 insertions(+), 4 deletions(-) create mode 100644 docs/release-notes/1633.feature.md diff --git a/docs/release-notes/1633.feature.md b/docs/release-notes/1633.feature.md new file mode 100644 index 000000000..13c2399ee --- /dev/null +++ b/docs/release-notes/1633.feature.md @@ -0,0 +1 @@ +Allow reading sparse data (via {func}`~anndata.read_elem` or {func}`~anndata.experimental.sparse_dataset`) into either {class}`scipy.sparse.csr_array` or {class}`scipy.sparse.csc_array` via {attr}`anndata.settings.shall_use_sparse_array_on_read` {user}`ilan-gold` diff --git a/src/anndata/_core/sparse_dataset.py b/src/anndata/_core/sparse_dataset.py index 091ad45f0..049884ac2 100644 --- a/src/anndata/_core/sparse_dataset.py +++ b/src/anndata/_core/sparse_dataset.py @@ -29,6 +29,7 @@ from anndata._core.index import _fix_slice_bounds from anndata.compat import H5Group, ZarrArray, ZarrGroup +from .._settings import settings from ..compat import SpArray, _read_attr try: @@ -236,6 +237,8 @@ def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix: FORMATS = [ BackedFormat("csr", backed_csr_matrix, ss.csr_matrix), BackedFormat("csc", backed_csc_matrix, ss.csc_matrix), + BackedFormat("csr", backed_csr_matrix, ss.csr_array), + BackedFormat("csc", backed_csc_matrix, ss.csc_array), ] @@ -444,7 +447,9 @@ def __getitem__( # If indexing is array x array it returns a backed_sparse_matrix # Not sure what the performance is on that operation if isinstance(sub, BackedSparseMatrix): - return get_memory_class(self.format)(sub) + return get_memory_class( + self.format, use_sparray_in_io=settings.shall_use_sparse_array_on_read + )(sub) else: return sub @@ -582,7 +587,9 @@ def to_memory(self) -> spmatrix | SpArray: ------- The in-memory representation of the sparse matrix. """ - format_class = get_memory_class(self.format) + format_class = get_memory_class( + self.format, use_sparray_in_io=settings.shall_use_sparse_array_on_read + ) mtx = format_class(self.shape, dtype=self.dtype) mtx.data = self.group["data"][...] mtx.indices = self.group["indices"][...] diff --git a/src/anndata/_settings.py b/src/anndata/_settings.py index f8ba1be3b..4e7d17347 100644 --- a/src/anndata/_settings.py +++ b/src/anndata/_settings.py @@ -14,6 +14,7 @@ from types import GenericAlias from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar, cast +from anndata.compat import CAN_USE_SPARSE_ARRAY from anndata.compat.exceptiongroups import add_note if TYPE_CHECKING: @@ -396,7 +397,7 @@ def __doc__(self): ################################################################################## -def validate_bool(val) -> None: +def validate_bool(val: Any) -> None: if not isinstance(val, bool): msg = f"{val} not valid boolean" raise TypeError(msg) @@ -428,5 +429,24 @@ def validate_bool(val) -> None: get_from_env=check_and_get_bool, ) + +def validate_sparse_settings(val: Any) -> None: + validate_bool(val) + if not CAN_USE_SPARSE_ARRAY and cast(bool, val): + msg = ( + "scipy.sparse.cs{r,c}array is not available in current scipy version. " + "Falling back to scipy.sparse.spmatrix for reading." + ) + raise ValueError(msg) + + +settings.register( + "shall_use_sparse_array_on_read", + default_value=False, + description="Whether or not to use `sparse_array` as the default class when reading in data", + validate=validate_sparse_settings, + get_from_env=check_and_get_bool, +) + ################################################################################## ################################################################################## diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index f2ef62387..918872fe6 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -269,6 +269,28 @@ def test_dataset_append_memory( assert_equal(fromdisk, frommem) +@pytest.mark.parametrize("sparse_format", [sparse.csr_matrix, sparse.csc_matrix]) +def test_read_array( + tmp_path: Path, + sparse_format: Callable[[ArrayLike], sparse.spmatrix], + diskfmt: Literal["h5ad", "zarr"], +): + path = tmp_path / f"test.{diskfmt.replace('ad', '')}" + a = sparse_format(sparse.random(100, 100)) + if diskfmt == "zarr": + f = zarr.open_group(path, "a") + else: + f = h5py.File(path, "a") + ad.write_elem(f, "mtx", a) + diskmtx = sparse_dataset(f["mtx"]) + if not CAN_USE_SPARSE_ARRAY: + pytest.skip("scipy.sparse.cs{r,c}array not available") + ad.settings.shall_use_sparse_array_on_read = True + assert issubclass(type(diskmtx[...]), SpArray) + ad.settings.shall_use_sparse_array_on_read = False + assert issubclass(type(diskmtx[...]), sparse.spmatrix) + + @pytest.mark.parametrize( ("sparse_format", "append_method"), [ diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index 91d97d6c9..452139406 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -25,7 +25,7 @@ write_elem, ) from anndata._io.specs.registry import IORegistryError -from anndata.compat import ZarrGroup, _read_attr +from anndata.compat import CAN_USE_SPARSE_ARRAY, SpArray, ZarrGroup, _read_attr from anndata.tests.helpers import ( as_cupy, as_cupy_sparse_dask_array, @@ -35,6 +35,7 @@ ) if TYPE_CHECKING: + from pathlib import Path from typing import Literal, TypeVar from anndata.compat import H5Group @@ -570,3 +571,22 @@ def test_io_pd_cow(store, copy_on_write): write_elem(store, "adata", orig) from_store = read_elem(store["adata"]) assert_equal(orig, from_store) + + +def test_read_sparse_array( + tmp_path: Path, + sparse_format: Literal["csr", "csc"], + diskfmt: Literal["h5ad", "zarr"], +): + path = tmp_path / f"test.{diskfmt.replace('ad', '')}" + a = sparse.random(100, 100, format=sparse_format) + if diskfmt == "zarr": + f = zarr.open_group(path, "a") + else: + f = h5py.File(path, "a") + ad.write_elem(f, "mtx", a) + if not CAN_USE_SPARSE_ARRAY: + pytest.skip("scipy.sparse.cs{r,c}array not available") + ad.settings.shall_use_sparse_array_on_read = True + mtx = ad.read_elem(f["mtx"]) + assert issubclass(type(mtx), SpArray) diff --git a/tests/test_settings.py b/tests/test_settings.py index 871141d92..a9901d5a1 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -243,3 +243,16 @@ class TestEnum(Enum): ) def test_describe(as_rst: bool, expected: str, settings: SettingsManager): assert settings.describe("test_var_3", as_rst=as_rst) == expected + + +def test_shall_use_sparse_array_on_read(): + import anndata as ad + + if not ad.compat.CAN_USE_SPARSE_ARRAY: + with pytest.raises( + ValueError, + match=r"scipy.sparse.cs{r,c}array is not available in current scipy version", + ): + ad.settings.shall_use_sparse_array_on_read = True + else: + ad.settings.shall_use_sparse_array_on_read = True