Skip to content

Commit

Permalink
(feat): allow reading of scipy.sparse.cs{c,r}_array (#1633)
Browse files Browse the repository at this point in the history
* (chore): export `read_elem` and `write_elem` from the main package

* (chore): pr number

* (fix): agnostic way of importing

* (fix): add `RWAble` to `api.md`

* (fix): `md` file import

* (fix): clarify public backed sparse docstring/api

* (chore): small fixes

* (fix): `format` + `to_memory`

* (chore): remove deprecation tests + `SparseDataset`

* (chore): clean up private/public api

* (fix): `test_append_overflow_check` used `indptr`

* (fix): export `InMemoryElem`

* (chore): release note

* (chore): move `InMemoryElem` to the "extras" section

* Update src/anndata/_core/sparse_dataset.py

* (fix): remove dead tests

* (feat): allow reading of `scipy.sparse.cs{c,r}_array`

* (chore): release note

* (fix): add how to do it to release note

* (fix): make setting dynamic

* (fix): handle min-deps

* (fix): don't know why that didn't commit...

* (chore): better `msg`

* (refactor): change validation to setting object

* (fix): add space

---------

Co-authored-by: Philipp A. <[email protected]>
  • Loading branch information
ilan-gold and flying-sheep authored Sep 2, 2024
1 parent 53537b5 commit 0bc2b39
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 4 deletions.
1 change: 1 addition & 0 deletions docs/release-notes/1633.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Allow reading sparse data (via {func}`~anndata.read_elem` or {func}`~anndata.experimental.sparse_dataset`) into either {class}`scipy.sparse.csr_array` or {class}`scipy.sparse.csc_array` by enabling the setting {attr}`anndata.settings.shall_use_sparse_array_on_read` {user}`ilan-gold`
11 changes: 9 additions & 2 deletions src/anndata/_core/sparse_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from anndata._core.index import _fix_slice_bounds
from anndata.compat import H5Group, ZarrArray, ZarrGroup

from .._settings import settings
from ..compat import SpArray, _read_attr

try:
Expand Down Expand Up @@ -236,6 +237,8 @@ def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix:
# Registry of backed sparse formats. Each format name ("csr" / "csc") appears
# twice with the same backed class: once paired with the scipy ``*_matrix``
# class and once paired with the scipy ``*_array`` class.
# NOTE(review): presumably `get_memory_class(..., use_sparray_in_io=...)` picks
# between the two entries for a given format — confirm against its definition.
FORMATS = [
BackedFormat("csr", backed_csr_matrix, ss.csr_matrix),
BackedFormat("csc", backed_csc_matrix, ss.csc_matrix),
BackedFormat("csr", backed_csr_matrix, ss.csr_array),
BackedFormat("csc", backed_csc_matrix, ss.csc_array),
]


Expand Down Expand Up @@ -444,7 +447,9 @@ def __getitem__(
# If indexing is array x array it returns a backed_sparse_matrix
# Not sure what the performance is on that operation
if isinstance(sub, BackedSparseMatrix):
return get_memory_class(self.format)(sub)
return get_memory_class(
self.format, use_sparray_in_io=settings.shall_use_sparse_array_on_read
)(sub)
else:
return sub

Expand Down Expand Up @@ -582,7 +587,9 @@ def to_memory(self) -> spmatrix | SpArray:
-------
The in-memory representation of the sparse matrix.
"""
format_class = get_memory_class(self.format)
format_class = get_memory_class(
self.format, use_sparray_in_io=settings.shall_use_sparse_array_on_read
)
mtx = format_class(self.shape, dtype=self.dtype)
mtx.data = self.group["data"][...]
mtx.indices = self.group["indices"][...]
Expand Down
22 changes: 21 additions & 1 deletion src/anndata/_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from types import GenericAlias
from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar, cast

from anndata.compat import CAN_USE_SPARSE_ARRAY
from anndata.compat.exceptiongroups import add_note

if TYPE_CHECKING:
Expand Down Expand Up @@ -396,7 +397,7 @@ def __doc__(self):
##################################################################################


def validate_bool(val) -> None:
def validate_bool(val: Any) -> None:
    """Accept only real ``bool`` values.

    Raises
    ------
    TypeError
        If ``val`` is not an instance of ``bool``.
    """
    if isinstance(val, bool):
        return
    msg = f"{val} not valid boolean"
    raise TypeError(msg)
Expand Down Expand Up @@ -428,5 +429,24 @@ def validate_bool(val) -> None:
get_from_env=check_and_get_bool,
)


def validate_sparse_settings(val: Any) -> None:
    """Validate a value for the ``shall_use_sparse_array_on_read`` setting.

    Parameters
    ----------
    val
        Candidate value for the setting.

    Raises
    ------
    TypeError
        If ``val`` is not a ``bool`` (raised by ``validate_bool``).
    ValueError
        If ``val`` is ``True`` while ``CAN_USE_SPARSE_ARRAY`` is false.
    """
    validate_bool(val)
    if val and not CAN_USE_SPARSE_ARRAY:
        # This function raises rather than silently falling back, so the
        # message must not claim a fallback. The first sentence is matched by
        # the test-suite and is therefore kept unchanged.
        msg = (
            "scipy.sparse.cs{r,c}array is not available in current scipy version. "
            "Leave this setting disabled to read into scipy.sparse.spmatrix instead."
        )
        raise ValueError(msg)


# Register the opt-in switch for reading sparse data as scipy sparse arrays.
# It is off by default; `validate_sparse_settings` rejects non-bool values and
# rejects `True` when `CAN_USE_SPARSE_ARRAY` is false. The environment value is
# obtained through `check_and_get_bool`.
settings.register(
"shall_use_sparse_array_on_read",
default_value=False,
description="Whether or not to use `sparse_array` as the default class when reading in data",
validate=validate_sparse_settings,
get_from_env=check_and_get_bool,
)

##################################################################################
##################################################################################
22 changes: 22 additions & 0 deletions tests/test_backed_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,28 @@ def test_dataset_append_memory(
assert_equal(fromdisk, frommem)


@pytest.mark.parametrize("sparse_format", [sparse.csr_matrix, sparse.csc_matrix])
def test_read_array(
    tmp_path: Path,
    sparse_format: Callable[[ArrayLike], sparse.spmatrix],
    diskfmt: Literal["h5ad", "zarr"],
):
    """Indexing a backed sparse dataset honours ``shall_use_sparse_array_on_read``.

    With the setting enabled the result must be a sparse array (``SpArray``);
    with it disabled the result must be a ``scipy.sparse.spmatrix``.
    """
    path = tmp_path / f"test.{diskfmt.replace('ad', '')}"
    a = sparse_format(sparse.random(100, 100))
    if diskfmt == "zarr":
        f = zarr.open_group(path, "a")
    else:
        f = h5py.File(path, "a")
    ad.write_elem(f, "mtx", a)
    diskmtx = sparse_dataset(f["mtx"])
    if not CAN_USE_SPARSE_ARRAY:
        pytest.skip("scipy.sparse.cs{r,c}array not available")
    # The setting is process-global: always restore it, even when an assertion
    # fails, so that later tests do not observe a leaked value.
    previous = ad.settings.shall_use_sparse_array_on_read
    try:
        ad.settings.shall_use_sparse_array_on_read = True
        assert isinstance(diskmtx[...], SpArray)
        ad.settings.shall_use_sparse_array_on_read = False
        assert isinstance(diskmtx[...], sparse.spmatrix)
    finally:
        ad.settings.shall_use_sparse_array_on_read = previous


@pytest.mark.parametrize(
("sparse_format", "append_method"),
[
Expand Down
22 changes: 21 additions & 1 deletion tests/test_io_elementwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
write_elem,
)
from anndata._io.specs.registry import IORegistryError
from anndata.compat import ZarrGroup, _read_attr
from anndata.compat import CAN_USE_SPARSE_ARRAY, SpArray, ZarrGroup, _read_attr
from anndata.tests.helpers import (
as_cupy,
as_cupy_sparse_dask_array,
Expand All @@ -35,6 +35,7 @@
)

if TYPE_CHECKING:
from pathlib import Path
from typing import Literal, TypeVar

from anndata.compat import H5Group
Expand Down Expand Up @@ -570,3 +571,22 @@ def test_io_pd_cow(store, copy_on_write):
write_elem(store, "adata", orig)
from_store = read_elem(store["adata"])
assert_equal(orig, from_store)


def test_read_sparse_array(
    tmp_path: Path,
    sparse_format: Literal["csr", "csc"],
    diskfmt: Literal["h5ad", "zarr"],
):
    """``read_elem`` returns a sparse array when the read setting is enabled."""
    path = tmp_path / f"test.{diskfmt.replace('ad', '')}"
    a = sparse.random(100, 100, format=sparse_format)
    if diskfmt == "zarr":
        f = zarr.open_group(path, "a")
    else:
        f = h5py.File(path, "a")
    ad.write_elem(f, "mtx", a)
    if not CAN_USE_SPARSE_ARRAY:
        pytest.skip("scipy.sparse.cs{r,c}array not available")
    # The setting is process-global; the original left it enabled, which
    # changed the read behaviour of every test that ran afterwards.
    previous = ad.settings.shall_use_sparse_array_on_read
    try:
        ad.settings.shall_use_sparse_array_on_read = True
        mtx = ad.read_elem(f["mtx"])
        assert isinstance(mtx, SpArray)
    finally:
        ad.settings.shall_use_sparse_array_on_read = previous
13 changes: 13 additions & 0 deletions tests/test_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,3 +243,16 @@ class TestEnum(Enum):
)
def test_describe(as_rst: bool, expected: str, settings: SettingsManager):
assert settings.describe("test_var_3", as_rst=as_rst) == expected


def test_shall_use_sparse_array_on_read():
    """Enabling the setting works only when scipy sparse arrays are usable.

    Without sparse-array support the assignment must raise ``ValueError``;
    with support it must succeed, and the global value is restored afterwards.
    """
    import anndata as ad

    if not ad.compat.CAN_USE_SPARSE_ARRAY:
        with pytest.raises(
            ValueError,
            # `match` is a regular expression: escape the metacharacters so the
            # message is matched literally.
            match=r"scipy\.sparse\.cs\{r,c\}array is not available in current scipy version",
        ):
            ad.settings.shall_use_sparse_array_on_read = True
        return

    previous = ad.settings.shall_use_sparse_array_on_read
    try:
        ad.settings.shall_use_sparse_array_on_read = True
        assert ad.settings.shall_use_sparse_array_on_read is True
    finally:
        # Do not leak the enabled setting into other tests.
        ad.settings.shall_use_sparse_array_on_read = previous

0 comments on commit 0bc2b39

Please sign in to comment.