Skip to content

Commit

Permalink
(fix): clarify public backed sparse docstring/api (#1608)
Browse files Browse the repository at this point in the history
* (chore): export `read_elem` and `write_elem` from the main package

* (chore): pr number

* (fix): agnostic way of importing

* (fix): add `RWAble` to `api.md`

* (fix): `md` file import

* (fix): clarify public backed sparse docstring/api

* (chore): small fixes

* (fix): `format` + `to_memory`

* (chore): remove deprecation tests + `SparseDataset`

* (chore): clean up private/public api

* (fix): `test_append_overflow_check` used `indptr`

* (fix): export `InMemoryElem`

* (chore): release note

* (chore): move `InMemoryElem` to the "extras" section

* Update src/anndata/_core/sparse_dataset.py

* (fix): remove dead tests

---------

Co-authored-by: Philipp A. <[email protected]>
  • Loading branch information
ilan-gold and flying-sheep authored Aug 30, 2024
1 parent 94b2304 commit 53537b5
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 91 deletions.
96 changes: 40 additions & 56 deletions src/anndata/_core/sparse_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
from collections.abc import Sequence
from typing import Literal

from scipy.sparse import spmatrix

from .._types import GroupStorageType
from .index import Index

Expand Down Expand Up @@ -353,9 +355,8 @@ def is_sparse_indexing_overridden(format: Literal["csr", "csc"], row, col):


class BaseCompressedSparseDataset(ABC):
"""Analogous to :class:`h5py.Dataset <h5py:Dataset>` or `zarr.Array`, but for sparse matrices."""

format: Literal["csr", "csc"]
"""The format of the sparse matrix."""
_group: GroupStorageType

def __init__(self, group: GroupStorageType):
Expand All @@ -378,6 +379,7 @@ def group(self, val):

@property
def backend(self) -> Literal["zarr", "hdf5"]:
"""Which file type is used on-disk."""
if isinstance(self.group, ZarrGroup):
return "zarr"
elif isinstance(self.group, H5Group):
Expand All @@ -387,6 +389,7 @@ def backend(self) -> Literal["zarr", "hdf5"]:

@property
def dtype(self) -> np.dtype:
"""The :class:`numpy.dtype` of the `data` attribute of the sparse matrix."""
return self.group["data"].dtype

@classmethod
Expand All @@ -395,37 +398,19 @@ def _check_group_format(cls, group):
assert group_format == cls.format

@property
def format_str(self) -> Literal["csr", "csc"]:
"""DEPRECATED Use .format instead."""
warnings.warn(
"The attribute .format_str is deprecated and will be removed in the anndata 0.11.0. "
"Please use .format instead.",
FutureWarning,
)
return self.format

@property
def name(self) -> str:
def _name(self) -> str:
"""Name of the group."""
return self.group.name

@property
def shape(self) -> tuple[int, int]:
"""Shape of the matrix read off disk."""
shape = _read_attr(self.group.attrs, "shape", None)
if shape is None:
# TODO warn
shape = self.group.attrs.get("h5sparse_shape")
return tuple(map(int, shape))

@property
def value(self) -> ss.csr_matrix | ss.csc_matrix:
"""DEPRECATED Use .to_memory() instead."""
warnings.warn(
"The .value attribute is deprecated and will be removed in the anndata 0.11.0. "
"Please use .to_memory() instead.",
FutureWarning,
)
return self.to_memory()

def __repr__(self) -> str:
return f"{type(self).__name__}: backend {self.backend}, shape {self.shape}, data_dtype {self.dtype}"

Expand Down Expand Up @@ -483,7 +468,25 @@ def __setitem__(self, index: Index | tuple[()], value) -> None:
mock_matrix[row, col] = value

# TODO: split to other classes?
def append(self, sparse_matrix: _cs_matrix | SpArray) -> None:
def append(self, sparse_matrix: spmatrix | SpArray) -> None:
"""Append an in-memory or on-disk sparse matrix to the current object's store.
Parameters
----------
sparse_matrix
The matrix to append.
Raises
------
NotImplementedError
If the matrix to append is not one of :class:`~scipy.sparse.csr_array`, :class:`~scipy.sparse.csc_array`, :class:`~scipy.sparse.csr_matrix`, or :class:`~scipy.sparse.csc_matrix`.
ValueError
If both the on-disk and to-append matrices are not of the same format i.e., `csr` or `csc`.
OverflowError
If the underlying data store has a 32 bit indptr, and the new matrix is too large to fit in it i.e., would cause a 64 bit `indptr` to be written.
AssertionError
If the on-disk data does not have `csc` or `csr` format.
"""
# Prep variables
shape = self.shape
if isinstance(sparse_matrix, BaseCompressedSparseDataset):
Expand Down Expand Up @@ -546,7 +549,7 @@ def append(self, sparse_matrix: _cs_matrix | SpArray) -> None:
)
# Clear cached property
if hasattr(self, "indptr"):
del self.indptr
del self._indptr

# indices
indices = self.group["indices"]
Expand All @@ -555,7 +558,7 @@ def append(self, sparse_matrix: _cs_matrix | SpArray) -> None:
indices[orig_data_size:] = sparse_matrix.indices

@cached_property
def indptr(self) -> np.ndarray:
def _indptr(self) -> np.ndarray:
"""\
Other than `data` and `indices`, this is only as long as the major axis
Expand All @@ -569,21 +572,29 @@ def _to_backed(self) -> BackedSparseMatrix:
mtx = format_class(self.shape, dtype=self.dtype)
mtx.data = self.group["data"]
mtx.indices = self.group["indices"]
mtx.indptr = self.indptr
mtx.indptr = self._indptr
return mtx

def to_memory(self) -> ss.csr_matrix | ss.csc_matrix:
def to_memory(self) -> spmatrix | SpArray:
"""Returns an in-memory representation of the sparse matrix.
Returns
-------
The in-memory representation of the sparse matrix.
"""
format_class = get_memory_class(self.format)
mtx = format_class(self.shape, dtype=self.dtype)
mtx.data = self.group["data"][...]
mtx.indices = self.group["indices"][...]
mtx.indptr = self.indptr
mtx.indptr = self._indptr
return mtx


_sparse_dataset_doc = """\
On disk {format} sparse matrix.
Analogous to :class:`h5py.Dataset` or :class:`zarr.core.Array`, but for sparse matrices.
Parameters
----------
group
Expand Down Expand Up @@ -662,30 +673,3 @@ def sparse_dataset(group: GroupStorageType) -> CSRDataset | CSCDataset:
@_subset.register(BaseCompressedSparseDataset)
def subset_sparsedataset(d, subset_idx):
return d[subset_idx]


## Backwards compat

_sparsedataset_depr_msg = """\
SparseDataset is deprecated and will be removed in late 2024. It has been replaced by the public classes CSRDataset and CSCDataset.
For instance checks, use `isinstance(X, (anndata.CSRDataset, anndata.CSCDataset))` instead.
For creation, use `anndata.experimental.sparse_dataset(X)` instead.
"""


class SparseDataset(ABC):
"""DEPRECATED.
Use CSRDataset, CSCDataset, and sparse_dataset from anndata.experimental instead.
"""

def __new__(cls, group):
warnings.warn(FutureWarning(_sparsedataset_depr_msg), stacklevel=2)
return sparse_dataset(group)

@classmethod
def __subclasshook__(cls, C):
warnings.warn(FutureWarning(_sparsedataset_depr_msg), stacklevel=3)
return issubclass(C, (CSRDataset, CSCDataset))
2 changes: 1 addition & 1 deletion tests/test_backed_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,7 @@ def test_append_overflow_check(group_fn, sparse_class, tmpdir):
backed = sparse_dataset(group["mtx"])

# Checking for correct caching behaviour
backed.indptr
backed._indptr

with pytest.raises(
OverflowError,
Expand Down
34 changes: 0 additions & 34 deletions tests/test_deprecations.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import h5py
import numpy as np
import pytest
import zarr
from scipy import sparse

import anndata as ad
Expand Down Expand Up @@ -129,39 +128,6 @@ def test_deprecated_read(tmp_path):
assert_equal(memory, from_disk)


def test_deprecated_sparse_dataset_values():
import zarr

from anndata.experimental import sparse_dataset

mtx = sparse.random(50, 50, format="csr")
g = zarr.group()

ad.write_elem(g, "mtx", mtx)
mtx_backed = sparse_dataset(g["mtx"])

with pytest.warns(FutureWarning, match=r"Please use .to_memory()"):
mtx_backed.value

with pytest.warns(FutureWarning, match=r"Please use .format"):
mtx_backed.format_str


def test_deprecated_sparse_dataset():
from anndata._core.sparse_dataset import SparseDataset

mem_X = sparse.random(50, 50, format="csr")
g = zarr.group()
ad.write_elem(g, "X", mem_X)
with pytest.warns(FutureWarning, match=r"SparseDataset is deprecated"):
X = SparseDataset(g["X"])

assert isinstance(X, ad.CSRDataset)

with pytest.warns(FutureWarning, match=r"SparseDataset is deprecated"):
assert isinstance(X, SparseDataset)


@pytest.mark.parametrize("name", anndata.experimental._DEPRECATED)
def test_warn_on_import_from_experimental(name: str):
with pytest.warns(FutureWarning, match=rf"Importing {name}"):
Expand Down

0 comments on commit 53537b5

Please sign in to comment.