From 53537b5219ff82cbdee96b7733172fb114e80ca8 Mon Sep 17 00:00:00 2001
From: Ilan Gold <ilanbassgold@gmail.com>
Date: Fri, 30 Aug 2024 16:21:07 +0200
Subject: [PATCH] (fix): clarify public backed sparse docstring/api (#1608)

* (chore): export `read_elem` and `write_elem` from the main package

* (chore): pr number

* (fix): agnostic way of importing

* (fix): add `RWAble` to `api.md`

* (fix): `md` file import

* (fix): clarify public backed sparse docstring/api

* (chore): small fixes

* (fix): `format` + `to_memory`

* (chore): remove deprecation tests + `SparseDataset`

* (chore): clean up private/public api

* (fix): `test_append_overflow_check` used `indptr`

* (fix): export `InMemoryElem`

* (chore): release note

* (chore): move `InMemoryElem` to the "extras" section

* Update src/anndata/_core/sparse_dataset.py

* (fix): remove dead tests

---------

Co-authored-by: Philipp A. <flying-sheep@web.de>
---
 src/anndata/_core/sparse_dataset.py | 96 ++++++++++++-----------------
 tests/test_backed_sparse.py         |  2 +-
 tests/test_deprecations.py          | 34 ----------
 3 files changed, 41 insertions(+), 91 deletions(-)

diff --git a/src/anndata/_core/sparse_dataset.py b/src/anndata/_core/sparse_dataset.py
index b57241ed7..091ad45f0 100644
--- a/src/anndata/_core/sparse_dataset.py
+++ b/src/anndata/_core/sparse_dataset.py
@@ -44,6 +44,8 @@
     from collections.abc import Sequence
     from typing import Literal
 
+    from scipy.sparse import spmatrix
+
     from .._types import GroupStorageType
     from .index import Index
 
@@ -353,9 +355,8 @@ def is_sparse_indexing_overridden(format: Literal["csr", "csc"], row, col):
 
 
 class BaseCompressedSparseDataset(ABC):
-    """Analogous to :class:`h5py.Dataset <h5py:Dataset>` or `zarr.Array`, but for sparse matrices."""
-
     format: Literal["csr", "csc"]
+    """The format of the sparse matrix."""
     _group: GroupStorageType
 
     def __init__(self, group: GroupStorageType):
@@ -378,6 +379,7 @@ def group(self, val):
 
     @property
     def backend(self) -> Literal["zarr", "hdf5"]:
+        """Which file type is used on-disk."""
         if isinstance(self.group, ZarrGroup):
             return "zarr"
         elif isinstance(self.group, H5Group):
@@ -387,6 +389,7 @@ def backend(self) -> Literal["zarr", "hdf5"]:
 
     @property
     def dtype(self) -> np.dtype:
+        """The :class:`numpy.dtype` of the `data` attribute of the sparse matrix."""
         return self.group["data"].dtype
 
     @classmethod
@@ -395,37 +398,19 @@ def _check_group_format(cls, group):
         assert group_format == cls.format
 
     @property
-    def format_str(self) -> Literal["csr", "csc"]:
-        """DEPRECATED Use .format instead."""
-        warnings.warn(
-            "The attribute .format_str is deprecated and will be removed in the anndata 0.11.0. "
-            "Please use .format instead.",
-            FutureWarning,
-        )
-        return self.format
-
-    @property
-    def name(self) -> str:
+    def _name(self) -> str:
+        """Name of the group."""
         return self.group.name
 
     @property
     def shape(self) -> tuple[int, int]:
+        """Shape of the matrix read off disk."""
         shape = _read_attr(self.group.attrs, "shape", None)
         if shape is None:
             # TODO warn
             shape = self.group.attrs.get("h5sparse_shape")
         return tuple(map(int, shape))
 
-    @property
-    def value(self) -> ss.csr_matrix | ss.csc_matrix:
-        """DEPRECATED Use .to_memory() instead."""
-        warnings.warn(
-            "The .value attribute is deprecated and will be removed in the anndata 0.11.0. "
-            "Please use .to_memory() instead.",
-            FutureWarning,
-        )
-        return self.to_memory()
-
     def __repr__(self) -> str:
         return f"{type(self).__name__}: backend {self.backend}, shape {self.shape}, data_dtype {self.dtype}"
 
@@ -483,7 +468,25 @@ def __setitem__(self, index: Index | tuple[()], value) -> None:
         mock_matrix[row, col] = value
 
     # TODO: split to other classes?
-    def append(self, sparse_matrix: _cs_matrix | SpArray) -> None:
+    def append(self, sparse_matrix: spmatrix | SpArray) -> None:
+        """Append an in-memory or on-disk sparse matrix to the current object's store.
+
+        Parameters
+        ----------
+        sparse_matrix
+            The matrix to append.
+
+        Raises
+        ------
+        NotImplementedError
+            If the matrix to append is not one of :class:`~scipy.sparse.csr_array`, :class:`~scipy.sparse.csc_array`, :class:`~scipy.sparse.csr_matrix`, or :class:`~scipy.sparse.csc_matrix`.
+        ValueError
+            If both the on-disk and to-append matrices are not of the same format i.e., `csr` or `csc`.
+        OverflowError
+            If the underlying data store has a 32 bit indptr, and the new matrix is too large to fit in it i.e., would cause a 64 bit `indptr` to be written.
+        AssertionError
+            If the on-disk data does not have `csc` or `csr` format.
+        """
         # Prep variables
         shape = self.shape
         if isinstance(sparse_matrix, BaseCompressedSparseDataset):
@@ -546,7 +549,7 @@ def append(self, sparse_matrix: _cs_matrix | SpArray) -> None:
         )
         # Clear cached property
         if hasattr(self, "indptr"):
-            del self.indptr
+            del self._indptr
 
         # indices
         indices = self.group["indices"]
@@ -555,7 +558,7 @@ def append(self, sparse_matrix: _cs_matrix | SpArray) -> None:
         indices[orig_data_size:] = sparse_matrix.indices
 
     @cached_property
-    def indptr(self) -> np.ndarray:
+    def _indptr(self) -> np.ndarray:
         """\
         Other than `data` and `indices`, this is only as long as the major axis
 
@@ -569,21 +572,29 @@ def _to_backed(self) -> BackedSparseMatrix:
         mtx = format_class(self.shape, dtype=self.dtype)
         mtx.data = self.group["data"]
         mtx.indices = self.group["indices"]
-        mtx.indptr = self.indptr
+        mtx.indptr = self._indptr
         return mtx
 
-    def to_memory(self) -> ss.csr_matrix | ss.csc_matrix:
+    def to_memory(self) -> spmatrix | SpArray:
+        """Returns an in-memory representation of the sparse matrix.
+
+        Returns
+        -------
+        The in-memory representation of the sparse matrix.
+        """
         format_class = get_memory_class(self.format)
         mtx = format_class(self.shape, dtype=self.dtype)
         mtx.data = self.group["data"][...]
         mtx.indices = self.group["indices"][...]
-        mtx.indptr = self.indptr
+        mtx.indptr = self._indptr
         return mtx
 
 
 _sparse_dataset_doc = """\
     On disk {format} sparse matrix.
 
+    Analogous to :class:`h5py.Dataset` or :class:`zarr.core.Array`, but for sparse matrices.
+
     Parameters
     ----------
     group
@@ -662,30 +673,3 @@ def sparse_dataset(group: GroupStorageType) -> CSRDataset | CSCDataset:
 @_subset.register(BaseCompressedSparseDataset)
 def subset_sparsedataset(d, subset_idx):
     return d[subset_idx]
-
-
-## Backwards compat
-
-_sparsedataset_depr_msg = """\
-SparseDataset is deprecated and will be removed in late 2024. It has been replaced by the public classes CSRDataset and CSCDataset.
-
-For instance checks, use `isinstance(X, (anndata.CSRDataset, anndata.CSCDataset))` instead.
-
-For creation, use `anndata.experimental.sparse_dataset(X)` instead.
-"""
-
-
-class SparseDataset(ABC):
-    """DEPRECATED.
-
-    Use CSRDataset, CSCDataset, and sparse_dataset from anndata.experimental instead.
-    """
-
-    def __new__(cls, group):
-        warnings.warn(FutureWarning(_sparsedataset_depr_msg), stacklevel=2)
-        return sparse_dataset(group)
-
-    @classmethod
-    def __subclasshook__(cls, C):
-        warnings.warn(FutureWarning(_sparsedataset_depr_msg), stacklevel=3)
-        return issubclass(C, (CSRDataset, CSCDataset))
diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py
index 2349e2fa0..f2ef62387 100644
--- a/tests/test_backed_sparse.py
+++ b/tests/test_backed_sparse.py
@@ -549,7 +549,7 @@ def test_append_overflow_check(group_fn, sparse_class, tmpdir):
     backed = sparse_dataset(group["mtx"])
 
     # Checking for correct caching behaviour
-    backed.indptr
+    backed._indptr
 
     with pytest.raises(
         OverflowError,
diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py
index f1f857def..61a2d804b 100644
--- a/tests/test_deprecations.py
+++ b/tests/test_deprecations.py
@@ -11,7 +11,6 @@
 import h5py
 import numpy as np
 import pytest
-import zarr
 from scipy import sparse
 
 import anndata as ad
@@ -129,39 +128,6 @@ def test_deprecated_read(tmp_path):
     assert_equal(memory, from_disk)
 
 
-def test_deprecated_sparse_dataset_values():
-    import zarr
-
-    from anndata.experimental import sparse_dataset
-
-    mtx = sparse.random(50, 50, format="csr")
-    g = zarr.group()
-
-    ad.write_elem(g, "mtx", mtx)
-    mtx_backed = sparse_dataset(g["mtx"])
-
-    with pytest.warns(FutureWarning, match=r"Please use .to_memory()"):
-        mtx_backed.value
-
-    with pytest.warns(FutureWarning, match=r"Please use .format"):
-        mtx_backed.format_str
-
-
-def test_deprecated_sparse_dataset():
-    from anndata._core.sparse_dataset import SparseDataset
-
-    mem_X = sparse.random(50, 50, format="csr")
-    g = zarr.group()
-    ad.write_elem(g, "X", mem_X)
-    with pytest.warns(FutureWarning, match=r"SparseDataset is deprecated"):
-        X = SparseDataset(g["X"])
-
-    assert isinstance(X, ad.CSRDataset)
-
-    with pytest.warns(FutureWarning, match=r"SparseDataset is deprecated"):
-        assert isinstance(X, SparseDataset)
-
-
 @pytest.mark.parametrize("name", anndata.experimental._DEPRECATED)
 def test_warn_on_import_from_experimental(name: str):
     with pytest.warns(FutureWarning, match=rf"Importing {name}"):