Skip to content

Commit

Permalink
(feat): raising errors where backed is not supported (#3048)
Browse files Browse the repository at this point in the history
* (feat): first step raising errors where `backed` is not supported

* (fix): fix `chunked` with `log1p`

* (fix): complications in `log1p` with `chunked`

* remove duplicated import

* (feat): pca check

* (feat): `ingest` check added

* (feat): add `dendrogram` test

* (feat): add `tsne` check

* (feat): `rank_genes_groups`

* (feat): `score_genes` check

* (chore): add plotting backed tests

* (chore): check type instead of `isbacked`

* (chore): release note

* (fix): string formatting error

* (fix): `worker_id` default

* (fix): try `SparseDataset`

* (fix): correct fixture creation

* Apply suggestions from code review

Co-authored-by: Philipp A. <[email protected]>

* (chore): consolidate tests

* (chore): remove erroneous plotting tests

* (chore): remove other `tempfile` import

* (fix): try moving scanpy install

* (fix): `np.Inf` -> `np.inf`

* (fix): fail if pynn is newest

* (fix): name

* (fix): `score_genes` name

* revert 88d51ab

* session-scoped backed_adata

* (fix): move umap import back to top

* (fix): try block shape

* (fix): revert chunks fix

---------

Co-authored-by: Philipp A <[email protected]>
  • Loading branch information
ilan-gold and flying-sheep authored May 21, 2024
1 parent 23c20bc commit 3ba3f46
Show file tree
Hide file tree
Showing 14 changed files with 229 additions and 10 deletions.
1 change: 1 addition & 0 deletions docs/release-notes/1.10.2.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
```

* Compatibility with `matplotlib` 3.9 {pr}`2999` {smaller}`I Virshup`
* Add clear errors where `backed` mode-like matrices (i.e., from `sparse_dataset`) are not supported {pr}`3048` {smaller}`I gold`

```{rubric} Performance
```
Expand Down
19 changes: 19 additions & 0 deletions scanpy/_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from typing import TYPE_CHECKING, overload
from weakref import WeakSet

import h5py
import numpy as np
from anndata import __version__ as anndata_version
from packaging.version import Version
Expand All @@ -33,6 +34,13 @@
from .._settings import settings
from .compute.is_constant import is_constant # noqa: F401

if Version(anndata_version) >= Version("0.10.0"):
from anndata._core.sparse_dataset import (
BaseCompressedSparseDataset as SparseDataset,
)
else:
from anndata._core.sparse_dataset import SparseDataset

if TYPE_CHECKING:
from collections.abc import Mapping
from pathlib import Path
Expand Down Expand Up @@ -1084,3 +1092,14 @@ def _resolve_axis(
if axis in {1, "var"}:
return (1, "var")
raise ValueError(f"`axis` must be either 0, 1, 'obs', or 'var', was {axis!r}")


def is_backed_type(X: object) -> bool:
return isinstance(X, (SparseDataset, h5py.File, h5py.Dataset))


def raise_not_implemented_error_if_backed_type(X: object, method_name: str) -> None:
if is_backed_type(X):
raise NotImplementedError(
f"{method_name} is not implemented for matrices of type {type(X)}"
)
11 changes: 9 additions & 2 deletions scanpy/preprocessing/_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from .. import logging as logg
from .._compat import DaskArray, pkg_version
from .._settings import settings
from .._utils import _doc_params, _empty
from .._utils import _doc_params, _empty, is_backed_type
from ..get import _check_mask, _get_obs_rep
from ._docs import doc_mask_var_hvg
from ._utils import _get_mean_var
Expand Down Expand Up @@ -173,6 +173,10 @@ def pca(
)
data_is_AnnData = isinstance(data, AnnData)
if data_is_AnnData:
if layer is None and not chunked and is_backed_type(data.X):
raise NotImplementedError(
f"PCA is not implemented for matrices of type {type(data.X)} with chunked as False"
)
adata = data.copy() if copy else data
else:
if pkg_version("anndata") < Version("0.8.0rc1"):
Expand All @@ -195,7 +199,10 @@ def pca(
logg.info(f" with n_comps={n_comps}")

X = _get_obs_rep(adata_comp, layer=layer)

if is_backed_type(X) and layer is not None:
raise NotImplementedError(
f"PCA is not implemented for matrices of type {type(X)} from layers"
)
# See: https://github.com/scverse/scanpy/pull/2816#issuecomment-1932650529
if (
Version(ad.__version__) < Version("0.9")
Expand Down
2 changes: 2 additions & 0 deletions scanpy/preprocessing/_scale.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .._utils import (
_check_array_function_arguments,
axis_mul_or_truediv,
raise_not_implemented_error_if_backed_type,
renamed_arg,
view_to_actual,
)
Expand Down Expand Up @@ -298,6 +299,7 @@ def scale_anndata(
mask_obs = _check_mask(adata, mask_obs, "obs")
view_to_actual(adata)
X = _get_obs_rep(adata, layer=layer, obsm=obsm)
raise_not_implemented_error_if_backed_type(X, "scale")
X, adata.var[str_mean_std[0]], adata.var[str_mean_std[1]] = scale(
X,
zero_center=zero_center,
Expand Down
15 changes: 15 additions & 0 deletions scanpy/preprocessing/_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
from .._utils import (
_check_array_function_arguments,
axis_sum,
is_backed_type,
raise_not_implemented_error_if_backed_type,
renamed_arg,
sanitize_anndata,
view_to_actual,
Expand Down Expand Up @@ -145,6 +147,7 @@ def filter_cells(
"`min_genes`, `max_counts`, `max_genes` per call."
)
if isinstance(data, AnnData):
raise_not_implemented_error_if_backed_type(data.X, "filter_cells")
adata = data.copy() if copy else data
cell_subset, number = materialize_as_ndarray(
filter_cells(
Expand Down Expand Up @@ -260,6 +263,7 @@ def filter_genes(
)

if isinstance(data, AnnData):
raise_not_implemented_error_if_backed_type(data.X, "filter_genes")
adata = data.copy() if copy else data
gene_subset, number = materialize_as_ndarray(
filter_genes(
Expand Down Expand Up @@ -405,10 +409,19 @@ def log1p_anndata(
raise NotImplementedError(
"Currently cannot perform chunked operations on arrays not stored in X."
)
if adata.isbacked and adata.file._filemode != "r+":
raise NotImplementedError(
"log1p is not implemented for backed AnnData with backed mode not r+"
)
for chunk, start, end in adata.chunked_X(chunk_size):
adata.X[start:end] = log1p(chunk, base=base, copy=False)
else:
X = _get_obs_rep(adata, layer=layer, obsm=obsm)
if is_backed_type(X):
msg = f"log1p is not implemented for matrices of type {type(X)}"
if layer is not None:
raise NotImplementedError(f"{msg} from layers")
raise NotImplementedError(f"{msg} without `chunked=True`")
X = log1p(X, copy=False, base=base)
_set_obs_rep(adata, X, layer=layer, obsm=obsm)

Expand Down Expand Up @@ -647,6 +660,7 @@ def regress_out(
keys = [keys]

X = _get_obs_rep(adata, layer=layer)
raise_not_implemented_error_if_backed_type(X, "regress_out")

if issparse(X):
logg.info(" sparse input is densified and may " "lead to high memory use")
Expand Down Expand Up @@ -855,6 +869,7 @@ def downsample_counts(
`adata.X` : :class:`numpy.ndarray` | :class:`scipy.sparse.spmatrix` (dtype `float`)
Downsampled counts matrix.
"""
raise_not_implemented_error_if_backed_type(adata.X, "downsample_counts")
# This logic is all dispatch
total_counts_call = total_counts is not None
counts_per_cell_call = counts_per_cell is not None
Expand Down
98 changes: 98 additions & 0 deletions scanpy/tests/test_backed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from __future__ import annotations

from functools import partial

import pytest
from anndata import read_h5ad

import scanpy as sc


@pytest.mark.parametrize(
("name", "func", "msg"),
[
pytest.param("PCA", sc.pp.pca, " with chunked as False", id="pca"),
pytest.param(
"PCA", partial(sc.pp.pca, layer="X_copy"), " from layers", id="pca_layer"
),
pytest.param(
"regress_out",
partial(sc.pp.regress_out, keys=["n_counts", "percent_mito"]),
"",
id="regress_out",
),
pytest.param(
"dendrogram", partial(sc.tl.dendrogram, groupby="cat"), "", id="dendrogram"
),
pytest.param("tsne", sc.tl.tsne, "", id="tsne"),
pytest.param("scale", sc.pp.scale, "", id="scale"),
pytest.param(
"downsample_counts",
partial(sc.pp.downsample_counts, counts_per_cell=1000),
"",
id="downsample_counts",
),
pytest.param(
"filter_genes",
partial(sc.pp.filter_genes, max_cells=1000),
"",
id="filter_genes",
),
pytest.param(
"filter_cells",
partial(sc.pp.filter_cells, max_genes=1000),
"",
id="filter_cells",
),
pytest.param(
"rank_genes_groups",
partial(sc.tl.rank_genes_groups, groupby="cat"),
"",
id="rank_genes_groups",
),
pytest.param(
"score_genes",
partial(sc.tl.score_genes, gene_list=map(str, range(100))),
"",
id="score_genes",
),
],
)
def test_backed_error(backed_adata, name, func, msg):
with pytest.raises(
NotImplementedError,
match=f"{name} is not implemented for matrices of type {type(backed_adata.X)}{msg}",
):
func(backed_adata)


def test_log1p_backed_errors(backed_adata):
with pytest.raises(
NotImplementedError,
match="log1p is not implemented for backed AnnData with backed mode not r+",
):
sc.pp.log1p(backed_adata, chunked=True)
backed_adata.file.close()
backed_adata = read_h5ad(backed_adata.filename, backed="r+")
with pytest.raises(
NotImplementedError,
match=f"log1p is not implemented for matrices of type {type(backed_adata.X)} without `chunked=True`",
):
sc.pp.log1p(backed_adata)
backed_adata.layers["X_copy"] = backed_adata.X
layer_type = type(backed_adata.layers["X_copy"])
with pytest.raises(
NotImplementedError,
match=f"log1p is not implemented for matrices of type {layer_type} from layers",
):
sc.pp.log1p(backed_adata, layer="X_copy")
backed_adata.file.close()


def test_scatter_backed(backed_adata):
sc.pp.pca(backed_adata, chunked=True)
sc.pl.scatter(backed_adata, color="0", basis="pca")


def test_dotplot_backed(backed_adata):
sc.pl.dotplot(backed_adata, ["0", "1", "2", "3"], groupby="cat")
17 changes: 17 additions & 0 deletions scanpy/tests/test_ingest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import anndata
import numpy as np
import pytest
from sklearn.neighbors import KDTree
Expand Down Expand Up @@ -153,3 +154,19 @@ def test_ingest_map_embedding_umap():
umap_transformed_t = reducer.transform(T)

assert np.allclose(ing._obsm["X_umap"], umap_transformed_t)


def test_ingest_backed(adatas, tmp_path):
adata_ref = adatas[0].copy()
adata_new = adatas[1].copy()

adata_new.write_h5ad(f"{tmp_path}/new.h5ad")

adata_new = anndata.read_h5ad(f"{tmp_path}/new.h5ad", backed="r")

ing = sc.tl.Ingest(adata_ref)
with pytest.raises(
NotImplementedError,
match=f"Ingest.fit is not implemented for matrices of type {type(adata_new.X)}",
):
ing.fit(adata_new)
4 changes: 3 additions & 1 deletion scanpy/tools/_dendrogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from .. import logging as logg
from .._compat import old_positionals
from .._utils import _doc_params
from .._utils import _doc_params, raise_not_implemented_error_if_backed_type
from ..neighbors._doc import doc_n_pcs, doc_use_rep
from ._utils import _choose_representation

Expand Down Expand Up @@ -117,6 +117,8 @@ def dendrogram(
>>> markers = ['C1QA', 'PSAP', 'CD79A', 'CD79B', 'CST3', 'LYZ']
>>> sc.pl.dotplot(adata, markers, groupby='bulk_labels', dendrogram=True)
"""

raise_not_implemented_error_if_backed_type(adata.X, "dendrogram")
if isinstance(groupby, str):
# if not a list, turn into a list
groupby = [groupby]
Expand Down
3 changes: 2 additions & 1 deletion scanpy/tools/_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from .. import logging as logg
from .._compat import old_positionals, pkg_version
from .._settings import settings
from .._utils import NeighborsView
from .._utils import NeighborsView, raise_not_implemented_error_if_backed_type
from .._utils._doctests import doctest_skip
from ..neighbors import FlatTree

Expand Down Expand Up @@ -392,6 +392,7 @@ def fit(self, adata_new):
`adata` refers to the :class:`~anndata.AnnData` object
that is passed during the initialization of an Ingest instance.
"""
raise_not_implemented_error_if_backed_type(adata_new.X, "Ingest.fit")
ref_var_names = self._adata_ref.var_names.str.upper()
new_var_names = adata_new.var_names.str.upper()

Expand Down
7 changes: 5 additions & 2 deletions scanpy/tools/_rank_genes_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
from .. import _utils
from .. import logging as logg
from .._compat import old_positionals
from .._utils import check_nonnegative_integers
from .._utils import (
check_nonnegative_integers,
raise_not_implemented_error_if_backed_type,
)
from ..get import _check_mask
from ..preprocessing._utils import _get_mean_var

Expand Down Expand Up @@ -134,6 +137,7 @@ def __init__(
if use_raw and adata.raw is not None:
adata_comp = adata.raw
X = adata_comp.X
raise_not_implemented_error_if_backed_type(X, "rank_genes_groups")

# for correct getnnz calculation
if issparse(X):
Expand Down Expand Up @@ -594,7 +598,6 @@ def rank_genes_groups(
>>> # to visualize the results
>>> sc.pl.rank_genes_groups(adata)
"""

if mask_var is not None:
mask_var = _check_mask(adata, mask_var, "var")

Expand Down
6 changes: 5 additions & 1 deletion scanpy/tools/_score_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd
from scipy.sparse import issparse

from scanpy._utils import _check_use_raw
from scanpy._utils import _check_use_raw, is_backed_type

from .. import logging as logg
from .._compat import old_positionals
Expand Down Expand Up @@ -110,6 +110,10 @@ def score_genes(
start = logg.info(f"computing score {score_name!r}")
adata = adata.copy() if copy else adata
use_raw = _check_use_raw(adata, use_raw)
if is_backed_type(adata.X) and not use_raw:
raise NotImplementedError(
f"score_genes is not implemented for matrices of type {type(adata.X)}"
)

if random_state is not None:
np.random.seed(random_state)
Expand Down
3 changes: 2 additions & 1 deletion scanpy/tools/_tsne.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from .. import logging as logg
from .._compat import old_positionals
from .._settings import settings
from .._utils import _doc_params
from .._utils import _doc_params, raise_not_implemented_error_if_backed_type
from ..neighbors._doc import doc_n_pcs, doc_use_rep
from ._utils import _choose_representation

Expand Down Expand Up @@ -106,6 +106,7 @@ def tsne(
start = logg.info("computing tSNE")
adata = adata.copy() if copy else adata
X = _choose_representation(adata, use_rep=use_rep, n_pcs=n_pcs)
raise_not_implemented_error_if_backed_type(X, "tsne")
# params for sklearn
n_jobs = settings.n_jobs if n_jobs is None else n_jobs
params_sklearn = dict(
Expand Down
2 changes: 2 additions & 0 deletions src/testing/scanpy/_pytest/fixtures/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from .data import (
_pbmc3ks_parametrized_session,
backed_adata,
pbmc3k_parametrized,
pbmc3k_parametrized_small,
)
Expand All @@ -27,6 +28,7 @@
"_pbmc3ks_parametrized_session",
"pbmc3k_parametrized",
"pbmc3k_parametrized_small",
"backed_adata",
]


Expand Down
Loading

0 comments on commit 3ba3f46

Please sign in to comment.