From 3121e04ce6164ada72c25f1db88d54269ee26ede Mon Sep 17 00:00:00 2001 From: kaushalprasadhial Date: Tue, 1 Oct 2024 09:48:00 +0000 Subject: [PATCH 1/6] Apply suggestions from code review --- docs/release-notes/1.10.3.md | 24 ++++++------- src/scanpy/preprocessing/_utils.py | 57 +++++++++++++++++++++++++----- 2 files changed, 60 insertions(+), 21 deletions(-) diff --git a/docs/release-notes/1.10.3.md b/docs/release-notes/1.10.3.md index f2f06ca94b..9afb3a0be5 100644 --- a/docs/release-notes/1.10.3.md +++ b/docs/release-notes/1.10.3.md @@ -1,16 +1,14 @@ -(v1.10.3)= -### 1.10.3 {small}`2024-09-17` +### 1.10.3 {small}`the future` -#### Bug fixes +```{rubric} Development features +``` -- Prevent empty control gene set in {func}`~scanpy.tl.score_genes` {smaller}`M Müller` ({pr}`2875`) -- Fix `subset=True` of {func}`~scanpy.pp.highly_variable_genes` when `flavor` is `seurat` or `cell_ranger`, and `batch_key!=None` {smaller}`E Roellin` ({pr}`3042`) -- Add compatibility with {mod}`numpy` 2.0 {smaller}`P Angerer` {pr}`3065` and ({pr}`3115`) -- Fix `legend_loc` argument in {func}`scanpy.pl.embedding` not accepting matplotlib parameters {smaller}`P Angerer` ({pr}`3163`) -- Fix dispersion cutoff in {func}`~scanpy.pp.highly_variable_genes` in presence of `NaN`s {smaller}`P Angerer` ({pr}`3176`) -- Fix axis labeling for swapped axes in {func}`~scanpy.pl.rank_genes_groups_stacked_violin` {smaller}`Ilan Gold` ({pr}`3196`) -- Upper bound dask on account of {issue}`scverse/anndata#1579` {smaller}`Ilan Gold` ({pr}`3217`) -- The [fa2-modified][] package replaces [forceatlas2][] for the latter’s lack of maintenance {smaller}`A Alam` ({pr}`3220`) +```{rubric} Docs +``` - [fa2-modified]: https://github.com/AminAlam/fa2_modified - [forceatlas2]: https://github.com/bhargavchippada/forceatlas2 +```{rubric} Bug fixes +``` + +```{rubric} Performance +``` +* Speed up _get_mean_var used in {func}`~scanpy.pp.scale` {pr}`3099` {smaller}`P Ashish & S Dicks` \ No newline at end of file diff --git a/src/scanpy/preprocessing/_utils.py b/src/scanpy/preprocessing/_utils.py index 64adb036d9..add30c92dd 100644 --- a/src/scanpy/preprocessing/_utils.py +++ b/src/scanpy/preprocessing/_utils.py @@ -33,14 +33,55 @@ def _(X: np.ndarray, *, axis: Literal[0, 1], dtype: DTypeLike) -> np.ndarray: def _get_mean_var( X: _SupportedArray, *, axis: Literal[0, 1] = 0 ) -> tuple[NDArray[np.float64], NDArray[np.float64]]: - if isinstance(X, sparse.spmatrix): - mean, var = sparse_mean_variance_axis(X, axis=axis) + if isinstance(X, np.ndarray): + n_threads = numba.get_num_threads() + mean, var = _compute_mean_var(X, axis=axis, n_threads=n_threads) else: - mean = axis_mean(X, axis=axis, dtype=np.float64) - mean_sq = axis_mean(elem_mul(X, X), axis=axis, dtype=np.float64) - var = mean_sq - mean**2 - # enforce R convention (unbiased estimator) for variance - var *= X.shape[axis] / (X.shape[axis] - 1) + if isinstance(X, sparse.spmatrix): + mean, var = sparse_mean_variance_axis(X, axis=axis) + else: + mean = axis_mean(X, axis=axis, dtype=np.float64) + mean_sq = axis_mean(elem_mul(X, X), axis=axis, dtype=np.float64) + var = mean_sq - mean**2 + # enforce R convention (unbiased estimator) for variance + var *= X.shape[axis] / (X.shape[axis] - 1) + return mean, var + + +@numba.njit(cache=True, parallel=True) +def _compute_mean_var( + X: _SupportedArray, axis: Literal[0, 1] = 0, n_threads=1 +) -> tuple[NDArray[np.float64], NDArray[np.float64]]: + if axis == 0: + axis_i = 1 + sums = np.zeros((n_threads, X.shape[axis_i]), dtype=np.float64) + sums_squared = np.zeros((n_threads, X.shape[axis_i]), dtype=np.float64) + mean = np.zeros(X.shape[axis_i], dtype=np.float64) + var = np.zeros(X.shape[axis_i], dtype=np.float64) + n = X.shape[axis] + for i in numba.prange(n_threads): + for r in range(i, n, n_threads): + for c in range(X.shape[axis_i]): + value = X[r, c] + sums[i, c] += value + sums_squared[i, c] += value * value + for c in numba.prange(X.shape[axis_i]): + sum_ = sums[:, c].sum() + mean[c] = sum_ / n + var[c] = (sums_squared[:, c].sum() - sum_ * sum_ / n) / (n - 1) + else: + axis_i = 0 + mean = np.zeros(X.shape[axis_i], dtype=np.float64) + var = np.zeros(X.shape[axis_i], dtype=np.float64) + for r in numba.prange(X.shape[0]): + for c in range(X.shape[1]): + value = X[r, c] + mean[r] += value + var[r] += value * value + for c in numba.prange(X.shape[0]): + mean[c] = mean[c] / X.shape[1] + var[c] = (var[c] - mean[c] ** 2) / (X.shape[1] - 1) + return mean, var @@ -157,4 +198,4 @@ def sample_comb( idx = sample_without_replacement( np.prod(dims), nsamp, random_state=random_state, method=method ) - return np.vstack(np.unravel_index(idx, dims)).T + return np.vstack(np.unravel_index(idx, dims)).T \ No newline at end of file From 061bf7537158a9944eec71b71b3a8f596bffee02 Mon Sep 17 00:00:00 2001 From: Severin Dicks <37635888+Intron7@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:53:02 +0100 Subject: [PATCH 2/6] Update src/scanpy/preprocessing/_utils.py Co-authored-by: Ilan Gold --- src/scanpy/preprocessing/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scanpy/preprocessing/_utils.py b/src/scanpy/preprocessing/_utils.py index 8104a8b930..d9dcbbea91 100644 --- a/src/scanpy/preprocessing/_utils.py +++ b/src/scanpy/preprocessing/_utils.py @@ -47,7 +47,7 @@ def _get_mean_var( return mean, var @numba.njit(cache=True, parallel=True) -def _compute_mean_var( +def _compute_mean_var_dense( X: _SupportedArray, axis: Literal[0, 1] = 0, n_threads=1 ) -> tuple[NDArray[np.float64], NDArray[np.float64]]: if axis == 0: From e05cf081b834b0a6debab20a7a388e41cd499708 Mon Sep 17 00:00:00 2001 From: Intron7 Date: Thu, 7 Nov 2024 14:56:24 +0100 Subject: [PATCH 3/6] update release --- docs/release-notes/1.10.3.md | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/release-notes/1.10.3.md b/docs/release-notes/1.10.3.md index 9afb3a0be5..f2f06ca94b 100644 --- a/docs/release-notes/1.10.3.md +++ b/docs/release-notes/1.10.3.md @@ -1,14 +1,16 @@ -### 1.10.3 {small}`the future` +(v1.10.3)= +### 1.10.3 {small}`2024-09-17` -```{rubric} Development features -``` +#### Bug fixes -```{rubric} Docs -``` +- Prevent empty control gene set in {func}`~scanpy.tl.score_genes` {smaller}`M Müller` ({pr}`2875`) +- Fix `subset=True` of {func}`~scanpy.pp.highly_variable_genes` when `flavor` is `seurat` or `cell_ranger`, and `batch_key!=None` {smaller}`E Roellin` ({pr}`3042`) +- Add compatibility with {mod}`numpy` 2.0 {smaller}`P Angerer` {pr}`3065` and ({pr}`3115`) +- Fix `legend_loc` argument in {func}`scanpy.pl.embedding` not accepting matplotlib parameters {smaller}`P Angerer` ({pr}`3163`) +- Fix dispersion cutoff in {func}`~scanpy.pp.highly_variable_genes` in presence of `NaN`s {smaller}`P Angerer` ({pr}`3176`) +- Fix axis labeling for swapped axes in {func}`~scanpy.pl.rank_genes_groups_stacked_violin` {smaller}`Ilan Gold` ({pr}`3196`) +- Upper bound dask on account of {issue}`scverse/anndata#1579` {smaller}`Ilan Gold` ({pr}`3217`) +- The [fa2-modified][] package replaces [forceatlas2][] for the latter’s lack of maintenance {smaller}`A Alam` ({pr}`3220`) -```{rubric} Bug fixes -``` - -```{rubric} Performance -``` -* Speed up _get_mean_var used in {func}`~scanpy.pp.scale` {pr}`3099` {smaller}`P Ashish & S Dicks` \ No newline at end of file + [fa2-modified]: https://github.com/AminAlam/fa2_modified + [forceatlas2]: https://github.com/bhargavchippada/forceatlas2 From 45aaeed3a10594f8c91244b64d5e850180b90c23 Mon Sep 17 00:00:00 2001 From: Intron7 Date: Thu, 7 Nov 2024 14:59:20 +0100 Subject: [PATCH 4/6] try to update --- src/scanpy/preprocessing/_utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/scanpy/preprocessing/_utils.py b/src/scanpy/preprocessing/_utils.py index d9dcbbea91..6bb1bb00b9 100644 --- a/src/scanpy/preprocessing/_utils.py +++ b/src/scanpy/preprocessing/_utils.py @@ -35,7 +35,9 @@ def _get_mean_var( ) -> tuple[NDArray[np.float64], NDArray[np.float64]]: if isinstance(X, np.ndarray): n_threads = numba.get_num_threads() - mean, var = _compute_mean_var(X, axis=axis, n_threads=n_threads) + mean, var = _compute_mean_var_dense(X, axis=axis, n_threads=n_threads) + elif isinstance(X, sparse.spmatrix): + mean, var = sparse_mean_variance_axis(X, axis=axis) else: mean = axis_mean(X, axis=axis, dtype=np.float64) mean_sq = axis_mean(elem_mul(X, X), axis=axis, dtype=np.float64) @@ -43,9 +45,9 @@ def _get_mean_var( # enforce R convention (unbiased estimator) for variance if X.shape[axis] != 1: var *= X.shape[axis] / (X.shape[axis] - 1) - return mean, var + @numba.njit(cache=True, parallel=True) def _compute_mean_var_dense( X: _SupportedArray, axis: Literal[0, 1] = 0, n_threads=1 @@ -194,4 +196,4 @@ def sample_comb( idx = sample_without_replacement( np.prod(dims), nsamp, random_state=random_state, method=method ) - return np.vstack(np.unravel_index(idx, dims)).T \ No newline at end of file + return np.vstack(np.unravel_index(idx, dims)).T From 9bcfa7cc22992813b39ae38c96bf5300564294e0 Mon Sep 17 00:00:00 2001 From: kaushalprasadhial Date: Thu, 21 Nov 2024 17:57:43 +0000 Subject: [PATCH 5/6] suggested changes --- src/scanpy/preprocessing/_utils.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/scanpy/preprocessing/_utils.py b/src/scanpy/preprocessing/_utils.py index af33b5b74b..175c803f19 100644 --- a/src/scanpy/preprocessing/_utils.py +++ b/src/scanpy/preprocessing/_utils.py @@ -70,19 +70,19 @@ def _compute_mean_var_dense( sum_ = sums[:, c].sum() mean[c] = sum_ / n var[c] = (sums_squared[:, c].sum() - sum_ * sum_ / n) / (n - 1) - else: - axis_i = 0 - mean = np.zeros(X.shape[axis_i], dtype=np.float64) - var = np.zeros(X.shape[axis_i], dtype=np.float64) - for r in numba.prange(X.shape[0]): - for c in range(X.shape[1]): - value = X[r, c] - mean[r] += value - var[r] += value * value - for c in numba.prange(X.shape[0]): - mean[c] = mean[c] / X.shape[1] - var[c] = (var[c] - mean[c] ** 2) / (X.shape[1] - 1) + axis_i = 0 + mean = np.zeros(X.shape[axis_i], dtype=np.float64) + var = np.zeros(X.shape[axis_i], dtype=np.float64) + for r in numba.prange(X.shape[0]): + for c in range(X.shape[1]): + value = X[r, c] + mean[r] += value + var[r] += value * value + for c in numba.prange(X.shape[0]): + mean[c] = mean[c] / X.shape[1] + var[c] = (var[c] - mean[c] ** 2) / (X.shape[1] - 1) + return mean, var def sparse_mean_variance_axis(mtx: sparse.spmatrix, axis: int): """ From 56bd4387a285ccfeb344c122f0045180d4cf03ed Mon Sep 17 00:00:00 2001 From: kaushal Date: Fri, 22 Nov 2024 10:18:40 +0530 Subject: [PATCH 6/6] else to _compute_mean_var_dense --- src/scanpy/preprocessing/_utils.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/scanpy/preprocessing/_utils.py b/src/scanpy/preprocessing/_utils.py index 175c803f19..2a94343955 100644 --- a/src/scanpy/preprocessing/_utils.py +++ b/src/scanpy/preprocessing/_utils.py @@ -70,17 +70,18 @@ def _compute_mean_var_dense( sum_ = sums[:, c].sum() mean[c] = sum_ / n var[c] = (sums_squared[:, c].sum() - sum_ * sum_ / n) / (n - 1) - axis_i = 0 - mean = np.zeros(X.shape[axis_i], dtype=np.float64) - var = np.zeros(X.shape[axis_i], dtype=np.float64) - for r in numba.prange(X.shape[0]): - for c in range(X.shape[1]): - value = X[r, c] - mean[r] += value - var[r] += value * value - for c in numba.prange(X.shape[0]): - mean[c] = mean[c] / X.shape[1] - var[c] = (var[c] - mean[c] ** 2) / (X.shape[1] - 1) + else: + axis_i = 0 + mean = np.zeros(X.shape[axis_i], dtype=np.float64) + var = np.zeros(X.shape[axis_i], dtype=np.float64) + for r in numba.prange(X.shape[0]): + for c in range(X.shape[1]): + value = X[r, c] + mean[r] += value + var[r] += value * value + for c in numba.prange(X.shape[0]): + mean[c] = mean[c] / X.shape[1] + var[c] = (var[c] - mean[c] ** 2) / (X.shape[1] - 1) return mean, var