From fd21dc1a7deee129545c5ee7c22732ee5a565fa1 Mon Sep 17 00:00:00 2001 From: wxicu Date: Sun, 26 May 2024 13:42:37 +0200 Subject: [PATCH 01/34] add two distance metrics --- pertpy/tools/_distances/_distances.py | 146 +++++++++++++++++- tests/tools/_distances/test_distance_tests.py | 2 + tests/tools/_distances/test_distances.py | 18 ++- 3 files changed, 159 insertions(+), 7 deletions(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 348f1f78..e3920c17 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -12,16 +12,50 @@ from ott.solvers.linear.sinkhorn import Sinkhorn from pandas import Series from rich.progress import track -from scipy.sparse import issparse -from scipy.spatial.distance import cosine +from scipy.sparse import csr_matrix, issparse +from scipy.spatial.distance import cosine, mahalanobis from scipy.special import gammaln from scipy.stats import kendalltau, kstest, pearsonr, spearmanr from sklearn.linear_model import LogisticRegression from sklearn.metrics import pairwise_distances, r2_score from sklearn.metrics.pairwise import polynomial_kernel, rbf_kernel +from sklearn.neighbors import KernelDensity from statsmodels.discrete.discrete_model import NegativeBinomialP + +def compute_medoid(arr, axis=None): + if len(arr) == 0: + return None # Handle the case when the array is empty + + if axis is not None: + # If axis is specified, compute the medoid along that axis + return np.apply_along_axis(lambda x: compute_medoid(x), axis, arr) + + # Calculate pairwise distances between all elements + distances = np.abs(arr[:, np.newaxis] - arr) + + # Calculate the total distance for each element + total_distances = np.sum(distances, axis=1) + + # Find the index of the element with the smallest total distance (medoid) + medoid_index = np.argmin(total_distances) + + # Return the medoid value + medoid = arr[medoid_index] + + return medoid + + +AGG_FCTS = { + "mean": np.mean, + "median": np.median, + "medoid": compute_medoid, + "variance": np.var, +} + if TYPE_CHECKING: + from collections.abc import Callable + from anndata import AnnData @@ -80,6 +114,9 @@ class Distance: Average of the classification probability of the perturbation for a binary classifier. - "classifier_cp": classifier class projection Average of the class + - "mean_var_distn": Distance between mean-var distibutions of gene expression. + - "mahalanobis": Mahalanobis distance between the cells of two groups. + Attributes: metric: Name of distance metric. @@ -99,6 +136,7 @@ class Distance: def __init__( self, metric: str = "edistance", + agg_fct: str = "mean", layer_key: str = None, obsm_key: str = None, cell_wise_metric: str = "euclidean", @@ -107,6 +145,7 @@ def __init__( Args: metric: Distance metric to use. Defaults to "edistance". + agg_fct: Name of the aggregation function to generate pseodobulk vectors. Defaults to "mean". layer_key: Name of the counts layer containing raw counts to calculate distances for. Mutually exclusive with 'obsm_key'. Defaults to None and is then not used. @@ -116,6 +155,7 @@ def __init__( cell_wise_metric: Metric from scipy.spatial.distance to use for pairwise distances between single cells. Defaults to "euclidean". """ + self.aggregation_func = AGG_FCTS[agg_fct] metric_fct: AbstractDistance = None if metric == "edistance": metric_fct = Edistance() @@ -155,6 +195,10 @@ def __init__( metric_fct = ClassifierProbaDistance() elif metric == "classifier_cp": metric_fct = ClassifierClassProjection() + elif metric == "mean_var_distn": + metric_fct = MeanVarDistnDistance() + elif metric == "mahalanobis": + metric_fct = MahalanobisDistance(self.aggregation_func) else: raise ValueError(f"Metric {metric} not recognized.") self.metric_fct = metric_fct @@ -315,7 +359,13 @@ def onesided_distances( """ if self.metric == "classifier_cp": return self.metric_fct.onesided_distances( # type: ignore - adata, groupby, selected_group, groups, show_progressbar, n_jobs, **kwargs + adata, + groupby, + selected_group, + groups, + show_progressbar, + n_jobs, + **kwargs, ) groups = adata.obs[groupby].unique() if groups is None else groups @@ -857,3 +907,93 @@ def onesided_distances( def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.") + + +class MeanVarDistnDistance(AbstractDistance): + """ + Distance betweenv mean-var distributions of gene expression. + + """ + + def __init__(self, aggregation_func: Callable = np.mean) -> None: + super().__init__() + self.accepts_precomputed = False + + def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: + """ + Difference of mean-var distibutions in 2 matrices + :param X,Y: Matrices of cells*genes, normalized and log transformed + """ + + # Get log mean & var for comparison + def mean_var(x, log: bool = False): + mean = np.mean(x, axis=0) + var = np.var(x, axis=0) + positive = mean > 0 + mean = mean[positive] + var = var[positive] + if log: + mean = np.log(mean) + var = np.log(var) + return mean, var + + def prep_kde_data(x, y): + return np.concatenate([x.reshape(-1, 1), y.reshape(-1, 1)], axis=1) + + mean_x, var_x = mean_var(X, log=True) + x = prep_kde_data(mean_x, var_x) + mean_y, var_y = mean_var(Y, log=True) + y = prep_kde_data(mean_y, var_y) + + def grid_points(d, n_points=100): + # Make grid, add 1 bin on lower/upper end to get final n_points + d_min = d.min() + d_max = d.max() + # Compute bin size + d_bin = (d_max - d_min) / (n_points - 2) + d_min = d_min - d_bin + d_max = d_max + d_bin + return np.arange(start=d_min + 0.5 * d_bin, stop=d_max, step=d_bin) + + # Gridpoints to eval KDE on + mean_grid = grid_points(np.concatenate([mean_x, mean_y])) + var_grid = grid_points(np.concatenate([var_x, var_y])) + grid = np.array(np.meshgrid(mean_grid, var_grid)).T.reshape(-1, 2) + + # KDE + def kde_eval(d, grid): + # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out + # can not be compared well on regions further away from the data as they are -inf + return KernelDensity(bandwidth="silverman", kernel="exponential").fit(d).score_samples(grid) + + kde_x = kde_eval(x, grid) + kde_y = kde_eval(y, grid) + + # KDE difference + kde_diff = ((kde_x - kde_y) ** 2).mean() + + return kde_diff + + def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: + raise NotImplementedError("MeanVarDistnDistance cannot be called on a pairwise distance matrix.") + + +class MahalanobisDistance(AbstractDistance): + """ + Mahalanobis distance between pseudobulk vectors. + """ + + def __init__(self, aggregation_func: Callable = np.mean) -> None: + super().__init__() + self.accepts_precomputed = False + self.aggregation_func = aggregation_func + + def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: + return mahalanobis( + self.aggregation_func(X, axis=0), + self.aggregation_func(Y, axis=0), + np.linalg.inv(np.cov(X.T)), + ) + + def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: + raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.") diff --git a/tests/tools/_distances/test_distance_tests.py b/tests/tools/_distances/test_distance_tests.py index 22c3e93f..573e9e0f 100644 --- a/tests/tools/_distances/test_distance_tests.py +++ b/tests/tools/_distances/test_distance_tests.py @@ -22,6 +22,8 @@ "classifier_proba", # "classifier_cp", # "nbll", + "mahalanobis", + "mean_var_distn", ] count_distances = ["nb_ll"] diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py index 95c81225..bd67efac 100644 --- a/tests/tools/_distances/test_distances.py +++ b/tests/tools/_distances/test_distances.py @@ -20,32 +20,42 @@ "spearman_distance", "t_test", "wasserstein", + "mahalanobis", ] semi_distances = ["r2_distance", "sym_kldiv", "ks_test"] non_distances = ["classifier_proba"] onesided_only = ["classifier_cp"] pseudo_counts_distances = ["nb_ll"] +lognorm_counts_distances = ["mean_var_distn"] all_distances = actual_distances + semi_distances + non_distances + pseudo_counts_distances # + onesided_only @pytest.fixture def all_pairwise_distances(): all_calulated_distances = {} - no_subsample_distances = ["sym_kldiv", "t_test", "ks_test", "classifier_proba", "classifier_cp"] + no_subsample_distances = [ + "sym_kldiv", + "t_test", + "ks_test", + "classifier_proba", + "classifier_cp", + "mahalanobis", + ] for distance in all_distances: adata = pt.dt.distance_example() if distance not in no_subsample_distances: adata = sc.pp.subsample(adata, 0.001, copy=True) - else: + elif distance != "mahalanobis": adata = sc.pp.subsample(adata, 0.1, copy=True) adata.layers["lognorm"] = adata.X.copy() adata.layers["counts"] = np.round(adata.X.toarray()).astype(int) if "X_pca" not in adata.obsm.keys(): sc.pp.pca(adata, n_comps=5) - - if distance in pseudo_counts_distances: + if distance in lognorm_counts_distances: + Distance = pt.tl.Distance(distance, layer_key="lognorm") + elif distance in pseudo_counts_distances: Distance = pt.tl.Distance(distance, layer_key="counts") else: Distance = pt.tl.Distance(distance, obsm_key="X_pca") From 3af7d8990eaf1e6e50e48cbe238e47289b78a34c Mon Sep 17 00:00:00 2001 From: wxicu Date: Sun, 26 May 2024 22:50:38 +0200 Subject: [PATCH 02/34] add obsm_key param to distance test --- pertpy/tools/_distances/_distance_tests.py | 13 +- pertpy/tools/_distances/_distances.py | 131 ++++++++++++++++----- 2 files changed, 110 insertions(+), 34 deletions(-) diff --git a/pertpy/tools/_distances/_distance_tests.py b/pertpy/tools/_distances/_distance_tests.py index 76418139..ad29c311 100644 --- a/pertpy/tools/_distances/_distance_tests.py +++ b/pertpy/tools/_distances/_distance_tests.py @@ -66,11 +66,14 @@ def __init__( self.alpha = alpha self.correction = correction self.cell_wise_metric = ( - cell_wise_metric if cell_wise_metric else Distance(self.metric, self.obsm_key).cell_wise_metric + cell_wise_metric if cell_wise_metric else Distance(self.metric, obsm_key=self.obsm_key).cell_wise_metric ) self.distance = Distance( - self.metric, layer_key=self.layer_key, obsm_key=self.obsm_key, cell_wise_metric=self.cell_wise_metric + self.metric, + layer_key=self.layer_key, + obsm_key=self.obsm_key, + cell_wise_metric=self.cell_wise_metric, ) def __call__( @@ -176,7 +179,8 @@ def test_xy(self, adata: AnnData, groupby: str, contrast: str, show_progressbar: # Evaluate the test # count times shuffling resulted in larger distance comparison_results = np.array( - pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, dtype=int + pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, + dtype=int, ) n_failures = pd.Series(np.clip(np.sum(comparison_results, axis=1), 1, np.inf), index=df.index) pvalues = n_failures / self.n_perms @@ -284,7 +288,8 @@ def test_precomputed(self, adata: AnnData, groupby: str, contrast: str, verbose: # Evaluate the test # count times shuffling resulted in larger distance comparison_results = np.array( - pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, dtype=int + pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, + dtype=int, ) n_failures = pd.Series(np.clip(np.sum(comparison_results, axis=1), 1, np.inf), index=df.index) pvalues = n_failures / self.n_perms diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index e3920c17..8eacbfd3 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -289,7 +289,10 @@ def pairwise( # able to handle precomputed distances such as the PseudobulkDistance. if self.metric_fct.accepts_precomputed: # Precompute the pairwise distances if needed - if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys(): + if ( + f"{self.obsm_key}_{self.cell_wise_metric}_predistances" + not in adata.obsp.keys() + ): self.precompute_distances(adata, n_jobs=n_jobs, **kwargs) pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] for index_x, group_x in enumerate(fct(groups)): @@ -302,7 +305,9 @@ def pairwise( # subset the pairwise distance matrix to the two groups sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y] sub_idx = grouping[idx_x | idx_y] == group_x - dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs) + dist = self.metric_fct.from_precomputed( + sub_pwd, sub_idx, **kwargs + ) df.loc[group_x, group_y] = dist df.loc[group_y, group_x] = dist else: @@ -380,7 +385,10 @@ def onesided_distances( # able to handle precomputed distances such as the PsuedobulkDistance. if self.metric_fct.accepts_precomputed: # Precompute the pairwise distances if needed - if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys(): + if ( + f"{self.obsm_key}_{self.cell_wise_metric}_predistances" + not in adata.obsp.keys() + ): self.precompute_distances(adata, n_jobs=n_jobs, **kwargs) pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] for group_x in fct(groups): @@ -434,7 +442,9 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None: cells = adata.layers[self.layer_key] else: cells = adata.obsm[self.obsm_key].copy() - pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs) + pwd = pairwise_distances( + cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs + ) adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd @@ -503,7 +513,9 @@ def __init__(self) -> None: super().__init__() self.accepts_precomputed = False - def __call__(self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs) -> float: + def __call__( + self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs + ) -> float: if kernel == "linear": XX = np.dot(X, X.T) YY = np.dot(Y, Y.T) @@ -558,7 +570,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("EuclideanDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "EuclideanDistance cannot be called on a pairwise distance matrix." + ) class MeanSquaredDistance(AbstractDistance): @@ -569,10 +583,15 @@ def __init__(self) -> None: self.accepts_precomputed = False def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 / X.shape[1] + return ( + np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 + / X.shape[1] + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "MeanSquaredDistance cannot be called on a pairwise distance matrix." + ) class MeanAbsoluteDistance(AbstractDistance): @@ -583,10 +602,15 @@ def __init__(self) -> None: self.accepts_precomputed = False def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) / X.shape[1] + return ( + np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) + / X.shape[1] + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "MeanAbsoluteDistance cannot be called on a pairwise distance matrix." + ) class MeanPairwiseDistance(AbstractDistance): @@ -616,7 +640,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return 1 - pearsonr(X.mean(axis=0), Y.mean(axis=0))[0] def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("PearsonDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "PearsonDistance cannot be called on a pairwise distance matrix." + ) class SpearmanDistance(AbstractDistance): @@ -630,7 +656,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return 1 - spearmanr(X.mean(axis=0), Y.mean(axis=0))[0] def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("SpearmanDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "SpearmanDistance cannot be called on a pairwise distance matrix." + ) class KendallTauDistance(AbstractDistance): @@ -648,7 +676,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return tau_dist def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("KendallTauDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "KendallTauDistance cannot be called on a pairwise distance matrix." + ) class CosineDistance(AbstractDistance): @@ -662,7 +692,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return cosine(X.mean(axis=0), Y.mean(axis=0)) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("CosineDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "CosineDistance cannot be called on a pairwise distance matrix." + ) class R2ScoreDistance(AbstractDistance): @@ -678,7 +710,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return 1 - r2_score(X.mean(axis=0), Y.mean(axis=0)) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "R2ScoreDistance cannot be called on a pairwise distance matrix." + ) class SymmetricKLDivergence(AbstractDistance): @@ -699,13 +733,23 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa for i in range(X.shape[1]): x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon - kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2 - klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2 + kl = ( + np.log(y_std / x_std) + + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) + - 1 / 2 + ) + klr = ( + np.log(x_std / y_std) + + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) + - 1 / 2 + ) kl_all.append(kl + klr) return sum(kl_all) / len(kl_all) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "SymmetricKLDivergence cannot be called on a pairwise distance matrix." + ) class TTestDistance(AbstractDistance): @@ -729,7 +773,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa return sum(t_test_all) / len(t_test_all) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "TTestDistance cannot be called on a pairwise distance matrix." + ) class KSTestDistance(AbstractDistance): @@ -746,7 +792,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return sum(stats) / len(stats) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "KSTestDistance cannot be called on a pairwise distance matrix." + ) class NBLL(AbstractDistance): @@ -761,7 +809,9 @@ def __init__(self) -> None: def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> float: def _is_count_matrix(matrix, tolerance=1e-6): - if matrix.dtype.kind == "i" or np.all(np.abs(matrix - np.round(matrix)) < tolerance): + if matrix.dtype.kind == "i" or np.all( + np.abs(matrix - np.round(matrix)) < tolerance + ): return True else: return False @@ -770,7 +820,9 @@ def _is_count_matrix(matrix, tolerance=1e-6): raise ValueError("NBLL distance only works for raw counts.") @numba.jit(forceobj=True) - def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float: + def _compute_nll( + y: np.ndarray, nb_params: tuple[float, float], epsilon: float + ) -> float: mu = np.exp(nb_params[0]) theta = 1 / nb_params[1] eps = epsilon @@ -806,12 +858,16 @@ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float: nlls.append(nll) if genes_skipped > X.shape[1] / 2: - raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.") + raise AttributeError( + f"{genes_skipped} genes could not be fit, which is over half." + ) return -np.sum(nlls) / len(nlls) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "NBLL cannot be called on a pairwise distance matrix." + ) def _sample(X, frac=None, n=None): @@ -853,7 +909,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return np.mean(test_labels[:, 1]) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "ClassifierProbaDistance cannot be called on a pairwise distance matrix." + ) class ClassifierClassProjection(AbstractDistance): @@ -867,7 +925,9 @@ def __init__(self) -> None: self.accepts_precomputed = False def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.") + raise NotImplementedError( + "ClassifierClassProjection can currently only be called with onesided." + ) def onesided_distances( self, @@ -906,7 +966,9 @@ def onesided_distances( return df def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "ClassifierClassProjection cannot be called on a pairwise distance matrix." + ) class MeanVarDistnDistance(AbstractDistance): @@ -964,7 +1026,11 @@ def grid_points(d, n_points=100): def kde_eval(d, grid): # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out # can not be compared well on regions further away from the data as they are -inf - return KernelDensity(bandwidth="silverman", kernel="exponential").fit(d).score_samples(grid) + return ( + KernelDensity(bandwidth="silverman", kernel="exponential") + .fit(d) + .score_samples(grid) + ) kde_x = kde_eval(x, grid) kde_y = kde_eval(y, grid) @@ -975,7 +1041,9 @@ def kde_eval(d, grid): return kde_diff def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("MeanVarDistnDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "MeanVarDistnDistance cannot be called on a pairwise distance matrix." + ) class MahalanobisDistance(AbstractDistance): @@ -989,6 +1057,7 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: + print(np.linalg.inv(np.cov(X.T)).shape) return mahalanobis( self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0), @@ -996,4 +1065,6 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "Mahalanobis cannot be called on a pairwise distance matrix." + ) From 3fe911b0b57be7ded10ecd9b73a6457d6f819229 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 26 May 2024 20:50:50 +0000 Subject: [PATCH 03/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pertpy/tools/_distances/_distances.py | 130 ++++++-------------------- 1 file changed, 30 insertions(+), 100 deletions(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 8eacbfd3..4f4bc4d2 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -289,10 +289,7 @@ def pairwise( # able to handle precomputed distances such as the PseudobulkDistance. if self.metric_fct.accepts_precomputed: # Precompute the pairwise distances if needed - if ( - f"{self.obsm_key}_{self.cell_wise_metric}_predistances" - not in adata.obsp.keys() - ): + if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys(): self.precompute_distances(adata, n_jobs=n_jobs, **kwargs) pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] for index_x, group_x in enumerate(fct(groups)): @@ -305,9 +302,7 @@ def pairwise( # subset the pairwise distance matrix to the two groups sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y] sub_idx = grouping[idx_x | idx_y] == group_x - dist = self.metric_fct.from_precomputed( - sub_pwd, sub_idx, **kwargs - ) + dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs) df.loc[group_x, group_y] = dist df.loc[group_y, group_x] = dist else: @@ -385,10 +380,7 @@ def onesided_distances( # able to handle precomputed distances such as the PsuedobulkDistance. if self.metric_fct.accepts_precomputed: # Precompute the pairwise distances if needed - if ( - f"{self.obsm_key}_{self.cell_wise_metric}_predistances" - not in adata.obsp.keys() - ): + if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys(): self.precompute_distances(adata, n_jobs=n_jobs, **kwargs) pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] for group_x in fct(groups): @@ -442,9 +434,7 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None: cells = adata.layers[self.layer_key] else: cells = adata.obsm[self.obsm_key].copy() - pwd = pairwise_distances( - cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs - ) + pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs) adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd @@ -513,9 +503,7 @@ def __init__(self) -> None: super().__init__() self.accepts_precomputed = False - def __call__( - self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs - ) -> float: + def __call__(self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs) -> float: if kernel == "linear": XX = np.dot(X, X.T) YY = np.dot(Y, Y.T) @@ -570,9 +558,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "EuclideanDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("EuclideanDistance cannot be called on a pairwise distance matrix.") class MeanSquaredDistance(AbstractDistance): @@ -583,15 +569,10 @@ def __init__(self) -> None: self.accepts_precomputed = False def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return ( - np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 - / X.shape[1] - ) + return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 / X.shape[1] def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "MeanSquaredDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.") class MeanAbsoluteDistance(AbstractDistance): @@ -602,15 +583,10 @@ def __init__(self) -> None: self.accepts_precomputed = False def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return ( - np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) - / X.shape[1] - ) + return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) / X.shape[1] def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "MeanAbsoluteDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.") class MeanPairwiseDistance(AbstractDistance): @@ -640,9 +616,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return 1 - pearsonr(X.mean(axis=0), Y.mean(axis=0))[0] def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "PearsonDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("PearsonDistance cannot be called on a pairwise distance matrix.") class SpearmanDistance(AbstractDistance): @@ -656,9 +630,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return 1 - spearmanr(X.mean(axis=0), Y.mean(axis=0))[0] def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "SpearmanDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("SpearmanDistance cannot be called on a pairwise distance matrix.") class KendallTauDistance(AbstractDistance): @@ -676,9 +648,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return tau_dist def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "KendallTauDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("KendallTauDistance cannot be called on a pairwise distance matrix.") class CosineDistance(AbstractDistance): @@ -692,9 +662,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return cosine(X.mean(axis=0), Y.mean(axis=0)) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "CosineDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("CosineDistance cannot be called on a pairwise distance matrix.") class R2ScoreDistance(AbstractDistance): @@ -710,9 +678,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return 1 - r2_score(X.mean(axis=0), Y.mean(axis=0)) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "R2ScoreDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.") class SymmetricKLDivergence(AbstractDistance): @@ -733,23 +699,13 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa for i in range(X.shape[1]): x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon - kl = ( - np.log(y_std / x_std) - + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - - 1 / 2 - ) - klr = ( - np.log(x_std / y_std) - + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - - 1 / 2 - ) + kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2 + klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2 kl_all.append(kl + klr) return sum(kl_all) / len(kl_all) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "SymmetricKLDivergence cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.") class TTestDistance(AbstractDistance): @@ -773,9 +729,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa return sum(t_test_all) / len(t_test_all) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "TTestDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.") class KSTestDistance(AbstractDistance): @@ -792,9 +746,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return sum(stats) / len(stats) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "KSTestDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.") class NBLL(AbstractDistance): @@ -809,9 +761,7 @@ def __init__(self) -> None: def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> float: def _is_count_matrix(matrix, tolerance=1e-6): - if matrix.dtype.kind == "i" or np.all( - np.abs(matrix - np.round(matrix)) < tolerance - ): + if matrix.dtype.kind == "i" or np.all(np.abs(matrix - np.round(matrix)) < tolerance): return True else: return False @@ -820,9 +770,7 @@ def _is_count_matrix(matrix, tolerance=1e-6): raise ValueError("NBLL distance only works for raw counts.") @numba.jit(forceobj=True) - def _compute_nll( - y: np.ndarray, nb_params: tuple[float, float], epsilon: float - ) -> float: + def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float: mu = np.exp(nb_params[0]) theta = 1 / nb_params[1] eps = epsilon @@ -858,16 +806,12 @@ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float: nlls.append(nll) if genes_skipped > X.shape[1] / 2: - raise AttributeError( - f"{genes_skipped} genes could not be fit, which is over half." - ) + raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.") return -np.sum(nlls) / len(nlls) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "NBLL cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.") def _sample(X, frac=None, n=None): @@ -909,9 +853,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return np.mean(test_labels[:, 1]) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "ClassifierProbaDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.") class ClassifierClassProjection(AbstractDistance): @@ -925,9 +867,7 @@ def __init__(self) -> None: self.accepts_precomputed = False def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "ClassifierClassProjection can currently only be called with onesided." - ) + raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.") def onesided_distances( self, @@ -966,9 +906,7 @@ def onesided_distances( return df def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "ClassifierClassProjection cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.") class MeanVarDistnDistance(AbstractDistance): @@ -1026,11 +964,7 @@ def grid_points(d, n_points=100): def kde_eval(d, grid): # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out # can not be compared well on regions further away from the data as they are -inf - return ( - KernelDensity(bandwidth="silverman", kernel="exponential") - .fit(d) - .score_samples(grid) - ) + return KernelDensity(bandwidth="silverman", kernel="exponential").fit(d).score_samples(grid) kde_x = kde_eval(x, grid) kde_y = kde_eval(y, grid) @@ -1041,9 +975,7 @@ def kde_eval(d, grid): return kde_diff def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "MeanVarDistnDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("MeanVarDistnDistance cannot be called on a pairwise distance matrix.") class MahalanobisDistance(AbstractDistance): @@ -1065,6 +997,4 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "Mahalanobis cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.") From 6d419c3c6dcd5b703b3ee795f845dd9b044fe779 Mon Sep 17 00:00:00 2001 From: wxicu Date: Mon, 3 Jun 2024 00:08:30 +0200 Subject: [PATCH 04/34] add agg fct --- pertpy/tools/_distances/_distances.py | 209 +++++++++++++++++------ tests/tools/_distances/test_distances.py | 22 ++- 2 files changed, 171 insertions(+), 60 deletions(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 4f4bc4d2..5d50a9bb 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -160,23 +160,23 @@ def __init__( if metric == "edistance": metric_fct = Edistance() elif metric == "euclidean": - metric_fct = EuclideanDistance() + metric_fct = EuclideanDistance(self.aggregation_func) elif metric == "root_mean_squared_error": - metric_fct = EuclideanDistance() + metric_fct = EuclideanDistance(self.aggregation_func) elif metric == "mse": - metric_fct = MeanSquaredDistance() + metric_fct = MeanSquaredDistance(self.aggregation_func) elif metric == "mean_absolute_error": - metric_fct = MeanAbsoluteDistance() + metric_fct = MeanAbsoluteDistance(self.aggregation_func) elif metric == "pearson_distance": - metric_fct = PearsonDistance() + metric_fct = PearsonDistance(self.aggregation_func) elif metric == "spearman_distance": - metric_fct = SpearmanDistance() + metric_fct = SpearmanDistance(self.aggregation_func) elif metric == "kendalltau_distance": - metric_fct = KendallTauDistance() + metric_fct = KendallTauDistance(self.aggregation_func) elif metric == "cosine_distance": - metric_fct = CosineDistance() + metric_fct = CosineDistance(self.aggregation_func) elif metric == "r2_distance": - metric_fct = R2ScoreDistance() + metric_fct = R2ScoreDistance(self.aggregation_func) elif metric == "mean_pairwise": metric_fct = MeanPairwiseDistance() elif metric == "mmd": @@ -289,7 +289,10 @@ def pairwise( # able to handle precomputed distances such as the PseudobulkDistance. if self.metric_fct.accepts_precomputed: # Precompute the pairwise distances if needed - if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys(): + if ( + f"{self.obsm_key}_{self.cell_wise_metric}_predistances" + not in adata.obsp.keys() + ): self.precompute_distances(adata, n_jobs=n_jobs, **kwargs) pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] for index_x, group_x in enumerate(fct(groups)): @@ -302,7 +305,9 @@ def pairwise( # subset the pairwise distance matrix to the two groups sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y] sub_idx = grouping[idx_x | idx_y] == group_x - dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs) + dist = self.metric_fct.from_precomputed( + sub_pwd, sub_idx, **kwargs + ) df.loc[group_x, group_y] = dist df.loc[group_y, group_x] = dist else: @@ -380,7 +385,10 @@ def onesided_distances( # able to handle precomputed distances such as the PsuedobulkDistance. if self.metric_fct.accepts_precomputed: # Precompute the pairwise distances if needed - if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys(): + if ( + f"{self.obsm_key}_{self.cell_wise_metric}_predistances" + not in adata.obsp.keys() + ): self.precompute_distances(adata, n_jobs=n_jobs, **kwargs) pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] for group_x in fct(groups): @@ -434,7 +442,9 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None: cells = adata.layers[self.layer_key] else: cells = adata.obsm[self.obsm_key].copy() - pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs) + pwd = pairwise_distances( + cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs + ) adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd @@ -503,7 +513,9 @@ def __init__(self) -> None: super().__init__() self.accepts_precomputed = False - def __call__(self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs) -> float: + def __call__( + self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs + ) -> float: if kernel == "linear": XX = np.dot(X, X.T) YY = np.dot(Y, Y.T) @@ -550,43 +562,62 @@ def solve_ot_problem(self, geom: Geometry, **kwargs): class EuclideanDistance(AbstractDistance): """Euclidean distance between pseudobulk vectors.""" - def __init__(self) -> None: + def __init__(self, aggregation_func: Callable = np.mean) -> None: super().__init__() self.accepts_precomputed = False + self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) + return np.linalg.norm( + self.aggregation_func(X, axis=0) - self.aggregation_func(Y, axis=0), + ord=2, + **kwargs, + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("EuclideanDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "EuclideanDistance cannot be called on a pairwise distance matrix." + ) class MeanSquaredDistance(AbstractDistance): """Mean squared distance between pseudobulk vectors.""" - def __init__(self) -> None: + def __init__(self, aggregation_func: Callable = np.mean) -> None: super().__init__() self.accepts_precomputed = False + self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 / X.shape[1] + return ( + np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 + / X.shape[1] + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "MeanSquaredDistance cannot be called on a pairwise distance matrix." + ) class MeanAbsoluteDistance(AbstractDistance): """Absolute (Norm-1) distance between pseudobulk vectors.""" - def __init__(self) -> None: + def __init__(self, aggregation_func: Callable = np.mean) -> None: super().__init__() self.accepts_precomputed = False + self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) / X.shape[1] + return ( + np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) + / X.shape[1] + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "MeanAbsoluteDistance cannot be called on a pairwise distance matrix." + ) class MeanPairwiseDistance(AbstractDistance): @@ -608,61 +639,85 @@ def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: class PearsonDistance(AbstractDistance): """Pearson distance between pseudobulk vectors.""" - def __init__(self) -> None: + def __init__(self, aggregation_func: Callable = np.mean) -> None: super().__init__() self.accepts_precomputed = False + self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return 1 - pearsonr(X.mean(axis=0), Y.mean(axis=0))[0] + return ( + 1 + - pearsonr( + self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) + )[0] + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("PearsonDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "PearsonDistance cannot be called on a pairwise distance matrix." + ) class SpearmanDistance(AbstractDistance): """Spearman distance between pseudobulk vectors.""" - def __init__(self) -> None: + def __init__(self, aggregation_func: Callable = np.mean) -> None: super().__init__() self.accepts_precomputed = False + self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return 1 - spearmanr(X.mean(axis=0), Y.mean(axis=0))[0] + return ( + 1 + - spearmanr( + self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) + )[0] + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("SpearmanDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "SpearmanDistance cannot be called on a pairwise distance matrix." + ) class KendallTauDistance(AbstractDistance): """Kendall-tau distance between pseudobulk vectors.""" - def __init__(self) -> None: + def __init__(self, aggregation_func: Callable = np.mean) -> None: super().__init__() self.accepts_precomputed = False + self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - x, y = X.mean(axis=0), Y.mean(axis=0) + x, y = self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) n = len(x) tau_corr = kendalltau(x, y).statistic tau_dist = (1 - tau_corr) * n * (n - 1) / 4 return tau_dist def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("KendallTauDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "KendallTauDistance cannot be called on a pairwise distance matrix." + ) class CosineDistance(AbstractDistance): """Cosine distance between pseudobulk vectors.""" - def __init__(self) -> None: + def __init__(self, aggregation_func: Callable = np.mean) -> None: super().__init__() self.accepts_precomputed = False + self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return cosine(X.mean(axis=0), Y.mean(axis=0)) + return cosine( + self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("CosineDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "CosineDistance cannot be called on a pairwise distance matrix." + ) class R2ScoreDistance(AbstractDistance): @@ -670,15 +725,20 @@ class R2ScoreDistance(AbstractDistance): # NOTE: This is not a distance metric but a similarity metric. - def __init__(self) -> None: + def __init__(self, aggregation_func: Callable = np.mean) -> None: super().__init__() self.accepts_precomputed = False + self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return 1 - r2_score(X.mean(axis=0), Y.mean(axis=0)) + return 1 - r2_score( + self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "R2ScoreDistance cannot be called on a pairwise distance matrix." + ) class SymmetricKLDivergence(AbstractDistance): @@ -699,13 +759,23 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa for i in range(X.shape[1]): x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon - kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2 - klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2 + kl = ( + np.log(y_std / x_std) + + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) + - 1 / 2 + ) + klr = ( + np.log(x_std / y_std) + + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) + - 1 / 2 + ) kl_all.append(kl + klr) return sum(kl_all) / len(kl_all) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "SymmetricKLDivergence cannot be called on a pairwise distance matrix." + ) class TTestDistance(AbstractDistance): @@ -729,7 +799,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa return sum(t_test_all) / len(t_test_all) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "TTestDistance cannot be called on a pairwise distance matrix." + ) class KSTestDistance(AbstractDistance): @@ -746,7 +818,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return sum(stats) / len(stats) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "KSTestDistance cannot be called on a pairwise distance matrix." + ) class NBLL(AbstractDistance): @@ -761,7 +835,9 @@ def __init__(self) -> None: def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> float: def _is_count_matrix(matrix, tolerance=1e-6): - if matrix.dtype.kind == "i" or np.all(np.abs(matrix - np.round(matrix)) < tolerance): + if matrix.dtype.kind == "i" or np.all( + np.abs(matrix - np.round(matrix)) < tolerance + ): return True else: return False @@ -770,7 +846,9 @@ def _is_count_matrix(matrix, tolerance=1e-6): raise ValueError("NBLL distance only works for raw counts.") @numba.jit(forceobj=True) - def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float: + def _compute_nll( + y: np.ndarray, nb_params: tuple[float, float], epsilon: float + ) -> float: mu = np.exp(nb_params[0]) theta = 1 / nb_params[1] eps = epsilon @@ -806,12 +884,16 @@ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float: nlls.append(nll) if genes_skipped > X.shape[1] / 2: - raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.") + raise AttributeError( + f"{genes_skipped} genes could not be fit, which is over half." + ) return -np.sum(nlls) / len(nlls) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "NBLL cannot be called on a pairwise distance matrix." + ) def _sample(X, frac=None, n=None): @@ -853,7 +935,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return np.mean(test_labels[:, 1]) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "ClassifierProbaDistance cannot be called on a pairwise distance matrix." + ) class ClassifierClassProjection(AbstractDistance): @@ -867,7 +951,9 @@ def __init__(self) -> None: self.accepts_precomputed = False def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.") + raise NotImplementedError( + "ClassifierClassProjection can currently only be called with onesided." + ) def onesided_distances( self, @@ -884,6 +970,7 @@ def onesided_distances( Similar to the parent function, the returned dataframe contains only the specified groups. """ groups = adata.obs[groupby].unique() if groups is None else groups + fct = track if show_progressbar else lambda iterable: iterable X = adata[adata.obs[groupby] != selected_group].X labels = adata[adata.obs[groupby] != selected_group].obs[groupby].values @@ -894,7 +981,8 @@ def onesided_distances( test_probas = reg.predict_proba(Y) df = pd.Series(index=groups, dtype=float) - for group in groups: + + for group in fct(groups): if group == selected_group: df.loc[group] = 0 else: @@ -906,7 +994,9 @@ def onesided_distances( return df def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "ClassifierClassProjection cannot be called on a pairwise distance matrix." + ) class MeanVarDistnDistance(AbstractDistance): @@ -915,7 +1005,7 @@ class MeanVarDistnDistance(AbstractDistance): """ - def __init__(self, aggregation_func: Callable = np.mean) -> None: + def __init__(self) -> None: super().__init__() self.accepts_precomputed = False @@ -964,7 +1054,11 @@ def grid_points(d, n_points=100): def kde_eval(d, grid): # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out # can not be compared well on regions further away from the data as they are -inf - return KernelDensity(bandwidth="silverman", kernel="exponential").fit(d).score_samples(grid) + return ( + KernelDensity(bandwidth="silverman", kernel="exponential") + .fit(d) + .score_samples(grid) + ) kde_x = kde_eval(x, grid) kde_y = kde_eval(y, grid) @@ -975,7 +1069,9 @@ def kde_eval(d, grid): return kde_diff def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("MeanVarDistnDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "MeanVarDistnDistance cannot be called on a pairwise distance matrix." + ) class MahalanobisDistance(AbstractDistance): @@ -989,7 +1085,6 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - print(np.linalg.inv(np.cov(X.T)).shape) return mahalanobis( self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0), @@ -997,4 +1092,6 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "Mahalanobis cannot be called on a pairwise distance matrix." + ) diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py index bd67efac..ccf9b4b9 100644 --- a/tests/tools/_distances/test_distances.py +++ b/tests/tools/_distances/test_distances.py @@ -27,7 +27,13 @@ onesided_only = ["classifier_cp"] pseudo_counts_distances = ["nb_ll"] lognorm_counts_distances = ["mean_var_distn"] -all_distances = actual_distances + semi_distances + non_distances + pseudo_counts_distances # + onesided_only +all_distances = ( + actual_distances + + semi_distances + + non_distances + + pseudo_counts_distances + + lognorm_counts_distances +) # + onesided_only @pytest.fixture @@ -40,6 +46,7 @@ def all_pairwise_distances(): "classifier_proba", "classifier_cp", "mahalanobis", + "mean_var_distn", ] for distance in all_distances: @@ -90,7 +97,10 @@ def test_triangle_inequality(all_pairwise_distances): for _i in range(10): rng = np.random.default_rng() triplet = rng.choice(df.index, size=3, replace=False) - assert df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] >= df.loc[triplet[0], triplet[2]] + assert ( + df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] + >= df.loc[triplet[0], triplet[2]] + ) def test_distance_layers(all_pairwise_distances): @@ -120,7 +130,9 @@ def test_distance_output_type(all_pairwise_distances): # Test if distances are outputting floats for distance in all_distances: df = all_pairwise_distances[distance] - assert df.apply(lambda col: pd.api.types.is_float_dtype(col)).all(), "Not all values are floats." + assert df.apply( + lambda col: pd.api.types.is_float_dtype(col) + ).all(), "Not all values are floats." def test_distance_pairwise(all_pairwise_distances): @@ -141,7 +153,9 @@ def test_distance_onesided(): for distance in onesided_only: Distance = pt.tl.Distance(distance, obsm_key="X_pca") - df = Distance.onesided_distances(adata, groupby="perturbation", selected_group=selected_group) + df = Distance.onesided_distances( + adata, groupby="perturbation", selected_group=selected_group + ) assert isinstance(df, Series) assert df.loc[selected_group] == 0 # distance to self is 0 From ca860256bef6d0a43338f4f538e27139204019a6 Mon Sep 17 00:00:00 2001 From: wxicu Date: Mon, 3 Jun 2024 14:04:51 +0200 Subject: [PATCH 05/34] speed up tests --- pertpy/tools/_distances/_distances.py | 194 ++++++----------------- tests/tools/_distances/test_distances.py | 32 ++-- 2 files changed, 65 insertions(+), 161 deletions(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 5d50a9bb..c2bcfd73 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -1,5 +1,6 @@ from __future__ import annotations +import multiprocessing from abc import ABC, abstractmethod from typing import TYPE_CHECKING @@ -25,22 +26,14 @@ def compute_medoid(arr, axis=None): if len(arr) == 0: - return None # Handle the case when the array is empty + return None if axis is not None: - # If axis is specified, compute the medoid along that axis return np.apply_along_axis(lambda x: compute_medoid(x), axis, arr) - # Calculate pairwise distances between all elements distances = np.abs(arr[:, np.newaxis] - arr) - - # Calculate the total distance for each element total_distances = np.sum(distances, axis=1) - - # Find the index of the element with the smallest total distance (medoid) medoid_index = np.argmin(total_distances) - - # Return the medoid value medoid = arr[medoid_index] return medoid @@ -289,10 +282,7 @@ def pairwise( # able to handle precomputed distances such as the PseudobulkDistance. if self.metric_fct.accepts_precomputed: # Precompute the pairwise distances if needed - if ( - f"{self.obsm_key}_{self.cell_wise_metric}_predistances" - not in adata.obsp.keys() - ): + if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys(): self.precompute_distances(adata, n_jobs=n_jobs, **kwargs) pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] for index_x, group_x in enumerate(fct(groups)): @@ -305,9 +295,7 @@ def pairwise( # subset the pairwise distance matrix to the two groups sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y] sub_idx = grouping[idx_x | idx_y] == group_x - dist = self.metric_fct.from_precomputed( - sub_pwd, sub_idx, **kwargs - ) + dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs) df.loc[group_x, group_y] = dist df.loc[group_y, group_x] = dist else: @@ -385,10 +373,7 @@ def onesided_distances( # able to handle precomputed distances such as the PsuedobulkDistance. if self.metric_fct.accepts_precomputed: # Precompute the pairwise distances if needed - if ( - f"{self.obsm_key}_{self.cell_wise_metric}_predistances" - not in adata.obsp.keys() - ): + if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys(): self.precompute_distances(adata, n_jobs=n_jobs, **kwargs) pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] for group_x in fct(groups): @@ -442,9 +427,7 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None: cells = adata.layers[self.layer_key] else: cells = adata.obsm[self.obsm_key].copy() - pwd = pairwise_distances( - cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs - ) + pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs) adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd @@ -513,9 +496,7 @@ def __init__(self) -> None: super().__init__() self.accepts_precomputed = False - def __call__( - self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs - ) -> float: + def __call__(self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs) -> float: if kernel == "linear": XX = np.dot(X, X.T) YY = np.dot(Y, Y.T) @@ -575,9 +556,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "EuclideanDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("EuclideanDistance cannot be called on a pairwise distance matrix.") class MeanSquaredDistance(AbstractDistance): @@ -589,15 +568,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return ( - np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 - / X.shape[1] - ) + return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 / X.shape[1] def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "MeanSquaredDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.") class MeanAbsoluteDistance(AbstractDistance): @@ -609,15 +583,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return ( - np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) - / X.shape[1] - ) + return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) / X.shape[1] def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "MeanAbsoluteDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.") class MeanPairwiseDistance(AbstractDistance): @@ -645,17 +614,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return ( - 1 - - pearsonr( - self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) - )[0] - ) + return 1 - pearsonr(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))[0] def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "PearsonDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("PearsonDistance cannot be called on a pairwise distance matrix.") class SpearmanDistance(AbstractDistance): @@ -667,17 +629,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return ( - 1 - - spearmanr( - self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) - )[0] - ) + return 1 - spearmanr(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))[0] def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "SpearmanDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("SpearmanDistance cannot be called on a pairwise distance matrix.") class KendallTauDistance(AbstractDistance): @@ -696,9 +651,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return tau_dist def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "KendallTauDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("KendallTauDistance cannot be called on a pairwise distance matrix.") class CosineDistance(AbstractDistance): @@ -710,14 +663,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return cosine( - self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) - ) + return cosine(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "CosineDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("CosineDistance cannot be called on a pairwise distance matrix.") class R2ScoreDistance(AbstractDistance): @@ -731,14 +680,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return 1 - r2_score( - self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) - ) + return 1 - r2_score(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "R2ScoreDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.") class SymmetricKLDivergence(AbstractDistance): @@ -759,23 +704,13 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa for i in range(X.shape[1]): x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon - kl = ( - np.log(y_std / x_std) - + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - - 1 / 2 - ) - klr = ( - np.log(x_std / y_std) - + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - - 1 / 2 - ) + kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2 + klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2 kl_all.append(kl + klr) return sum(kl_all) / len(kl_all) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "SymmetricKLDivergence cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.") class TTestDistance(AbstractDistance): @@ -799,9 +734,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa return sum(t_test_all) / len(t_test_all) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "TTestDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.") class KSTestDistance(AbstractDistance): @@ -818,9 +751,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return sum(stats) / len(stats) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "KSTestDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.") class NBLL(AbstractDistance): @@ -835,9 +766,7 @@ def __init__(self) -> None: def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> float: def _is_count_matrix(matrix, tolerance=1e-6): - if matrix.dtype.kind == "i" or np.all( - np.abs(matrix - np.round(matrix)) < tolerance - ): + if matrix.dtype.kind == "i" or np.all(np.abs(matrix - np.round(matrix)) < tolerance): return True else: return False @@ -846,9 +775,7 @@ def _is_count_matrix(matrix, tolerance=1e-6): raise ValueError("NBLL distance only works for raw counts.") @numba.jit(forceobj=True) - def _compute_nll( - y: np.ndarray, nb_params: tuple[float, float], epsilon: float - ) -> float: + def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float: mu = np.exp(nb_params[0]) theta = 1 / nb_params[1] eps = epsilon @@ -884,16 +811,12 @@ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float: nlls.append(nll) if genes_skipped > X.shape[1] / 2: - raise AttributeError( - f"{genes_skipped} genes could not be fit, which is over half." - ) + raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.") return -np.sum(nlls) / len(nlls) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "NBLL cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.") def _sample(X, frac=None, n=None): @@ -935,9 +858,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return np.mean(test_labels[:, 1]) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "ClassifierProbaDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.") class ClassifierClassProjection(AbstractDistance): @@ -951,9 +872,7 @@ def __init__(self) -> None: self.accepts_precomputed = False def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "ClassifierClassProjection can currently only be called with onesided." - ) + raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.") def onesided_distances( self, @@ -994,9 +913,7 @@ def onesided_distances( return df def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "ClassifierClassProjection cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.") class MeanVarDistnDistance(AbstractDistance): @@ -1030,11 +947,6 @@ def mean_var(x, log: bool = False): def prep_kde_data(x, y): return np.concatenate([x.reshape(-1, 1), y.reshape(-1, 1)], axis=1) - mean_x, var_x = mean_var(X, log=True) - x = prep_kde_data(mean_x, var_x) - mean_y, var_y = mean_var(Y, log=True) - y = prep_kde_data(mean_y, var_y) - def grid_points(d, n_points=100): # Make grid, add 1 bin on lower/upper end to get final n_points d_min = d.min() @@ -1045,39 +957,39 @@ def grid_points(d, n_points=100): d_max = d_max + d_bin return np.arange(start=d_min + 0.5 * d_bin, stop=d_max, step=d_bin) - # Gridpoints to eval KDE on - mean_grid = grid_points(np.concatenate([mean_x, mean_y])) - var_grid = grid_points(np.concatenate([var_x, var_y])) - grid = np.array(np.meshgrid(mean_grid, var_grid)).T.reshape(-1, 2) + def parrallel_score_samples(kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count())): + with multiprocessing.Pool(thread_count) as p: + return np.concatenate(p.map(kde.score_samples, np.array_split(samples, thread_count))) - # KDE def kde_eval(d, grid): # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out # can not be compared well on regions further away from the data as they are -inf - return ( - KernelDensity(bandwidth="silverman", kernel="exponential") - .fit(d) - .score_samples(grid) - ) + kde = KernelDensity(bandwidth="silverman", kernel="exponential").fit(d) + return parrallel_score_samples(kde, grid) + + mean_x, var_x = mean_var(X, log=True) + mean_y, var_y = mean_var(Y, log=True) + + x = prep_kde_data(mean_x, var_x) + y = prep_kde_data(mean_y, var_y) + + # Gridpoints to eval KDE on + mean_grid = grid_points(np.concatenate([mean_x, mean_y])) + var_grid = grid_points(np.concatenate([var_x, var_y])) + grid = np.array(np.meshgrid(mean_grid, var_grid)).T.reshape(-1, 2) kde_x = kde_eval(x, grid) kde_y = kde_eval(y, grid) - - # KDE difference kde_diff = ((kde_x - kde_y) ** 2).mean() return kde_diff def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "MeanVarDistnDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("MeanVarDistnDistance cannot be called on a pairwise distance matrix.") class MahalanobisDistance(AbstractDistance): - """ - Mahalanobis distance between pseudobulk vectors. - """ + """Mahalanobis distance between pseudobulk vectors.""" def __init__(self, aggregation_func: Callable = np.mean) -> None: super().__init__() @@ -1092,6 +1004,4 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "Mahalanobis cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.") diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py index ccf9b4b9..a4583ab1 100644 --- a/tests/tools/_distances/test_distances.py +++ b/tests/tools/_distances/test_distances.py @@ -28,18 +28,14 @@ pseudo_counts_distances = ["nb_ll"] lognorm_counts_distances = ["mean_var_distn"] all_distances = ( - actual_distances - + semi_distances - + non_distances - + pseudo_counts_distances - + lognorm_counts_distances + actual_distances + semi_distances + non_distances + pseudo_counts_distances + lognorm_counts_distances ) # + onesided_only @pytest.fixture def all_pairwise_distances(): all_calulated_distances = {} - no_subsample_distances = [ + low_subsample_distances = [ "sym_kldiv", "t_test", "ks_test", @@ -48,19 +44,24 @@ def all_pairwise_distances(): "mahalanobis", "mean_var_distn", ] + no_subsample_distances = ["mahalanobis"] # mahalanobis only works on the full data without subsampling for distance in all_distances: adata = pt.dt.distance_example() if distance not in no_subsample_distances: - adata = sc.pp.subsample(adata, 0.001, copy=True) - elif distance != "mahalanobis": - adata = sc.pp.subsample(adata, 0.1, copy=True) + if distance in low_subsample_distances: + adata = sc.pp.subsample(adata, 0.1, copy=True) + else: + adata = sc.pp.subsample(adata, 0.001, copy=True) adata.layers["lognorm"] = adata.X.copy() adata.layers["counts"] = np.round(adata.X.toarray()).astype(int) if "X_pca" not in adata.obsm.keys(): sc.pp.pca(adata, n_comps=5) if distance in lognorm_counts_distances: + groups = np.unique(adata.obs["perturbation"]) + # KDE is slow, subset to 5 groups for speed up + adata = adata[adata.obs["perturbation"].isin(groups[0:5])].copy() Distance = pt.tl.Distance(distance, layer_key="lognorm") elif distance in pseudo_counts_distances: Distance = pt.tl.Distance(distance, layer_key="counts") @@ -97,10 +98,7 @@ def test_triangle_inequality(all_pairwise_distances): for _i in range(10): rng = np.random.default_rng() triplet = rng.choice(df.index, size=3, replace=False) - assert ( - df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] - >= df.loc[triplet[0], triplet[2]] - ) + assert df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] >= df.loc[triplet[0], triplet[2]] def test_distance_layers(all_pairwise_distances): @@ -130,9 +128,7 @@ def test_distance_output_type(all_pairwise_distances): # Test if distances are outputting floats for distance in all_distances: df = all_pairwise_distances[distance] - assert df.apply( - lambda col: pd.api.types.is_float_dtype(col) - ).all(), "Not all values are floats." + assert df.apply(lambda col: pd.api.types.is_float_dtype(col)).all(), "Not all values are floats." def test_distance_pairwise(all_pairwise_distances): @@ -153,9 +149,7 @@ def test_distance_onesided(): for distance in onesided_only: Distance = pt.tl.Distance(distance, obsm_key="X_pca") - df = Distance.onesided_distances( - adata, groupby="perturbation", selected_group=selected_group - ) + df = Distance.onesided_distances(adata, groupby="perturbation", selected_group=selected_group) assert isinstance(df, Series) assert df.loc[selected_group] == 0 # distance to self is 0 From 9fd4c2b72b2dd1d0256848c4f5a4668adbf0c657 Mon Sep 17 00:00:00 2001 From: wxicu Date: Mon, 3 Jun 2024 14:52:27 +0200 Subject: [PATCH 06/34] add type --- pertpy/tools/_distances/_distances.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 2208f51b..61cdd43a 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -24,10 +24,9 @@ from statsmodels.discrete.discrete_model import NegativeBinomialP -def compute_medoid(arr, axis=None): +def compute_medoid(arr: np.ndarray, axis=None): if len(arr) == 0: return None - if axis is not None: return np.apply_along_axis(lambda x: compute_medoid(x), axis, arr) From fc71eae3d3c28183ca36e02ec5e5d0dd4032d27a Mon Sep 17 00:00:00 2001 From: wxicu Date: Mon, 3 Jun 2024 16:24:27 +0200 Subject: [PATCH 07/34] add description --- pertpy/tools/_distances/_distances.py | 45 +++++++++++++++------------ 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 61cdd43a..12d1d758 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -2,7 +2,7 @@ import multiprocessing from abc import ABC, abstractmethod -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal import numba import numpy as np @@ -106,9 +106,11 @@ class Distance: Average of the classification probability of the perturbation for a binary classifier. - "classifier_cp": classifier class projection Average of the class - - "mean_var_distn": Distance between mean-var distibutions of gene expression. - - "mahalanobis": Mahalanobis distance between the cells of two groups. - + - "mean_var_distn": Distance between mean-variance distibutions between cells of 2 groups. + Mean square distance between the mean-variance distributions of cells from 2 groups using Kernel Density Estimation (KDE). + - "mahalanobis": Mahalanobis disatnce between the means of cells from two groups. + It is originally used to measure distance between a point and a distribution. + in this context, it quantifies the difference between the mean profiles of a target group and a reference group. Attributes: metric: Name of distance metric. @@ -128,7 +130,7 @@ class Distance: def __init__( self, metric: str = "edistance", - agg_fct: str = "mean", + agg_fct: Literal["mean", "median", "medoid", "variance"] = "mean", layer_key: str = None, obsm_key: str = None, cell_wise_metric: str = "euclidean", @@ -927,11 +929,13 @@ def __init__(self) -> None: def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: """ Difference of mean-var distibutions in 2 matrices - :param X,Y: Matrices of cells*genes, normalized and log transformed + Args: + X: Normalized and log transcformed cells*genes count matrix. + Y: Normalized and log transcformed cells*genes count matrix. """ # Get log mean & var for comparison - def mean_var(x, log: bool = False): + def _mean_var(x, log: bool = False): mean = np.mean(x, axis=0) var = np.var(x, axis=0) positive = mean > 0 @@ -942,10 +946,10 @@ def mean_var(x, log: bool = False): var = np.log(var) return mean, var - def prep_kde_data(x, y): + def _prep_kde_data(x, y): return np.concatenate([x.reshape(-1, 1), y.reshape(-1, 1)], axis=1) - def grid_points(d, n_points=100): + def _grid_points(d, n_points=100): # Make grid, add 1 bin on lower/upper end to get final n_points d_min = d.min() d_max = d.max() @@ -955,29 +959,30 @@ def grid_points(d, n_points=100): d_max = d_max + d_bin return np.arange(start=d_min + 0.5 * d_bin, stop=d_max, step=d_bin) - def parrallel_score_samples(kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count())): + def _parrallel_score_samples(kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count())): with multiprocessing.Pool(thread_count) as p: return np.concatenate(p.map(kde.score_samples, np.array_split(samples, thread_count))) - def kde_eval(d, grid): + def _kde_eval(d, grid): # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out # can not be compared well on regions further away from the data as they are -inf kde = KernelDensity(bandwidth="silverman", kernel="exponential").fit(d) - return parrallel_score_samples(kde, grid) + return _parrallel_score_samples(kde, grid) - mean_x, var_x = mean_var(X, log=True) - mean_y, var_y = mean_var(Y, log=True) + mean_x, var_x = _mean_var(X, log=True) + mean_y, var_y = _mean_var(Y, log=True) - x = prep_kde_data(mean_x, var_x) - y = prep_kde_data(mean_y, var_y) + x = _prep_kde_data(mean_x, var_x) + y = _prep_kde_data(mean_y, var_y) # Gridpoints to eval KDE on - mean_grid = grid_points(np.concatenate([mean_x, mean_y])) - var_grid = grid_points(np.concatenate([var_x, var_y])) + mean_grid = _grid_points(np.concatenate([mean_x, mean_y])) + var_grid = _grid_points(np.concatenate([var_x, var_y])) grid = np.array(np.meshgrid(mean_grid, var_grid)).T.reshape(-1, 2) - kde_x = kde_eval(x, grid) - kde_y = kde_eval(y, grid) + kde_x = _kde_eval(x, grid) + kde_y = _kde_eval(y, grid) + kde_diff = ((kde_x - kde_y) ** 2).mean() return kde_diff From 09e5fea91e8ffabdc836af38fea49db5279f5b3c Mon Sep 17 00:00:00 2001 From: Xichen Wu <102925032+wxicu@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:32:03 +0200 Subject: [PATCH 08/34] Update pertpy/tools/_distances/_distances.py Co-authored-by: Lukas Heumos --- pertpy/tools/_distances/_distances.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 12d1d758..9686cce3 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -24,7 +24,7 @@ from statsmodels.discrete.discrete_model import NegativeBinomialP -def compute_medoid(arr: np.ndarray, axis=None): +def _compute_medoid(arr: np.ndarray, axis: str | None=None): if len(arr) == 0: return None if axis is not None: From ad23ca66696ec7234c8734705203394086fc76d6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 5 Jun 2024 07:32:14 +0000 Subject: [PATCH 09/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pertpy/tools/_distances/_distances.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 9686cce3..03da1c2b 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -24,7 +24,7 @@ from statsmodels.discrete.discrete_model import NegativeBinomialP -def _compute_medoid(arr: np.ndarray, axis: str | None=None): +def _compute_medoid(arr: np.ndarray, axis: str | None = None): if len(arr) == 0: return None if axis is not None: From dc74884468463b6d52e73bd532475fc283c54ae7 Mon Sep 17 00:00:00 2001 From: Xichen Wu <102925032+wxicu@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:32:29 +0200 Subject: [PATCH 10/34] Update pertpy/tools/_distances/_distances.py Co-authored-by: Lukas Heumos --- pertpy/tools/_distances/_distances.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 03da1c2b..19d848dd 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -108,7 +108,7 @@ class Distance: Average of the class - "mean_var_distn": Distance between mean-variance distibutions between cells of 2 groups. Mean square distance between the mean-variance distributions of cells from 2 groups using Kernel Density Estimation (KDE). - - "mahalanobis": Mahalanobis disatnce between the means of cells from two groups. + - "mahalanobis": Mahalanobis distance between the means of cells from two groups. It is originally used to measure distance between a point and a distribution. in this context, it quantifies the difference between the mean profiles of a target group and a reference group. From c774cd28052d305a0d2dcd6c12948dffbe55fedb Mon Sep 17 00:00:00 2001 From: Xichen Wu <102925032+wxicu@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:32:38 +0200 Subject: [PATCH 11/34] Update pertpy/tools/_distances/_distances.py Co-authored-by: Lukas Heumos --- pertpy/tools/_distances/_distances.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 19d848dd..915b1584 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -934,7 +934,6 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: Y: Normalized and log transcformed cells*genes count matrix. """ - # Get log mean & var for comparison def _mean_var(x, log: bool = False): mean = np.mean(x, axis=0) var = np.var(x, axis=0) From d413d672dbca6d7f34059fb370f1bf19c44d2ef9 Mon Sep 17 00:00:00 2001 From: Xichen Wu <102925032+wxicu@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:34:03 +0200 Subject: [PATCH 12/34] Update pertpy/tools/_distances/_distances.py Co-authored-by: Eljas Roellin <65244425+eroell@users.noreply.github.com> --- pertpy/tools/_distances/_distances.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 915b1584..161f8152 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -930,7 +930,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: """ Difference of mean-var distibutions in 2 matrices Args: - X: Normalized and log transcformed cells*genes count matrix. + X: Normalized and log transformed cells x genes count matrix. Y: Normalized and log transcformed cells*genes count matrix. """ From b7f2cf77f402702aa7183bb4f200a1983ece1680 Mon Sep 17 00:00:00 2001 From: Xichen Wu <102925032+wxicu@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:34:12 +0200 Subject: [PATCH 13/34] Update pertpy/tools/_distances/_distances.py Co-authored-by: Eljas Roellin <65244425+eroell@users.noreply.github.com> --- pertpy/tools/_distances/_distances.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 161f8152..0af0ac47 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -966,7 +966,7 @@ def _kde_eval(d, grid): # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out # can not be compared well on regions further away from the data as they are -inf kde = KernelDensity(bandwidth="silverman", kernel="exponential").fit(d) - return _parrallel_score_samples(kde, grid) + return _parallel_score_samples(kde, grid) mean_x, var_x = _mean_var(X, log=True) mean_y, var_y = _mean_var(Y, log=True) From e71f81cd31b21ceef2720e2f45b31037a34acbe0 Mon Sep 17 00:00:00 2001 From: Xichen Wu <102925032+wxicu@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:34:19 +0200 Subject: [PATCH 14/34] Update pertpy/tools/_distances/_distances.py Co-authored-by: Eljas Roellin <65244425+eroell@users.noreply.github.com> --- pertpy/tools/_distances/_distances.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 0af0ac47..8764f906 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -139,7 +139,7 @@ def __init__( Args: metric: Distance metric to use. - agg_fct: Name of the aggregation function to generate pseodobulk vectors. + agg_fct: Name of the aggregation function to generate pseudobulk vectors. layer_key: Name of the counts layer containing raw counts to calculate distances for. Mutually exclusive with 'obsm_key'. Is not used if `None`. From edaa6e650192e6eb51be0fd9776ab96e93053dd0 Mon Sep 17 00:00:00 2001 From: Xichen Wu <102925032+wxicu@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:34:42 +0200 Subject: [PATCH 15/34] Update pertpy/tools/_distances/_distances.py Co-authored-by: Eljas Roellin <65244425+eroell@users.noreply.github.com> --- pertpy/tools/_distances/_distances.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 8764f906..6ec03d53 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -931,7 +931,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: Difference of mean-var distibutions in 2 matrices Args: X: Normalized and log transformed cells x genes count matrix. - Y: Normalized and log transcformed cells*genes count matrix. + Y: Normalized and log transformed cells x genes count matrix. """ def _mean_var(x, log: bool = False): From 317cfd595eee7690f141da8ee02d3d75f2c2a14a Mon Sep 17 00:00:00 2001 From: wxicu Date: Thu, 6 Jun 2024 20:47:44 +0200 Subject: [PATCH 16/34] update code --- pertpy/tools/_distances/_distances.py | 203 ++++++++++++------ tests/tools/_distances/test_distance_tests.py | 6 +- tests/tools/_distances/test_distances.py | 27 ++- 3 files changed, 163 insertions(+), 73 deletions(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 6ec03d53..d3daf5cf 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -13,7 +13,7 @@ from ott.solvers.linear.sinkhorn import Sinkhorn from pandas import Series from rich.progress import track -from scipy.sparse import csr_matrix, issparse +from scipy.sparse import issparse from scipy.spatial.distance import cosine, mahalanobis from scipy.special import gammaln from scipy.stats import kendalltau, kstest, pearsonr, spearmanr @@ -23,27 +23,7 @@ from sklearn.neighbors import KernelDensity from statsmodels.discrete.discrete_model import NegativeBinomialP - -def _compute_medoid(arr: np.ndarray, axis: str | None = None): - if len(arr) == 0: - return None - if axis is not None: - return np.apply_along_axis(lambda x: compute_medoid(x), axis, arr) - - distances = np.abs(arr[:, np.newaxis] - arr) - total_distances = np.sum(distances, axis=1) - medoid_index = np.argmin(total_distances) - medoid = arr[medoid_index] - - return medoid - - -AGG_FCTS = { - "mean": np.mean, - "median": np.median, - "medoid": compute_medoid, - "variance": np.var, -} +AGG_FCTS = {"mean": np.mean, "median": np.median, "sum": np.sum} if TYPE_CHECKING: from collections.abc import Callable @@ -106,7 +86,7 @@ class Distance: Average of the classification probability of the perturbation for a binary classifier. - "classifier_cp": classifier class projection Average of the class - - "mean_var_distn": Distance between mean-variance distibutions between cells of 2 groups. + - "mean_var_distribution": Distance between mean-variance distibutions between cells of 2 groups. Mean square distance between the mean-variance distributions of cells from 2 groups using Kernel Density Estimation (KDE). - "mahalanobis": Mahalanobis distance between the means of cells from two groups. It is originally used to measure distance between a point and a distribution. @@ -130,7 +110,7 @@ class Distance: def __init__( self, metric: str = "edistance", - agg_fct: Literal["mean", "median", "medoid", "variance"] = "mean", + agg_fct: Literal["mean", "median", "sum"] = "mean", layer_key: str = None, obsm_key: str = None, cell_wise_metric: str = "euclidean", @@ -188,8 +168,8 @@ def __init__( metric_fct = ClassifierProbaDistance() elif metric == "classifier_cp": metric_fct = ClassifierClassProjection() - elif metric == "mean_var_distn": - metric_fct = MeanVarDistnDistance() + elif metric == "mean_var_distribution": + metric_fct = MeanVarDistributionDistance() elif metric == "mahalanobis": metric_fct = MahalanobisDistance(self.aggregation_func) else: @@ -282,7 +262,10 @@ def pairwise( # able to handle precomputed distances such as the PseudobulkDistance. if self.metric_fct.accepts_precomputed: # Precompute the pairwise distances if needed - if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys(): + if ( + f"{self.obsm_key}_{self.cell_wise_metric}_predistances" + not in adata.obsp.keys() + ): self.precompute_distances(adata, n_jobs=n_jobs, **kwargs) pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] for index_x, group_x in enumerate(fct(groups)): @@ -295,7 +278,9 @@ def pairwise( # subset the pairwise distance matrix to the two groups sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y] sub_idx = grouping[idx_x | idx_y] == group_x - dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs) + dist = self.metric_fct.from_precomputed( + sub_pwd, sub_idx, **kwargs + ) df.loc[group_x, group_y] = dist df.loc[group_y, group_x] = dist else: @@ -373,7 +358,10 @@ def onesided_distances( # able to handle precomputed distances such as the PsuedobulkDistance. if self.metric_fct.accepts_precomputed: # Precompute the pairwise distances if needed - if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys(): + if ( + f"{self.obsm_key}_{self.cell_wise_metric}_predistances" + not in adata.obsp.keys() + ): self.precompute_distances(adata, n_jobs=n_jobs, **kwargs) pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] for group_x in fct(groups): @@ -427,7 +415,9 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None: cells = adata.layers[self.layer_key] else: cells = adata.obsm[self.obsm_key].copy() - pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs) + pwd = pairwise_distances( + cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs + ) adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd @@ -496,7 +486,9 @@ def __init__(self) -> None: super().__init__() self.accepts_precomputed = False - def __call__(self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs) -> float: + def __call__( + self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs + ) -> float: if kernel == "linear": XX = np.dot(X, X.T) YY = np.dot(Y, Y.T) @@ -556,7 +548,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("EuclideanDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "EuclideanDistance cannot be called on a pairwise distance matrix." + ) class MeanSquaredDistance(AbstractDistance): @@ -568,10 +562,20 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 / X.shape[1] + return ( + np.linalg.norm( + self.aggregation_func(X, axis=0) - self.aggregation_func(Y, axis=0), + ord=2, + **kwargs, + ) + ** 2 + / X.shape[1] + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "MeanSquaredDistance cannot be called on a pairwise distance matrix." + ) class MeanAbsoluteDistance(AbstractDistance): @@ -583,10 +587,19 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) / X.shape[1] + return ( + np.linalg.norm( + self.aggregation_func(X, axis=0) - self.aggregation_func(Y, axis=0), + ord=1, + **kwargs, + ) + / X.shape[1] + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "MeanAbsoluteDistance cannot be called on a pairwise distance matrix." + ) class MeanPairwiseDistance(AbstractDistance): @@ -614,10 +627,17 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return 1 - pearsonr(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))[0] + return ( + 1 + - pearsonr( + self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) + )[0] + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("PearsonDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "PearsonDistance cannot be called on a pairwise distance matrix." + ) class SpearmanDistance(AbstractDistance): @@ -629,10 +649,17 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return 1 - spearmanr(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))[0] + return ( + 1 + - spearmanr( + self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) + )[0] + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("SpearmanDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "SpearmanDistance cannot be called on a pairwise distance matrix." + ) class KendallTauDistance(AbstractDistance): @@ -651,7 +678,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return tau_dist def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("KendallTauDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "KendallTauDistance cannot be called on a pairwise distance matrix." + ) class CosineDistance(AbstractDistance): @@ -663,10 +692,14 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return cosine(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)) + return cosine( + self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("CosineDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "CosineDistance cannot be called on a pairwise distance matrix." + ) class R2ScoreDistance(AbstractDistance): @@ -680,10 +713,14 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return 1 - r2_score(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)) + return 1 - r2_score( + self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "R2ScoreDistance cannot be called on a pairwise distance matrix." + ) class SymmetricKLDivergence(AbstractDistance): @@ -704,13 +741,23 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa for i in range(X.shape[1]): x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon - kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2 - klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2 + kl = ( + np.log(y_std / x_std) + + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) + - 1 / 2 + ) + klr = ( + np.log(x_std / y_std) + + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) + - 1 / 2 + ) kl_all.append(kl + klr) return sum(kl_all) / len(kl_all) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "SymmetricKLDivergence cannot be called on a pairwise distance matrix." + ) class TTestDistance(AbstractDistance): @@ -734,7 +781,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa return sum(t_test_all) / len(t_test_all) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "TTestDistance cannot be called on a pairwise distance matrix." + ) class KSTestDistance(AbstractDistance): @@ -751,7 +800,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return sum(stats) / len(stats) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "KSTestDistance cannot be called on a pairwise distance matrix." + ) class NBLL(AbstractDistance): @@ -766,7 +817,9 @@ def __init__(self) -> None: def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> float: def _is_count_matrix(matrix, tolerance=1e-6): - if matrix.dtype.kind == "i" or np.all(np.abs(matrix - np.round(matrix)) < tolerance): + if matrix.dtype.kind == "i" or np.all( + np.abs(matrix - np.round(matrix)) < tolerance + ): return True else: return False @@ -775,7 +828,9 @@ def _is_count_matrix(matrix, tolerance=1e-6): raise ValueError("NBLL distance only works for raw counts.") @numba.jit(forceobj=True) - def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float: + def _compute_nll( + y: np.ndarray, nb_params: tuple[float, float], epsilon: float + ) -> float: mu = np.exp(nb_params[0]) theta = 1 / nb_params[1] eps = epsilon @@ -811,12 +866,16 @@ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float: nlls.append(nll) if genes_skipped > X.shape[1] / 2: - raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.") + raise AttributeError( + f"{genes_skipped} genes could not be fit, which is over half." + ) return -np.sum(nlls) / len(nlls) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "NBLL cannot be called on a pairwise distance matrix." + ) def _sample(X, frac=None, n=None): @@ -858,7 +917,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return np.mean(test_labels[:, 1]) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "ClassifierProbaDistance cannot be called on a pairwise distance matrix." + ) class ClassifierClassProjection(AbstractDistance): @@ -872,7 +933,9 @@ def __init__(self) -> None: self.accepts_precomputed = False def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.") + raise NotImplementedError( + "ClassifierClassProjection can currently only be called with onesided." + ) def onesided_distances( self, @@ -913,10 +976,12 @@ def onesided_distances( return df def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "ClassifierClassProjection cannot be called on a pairwise distance matrix." + ) -class MeanVarDistnDistance(AbstractDistance): +class MeanVarDistributionDistance(AbstractDistance): """ Distance betweenv mean-var distributions of gene expression. @@ -927,8 +992,8 @@ def __init__(self) -> None: self.accepts_precomputed = False def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - """ - Difference of mean-var distibutions in 2 matrices + """Difference of mean-var distibutions in 2 matrices. + Args: X: Normalized and log transformed cells x genes count matrix. Y: Normalized and log transformed cells x genes count matrix. @@ -958,9 +1023,15 @@ def _grid_points(d, n_points=100): d_max = d_max + d_bin return np.arange(start=d_min + 0.5 * d_bin, stop=d_max, step=d_bin) - def _parrallel_score_samples(kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count())): + def _parallel_score_samples( + kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count()) + ): + # the thread_count is determined using the factor 0.875 as recommended here: + # https://stackoverflow.com/questions/32625094/scipy-parallel-computing-in-ipython-notebook with multiprocessing.Pool(thread_count) as p: - return np.concatenate(p.map(kde.score_samples, np.array_split(samples, thread_count))) + return np.concatenate( + p.map(kde.score_samples, np.array_split(samples, thread_count)) + ) def _kde_eval(d, grid): # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out @@ -987,7 +1058,9 @@ def _kde_eval(d, grid): return kde_diff def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("MeanVarDistnDistance cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "MeanVarDistributionDistance cannot be called on a pairwise distance matrix." + ) class MahalanobisDistance(AbstractDistance): @@ -1006,4 +1079,6 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.") + raise NotImplementedError( + "Mahalanobis cannot be called on a pairwise distance matrix." + ) diff --git a/tests/tools/_distances/test_distance_tests.py b/tests/tools/_distances/test_distance_tests.py index 573e9e0f..369a2da9 100644 --- a/tests/tools/_distances/test_distance_tests.py +++ b/tests/tools/_distances/test_distance_tests.py @@ -23,7 +23,7 @@ # "classifier_cp", # "nbll", "mahalanobis", - "mean_var_distn", + "mean_var_distribution", ] count_distances = ["nb_ll"] @@ -42,7 +42,9 @@ def test_distancetest(adata, distance): if distance == "wasserstein": pytest.mark.apply(pytest.mark.slow) - etest = pt.tl.DistanceTest(distance, n_perms=10, obsm_key="X_pca", alpha=0.05, correction="holm-sidak") + etest = pt.tl.DistanceTest( + distance, n_perms=10, obsm_key="X_pca", alpha=0.05, correction="holm-sidak" + ) tab = etest(adata, groupby="perturbation", contrast="control") # Well-defined output diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py index a4583ab1..e02bf2b6 100644 --- a/tests/tools/_distances/test_distances.py +++ b/tests/tools/_distances/test_distances.py @@ -26,9 +26,13 @@ non_distances = ["classifier_proba"] onesided_only = ["classifier_cp"] pseudo_counts_distances = ["nb_ll"] -lognorm_counts_distances = ["mean_var_distn"] +lognorm_counts_distances = ["mean_var_distribution"] all_distances = ( - actual_distances + semi_distances + non_distances + pseudo_counts_distances + lognorm_counts_distances + actual_distances + + semi_distances + + non_distances + + pseudo_counts_distances + + lognorm_counts_distances ) # + onesided_only @@ -42,9 +46,11 @@ def all_pairwise_distances(): "classifier_proba", "classifier_cp", "mahalanobis", - "mean_var_distn", + "mean_var_distribution", ] - no_subsample_distances = ["mahalanobis"] # mahalanobis only works on the full data without subsampling + no_subsample_distances = [ + "mahalanobis" + ] # mahalanobis only works on the full data without subsampling for distance in all_distances: adata = pt.dt.distance_example() @@ -98,7 +104,10 @@ def test_triangle_inequality(all_pairwise_distances): for _i in range(10): rng = np.random.default_rng() triplet = rng.choice(df.index, size=3, replace=False) - assert df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] >= df.loc[triplet[0], triplet[2]] + assert ( + df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] + >= df.loc[triplet[0], triplet[2]] + ) def test_distance_layers(all_pairwise_distances): @@ -128,7 +137,9 @@ def test_distance_output_type(all_pairwise_distances): # Test if distances are outputting floats for distance in all_distances: df = all_pairwise_distances[distance] - assert df.apply(lambda col: pd.api.types.is_float_dtype(col)).all(), "Not all values are floats." + assert df.apply( + lambda col: pd.api.types.is_float_dtype(col) + ).all(), "Not all values are floats." def test_distance_pairwise(all_pairwise_distances): @@ -149,7 +160,9 @@ def test_distance_onesided(): for distance in onesided_only: Distance = pt.tl.Distance(distance, obsm_key="X_pca") - df = Distance.onesided_distances(adata, groupby="perturbation", selected_group=selected_group) + df = Distance.onesided_distances( + adata, groupby="perturbation", selected_group=selected_group + ) assert isinstance(df, Series) assert df.loc[selected_group] == 0 # distance to self is 0 From 47b41342829d8eb3f8ea4538627f7a86a9e3596d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 6 Jun 2024 18:47:59 +0000 Subject: [PATCH 17/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pertpy/tools/_distances/_distances.py | 144 ++++-------------- tests/tools/_distances/test_distance_tests.py | 4 +- tests/tools/_distances/test_distances.py | 23 +-- 3 files changed, 39 insertions(+), 132 deletions(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index d3daf5cf..71b353c7 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -262,10 +262,7 @@ def pairwise( # able to handle precomputed distances such as the PseudobulkDistance. if self.metric_fct.accepts_precomputed: # Precompute the pairwise distances if needed - if ( - f"{self.obsm_key}_{self.cell_wise_metric}_predistances" - not in adata.obsp.keys() - ): + if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys(): self.precompute_distances(adata, n_jobs=n_jobs, **kwargs) pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] for index_x, group_x in enumerate(fct(groups)): @@ -278,9 +275,7 @@ def pairwise( # subset the pairwise distance matrix to the two groups sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y] sub_idx = grouping[idx_x | idx_y] == group_x - dist = self.metric_fct.from_precomputed( - sub_pwd, sub_idx, **kwargs - ) + dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs) df.loc[group_x, group_y] = dist df.loc[group_y, group_x] = dist else: @@ -358,10 +353,7 @@ def onesided_distances( # able to handle precomputed distances such as the PsuedobulkDistance. if self.metric_fct.accepts_precomputed: # Precompute the pairwise distances if needed - if ( - f"{self.obsm_key}_{self.cell_wise_metric}_predistances" - not in adata.obsp.keys() - ): + if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys(): self.precompute_distances(adata, n_jobs=n_jobs, **kwargs) pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] for group_x in fct(groups): @@ -415,9 +407,7 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None: cells = adata.layers[self.layer_key] else: cells = adata.obsm[self.obsm_key].copy() - pwd = pairwise_distances( - cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs - ) + pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs) adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd @@ -486,9 +476,7 @@ def __init__(self) -> None: super().__init__() self.accepts_precomputed = False - def __call__( - self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs - ) -> float: + def __call__(self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs) -> float: if kernel == "linear": XX = np.dot(X, X.T) YY = np.dot(Y, Y.T) @@ -548,9 +536,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "EuclideanDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("EuclideanDistance cannot be called on a pairwise distance matrix.") class MeanSquaredDistance(AbstractDistance): @@ -573,9 +559,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "MeanSquaredDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.") class MeanAbsoluteDistance(AbstractDistance): @@ -597,9 +581,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "MeanAbsoluteDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.") class MeanPairwiseDistance(AbstractDistance): @@ -627,17 +609,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return ( - 1 - - pearsonr( - self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) - )[0] - ) + return 1 - pearsonr(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))[0] def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "PearsonDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("PearsonDistance cannot be called on a pairwise distance matrix.") class SpearmanDistance(AbstractDistance): @@ -649,17 +624,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return ( - 1 - - spearmanr( - self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) - )[0] - ) + return 1 - spearmanr(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))[0] def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "SpearmanDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("SpearmanDistance cannot be called on a pairwise distance matrix.") class KendallTauDistance(AbstractDistance): @@ -678,9 +646,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return tau_dist def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "KendallTauDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("KendallTauDistance cannot be called on a pairwise distance matrix.") class CosineDistance(AbstractDistance): @@ -692,14 +658,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return cosine( - self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) - ) + return cosine(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "CosineDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("CosineDistance cannot be called on a pairwise distance matrix.") class R2ScoreDistance(AbstractDistance): @@ -713,14 +675,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return 1 - r2_score( - self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0) - ) + return 1 - r2_score(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "R2ScoreDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.") class SymmetricKLDivergence(AbstractDistance): @@ -741,23 +699,13 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa for i in range(X.shape[1]): x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon - kl = ( - np.log(y_std / x_std) - + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - - 1 / 2 - ) - klr = ( - np.log(x_std / y_std) - + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - - 1 / 2 - ) + kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2 + klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2 kl_all.append(kl + klr) return sum(kl_all) / len(kl_all) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "SymmetricKLDivergence cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.") class TTestDistance(AbstractDistance): @@ -781,9 +729,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa return sum(t_test_all) / len(t_test_all) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "TTestDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.") class KSTestDistance(AbstractDistance): @@ -800,9 +746,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return sum(stats) / len(stats) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "KSTestDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.") class NBLL(AbstractDistance): @@ -817,9 +761,7 @@ def __init__(self) -> None: def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> float: def _is_count_matrix(matrix, tolerance=1e-6): - if matrix.dtype.kind == "i" or np.all( - np.abs(matrix - np.round(matrix)) < tolerance - ): + if matrix.dtype.kind == "i" or np.all(np.abs(matrix - np.round(matrix)) < tolerance): return True else: return False @@ -828,9 +770,7 @@ def _is_count_matrix(matrix, tolerance=1e-6): raise ValueError("NBLL distance only works for raw counts.") @numba.jit(forceobj=True) - def _compute_nll( - y: np.ndarray, nb_params: tuple[float, float], epsilon: float - ) -> float: + def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float: mu = np.exp(nb_params[0]) theta = 1 / nb_params[1] eps = epsilon @@ -866,16 +806,12 @@ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float: nlls.append(nll) if genes_skipped > X.shape[1] / 2: - raise AttributeError( - f"{genes_skipped} genes could not be fit, which is over half." - ) + raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.") return -np.sum(nlls) / len(nlls) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "NBLL cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.") def _sample(X, frac=None, n=None): @@ -917,9 +853,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return np.mean(test_labels[:, 1]) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "ClassifierProbaDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.") class ClassifierClassProjection(AbstractDistance): @@ -933,9 +867,7 @@ def __init__(self) -> None: self.accepts_precomputed = False def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "ClassifierClassProjection can currently only be called with onesided." - ) + raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.") def onesided_distances( self, @@ -976,9 +908,7 @@ def onesided_distances( return df def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "ClassifierClassProjection cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.") class MeanVarDistributionDistance(AbstractDistance): @@ -1023,15 +953,11 @@ def _grid_points(d, n_points=100): d_max = d_max + d_bin return np.arange(start=d_min + 0.5 * d_bin, stop=d_max, step=d_bin) - def _parallel_score_samples( - kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count()) - ): + def _parallel_score_samples(kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count())): # the thread_count is determined using the factor 0.875 as recommended here: # https://stackoverflow.com/questions/32625094/scipy-parallel-computing-in-ipython-notebook with multiprocessing.Pool(thread_count) as p: - return np.concatenate( - p.map(kde.score_samples, np.array_split(samples, thread_count)) - ) + return np.concatenate(p.map(kde.score_samples, np.array_split(samples, thread_count))) def _kde_eval(d, grid): # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out @@ -1058,9 +984,7 @@ def _kde_eval(d, grid): return kde_diff def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "MeanVarDistributionDistance cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("MeanVarDistributionDistance cannot be called on a pairwise distance matrix.") class MahalanobisDistance(AbstractDistance): @@ -1079,6 +1003,4 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: - raise NotImplementedError( - "Mahalanobis cannot be called on a pairwise distance matrix." - ) + raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.") diff --git a/tests/tools/_distances/test_distance_tests.py b/tests/tools/_distances/test_distance_tests.py index 369a2da9..8e62154e 100644 --- a/tests/tools/_distances/test_distance_tests.py +++ b/tests/tools/_distances/test_distance_tests.py @@ -42,9 +42,7 @@ def test_distancetest(adata, distance): if distance == "wasserstein": pytest.mark.apply(pytest.mark.slow) - etest = pt.tl.DistanceTest( - distance, n_perms=10, obsm_key="X_pca", alpha=0.05, correction="holm-sidak" - ) + etest = pt.tl.DistanceTest(distance, n_perms=10, obsm_key="X_pca", alpha=0.05, correction="holm-sidak") tab = etest(adata, groupby="perturbation", contrast="control") # Well-defined output diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py index e02bf2b6..0508d9ce 100644 --- a/tests/tools/_distances/test_distances.py +++ b/tests/tools/_distances/test_distances.py @@ -28,11 +28,7 @@ pseudo_counts_distances = ["nb_ll"] lognorm_counts_distances = ["mean_var_distribution"] all_distances = ( - actual_distances - + semi_distances - + non_distances - + pseudo_counts_distances - + lognorm_counts_distances + actual_distances + semi_distances + non_distances + pseudo_counts_distances + lognorm_counts_distances ) # + onesided_only @@ -48,9 +44,7 @@ def all_pairwise_distances(): "mahalanobis", "mean_var_distribution", ] - no_subsample_distances = [ - "mahalanobis" - ] # mahalanobis only works on the full data without subsampling + no_subsample_distances = ["mahalanobis"] # mahalanobis only works on the full data without subsampling for distance in all_distances: adata = pt.dt.distance_example() @@ -104,10 +98,7 @@ def test_triangle_inequality(all_pairwise_distances): for _i in range(10): rng = np.random.default_rng() triplet = rng.choice(df.index, size=3, replace=False) - assert ( - df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] - >= df.loc[triplet[0], triplet[2]] - ) + assert df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] >= df.loc[triplet[0], triplet[2]] def test_distance_layers(all_pairwise_distances): @@ -137,9 +128,7 @@ def test_distance_output_type(all_pairwise_distances): # Test if distances are outputting floats for distance in all_distances: df = all_pairwise_distances[distance] - assert df.apply( - lambda col: pd.api.types.is_float_dtype(col) - ).all(), "Not all values are floats." + assert df.apply(lambda col: pd.api.types.is_float_dtype(col)).all(), "Not all values are floats." def test_distance_pairwise(all_pairwise_distances): @@ -160,9 +149,7 @@ def test_distance_onesided(): for distance in onesided_only: Distance = pt.tl.Distance(distance, obsm_key="X_pca") - df = Distance.onesided_distances( - adata, groupby="perturbation", selected_group=selected_group - ) + df = Distance.onesided_distances(adata, groupby="perturbation", selected_group=selected_group) assert isinstance(df, Series) assert df.loc[selected_group] == 0 # distance to self is 0 From d26141023075869ee008dae8c7b84085857a7df8 Mon Sep 17 00:00:00 2001 From: wxicu Date: Thu, 6 Jun 2024 21:43:44 +0200 Subject: [PATCH 18/34] fix drug --- pertpy/metadata/_drug.py | 45 ++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/pertpy/metadata/_drug.py b/pertpy/metadata/_drug.py index 106ce93b..422a9718 100644 --- a/pertpy/metadata/_drug.py +++ b/pertpy/metadata/_drug.py @@ -45,7 +45,7 @@ def _download_drug_annotation( block_size=4096, is_zip=False, ) - dgidb_df = pd.read_table(dgidb_path) + dgidb_df = pd.read_table(dgidb_path, delimiter=",") return dgidb_df else: @@ -64,7 +64,10 @@ def _download_drug_annotation( pharmgkb_df = pharmgkb_df[pharmgkb_df["Association"] != "not associated"] pharmgkb_df = pharmgkb_df[ (pharmgkb_df["Entity1_type"] == "Gene") - & ((pharmgkb_df["Entity2_type"] == "Chemical") | (pharmgkb_df["Entity2_type"] == "Disease")) + & ( + (pharmgkb_df["Entity2_type"] == "Chemical") + | (pharmgkb_df["Entity2_type"] == "Disease") + ) ] pharmgkb_df.rename( columns={ @@ -74,7 +77,9 @@ def _download_drug_annotation( }, inplace=True, ) - pharmgkb_df.drop(["Entity1_type", "Entity1_id", "Entity2_id"], axis=1, inplace=True) + pharmgkb_df.drop( + ["Entity1_type", "Entity1_id", "Entity2_id"], axis=1, inplace=True + ) return pharmgkb_df @@ -129,7 +134,9 @@ def annotate( .to_dict() ) - adata.var["compounds"] = adata.var_names.map(lambda gene: gene_compound_dict.get(gene, "")) + adata.var["compounds"] = adata.var_names.map( + lambda gene: gene_compound_dict.get(gene, "") + ) else: compounds = interaction[interaction["Type"] == "Chemical"] exploded_df = compounds.explode("Gene") @@ -139,7 +146,9 @@ def annotate( .to_dict() ) - adata.var["compounds"] = adata.var_names.map(lambda gene: gene_compound_dict.get(gene, "")) + adata.var["compounds"] = adata.var_names.map( + lambda gene: gene_compound_dict.get(gene, "") + ) diseases = interaction[interaction["Type"] == "Disease"] exploded_df = diseases.explode("Gene") gene_disease_dict = ( @@ -148,7 +157,9 @@ def annotate( .to_dict() ) - adata.var["diseases"] = adata.var_names.map(lambda gene: gene_disease_dict.get(gene, "")) + adata.var["diseases"] = adata.var_names.map( + lambda gene: gene_disease_dict.get(gene, "") + ) return adata def lookup(self) -> LookUp: @@ -193,7 +204,9 @@ def set(self) -> None: ) self.dictionary = data targets = dict(ChainMap(*[data[cat] for cat in data])) - self.dataframe = pd.DataFrame([{"Compound": k, "Targets": v} for k, v in targets.items()]) + self.dataframe = pd.DataFrame( + [{"Compound": k, "Targets": v} for k, v in targets.items()] + ) self.dataframe.rename( columns={"Targets": "targets", "Compound": "compounds"}, inplace=True, @@ -203,7 +216,11 @@ def set(self) -> None: raise ValueError( "The dgidb data is in a wrong format. Please clear the cache and reinitialize the object." ) - self.dataframe = data.groupby("drug_claim_name")["gene_claim_name"].apply(list).reset_index() + self.dataframe = ( + data.groupby("drug_claim_name")["gene_claim_name"] + .apply(list) + .reset_index() + ) self.dataframe.rename( columns={ "gene_claim_name": "targets", @@ -211,13 +228,17 @@ def set(self) -> None: }, inplace=True, ) - self.dictionary = self.dataframe.set_index("compounds")["targets"].to_dict() + self.dictionary = self.dataframe.set_index("compounds")[ + "targets" + ].to_dict() else: if not isinstance(data, pd.DataFrame): raise ValueError( "The pharmGKB data is in a wrong format. Please clear the cache and reinitialize the object." ) - self.dataframe = data.groupby("Compound|Disease")["Gene"].apply(list).reset_index() + self.dataframe = ( + data.groupby("Compound|Disease")["Gene"].apply(list).reset_index() + ) self.dataframe.rename( columns={ "Gene": "targets", @@ -225,7 +246,9 @@ def set(self) -> None: }, inplace=True, ) - self.dictionary = self.dataframe.set_index("compounds|diseases")["targets"].to_dict() + self.dictionary = self.dataframe.set_index("compounds|diseases")[ + "targets" + ].to_dict() def df(self) -> pd.DataFrame: if not self.loaded: From 4fd29a0e12c1bcde982fa6e6218f14a3e8e98e4c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 6 Jun 2024 19:43:57 +0000 Subject: [PATCH 19/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pertpy/metadata/_drug.py | 43 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 33 deletions(-) diff --git a/pertpy/metadata/_drug.py b/pertpy/metadata/_drug.py index 422a9718..0f6066eb 100644 --- a/pertpy/metadata/_drug.py +++ b/pertpy/metadata/_drug.py @@ -64,10 +64,7 @@ def _download_drug_annotation( pharmgkb_df = pharmgkb_df[pharmgkb_df["Association"] != "not associated"] pharmgkb_df = pharmgkb_df[ (pharmgkb_df["Entity1_type"] == "Gene") - & ( - (pharmgkb_df["Entity2_type"] == "Chemical") - | (pharmgkb_df["Entity2_type"] == "Disease") - ) + & ((pharmgkb_df["Entity2_type"] == "Chemical") | (pharmgkb_df["Entity2_type"] == "Disease")) ] pharmgkb_df.rename( columns={ @@ -77,9 +74,7 @@ def _download_drug_annotation( }, inplace=True, ) - pharmgkb_df.drop( - ["Entity1_type", "Entity1_id", "Entity2_id"], axis=1, inplace=True - ) + pharmgkb_df.drop(["Entity1_type", "Entity1_id", "Entity2_id"], axis=1, inplace=True) return pharmgkb_df @@ -134,9 +129,7 @@ def annotate( .to_dict() ) - adata.var["compounds"] = adata.var_names.map( - lambda gene: gene_compound_dict.get(gene, "") - ) + adata.var["compounds"] = adata.var_names.map(lambda gene: gene_compound_dict.get(gene, "")) else: compounds = interaction[interaction["Type"] == "Chemical"] exploded_df = compounds.explode("Gene") @@ -146,9 +139,7 @@ def annotate( .to_dict() ) - adata.var["compounds"] = adata.var_names.map( - lambda gene: gene_compound_dict.get(gene, "") - ) + adata.var["compounds"] = adata.var_names.map(lambda gene: gene_compound_dict.get(gene, "")) diseases = interaction[interaction["Type"] == "Disease"] exploded_df = diseases.explode("Gene") gene_disease_dict = ( @@ -157,9 +148,7 @@ def annotate( .to_dict() ) - adata.var["diseases"] = adata.var_names.map( - lambda gene: gene_disease_dict.get(gene, "") - ) + adata.var["diseases"] = adata.var_names.map(lambda gene: gene_disease_dict.get(gene, "")) return adata def lookup(self) -> LookUp: @@ -204,9 +193,7 @@ def set(self) -> None: ) self.dictionary = data targets = dict(ChainMap(*[data[cat] for cat in data])) - self.dataframe = pd.DataFrame( - [{"Compound": k, "Targets": v} for k, v in targets.items()] - ) + self.dataframe = pd.DataFrame([{"Compound": k, "Targets": v} for k, v in targets.items()]) self.dataframe.rename( columns={"Targets": "targets", "Compound": "compounds"}, inplace=True, @@ -216,11 +203,7 @@ def set(self) -> None: raise ValueError( "The dgidb data is in a wrong format. Please clear the cache and reinitialize the object." ) - self.dataframe = ( - data.groupby("drug_claim_name")["gene_claim_name"] - .apply(list) - .reset_index() - ) + self.dataframe = data.groupby("drug_claim_name")["gene_claim_name"].apply(list).reset_index() self.dataframe.rename( columns={ "gene_claim_name": "targets", @@ -228,17 +211,13 @@ def set(self) -> None: }, inplace=True, ) - self.dictionary = self.dataframe.set_index("compounds")[ - "targets" - ].to_dict() + self.dictionary = self.dataframe.set_index("compounds")["targets"].to_dict() else: if not isinstance(data, pd.DataFrame): raise ValueError( "The pharmGKB data is in a wrong format. Please clear the cache and reinitialize the object." ) - self.dataframe = ( - data.groupby("Compound|Disease")["Gene"].apply(list).reset_index() - ) + self.dataframe = data.groupby("Compound|Disease")["Gene"].apply(list).reset_index() self.dataframe.rename( columns={ "Gene": "targets", @@ -246,9 +225,7 @@ def set(self) -> None: }, inplace=True, ) - self.dictionary = self.dataframe.set_index("compounds|diseases")[ - "targets" - ].to_dict() + self.dictionary = self.dataframe.set_index("compounds|diseases")["targets"].to_dict() def df(self) -> pd.DataFrame: if not self.loaded: From 30baefecd309169d767450a8eb41a7164610bb04 Mon Sep 17 00:00:00 2001 From: wxicu Date: Fri, 7 Jun 2024 17:03:38 +0200 Subject: [PATCH 20/34] add bootstrapping and metrics_3g --- pertpy/tools/_distances/_distances.py | 239 +++++++++++++++++++---- pertpy/tools/_metrics_3g.py | 214 ++++++++++++++++++++ tests/tools/_distances/test_distances.py | 171 ++++++++++------ 3 files changed, 525 insertions(+), 99 deletions(-) create mode 100644 pertpy/tools/_metrics_3g.py diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 71b353c7..815da084 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -2,7 +2,7 @@ import multiprocessing from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Literal, NamedTuple import numba import numpy as np @@ -31,6 +31,11 @@ from anndata import AnnData +class MeanVar(NamedTuple): + mean: float + variance: float + + class Distance: """Distance class, used to compute distances between groups of cells. @@ -86,7 +91,7 @@ class Distance: Average of the classification probability of the perturbation for a binary classifier. - "classifier_cp": classifier class projection Average of the class - - "mean_var_distribution": Distance between mean-variance distibutions between cells of 2 groups. + - "mean_var_distribution": Distance between mean-variance distributions between cells of 2 groups. Mean square distance between the mean-variance distributions of cells from 2 groups using Kernel Density Estimation (KDE). - "mahalanobis": Mahalanobis distance between the means of cells from two groups. It is originally used to measure distance between a point and a distribution. @@ -124,7 +129,7 @@ def __init__( Mutually exclusive with 'obsm_key'. Is not used if `None`. obsm_key: Name of embedding in adata.obsm to use. - Mutually exclusive with 'counts_layer_key'. + Mutually exclusive with 'layer_key'. Defaults to None, but is set to "X_pca" if not explicitly set internally. cell_wise_metric: Metric from scipy.spatial.distance to use for pairwise distances between single cells. """ @@ -178,8 +183,7 @@ def __init__( if layer_key and obsm_key: raise ValueError( - "Cannot use 'counts_layer_key' and 'obsm_key' at the same time.\n" - "Please provide only one of the two keys." + "Cannot use 'layer_key' and 'obsm_key' at the same time.\n" "Please provide only one of the two keys." ) if not layer_key and not obsm_key: obsm_key = "X_pca" @@ -221,15 +225,54 @@ def __call__( return self.metric_fct(X, Y, **kwargs) + def bootstrap( + self, + X: np.ndarray, + Y: np.ndarray, + *, + n_bootstrap: int = 100, + bootstrap_random_state: int = 0, + **kwargs, + ) -> MeanVar: + """Bootstrap computation of mean and variance of the distance between vectors X and Y. + + Args: + X: First vector of shape (n_samples, n_features). + Y: Second vector of shape (n_samples, n_features). + n_bootstrap: Number of bootstrap samples. + bootstrap_random_state: Random state for bootstrapping. + + Returns: + MeanVar: Mean and variance of distance between X and Y. + + Examples: + >>> import pertpy as pt + >>> adata = pt.dt.distance_example() + >>> Distance = pt.tools.Distance(metric="edistance") + >>> X = adata.obsm["X_pca"][adata.obs["perturbation"] == "p-sgCREB1-2"] + >>> Y = adata.obsm["X_pca"][adata.obs["perturbation"] == "control"] + >>> D = Distance.bootstrap(X, Y) + """ + return self._bootstrap_mode( + X, + Y, + n_bootstraps=n_bootstrap, + bootstrap_random_state=bootstrap_random_state, + **kwargs, + ) + def pairwise( self, adata: AnnData, groupby: str, groups: list[str] | None = None, + bootstrap: bool = False, + n_bootstrap: int = 100, + bootstrap_random_state: int = 0, show_progressbar: bool = True, n_jobs: int = -1, **kwargs, - ) -> pd.DataFrame: + ) -> pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]: """Get pairwise distances between groups of cells. Args: @@ -237,12 +280,16 @@ def pairwise( groupby: Column name in adata.obs. groups: List of groups to compute pairwise distances for. If None, uses all groups. + bootstrap: Whether to bootstrap the distance. + n_bootstrap: Number of bootstrap samples. + bootstrap_random_state: Random state for bootstrapping. show_progressbar: Whether to show progress bar. n_jobs: Number of cores to use. Defaults to -1 (all). kwargs: Additional keyword arguments passed to the metric function. Returns: pd.DataFrame: Dataframe with pairwise distances. + tuple[pd.DataFrame, pd.DataFrame]: Two Dataframes, one for the mean and one for the variance of pairwise distances. Examples: >>> import pertpy as pt @@ -253,6 +300,8 @@ def pairwise( groups = adata.obs[groupby].unique() if groups is None else groups grouping = adata.obs[groupby].copy() df = pd.DataFrame(index=groups, columns=groups, dtype=float) + if bootstrap: + df_var = pd.DataFrame(index=groups, columns=groups, dtype=float) fct = track if show_progressbar else lambda iterable: iterable # Some metrics are able to handle precomputed distances. This means that @@ -268,16 +317,29 @@ def pairwise( for index_x, group_x in enumerate(fct(groups)): idx_x = grouping == group_x for group_y in groups[index_x:]: # type: ignore - if group_x == group_y: - dist = 0.0 # by distance axiom + # subset the pairwise distance matrix to the two groups + idx_y = grouping == group_y + sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y] + sub_idx = grouping[idx_x | idx_y] == group_x + if not bootstrap: + if group_x == group_y: + dist = 0.0 + else: + dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs) + df.loc[group_x, group_y] = dist + df.loc[group_y, group_x] = dist + else: - idx_y = grouping == group_y - # subset the pairwise distance matrix to the two groups - sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y] - sub_idx = grouping[idx_x | idx_y] == group_x - dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs) - df.loc[group_x, group_y] = dist - df.loc[group_y, group_x] = dist + bootstrap_output = self._bootstrap_mode_precomputed( + sub_pwd, + sub_idx, + n_bootstraps=n_bootstrap, + bootstrap_random_state=bootstrap_random_state, + **kwargs, + ) + # In the bootstrap case, distance of group to itself is a mean and can be non-zero + df.loc[group_x, group_y] = df.loc[group_y, group_x] = bootstrap_output.mean + df_var.loc[group_x, group_y] = df_var.loc[group_y, group_x] = bootstrap_output.variance else: if self.layer_key: embedding = adata.layers[self.layer_key] @@ -286,18 +348,39 @@ def pairwise( for index_x, group_x in enumerate(fct(groups)): cells_x = embedding[grouping == group_x].copy() for group_y in groups[index_x:]: # type: ignore - if group_x == group_y: - dist = 0.0 + cells_y = embedding[grouping == group_y].copy() + if not bootstrap: + # By distance axiom, the distance between a group and itself is 0 + dist = 0.0 if group_x == group_y else self(cells_x, cells_y, **kwargs) + + df.loc[group_x, group_y] = dist + df.loc[group_y, group_x] = dist else: - cells_y = embedding[grouping == group_y].copy() - dist = self(cells_x, cells_y, **kwargs) - df.loc[group_x, group_y] = dist - df.loc[group_y, group_x] = dist + bootstrap_output = self.bootstrap( + cells_x, + cells_y, + n_bootstrap=n_bootstrap, + bootstrap_random_state=bootstrap_random_state, + **kwargs, + ) + # In the bootstrap case, distance of group to itself is a mean and can be non-zero + df.loc[group_x, group_y] = df.loc[group_y, group_x] = bootstrap_output.mean + df_var.loc[group_x, group_y] = df_var.loc[group_y, group_x] = bootstrap_output.variance + df.index.name = groupby df.columns.name = groupby df.name = f"pairwise {self.metric}" - return df + if not bootstrap: + return df + else: + df = df.fillna(0) + df_var.index.name = groupby + df_var.columns.name = groupby + df_var = df_var.fillna(0) + df_var.name = f"pairwise {self.metric} variance" + + return df, df_var def onesided_distances( self, @@ -305,10 +388,13 @@ def onesided_distances( groupby: str, selected_group: str | None = None, groups: list[str] | None = None, + bootstrap: bool = False, + n_bootstrap: int = 100, + bootstrap_random_state: int = 0, show_progressbar: bool = True, n_jobs: int = -1, **kwargs, - ) -> pd.DataFrame: + ) -> pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]: """Get distances between one selected cell group and the remaining other cell groups. Args: @@ -317,12 +403,17 @@ def onesided_distances( selected_group: Group to compute pairwise distances to all other. groups: List of groups to compute distances to selected_group for. If None, uses all groups. + bootstrap: Whether to bootstrap the distance. + n_bootstrap: Number of bootstrap samples. + bootstrap_random_state: Random state for bootstrapping. show_progressbar: Whether to show progress bar. n_jobs: Number of cores to use. Defaults to -1 (all). kwargs: Additional keyword arguments passed to the metric function. Returns: pd.DataFrame: Dataframe with distances of groups to selected_group. + tuple[pd.DataFrame, pd.DataFrame]: Two Dataframes, one for the mean and one for the variance of distances of groups to selected_group. + Examples: >>> import pertpy as pt @@ -331,6 +422,8 @@ def onesided_distances( >>> pairwise_df = Distance.onesided_distances(adata, groupby="perturbation", selected_group="control") """ if self.metric == "classifier_cp": + if bootstrap: + raise NotImplementedError("Currently, ClassifierClassProjection does not support bootstrapping.") return self.metric_fct.onesided_distances( # type: ignore adata, groupby, @@ -344,13 +437,15 @@ def onesided_distances( groups = adata.obs[groupby].unique() if groups is None else groups grouping = adata.obs[groupby].copy() df = pd.Series(index=groups, dtype=float) + if bootstrap: + df_var = pd.Series(index=groups, dtype=float) fct = track if show_progressbar else lambda iterable: iterable # Some metrics are able to handle precomputed distances. This means that # the pairwise distances between all cells are computed once and then # passed to the metric function. This is much faster than computing the # pairwise distances for each group separately. Other metrics are not - # able to handle precomputed distances such as the PsuedobulkDistance. + # able to handle precomputed distances such as the PseudobulkDistance. if self.metric_fct.accepts_precomputed: # Precompute the pairwise distances if needed if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys(): @@ -360,14 +455,25 @@ def onesided_distances( idx_x = grouping == group_x group_y = selected_group if group_x == group_y: - dist = 0.0 # by distance axiom + df.loc[group_x] = 0.0 # by distance axiom else: idx_y = grouping == group_y # subset the pairwise distance matrix to the two groups sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y] sub_idx = grouping[idx_x | idx_y] == group_x - dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs) - df.loc[group_x] = dist + if not bootstrap: + dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs) + df.loc[group_x] = dist + else: + bootstrap_output = self._bootstrap_mode_precomputed( + sub_pwd, + sub_idx, + n_bootstraps=n_bootstrap, + bootstrap_random_state=bootstrap_random_state, + **kwargs, + ) + df.loc[group_x] = bootstrap_output.mean + df_var.loc[group_x] = bootstrap_output.variance else: if self.layer_key: embedding = adata.layers[self.layer_key] @@ -376,15 +482,32 @@ def onesided_distances( for group_x in fct(groups): cells_x = embedding[grouping == group_x].copy() group_y = selected_group - if group_x == group_y: - dist = 0.0 + cells_y = embedding[grouping == group_y].copy() + if not bootstrap: + # By distance axiom, the distance between a group and itself is 0 + dist = 0.0 if group_x == group_y else self(cells_x, cells_y, **kwargs) + df.loc[group_x] = dist else: - cells_y = embedding[grouping == group_y].copy() - dist = self(cells_x, cells_y, **kwargs) - df.loc[group_x] = dist + bootstrap_output = self.bootstrap( + cells_x, + cells_y, + n_bootstrap=n_bootstrap, + bootstrap_random_state=bootstrap_random_state, + **kwargs, + ) + # In the bootstrap case, distance of group to itself is a mean and can be non-zero + df.loc[group_x] = bootstrap_output.mean + df_var.loc[group_x] = bootstrap_output.variance df.index.name = groupby df.name = f"{self.metric} to {selected_group}" - return df + if not bootstrap: + return df + else: + df_var.index.name = groupby + df_var = df_var.fillna(0) + df_var.name = f"pairwise {self.metric} variance to {selected_group}" + + return df, df_var def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None: """Precompute pairwise distances between all cells, writes to adata.obsp. @@ -410,6 +533,45 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None: pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs) adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd + def _bootstrap_mode(self, X, Y, n_bootstraps=100, bootstrap_random_state=0, **kwargs) -> MeanVar: + rng = np.random.default_rng(bootstrap_random_state) + + distances = [] + for _ in range(n_bootstraps): + X_bootstrapped = X[rng.choice(a=X.shape[0], size=X.shape[0], replace=True)] + Y_bootstrapped = Y[rng.choice(a=Y.shape[0], size=X.shape[0], replace=True)] + + distance = self(X_bootstrapped, Y_bootstrapped, **kwargs) + distances.append(distance) + + mean = np.mean(distances) + variance = np.var(distances) + return MeanVar(mean=mean, variance=variance) + + def _bootstrap_mode_precomputed( + self, sub_pwd, sub_idx, n_bootstraps=100, bootstrap_random_state=0, **kwargs + ) -> MeanVar: + rng = np.random.default_rng(bootstrap_random_state) + + distances = [] + for _ in range(n_bootstraps): + # in order to maintain the number of cells for both groups (whatever balancing they may have), + # we sample the positive and negative indices separately + bootstrap_pos_idx = rng.choice(a=sub_idx[sub_idx].index, size=sub_idx[sub_idx].size, replace=True) + bootstrap_neg_idx = rng.choice(a=sub_idx[~sub_idx].index, size=sub_idx[~sub_idx].size, replace=True) + bootstrap_idx = np.concatenate([bootstrap_pos_idx, bootstrap_neg_idx]) + bootstrap_idx_nrs = sub_idx.index.get_indexer(bootstrap_idx) + + bootstrap_sub_idx = sub_idx[bootstrap_idx] + bootstrap_sub_pwd = sub_pwd[bootstrap_idx_nrs, :][:, bootstrap_idx_nrs] + + distance = self.metric_fct.from_precomputed(bootstrap_sub_pwd, bootstrap_sub_idx, **kwargs) + distances.append(distance) + + mean = np.mean(distances) + variance = np.var(distances) + return MeanVar(mean=mean, variance=variance) + class AbstractDistance(ABC): """Abstract class of distance metrics between two sets of vectors.""" @@ -913,7 +1075,7 @@ def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: class MeanVarDistributionDistance(AbstractDistance): """ - Distance betweenv mean-var distributions of gene expression. + Distance between mean-var distributions of gene expression. """ @@ -922,7 +1084,7 @@ def __init__(self) -> None: self.accepts_precomputed = False def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - """Difference of mean-var distibutions in 2 matrices. + """Difference of mean-var distributions in 2 matrices. Args: X: Normalized and log transformed cells x genes count matrix. @@ -996,10 +1158,15 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: + cov = np.cov(X.T) + if cov.shape[0] == cov.shape[1] and np.linalg.matrix_rank(cov) == cov.shape[0]: # check invertiblity + inverse_cov = np.linalg.inv(cov) + else: # if not invertible, compute the (Moore-Penrose) pseudo-inverse of a matrix + inverse_cov = np.linalg.pinv(cov) return mahalanobis( self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0), - np.linalg.inv(np.cov(X.T)), + inverse_cov, ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: diff --git a/pertpy/tools/_metrics_3g.py b/pertpy/tools/_metrics_3g.py new file mode 100644 index 00000000..8eb951f5 --- /dev/null +++ b/pertpy/tools/_metrics_3g.py @@ -0,0 +1,214 @@ +from collections.abc import Mapping +from functools import partial +from types import MappingProxyType +from typing import TYPE_CHECKING, Any, Literal, Optional + +import anndata as ad +import numpy as np +import pandas as pd +import pynndescent +import scanpy as sc +from scipy.sparse import issparse +from scipy.sparse import vstack as sp_vstack +from sklearn.base import ClassifierMixin +from sklearn.linear_model import LogisticRegression + +from pertpy.tools._distances._distances import Distance + +if TYPE_CHECKING: + from numpy.typing import NDArray + + +def compare_de( + X: np.ndarray, Y: np.ndarray, C: np.ndarray, shared_top: int = 100, **kwargs +) -> dict: + """Compare DEG across real and simulated perturbations. + + Computes DEG for real and simulated perturbations vs. control and calculates + metrics to evaluate similarity of the results. + + Args: + X: Real perturbed data. + Y: Simulated perturbed data. + C: Control data + shared_top: The number of top DEG to compute the proportion of their intersection. + **kwargs: arguments for `scanpy.tl.rank_genes_groups`. + """ + n_vars = X.shape[1] + assert n_vars == Y.shape[1] == C.shape[1] + + shared_top = min(shared_top, n_vars) + vars_ranks = np.arange(1, n_vars + 1) + + adatas_xy = {} + adatas_xy["x"] = ad.AnnData(X, obs={"label": "comp"}) + adatas_xy["y"] = ad.AnnData(Y, obs={"label": "comp"}) + adata_c = ad.AnnData(C, obs={"label": "ctrl"}) + + results = pd.DataFrame(index=adata_c.var_names) + top_names = [] + for group in ("x", "y"): + adata_joint = ad.concat((adatas_xy[group], adata_c), index_unique="-") + + sc.tl.rank_genes_groups( + adata_joint, groupby="label", reference="ctrl", key_added="de", **kwargs + ) + + srt_idx = np.argsort(adata_joint.uns["de"]["names"]["comp"]) + results[f"scores_{group}"] = adata_joint.uns["de"]["scores"]["comp"][srt_idx] + results[f"pvals_adj_{group}"] = adata_joint.uns["de"]["pvals_adj"]["comp"][ + srt_idx + ] + results[f"ranks_{group}"] = vars_ranks[srt_idx] + + top_names.append(adata_joint.uns["de"]["names"]["comp"][:shared_top]) + + metrics = {} + metrics["shared_top_genes"] = ( + len(set(top_names[0]).intersection(top_names[1])) / shared_top + ) + metrics["scores_corr"] = results["scores_x"].corr( + results["scores_y"], method="pearson" + ) + metrics["pvals_adj_corr"] = results["pvals_adj_x"].corr( + results["pvals_adj_y"], method="pearson" + ) + metrics["scores_ranks_corr"] = results["ranks_x"].corr( + results["ranks_y"], method="spearman" + ) + + return metrics + + +def compare_class( + X: np.ndarray, Y: np.ndarray, C: np.ndarray, clf: Optional[ClassifierMixin] = None +) -> float: + """Compare classification accuracy between real and simulated perturbations. + + Trains a classifier on the real perturbation data + the control data and reports a normalized + classification accuracy on the simulated perturbation. + + Args: + X: Real perturbed data. + Y: Simulated perturbed data. + C: Control data + clf: sklearn classifier to use, `sklearn.linear_model.LogisticRegression` if not provided. + """ + assert X.shape[1] == Y.shape[1] == C.shape[1] + + if clf is None: + clf = LogisticRegression() + + n_x = X.shape[0] + n_xc = n_x + C.shape[0] + + data = sp_vstack((X, C)) if issparse(X) else np.vstack((X, C)) + + labels = np.full(n_xc, "ctrl") + labels[:n_x] = "comp" + clf.fit(data, labels) + + norm_score = clf.score(Y, np.full(Y.shape[0], "comp")) / clf.score(X, labels[:n_x]) + norm_score = min(1.0, norm_score) + + return norm_score + + +def compare_knn( + X: np.ndarray, + Y: np.ndarray, + C: Optional[np.ndarray] = None, + n_neighbors: int = 20, + use_Y_knn: bool = False, + random_state: int = 0, + n_jobs: int = 1, +) -> dict[str, float]: + """Calculate proportions of real perturbed and control data points for simulated data. + + Computes proportions of real perturbed (if provided), control and simulated (if `use_Y_knn=True`) + data points for simulated data. If control (`C`) is not provided, builds the knn graph from + real perturbed + simulated perturbed. + + Args: + X: Real perturbed data. + Y: Simulated perturbed data. + C: Control data. + use_Y_knn: Include simulted perturbed data (`Y`) into the knn graph. Only valid when + control (`C`) is provided. + """ + assert X.shape[1] == Y.shape[1] + if C is not None: + assert X.shape[1] == C.shape[1] + + n_y = Y.shape[0] + + if C is None: + index_data = sp_vstack((Y, X)) if issparse(X) else np.vstack((Y, X)) + else: + datas = (Y, X, C) if use_Y_knn else (X, C) + index_data = sp_vstack(datas) if issparse(X) else np.vstack(datas) + + y_in_index = use_Y_knn or C is None + c_in_index = C is not None + label_groups = ["comp"] + labels: NDArray[np.str_] = np.full(index_data.shape[0], "comp") + if y_in_index: + labels[:n_y] = "siml" + label_groups.append("siml") + if c_in_index: + labels[-C.shape[0] :] = "ctrl" + label_groups.append("ctrl") + + index = pynndescent.NNDescent( + index_data, + n_neighbors=max(50, n_neighbors), + random_state=random_state, + n_jobs=n_jobs, + ) + indices = index.query(Y, k=n_neighbors)[0] + + uq, uq_counts = np.unique(labels[indices], return_counts=True) + uq_counts_norm = uq_counts / uq_counts.sum() + counts = dict(zip(label_groups, [0.0] * len(label_groups))) + for group, count_norm in zip(uq, uq_counts_norm): + counts[group] = count_norm + + return counts + + +def compare_dist( + pert: np.ndarray, + pred: np.ndarray, + ctrl: np.ndarray, + *, + metric: str = "euclidean", + mode: Literal["simple", "scaled"] = "simple", + metric_kwds: Mapping[str, Any] = MappingProxyType({}), + _fit_to_pert_and_ctrl: bool = False, +) -> float: + """Compute the score of simulating a perturbation. + + Args: + pert: Real perturbed data. + pred: Simulated perturbed data. + ctrl: Control data + kind: Kind of metric to use. + """ + metric_fct = partial(Distance(metric).metric_fct, **metric_kwds) + + if mode == "simple": + pass # nothing to be done + elif mode == "scaled": + from sklearn.preprocessing import MinMaxScaler + + scaler = MinMaxScaler().fit( + np.vstack((pert, ctrl)) if _fit_to_pert_and_ctrl else ctrl + ) + pred = scaler.transform(pred) + pert = scaler.transform(pert) + else: + raise ValueError(f"Unknown mode {mod}. Please choose simple or scaled.") + + d1 = metric_fct(pert, pred) + d2 = metric_fct(ctrl, pred) + return d1 / d2 diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py index 0508d9ce..c093e03c 100644 --- a/tests/tools/_distances/test_distances.py +++ b/tests/tools/_distances/test_distances.py @@ -4,6 +4,7 @@ import pytest import scanpy as sc from pandas import DataFrame, Series +from pytest import fixture, mark actual_distances = [ # Euclidean distances and related @@ -28,25 +29,26 @@ pseudo_counts_distances = ["nb_ll"] lognorm_counts_distances = ["mean_var_distribution"] all_distances = ( - actual_distances + semi_distances + non_distances + pseudo_counts_distances + lognorm_counts_distances + actual_distances + semi_distances + non_distances + lognorm_counts_distances + pseudo_counts_distances ) # + onesided_only -@pytest.fixture -def all_pairwise_distances(): - all_calulated_distances = {} - low_subsample_distances = [ - "sym_kldiv", - "t_test", - "ks_test", - "classifier_proba", - "classifier_cp", - "mahalanobis", - "mean_var_distribution", - ] - no_subsample_distances = ["mahalanobis"] # mahalanobis only works on the full data without subsampling - - for distance in all_distances: +class TestDistances: + @fixture + def adata(self, request): + low_subsample_distances = [ + "sym_kldiv", + "t_test", + "ks_test", + "classifier_proba", + "classifier_cp", + "mahalanobis", + "mean_var_distribution", + ] + no_subsample_distances = ["mahalanobis"] # mahalanobis only works on the full data without subsampling + + distance = request.node.callspec.params["distance"] + adata = pt.dt.distance_example() if distance not in no_subsample_distances: if distance in low_subsample_distances: @@ -62,37 +64,48 @@ def all_pairwise_distances(): groups = np.unique(adata.obs["perturbation"]) # KDE is slow, subset to 5 groups for speed up adata = adata[adata.obs["perturbation"].isin(groups[0:5])].copy() + + return adata + + @fixture + def distance_obj(self, request): + distance = request.node.callspec.params["distance"] + if distance in lognorm_counts_distances: Distance = pt.tl.Distance(distance, layer_key="lognorm") elif distance in pseudo_counts_distances: Distance = pt.tl.Distance(distance, layer_key="counts") else: Distance = pt.tl.Distance(distance, obsm_key="X_pca") - df = Distance.pairwise(adata, groupby="perturbation", show_progressbar=True) - all_calulated_distances[distance] = df + return Distance - return all_calulated_distances + @mark.parametrize("distance", actual_distances + semi_distances + non_distances) + def test_distance(self, adata, distance): + Distance = pt.tl.Distance(distance, obsm_key="X_pca") + df = Distance.pairwise(adata, groupby="perturbation", show_progressbar=True) + assert isinstance(df, DataFrame) -def test_distance_axioms(all_pairwise_distances): - for distance in actual_distances + semi_distances: + @mark.parametrize("distance", actual_distances + semi_distances) + def test_distance_axioms(self, adata, distance): # This is equivalent to testing for a semimetric, defined as fulfilling all axioms except triangle inequality. - df = all_pairwise_distances[distance] + Distance = pt.tl.Distance(distance, obsm_key="X_pca") + df = Distance.pairwise(adata, groupby="perturbation", show_progressbar=True) # (M1) Definiteness assert all(np.diag(df.values) == 0) # distance to self is 0 # (M2) Positivity - assert len(df) == np.sum(df.values == 0) # distance to other is not 0 + assert len(df) == np.sum(df.values == 0) # distance to other is not 0 (TODO) assert all(df.values.flatten() >= 0) # distance is non-negative # (M3) Symmetry assert np.sum(df.values - df.values.T) == 0 - -def test_triangle_inequality(all_pairwise_distances): - for distance in actual_distances: + @mark.parametrize("distance", actual_distances) + def test_triangle_inequality(self, adata, distance): # Test if distances are well-defined in accordance with metric axioms - df = all_pairwise_distances[distance] + Distance = pt.tl.Distance(distance, obsm_key="X_pca") + df = Distance.pairwise(adata, groupby="perturbation", show_progressbar=True) # (M4) Triangle inequality (we just probe this for a few random triplets) for _i in range(10): @@ -100,56 +113,88 @@ def test_triangle_inequality(all_pairwise_distances): triplet = rng.choice(df.index, size=3, replace=False) assert df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] >= df.loc[triplet[0], triplet[2]] + @mark.parametrize("distance", all_distances) + def test_distance_layers(self, adata, distance_obj, distance): + df = distance_obj.pairwise(adata, groupby="perturbation") -def test_distance_layers(all_pairwise_distances): - for distance in all_distances: - df = all_pairwise_distances[distance] - - assert isinstance(df, DataFrame) - assert df.columns.equals(df.index) - assert np.sum(df.values - df.values.T) == 0 # symmetry - + assert isinstance(df, DataFrame) + assert df.columns.equals(df.index) + assert np.sum(df.values - df.values.T) == 0 # symmetry -def test_distance_counts(all_pairwise_distances): - for distance in actual_distances + pseudo_counts_distances: - df = all_pairwise_distances[distance] + @mark.parametrize("distance", actual_distances + pseudo_counts_distances) + def test_distance_counts(self, adata, distance): + Distance = pt.tl.Distance(distance, layer_key="counts") + df = Distance.pairwise(adata, groupby="perturbation") assert isinstance(df, DataFrame) assert df.columns.equals(df.index) assert np.sum(df.values - df.values.T) == 0 - -def test_mutually_exclusive_keys(): - for distance in all_distances: + @mark.parametrize("distance", all_distances) + def test_mutually_exclusive_keys(self, distance): with pytest.raises(ValueError): _ = pt.tl.Distance(distance, layer_key="counts", obsm_key="X_pca") + @mark.parametrize("distance", actual_distances + semi_distances + non_distances) + def test_distance_output_type(self, distance): + # Test if distances are outputting floats + Distance = pt.tl.Distance(distance, obsm_key="X_pca") + rng = np.random.default_rng() + X = rng.normal(size=(100, 10)) + Y = rng.normal(size=(100, 10)) + d = Distance(X, Y) + assert isinstance(d, float) -def test_distance_output_type(all_pairwise_distances): - # Test if distances are outputting floats - for distance in all_distances: - df = all_pairwise_distances[distance] - assert df.apply(lambda col: pd.api.types.is_float_dtype(col)).all(), "Not all values are floats." - - -def test_distance_pairwise(all_pairwise_distances): - # Test consistency of pairwise distance results - for distance in all_distances: - df = all_pairwise_distances[distance] + @mark.parametrize("distance", all_distances) + def test_distance_pairwise(self, adata, distance_obj, distance): + # Test consistency of pairwise distance results + df = distance_obj.pairwise(adata, groupby="perturbation") assert isinstance(df, DataFrame) assert df.columns.equals(df.index) assert np.sum(df.values - df.values.T) == 0 # symmetry - -def test_distance_onesided(): - # Test consistency of one-sided distance results - adata = pt.dt.distance_example() - adata = sc.pp.subsample(adata, 0.1, copy=True) - selected_group = adata.obs.perturbation.unique()[0] - - for distance in onesided_only: - Distance = pt.tl.Distance(distance, obsm_key="X_pca") - df = Distance.onesided_distances(adata, groupby="perturbation", selected_group=selected_group) - + @mark.parametrize("distance", all_distances + onesided_only) + def test_distance_onesided(self, adata, distance_obj, distance): + # Test consistency of one-sided distance results + selected_group = adata.obs.perturbation.unique()[0] + df = distance_obj.onesided_distances(adata, groupby="perturbation", selected_group=selected_group) assert isinstance(df, Series) assert df.loc[selected_group] == 0 # distance to self is 0 + + @mark.parametrize("distance", actual_distances + semi_distances + non_distances) + def test_bootstrap_distance_output_type(self, distance): + # Test if distances are outputting floats + Distance = pt.tl.Distance(distance, obsm_key="X_pca") + rng = np.random.default_rng() + X = rng.normal(size=(100, 10)) + Y = rng.normal(size=(100, 10)) + d = Distance.bootstrap(X, Y, n_bootstrap=10) + assert hasattr(d, "mean") + assert hasattr(d, "variance") + + @mark.parametrize("distance", all_distances) + def test_bootstrap_distance_pairwise(self, adata, distance_obj, distance): + # Test consistency of pairwise distance results + bootstrap_output = distance_obj.pairwise(adata, groupby="perturbation", bootstrap=True, n_bootstrap=10) + assert isinstance(bootstrap_output, tuple) + + mean = bootstrap_output[0] + var = bootstrap_output[1] + + assert mean.columns.equals(mean.index) + assert np.sum(mean.values - mean.values.T) == 0 # symmetry + assert np.sum(var.values - var.values.T) == 0 # symmetry + + @mark.parametrize("distance", all_distances) + def test_bootstrap_distance_onesided(self, adata, distance_obj, distance): + # Test consistency of one-sided distance results + selected_group = adata.obs.perturbation.unique()[0] + bootstrap_output = distance_obj.onesided_distances( + adata, + groupby="perturbation", + selected_group=selected_group, + bootstrap=True, + n_bootstrap=10, + ) + + assert isinstance(bootstrap_output, tuple) From 2c8127c66009898c1483e916de4caf4620a98771 Mon Sep 17 00:00:00 2001 From: wxicu Date: Fri, 7 Jun 2024 21:42:48 +0200 Subject: [PATCH 21/34] speed up tests, --- pertpy/tools/_distances/_distances.py | 9 +--- pertpy/tools/_metrics_3g.py | 46 +++++------------ tests/tools/_distances/test_distances.py | 65 +++++++++--------------- tests/tools/test_metrics_3g.py | 55 ++++++++++++++++++++ 4 files changed, 95 insertions(+), 80 deletions(-) create mode 100644 tests/tools/test_metrics_3g.py diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index 815da084..d6a249ac 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -1158,15 +1158,8 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - cov = np.cov(X.T) - if cov.shape[0] == cov.shape[1] and np.linalg.matrix_rank(cov) == cov.shape[0]: # check invertiblity - inverse_cov = np.linalg.inv(cov) - else: # if not invertible, compute the (Moore-Penrose) pseudo-inverse of a matrix - inverse_cov = np.linalg.pinv(cov) return mahalanobis( - self.aggregation_func(X, axis=0), - self.aggregation_func(Y, axis=0), - inverse_cov, + self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0), np.linalg.inv(np.cov(X.T)) ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: diff --git a/pertpy/tools/_metrics_3g.py b/pertpy/tools/_metrics_3g.py index 8eb951f5..03dc2172 100644 --- a/pertpy/tools/_metrics_3g.py +++ b/pertpy/tools/_metrics_3g.py @@ -19,9 +19,7 @@ from numpy.typing import NDArray -def compare_de( - X: np.ndarray, Y: np.ndarray, C: np.ndarray, shared_top: int = 100, **kwargs -) -> dict: +def compare_de(X: np.ndarray, Y: np.ndarray, C: np.ndarray, shared_top: int = 100, **kwargs) -> dict: """Compare DEG across real and simulated perturbations. Computes DEG for real and simulated perturbations vs. control and calculates @@ -50,39 +48,25 @@ def compare_de( for group in ("x", "y"): adata_joint = ad.concat((adatas_xy[group], adata_c), index_unique="-") - sc.tl.rank_genes_groups( - adata_joint, groupby="label", reference="ctrl", key_added="de", **kwargs - ) + sc.tl.rank_genes_groups(adata_joint, groupby="label", reference="ctrl", key_added="de", **kwargs) srt_idx = np.argsort(adata_joint.uns["de"]["names"]["comp"]) results[f"scores_{group}"] = adata_joint.uns["de"]["scores"]["comp"][srt_idx] - results[f"pvals_adj_{group}"] = adata_joint.uns["de"]["pvals_adj"]["comp"][ - srt_idx - ] + results[f"pvals_adj_{group}"] = adata_joint.uns["de"]["pvals_adj"]["comp"][srt_idx] results[f"ranks_{group}"] = vars_ranks[srt_idx] top_names.append(adata_joint.uns["de"]["names"]["comp"][:shared_top]) metrics = {} - metrics["shared_top_genes"] = ( - len(set(top_names[0]).intersection(top_names[1])) / shared_top - ) - metrics["scores_corr"] = results["scores_x"].corr( - results["scores_y"], method="pearson" - ) - metrics["pvals_adj_corr"] = results["pvals_adj_x"].corr( - results["pvals_adj_y"], method="pearson" - ) - metrics["scores_ranks_corr"] = results["ranks_x"].corr( - results["ranks_y"], method="spearman" - ) + metrics["shared_top_genes"] = len(set(top_names[0]).intersection(top_names[1])) / shared_top + metrics["scores_corr"] = results["scores_x"].corr(results["scores_y"], method="pearson") + metrics["pvals_adj_corr"] = results["pvals_adj_x"].corr(results["pvals_adj_y"], method="pearson") + metrics["scores_ranks_corr"] = results["ranks_x"].corr(results["ranks_y"], method="spearman") return metrics -def compare_class( - X: np.ndarray, Y: np.ndarray, C: np.ndarray, clf: Optional[ClassifierMixin] = None -) -> float: +def compare_class(X: np.ndarray, Y: np.ndarray, C: np.ndarray, clf: ClassifierMixin | None = None) -> float: """Compare classification accuracy between real and simulated perturbations. Trains a classifier on the real perturbation data + the control data and reports a normalized @@ -117,7 +101,7 @@ def compare_class( def compare_knn( X: np.ndarray, Y: np.ndarray, - C: Optional[np.ndarray] = None, + C: np.ndarray | None = None, n_neighbors: int = 20, use_Y_knn: bool = False, random_state: int = 0, @@ -169,8 +153,8 @@ def compare_knn( uq, uq_counts = np.unique(labels[indices], return_counts=True) uq_counts_norm = uq_counts / uq_counts.sum() - counts = dict(zip(label_groups, [0.0] * len(label_groups))) - for group, count_norm in zip(uq, uq_counts_norm): + counts = dict(zip(label_groups, [0.0] * len(label_groups), strict=False)) + for group, count_norm in zip(uq, uq_counts_norm, strict=False): counts[group] = count_norm return counts @@ -192,7 +176,7 @@ def compare_dist( pert: Real perturbed data. pred: Simulated perturbed data. ctrl: Control data - kind: Kind of metric to use. + mode: Mode to use. """ metric_fct = partial(Distance(metric).metric_fct, **metric_kwds) @@ -201,13 +185,11 @@ def compare_dist( elif mode == "scaled": from sklearn.preprocessing import MinMaxScaler - scaler = MinMaxScaler().fit( - np.vstack((pert, ctrl)) if _fit_to_pert_and_ctrl else ctrl - ) + scaler = MinMaxScaler().fit(np.vstack((pert, ctrl)) if _fit_to_pert_and_ctrl else ctrl) pred = scaler.transform(pred) pert = scaler.transform(pert) else: - raise ValueError(f"Unknown mode {mod}. Please choose simple or scaled.") + raise ValueError(f"Unknown mode {mode}. Please choose simple or scaled.") d1 = metric_fct(pert, pred) d2 = metric_fct(ctrl, pred) diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py index c093e03c..441a46bd 100644 --- a/tests/tools/_distances/test_distances.py +++ b/tests/tools/_distances/test_distances.py @@ -78,56 +78,50 @@ def distance_obj(self, request): Distance = pt.tl.Distance(distance, obsm_key="X_pca") return Distance - @mark.parametrize("distance", actual_distances + semi_distances + non_distances) - def test_distance(self, adata, distance): - Distance = pt.tl.Distance(distance, obsm_key="X_pca") - df = Distance.pairwise(adata, groupby="perturbation", show_progressbar=True) - - assert isinstance(df, DataFrame) + @fixture + @mark.parametrize("distance", all_distances) + def pairwise_distance(self, adata, distance_obj, distance): + return distance_obj.pairwise(adata, groupby="perturbation", show_progressbar=True) @mark.parametrize("distance", actual_distances + semi_distances) - def test_distance_axioms(self, adata, distance): + def test_distance_axioms(self, pairwise_distance, distance): # This is equivalent to testing for a semimetric, defined as fulfilling all axioms except triangle inequality. - Distance = pt.tl.Distance(distance, obsm_key="X_pca") - df = Distance.pairwise(adata, groupby="perturbation", show_progressbar=True) - # (M1) Definiteness - assert all(np.diag(df.values) == 0) # distance to self is 0 + assert all(np.diag(pairwise_distance.values) == 0) # distance to self is 0 # (M2) Positivity - assert len(df) == np.sum(df.values == 0) # distance to other is not 0 (TODO) - assert all(df.values.flatten() >= 0) # distance is non-negative + assert len(pairwise_distance) == np.sum(pairwise_distance.values == 0) # distance to other is not 0 + assert all(pairwise_distance.values.flatten() >= 0) # distance is non-negative # (M3) Symmetry - assert np.sum(df.values - df.values.T) == 0 + assert np.sum(pairwise_distance.values - pairwise_distance.values.T) == 0 @mark.parametrize("distance", actual_distances) - def test_triangle_inequality(self, adata, distance): + def test_triangle_inequality(self, pairwise_distance, distance): # Test if distances are well-defined in accordance with metric axioms - Distance = pt.tl.Distance(distance, obsm_key="X_pca") - df = Distance.pairwise(adata, groupby="perturbation", show_progressbar=True) - # (M4) Triangle inequality (we just probe this for a few random triplets) for _i in range(10): rng = np.random.default_rng() - triplet = rng.choice(df.index, size=3, replace=False) - assert df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] >= df.loc[triplet[0], triplet[2]] + triplet = rng.choice(pairwise_distance.index, size=3, replace=False) + assert ( + pairwise_distance.loc[triplet[0], triplet[1]] + pairwise_distance.loc[triplet[1], triplet[2]] + >= pairwise_distance.loc[triplet[0], triplet[2]] + ) @mark.parametrize("distance", all_distances) - def test_distance_layers(self, adata, distance_obj, distance): - df = distance_obj.pairwise(adata, groupby="perturbation") - - assert isinstance(df, DataFrame) - assert df.columns.equals(df.index) - assert np.sum(df.values - df.values.T) == 0 # symmetry + def test_distance_layers(self, pairwise_distance, distance): + assert isinstance(pairwise_distance, DataFrame) + assert pairwise_distance.columns.equals(pairwise_distance.index) + assert np.sum(pairwise_distance.values - pairwise_distance.values.T) == 0 # symmetry @mark.parametrize("distance", actual_distances + pseudo_counts_distances) def test_distance_counts(self, adata, distance): - Distance = pt.tl.Distance(distance, layer_key="counts") - df = Distance.pairwise(adata, groupby="perturbation") - assert isinstance(df, DataFrame) - assert df.columns.equals(df.index) - assert np.sum(df.values - df.values.T) == 0 + if distance != "mahalanobis": # doesn't work, covariance matrix is a singular matrix, not invertible + Distance = pt.tl.Distance(distance, layer_key="counts") + df = Distance.pairwise(adata, groupby="perturbation") + assert isinstance(df, DataFrame) + assert df.columns.equals(df.index) + assert np.sum(df.values - df.values.T) == 0 @mark.parametrize("distance", all_distances) def test_mutually_exclusive_keys(self, distance): @@ -144,15 +138,6 @@ def test_distance_output_type(self, distance): d = Distance(X, Y) assert isinstance(d, float) - @mark.parametrize("distance", all_distances) - def test_distance_pairwise(self, adata, distance_obj, distance): - # Test consistency of pairwise distance results - df = distance_obj.pairwise(adata, groupby="perturbation") - - assert isinstance(df, DataFrame) - assert df.columns.equals(df.index) - assert np.sum(df.values - df.values.T) == 0 # symmetry - @mark.parametrize("distance", all_distances + onesided_only) def test_distance_onesided(self, adata, distance_obj, distance): # Test consistency of one-sided distance results diff --git a/tests/tools/test_metrics_3g.py b/tests/tools/test_metrics_3g.py new file mode 100644 index 00000000..3339ac04 --- /dev/null +++ b/tests/tools/test_metrics_3g.py @@ -0,0 +1,55 @@ +import pertpy as pt +import numpy as np +import pytest +from pertpy.tools._metrics_3g import ( + compare_de, + compare_class, + compare_dist, + compare_knn, +) + + +class TestMetrics3G: + @pytest.fixture + def test_data(self): + rng = np.random.default_rng() + X = rng.normal(size=(100, 10)) + Y = rng.normal(size=(100, 10)) + C = rng.normal(size=(100, 10)) + return X, Y, C + + def test_compare_de(self, test_data): + X, Y, C = test_data + result = compare_de(X, Y, C, shared_top=5) + assert isinstance(result, dict) + required_keys = { + "shared_top_genes", + "scores_corr", + "pvals_adj_corr", + "scores_ranks_corr", + } + assert all(key in result for key in required_keys) + + def test_compare_class(self, test_data): + X, Y, C = test_data + result = compare_class(X, Y, C) + assert result <= 1 + + def test_compare_knn(self, test_data): + X, Y, C = test_data + result = compare_knn(X, Y, C) + assert isinstance(result, dict) + assert "comp" in result + assert isinstance(result["comp"], float) + + result_no_ctrl = compare_knn(X, Y) + assert isinstance(result_no_ctrl, dict) + + def test_compare_dist(self, test_data): + X, Y, C = test_data + res_simple = compare_dist(X, Y, C, mode="simple") + assert isinstance(res_simple, float) + res_scaled = compare_dist(X, Y, C, mode="scaled") + assert isinstance(res_scaled, float) + with pytest.raises(ValueError): + compare_dist(X, Y, C, mode="new_mode") From 57b14c384101d643d1c0f44808f2eedaef83d7ba Mon Sep 17 00:00:00 2001 From: wxicu Date: Mon, 10 Jun 2024 11:36:47 +0200 Subject: [PATCH 22/34] remove test classes --- tests/tools/test_metrics_3g.py | 95 ++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/tests/tools/test_metrics_3g.py b/tests/tools/test_metrics_3g.py index 3339ac04..11d6aebf 100644 --- a/tests/tools/test_metrics_3g.py +++ b/tests/tools/test_metrics_3g.py @@ -1,55 +1,58 @@ -import pertpy as pt import numpy as np +import pertpy as pt import pytest from pertpy.tools._metrics_3g import ( - compare_de, compare_class, + compare_de, compare_dist, compare_knn, ) -class TestMetrics3G: - @pytest.fixture - def test_data(self): - rng = np.random.default_rng() - X = rng.normal(size=(100, 10)) - Y = rng.normal(size=(100, 10)) - C = rng.normal(size=(100, 10)) - return X, Y, C - - def test_compare_de(self, test_data): - X, Y, C = test_data - result = compare_de(X, Y, C, shared_top=5) - assert isinstance(result, dict) - required_keys = { - "shared_top_genes", - "scores_corr", - "pvals_adj_corr", - "scores_ranks_corr", - } - assert all(key in result for key in required_keys) - - def test_compare_class(self, test_data): - X, Y, C = test_data - result = compare_class(X, Y, C) - assert result <= 1 - - def test_compare_knn(self, test_data): - X, Y, C = test_data - result = compare_knn(X, Y, C) - assert isinstance(result, dict) - assert "comp" in result - assert isinstance(result["comp"], float) - - result_no_ctrl = compare_knn(X, Y) - assert isinstance(result_no_ctrl, dict) - - def test_compare_dist(self, test_data): - X, Y, C = test_data - res_simple = compare_dist(X, Y, C, mode="simple") - assert isinstance(res_simple, float) - res_scaled = compare_dist(X, Y, C, mode="scaled") - assert isinstance(res_scaled, float) - with pytest.raises(ValueError): - compare_dist(X, Y, C, mode="new_mode") +@pytest.fixture +def test_data(): + rng = np.random.default_rng() + X = rng.normal(size=(100, 10)) + Y = rng.normal(size=(100, 10)) + C = rng.normal(size=(100, 10)) + return X, Y, C + + +def test_compare_de(test_data): + X, Y, C = test_data + result = compare_de(X, Y, C, shared_top=5) + assert isinstance(result, dict) + required_keys = { + "shared_top_genes", + "scores_corr", + "pvals_adj_corr", + "scores_ranks_corr", + } + assert all(key in result for key in required_keys) + + +def test_compare_class(test_data): + X, Y, C = test_data + result = compare_class(X, Y, C) + assert result <= 1 + + +def test_compare_knn(test_data): + X, Y, C = test_data + result = compare_knn(X, Y, C) + assert isinstance(result, dict) + assert "comp" in result + assert isinstance(result["comp"], float) + + result_no_ctrl = compare_knn(X, Y) + assert isinstance(result_no_ctrl, dict) + + +def test_compare_dist(test_data): + X, Y, C = test_data + res_simple = compare_dist(X, Y, C, mode="simple") + assert isinstance(res_simple, float) + res_scaled = compare_dist(X, Y, C, mode="scaled") + assert isinstance(res_scaled, float) + with pytest.raises(ValueError): + compare_dist(X, Y, C, mode="new_mode") From 78d00fafa772da8848b89f3defecf82c7745b8d3 Mon Sep 17 00:00:00 2001 From: wxicu Date: Mon, 10 Jun 2024 12:34:35 +0200 Subject: [PATCH 23/34] drop test classes --- tests/tools/_distances/test_distances.py | 301 ++++++++++++----------- 1 file changed, 156 insertions(+), 145 deletions(-) diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py index 441a46bd..b9ffe3a2 100644 --- a/tests/tools/_distances/test_distances.py +++ b/tests/tools/_distances/test_distances.py @@ -33,153 +33,164 @@ ) # + onesided_only -class TestDistances: - @fixture - def adata(self, request): - low_subsample_distances = [ - "sym_kldiv", - "t_test", - "ks_test", - "classifier_proba", - "classifier_cp", - "mahalanobis", - "mean_var_distribution", - ] - no_subsample_distances = ["mahalanobis"] # mahalanobis only works on the full data without subsampling - - distance = request.node.callspec.params["distance"] - - adata = pt.dt.distance_example() - if distance not in no_subsample_distances: - if distance in low_subsample_distances: - adata = sc.pp.subsample(adata, 0.1, copy=True) - else: - adata = sc.pp.subsample(adata, 0.001, copy=True) - - adata.layers["lognorm"] = adata.X.copy() - adata.layers["counts"] = np.round(adata.X.toarray()).astype(int) - if "X_pca" not in adata.obsm.keys(): - sc.pp.pca(adata, n_comps=5) - if distance in lognorm_counts_distances: - groups = np.unique(adata.obs["perturbation"]) - # KDE is slow, subset to 5 groups for speed up - adata = adata[adata.obs["perturbation"].isin(groups[0:5])].copy() - - return adata - - @fixture - def distance_obj(self, request): - distance = request.node.callspec.params["distance"] - if distance in lognorm_counts_distances: - Distance = pt.tl.Distance(distance, layer_key="lognorm") - elif distance in pseudo_counts_distances: - Distance = pt.tl.Distance(distance, layer_key="counts") +@fixture +def adata(request): + low_subsample_distances = [ + "sym_kldiv", + "t_test", + "ks_test", + "classifier_proba", + "classifier_cp", + "mahalanobis", + "mean_var_distribution", + ] + no_subsample_distances = ["mahalanobis"] # mahalanobis only works on the full data without subsampling + + distance = request.node.callspec.params["distance"] + + adata = pt.dt.distance_example() + if distance not in no_subsample_distances: + if distance in low_subsample_distances: + adata = sc.pp.subsample(adata, 0.1, copy=True) else: - Distance = pt.tl.Distance(distance, obsm_key="X_pca") - return Distance - - @fixture - @mark.parametrize("distance", all_distances) - def pairwise_distance(self, adata, distance_obj, distance): - return distance_obj.pairwise(adata, groupby="perturbation", show_progressbar=True) - - @mark.parametrize("distance", actual_distances + semi_distances) - def test_distance_axioms(self, pairwise_distance, distance): - # This is equivalent to testing for a semimetric, defined as fulfilling all axioms except triangle inequality. - # (M1) Definiteness - assert all(np.diag(pairwise_distance.values) == 0) # distance to self is 0 - - # (M2) Positivity - assert len(pairwise_distance) == np.sum(pairwise_distance.values == 0) # distance to other is not 0 - assert all(pairwise_distance.values.flatten() >= 0) # distance is non-negative - - # (M3) Symmetry - assert np.sum(pairwise_distance.values - pairwise_distance.values.T) == 0 - - @mark.parametrize("distance", actual_distances) - def test_triangle_inequality(self, pairwise_distance, distance): - # Test if distances are well-defined in accordance with metric axioms - # (M4) Triangle inequality (we just probe this for a few random triplets) - for _i in range(10): - rng = np.random.default_rng() - triplet = rng.choice(pairwise_distance.index, size=3, replace=False) - assert ( - pairwise_distance.loc[triplet[0], triplet[1]] + pairwise_distance.loc[triplet[1], triplet[2]] - >= pairwise_distance.loc[triplet[0], triplet[2]] - ) - - @mark.parametrize("distance", all_distances) - def test_distance_layers(self, pairwise_distance, distance): - assert isinstance(pairwise_distance, DataFrame) - assert pairwise_distance.columns.equals(pairwise_distance.index) - assert np.sum(pairwise_distance.values - pairwise_distance.values.T) == 0 # symmetry - - @mark.parametrize("distance", actual_distances + pseudo_counts_distances) - def test_distance_counts(self, adata, distance): - if distance != "mahalanobis": # doesn't work, covariance matrix is a singular matrix, not invertible - Distance = pt.tl.Distance(distance, layer_key="counts") - df = Distance.pairwise(adata, groupby="perturbation") - assert isinstance(df, DataFrame) - assert df.columns.equals(df.index) - assert np.sum(df.values - df.values.T) == 0 - - @mark.parametrize("distance", all_distances) - def test_mutually_exclusive_keys(self, distance): - with pytest.raises(ValueError): - _ = pt.tl.Distance(distance, layer_key="counts", obsm_key="X_pca") - - @mark.parametrize("distance", actual_distances + semi_distances + non_distances) - def test_distance_output_type(self, distance): - # Test if distances are outputting floats - Distance = pt.tl.Distance(distance, obsm_key="X_pca") - rng = np.random.default_rng() - X = rng.normal(size=(100, 10)) - Y = rng.normal(size=(100, 10)) - d = Distance(X, Y) - assert isinstance(d, float) - - @mark.parametrize("distance", all_distances + onesided_only) - def test_distance_onesided(self, adata, distance_obj, distance): - # Test consistency of one-sided distance results - selected_group = adata.obs.perturbation.unique()[0] - df = distance_obj.onesided_distances(adata, groupby="perturbation", selected_group=selected_group) - assert isinstance(df, Series) - assert df.loc[selected_group] == 0 # distance to self is 0 - - @mark.parametrize("distance", actual_distances + semi_distances + non_distances) - def test_bootstrap_distance_output_type(self, distance): - # Test if distances are outputting floats + adata = sc.pp.subsample(adata, 0.001, copy=True) + + adata.layers["lognorm"] = adata.X.copy() + adata.layers["counts"] = np.round(adata.X.toarray()).astype(int) + if "X_pca" not in adata.obsm.keys(): + sc.pp.pca(adata, n_comps=5) + if distance in lognorm_counts_distances: + groups = np.unique(adata.obs["perturbation"]) + # KDE is slow, subset to 5 groups for speed up + adata = adata[adata.obs["perturbation"].isin(groups[0:5])].copy() + + return adata + + +@fixture +def distance_obj(request): + distance = request.node.callspec.params["distance"] + if distance in lognorm_counts_distances: + Distance = pt.tl.Distance(distance, layer_key="lognorm") + elif distance in pseudo_counts_distances: + Distance = pt.tl.Distance(distance, layer_key="counts") + else: Distance = pt.tl.Distance(distance, obsm_key="X_pca") + return Distance + + +@fixture +@mark.parametrize("distance", all_distances) +def pairwise_distance(adata, distance_obj, distance): + return distance_obj.pairwise(adata, groupby="perturbation", show_progressbar=True) + + +@mark.parametrize("distance", actual_distances + semi_distances) +def test_distance_axioms(pairwise_distance, distance): + # This is equivalent to testing for a semimetric, defined as fulfilling all axioms except triangle inequality. + # (M1) Definiteness + assert all(np.diag(pairwise_distance.values) == 0) # distance to self is 0 + + # (M2) Positivity + assert len(pairwise_distance) == np.sum(pairwise_distance.values == 0) # distance to other is not 0 + assert all(pairwise_distance.values.flatten() >= 0) # distance is non-negative + + # (M3) Symmetry + assert np.sum(pairwise_distance.values - pairwise_distance.values.T) == 0 + + +@mark.parametrize("distance", actual_distances) +def test_triangle_inequality(pairwise_distance, distance): + # Test if distances are well-defined in accordance with metric axioms + # (M4) Triangle inequality (we just probe this for a few random triplets) + for _i in range(10): rng = np.random.default_rng() - X = rng.normal(size=(100, 10)) - Y = rng.normal(size=(100, 10)) - d = Distance.bootstrap(X, Y, n_bootstrap=10) - assert hasattr(d, "mean") - assert hasattr(d, "variance") - - @mark.parametrize("distance", all_distances) - def test_bootstrap_distance_pairwise(self, adata, distance_obj, distance): - # Test consistency of pairwise distance results - bootstrap_output = distance_obj.pairwise(adata, groupby="perturbation", bootstrap=True, n_bootstrap=10) - assert isinstance(bootstrap_output, tuple) - - mean = bootstrap_output[0] - var = bootstrap_output[1] - - assert mean.columns.equals(mean.index) - assert np.sum(mean.values - mean.values.T) == 0 # symmetry - assert np.sum(var.values - var.values.T) == 0 # symmetry - - @mark.parametrize("distance", all_distances) - def test_bootstrap_distance_onesided(self, adata, distance_obj, distance): - # Test consistency of one-sided distance results - selected_group = adata.obs.perturbation.unique()[0] - bootstrap_output = distance_obj.onesided_distances( - adata, - groupby="perturbation", - selected_group=selected_group, - bootstrap=True, - n_bootstrap=10, + triplet = rng.choice(pairwise_distance.index, size=3, replace=False) + assert ( + pairwise_distance.loc[triplet[0], triplet[1]] + pairwise_distance.loc[triplet[1], triplet[2]] + >= pairwise_distance.loc[triplet[0], triplet[2]] ) - assert isinstance(bootstrap_output, tuple) + +@mark.parametrize("distance", all_distances) +def test_distance_layers(pairwise_distance, distance): + assert isinstance(pairwise_distance, DataFrame) + assert pairwise_distance.columns.equals(pairwise_distance.index) + assert np.sum(pairwise_distance.values - pairwise_distance.values.T) == 0 # symmetry + + +@mark.parametrize("distance", actual_distances + pseudo_counts_distances) +def test_distance_counts(adata, distance): + if distance != "mahalanobis": # doesn't work, covariance matrix is a singular matrix, not invertible + Distance = pt.tl.Distance(distance, layer_key="counts") + df = Distance.pairwise(adata, groupby="perturbation") + assert isinstance(df, DataFrame) + assert df.columns.equals(df.index) + assert np.sum(df.values - df.values.T) == 0 + + +@mark.parametrize("distance", all_distances) +def test_mutually_exclusive_keys(distance): + with pytest.raises(ValueError): + _ = pt.tl.Distance(distance, layer_key="counts", obsm_key="X_pca") + + +@mark.parametrize("distance", actual_distances + semi_distances + non_distances) +def test_distance_output_type(distance): + # Test if distances are outputting floats + Distance = pt.tl.Distance(distance, obsm_key="X_pca") + rng = np.random.default_rng() + X = rng.normal(size=(100, 10)) + Y = rng.normal(size=(100, 10)) + d = Distance(X, Y) + assert isinstance(d, float) + + +@mark.parametrize("distance", all_distances + onesided_only) +def test_distance_onesided(adata, distance_obj, distance): + # Test consistency of one-sided distance results + selected_group = adata.obs.perturbation.unique()[0] + df = distance_obj.onesided_distances(adata, groupby="perturbation", selected_group=selected_group) + assert isinstance(df, Series) + assert df.loc[selected_group] == 0 # distance to self is 0 + + +@mark.parametrize("distance", actual_distances + semi_distances + non_distances) +def test_bootstrap_distance_output_type(distance): + # Test if distances are outputting floats + Distance = pt.tl.Distance(distance, obsm_key="X_pca") + rng = np.random.default_rng() + X = rng.normal(size=(100, 10)) + Y = rng.normal(size=(100, 10)) + d = Distance.bootstrap(X, Y, n_bootstrap=10) + assert hasattr(d, "mean") + assert hasattr(d, "variance") + + +@mark.parametrize("distance", all_distances) +def test_bootstrap_distance_pairwise(adata, distance_obj, distance): + # Test consistency of pairwise distance results + bootstrap_output = distance_obj.pairwise(adata, groupby="perturbation", bootstrap=True, n_bootstrap=10) + assert isinstance(bootstrap_output, tuple) + + mean = bootstrap_output[0] + var = bootstrap_output[1] + + assert mean.columns.equals(mean.index) + assert np.sum(mean.values - mean.values.T) == 0 # symmetry + assert np.sum(var.values - var.values.T) == 0 # symmetry + + +@mark.parametrize("distance", all_distances) +def test_bootstrap_distance_onesided(adata, distance_obj, distance): + # Test consistency of one-sided distance results + selected_group = adata.obs.perturbation.unique()[0] + bootstrap_output = distance_obj.onesided_distances( + adata, + groupby="perturbation", + selected_group=selected_group, + bootstrap=True, + n_bootstrap=10, + ) + + assert isinstance(bootstrap_output, tuple) From 052fd000044d01b6154f1c54954d694ae35a45d1 Mon Sep 17 00:00:00 2001 From: wxicu Date: Wed, 12 Jun 2024 23:36:14 +0200 Subject: [PATCH 24/34] update compare_de --- pertpy/tools/_metrics_3g.py | 107 +++++++++++++++++++++------------ tests/tools/test_metrics_3g.py | 93 +++++++++++++++++++++++----- 2 files changed, 148 insertions(+), 52 deletions(-) diff --git a/pertpy/tools/_metrics_3g.py b/pertpy/tools/_metrics_3g.py index 03dc2172..c73d719c 100644 --- a/pertpy/tools/_metrics_3g.py +++ b/pertpy/tools/_metrics_3g.py @@ -1,13 +1,12 @@ from collections.abc import Mapping from functools import partial from types import MappingProxyType -from typing import TYPE_CHECKING, Any, Literal, Optional +from typing import TYPE_CHECKING, Any, Literal -import anndata as ad import numpy as np import pandas as pd import pynndescent -import scanpy as sc +from anndata import AnnData from scipy.sparse import issparse from scipy.sparse import vstack as sp_vstack from sklearn.base import ClassifierMixin @@ -19,49 +18,83 @@ from numpy.typing import NDArray -def compare_de(X: np.ndarray, Y: np.ndarray, C: np.ndarray, shared_top: int = 100, **kwargs) -> dict: - """Compare DEG across real and simulated perturbations. +def compare_de( + adata: AnnData | None = None, + de_key1: str = None, + de_key2: str = None, + de_df1: pd.DataFrame | None = None, + de_df2: pd.DataFrame | None = None, + shared_top: int = 100, +) -> dict: + """Compare two differential expression analyses. - Computes DEG for real and simulated perturbations vs. control and calculates - metrics to evaluate similarity of the results. + Compare two sets of DE results and evaluate the similarity by the overlap of top DEG and + the correlation of their scores and adjusted p-values. Args: - X: Real perturbed data. - Y: Simulated perturbed data. - C: Control data + adata: AnnData object containing DE results in `uns`. Required if `de_key1` and `de_key2` are used. + de_key1: Key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`. + de_key2: Another key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`. + de_df1: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface. + de_df2: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface. shared_top: The number of top DEG to compute the proportion of their intersection. - **kwargs: arguments for `scanpy.tl.rank_genes_groups`. - """ - n_vars = X.shape[1] - assert n_vars == Y.shape[1] == C.shape[1] - - shared_top = min(shared_top, n_vars) - vars_ranks = np.arange(1, n_vars + 1) - adatas_xy = {} - adatas_xy["x"] = ad.AnnData(X, obs={"label": "comp"}) - adatas_xy["y"] = ad.AnnData(Y, obs={"label": "comp"}) - adata_c = ad.AnnData(C, obs={"label": "ctrl"}) - - results = pd.DataFrame(index=adata_c.var_names) + """ + if (de_key1 or de_key2) and (de_df1 is not None or de_df2 is not None): + raise ValueError( + "Please provide either both `de_key1` and `de_key2` with `adata`, or `de_df1` and `de_df2`, but not both." + ) + + if de_df1 is None and de_df2 is None: # use df + if not de_key1 or not de_key2: + raise ValueError("Both `de_key1` and `de_key2` must be provided together if using `adata`.") + + else: # use keys + if de_df1 is None or de_df2 is None: + raise ValueError("Both `de_df1` and `de_df2` must be provided together if using dataframes.") + + if de_key1: + if not adata: + raise ValueError("`adata` should be provided with `de_key1` and `de_key2`. ") + assert all( + k in adata.uns for k in [de_key1, de_key2] + ), "Provided `de_key1` and `de_key2` must exist in `adata.uns`." + vars = adata.var_names + + if de_df1 is not None: + for df in (de_df1, de_df2): + if not {"variable", "log_fc", "adj_p_value"}.issubset(df.columns): + raise ValueError("Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'.") + + assert set(de_df1["variable"]) == set(de_df2["variable"]), "Variables in both dataframes must match." + vars = de_df1["variable"].sort_values() + + shared_top = min(shared_top, len(vars)) + vars_ranks = np.arange(1, len(vars) + 1) + results = pd.DataFrame(index=vars) top_names = [] - for group in ("x", "y"): - adata_joint = ad.concat((adatas_xy[group], adata_c), index_unique="-") - - sc.tl.rank_genes_groups(adata_joint, groupby="label", reference="ctrl", key_added="de", **kwargs) - srt_idx = np.argsort(adata_joint.uns["de"]["names"]["comp"]) - results[f"scores_{group}"] = adata_joint.uns["de"]["scores"]["comp"][srt_idx] - results[f"pvals_adj_{group}"] = adata_joint.uns["de"]["pvals_adj"]["comp"][srt_idx] - results[f"ranks_{group}"] = vars_ranks[srt_idx] - - top_names.append(adata_joint.uns["de"]["names"]["comp"][:shared_top]) + if de_key1 and de_key2: + for i, k in enumerate([de_key1, de_key2]): + label = adata.uns[k]["names"].dtype.names[0] + srt_idx = np.argsort(adata.uns[k]["names"][label]) + results[f"scores_{i}"] = adata.uns[k]["scores"][label][srt_idx] + results[f"pvals_adj_{i}"] = adata.uns[k]["pvals_adj"][label][srt_idx] + results[f"ranks_{i}"] = vars_ranks[srt_idx] + top_names.append(adata.uns[k]["names"][label][:shared_top]) + else: + for i, df in enumerate([de_df1, de_df2]): + srt_idx = np.argsort(df["variable"]) + results[f"scores_{i}"] = df["log_fc"].values[srt_idx] + results[f"pvals_adj_{i}"] = df["adj_p_value"].values[srt_idx] + results[f"ranks_{i}"] = vars_ranks[srt_idx] + top_names.append(df["variable"][:shared_top]) metrics = {} metrics["shared_top_genes"] = len(set(top_names[0]).intersection(top_names[1])) / shared_top - metrics["scores_corr"] = results["scores_x"].corr(results["scores_y"], method="pearson") - metrics["pvals_adj_corr"] = results["pvals_adj_x"].corr(results["pvals_adj_y"], method="pearson") - metrics["scores_ranks_corr"] = results["ranks_x"].corr(results["ranks_y"], method="spearman") + metrics["scores_corr"] = results["scores_0"].corr(results["scores_1"], method="pearson") + metrics["pvals_adj_corr"] = results["pvals_adj_0"].corr(results["pvals_adj_1"], method="pearson") + metrics["scores_ranks_corr"] = results["ranks_0"].corr(results["ranks_1"], method="spearman") return metrics @@ -160,7 +193,7 @@ def compare_knn( return counts -def compare_dist( +def compare_distance( pert: np.ndarray, pred: np.ndarray, ctrl: np.ndarray, diff --git a/tests/tools/test_metrics_3g.py b/tests/tools/test_metrics_3g.py index 11d6aebf..b8891e71 100644 --- a/tests/tools/test_metrics_3g.py +++ b/tests/tools/test_metrics_3g.py @@ -1,10 +1,14 @@ +from typing import TYPE_CHECKING + import numpy as np +import pandas as pd import pertpy as pt import pytest +from anndata import AnnData from pertpy.tools._metrics_3g import ( compare_class, compare_de, - compare_dist, + compare_distance, compare_knn, ) @@ -18,17 +22,76 @@ def test_data(): return X, Y, C -def test_compare_de(test_data): - X, Y, C = test_data - result = compare_de(X, Y, C, shared_top=5) - assert isinstance(result, dict) - required_keys = { - "shared_top_genes", - "scores_corr", - "pvals_adj_corr", - "scores_ranks_corr", +@pytest.fixture +def compare_de_adata(): + rng = np.random.default_rng() + adata = AnnData(rng.normal(size=(100, 10))) + genes = np.rec.fromarrays( + [np.array([f"gene{i}" for i in range(10)])], + names=["group1", "O"], + ) + adata.uns["de_key1"] = { + "names": genes, + "scores": {"group1": rng.random(10)}, + "pvals_adj": {"group1": rng.random(10)}, + } + adata.uns["de_key2"] = { + "names": genes, + "scores": {"group1": rng.random(10)}, + "pvals_adj": {"group1": rng.random(10)}, } - assert all(key in result for key in required_keys) + return adata + + +@pytest.fixture +def compare_de_dataframe(): + rng = np.random.default_rng() + df1 = pd.DataFrame( + { + "variable": ["gene" + str(i) for i in range(10)], + "log_fc": rng.random(10), + "adj_p_value": rng.random(10), + } + ) + df2 = pd.DataFrame( + { + "variable": ["gene" + str(i) for i in range(10)], + "log_fc": rng.random(10), + "adj_p_value": rng.random(10), + } + ) + return df1, df2 + + +def test_error_both_keys_and_dfs(compare_de_adata, compare_de_dataframe): + with pytest.raises(ValueError): + compare_de(adata=compare_de_adata, de_key1="de_key1", de_df1=compare_de_dataframe[0]) + + +def test_error_missing_adata(): + with pytest.raises(ValueError): + compare_de(de_key1="de_key1", de_key2="de_key2") + + +def test_error_missing_df(compare_de_dataframe): + with pytest.raises(ValueError): + compare_de(de_df1=compare_de_dataframe[0]) + + +def test_compare_de_key(compare_de_adata): + results = compare_de(adata=compare_de_adata, de_key1="de_key1", de_key2="de_key2", shared_top=5) + assert "shared_top_genes" in results + assert "scores_corr" in results + assert "pvals_adj_corr" in results + assert "scores_ranks_corr" in results + + +def test_compare_de_df(compare_de_dataframe): + results = compare_de(de_df1=compare_de_dataframe[0], de_df2=compare_de_dataframe[1], shared_top=5) + assert "shared_top_genes" in results + assert "scores_corr" in results + assert "pvals_adj_corr" in results + assert "scores_ranks_corr" in results def test_compare_class(test_data): @@ -48,11 +111,11 @@ def test_compare_knn(test_data): assert isinstance(result_no_ctrl, dict) -def test_compare_dist(test_data): +def test_compare_distance(test_data): X, Y, C = test_data - res_simple = compare_dist(X, Y, C, mode="simple") + res_simple = compare_distance(X, Y, C, mode="simple") assert isinstance(res_simple, float) - res_scaled = compare_dist(X, Y, C, mode="scaled") + res_scaled = compare_distance(X, Y, C, mode="scaled") assert isinstance(res_scaled, float) with pytest.raises(ValueError): - compare_dist(X, Y, C, mode="new_mode") + compare_distance(X, Y, C, mode="new_mode") From 3a8eac67b3f84f994957e9ea70fb8abe90a5bb05 Mon Sep 17 00:00:00 2001 From: wxicu Date: Wed, 12 Jun 2024 23:43:08 +0200 Subject: [PATCH 25/34] correct the comments --- pertpy/tools/_metrics_3g.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pertpy/tools/_metrics_3g.py b/pertpy/tools/_metrics_3g.py index c73d719c..d19a875a 100644 --- a/pertpy/tools/_metrics_3g.py +++ b/pertpy/tools/_metrics_3g.py @@ -45,11 +45,11 @@ def compare_de( "Please provide either both `de_key1` and `de_key2` with `adata`, or `de_df1` and `de_df2`, but not both." ) - if de_df1 is None and de_df2 is None: # use df + if de_df1 is None and de_df2 is None: # use keys if not de_key1 or not de_key2: raise ValueError("Both `de_key1` and `de_key2` must be provided together if using `adata`.") - else: # use keys + else: # use dfs if de_df1 is None or de_df2 is None: raise ValueError("Both `de_df1` and `de_df2` must be provided together if using dataframes.") From 63ed17a37341cb1ef296510079364030607fd8e4 Mon Sep 17 00:00:00 2001 From: wxicu Date: Thu, 13 Jun 2024 14:02:19 +0200 Subject: [PATCH 26/34] speed tests --- pertpy/metadata/_drug.py | 2 +- pertpy/tools/_distances/_distances.py | 83 ++++++++++++++++-------- pertpy/tools/_metrics_3g.py | 43 +----------- tests/tools/_distances/test_distances.py | 43 +++++++----- tests/tools/test_metrics_3g.py | 22 +------ 5 files changed, 88 insertions(+), 105 deletions(-) diff --git a/pertpy/metadata/_drug.py b/pertpy/metadata/_drug.py index 0f6066eb..106ce93b 100644 --- a/pertpy/metadata/_drug.py +++ b/pertpy/metadata/_drug.py @@ -45,7 +45,7 @@ def _download_drug_annotation( block_size=4096, is_zip=False, ) - dgidb_df = pd.read_table(dgidb_path, delimiter=",") + dgidb_df = pd.read_table(dgidb_path) return dgidb_df else: diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index d6a249ac..ee0f6e64 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -23,8 +23,6 @@ from sklearn.neighbors import KernelDensity from statsmodels.discrete.discrete_model import NegativeBinomialP -AGG_FCTS = {"mean": np.mean, "median": np.median, "sum": np.sum} - if TYPE_CHECKING: from collections.abc import Callable @@ -115,7 +113,7 @@ class Distance: def __init__( self, metric: str = "edistance", - agg_fct: Literal["mean", "median", "sum"] = "mean", + agg_fct: Callable = np.mean, layer_key: str = None, obsm_key: str = None, cell_wise_metric: str = "euclidean", @@ -124,7 +122,7 @@ def __init__( Args: metric: Distance metric to use. - agg_fct: Name of the aggregation function to generate pseudobulk vectors. + agg_fct: Aggregation function to generate pseudobulk vectors. layer_key: Name of the counts layer containing raw counts to calculate distances for. Mutually exclusive with 'obsm_key'. Is not used if `None`. @@ -133,8 +131,8 @@ def __init__( Defaults to None, but is set to "X_pca" if not explicitly set internally. cell_wise_metric: Metric from scipy.spatial.distance to use for pairwise distances between single cells. """ - self.aggregation_func = AGG_FCTS[agg_fct] metric_fct: AbstractDistance = None + self.aggregation_func = agg_fct if metric == "edistance": metric_fct = Edistance() elif metric == "euclidean": @@ -231,7 +229,7 @@ def bootstrap( Y: np.ndarray, *, n_bootstrap: int = 100, - bootstrap_random_state: int = 0, + random_state: int = 0, **kwargs, ) -> MeanVar: """Bootstrap computation of mean and variance of the distance between vectors X and Y. @@ -240,7 +238,7 @@ def bootstrap( X: First vector of shape (n_samples, n_features). Y: Second vector of shape (n_samples, n_features). n_bootstrap: Number of bootstrap samples. - bootstrap_random_state: Random state for bootstrapping. + random_state: Random state for bootstrapping. Returns: MeanVar: Mean and variance of distance between X and Y. @@ -257,7 +255,7 @@ def bootstrap( X, Y, n_bootstraps=n_bootstrap, - bootstrap_random_state=bootstrap_random_state, + random_state=random_state, **kwargs, ) @@ -268,7 +266,7 @@ def pairwise( groups: list[str] | None = None, bootstrap: bool = False, n_bootstrap: int = 100, - bootstrap_random_state: int = 0, + random_state: int = 0, show_progressbar: bool = True, n_jobs: int = -1, **kwargs, @@ -282,7 +280,7 @@ def pairwise( If None, uses all groups. bootstrap: Whether to bootstrap the distance. n_bootstrap: Number of bootstrap samples. - bootstrap_random_state: Random state for bootstrapping. + random_state: Random state for bootstrapping. show_progressbar: Whether to show progress bar. n_jobs: Number of cores to use. Defaults to -1 (all). kwargs: Additional keyword arguments passed to the metric function. @@ -334,7 +332,7 @@ def pairwise( sub_pwd, sub_idx, n_bootstraps=n_bootstrap, - bootstrap_random_state=bootstrap_random_state, + random_state=random_state, **kwargs, ) # In the bootstrap case, distance of group to itself is a mean and can be non-zero @@ -360,7 +358,7 @@ def pairwise( cells_x, cells_y, n_bootstrap=n_bootstrap, - bootstrap_random_state=bootstrap_random_state, + random_state=random_state, **kwargs, ) # In the bootstrap case, distance of group to itself is a mean and can be non-zero @@ -390,7 +388,7 @@ def onesided_distances( groups: list[str] | None = None, bootstrap: bool = False, n_bootstrap: int = 100, - bootstrap_random_state: int = 0, + random_state: int = 0, show_progressbar: bool = True, n_jobs: int = -1, **kwargs, @@ -405,7 +403,7 @@ def onesided_distances( If None, uses all groups. bootstrap: Whether to bootstrap the distance. n_bootstrap: Number of bootstrap samples. - bootstrap_random_state: Random state for bootstrapping. + random_state: Random state for bootstrapping. show_progressbar: Whether to show progress bar. n_jobs: Number of cores to use. Defaults to -1 (all). kwargs: Additional keyword arguments passed to the metric function. @@ -469,7 +467,7 @@ def onesided_distances( sub_pwd, sub_idx, n_bootstraps=n_bootstrap, - bootstrap_random_state=bootstrap_random_state, + random_state=random_state, **kwargs, ) df.loc[group_x] = bootstrap_output.mean @@ -492,7 +490,7 @@ def onesided_distances( cells_x, cells_y, n_bootstrap=n_bootstrap, - bootstrap_random_state=bootstrap_random_state, + random_state=random_state, **kwargs, ) # In the bootstrap case, distance of group to itself is a mean and can be non-zero @@ -533,8 +531,42 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None: pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs) adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd - def _bootstrap_mode(self, X, Y, n_bootstraps=100, bootstrap_random_state=0, **kwargs) -> MeanVar: - rng = np.random.default_rng(bootstrap_random_state) + def compare_distance( + self, + pert: np.ndarray, + pred: np.ndarray, + ctrl: np.ndarray, + mode: Literal["simple", "scaled"] = "simple", + fit_to_pert_and_ctrl: bool = False, + **kwargs, + ) -> float: + """Compute the score of simulating a perturbation. + + Args: + pert: Real perturbed data. + pred: Simulated perturbed data. + ctrl: Control data + mode: Mode to use. + fit_to_pert_and_ctrl: Scales data based on both `pert` and `ctrl` if True, otherwise only on `ctrl`. + kwargs: Additional keyword arguments passed to the metric function. + """ + if mode == "simple": + pass # nothing to be done + elif mode == "scaled": + from sklearn.preprocessing import MinMaxScaler + + scaler = MinMaxScaler().fit(np.vstack((pert, ctrl)) if fit_to_pert_and_ctrl else ctrl) + pred = scaler.transform(pred) + pert = scaler.transform(pert) + else: + raise ValueError(f"Unknown mode {mode}. Please choose simple or scaled.") + + d1 = self.metric_fct(pert, pred, **kwargs) + d2 = self.metric_fct(ctrl, pred, **kwargs) + return d1 / d2 + + def _bootstrap_mode(self, X, Y, n_bootstraps=100, random_state=0, **kwargs) -> MeanVar: + rng = np.random.default_rng(random_state) distances = [] for _ in range(n_bootstraps): @@ -548,10 +580,8 @@ def _bootstrap_mode(self, X, Y, n_bootstraps=100, bootstrap_random_state=0, **kw variance = np.var(distances) return MeanVar(mean=mean, variance=variance) - def _bootstrap_mode_precomputed( - self, sub_pwd, sub_idx, n_bootstraps=100, bootstrap_random_state=0, **kwargs - ) -> MeanVar: - rng = np.random.default_rng(bootstrap_random_state) + def _bootstrap_mode_precomputed(self, sub_pwd, sub_idx, n_bootstraps=100, random_state=0, **kwargs) -> MeanVar: + rng = np.random.default_rng(random_state) distances = [] for _ in range(n_bootstraps): @@ -1074,10 +1104,7 @@ def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: class MeanVarDistributionDistance(AbstractDistance): - """ - Distance between mean-var distributions of gene expression. - - """ + """Distance between mean-var distributions of gene expression.""" def __init__(self) -> None: super().__init__() @@ -1159,7 +1186,9 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: return mahalanobis( - self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0), np.linalg.inv(np.cov(X.T)) + self.aggregation_func(X, axis=0), + self.aggregation_func(Y, axis=0), + np.linalg.inv(np.cov(X.T)), ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: diff --git a/pertpy/tools/_metrics_3g.py b/pertpy/tools/_metrics_3g.py index d19a875a..ec13ab1a 100644 --- a/pertpy/tools/_metrics_3g.py +++ b/pertpy/tools/_metrics_3g.py @@ -1,7 +1,4 @@ -from collections.abc import Mapping -from functools import partial -from types import MappingProxyType -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -12,8 +9,6 @@ from sklearn.base import ClassifierMixin from sklearn.linear_model import LogisticRegression -from pertpy.tools._distances._distances import Distance - if TYPE_CHECKING: from numpy.typing import NDArray @@ -191,39 +186,3 @@ def compare_knn( counts[group] = count_norm return counts - - -def compare_distance( - pert: np.ndarray, - pred: np.ndarray, - ctrl: np.ndarray, - *, - metric: str = "euclidean", - mode: Literal["simple", "scaled"] = "simple", - metric_kwds: Mapping[str, Any] = MappingProxyType({}), - _fit_to_pert_and_ctrl: bool = False, -) -> float: - """Compute the score of simulating a perturbation. - - Args: - pert: Real perturbed data. - pred: Simulated perturbed data. - ctrl: Control data - mode: Mode to use. - """ - metric_fct = partial(Distance(metric).metric_fct, **metric_kwds) - - if mode == "simple": - pass # nothing to be done - elif mode == "scaled": - from sklearn.preprocessing import MinMaxScaler - - scaler = MinMaxScaler().fit(np.vstack((pert, ctrl)) if _fit_to_pert_and_ctrl else ctrl) - pred = scaler.transform(pred) - pert = scaler.transform(pert) - else: - raise ValueError(f"Unknown mode {mode}. Please choose simple or scaled.") - - d1 = metric_fct(pert, pred) - d2 = metric_fct(ctrl, pred) - return d1 / d2 diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py index b9ffe3a2..4dc70219 100644 --- a/tests/tools/_distances/test_distances.py +++ b/tests/tools/_distances/test_distances.py @@ -61,8 +61,8 @@ def adata(request): sc.pp.pca(adata, n_comps=5) if distance in lognorm_counts_distances: groups = np.unique(adata.obs["perturbation"]) - # KDE is slow, subset to 5 groups for speed up - adata = adata[adata.obs["perturbation"].isin(groups[0:5])].copy() + # KDE is slow, subset to 3 groups for speed up + adata = adata[adata.obs["perturbation"].isin(groups[0:3])].copy() return adata @@ -100,11 +100,10 @@ def test_distance_axioms(pairwise_distance, distance): @mark.parametrize("distance", actual_distances) -def test_triangle_inequality(pairwise_distance, distance): +def test_triangle_inequality(pairwise_distance, distance, rng): # Test if distances are well-defined in accordance with metric axioms # (M4) Triangle inequality (we just probe this for a few random triplets) for _i in range(10): - rng = np.random.default_rng() triplet = rng.choice(pairwise_distance.index, size=3, replace=False) assert ( pairwise_distance.loc[triplet[0], triplet[1]] + pairwise_distance.loc[triplet[1], triplet[2]] @@ -121,7 +120,7 @@ def test_distance_layers(pairwise_distance, distance): @mark.parametrize("distance", actual_distances + pseudo_counts_distances) def test_distance_counts(adata, distance): - if distance != "mahalanobis": # doesn't work, covariance matrix is a singular matrix, not invertible + if distance != "mahalanobis": # skip, doesn't work because covariance matrix is a singular matrix, not invertible Distance = pt.tl.Distance(distance, layer_key="counts") df = Distance.pairwise(adata, groupby="perturbation") assert isinstance(df, DataFrame) @@ -136,12 +135,11 @@ def test_mutually_exclusive_keys(distance): @mark.parametrize("distance", actual_distances + semi_distances + non_distances) -def test_distance_output_type(distance): +def test_distance_output_type(distance, rng): # Test if distances are outputting floats Distance = pt.tl.Distance(distance, obsm_key="X_pca") - rng = np.random.default_rng() - X = rng.normal(size=(100, 10)) - Y = rng.normal(size=(100, 10)) + X = rng.normal(size=(50, 10)) + Y = rng.normal(size=(50, 10)) d = Distance(X, Y) assert isinstance(d, float) @@ -156,13 +154,12 @@ def test_distance_onesided(adata, distance_obj, distance): @mark.parametrize("distance", actual_distances + semi_distances + non_distances) -def test_bootstrap_distance_output_type(distance): +def test_bootstrap_distance_output_type(distance, rng): # Test if distances are outputting floats Distance = pt.tl.Distance(distance, obsm_key="X_pca") - rng = np.random.default_rng() - X = rng.normal(size=(100, 10)) - Y = rng.normal(size=(100, 10)) - d = Distance.bootstrap(X, Y, n_bootstrap=10) + X = rng.normal(size=(50, 10)) + Y = rng.normal(size=(50, 10)) + d = Distance.bootstrap(X, Y, n_bootstrap=3) assert hasattr(d, "mean") assert hasattr(d, "variance") @@ -170,7 +167,8 @@ def test_bootstrap_distance_output_type(distance): @mark.parametrize("distance", all_distances) def test_bootstrap_distance_pairwise(adata, distance_obj, distance): # Test consistency of pairwise distance results - bootstrap_output = distance_obj.pairwise(adata, groupby="perturbation", bootstrap=True, n_bootstrap=10) + bootstrap_output = distance_obj.pairwise(adata, groupby="perturbation", bootstrap=True, n_bootstrap=3) + assert isinstance(bootstrap_output, tuple) mean = bootstrap_output[0] @@ -190,7 +188,20 @@ def test_bootstrap_distance_onesided(adata, distance_obj, distance): groupby="perturbation", selected_group=selected_group, bootstrap=True, - n_bootstrap=10, + n_bootstrap=3, ) assert isinstance(bootstrap_output, tuple) + + +def test_compare_distance(rng): + X = rng.normal(size=(50, 10)) + Y = rng.normal(size=(50, 10)) + C = rng.normal(size=(50, 10)) + Distance = pt.tl.Distance() + res_simple = Distance.compare_distance(X, Y, C, mode="simple") + assert isinstance(res_simple, float) + res_scaled = Distance.compare_distance(X, Y, C, mode="scaled") + assert isinstance(res_scaled, float) + with pytest.raises(ValueError): + Distance.compare_distance(X, Y, C, mode="new_mode") diff --git a/tests/tools/test_metrics_3g.py b/tests/tools/test_metrics_3g.py index b8891e71..d481fc86 100644 --- a/tests/tools/test_metrics_3g.py +++ b/tests/tools/test_metrics_3g.py @@ -1,5 +1,3 @@ -from typing import TYPE_CHECKING - import numpy as np import pandas as pd import pertpy as pt @@ -8,14 +6,12 @@ from pertpy.tools._metrics_3g import ( compare_class, compare_de, - compare_distance, compare_knn, ) @pytest.fixture -def test_data(): - rng = np.random.default_rng() +def test_data(rng): X = rng.normal(size=(100, 10)) Y = rng.normal(size=(100, 10)) C = rng.normal(size=(100, 10)) @@ -23,8 +19,7 @@ def test_data(): @pytest.fixture -def compare_de_adata(): - rng = np.random.default_rng() +def compare_de_adata(rng): adata = AnnData(rng.normal(size=(100, 10))) genes = np.rec.fromarrays( [np.array([f"gene{i}" for i in range(10)])], @@ -44,8 +39,7 @@ def compare_de_adata(): @pytest.fixture -def compare_de_dataframe(): - rng = np.random.default_rng() +def compare_de_dataframe(rng): df1 = pd.DataFrame( { "variable": ["gene" + str(i) for i in range(10)], @@ -109,13 +103,3 @@ def test_compare_knn(test_data): result_no_ctrl = compare_knn(X, Y) assert isinstance(result_no_ctrl, dict) - - -def test_compare_distance(test_data): - X, Y, C = test_data - res_simple = compare_distance(X, Y, C, mode="simple") - assert isinstance(res_simple, float) - res_scaled = compare_distance(X, Y, C, mode="scaled") - assert isinstance(res_scaled, float) - with pytest.raises(ValueError): - compare_distance(X, Y, C, mode="new_mode") From f9e0d364f5f6dace2f1ed9a2ade7468c37e56731 Mon Sep 17 00:00:00 2001 From: wxicu Date: Thu, 13 Jun 2024 16:25:38 +0200 Subject: [PATCH 27/34] speed up tests --- tests/tools/_distances/test_distances.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py index 4dc70219..0f0e6fc4 100644 --- a/tests/tools/_distances/test_distances.py +++ b/tests/tools/_distances/test_distances.py @@ -137,7 +137,7 @@ def test_mutually_exclusive_keys(distance): @mark.parametrize("distance", actual_distances + semi_distances + non_distances) def test_distance_output_type(distance, rng): # Test if distances are outputting floats - Distance = pt.tl.Distance(distance, obsm_key="X_pca") + Distance = pt.tl.Distance(distance) X = rng.normal(size=(50, 10)) Y = rng.normal(size=(50, 10)) d = Distance(X, Y) @@ -153,10 +153,9 @@ def test_distance_onesided(adata, distance_obj, distance): assert df.loc[selected_group] == 0 # distance to self is 0 -@mark.parametrize("distance", actual_distances + semi_distances + non_distances) -def test_bootstrap_distance_output_type(distance, rng): +def test_bootstrap_distance_output_type(rng): # Test if distances are outputting floats - Distance = pt.tl.Distance(distance, obsm_key="X_pca") + Distance = pt.tl.Distance(metric="edistance") X = rng.normal(size=(50, 10)) Y = rng.normal(size=(50, 10)) d = Distance.bootstrap(X, Y, n_bootstrap=3) @@ -164,10 +163,11 @@ def test_bootstrap_distance_output_type(distance, rng): assert hasattr(d, "variance") -@mark.parametrize("distance", all_distances) -def test_bootstrap_distance_pairwise(adata, distance_obj, distance): +@mark.parametrize("distance", ["edistance"]) +def test_bootstrap_distance_pairwise(adata, distance): # Test consistency of pairwise distance results - bootstrap_output = distance_obj.pairwise(adata, groupby="perturbation", bootstrap=True, n_bootstrap=3) + Distance = pt.tl.Distance(distance, obsm_key="X_pca") + bootstrap_output = Distance.pairwise(adata, groupby="perturbation", bootstrap=True, n_bootstrap=3) assert isinstance(bootstrap_output, tuple) @@ -179,11 +179,12 @@ def test_bootstrap_distance_pairwise(adata, distance_obj, distance): assert np.sum(var.values - var.values.T) == 0 # symmetry -@mark.parametrize("distance", all_distances) -def test_bootstrap_distance_onesided(adata, distance_obj, distance): +@mark.parametrize("distance", ["edistance"]) +def test_bootstrap_distance_onesided(adata, distance): # Test consistency of one-sided distance results selected_group = adata.obs.perturbation.unique()[0] - bootstrap_output = distance_obj.onesided_distances( + Distance = pt.tl.Distance(distance, obsm_key="X_pca") + bootstrap_output = Distance.onesided_distances( adata, groupby="perturbation", selected_group=selected_group, From 2e65f9de7f1bd16d7a145711b00f6b895be243bd Mon Sep 17 00:00:00 2001 From: wxicu Date: Tue, 18 Jun 2024 21:21:58 +0200 Subject: [PATCH 28/34] split metrics_3g --- pertpy/tools/__init__.py | 17 +- .../_differential_gene_expression/__init__.py | 1 + .../_dge_comparison.py | 105 ++++++++++ pertpy/tools/_metrics_3g.py | 188 ------------------ .../tools/_perturbation_space/_comparison.py | 124 ++++++++++++ .../test_dge.py} | 61 ++---- .../_perturbation_space/test_comparison.py | 29 +++ 7 files changed, 293 insertions(+), 232 deletions(-) create mode 100644 pertpy/tools/_differential_gene_expression/_dge_comparison.py delete mode 100644 pertpy/tools/_metrics_3g.py create mode 100644 pertpy/tools/_perturbation_space/_comparison.py rename tests/tools/{test_metrics_3g.py => _differential_gene_expression/test_dge.py} (52%) create mode 100644 tests/tools/_perturbation_space/test_comparison.py diff --git a/pertpy/tools/__init__.py b/pertpy/tools/__init__.py index 4bc976a2..18b11f1d 100644 --- a/pertpy/tools/__init__.py +++ b/pertpy/tools/__init__.py @@ -3,18 +3,31 @@ from pertpy.tools._coda._sccoda import Sccoda from pertpy.tools._coda._tasccoda import Tasccoda from pertpy.tools._dialogue import Dialogue -from pertpy.tools._differential_gene_expression import EdgeR, PyDESeq2, Statsmodels, TTest, WilcoxonTest +from pertpy.tools._differential_gene_expression import ( + DGE, + EdgeR, + PyDESeq2, + Statsmodels, + TTest, + WilcoxonTest, +) from pertpy.tools._distances._distance_tests import DistanceTest from pertpy.tools._distances._distances import Distance from pertpy.tools._enrichment import Enrichment from pertpy.tools._milo import Milo from pertpy.tools._mixscape import Mixscape from pertpy.tools._perturbation_space._clustering import ClusteringSpace +from pertpy.tools._perturbation_space._comparison import PerturbationComparison from pertpy.tools._perturbation_space._discriminator_classifiers import ( LRClassifierSpace, MLPClassifierSpace, ) -from pertpy.tools._perturbation_space._simple import CentroidSpace, DBSCANSpace, KMeansSpace, PseudobulkSpace +from pertpy.tools._perturbation_space._simple import ( + CentroidSpace, + DBSCANSpace, + KMeansSpace, + PseudobulkSpace, +) from pertpy.tools._scgen import Scgen __all__ = [ diff --git a/pertpy/tools/_differential_gene_expression/__init__.py b/pertpy/tools/_differential_gene_expression/__init__.py index 178ecaa3..b349cedb 100644 --- a/pertpy/tools/_differential_gene_expression/__init__.py +++ b/pertpy/tools/_differential_gene_expression/__init__.py @@ -1,4 +1,5 @@ from ._base import ContrastType, LinearModelBase, MethodBase +from ._dge_comparison import DGE from ._edger import EdgeR from ._pydeseq2 import PyDESeq2 from ._simple_tests import SimpleComparisonBase, TTest, WilcoxonTest diff --git a/pertpy/tools/_differential_gene_expression/_dge_comparison.py b/pertpy/tools/_differential_gene_expression/_dge_comparison.py new file mode 100644 index 00000000..16c8c9c7 --- /dev/null +++ b/pertpy/tools/_differential_gene_expression/_dge_comparison.py @@ -0,0 +1,105 @@ +import numpy as np +import pandas as pd +from anndata import AnnData +from typing import Dict + + +class DGE: + def compare( + self, + adata: AnnData | None = None, + de_key1: str = None, + de_key2: str = None, + de_df1: pd.DataFrame | None = None, + de_df2: pd.DataFrame | None = None, + shared_top: int = 100, + ) -> Dict[str, float]: + """Compare two differential expression analyses. + + Compare two sets of DE results and evaluate the similarity by the overlap of top DEG and + the correlation of their scores and adjusted p-values. + + Args: + adata: AnnData object containing DE results in `uns`. Required if `de_key1` and `de_key2` are used. + de_key1: Key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`. + de_key2: Another key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`. + de_df1: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface. + de_df2: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface. + shared_top: The number of top DEG to compute the proportion of their intersection. + + """ + if (de_key1 or de_key2) and (de_df1 is not None or de_df2 is not None): + raise ValueError( + "Please provide either both `de_key1` and `de_key2` with `adata`, or `de_df1` and `de_df2`, but not both." + ) + + if de_df1 is None and de_df2 is None: # use keys + if not de_key1 or not de_key2: + raise ValueError( + "Both `de_key1` and `de_key2` must be provided together if using `adata`." + ) + + else: # use dfs + if de_df1 is None or de_df2 is None: + raise ValueError( + "Both `de_df1` and `de_df2` must be provided together if using dataframes." + ) + + if de_key1: + if not adata: + raise ValueError( + "`adata` should be provided with `de_key1` and `de_key2`. " + ) + assert all( + k in adata.uns for k in [de_key1, de_key2] + ), "Provided `de_key1` and `de_key2` must exist in `adata.uns`." + vars = adata.var_names + + if de_df1 is not None: + for df in (de_df1, de_df2): + if not {"variable", "log_fc", "adj_p_value"}.issubset(df.columns): + raise ValueError( + "Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'." + ) + + assert set(de_df1["variable"]) == set( + de_df2["variable"] + ), "Variables in both dataframes must match." + vars = de_df1["variable"].sort_values() + + shared_top = min(shared_top, len(vars)) + vars_ranks = np.arange(1, len(vars) + 1) + results = pd.DataFrame(index=vars) + top_names = [] + + if de_key1 and de_key2: + for i, k in enumerate([de_key1, de_key2]): + label = adata.uns[k]["names"].dtype.names[0] + srt_idx = np.argsort(adata.uns[k]["names"][label]) + results[f"scores_{i}"] = adata.uns[k]["scores"][label][srt_idx] + results[f"pvals_adj_{i}"] = adata.uns[k]["pvals_adj"][label][srt_idx] + results[f"ranks_{i}"] = vars_ranks[srt_idx] + top_names.append(adata.uns[k]["names"][label][:shared_top]) + else: + for i, df in enumerate([de_df1, de_df2]): + srt_idx = np.argsort(df["variable"]) + results[f"scores_{i}"] = df["log_fc"].values[srt_idx] + results[f"pvals_adj_{i}"] = df["adj_p_value"].values[srt_idx] + results[f"ranks_{i}"] = vars_ranks[srt_idx] + top_names.append(df["variable"][:shared_top]) + + metrics = {} + metrics["shared_top_genes"] = ( + len(set(top_names[0]).intersection(top_names[1])) / shared_top + ) + metrics["scores_corr"] = results["scores_0"].corr( + results["scores_1"], method="pearson" + ) + metrics["pvals_adj_corr"] = results["pvals_adj_0"].corr( + results["pvals_adj_1"], method="pearson" + ) + metrics["scores_ranks_corr"] = results["ranks_0"].corr( + results["ranks_1"], method="spearman" + ) + + return metrics diff --git a/pertpy/tools/_metrics_3g.py b/pertpy/tools/_metrics_3g.py deleted file mode 100644 index ec13ab1a..00000000 --- a/pertpy/tools/_metrics_3g.py +++ /dev/null @@ -1,188 +0,0 @@ -from typing import TYPE_CHECKING - -import numpy as np -import pandas as pd -import pynndescent -from anndata import AnnData -from scipy.sparse import issparse -from scipy.sparse import vstack as sp_vstack -from sklearn.base import ClassifierMixin -from sklearn.linear_model import LogisticRegression - -if TYPE_CHECKING: - from numpy.typing import NDArray - - -def compare_de( - adata: AnnData | None = None, - de_key1: str = None, - de_key2: str = None, - de_df1: pd.DataFrame | None = None, - de_df2: pd.DataFrame | None = None, - shared_top: int = 100, -) -> dict: - """Compare two differential expression analyses. - - Compare two sets of DE results and evaluate the similarity by the overlap of top DEG and - the correlation of their scores and adjusted p-values. - - Args: - adata: AnnData object containing DE results in `uns`. Required if `de_key1` and `de_key2` are used. - de_key1: Key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`. - de_key2: Another key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`. - de_df1: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface. - de_df2: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface. - shared_top: The number of top DEG to compute the proportion of their intersection. - - """ - if (de_key1 or de_key2) and (de_df1 is not None or de_df2 is not None): - raise ValueError( - "Please provide either both `de_key1` and `de_key2` with `adata`, or `de_df1` and `de_df2`, but not both." - ) - - if de_df1 is None and de_df2 is None: # use keys - if not de_key1 or not de_key2: - raise ValueError("Both `de_key1` and `de_key2` must be provided together if using `adata`.") - - else: # use dfs - if de_df1 is None or de_df2 is None: - raise ValueError("Both `de_df1` and `de_df2` must be provided together if using dataframes.") - - if de_key1: - if not adata: - raise ValueError("`adata` should be provided with `de_key1` and `de_key2`. ") - assert all( - k in adata.uns for k in [de_key1, de_key2] - ), "Provided `de_key1` and `de_key2` must exist in `adata.uns`." - vars = adata.var_names - - if de_df1 is not None: - for df in (de_df1, de_df2): - if not {"variable", "log_fc", "adj_p_value"}.issubset(df.columns): - raise ValueError("Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'.") - - assert set(de_df1["variable"]) == set(de_df2["variable"]), "Variables in both dataframes must match." - vars = de_df1["variable"].sort_values() - - shared_top = min(shared_top, len(vars)) - vars_ranks = np.arange(1, len(vars) + 1) - results = pd.DataFrame(index=vars) - top_names = [] - - if de_key1 and de_key2: - for i, k in enumerate([de_key1, de_key2]): - label = adata.uns[k]["names"].dtype.names[0] - srt_idx = np.argsort(adata.uns[k]["names"][label]) - results[f"scores_{i}"] = adata.uns[k]["scores"][label][srt_idx] - results[f"pvals_adj_{i}"] = adata.uns[k]["pvals_adj"][label][srt_idx] - results[f"ranks_{i}"] = vars_ranks[srt_idx] - top_names.append(adata.uns[k]["names"][label][:shared_top]) - else: - for i, df in enumerate([de_df1, de_df2]): - srt_idx = np.argsort(df["variable"]) - results[f"scores_{i}"] = df["log_fc"].values[srt_idx] - results[f"pvals_adj_{i}"] = df["adj_p_value"].values[srt_idx] - results[f"ranks_{i}"] = vars_ranks[srt_idx] - top_names.append(df["variable"][:shared_top]) - - metrics = {} - metrics["shared_top_genes"] = len(set(top_names[0]).intersection(top_names[1])) / shared_top - metrics["scores_corr"] = results["scores_0"].corr(results["scores_1"], method="pearson") - metrics["pvals_adj_corr"] = results["pvals_adj_0"].corr(results["pvals_adj_1"], method="pearson") - metrics["scores_ranks_corr"] = results["ranks_0"].corr(results["ranks_1"], method="spearman") - - return metrics - - -def compare_class(X: np.ndarray, Y: np.ndarray, C: np.ndarray, clf: ClassifierMixin | None = None) -> float: - """Compare classification accuracy between real and simulated perturbations. - - Trains a classifier on the real perturbation data + the control data and reports a normalized - classification accuracy on the simulated perturbation. - - Args: - X: Real perturbed data. - Y: Simulated perturbed data. - C: Control data - clf: sklearn classifier to use, `sklearn.linear_model.LogisticRegression` if not provided. - """ - assert X.shape[1] == Y.shape[1] == C.shape[1] - - if clf is None: - clf = LogisticRegression() - - n_x = X.shape[0] - n_xc = n_x + C.shape[0] - - data = sp_vstack((X, C)) if issparse(X) else np.vstack((X, C)) - - labels = np.full(n_xc, "ctrl") - labels[:n_x] = "comp" - clf.fit(data, labels) - - norm_score = clf.score(Y, np.full(Y.shape[0], "comp")) / clf.score(X, labels[:n_x]) - norm_score = min(1.0, norm_score) - - return norm_score - - -def compare_knn( - X: np.ndarray, - Y: np.ndarray, - C: np.ndarray | None = None, - n_neighbors: int = 20, - use_Y_knn: bool = False, - random_state: int = 0, - n_jobs: int = 1, -) -> dict[str, float]: - """Calculate proportions of real perturbed and control data points for simulated data. - - Computes proportions of real perturbed (if provided), control and simulated (if `use_Y_knn=True`) - data points for simulated data. If control (`C`) is not provided, builds the knn graph from - real perturbed + simulated perturbed. - - Args: - X: Real perturbed data. - Y: Simulated perturbed data. - C: Control data. - use_Y_knn: Include simulted perturbed data (`Y`) into the knn graph. Only valid when - control (`C`) is provided. - """ - assert X.shape[1] == Y.shape[1] - if C is not None: - assert X.shape[1] == C.shape[1] - - n_y = Y.shape[0] - - if C is None: - index_data = sp_vstack((Y, X)) if issparse(X) else np.vstack((Y, X)) - else: - datas = (Y, X, C) if use_Y_knn else (X, C) - index_data = sp_vstack(datas) if issparse(X) else np.vstack(datas) - - y_in_index = use_Y_knn or C is None - c_in_index = C is not None - label_groups = ["comp"] - labels: NDArray[np.str_] = np.full(index_data.shape[0], "comp") - if y_in_index: - labels[:n_y] = "siml" - label_groups.append("siml") - if c_in_index: - labels[-C.shape[0] :] = "ctrl" - label_groups.append("ctrl") - - index = pynndescent.NNDescent( - index_data, - n_neighbors=max(50, n_neighbors), - random_state=random_state, - n_jobs=n_jobs, - ) - indices = index.query(Y, k=n_neighbors)[0] - - uq, uq_counts = np.unique(labels[indices], return_counts=True) - uq_counts_norm = uq_counts / uq_counts.sum() - counts = dict(zip(label_groups, [0.0] * len(label_groups), strict=False)) - for group, count_norm in zip(uq, uq_counts_norm, strict=False): - counts[group] = count_norm - - return counts diff --git a/pertpy/tools/_perturbation_space/_comparison.py b/pertpy/tools/_perturbation_space/_comparison.py new file mode 100644 index 00000000..390c2d9d --- /dev/null +++ b/pertpy/tools/_perturbation_space/_comparison.py @@ -0,0 +1,124 @@ +from typing import TYPE_CHECKING + +import numpy as np +import pynndescent +from scipy.sparse import issparse +from scipy.sparse import vstack as sp_vstack +from sklearn.base import ClassifierMixin +from sklearn.linear_model import LogisticRegression + +if TYPE_CHECKING: + from numpy.typing import NDArray + + +class PerturbationComparison: + """Comparison between real and simulated perturbations.""" + + def compare_classification( + self, + real: np.ndarray, + simulated: np.ndarray, + control: np.ndarray, + clf: ClassifierMixin | None = None, + ) -> float: + """Compare classification accuracy between real and simulated perturbations. + + Trains a classifier on the real perturbation data + the control data and reports a normalized + classification accuracy on the simulated perturbation. + + Args: + real: Real perturbed data. + simulated: Simulated perturbed data. + control: Control data + clf: sklearn classifier to use, `sklearn.linear_model.LogisticRegression` if not provided. + """ + assert real.shape[1] == simulated.shape[1] == control.shape[1] + if clf is None: + clf = LogisticRegression() + n_x = real.shape[0] + data = ( + sp_vstack((real, control)) if issparse(real) else np.vstack((real, control)) + ) + labels = np.concatenate( + [np.full(real.shape[0], "comp"), np.full(control.shape[0], "ctrl")] + ) + + clf.fit(data, labels) + norm_score = clf.score( + simulated, np.full(simulated.shape[0], "comp") + ) / clf.score(real, labels[:n_x]) + norm_score = min(1.0, norm_score) + + return norm_score + + def compare_knn( + self, + real: np.ndarray, + simulated: np.ndarray, + control: np.ndarray | None = None, + use_simulated_for_knn: bool = False, + n_neighbors: int = 20, + random_state: int = 0, + n_jobs: int = 1, + ) -> dict[str, float]: + """Calculate proportions of real perturbed and control data points for simulated data. + + Computes proportions of real perturbed, control and simulated (if `use_simulated_for_knn=True`) + data points for simulated data. If control (`C`) is not provided, builds the knn graph from + real perturbed + simulated perturbed. + + Args: + real: Real perturbed data. + simulated: Simulated perturbed data. + control: Control data + use_simulated_for_knn: Include simulted perturbed data (`simulated`) into the knn graph. Only valid when + control (`control`) is provided. + n_neighbors: Number of neighbors to use in k-neighbor graph. + random_state: Random state used for k-neighbor graph construction. + n_jobs: Number of cores to use. Defaults to -1 (all). + + """ + assert real.shape[1] == simulated.shape[1] + if control is not None: + assert real.shape[1] == control.shape[1] + + n_y = simulated.shape[0] + + if control is None: + index_data = ( + sp_vstack((simulated, real)) + if issparse(real) + else np.vstack((simulated, real)) + ) + else: + datas = ( + (simulated, real, control) if use_simulated_for_knn else (real, control) + ) + index_data = sp_vstack(datas) if issparse(real) else np.vstack(datas) + + y_in_index = use_simulated_for_knn or control is None + c_in_index = control is not None + label_groups = ["comp"] + labels: NDArray[np.str_] = np.full(index_data.shape[0], "comp") + if y_in_index: + labels[:n_y] = "siml" + label_groups.append("siml") + if c_in_index: + labels[-control.shape[0] :] = "ctrl" + label_groups.append("ctrl") + + index = pynndescent.NNDescent( + index_data, + n_neighbors=max(50, n_neighbors), + random_state=random_state, + n_jobs=n_jobs, + ) + indices = index.query(simulated, k=n_neighbors)[0] + + uq, uq_counts = np.unique(labels[indices], return_counts=True) + uq_counts_norm = uq_counts / uq_counts.sum() + counts = dict(zip(label_groups, [0.0] * len(label_groups), strict=False)) + for group, count_norm in zip(uq, uq_counts_norm, strict=False): + counts[group] = count_norm + + return counts diff --git a/tests/tools/test_metrics_3g.py b/tests/tools/_differential_gene_expression/test_dge.py similarity index 52% rename from tests/tools/test_metrics_3g.py rename to tests/tools/_differential_gene_expression/test_dge.py index d481fc86..4addf51a 100644 --- a/tests/tools/test_metrics_3g.py +++ b/tests/tools/_differential_gene_expression/test_dge.py @@ -1,25 +1,12 @@ +from anndata import AnnData import numpy as np import pandas as pd import pertpy as pt import pytest -from anndata import AnnData -from pertpy.tools._metrics_3g import ( - compare_class, - compare_de, - compare_knn, -) - - -@pytest.fixture -def test_data(rng): - X = rng.normal(size=(100, 10)) - Y = rng.normal(size=(100, 10)) - C = rng.normal(size=(100, 10)) - return X, Y, C @pytest.fixture -def compare_de_adata(rng): +def adata(rng): adata = AnnData(rng.normal(size=(100, 10))) genes = np.rec.fromarrays( [np.array([f"gene{i}" for i in range(10)])], @@ -39,7 +26,7 @@ def compare_de_adata(rng): @pytest.fixture -def compare_de_dataframe(rng): +def dataframe(rng): df1 = pd.DataFrame( { "variable": ["gene" + str(i) for i in range(10)], @@ -57,49 +44,39 @@ def compare_de_dataframe(rng): return df1, df2 -def test_error_both_keys_and_dfs(compare_de_adata, compare_de_dataframe): +def test_error_both_keys_and_dfs(adata, dataframe): with pytest.raises(ValueError): - compare_de(adata=compare_de_adata, de_key1="de_key1", de_df1=compare_de_dataframe[0]) + pt_DGE = pt.tl.DGE() + pt_DGE.compare(adata=adata, de_key1="de_key1", de_df1=dataframe[0]) def test_error_missing_adata(): with pytest.raises(ValueError): - compare_de(de_key1="de_key1", de_key2="de_key2") + pt_DGE = pt.tl.DGE() + pt_DGE.compare(de_key1="de_key1", de_key2="de_key2") -def test_error_missing_df(compare_de_dataframe): +def test_error_missing_df(dataframe): with pytest.raises(ValueError): - compare_de(de_df1=compare_de_dataframe[0]) + pt_DGE = pt.tl.DGE() + pt_DGE.compare(de_df1=dataframe[0]) -def test_compare_de_key(compare_de_adata): - results = compare_de(adata=compare_de_adata, de_key1="de_key1", de_key2="de_key2", shared_top=5) +def test_key(adata): + pt_DGE = pt.tl.DGE() + results = pt_DGE.compare( + adata=adata, de_key1="de_key1", de_key2="de_key2", shared_top=5 + ) assert "shared_top_genes" in results assert "scores_corr" in results assert "pvals_adj_corr" in results assert "scores_ranks_corr" in results -def test_compare_de_df(compare_de_dataframe): - results = compare_de(de_df1=compare_de_dataframe[0], de_df2=compare_de_dataframe[1], shared_top=5) +def test_df(dataframe): + pt_DGE = pt.tl.DGE() + results = pt_DGE.compare(de_df1=dataframe[0], de_df2=dataframe[1], shared_top=5) assert "shared_top_genes" in results assert "scores_corr" in results assert "pvals_adj_corr" in results assert "scores_ranks_corr" in results - - -def test_compare_class(test_data): - X, Y, C = test_data - result = compare_class(X, Y, C) - assert result <= 1 - - -def test_compare_knn(test_data): - X, Y, C = test_data - result = compare_knn(X, Y, C) - assert isinstance(result, dict) - assert "comp" in result - assert isinstance(result["comp"], float) - - result_no_ctrl = compare_knn(X, Y) - assert isinstance(result_no_ctrl, dict) diff --git a/tests/tools/_perturbation_space/test_comparison.py b/tests/tools/_perturbation_space/test_comparison.py new file mode 100644 index 00000000..7f291f2b --- /dev/null +++ b/tests/tools/_perturbation_space/test_comparison.py @@ -0,0 +1,29 @@ +import pytest +import pertpy as pt + + +@pytest.fixture +def test_data(rng): + X = rng.normal(size=(100, 10)) + Y = rng.normal(size=(100, 10)) + C = rng.normal(size=(100, 10)) + return X, Y, C + + +def test_compare_class(test_data): + X, Y, C = test_data + pt_comparison = pt.tl.PerturbationComparison() + result = pt_comparison.compare_classification(X, Y, C) + assert result <= 1 + + +def test_compare_knn(test_data): + X, Y, C = test_data + pt_comparison = pt.tl.PerturbationComparison() + result = pt_comparison.compare_knn(X, Y, C) + assert isinstance(result, dict) + assert "comp" in result + assert isinstance(result["comp"], float) + + result_no_ctrl = pt_comparison.compare_knn(X, Y) + assert isinstance(result_no_ctrl, dict) From 2e7acf393f33523592603d2e30d7cb22262e855b Mon Sep 17 00:00:00 2001 From: wxicu Date: Tue, 18 Jun 2024 21:26:12 +0200 Subject: [PATCH 29/34] fix pre-commit --- .../_dge_comparison.py | 41 ++++++------------- .../tools/_perturbation_space/_comparison.py | 22 +++------- .../_differential_gene_expression/test_dge.py | 6 +-- .../_perturbation_space/test_comparison.py | 2 +- 4 files changed, 20 insertions(+), 51 deletions(-) diff --git a/pertpy/tools/_differential_gene_expression/_dge_comparison.py b/pertpy/tools/_differential_gene_expression/_dge_comparison.py index 16c8c9c7..bd8c10fe 100644 --- a/pertpy/tools/_differential_gene_expression/_dge_comparison.py +++ b/pertpy/tools/_differential_gene_expression/_dge_comparison.py @@ -1,7 +1,8 @@ +from typing import Dict + import numpy as np import pandas as pd from anndata import AnnData -from typing import Dict class DGE: @@ -13,7 +14,7 @@ def compare( de_df1: pd.DataFrame | None = None, de_df2: pd.DataFrame | None = None, shared_top: int = 100, - ) -> Dict[str, float]: + ) -> dict[str, float]: """Compare two differential expression analyses. Compare two sets of DE results and evaluate the similarity by the overlap of top DEG and @@ -35,21 +36,15 @@ def compare( if de_df1 is None and de_df2 is None: # use keys if not de_key1 or not de_key2: - raise ValueError( - "Both `de_key1` and `de_key2` must be provided together if using `adata`." - ) + raise ValueError("Both `de_key1` and `de_key2` must be provided together if using `adata`.") else: # use dfs if de_df1 is None or de_df2 is None: - raise ValueError( - "Both `de_df1` and `de_df2` must be provided together if using dataframes." - ) + raise ValueError("Both `de_df1` and `de_df2` must be provided together if using dataframes.") if de_key1: if not adata: - raise ValueError( - "`adata` should be provided with `de_key1` and `de_key2`. " - ) + raise ValueError("`adata` should be provided with `de_key1` and `de_key2`. ") assert all( k in adata.uns for k in [de_key1, de_key2] ), "Provided `de_key1` and `de_key2` must exist in `adata.uns`." @@ -58,13 +53,9 @@ def compare( if de_df1 is not None: for df in (de_df1, de_df2): if not {"variable", "log_fc", "adj_p_value"}.issubset(df.columns): - raise ValueError( - "Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'." - ) + raise ValueError("Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'.") - assert set(de_df1["variable"]) == set( - de_df2["variable"] - ), "Variables in both dataframes must match." + assert set(de_df1["variable"]) == set(de_df2["variable"]), "Variables in both dataframes must match." vars = de_df1["variable"].sort_values() shared_top = min(shared_top, len(vars)) @@ -89,17 +80,9 @@ def compare( top_names.append(df["variable"][:shared_top]) metrics = {} - metrics["shared_top_genes"] = ( - len(set(top_names[0]).intersection(top_names[1])) / shared_top - ) - metrics["scores_corr"] = results["scores_0"].corr( - results["scores_1"], method="pearson" - ) - metrics["pvals_adj_corr"] = results["pvals_adj_0"].corr( - results["pvals_adj_1"], method="pearson" - ) - metrics["scores_ranks_corr"] = results["ranks_0"].corr( - results["ranks_1"], method="spearman" - ) + metrics["shared_top_genes"] = len(set(top_names[0]).intersection(top_names[1])) / shared_top + metrics["scores_corr"] = results["scores_0"].corr(results["scores_1"], method="pearson") + metrics["pvals_adj_corr"] = results["pvals_adj_0"].corr(results["pvals_adj_1"], method="pearson") + metrics["scores_ranks_corr"] = results["ranks_0"].corr(results["ranks_1"], method="spearman") return metrics diff --git a/pertpy/tools/_perturbation_space/_comparison.py b/pertpy/tools/_perturbation_space/_comparison.py index 390c2d9d..2ea76cc2 100644 --- a/pertpy/tools/_perturbation_space/_comparison.py +++ b/pertpy/tools/_perturbation_space/_comparison.py @@ -36,17 +36,11 @@ def compare_classification( if clf is None: clf = LogisticRegression() n_x = real.shape[0] - data = ( - sp_vstack((real, control)) if issparse(real) else np.vstack((real, control)) - ) - labels = np.concatenate( - [np.full(real.shape[0], "comp"), np.full(control.shape[0], "ctrl")] - ) + data = sp_vstack((real, control)) if issparse(real) else np.vstack((real, control)) + labels = np.concatenate([np.full(real.shape[0], "comp"), np.full(control.shape[0], "ctrl")]) clf.fit(data, labels) - norm_score = clf.score( - simulated, np.full(simulated.shape[0], "comp") - ) / clf.score(real, labels[:n_x]) + norm_score = clf.score(simulated, np.full(simulated.shape[0], "comp")) / clf.score(real, labels[:n_x]) norm_score = min(1.0, norm_score) return norm_score @@ -85,15 +79,9 @@ def compare_knn( n_y = simulated.shape[0] if control is None: - index_data = ( - sp_vstack((simulated, real)) - if issparse(real) - else np.vstack((simulated, real)) - ) + index_data = sp_vstack((simulated, real)) if issparse(real) else np.vstack((simulated, real)) else: - datas = ( - (simulated, real, control) if use_simulated_for_knn else (real, control) - ) + datas = (simulated, real, control) if use_simulated_for_knn else (real, control) index_data = sp_vstack(datas) if issparse(real) else np.vstack(datas) y_in_index = use_simulated_for_knn or control is None diff --git a/tests/tools/_differential_gene_expression/test_dge.py b/tests/tools/_differential_gene_expression/test_dge.py index 4addf51a..74e31d61 100644 --- a/tests/tools/_differential_gene_expression/test_dge.py +++ b/tests/tools/_differential_gene_expression/test_dge.py @@ -1,8 +1,8 @@ -from anndata import AnnData import numpy as np import pandas as pd import pertpy as pt import pytest +from anndata import AnnData @pytest.fixture @@ -64,9 +64,7 @@ def test_error_missing_df(dataframe): def test_key(adata): pt_DGE = pt.tl.DGE() - results = pt_DGE.compare( - adata=adata, de_key1="de_key1", de_key2="de_key2", shared_top=5 - ) + results = pt_DGE.compare(adata=adata, de_key1="de_key1", de_key2="de_key2", shared_top=5) assert "shared_top_genes" in results assert "scores_corr" in results assert "pvals_adj_corr" in results diff --git a/tests/tools/_perturbation_space/test_comparison.py b/tests/tools/_perturbation_space/test_comparison.py index 7f291f2b..1751a5f1 100644 --- a/tests/tools/_perturbation_space/test_comparison.py +++ b/tests/tools/_perturbation_space/test_comparison.py @@ -1,5 +1,5 @@ -import pytest import pertpy as pt +import pytest @pytest.fixture From 69163ffd5d73974f5d93a2b5fdca2cd5131aa6c3 Mon Sep 17 00:00:00 2001 From: wxicu Date: Wed, 19 Jun 2024 20:25:53 +0200 Subject: [PATCH 30/34] pin numpy <2 --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7139813d..728f12b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,8 @@ dependencies = [ "pubchempy", "pyarrow", "blitzgsea", - "lamin_utils" + "lamin_utils", + "numpy<2.0.0" ] [project.optional-dependencies] From 67c54be63f444a5447b543f0a86d5464ee125492 Mon Sep 17 00:00:00 2001 From: wxicu Date: Thu, 20 Jun 2024 20:21:54 +0200 Subject: [PATCH 31/34] unpin numpy --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 728f12b5..7139813d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,8 +62,7 @@ dependencies = [ "pubchempy", "pyarrow", "blitzgsea", - "lamin_utils", - "numpy<2.0.0" + "lamin_utils" ] [project.optional-dependencies] From 6e32f379b32df149e1543813af23f98060f7fb14 Mon Sep 17 00:00:00 2001 From: wxicu Date: Thu, 20 Jun 2024 22:02:36 +0200 Subject: [PATCH 32/34] speed up mahalanobis distance --- pertpy/tools/_distances/_distances.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index ee0f6e64..b7edf15d 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -1185,11 +1185,13 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return mahalanobis( - self.aggregation_func(X, axis=0), - self.aggregation_func(Y, axis=0), - np.linalg.inv(np.cov(X.T)), - ) + delta = self.aggregation_func(X, axis=0) - self.aggregation_func(Y, axis=0) + cov = np.cov(X.T) + s, u = np.linalg.eigh(cov) + ci = u @ (1 / s[..., None] * u.T) + return np.sqrt(np.sum(((delta @ ci) * delta), axis=-1)) + # ci = np.linalg.pinv(cov) + # return np.sqrt(np.sum(((delta @ ci) * delta), axis=-1)) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.") From 620e6453205c24550e75be83a46419cffea273e6 Mon Sep 17 00:00:00 2001 From: wxicu Date: Thu, 20 Jun 2024 22:58:33 +0200 Subject: [PATCH 33/34] use scipy to calculate mahalanobis distance --- .../_dge_comparison.py | 2 +- pertpy/tools/_distances/_distances.py | 14 ++++++-------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pertpy/tools/_differential_gene_expression/_dge_comparison.py b/pertpy/tools/_differential_gene_expression/_dge_comparison.py index bd8c10fe..34013aac 100644 --- a/pertpy/tools/_differential_gene_expression/_dge_comparison.py +++ b/pertpy/tools/_differential_gene_expression/_dge_comparison.py @@ -40,7 +40,7 @@ def compare( else: # use dfs if de_df1 is None or de_df2 is None: - raise ValueError("Both `de_df1` and `de_df2` must be provided together if using dataframes.") + raise ValueError("Both `de_df1` and `de_df2` must be provided together if using DataFrames.") if de_key1: if not adata: diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py index b7edf15d..5cde82d5 100644 --- a/pertpy/tools/_distances/_distances.py +++ b/pertpy/tools/_distances/_distances.py @@ -585,7 +585,7 @@ def _bootstrap_mode_precomputed(self, sub_pwd, sub_idx, n_bootstraps=100, random distances = [] for _ in range(n_bootstraps): - # in order to maintain the number of cells for both groups (whatever balancing they may have), + # To maintain the number of cells for both groups (whatever balancing they may have), # we sample the positive and negative indices separately bootstrap_pos_idx = rng.choice(a=sub_idx[sub_idx].index, size=sub_idx[sub_idx].size, replace=True) bootstrap_neg_idx = rng.choice(a=sub_idx[~sub_idx].index, size=sub_idx[~sub_idx].size, replace=True) @@ -1185,13 +1185,11 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None: self.aggregation_func = aggregation_func def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - delta = self.aggregation_func(X, axis=0) - self.aggregation_func(Y, axis=0) - cov = np.cov(X.T) - s, u = np.linalg.eigh(cov) - ci = u @ (1 / s[..., None] * u.T) - return np.sqrt(np.sum(((delta @ ci) * delta), axis=-1)) - # ci = np.linalg.pinv(cov) - # return np.sqrt(np.sum(((delta @ ci) * delta), axis=-1)) + return mahalanobis( + self.aggregation_func(X, axis=0), + self.aggregation_func(Y, axis=0), + np.linalg.inv(np.cov(X.T)), + ) def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float: raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.") From 10d348370c17eeff4cec620c457b626b644a4950 Mon Sep 17 00:00:00 2001 From: wxicu Date: Sun, 23 Jun 2024 18:46:37 +0200 Subject: [PATCH 34/34] rename DGE to DGEEVAL --- pertpy/tools/__init__.py | 2 +- pertpy/tools/_differential_gene_expression/__init__.py | 2 +- .../_differential_gene_expression/_dge_comparison.py | 4 +--- tests/tools/_differential_gene_expression/test_dge.py | 10 +++++----- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/pertpy/tools/__init__.py b/pertpy/tools/__init__.py index 18b11f1d..4e2c709e 100644 --- a/pertpy/tools/__init__.py +++ b/pertpy/tools/__init__.py @@ -4,7 +4,7 @@ from pertpy.tools._coda._tasccoda import Tasccoda from pertpy.tools._dialogue import Dialogue from pertpy.tools._differential_gene_expression import ( - DGE, + DGEEVAL, EdgeR, PyDESeq2, Statsmodels, diff --git a/pertpy/tools/_differential_gene_expression/__init__.py b/pertpy/tools/_differential_gene_expression/__init__.py index b349cedb..35ccef51 100644 --- a/pertpy/tools/_differential_gene_expression/__init__.py +++ b/pertpy/tools/_differential_gene_expression/__init__.py @@ -1,5 +1,5 @@ from ._base import ContrastType, LinearModelBase, MethodBase -from ._dge_comparison import DGE +from ._dge_comparison import DGEEVAL from ._edger import EdgeR from ._pydeseq2 import PyDESeq2 from ._simple_tests import SimpleComparisonBase, TTest, WilcoxonTest diff --git a/pertpy/tools/_differential_gene_expression/_dge_comparison.py b/pertpy/tools/_differential_gene_expression/_dge_comparison.py index 34013aac..0492dcc5 100644 --- a/pertpy/tools/_differential_gene_expression/_dge_comparison.py +++ b/pertpy/tools/_differential_gene_expression/_dge_comparison.py @@ -1,11 +1,9 @@ -from typing import Dict - import numpy as np import pandas as pd from anndata import AnnData -class DGE: +class DGEEVAL: def compare( self, adata: AnnData | None = None, diff --git a/tests/tools/_differential_gene_expression/test_dge.py b/tests/tools/_differential_gene_expression/test_dge.py index 74e31d61..1f9d3eb8 100644 --- a/tests/tools/_differential_gene_expression/test_dge.py +++ b/tests/tools/_differential_gene_expression/test_dge.py @@ -46,24 +46,24 @@ def dataframe(rng): def test_error_both_keys_and_dfs(adata, dataframe): with pytest.raises(ValueError): - pt_DGE = pt.tl.DGE() + pt_DGE = pt.tl.DGEEVAL() pt_DGE.compare(adata=adata, de_key1="de_key1", de_df1=dataframe[0]) def test_error_missing_adata(): with pytest.raises(ValueError): - pt_DGE = pt.tl.DGE() + pt_DGE = pt.tl.DGEEVAL() pt_DGE.compare(de_key1="de_key1", de_key2="de_key2") def test_error_missing_df(dataframe): with pytest.raises(ValueError): - pt_DGE = pt.tl.DGE() + pt_DGE = pt.tl.DGEEVAL() pt_DGE.compare(de_df1=dataframe[0]) def test_key(adata): - pt_DGE = pt.tl.DGE() + pt_DGE = pt.tl.DGEEVAL() results = pt_DGE.compare(adata=adata, de_key1="de_key1", de_key2="de_key2", shared_top=5) assert "shared_top_genes" in results assert "scores_corr" in results @@ -72,7 +72,7 @@ def test_key(adata): def test_df(dataframe): - pt_DGE = pt.tl.DGE() + pt_DGE = pt.tl.DGEEVAL() results = pt_DGE.compare(de_df1=dataframe[0], de_df2=dataframe[1], shared_top=5) assert "shared_top_genes" in results assert "scores_corr" in results