From fd21dc1a7deee129545c5ee7c22732ee5a565fa1 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Sun, 26 May 2024 13:42:37 +0200
Subject: [PATCH 01/34] add two distance metrics

---
 pertpy/tools/_distances/_distances.py         | 146 +++++++++++++++++-
 tests/tools/_distances/test_distance_tests.py |   2 +
 tests/tools/_distances/test_distances.py      |  18 ++-
 3 files changed, 159 insertions(+), 7 deletions(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 348f1f78..e3920c17 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -12,16 +12,50 @@
 from ott.solvers.linear.sinkhorn import Sinkhorn
 from pandas import Series
 from rich.progress import track
-from scipy.sparse import issparse
-from scipy.spatial.distance import cosine
+from scipy.sparse import csr_matrix, issparse
+from scipy.spatial.distance import cosine, mahalanobis
 from scipy.special import gammaln
 from scipy.stats import kendalltau, kstest, pearsonr, spearmanr
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import pairwise_distances, r2_score
 from sklearn.metrics.pairwise import polynomial_kernel, rbf_kernel
+from sklearn.neighbors import KernelDensity
 from statsmodels.discrete.discrete_model import NegativeBinomialP
 
+
+def compute_medoid(arr, axis=None):
+    if len(arr) == 0:
+        return None  # Handle the case when the array is empty
+
+    if axis is not None:
+        # If axis is specified, compute the medoid along that axis
+        return np.apply_along_axis(lambda x: compute_medoid(x), axis, arr)
+
+    # Calculate pairwise distances between all elements
+    distances = np.abs(arr[:, np.newaxis] - arr)
+
+    # Calculate the total distance for each element
+    total_distances = np.sum(distances, axis=1)
+
+    # Find the index of the element with the smallest total distance (medoid)
+    medoid_index = np.argmin(total_distances)
+
+    # Return the medoid value
+    medoid = arr[medoid_index]
+
+    return medoid
+
+
+AGG_FCTS = {
+    "mean": np.mean,
+    "median": np.median,
+    "medoid": compute_medoid,
+    "variance": np.var,
+}
+
 if TYPE_CHECKING:
+    from collections.abc import Callable
+
     from anndata import AnnData
 
 
@@ -80,6 +114,9 @@ class Distance:
         Average of the classification probability of the perturbation for a binary classifier.
     - "classifier_cp": classifier class projection
         Average of the class
+    - "mean_var_distn": Distance between mean-var distibutions of gene expression.
+    - "mahalanobis": Mahalanobis distance between the cells of two groups.
+
 
     Attributes:
         metric: Name of distance metric.
@@ -99,6 +136,7 @@ class Distance:
     def __init__(
         self,
         metric: str = "edistance",
+        agg_fct: str = "mean",
         layer_key: str = None,
         obsm_key: str = None,
         cell_wise_metric: str = "euclidean",
@@ -107,6 +145,7 @@ def __init__(
 
         Args:
             metric: Distance metric to use. Defaults to "edistance".
+            agg_fct: Name of the aggregation function to generate pseodobulk vectors. Defaults to "mean".
             layer_key: Name of the counts layer containing raw counts to calculate distances for.
                               Mutually exclusive with 'obsm_key'.
                               Defaults to None and is then not used.
@@ -116,6 +155,7 @@ def __init__(
             cell_wise_metric: Metric from scipy.spatial.distance to use for pairwise distances between single cells.
                                 Defaults to "euclidean".
         """
+        self.aggregation_func = AGG_FCTS[agg_fct]
         metric_fct: AbstractDistance = None
         if metric == "edistance":
             metric_fct = Edistance()
@@ -155,6 +195,10 @@ def __init__(
             metric_fct = ClassifierProbaDistance()
         elif metric == "classifier_cp":
             metric_fct = ClassifierClassProjection()
+        elif metric == "mean_var_distn":
+            metric_fct = MeanVarDistnDistance()
+        elif metric == "mahalanobis":
+            metric_fct = MahalanobisDistance(self.aggregation_func)
         else:
             raise ValueError(f"Metric {metric} not recognized.")
         self.metric_fct = metric_fct
@@ -315,7 +359,13 @@ def onesided_distances(
         """
         if self.metric == "classifier_cp":
             return self.metric_fct.onesided_distances(  # type: ignore
-                adata, groupby, selected_group, groups, show_progressbar, n_jobs, **kwargs
+                adata,
+                groupby,
+                selected_group,
+                groups,
+                show_progressbar,
+                n_jobs,
+                **kwargs,
             )
 
         groups = adata.obs[groupby].unique() if groups is None else groups
@@ -857,3 +907,93 @@ def onesided_distances(
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.")
+
+
+class MeanVarDistnDistance(AbstractDistance):
+    """
+    Distance betweenv mean-var distributions of gene expression.
+
+    """
+
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
+        super().__init__()
+        self.accepts_precomputed = False
+
+    def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
+        """
+        Difference of mean-var distibutions in 2 matrices
+        :param X,Y: Matrices of cells*genes, normalized and log transformed
+        """
+
+        # Get log mean & var for comparison
+        def mean_var(x, log: bool = False):
+            mean = np.mean(x, axis=0)
+            var = np.var(x, axis=0)
+            positive = mean > 0
+            mean = mean[positive]
+            var = var[positive]
+            if log:
+                mean = np.log(mean)
+                var = np.log(var)
+            return mean, var
+
+        def prep_kde_data(x, y):
+            return np.concatenate([x.reshape(-1, 1), y.reshape(-1, 1)], axis=1)
+
+        mean_x, var_x = mean_var(X, log=True)
+        x = prep_kde_data(mean_x, var_x)
+        mean_y, var_y = mean_var(Y, log=True)
+        y = prep_kde_data(mean_y, var_y)
+
+        def grid_points(d, n_points=100):
+            # Make grid, add 1 bin on lower/upper end to get final n_points
+            d_min = d.min()
+            d_max = d.max()
+            # Compute bin size
+            d_bin = (d_max - d_min) / (n_points - 2)
+            d_min = d_min - d_bin
+            d_max = d_max + d_bin
+            return np.arange(start=d_min + 0.5 * d_bin, stop=d_max, step=d_bin)
+
+        # Gridpoints to eval KDE on
+        mean_grid = grid_points(np.concatenate([mean_x, mean_y]))
+        var_grid = grid_points(np.concatenate([var_x, var_y]))
+        grid = np.array(np.meshgrid(mean_grid, var_grid)).T.reshape(-1, 2)
+
+        # KDE
+        def kde_eval(d, grid):
+            # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out
+            # can not be compared well on regions further away from the data as they are -inf
+            return KernelDensity(bandwidth="silverman", kernel="exponential").fit(d).score_samples(grid)
+
+        kde_x = kde_eval(x, grid)
+        kde_y = kde_eval(y, grid)
+
+        # KDE difference
+        kde_diff = ((kde_x - kde_y) ** 2).mean()
+
+        return kde_diff
+
+    def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
+        raise NotImplementedError("MeanVarDistnDistance cannot be called on a pairwise distance matrix.")
+
+
+class MahalanobisDistance(AbstractDistance):
+    """
+    Mahalanobis distance between pseudobulk vectors.
+    """
+
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
+        super().__init__()
+        self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
+
+    def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
+        return mahalanobis(
+            self.aggregation_func(X, axis=0),
+            self.aggregation_func(Y, axis=0),
+            np.linalg.inv(np.cov(X.T)),
+        )
+
+    def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
+        raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.")
diff --git a/tests/tools/_distances/test_distance_tests.py b/tests/tools/_distances/test_distance_tests.py
index 22c3e93f..573e9e0f 100644
--- a/tests/tools/_distances/test_distance_tests.py
+++ b/tests/tools/_distances/test_distance_tests.py
@@ -22,6 +22,8 @@
     "classifier_proba",
     # "classifier_cp",
     # "nbll",
+    "mahalanobis",
+    "mean_var_distn",
 ]
 
 count_distances = ["nb_ll"]
diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py
index 95c81225..bd67efac 100644
--- a/tests/tools/_distances/test_distances.py
+++ b/tests/tools/_distances/test_distances.py
@@ -20,32 +20,42 @@
     "spearman_distance",
     "t_test",
     "wasserstein",
+    "mahalanobis",
 ]
 semi_distances = ["r2_distance", "sym_kldiv", "ks_test"]
 non_distances = ["classifier_proba"]
 onesided_only = ["classifier_cp"]
 pseudo_counts_distances = ["nb_ll"]
+lognorm_counts_distances = ["mean_var_distn"]
 all_distances = actual_distances + semi_distances + non_distances + pseudo_counts_distances  # + onesided_only
 
 
 @pytest.fixture
 def all_pairwise_distances():
     all_calulated_distances = {}
-    no_subsample_distances = ["sym_kldiv", "t_test", "ks_test", "classifier_proba", "classifier_cp"]
+    no_subsample_distances = [
+        "sym_kldiv",
+        "t_test",
+        "ks_test",
+        "classifier_proba",
+        "classifier_cp",
+        "mahalanobis",
+    ]
 
     for distance in all_distances:
         adata = pt.dt.distance_example()
         if distance not in no_subsample_distances:
             adata = sc.pp.subsample(adata, 0.001, copy=True)
-        else:
+        elif distance != "mahalanobis":
             adata = sc.pp.subsample(adata, 0.1, copy=True)
 
         adata.layers["lognorm"] = adata.X.copy()
         adata.layers["counts"] = np.round(adata.X.toarray()).astype(int)
         if "X_pca" not in adata.obsm.keys():
             sc.pp.pca(adata, n_comps=5)
-
-        if distance in pseudo_counts_distances:
+        if distance in lognorm_counts_distances:
+            Distance = pt.tl.Distance(distance, layer_key="lognorm")
+        elif distance in pseudo_counts_distances:
             Distance = pt.tl.Distance(distance, layer_key="counts")
         else:
             Distance = pt.tl.Distance(distance, obsm_key="X_pca")

From 3af7d8990eaf1e6e50e48cbe238e47289b78a34c Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Sun, 26 May 2024 22:50:38 +0200
Subject: [PATCH 02/34] add obsm_key param to distance test

---
 pertpy/tools/_distances/_distance_tests.py |  13 +-
 pertpy/tools/_distances/_distances.py      | 131 ++++++++++++++++-----
 2 files changed, 110 insertions(+), 34 deletions(-)

diff --git a/pertpy/tools/_distances/_distance_tests.py b/pertpy/tools/_distances/_distance_tests.py
index 76418139..ad29c311 100644
--- a/pertpy/tools/_distances/_distance_tests.py
+++ b/pertpy/tools/_distances/_distance_tests.py
@@ -66,11 +66,14 @@ def __init__(
         self.alpha = alpha
         self.correction = correction
         self.cell_wise_metric = (
-            cell_wise_metric if cell_wise_metric else Distance(self.metric, self.obsm_key).cell_wise_metric
+            cell_wise_metric if cell_wise_metric else Distance(self.metric, obsm_key=self.obsm_key).cell_wise_metric
         )
 
         self.distance = Distance(
-            self.metric, layer_key=self.layer_key, obsm_key=self.obsm_key, cell_wise_metric=self.cell_wise_metric
+            self.metric,
+            layer_key=self.layer_key,
+            obsm_key=self.obsm_key,
+            cell_wise_metric=self.cell_wise_metric,
         )
 
     def __call__(
@@ -176,7 +179,8 @@ def test_xy(self, adata: AnnData, groupby: str, contrast: str, show_progressbar:
         # Evaluate the test
         # count times shuffling resulted in larger distance
         comparison_results = np.array(
-            pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, dtype=int
+            pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0,
+            dtype=int,
         )
         n_failures = pd.Series(np.clip(np.sum(comparison_results, axis=1), 1, np.inf), index=df.index)
         pvalues = n_failures / self.n_perms
@@ -284,7 +288,8 @@ def test_precomputed(self, adata: AnnData, groupby: str, contrast: str, verbose:
         # Evaluate the test
         # count times shuffling resulted in larger distance
         comparison_results = np.array(
-            pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, dtype=int
+            pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0,
+            dtype=int,
         )
         n_failures = pd.Series(np.clip(np.sum(comparison_results, axis=1), 1, np.inf), index=df.index)
         pvalues = n_failures / self.n_perms
diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index e3920c17..8eacbfd3 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -289,7 +289,10 @@ def pairwise(
         # able to handle precomputed distances such as the PseudobulkDistance.
         if self.metric_fct.accepts_precomputed:
             # Precompute the pairwise distances if needed
-            if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys():
+            if (
+                f"{self.obsm_key}_{self.cell_wise_metric}_predistances"
+                not in adata.obsp.keys()
+            ):
                 self.precompute_distances(adata, n_jobs=n_jobs, **kwargs)
             pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"]
             for index_x, group_x in enumerate(fct(groups)):
@@ -302,7 +305,9 @@ def pairwise(
                         # subset the pairwise distance matrix to the two groups
                         sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y]
                         sub_idx = grouping[idx_x | idx_y] == group_x
-                        dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs)
+                        dist = self.metric_fct.from_precomputed(
+                            sub_pwd, sub_idx, **kwargs
+                        )
                     df.loc[group_x, group_y] = dist
                     df.loc[group_y, group_x] = dist
         else:
@@ -380,7 +385,10 @@ def onesided_distances(
         # able to handle precomputed distances such as the PsuedobulkDistance.
         if self.metric_fct.accepts_precomputed:
             # Precompute the pairwise distances if needed
-            if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys():
+            if (
+                f"{self.obsm_key}_{self.cell_wise_metric}_predistances"
+                not in adata.obsp.keys()
+            ):
                 self.precompute_distances(adata, n_jobs=n_jobs, **kwargs)
             pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"]
             for group_x in fct(groups):
@@ -434,7 +442,9 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None:
             cells = adata.layers[self.layer_key]
         else:
             cells = adata.obsm[self.obsm_key].copy()
-        pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs)
+        pwd = pairwise_distances(
+            cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs
+        )
         adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd
 
 
@@ -503,7 +513,9 @@ def __init__(self) -> None:
         super().__init__()
         self.accepts_precomputed = False
 
-    def __call__(self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs) -> float:
+    def __call__(
+        self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs
+    ) -> float:
         if kernel == "linear":
             XX = np.dot(X, X.T)
             YY = np.dot(Y, Y.T)
@@ -558,7 +570,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("EuclideanDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "EuclideanDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class MeanSquaredDistance(AbstractDistance):
@@ -569,10 +583,15 @@ def __init__(self) -> None:
         self.accepts_precomputed = False
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 / X.shape[1]
+        return (
+            np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2
+            / X.shape[1]
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "MeanSquaredDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class MeanAbsoluteDistance(AbstractDistance):
@@ -583,10 +602,15 @@ def __init__(self) -> None:
         self.accepts_precomputed = False
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) / X.shape[1]
+        return (
+            np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs)
+            / X.shape[1]
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "MeanAbsoluteDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class MeanPairwiseDistance(AbstractDistance):
@@ -616,7 +640,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return 1 - pearsonr(X.mean(axis=0), Y.mean(axis=0))[0]
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("PearsonDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "PearsonDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class SpearmanDistance(AbstractDistance):
@@ -630,7 +656,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return 1 - spearmanr(X.mean(axis=0), Y.mean(axis=0))[0]
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("SpearmanDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "SpearmanDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class KendallTauDistance(AbstractDistance):
@@ -648,7 +676,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return tau_dist
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("KendallTauDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "KendallTauDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class CosineDistance(AbstractDistance):
@@ -662,7 +692,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return cosine(X.mean(axis=0), Y.mean(axis=0))
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("CosineDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "CosineDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class R2ScoreDistance(AbstractDistance):
@@ -678,7 +710,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return 1 - r2_score(X.mean(axis=0), Y.mean(axis=0))
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "R2ScoreDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class SymmetricKLDivergence(AbstractDistance):
@@ -699,13 +733,23 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa
         for i in range(X.shape[1]):
             x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon
             y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon
-            kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2
-            klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2
+            kl = (
+                np.log(y_std / x_std)
+                + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2)
+                - 1 / 2
+            )
+            klr = (
+                np.log(x_std / y_std)
+                + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2)
+                - 1 / 2
+            )
             kl_all.append(kl + klr)
         return sum(kl_all) / len(kl_all)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "SymmetricKLDivergence cannot be called on a pairwise distance matrix."
+        )
 
 
 class TTestDistance(AbstractDistance):
@@ -729,7 +773,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa
         return sum(t_test_all) / len(t_test_all)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "TTestDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class KSTestDistance(AbstractDistance):
@@ -746,7 +792,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return sum(stats) / len(stats)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "KSTestDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class NBLL(AbstractDistance):
@@ -761,7 +809,9 @@ def __init__(self) -> None:
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> float:
         def _is_count_matrix(matrix, tolerance=1e-6):
-            if matrix.dtype.kind == "i" or np.all(np.abs(matrix - np.round(matrix)) < tolerance):
+            if matrix.dtype.kind == "i" or np.all(
+                np.abs(matrix - np.round(matrix)) < tolerance
+            ):
                 return True
             else:
                 return False
@@ -770,7 +820,9 @@ def _is_count_matrix(matrix, tolerance=1e-6):
             raise ValueError("NBLL distance only works for raw counts.")
 
         @numba.jit(forceobj=True)
-        def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float:
+        def _compute_nll(
+            y: np.ndarray, nb_params: tuple[float, float], epsilon: float
+        ) -> float:
             mu = np.exp(nb_params[0])
             theta = 1 / nb_params[1]
             eps = epsilon
@@ -806,12 +858,16 @@ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float:
                 nlls.append(nll)
 
         if genes_skipped > X.shape[1] / 2:
-            raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.")
+            raise AttributeError(
+                f"{genes_skipped} genes could not be fit, which is over half."
+            )
 
         return -np.sum(nlls) / len(nlls)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "NBLL cannot be called on a pairwise distance matrix."
+        )
 
 
 def _sample(X, frac=None, n=None):
@@ -853,7 +909,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return np.mean(test_labels[:, 1])
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "ClassifierProbaDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class ClassifierClassProjection(AbstractDistance):
@@ -867,7 +925,9 @@ def __init__(self) -> None:
         self.accepts_precomputed = False
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.")
+        raise NotImplementedError(
+            "ClassifierClassProjection can currently only be called with onesided."
+        )
 
     def onesided_distances(
         self,
@@ -906,7 +966,9 @@ def onesided_distances(
         return df
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "ClassifierClassProjection cannot be called on a pairwise distance matrix."
+        )
 
 
 class MeanVarDistnDistance(AbstractDistance):
@@ -964,7 +1026,11 @@ def grid_points(d, n_points=100):
         def kde_eval(d, grid):
             # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out
             # can not be compared well on regions further away from the data as they are -inf
-            return KernelDensity(bandwidth="silverman", kernel="exponential").fit(d).score_samples(grid)
+            return (
+                KernelDensity(bandwidth="silverman", kernel="exponential")
+                .fit(d)
+                .score_samples(grid)
+            )
 
         kde_x = kde_eval(x, grid)
         kde_y = kde_eval(y, grid)
@@ -975,7 +1041,9 @@ def kde_eval(d, grid):
         return kde_diff
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("MeanVarDistnDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "MeanVarDistnDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class MahalanobisDistance(AbstractDistance):
@@ -989,6 +1057,7 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
+        print(np.linalg.inv(np.cov(X.T)).shape)
         return mahalanobis(
             self.aggregation_func(X, axis=0),
             self.aggregation_func(Y, axis=0),
@@ -996,4 +1065,6 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "Mahalanobis cannot be called on a pairwise distance matrix."
+        )

From 3fe911b0b57be7ded10ecd9b73a6457d6f819229 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 26 May 2024 20:50:50 +0000
Subject: [PATCH 03/34] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pertpy/tools/_distances/_distances.py | 130 ++++++--------------------
 1 file changed, 30 insertions(+), 100 deletions(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 8eacbfd3..4f4bc4d2 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -289,10 +289,7 @@ def pairwise(
         # able to handle precomputed distances such as the PseudobulkDistance.
         if self.metric_fct.accepts_precomputed:
             # Precompute the pairwise distances if needed
-            if (
-                f"{self.obsm_key}_{self.cell_wise_metric}_predistances"
-                not in adata.obsp.keys()
-            ):
+            if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys():
                 self.precompute_distances(adata, n_jobs=n_jobs, **kwargs)
             pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"]
             for index_x, group_x in enumerate(fct(groups)):
@@ -305,9 +302,7 @@ def pairwise(
                         # subset the pairwise distance matrix to the two groups
                         sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y]
                         sub_idx = grouping[idx_x | idx_y] == group_x
-                        dist = self.metric_fct.from_precomputed(
-                            sub_pwd, sub_idx, **kwargs
-                        )
+                        dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs)
                     df.loc[group_x, group_y] = dist
                     df.loc[group_y, group_x] = dist
         else:
@@ -385,10 +380,7 @@ def onesided_distances(
         # able to handle precomputed distances such as the PsuedobulkDistance.
         if self.metric_fct.accepts_precomputed:
             # Precompute the pairwise distances if needed
-            if (
-                f"{self.obsm_key}_{self.cell_wise_metric}_predistances"
-                not in adata.obsp.keys()
-            ):
+            if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys():
                 self.precompute_distances(adata, n_jobs=n_jobs, **kwargs)
             pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"]
             for group_x in fct(groups):
@@ -442,9 +434,7 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None:
             cells = adata.layers[self.layer_key]
         else:
             cells = adata.obsm[self.obsm_key].copy()
-        pwd = pairwise_distances(
-            cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs
-        )
+        pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs)
         adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd
 
 
@@ -513,9 +503,7 @@ def __init__(self) -> None:
         super().__init__()
         self.accepts_precomputed = False
 
-    def __call__(
-        self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs
-    ) -> float:
+    def __call__(self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs) -> float:
         if kernel == "linear":
             XX = np.dot(X, X.T)
             YY = np.dot(Y, Y.T)
@@ -570,9 +558,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "EuclideanDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("EuclideanDistance cannot be called on a pairwise distance matrix.")
 
 
 class MeanSquaredDistance(AbstractDistance):
@@ -583,15 +569,10 @@ def __init__(self) -> None:
         self.accepts_precomputed = False
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return (
-            np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2
-            / X.shape[1]
-        )
+        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 / X.shape[1]
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "MeanSquaredDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.")
 
 
 class MeanAbsoluteDistance(AbstractDistance):
@@ -602,15 +583,10 @@ def __init__(self) -> None:
         self.accepts_precomputed = False
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return (
-            np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs)
-            / X.shape[1]
-        )
+        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) / X.shape[1]
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "MeanAbsoluteDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.")
 
 
 class MeanPairwiseDistance(AbstractDistance):
@@ -640,9 +616,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return 1 - pearsonr(X.mean(axis=0), Y.mean(axis=0))[0]
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "PearsonDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("PearsonDistance cannot be called on a pairwise distance matrix.")
 
 
 class SpearmanDistance(AbstractDistance):
@@ -656,9 +630,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return 1 - spearmanr(X.mean(axis=0), Y.mean(axis=0))[0]
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "SpearmanDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("SpearmanDistance cannot be called on a pairwise distance matrix.")
 
 
 class KendallTauDistance(AbstractDistance):
@@ -676,9 +648,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return tau_dist
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "KendallTauDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("KendallTauDistance cannot be called on a pairwise distance matrix.")
 
 
 class CosineDistance(AbstractDistance):
@@ -692,9 +662,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return cosine(X.mean(axis=0), Y.mean(axis=0))
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "CosineDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("CosineDistance cannot be called on a pairwise distance matrix.")
 
 
 class R2ScoreDistance(AbstractDistance):
@@ -710,9 +678,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return 1 - r2_score(X.mean(axis=0), Y.mean(axis=0))
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "R2ScoreDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.")
 
 
 class SymmetricKLDivergence(AbstractDistance):
@@ -733,23 +699,13 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa
         for i in range(X.shape[1]):
             x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon
             y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon
-            kl = (
-                np.log(y_std / x_std)
-                + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2)
-                - 1 / 2
-            )
-            klr = (
-                np.log(x_std / y_std)
-                + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2)
-                - 1 / 2
-            )
+            kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2
+            klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2
             kl_all.append(kl + klr)
         return sum(kl_all) / len(kl_all)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "SymmetricKLDivergence cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.")
 
 
 class TTestDistance(AbstractDistance):
@@ -773,9 +729,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa
         return sum(t_test_all) / len(t_test_all)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "TTestDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.")
 
 
 class KSTestDistance(AbstractDistance):
@@ -792,9 +746,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return sum(stats) / len(stats)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "KSTestDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.")
 
 
 class NBLL(AbstractDistance):
@@ -809,9 +761,7 @@ def __init__(self) -> None:
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> float:
         def _is_count_matrix(matrix, tolerance=1e-6):
-            if matrix.dtype.kind == "i" or np.all(
-                np.abs(matrix - np.round(matrix)) < tolerance
-            ):
+            if matrix.dtype.kind == "i" or np.all(np.abs(matrix - np.round(matrix)) < tolerance):
                 return True
             else:
                 return False
@@ -820,9 +770,7 @@ def _is_count_matrix(matrix, tolerance=1e-6):
             raise ValueError("NBLL distance only works for raw counts.")
 
         @numba.jit(forceobj=True)
-        def _compute_nll(
-            y: np.ndarray, nb_params: tuple[float, float], epsilon: float
-        ) -> float:
+        def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float:
             mu = np.exp(nb_params[0])
             theta = 1 / nb_params[1]
             eps = epsilon
@@ -858,16 +806,12 @@ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float:
                 nlls.append(nll)
 
         if genes_skipped > X.shape[1] / 2:
-            raise AttributeError(
-                f"{genes_skipped} genes could not be fit, which is over half."
-            )
+            raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.")
 
         return -np.sum(nlls) / len(nlls)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "NBLL cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.")
 
 
 def _sample(X, frac=None, n=None):
@@ -909,9 +853,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return np.mean(test_labels[:, 1])
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "ClassifierProbaDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.")
 
 
 class ClassifierClassProjection(AbstractDistance):
@@ -925,9 +867,7 @@ def __init__(self) -> None:
         self.accepts_precomputed = False
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "ClassifierClassProjection can currently only be called with onesided."
-        )
+        raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.")
 
     def onesided_distances(
         self,
@@ -966,9 +906,7 @@ def onesided_distances(
         return df
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "ClassifierClassProjection cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.")
 
 
 class MeanVarDistnDistance(AbstractDistance):
@@ -1026,11 +964,7 @@ def grid_points(d, n_points=100):
         def kde_eval(d, grid):
             # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out
             # can not be compared well on regions further away from the data as they are -inf
-            return (
-                KernelDensity(bandwidth="silverman", kernel="exponential")
-                .fit(d)
-                .score_samples(grid)
-            )
+            return KernelDensity(bandwidth="silverman", kernel="exponential").fit(d).score_samples(grid)
 
         kde_x = kde_eval(x, grid)
         kde_y = kde_eval(y, grid)
@@ -1041,9 +975,7 @@ def kde_eval(d, grid):
         return kde_diff
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "MeanVarDistnDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("MeanVarDistnDistance cannot be called on a pairwise distance matrix.")
 
 
 class MahalanobisDistance(AbstractDistance):
@@ -1065,6 +997,4 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "Mahalanobis cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.")

From 6d419c3c6dcd5b703b3ee795f845dd9b044fe779 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Mon, 3 Jun 2024 00:08:30 +0200
Subject: [PATCH 04/34] add agg fct

---
 pertpy/tools/_distances/_distances.py    | 209 +++++++++++++++++------
 tests/tools/_distances/test_distances.py |  22 ++-
 2 files changed, 171 insertions(+), 60 deletions(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 4f4bc4d2..5d50a9bb 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -160,23 +160,23 @@ def __init__(
         if metric == "edistance":
             metric_fct = Edistance()
         elif metric == "euclidean":
-            metric_fct = EuclideanDistance()
+            metric_fct = EuclideanDistance(self.aggregation_func)
         elif metric == "root_mean_squared_error":
-            metric_fct = EuclideanDistance()
+            metric_fct = EuclideanDistance(self.aggregation_func)
         elif metric == "mse":
-            metric_fct = MeanSquaredDistance()
+            metric_fct = MeanSquaredDistance(self.aggregation_func)
         elif metric == "mean_absolute_error":
-            metric_fct = MeanAbsoluteDistance()
+            metric_fct = MeanAbsoluteDistance(self.aggregation_func)
         elif metric == "pearson_distance":
-            metric_fct = PearsonDistance()
+            metric_fct = PearsonDistance(self.aggregation_func)
         elif metric == "spearman_distance":
-            metric_fct = SpearmanDistance()
+            metric_fct = SpearmanDistance(self.aggregation_func)
         elif metric == "kendalltau_distance":
-            metric_fct = KendallTauDistance()
+            metric_fct = KendallTauDistance(self.aggregation_func)
         elif metric == "cosine_distance":
-            metric_fct = CosineDistance()
+            metric_fct = CosineDistance(self.aggregation_func)
         elif metric == "r2_distance":
-            metric_fct = R2ScoreDistance()
+            metric_fct = R2ScoreDistance(self.aggregation_func)
         elif metric == "mean_pairwise":
             metric_fct = MeanPairwiseDistance()
         elif metric == "mmd":
@@ -289,7 +289,10 @@ def pairwise(
         # able to handle precomputed distances such as the PseudobulkDistance.
         if self.metric_fct.accepts_precomputed:
             # Precompute the pairwise distances if needed
-            if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys():
+            if (
+                f"{self.obsm_key}_{self.cell_wise_metric}_predistances"
+                not in adata.obsp.keys()
+            ):
                 self.precompute_distances(adata, n_jobs=n_jobs, **kwargs)
             pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"]
             for index_x, group_x in enumerate(fct(groups)):
@@ -302,7 +305,9 @@ def pairwise(
                         # subset the pairwise distance matrix to the two groups
                         sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y]
                         sub_idx = grouping[idx_x | idx_y] == group_x
-                        dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs)
+                        dist = self.metric_fct.from_precomputed(
+                            sub_pwd, sub_idx, **kwargs
+                        )
                     df.loc[group_x, group_y] = dist
                     df.loc[group_y, group_x] = dist
         else:
@@ -380,7 +385,10 @@ def onesided_distances(
         # able to handle precomputed distances such as the PsuedobulkDistance.
         if self.metric_fct.accepts_precomputed:
             # Precompute the pairwise distances if needed
-            if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys():
+            if (
+                f"{self.obsm_key}_{self.cell_wise_metric}_predistances"
+                not in adata.obsp.keys()
+            ):
                 self.precompute_distances(adata, n_jobs=n_jobs, **kwargs)
             pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"]
             for group_x in fct(groups):
@@ -434,7 +442,9 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None:
             cells = adata.layers[self.layer_key]
         else:
             cells = adata.obsm[self.obsm_key].copy()
-        pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs)
+        pwd = pairwise_distances(
+            cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs
+        )
         adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd
 
 
@@ -503,7 +513,9 @@ def __init__(self) -> None:
         super().__init__()
         self.accepts_precomputed = False
 
-    def __call__(self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs) -> float:
+    def __call__(
+        self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs
+    ) -> float:
         if kernel == "linear":
             XX = np.dot(X, X.T)
             YY = np.dot(Y, Y.T)
@@ -550,43 +562,62 @@ def solve_ot_problem(self, geom: Geometry, **kwargs):
 class EuclideanDistance(AbstractDistance):
     """Euclidean distance between pseudobulk vectors."""
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs)
+        return np.linalg.norm(
+            self.aggregation_func(X, axis=0) - self.aggregation_func(Y, axis=0),
+            ord=2,
+            **kwargs,
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("EuclideanDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "EuclideanDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class MeanSquaredDistance(AbstractDistance):
     """Mean squared distance between pseudobulk vectors."""
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 / X.shape[1]
+        return (
+            np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2
+            / X.shape[1]
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "MeanSquaredDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class MeanAbsoluteDistance(AbstractDistance):
     """Absolute (Norm-1) distance between pseudobulk vectors."""
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) / X.shape[1]
+        return (
+            np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs)
+            / X.shape[1]
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "MeanAbsoluteDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class MeanPairwiseDistance(AbstractDistance):
@@ -608,61 +639,85 @@ def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
 class PearsonDistance(AbstractDistance):
     """Pearson distance between pseudobulk vectors."""
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return 1 - pearsonr(X.mean(axis=0), Y.mean(axis=0))[0]
+        return (
+            1
+            - pearsonr(
+                self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
+            )[0]
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("PearsonDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "PearsonDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class SpearmanDistance(AbstractDistance):
     """Spearman distance between pseudobulk vectors."""
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return 1 - spearmanr(X.mean(axis=0), Y.mean(axis=0))[0]
+        return (
+            1
+            - spearmanr(
+                self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
+            )[0]
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("SpearmanDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "SpearmanDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class KendallTauDistance(AbstractDistance):
     """Kendall-tau distance between pseudobulk vectors."""
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        x, y = X.mean(axis=0), Y.mean(axis=0)
+        x, y = self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
         n = len(x)
         tau_corr = kendalltau(x, y).statistic
         tau_dist = (1 - tau_corr) * n * (n - 1) / 4
         return tau_dist
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("KendallTauDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "KendallTauDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class CosineDistance(AbstractDistance):
     """Cosine distance between pseudobulk vectors."""
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return cosine(X.mean(axis=0), Y.mean(axis=0))
+        return cosine(
+            self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("CosineDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "CosineDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class R2ScoreDistance(AbstractDistance):
@@ -670,15 +725,20 @@ class R2ScoreDistance(AbstractDistance):
 
     # NOTE: This is not a distance metric but a similarity metric.
 
-    def __init__(self) -> None:
+    def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
         self.accepts_precomputed = False
+        self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return 1 - r2_score(X.mean(axis=0), Y.mean(axis=0))
+        return 1 - r2_score(
+            self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "R2ScoreDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class SymmetricKLDivergence(AbstractDistance):
@@ -699,13 +759,23 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa
         for i in range(X.shape[1]):
             x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon
             y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon
-            kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2
-            klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2
+            kl = (
+                np.log(y_std / x_std)
+                + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2)
+                - 1 / 2
+            )
+            klr = (
+                np.log(x_std / y_std)
+                + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2)
+                - 1 / 2
+            )
             kl_all.append(kl + klr)
         return sum(kl_all) / len(kl_all)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "SymmetricKLDivergence cannot be called on a pairwise distance matrix."
+        )
 
 
 class TTestDistance(AbstractDistance):
@@ -729,7 +799,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa
         return sum(t_test_all) / len(t_test_all)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "TTestDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class KSTestDistance(AbstractDistance):
@@ -746,7 +818,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return sum(stats) / len(stats)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "KSTestDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class NBLL(AbstractDistance):
@@ -761,7 +835,9 @@ def __init__(self) -> None:
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> float:
         def _is_count_matrix(matrix, tolerance=1e-6):
-            if matrix.dtype.kind == "i" or np.all(np.abs(matrix - np.round(matrix)) < tolerance):
+            if matrix.dtype.kind == "i" or np.all(
+                np.abs(matrix - np.round(matrix)) < tolerance
+            ):
                 return True
             else:
                 return False
@@ -770,7 +846,9 @@ def _is_count_matrix(matrix, tolerance=1e-6):
             raise ValueError("NBLL distance only works for raw counts.")
 
         @numba.jit(forceobj=True)
-        def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float:
+        def _compute_nll(
+            y: np.ndarray, nb_params: tuple[float, float], epsilon: float
+        ) -> float:
             mu = np.exp(nb_params[0])
             theta = 1 / nb_params[1]
             eps = epsilon
@@ -806,12 +884,16 @@ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float:
                 nlls.append(nll)
 
         if genes_skipped > X.shape[1] / 2:
-            raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.")
+            raise AttributeError(
+                f"{genes_skipped} genes could not be fit, which is over half."
+            )
 
         return -np.sum(nlls) / len(nlls)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "NBLL cannot be called on a pairwise distance matrix."
+        )
 
 
 def _sample(X, frac=None, n=None):
@@ -853,7 +935,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return np.mean(test_labels[:, 1])
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "ClassifierProbaDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class ClassifierClassProjection(AbstractDistance):
@@ -867,7 +951,9 @@ def __init__(self) -> None:
         self.accepts_precomputed = False
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.")
+        raise NotImplementedError(
+            "ClassifierClassProjection can currently only be called with onesided."
+        )
 
     def onesided_distances(
         self,
@@ -884,6 +970,7 @@ def onesided_distances(
         Similar to the parent function, the returned dataframe contains only the specified groups.
         """
         groups = adata.obs[groupby].unique() if groups is None else groups
+        fct = track if show_progressbar else lambda iterable: iterable
 
         X = adata[adata.obs[groupby] != selected_group].X
         labels = adata[adata.obs[groupby] != selected_group].obs[groupby].values
@@ -894,7 +981,8 @@ def onesided_distances(
         test_probas = reg.predict_proba(Y)
 
         df = pd.Series(index=groups, dtype=float)
-        for group in groups:
+
+        for group in fct(groups):
             if group == selected_group:
                 df.loc[group] = 0
             else:
@@ -906,7 +994,9 @@ def onesided_distances(
         return df
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "ClassifierClassProjection cannot be called on a pairwise distance matrix."
+        )
 
 
 class MeanVarDistnDistance(AbstractDistance):
@@ -915,7 +1005,7 @@ class MeanVarDistnDistance(AbstractDistance):
 
     """
 
-    def __init__(self, aggregation_func: Callable = np.mean) -> None:
+    def __init__(self) -> None:
         super().__init__()
         self.accepts_precomputed = False
 
@@ -964,7 +1054,11 @@ def grid_points(d, n_points=100):
         def kde_eval(d, grid):
             # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out
             # can not be compared well on regions further away from the data as they are -inf
-            return KernelDensity(bandwidth="silverman", kernel="exponential").fit(d).score_samples(grid)
+            return (
+                KernelDensity(bandwidth="silverman", kernel="exponential")
+                .fit(d)
+                .score_samples(grid)
+            )
 
         kde_x = kde_eval(x, grid)
         kde_y = kde_eval(y, grid)
@@ -975,7 +1069,9 @@ def kde_eval(d, grid):
         return kde_diff
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("MeanVarDistnDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "MeanVarDistnDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class MahalanobisDistance(AbstractDistance):
@@ -989,7 +1085,6 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        print(np.linalg.inv(np.cov(X.T)).shape)
         return mahalanobis(
             self.aggregation_func(X, axis=0),
             self.aggregation_func(Y, axis=0),
@@ -997,4 +1092,6 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "Mahalanobis cannot be called on a pairwise distance matrix."
+        )
diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py
index bd67efac..ccf9b4b9 100644
--- a/tests/tools/_distances/test_distances.py
+++ b/tests/tools/_distances/test_distances.py
@@ -27,7 +27,13 @@
 onesided_only = ["classifier_cp"]
 pseudo_counts_distances = ["nb_ll"]
 lognorm_counts_distances = ["mean_var_distn"]
-all_distances = actual_distances + semi_distances + non_distances + pseudo_counts_distances  # + onesided_only
+all_distances = (
+    actual_distances
+    + semi_distances
+    + non_distances
+    + pseudo_counts_distances
+    + lognorm_counts_distances
+)  # + onesided_only
 
 
 @pytest.fixture
@@ -40,6 +46,7 @@ def all_pairwise_distances():
         "classifier_proba",
         "classifier_cp",
         "mahalanobis",
+        "mean_var_distn",
     ]
 
     for distance in all_distances:
@@ -90,7 +97,10 @@ def test_triangle_inequality(all_pairwise_distances):
         for _i in range(10):
             rng = np.random.default_rng()
             triplet = rng.choice(df.index, size=3, replace=False)
-            assert df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] >= df.loc[triplet[0], triplet[2]]
+            assert (
+                df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]]
+                >= df.loc[triplet[0], triplet[2]]
+            )
 
 
 def test_distance_layers(all_pairwise_distances):
@@ -120,7 +130,9 @@ def test_distance_output_type(all_pairwise_distances):
     # Test if distances are outputting floats
     for distance in all_distances:
         df = all_pairwise_distances[distance]
-        assert df.apply(lambda col: pd.api.types.is_float_dtype(col)).all(), "Not all values are floats."
+        assert df.apply(
+            lambda col: pd.api.types.is_float_dtype(col)
+        ).all(), "Not all values are floats."
 
 
 def test_distance_pairwise(all_pairwise_distances):
@@ -141,7 +153,9 @@ def test_distance_onesided():
 
     for distance in onesided_only:
         Distance = pt.tl.Distance(distance, obsm_key="X_pca")
-        df = Distance.onesided_distances(adata, groupby="perturbation", selected_group=selected_group)
+        df = Distance.onesided_distances(
+            adata, groupby="perturbation", selected_group=selected_group
+        )
 
         assert isinstance(df, Series)
         assert df.loc[selected_group] == 0  # distance to self is 0

From ca860256bef6d0a43338f4f538e27139204019a6 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Mon, 3 Jun 2024 14:04:51 +0200
Subject: [PATCH 05/34] speed up tests

---
 pertpy/tools/_distances/_distances.py    | 194 ++++++-----------------
 tests/tools/_distances/test_distances.py |  32 ++--
 2 files changed, 65 insertions(+), 161 deletions(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 5d50a9bb..c2bcfd73 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import multiprocessing
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING
 
@@ -25,22 +26,14 @@
 
 def compute_medoid(arr, axis=None):
     if len(arr) == 0:
-        return None  # Handle the case when the array is empty
+        return None
 
     if axis is not None:
-        # If axis is specified, compute the medoid along that axis
         return np.apply_along_axis(lambda x: compute_medoid(x), axis, arr)
 
-    # Calculate pairwise distances between all elements
     distances = np.abs(arr[:, np.newaxis] - arr)
-
-    # Calculate the total distance for each element
     total_distances = np.sum(distances, axis=1)
-
-    # Find the index of the element with the smallest total distance (medoid)
     medoid_index = np.argmin(total_distances)
-
-    # Return the medoid value
     medoid = arr[medoid_index]
 
     return medoid
@@ -289,10 +282,7 @@ def pairwise(
         # able to handle precomputed distances such as the PseudobulkDistance.
         if self.metric_fct.accepts_precomputed:
             # Precompute the pairwise distances if needed
-            if (
-                f"{self.obsm_key}_{self.cell_wise_metric}_predistances"
-                not in adata.obsp.keys()
-            ):
+            if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys():
                 self.precompute_distances(adata, n_jobs=n_jobs, **kwargs)
             pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"]
             for index_x, group_x in enumerate(fct(groups)):
@@ -305,9 +295,7 @@ def pairwise(
                         # subset the pairwise distance matrix to the two groups
                         sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y]
                         sub_idx = grouping[idx_x | idx_y] == group_x
-                        dist = self.metric_fct.from_precomputed(
-                            sub_pwd, sub_idx, **kwargs
-                        )
+                        dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs)
                     df.loc[group_x, group_y] = dist
                     df.loc[group_y, group_x] = dist
         else:
@@ -385,10 +373,7 @@ def onesided_distances(
         # able to handle precomputed distances such as the PsuedobulkDistance.
         if self.metric_fct.accepts_precomputed:
             # Precompute the pairwise distances if needed
-            if (
-                f"{self.obsm_key}_{self.cell_wise_metric}_predistances"
-                not in adata.obsp.keys()
-            ):
+            if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys():
                 self.precompute_distances(adata, n_jobs=n_jobs, **kwargs)
             pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"]
             for group_x in fct(groups):
@@ -442,9 +427,7 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None:
             cells = adata.layers[self.layer_key]
         else:
             cells = adata.obsm[self.obsm_key].copy()
-        pwd = pairwise_distances(
-            cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs
-        )
+        pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs)
         adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd
 
 
@@ -513,9 +496,7 @@ def __init__(self) -> None:
         super().__init__()
         self.accepts_precomputed = False
 
-    def __call__(
-        self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs
-    ) -> float:
+    def __call__(self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs) -> float:
         if kernel == "linear":
             XX = np.dot(X, X.T)
             YY = np.dot(Y, Y.T)
@@ -575,9 +556,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "EuclideanDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("EuclideanDistance cannot be called on a pairwise distance matrix.")
 
 
 class MeanSquaredDistance(AbstractDistance):
@@ -589,15 +568,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return (
-            np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2
-            / X.shape[1]
-        )
+        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 / X.shape[1]
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "MeanSquaredDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.")
 
 
 class MeanAbsoluteDistance(AbstractDistance):
@@ -609,15 +583,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return (
-            np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs)
-            / X.shape[1]
-        )
+        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) / X.shape[1]
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "MeanAbsoluteDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.")
 
 
 class MeanPairwiseDistance(AbstractDistance):
@@ -645,17 +614,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return (
-            1
-            - pearsonr(
-                self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
-            )[0]
-        )
+        return 1 - pearsonr(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))[0]
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "PearsonDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("PearsonDistance cannot be called on a pairwise distance matrix.")
 
 
 class SpearmanDistance(AbstractDistance):
@@ -667,17 +629,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return (
-            1
-            - spearmanr(
-                self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
-            )[0]
-        )
+        return 1 - spearmanr(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))[0]
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "SpearmanDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("SpearmanDistance cannot be called on a pairwise distance matrix.")
 
 
 class KendallTauDistance(AbstractDistance):
@@ -696,9 +651,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return tau_dist
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "KendallTauDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("KendallTauDistance cannot be called on a pairwise distance matrix.")
 
 
 class CosineDistance(AbstractDistance):
@@ -710,14 +663,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return cosine(
-            self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
-        )
+        return cosine(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "CosineDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("CosineDistance cannot be called on a pairwise distance matrix.")
 
 
 class R2ScoreDistance(AbstractDistance):
@@ -731,14 +680,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return 1 - r2_score(
-            self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
-        )
+        return 1 - r2_score(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "R2ScoreDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.")
 
 
 class SymmetricKLDivergence(AbstractDistance):
@@ -759,23 +704,13 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa
         for i in range(X.shape[1]):
             x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon
             y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon
-            kl = (
-                np.log(y_std / x_std)
-                + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2)
-                - 1 / 2
-            )
-            klr = (
-                np.log(x_std / y_std)
-                + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2)
-                - 1 / 2
-            )
+            kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2
+            klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2
             kl_all.append(kl + klr)
         return sum(kl_all) / len(kl_all)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "SymmetricKLDivergence cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.")
 
 
 class TTestDistance(AbstractDistance):
@@ -799,9 +734,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa
         return sum(t_test_all) / len(t_test_all)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "TTestDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.")
 
 
 class KSTestDistance(AbstractDistance):
@@ -818,9 +751,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return sum(stats) / len(stats)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "KSTestDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.")
 
 
 class NBLL(AbstractDistance):
@@ -835,9 +766,7 @@ def __init__(self) -> None:
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> float:
         def _is_count_matrix(matrix, tolerance=1e-6):
-            if matrix.dtype.kind == "i" or np.all(
-                np.abs(matrix - np.round(matrix)) < tolerance
-            ):
+            if matrix.dtype.kind == "i" or np.all(np.abs(matrix - np.round(matrix)) < tolerance):
                 return True
             else:
                 return False
@@ -846,9 +775,7 @@ def _is_count_matrix(matrix, tolerance=1e-6):
             raise ValueError("NBLL distance only works for raw counts.")
 
         @numba.jit(forceobj=True)
-        def _compute_nll(
-            y: np.ndarray, nb_params: tuple[float, float], epsilon: float
-        ) -> float:
+        def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float:
             mu = np.exp(nb_params[0])
             theta = 1 / nb_params[1]
             eps = epsilon
@@ -884,16 +811,12 @@ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float:
                 nlls.append(nll)
 
         if genes_skipped > X.shape[1] / 2:
-            raise AttributeError(
-                f"{genes_skipped} genes could not be fit, which is over half."
-            )
+            raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.")
 
         return -np.sum(nlls) / len(nlls)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "NBLL cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.")
 
 
 def _sample(X, frac=None, n=None):
@@ -935,9 +858,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return np.mean(test_labels[:, 1])
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "ClassifierProbaDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.")
 
 
 class ClassifierClassProjection(AbstractDistance):
@@ -951,9 +872,7 @@ def __init__(self) -> None:
         self.accepts_precomputed = False
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "ClassifierClassProjection can currently only be called with onesided."
-        )
+        raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.")
 
     def onesided_distances(
         self,
@@ -994,9 +913,7 @@ def onesided_distances(
         return df
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "ClassifierClassProjection cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.")
 
 
 class MeanVarDistnDistance(AbstractDistance):
@@ -1030,11 +947,6 @@ def mean_var(x, log: bool = False):
         def prep_kde_data(x, y):
             return np.concatenate([x.reshape(-1, 1), y.reshape(-1, 1)], axis=1)
 
-        mean_x, var_x = mean_var(X, log=True)
-        x = prep_kde_data(mean_x, var_x)
-        mean_y, var_y = mean_var(Y, log=True)
-        y = prep_kde_data(mean_y, var_y)
-
         def grid_points(d, n_points=100):
             # Make grid, add 1 bin on lower/upper end to get final n_points
             d_min = d.min()
@@ -1045,39 +957,39 @@ def grid_points(d, n_points=100):
             d_max = d_max + d_bin
             return np.arange(start=d_min + 0.5 * d_bin, stop=d_max, step=d_bin)
 
-        # Gridpoints to eval KDE on
-        mean_grid = grid_points(np.concatenate([mean_x, mean_y]))
-        var_grid = grid_points(np.concatenate([var_x, var_y]))
-        grid = np.array(np.meshgrid(mean_grid, var_grid)).T.reshape(-1, 2)
+        def parrallel_score_samples(kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count())):
+            with multiprocessing.Pool(thread_count) as p:
+                return np.concatenate(p.map(kde.score_samples, np.array_split(samples, thread_count)))
 
-        # KDE
         def kde_eval(d, grid):
             # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out
             # can not be compared well on regions further away from the data as they are -inf
-            return (
-                KernelDensity(bandwidth="silverman", kernel="exponential")
-                .fit(d)
-                .score_samples(grid)
-            )
+            kde = KernelDensity(bandwidth="silverman", kernel="exponential").fit(d)
+            return parrallel_score_samples(kde, grid)
+
+        mean_x, var_x = mean_var(X, log=True)
+        mean_y, var_y = mean_var(Y, log=True)
+
+        x = prep_kde_data(mean_x, var_x)
+        y = prep_kde_data(mean_y, var_y)
+
+        # Gridpoints to eval KDE on
+        mean_grid = grid_points(np.concatenate([mean_x, mean_y]))
+        var_grid = grid_points(np.concatenate([var_x, var_y]))
+        grid = np.array(np.meshgrid(mean_grid, var_grid)).T.reshape(-1, 2)
 
         kde_x = kde_eval(x, grid)
         kde_y = kde_eval(y, grid)
-
-        # KDE difference
         kde_diff = ((kde_x - kde_y) ** 2).mean()
 
         return kde_diff
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "MeanVarDistnDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("MeanVarDistnDistance cannot be called on a pairwise distance matrix.")
 
 
 class MahalanobisDistance(AbstractDistance):
-    """
-    Mahalanobis distance between pseudobulk vectors.
-    """
+    """Mahalanobis distance between pseudobulk vectors."""
 
     def __init__(self, aggregation_func: Callable = np.mean) -> None:
         super().__init__()
@@ -1092,6 +1004,4 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "Mahalanobis cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.")
diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py
index ccf9b4b9..a4583ab1 100644
--- a/tests/tools/_distances/test_distances.py
+++ b/tests/tools/_distances/test_distances.py
@@ -28,18 +28,14 @@
 pseudo_counts_distances = ["nb_ll"]
 lognorm_counts_distances = ["mean_var_distn"]
 all_distances = (
-    actual_distances
-    + semi_distances
-    + non_distances
-    + pseudo_counts_distances
-    + lognorm_counts_distances
+    actual_distances + semi_distances + non_distances + pseudo_counts_distances + lognorm_counts_distances
 )  # + onesided_only
 
 
 @pytest.fixture
 def all_pairwise_distances():
     all_calulated_distances = {}
-    no_subsample_distances = [
+    low_subsample_distances = [
         "sym_kldiv",
         "t_test",
         "ks_test",
@@ -48,19 +44,24 @@ def all_pairwise_distances():
         "mahalanobis",
         "mean_var_distn",
     ]
+    no_subsample_distances = ["mahalanobis"]  # mahalanobis only works on the full data without subsampling
 
     for distance in all_distances:
         adata = pt.dt.distance_example()
         if distance not in no_subsample_distances:
-            adata = sc.pp.subsample(adata, 0.001, copy=True)
-        elif distance != "mahalanobis":
-            adata = sc.pp.subsample(adata, 0.1, copy=True)
+            if distance in low_subsample_distances:
+                adata = sc.pp.subsample(adata, 0.1, copy=True)
+            else:
+                adata = sc.pp.subsample(adata, 0.001, copy=True)
 
         adata.layers["lognorm"] = adata.X.copy()
         adata.layers["counts"] = np.round(adata.X.toarray()).astype(int)
         if "X_pca" not in adata.obsm.keys():
             sc.pp.pca(adata, n_comps=5)
         if distance in lognorm_counts_distances:
+            groups = np.unique(adata.obs["perturbation"])
+            # KDE is slow, subset to 5 groups for speed up
+            adata = adata[adata.obs["perturbation"].isin(groups[0:5])].copy()
             Distance = pt.tl.Distance(distance, layer_key="lognorm")
         elif distance in pseudo_counts_distances:
             Distance = pt.tl.Distance(distance, layer_key="counts")
@@ -97,10 +98,7 @@ def test_triangle_inequality(all_pairwise_distances):
         for _i in range(10):
             rng = np.random.default_rng()
             triplet = rng.choice(df.index, size=3, replace=False)
-            assert (
-                df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]]
-                >= df.loc[triplet[0], triplet[2]]
-            )
+            assert df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] >= df.loc[triplet[0], triplet[2]]
 
 
 def test_distance_layers(all_pairwise_distances):
@@ -130,9 +128,7 @@ def test_distance_output_type(all_pairwise_distances):
     # Test if distances are outputting floats
     for distance in all_distances:
         df = all_pairwise_distances[distance]
-        assert df.apply(
-            lambda col: pd.api.types.is_float_dtype(col)
-        ).all(), "Not all values are floats."
+        assert df.apply(lambda col: pd.api.types.is_float_dtype(col)).all(), "Not all values are floats."
 
 
 def test_distance_pairwise(all_pairwise_distances):
@@ -153,9 +149,7 @@ def test_distance_onesided():
 
     for distance in onesided_only:
         Distance = pt.tl.Distance(distance, obsm_key="X_pca")
-        df = Distance.onesided_distances(
-            adata, groupby="perturbation", selected_group=selected_group
-        )
+        df = Distance.onesided_distances(adata, groupby="perturbation", selected_group=selected_group)
 
         assert isinstance(df, Series)
         assert df.loc[selected_group] == 0  # distance to self is 0

From 9fd4c2b72b2dd1d0256848c4f5a4668adbf0c657 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Mon, 3 Jun 2024 14:52:27 +0200
Subject: [PATCH 06/34] add type

---
 pertpy/tools/_distances/_distances.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 2208f51b..61cdd43a 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -24,10 +24,9 @@
 from statsmodels.discrete.discrete_model import NegativeBinomialP
 
 
-def compute_medoid(arr, axis=None):
+def compute_medoid(arr: np.ndarray, axis=None):
     if len(arr) == 0:
         return None
-
     if axis is not None:
         return np.apply_along_axis(lambda x: compute_medoid(x), axis, arr)
 

From fc71eae3d3c28183ca36e02ec5e5d0dd4032d27a Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Mon, 3 Jun 2024 16:24:27 +0200
Subject: [PATCH 07/34] add description

---
 pertpy/tools/_distances/_distances.py | 45 +++++++++++++++------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 61cdd43a..12d1d758 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -2,7 +2,7 @@
 
 import multiprocessing
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 import numba
 import numpy as np
@@ -106,9 +106,11 @@ class Distance:
         Average of the classification probability of the perturbation for a binary classifier.
     - "classifier_cp": classifier class projection
         Average of the class
-    - "mean_var_distn": Distance between mean-var distibutions of gene expression.
-    - "mahalanobis": Mahalanobis distance between the cells of two groups.
-
+    - "mean_var_distn": Distance between mean-variance distibutions between cells of 2 groups.
+       Mean square distance between the mean-variance distributions of cells from 2 groups using Kernel Density Estimation (KDE).
+    - "mahalanobis": Mahalanobis disatnce between the means of cells from two groups.
+        It is originally used to measure distance between a point and a distribution.
+        in this context, it quantifies the difference between the mean profiles of a target group and a reference group.
 
     Attributes:
         metric: Name of distance metric.
@@ -128,7 +130,7 @@ class Distance:
     def __init__(
         self,
         metric: str = "edistance",
-        agg_fct: str = "mean",
+        agg_fct: Literal["mean", "median", "medoid", "variance"] = "mean",
         layer_key: str = None,
         obsm_key: str = None,
         cell_wise_metric: str = "euclidean",
@@ -927,11 +929,13 @@ def __init__(self) -> None:
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         """
         Difference of mean-var distibutions in 2 matrices
-        :param X,Y: Matrices of cells*genes, normalized and log transformed
+        Args:
+            X: Normalized and log transcformed cells*genes count matrix.
+            Y: Normalized and log transcformed cells*genes count matrix.
         """
 
         # Get log mean & var for comparison
-        def mean_var(x, log: bool = False):
+        def _mean_var(x, log: bool = False):
             mean = np.mean(x, axis=0)
             var = np.var(x, axis=0)
             positive = mean > 0
@@ -942,10 +946,10 @@ def mean_var(x, log: bool = False):
                 var = np.log(var)
             return mean, var
 
-        def prep_kde_data(x, y):
+        def _prep_kde_data(x, y):
             return np.concatenate([x.reshape(-1, 1), y.reshape(-1, 1)], axis=1)
 
-        def grid_points(d, n_points=100):
+        def _grid_points(d, n_points=100):
             # Make grid, add 1 bin on lower/upper end to get final n_points
             d_min = d.min()
             d_max = d.max()
@@ -955,29 +959,30 @@ def grid_points(d, n_points=100):
             d_max = d_max + d_bin
             return np.arange(start=d_min + 0.5 * d_bin, stop=d_max, step=d_bin)
 
-        def parrallel_score_samples(kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count())):
+        def _parrallel_score_samples(kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count())):
             with multiprocessing.Pool(thread_count) as p:
                 return np.concatenate(p.map(kde.score_samples, np.array_split(samples, thread_count)))
 
-        def kde_eval(d, grid):
+        def _kde_eval(d, grid):
             # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out
             # can not be compared well on regions further away from the data as they are -inf
             kde = KernelDensity(bandwidth="silverman", kernel="exponential").fit(d)
-            return parrallel_score_samples(kde, grid)
+            return _parrallel_score_samples(kde, grid)
 
-        mean_x, var_x = mean_var(X, log=True)
-        mean_y, var_y = mean_var(Y, log=True)
+        mean_x, var_x = _mean_var(X, log=True)
+        mean_y, var_y = _mean_var(Y, log=True)
 
-        x = prep_kde_data(mean_x, var_x)
-        y = prep_kde_data(mean_y, var_y)
+        x = _prep_kde_data(mean_x, var_x)
+        y = _prep_kde_data(mean_y, var_y)
 
         # Gridpoints to eval KDE on
-        mean_grid = grid_points(np.concatenate([mean_x, mean_y]))
-        var_grid = grid_points(np.concatenate([var_x, var_y]))
+        mean_grid = _grid_points(np.concatenate([mean_x, mean_y]))
+        var_grid = _grid_points(np.concatenate([var_x, var_y]))
         grid = np.array(np.meshgrid(mean_grid, var_grid)).T.reshape(-1, 2)
 
-        kde_x = kde_eval(x, grid)
-        kde_y = kde_eval(y, grid)
+        kde_x = _kde_eval(x, grid)
+        kde_y = _kde_eval(y, grid)
+
         kde_diff = ((kde_x - kde_y) ** 2).mean()
 
         return kde_diff

From 09e5fea91e8ffabdc836af38fea49db5279f5b3c Mon Sep 17 00:00:00 2001
From: Xichen Wu <102925032+wxicu@users.noreply.github.com>
Date: Wed, 5 Jun 2024 09:32:03 +0200
Subject: [PATCH 08/34] Update pertpy/tools/_distances/_distances.py

Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>
---
 pertpy/tools/_distances/_distances.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 12d1d758..9686cce3 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -24,7 +24,7 @@
 from statsmodels.discrete.discrete_model import NegativeBinomialP
 
 
-def compute_medoid(arr: np.ndarray, axis=None):
+def _compute_medoid(arr: np.ndarray, axis: str | None=None):
     if len(arr) == 0:
         return None
     if axis is not None:

From ad23ca66696ec7234c8734705203394086fc76d6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 5 Jun 2024 07:32:14 +0000
Subject: [PATCH 09/34] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pertpy/tools/_distances/_distances.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 9686cce3..03da1c2b 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -24,7 +24,7 @@
 from statsmodels.discrete.discrete_model import NegativeBinomialP
 
 
-def _compute_medoid(arr: np.ndarray, axis: str | None=None):
+def _compute_medoid(arr: np.ndarray, axis: str | None = None):
     if len(arr) == 0:
         return None
     if axis is not None:

From dc74884468463b6d52e73bd532475fc283c54ae7 Mon Sep 17 00:00:00 2001
From: Xichen Wu <102925032+wxicu@users.noreply.github.com>
Date: Wed, 5 Jun 2024 09:32:29 +0200
Subject: [PATCH 10/34] Update pertpy/tools/_distances/_distances.py

Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>
---
 pertpy/tools/_distances/_distances.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 03da1c2b..19d848dd 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -108,7 +108,7 @@ class Distance:
         Average of the class
     - "mean_var_distn": Distance between mean-variance distibutions between cells of 2 groups.
        Mean square distance between the mean-variance distributions of cells from 2 groups using Kernel Density Estimation (KDE).
-    - "mahalanobis": Mahalanobis disatnce between the means of cells from two groups.
+    - "mahalanobis": Mahalanobis distance between the means of cells from two groups.
         It is originally used to measure distance between a point and a distribution.
         in this context, it quantifies the difference between the mean profiles of a target group and a reference group.
 

From c774cd28052d305a0d2dcd6c12948dffbe55fedb Mon Sep 17 00:00:00 2001
From: Xichen Wu <102925032+wxicu@users.noreply.github.com>
Date: Wed, 5 Jun 2024 09:32:38 +0200
Subject: [PATCH 11/34] Update pertpy/tools/_distances/_distances.py

Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>
---
 pertpy/tools/_distances/_distances.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 19d848dd..915b1584 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -934,7 +934,6 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
             Y: Normalized and log transcformed cells*genes count matrix.
         """
 
-        # Get log mean & var for comparison
         def _mean_var(x, log: bool = False):
             mean = np.mean(x, axis=0)
             var = np.var(x, axis=0)

From d413d672dbca6d7f34059fb370f1bf19c44d2ef9 Mon Sep 17 00:00:00 2001
From: Xichen Wu <102925032+wxicu@users.noreply.github.com>
Date: Wed, 5 Jun 2024 09:34:03 +0200
Subject: [PATCH 12/34] Update pertpy/tools/_distances/_distances.py

Co-authored-by: Eljas Roellin <65244425+eroell@users.noreply.github.com>
---
 pertpy/tools/_distances/_distances.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 915b1584..161f8152 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -930,7 +930,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         """
         Difference of mean-var distibutions in 2 matrices
         Args:
-            X: Normalized and log transcformed cells*genes count matrix.
+            X: Normalized and log transformed cells x genes count matrix.
             Y: Normalized and log transcformed cells*genes count matrix.
         """
 

From b7f2cf77f402702aa7183bb4f200a1983ece1680 Mon Sep 17 00:00:00 2001
From: Xichen Wu <102925032+wxicu@users.noreply.github.com>
Date: Wed, 5 Jun 2024 09:34:12 +0200
Subject: [PATCH 13/34] Update pertpy/tools/_distances/_distances.py

Co-authored-by: Eljas Roellin <65244425+eroell@users.noreply.github.com>
---
 pertpy/tools/_distances/_distances.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 161f8152..0af0ac47 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -966,7 +966,7 @@ def _kde_eval(d, grid):
             # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out
             # can not be compared well on regions further away from the data as they are -inf
             kde = KernelDensity(bandwidth="silverman", kernel="exponential").fit(d)
-            return _parrallel_score_samples(kde, grid)
+            return _parallel_score_samples(kde, grid)
 
         mean_x, var_x = _mean_var(X, log=True)
         mean_y, var_y = _mean_var(Y, log=True)

From e71f81cd31b21ceef2720e2f45b31037a34acbe0 Mon Sep 17 00:00:00 2001
From: Xichen Wu <102925032+wxicu@users.noreply.github.com>
Date: Wed, 5 Jun 2024 09:34:19 +0200
Subject: [PATCH 14/34] Update pertpy/tools/_distances/_distances.py

Co-authored-by: Eljas Roellin <65244425+eroell@users.noreply.github.com>
---
 pertpy/tools/_distances/_distances.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 0af0ac47..8764f906 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -139,7 +139,7 @@ def __init__(
 
         Args:
             metric: Distance metric to use.
-            agg_fct: Name of the aggregation function to generate pseodobulk vectors.
+            agg_fct: Name of the aggregation function to generate pseudobulk vectors.
             layer_key: Name of the counts layer containing raw counts to calculate distances for.
                               Mutually exclusive with 'obsm_key'.
                               Is not used if `None`.

From edaa6e650192e6eb51be0fd9776ab96e93053dd0 Mon Sep 17 00:00:00 2001
From: Xichen Wu <102925032+wxicu@users.noreply.github.com>
Date: Wed, 5 Jun 2024 09:34:42 +0200
Subject: [PATCH 15/34] Update pertpy/tools/_distances/_distances.py

Co-authored-by: Eljas Roellin <65244425+eroell@users.noreply.github.com>
---
 pertpy/tools/_distances/_distances.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 8764f906..6ec03d53 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -931,7 +931,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         Difference of mean-var distibutions in 2 matrices
         Args:
             X: Normalized and log transformed cells x genes count matrix.
-            Y: Normalized and log transcformed cells*genes count matrix.
+            Y: Normalized and log transformed cells x genes count matrix.
         """
 
         def _mean_var(x, log: bool = False):

From 317cfd595eee7690f141da8ee02d3d75f2c2a14a Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Thu, 6 Jun 2024 20:47:44 +0200
Subject: [PATCH 16/34] update code

---
 pertpy/tools/_distances/_distances.py         | 203 ++++++++++++------
 tests/tools/_distances/test_distance_tests.py |   6 +-
 tests/tools/_distances/test_distances.py      |  27 ++-
 3 files changed, 163 insertions(+), 73 deletions(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 6ec03d53..d3daf5cf 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -13,7 +13,7 @@
 from ott.solvers.linear.sinkhorn import Sinkhorn
 from pandas import Series
 from rich.progress import track
-from scipy.sparse import csr_matrix, issparse
+from scipy.sparse import issparse
 from scipy.spatial.distance import cosine, mahalanobis
 from scipy.special import gammaln
 from scipy.stats import kendalltau, kstest, pearsonr, spearmanr
@@ -23,27 +23,7 @@
 from sklearn.neighbors import KernelDensity
 from statsmodels.discrete.discrete_model import NegativeBinomialP
 
-
-def _compute_medoid(arr: np.ndarray, axis: str | None = None):
-    if len(arr) == 0:
-        return None
-    if axis is not None:
-        return np.apply_along_axis(lambda x: compute_medoid(x), axis, arr)
-
-    distances = np.abs(arr[:, np.newaxis] - arr)
-    total_distances = np.sum(distances, axis=1)
-    medoid_index = np.argmin(total_distances)
-    medoid = arr[medoid_index]
-
-    return medoid
-
-
-AGG_FCTS = {
-    "mean": np.mean,
-    "median": np.median,
-    "medoid": compute_medoid,
-    "variance": np.var,
-}
+AGG_FCTS = {"mean": np.mean, "median": np.median, "sum": np.sum}
 
 if TYPE_CHECKING:
     from collections.abc import Callable
@@ -106,7 +86,7 @@ class Distance:
         Average of the classification probability of the perturbation for a binary classifier.
     - "classifier_cp": classifier class projection
         Average of the class
-    - "mean_var_distn": Distance between mean-variance distibutions between cells of 2 groups.
+    - "mean_var_distribution": Distance between mean-variance distibutions between cells of 2 groups.
        Mean square distance between the mean-variance distributions of cells from 2 groups using Kernel Density Estimation (KDE).
     - "mahalanobis": Mahalanobis distance between the means of cells from two groups.
         It is originally used to measure distance between a point and a distribution.
@@ -130,7 +110,7 @@ class Distance:
     def __init__(
         self,
         metric: str = "edistance",
-        agg_fct: Literal["mean", "median", "medoid", "variance"] = "mean",
+        agg_fct: Literal["mean", "median", "sum"] = "mean",
         layer_key: str = None,
         obsm_key: str = None,
         cell_wise_metric: str = "euclidean",
@@ -188,8 +168,8 @@ def __init__(
             metric_fct = ClassifierProbaDistance()
         elif metric == "classifier_cp":
             metric_fct = ClassifierClassProjection()
-        elif metric == "mean_var_distn":
-            metric_fct = MeanVarDistnDistance()
+        elif metric == "mean_var_distribution":
+            metric_fct = MeanVarDistributionDistance()
         elif metric == "mahalanobis":
             metric_fct = MahalanobisDistance(self.aggregation_func)
         else:
@@ -282,7 +262,10 @@ def pairwise(
         # able to handle precomputed distances such as the PseudobulkDistance.
         if self.metric_fct.accepts_precomputed:
             # Precompute the pairwise distances if needed
-            if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys():
+            if (
+                f"{self.obsm_key}_{self.cell_wise_metric}_predistances"
+                not in adata.obsp.keys()
+            ):
                 self.precompute_distances(adata, n_jobs=n_jobs, **kwargs)
             pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"]
             for index_x, group_x in enumerate(fct(groups)):
@@ -295,7 +278,9 @@ def pairwise(
                         # subset the pairwise distance matrix to the two groups
                         sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y]
                         sub_idx = grouping[idx_x | idx_y] == group_x
-                        dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs)
+                        dist = self.metric_fct.from_precomputed(
+                            sub_pwd, sub_idx, **kwargs
+                        )
                     df.loc[group_x, group_y] = dist
                     df.loc[group_y, group_x] = dist
         else:
@@ -373,7 +358,10 @@ def onesided_distances(
         # able to handle precomputed distances such as the PsuedobulkDistance.
         if self.metric_fct.accepts_precomputed:
             # Precompute the pairwise distances if needed
-            if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys():
+            if (
+                f"{self.obsm_key}_{self.cell_wise_metric}_predistances"
+                not in adata.obsp.keys()
+            ):
                 self.precompute_distances(adata, n_jobs=n_jobs, **kwargs)
             pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"]
             for group_x in fct(groups):
@@ -427,7 +415,9 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None:
             cells = adata.layers[self.layer_key]
         else:
             cells = adata.obsm[self.obsm_key].copy()
-        pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs)
+        pwd = pairwise_distances(
+            cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs
+        )
         adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd
 
 
@@ -496,7 +486,9 @@ def __init__(self) -> None:
         super().__init__()
         self.accepts_precomputed = False
 
-    def __call__(self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs) -> float:
+    def __call__(
+        self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs
+    ) -> float:
         if kernel == "linear":
             XX = np.dot(X, X.T)
             YY = np.dot(Y, Y.T)
@@ -556,7 +548,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("EuclideanDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "EuclideanDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class MeanSquaredDistance(AbstractDistance):
@@ -568,10 +562,20 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=2, **kwargs) ** 2 / X.shape[1]
+        return (
+            np.linalg.norm(
+                self.aggregation_func(X, axis=0) - self.aggregation_func(Y, axis=0),
+                ord=2,
+                **kwargs,
+            )
+            ** 2
+            / X.shape[1]
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "MeanSquaredDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class MeanAbsoluteDistance(AbstractDistance):
@@ -583,10 +587,19 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return np.linalg.norm(X.mean(axis=0) - Y.mean(axis=0), ord=1, **kwargs) / X.shape[1]
+        return (
+            np.linalg.norm(
+                self.aggregation_func(X, axis=0) - self.aggregation_func(Y, axis=0),
+                ord=1,
+                **kwargs,
+            )
+            / X.shape[1]
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "MeanAbsoluteDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class MeanPairwiseDistance(AbstractDistance):
@@ -614,10 +627,17 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return 1 - pearsonr(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))[0]
+        return (
+            1
+            - pearsonr(
+                self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
+            )[0]
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("PearsonDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "PearsonDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class SpearmanDistance(AbstractDistance):
@@ -629,10 +649,17 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return 1 - spearmanr(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))[0]
+        return (
+            1
+            - spearmanr(
+                self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
+            )[0]
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("SpearmanDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "SpearmanDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class KendallTauDistance(AbstractDistance):
@@ -651,7 +678,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return tau_dist
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("KendallTauDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "KendallTauDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class CosineDistance(AbstractDistance):
@@ -663,10 +692,14 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return cosine(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))
+        return cosine(
+            self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("CosineDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "CosineDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class R2ScoreDistance(AbstractDistance):
@@ -680,10 +713,14 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return 1 - r2_score(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))
+        return 1 - r2_score(
+            self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "R2ScoreDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class SymmetricKLDivergence(AbstractDistance):
@@ -704,13 +741,23 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa
         for i in range(X.shape[1]):
             x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon
             y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon
-            kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2
-            klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2
+            kl = (
+                np.log(y_std / x_std)
+                + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2)
+                - 1 / 2
+            )
+            klr = (
+                np.log(x_std / y_std)
+                + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2)
+                - 1 / 2
+            )
             kl_all.append(kl + klr)
         return sum(kl_all) / len(kl_all)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "SymmetricKLDivergence cannot be called on a pairwise distance matrix."
+        )
 
 
 class TTestDistance(AbstractDistance):
@@ -734,7 +781,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa
         return sum(t_test_all) / len(t_test_all)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "TTestDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class KSTestDistance(AbstractDistance):
@@ -751,7 +800,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return sum(stats) / len(stats)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "KSTestDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class NBLL(AbstractDistance):
@@ -766,7 +817,9 @@ def __init__(self) -> None:
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> float:
         def _is_count_matrix(matrix, tolerance=1e-6):
-            if matrix.dtype.kind == "i" or np.all(np.abs(matrix - np.round(matrix)) < tolerance):
+            if matrix.dtype.kind == "i" or np.all(
+                np.abs(matrix - np.round(matrix)) < tolerance
+            ):
                 return True
             else:
                 return False
@@ -775,7 +828,9 @@ def _is_count_matrix(matrix, tolerance=1e-6):
             raise ValueError("NBLL distance only works for raw counts.")
 
         @numba.jit(forceobj=True)
-        def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float:
+        def _compute_nll(
+            y: np.ndarray, nb_params: tuple[float, float], epsilon: float
+        ) -> float:
             mu = np.exp(nb_params[0])
             theta = 1 / nb_params[1]
             eps = epsilon
@@ -811,12 +866,16 @@ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float:
                 nlls.append(nll)
 
         if genes_skipped > X.shape[1] / 2:
-            raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.")
+            raise AttributeError(
+                f"{genes_skipped} genes could not be fit, which is over half."
+            )
 
         return -np.sum(nlls) / len(nlls)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "NBLL cannot be called on a pairwise distance matrix."
+        )
 
 
 def _sample(X, frac=None, n=None):
@@ -858,7 +917,9 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return np.mean(test_labels[:, 1])
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "ClassifierProbaDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class ClassifierClassProjection(AbstractDistance):
@@ -872,7 +933,9 @@ def __init__(self) -> None:
         self.accepts_precomputed = False
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.")
+        raise NotImplementedError(
+            "ClassifierClassProjection can currently only be called with onesided."
+        )
 
     def onesided_distances(
         self,
@@ -913,10 +976,12 @@ def onesided_distances(
         return df
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "ClassifierClassProjection cannot be called on a pairwise distance matrix."
+        )
 
 
-class MeanVarDistnDistance(AbstractDistance):
+class MeanVarDistributionDistance(AbstractDistance):
     """
     Distance betweenv mean-var distributions of gene expression.
 
@@ -927,8 +992,8 @@ def __init__(self) -> None:
         self.accepts_precomputed = False
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        """
-        Difference of mean-var distibutions in 2 matrices
+        """Difference of mean-var distibutions in 2 matrices.
+
         Args:
             X: Normalized and log transformed cells x genes count matrix.
             Y: Normalized and log transformed cells x genes count matrix.
@@ -958,9 +1023,15 @@ def _grid_points(d, n_points=100):
             d_max = d_max + d_bin
             return np.arange(start=d_min + 0.5 * d_bin, stop=d_max, step=d_bin)
 
-        def _parrallel_score_samples(kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count())):
+        def _parallel_score_samples(
+            kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count())
+        ):
+            # the thread_count is determined using the factor 0.875 as recommended here:
+            # https://stackoverflow.com/questions/32625094/scipy-parallel-computing-in-ipython-notebook
             with multiprocessing.Pool(thread_count) as p:
-                return np.concatenate(p.map(kde.score_samples, np.array_split(samples, thread_count)))
+                return np.concatenate(
+                    p.map(kde.score_samples, np.array_split(samples, thread_count))
+                )
 
         def _kde_eval(d, grid):
             # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out
@@ -987,7 +1058,9 @@ def _kde_eval(d, grid):
         return kde_diff
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("MeanVarDistnDistance cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "MeanVarDistributionDistance cannot be called on a pairwise distance matrix."
+        )
 
 
 class MahalanobisDistance(AbstractDistance):
@@ -1006,4 +1079,6 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.")
+        raise NotImplementedError(
+            "Mahalanobis cannot be called on a pairwise distance matrix."
+        )
diff --git a/tests/tools/_distances/test_distance_tests.py b/tests/tools/_distances/test_distance_tests.py
index 573e9e0f..369a2da9 100644
--- a/tests/tools/_distances/test_distance_tests.py
+++ b/tests/tools/_distances/test_distance_tests.py
@@ -23,7 +23,7 @@
     # "classifier_cp",
     # "nbll",
     "mahalanobis",
-    "mean_var_distn",
+    "mean_var_distribution",
 ]
 
 count_distances = ["nb_ll"]
@@ -42,7 +42,9 @@ def test_distancetest(adata, distance):
     if distance == "wasserstein":
         pytest.mark.apply(pytest.mark.slow)
 
-    etest = pt.tl.DistanceTest(distance, n_perms=10, obsm_key="X_pca", alpha=0.05, correction="holm-sidak")
+    etest = pt.tl.DistanceTest(
+        distance, n_perms=10, obsm_key="X_pca", alpha=0.05, correction="holm-sidak"
+    )
     tab = etest(adata, groupby="perturbation", contrast="control")
 
     # Well-defined output
diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py
index a4583ab1..e02bf2b6 100644
--- a/tests/tools/_distances/test_distances.py
+++ b/tests/tools/_distances/test_distances.py
@@ -26,9 +26,13 @@
 non_distances = ["classifier_proba"]
 onesided_only = ["classifier_cp"]
 pseudo_counts_distances = ["nb_ll"]
-lognorm_counts_distances = ["mean_var_distn"]
+lognorm_counts_distances = ["mean_var_distribution"]
 all_distances = (
-    actual_distances + semi_distances + non_distances + pseudo_counts_distances + lognorm_counts_distances
+    actual_distances
+    + semi_distances
+    + non_distances
+    + pseudo_counts_distances
+    + lognorm_counts_distances
 )  # + onesided_only
 
 
@@ -42,9 +46,11 @@ def all_pairwise_distances():
         "classifier_proba",
         "classifier_cp",
         "mahalanobis",
-        "mean_var_distn",
+        "mean_var_distribution",
     ]
-    no_subsample_distances = ["mahalanobis"]  # mahalanobis only works on the full data without subsampling
+    no_subsample_distances = [
+        "mahalanobis"
+    ]  # mahalanobis only works on the full data without subsampling
 
     for distance in all_distances:
         adata = pt.dt.distance_example()
@@ -98,7 +104,10 @@ def test_triangle_inequality(all_pairwise_distances):
         for _i in range(10):
             rng = np.random.default_rng()
             triplet = rng.choice(df.index, size=3, replace=False)
-            assert df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] >= df.loc[triplet[0], triplet[2]]
+            assert (
+                df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]]
+                >= df.loc[triplet[0], triplet[2]]
+            )
 
 
 def test_distance_layers(all_pairwise_distances):
@@ -128,7 +137,9 @@ def test_distance_output_type(all_pairwise_distances):
     # Test if distances are outputting floats
     for distance in all_distances:
         df = all_pairwise_distances[distance]
-        assert df.apply(lambda col: pd.api.types.is_float_dtype(col)).all(), "Not all values are floats."
+        assert df.apply(
+            lambda col: pd.api.types.is_float_dtype(col)
+        ).all(), "Not all values are floats."
 
 
 def test_distance_pairwise(all_pairwise_distances):
@@ -149,7 +160,9 @@ def test_distance_onesided():
 
     for distance in onesided_only:
         Distance = pt.tl.Distance(distance, obsm_key="X_pca")
-        df = Distance.onesided_distances(adata, groupby="perturbation", selected_group=selected_group)
+        df = Distance.onesided_distances(
+            adata, groupby="perturbation", selected_group=selected_group
+        )
 
         assert isinstance(df, Series)
         assert df.loc[selected_group] == 0  # distance to self is 0

From 47b41342829d8eb3f8ea4538627f7a86a9e3596d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 6 Jun 2024 18:47:59 +0000
Subject: [PATCH 17/34] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pertpy/tools/_distances/_distances.py         | 144 ++++--------------
 tests/tools/_distances/test_distance_tests.py |   4 +-
 tests/tools/_distances/test_distances.py      |  23 +--
 3 files changed, 39 insertions(+), 132 deletions(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index d3daf5cf..71b353c7 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -262,10 +262,7 @@ def pairwise(
         # able to handle precomputed distances such as the PseudobulkDistance.
         if self.metric_fct.accepts_precomputed:
             # Precompute the pairwise distances if needed
-            if (
-                f"{self.obsm_key}_{self.cell_wise_metric}_predistances"
-                not in adata.obsp.keys()
-            ):
+            if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys():
                 self.precompute_distances(adata, n_jobs=n_jobs, **kwargs)
             pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"]
             for index_x, group_x in enumerate(fct(groups)):
@@ -278,9 +275,7 @@ def pairwise(
                         # subset the pairwise distance matrix to the two groups
                         sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y]
                         sub_idx = grouping[idx_x | idx_y] == group_x
-                        dist = self.metric_fct.from_precomputed(
-                            sub_pwd, sub_idx, **kwargs
-                        )
+                        dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs)
                     df.loc[group_x, group_y] = dist
                     df.loc[group_y, group_x] = dist
         else:
@@ -358,10 +353,7 @@ def onesided_distances(
         # able to handle precomputed distances such as the PsuedobulkDistance.
         if self.metric_fct.accepts_precomputed:
             # Precompute the pairwise distances if needed
-            if (
-                f"{self.obsm_key}_{self.cell_wise_metric}_predistances"
-                not in adata.obsp.keys()
-            ):
+            if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys():
                 self.precompute_distances(adata, n_jobs=n_jobs, **kwargs)
             pwd = adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"]
             for group_x in fct(groups):
@@ -415,9 +407,7 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None:
             cells = adata.layers[self.layer_key]
         else:
             cells = adata.obsm[self.obsm_key].copy()
-        pwd = pairwise_distances(
-            cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs
-        )
+        pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs)
         adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd
 
 
@@ -486,9 +476,7 @@ def __init__(self) -> None:
         super().__init__()
         self.accepts_precomputed = False
 
-    def __call__(
-        self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs
-    ) -> float:
+    def __call__(self, X: np.ndarray, Y: np.ndarray, kernel="linear", **kwargs) -> float:
         if kernel == "linear":
             XX = np.dot(X, X.T)
             YY = np.dot(Y, Y.T)
@@ -548,9 +536,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "EuclideanDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("EuclideanDistance cannot be called on a pairwise distance matrix.")
 
 
 class MeanSquaredDistance(AbstractDistance):
@@ -573,9 +559,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "MeanSquaredDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("MeanSquaredDistance cannot be called on a pairwise distance matrix.")
 
 
 class MeanAbsoluteDistance(AbstractDistance):
@@ -597,9 +581,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "MeanAbsoluteDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("MeanAbsoluteDistance cannot be called on a pairwise distance matrix.")
 
 
 class MeanPairwiseDistance(AbstractDistance):
@@ -627,17 +609,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return (
-            1
-            - pearsonr(
-                self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
-            )[0]
-        )
+        return 1 - pearsonr(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))[0]
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "PearsonDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("PearsonDistance cannot be called on a pairwise distance matrix.")
 
 
 class SpearmanDistance(AbstractDistance):
@@ -649,17 +624,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return (
-            1
-            - spearmanr(
-                self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
-            )[0]
-        )
+        return 1 - spearmanr(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))[0]
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "SpearmanDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("SpearmanDistance cannot be called on a pairwise distance matrix.")
 
 
 class KendallTauDistance(AbstractDistance):
@@ -678,9 +646,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return tau_dist
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "KendallTauDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("KendallTauDistance cannot be called on a pairwise distance matrix.")
 
 
 class CosineDistance(AbstractDistance):
@@ -692,14 +658,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return cosine(
-            self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
-        )
+        return cosine(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "CosineDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("CosineDistance cannot be called on a pairwise distance matrix.")
 
 
 class R2ScoreDistance(AbstractDistance):
@@ -713,14 +675,10 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return 1 - r2_score(
-            self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0)
-        )
+        return 1 - r2_score(self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0))
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "R2ScoreDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("R2ScoreDistance cannot be called on a pairwise distance matrix.")
 
 
 class SymmetricKLDivergence(AbstractDistance):
@@ -741,23 +699,13 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa
         for i in range(X.shape[1]):
             x_mean, x_std = X[:, i].mean(), X[:, i].std() + epsilon
             y_mean, y_std = Y[:, i].mean(), Y[:, i].std() + epsilon
-            kl = (
-                np.log(y_std / x_std)
-                + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2)
-                - 1 / 2
-            )
-            klr = (
-                np.log(x_std / y_std)
-                + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2)
-                - 1 / 2
-            )
+            kl = np.log(y_std / x_std) + (x_std**2 + (x_mean - y_mean) ** 2) / (2 * y_std**2) - 1 / 2
+            klr = np.log(x_std / y_std) + (y_std**2 + (y_mean - x_mean) ** 2) / (2 * x_std**2) - 1 / 2
             kl_all.append(kl + klr)
         return sum(kl_all) / len(kl_all)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "SymmetricKLDivergence cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("SymmetricKLDivergence cannot be called on a pairwise distance matrix.")
 
 
 class TTestDistance(AbstractDistance):
@@ -781,9 +729,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> floa
         return sum(t_test_all) / len(t_test_all)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "TTestDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("TTestDistance cannot be called on a pairwise distance matrix.")
 
 
 class KSTestDistance(AbstractDistance):
@@ -800,9 +746,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return sum(stats) / len(stats)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "KSTestDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("KSTestDistance cannot be called on a pairwise distance matrix.")
 
 
 class NBLL(AbstractDistance):
@@ -817,9 +761,7 @@ def __init__(self) -> None:
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, epsilon=1e-8, **kwargs) -> float:
         def _is_count_matrix(matrix, tolerance=1e-6):
-            if matrix.dtype.kind == "i" or np.all(
-                np.abs(matrix - np.round(matrix)) < tolerance
-            ):
+            if matrix.dtype.kind == "i" or np.all(np.abs(matrix - np.round(matrix)) < tolerance):
                 return True
             else:
                 return False
@@ -828,9 +770,7 @@ def _is_count_matrix(matrix, tolerance=1e-6):
             raise ValueError("NBLL distance only works for raw counts.")
 
         @numba.jit(forceobj=True)
-        def _compute_nll(
-            y: np.ndarray, nb_params: tuple[float, float], epsilon: float
-        ) -> float:
+        def _compute_nll(y: np.ndarray, nb_params: tuple[float, float], epsilon: float) -> float:
             mu = np.exp(nb_params[0])
             theta = 1 / nb_params[1]
             eps = epsilon
@@ -866,16 +806,12 @@ def _process_gene(x: np.ndarray, y: np.ndarray, epsilon: float) -> float:
                 nlls.append(nll)
 
         if genes_skipped > X.shape[1] / 2:
-            raise AttributeError(
-                f"{genes_skipped} genes could not be fit, which is over half."
-            )
+            raise AttributeError(f"{genes_skipped} genes could not be fit, which is over half.")
 
         return -np.sum(nlls) / len(nlls)
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "NBLL cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("NBLL cannot be called on a pairwise distance matrix.")
 
 
 def _sample(X, frac=None, n=None):
@@ -917,9 +853,7 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return np.mean(test_labels[:, 1])
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "ClassifierProbaDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("ClassifierProbaDistance cannot be called on a pairwise distance matrix.")
 
 
 class ClassifierClassProjection(AbstractDistance):
@@ -933,9 +867,7 @@ def __init__(self) -> None:
         self.accepts_precomputed = False
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "ClassifierClassProjection can currently only be called with onesided."
-        )
+        raise NotImplementedError("ClassifierClassProjection can currently only be called with onesided.")
 
     def onesided_distances(
         self,
@@ -976,9 +908,7 @@ def onesided_distances(
         return df
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "ClassifierClassProjection cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("ClassifierClassProjection cannot be called on a pairwise distance matrix.")
 
 
 class MeanVarDistributionDistance(AbstractDistance):
@@ -1023,15 +953,11 @@ def _grid_points(d, n_points=100):
             d_max = d_max + d_bin
             return np.arange(start=d_min + 0.5 * d_bin, stop=d_max, step=d_bin)
 
-        def _parallel_score_samples(
-            kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count())
-        ):
+        def _parallel_score_samples(kde, samples, thread_count=int(0.875 * multiprocessing.cpu_count())):
             # the thread_count is determined using the factor 0.875 as recommended here:
             # https://stackoverflow.com/questions/32625094/scipy-parallel-computing-in-ipython-notebook
             with multiprocessing.Pool(thread_count) as p:
-                return np.concatenate(
-                    p.map(kde.score_samples, np.array_split(samples, thread_count))
-                )
+                return np.concatenate(p.map(kde.score_samples, np.array_split(samples, thread_count)))
 
         def _kde_eval(d, grid):
             # Kernel choice: Gaussian is too smoothing and cosine or other kernels that do not stretch out
@@ -1058,9 +984,7 @@ def _kde_eval(d, grid):
         return kde_diff
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "MeanVarDistributionDistance cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("MeanVarDistributionDistance cannot be called on a pairwise distance matrix.")
 
 
 class MahalanobisDistance(AbstractDistance):
@@ -1079,6 +1003,4 @@ def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
-        raise NotImplementedError(
-            "Mahalanobis cannot be called on a pairwise distance matrix."
-        )
+        raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.")
diff --git a/tests/tools/_distances/test_distance_tests.py b/tests/tools/_distances/test_distance_tests.py
index 369a2da9..8e62154e 100644
--- a/tests/tools/_distances/test_distance_tests.py
+++ b/tests/tools/_distances/test_distance_tests.py
@@ -42,9 +42,7 @@ def test_distancetest(adata, distance):
     if distance == "wasserstein":
         pytest.mark.apply(pytest.mark.slow)
 
-    etest = pt.tl.DistanceTest(
-        distance, n_perms=10, obsm_key="X_pca", alpha=0.05, correction="holm-sidak"
-    )
+    etest = pt.tl.DistanceTest(distance, n_perms=10, obsm_key="X_pca", alpha=0.05, correction="holm-sidak")
     tab = etest(adata, groupby="perturbation", contrast="control")
 
     # Well-defined output
diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py
index e02bf2b6..0508d9ce 100644
--- a/tests/tools/_distances/test_distances.py
+++ b/tests/tools/_distances/test_distances.py
@@ -28,11 +28,7 @@
 pseudo_counts_distances = ["nb_ll"]
 lognorm_counts_distances = ["mean_var_distribution"]
 all_distances = (
-    actual_distances
-    + semi_distances
-    + non_distances
-    + pseudo_counts_distances
-    + lognorm_counts_distances
+    actual_distances + semi_distances + non_distances + pseudo_counts_distances + lognorm_counts_distances
 )  # + onesided_only
 
 
@@ -48,9 +44,7 @@ def all_pairwise_distances():
         "mahalanobis",
         "mean_var_distribution",
     ]
-    no_subsample_distances = [
-        "mahalanobis"
-    ]  # mahalanobis only works on the full data without subsampling
+    no_subsample_distances = ["mahalanobis"]  # mahalanobis only works on the full data without subsampling
 
     for distance in all_distances:
         adata = pt.dt.distance_example()
@@ -104,10 +98,7 @@ def test_triangle_inequality(all_pairwise_distances):
         for _i in range(10):
             rng = np.random.default_rng()
             triplet = rng.choice(df.index, size=3, replace=False)
-            assert (
-                df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]]
-                >= df.loc[triplet[0], triplet[2]]
-            )
+            assert df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] >= df.loc[triplet[0], triplet[2]]
 
 
 def test_distance_layers(all_pairwise_distances):
@@ -137,9 +128,7 @@ def test_distance_output_type(all_pairwise_distances):
     # Test if distances are outputting floats
     for distance in all_distances:
         df = all_pairwise_distances[distance]
-        assert df.apply(
-            lambda col: pd.api.types.is_float_dtype(col)
-        ).all(), "Not all values are floats."
+        assert df.apply(lambda col: pd.api.types.is_float_dtype(col)).all(), "Not all values are floats."
 
 
 def test_distance_pairwise(all_pairwise_distances):
@@ -160,9 +149,7 @@ def test_distance_onesided():
 
     for distance in onesided_only:
         Distance = pt.tl.Distance(distance, obsm_key="X_pca")
-        df = Distance.onesided_distances(
-            adata, groupby="perturbation", selected_group=selected_group
-        )
+        df = Distance.onesided_distances(adata, groupby="perturbation", selected_group=selected_group)
 
         assert isinstance(df, Series)
         assert df.loc[selected_group] == 0  # distance to self is 0

From d26141023075869ee008dae8c7b84085857a7df8 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Thu, 6 Jun 2024 21:43:44 +0200
Subject: [PATCH 18/34] fix drug

---
 pertpy/metadata/_drug.py | 45 ++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 11 deletions(-)

diff --git a/pertpy/metadata/_drug.py b/pertpy/metadata/_drug.py
index 106ce93b..422a9718 100644
--- a/pertpy/metadata/_drug.py
+++ b/pertpy/metadata/_drug.py
@@ -45,7 +45,7 @@ def _download_drug_annotation(
                 block_size=4096,
                 is_zip=False,
             )
-        dgidb_df = pd.read_table(dgidb_path)
+        dgidb_df = pd.read_table(dgidb_path, delimiter=",")
         return dgidb_df
 
     else:
@@ -64,7 +64,10 @@ def _download_drug_annotation(
         pharmgkb_df = pharmgkb_df[pharmgkb_df["Association"] != "not associated"]
         pharmgkb_df = pharmgkb_df[
             (pharmgkb_df["Entity1_type"] == "Gene")
-            & ((pharmgkb_df["Entity2_type"] == "Chemical") | (pharmgkb_df["Entity2_type"] == "Disease"))
+            & (
+                (pharmgkb_df["Entity2_type"] == "Chemical")
+                | (pharmgkb_df["Entity2_type"] == "Disease")
+            )
         ]
         pharmgkb_df.rename(
             columns={
@@ -74,7 +77,9 @@ def _download_drug_annotation(
             },
             inplace=True,
         )
-        pharmgkb_df.drop(["Entity1_type", "Entity1_id", "Entity2_id"], axis=1, inplace=True)
+        pharmgkb_df.drop(
+            ["Entity1_type", "Entity1_id", "Entity2_id"], axis=1, inplace=True
+        )
 
         return pharmgkb_df
 
@@ -129,7 +134,9 @@ def annotate(
                 .to_dict()
             )
 
-            adata.var["compounds"] = adata.var_names.map(lambda gene: gene_compound_dict.get(gene, ""))
+            adata.var["compounds"] = adata.var_names.map(
+                lambda gene: gene_compound_dict.get(gene, "")
+            )
         else:
             compounds = interaction[interaction["Type"] == "Chemical"]
             exploded_df = compounds.explode("Gene")
@@ -139,7 +146,9 @@ def annotate(
                 .to_dict()
             )
 
-            adata.var["compounds"] = adata.var_names.map(lambda gene: gene_compound_dict.get(gene, ""))
+            adata.var["compounds"] = adata.var_names.map(
+                lambda gene: gene_compound_dict.get(gene, "")
+            )
             diseases = interaction[interaction["Type"] == "Disease"]
             exploded_df = diseases.explode("Gene")
             gene_disease_dict = (
@@ -148,7 +157,9 @@ def annotate(
                 .to_dict()
             )
 
-            adata.var["diseases"] = adata.var_names.map(lambda gene: gene_disease_dict.get(gene, ""))
+            adata.var["diseases"] = adata.var_names.map(
+                lambda gene: gene_disease_dict.get(gene, "")
+            )
         return adata
 
     def lookup(self) -> LookUp:
@@ -193,7 +204,9 @@ def set(self) -> None:
                     )
                 self.dictionary = data
                 targets = dict(ChainMap(*[data[cat] for cat in data]))
-                self.dataframe = pd.DataFrame([{"Compound": k, "Targets": v} for k, v in targets.items()])
+                self.dataframe = pd.DataFrame(
+                    [{"Compound": k, "Targets": v} for k, v in targets.items()]
+                )
                 self.dataframe.rename(
                     columns={"Targets": "targets", "Compound": "compounds"},
                     inplace=True,
@@ -203,7 +216,11 @@ def set(self) -> None:
                     raise ValueError(
                         "The dgidb data is in a wrong format. Please clear the cache and reinitialize the object."
                     )
-                self.dataframe = data.groupby("drug_claim_name")["gene_claim_name"].apply(list).reset_index()
+                self.dataframe = (
+                    data.groupby("drug_claim_name")["gene_claim_name"]
+                    .apply(list)
+                    .reset_index()
+                )
                 self.dataframe.rename(
                     columns={
                         "gene_claim_name": "targets",
@@ -211,13 +228,17 @@ def set(self) -> None:
                     },
                     inplace=True,
                 )
-                self.dictionary = self.dataframe.set_index("compounds")["targets"].to_dict()
+                self.dictionary = self.dataframe.set_index("compounds")[
+                    "targets"
+                ].to_dict()
             else:
                 if not isinstance(data, pd.DataFrame):
                     raise ValueError(
                         "The pharmGKB data is in a wrong format. Please clear the cache and reinitialize the object."
                     )
-                self.dataframe = data.groupby("Compound|Disease")["Gene"].apply(list).reset_index()
+                self.dataframe = (
+                    data.groupby("Compound|Disease")["Gene"].apply(list).reset_index()
+                )
                 self.dataframe.rename(
                     columns={
                         "Gene": "targets",
@@ -225,7 +246,9 @@ def set(self) -> None:
                     },
                     inplace=True,
                 )
-                self.dictionary = self.dataframe.set_index("compounds|diseases")["targets"].to_dict()
+                self.dictionary = self.dataframe.set_index("compounds|diseases")[
+                    "targets"
+                ].to_dict()
 
         def df(self) -> pd.DataFrame:
             if not self.loaded:

From 4fd29a0e12c1bcde982fa6e6218f14a3e8e98e4c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 6 Jun 2024 19:43:57 +0000
Subject: [PATCH 19/34] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pertpy/metadata/_drug.py | 43 ++++++++++------------------------------
 1 file changed, 10 insertions(+), 33 deletions(-)

diff --git a/pertpy/metadata/_drug.py b/pertpy/metadata/_drug.py
index 422a9718..0f6066eb 100644
--- a/pertpy/metadata/_drug.py
+++ b/pertpy/metadata/_drug.py
@@ -64,10 +64,7 @@ def _download_drug_annotation(
         pharmgkb_df = pharmgkb_df[pharmgkb_df["Association"] != "not associated"]
         pharmgkb_df = pharmgkb_df[
             (pharmgkb_df["Entity1_type"] == "Gene")
-            & (
-                (pharmgkb_df["Entity2_type"] == "Chemical")
-                | (pharmgkb_df["Entity2_type"] == "Disease")
-            )
+            & ((pharmgkb_df["Entity2_type"] == "Chemical") | (pharmgkb_df["Entity2_type"] == "Disease"))
         ]
         pharmgkb_df.rename(
             columns={
@@ -77,9 +74,7 @@ def _download_drug_annotation(
             },
             inplace=True,
         )
-        pharmgkb_df.drop(
-            ["Entity1_type", "Entity1_id", "Entity2_id"], axis=1, inplace=True
-        )
+        pharmgkb_df.drop(["Entity1_type", "Entity1_id", "Entity2_id"], axis=1, inplace=True)
 
         return pharmgkb_df
 
@@ -134,9 +129,7 @@ def annotate(
                 .to_dict()
             )
 
-            adata.var["compounds"] = adata.var_names.map(
-                lambda gene: gene_compound_dict.get(gene, "")
-            )
+            adata.var["compounds"] = adata.var_names.map(lambda gene: gene_compound_dict.get(gene, ""))
         else:
             compounds = interaction[interaction["Type"] == "Chemical"]
             exploded_df = compounds.explode("Gene")
@@ -146,9 +139,7 @@ def annotate(
                 .to_dict()
             )
 
-            adata.var["compounds"] = adata.var_names.map(
-                lambda gene: gene_compound_dict.get(gene, "")
-            )
+            adata.var["compounds"] = adata.var_names.map(lambda gene: gene_compound_dict.get(gene, ""))
             diseases = interaction[interaction["Type"] == "Disease"]
             exploded_df = diseases.explode("Gene")
             gene_disease_dict = (
@@ -157,9 +148,7 @@ def annotate(
                 .to_dict()
             )
 
-            adata.var["diseases"] = adata.var_names.map(
-                lambda gene: gene_disease_dict.get(gene, "")
-            )
+            adata.var["diseases"] = adata.var_names.map(lambda gene: gene_disease_dict.get(gene, ""))
         return adata
 
     def lookup(self) -> LookUp:
@@ -204,9 +193,7 @@ def set(self) -> None:
                     )
                 self.dictionary = data
                 targets = dict(ChainMap(*[data[cat] for cat in data]))
-                self.dataframe = pd.DataFrame(
-                    [{"Compound": k, "Targets": v} for k, v in targets.items()]
-                )
+                self.dataframe = pd.DataFrame([{"Compound": k, "Targets": v} for k, v in targets.items()])
                 self.dataframe.rename(
                     columns={"Targets": "targets", "Compound": "compounds"},
                     inplace=True,
@@ -216,11 +203,7 @@ def set(self) -> None:
                     raise ValueError(
                         "The dgidb data is in a wrong format. Please clear the cache and reinitialize the object."
                     )
-                self.dataframe = (
-                    data.groupby("drug_claim_name")["gene_claim_name"]
-                    .apply(list)
-                    .reset_index()
-                )
+                self.dataframe = data.groupby("drug_claim_name")["gene_claim_name"].apply(list).reset_index()
                 self.dataframe.rename(
                     columns={
                         "gene_claim_name": "targets",
@@ -228,17 +211,13 @@ def set(self) -> None:
                     },
                     inplace=True,
                 )
-                self.dictionary = self.dataframe.set_index("compounds")[
-                    "targets"
-                ].to_dict()
+                self.dictionary = self.dataframe.set_index("compounds")["targets"].to_dict()
             else:
                 if not isinstance(data, pd.DataFrame):
                     raise ValueError(
                         "The pharmGKB data is in a wrong format. Please clear the cache and reinitialize the object."
                     )
-                self.dataframe = (
-                    data.groupby("Compound|Disease")["Gene"].apply(list).reset_index()
-                )
+                self.dataframe = data.groupby("Compound|Disease")["Gene"].apply(list).reset_index()
                 self.dataframe.rename(
                     columns={
                         "Gene": "targets",
@@ -246,9 +225,7 @@ def set(self) -> None:
                     },
                     inplace=True,
                 )
-                self.dictionary = self.dataframe.set_index("compounds|diseases")[
-                    "targets"
-                ].to_dict()
+                self.dictionary = self.dataframe.set_index("compounds|diseases")["targets"].to_dict()
 
         def df(self) -> pd.DataFrame:
             if not self.loaded:

From 30baefecd309169d767450a8eb41a7164610bb04 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Fri, 7 Jun 2024 17:03:38 +0200
Subject: [PATCH 20/34] add bootstrapping and metrics_3g

---
 pertpy/tools/_distances/_distances.py    | 239 +++++++++++++++++++----
 pertpy/tools/_metrics_3g.py              | 214 ++++++++++++++++++++
 tests/tools/_distances/test_distances.py | 171 ++++++++++------
 3 files changed, 525 insertions(+), 99 deletions(-)
 create mode 100644 pertpy/tools/_metrics_3g.py

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 71b353c7..815da084 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -2,7 +2,7 @@
 
 import multiprocessing
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Literal, NamedTuple
 
 import numba
 import numpy as np
@@ -31,6 +31,11 @@
     from anndata import AnnData
 
 
+class MeanVar(NamedTuple):
+    mean: float
+    variance: float
+
+
 class Distance:
     """Distance class, used to compute distances between groups of cells.
 
@@ -86,7 +91,7 @@ class Distance:
         Average of the classification probability of the perturbation for a binary classifier.
     - "classifier_cp": classifier class projection
         Average of the class
-    - "mean_var_distribution": Distance between mean-variance distibutions between cells of 2 groups.
+    - "mean_var_distribution": Distance between mean-variance distributions between cells of 2 groups.
        Mean square distance between the mean-variance distributions of cells from 2 groups using Kernel Density Estimation (KDE).
     - "mahalanobis": Mahalanobis distance between the means of cells from two groups.
         It is originally used to measure distance between a point and a distribution.
@@ -124,7 +129,7 @@ def __init__(
                               Mutually exclusive with 'obsm_key'.
                               Is not used if `None`.
             obsm_key: Name of embedding in adata.obsm to use.
-                      Mutually exclusive with 'counts_layer_key'.
+                      Mutually exclusive with 'layer_key'.
                       Defaults to None, but is set to "X_pca" if not explicitly set internally.
             cell_wise_metric: Metric from scipy.spatial.distance to use for pairwise distances between single cells.
         """
@@ -178,8 +183,7 @@ def __init__(
 
         if layer_key and obsm_key:
             raise ValueError(
-                "Cannot use 'counts_layer_key' and 'obsm_key' at the same time.\n"
-                "Please provide only one of the two keys."
+                "Cannot use 'layer_key' and 'obsm_key' at the same time.\n" "Please provide only one of the two keys."
             )
         if not layer_key and not obsm_key:
             obsm_key = "X_pca"
@@ -221,15 +225,54 @@ def __call__(
 
         return self.metric_fct(X, Y, **kwargs)
 
+    def bootstrap(
+        self,
+        X: np.ndarray,
+        Y: np.ndarray,
+        *,
+        n_bootstrap: int = 100,
+        bootstrap_random_state: int = 0,
+        **kwargs,
+    ) -> MeanVar:
+        """Bootstrap computation of mean and variance of the distance between vectors X and Y.
+
+        Args:
+            X: First vector of shape (n_samples, n_features).
+            Y: Second vector of shape (n_samples, n_features).
+            n_bootstrap: Number of bootstrap samples.
+            bootstrap_random_state: Random state for bootstrapping.
+
+        Returns:
+            MeanVar: Mean and variance of distance between X and Y.
+
+        Examples:
+            >>> import pertpy as pt
+            >>> adata = pt.dt.distance_example()
+            >>> Distance = pt.tools.Distance(metric="edistance")
+            >>> X = adata.obsm["X_pca"][adata.obs["perturbation"] == "p-sgCREB1-2"]
+            >>> Y = adata.obsm["X_pca"][adata.obs["perturbation"] == "control"]
+            >>> D = Distance.bootstrap(X, Y)
+        """
+        return self._bootstrap_mode(
+            X,
+            Y,
+            n_bootstraps=n_bootstrap,
+            bootstrap_random_state=bootstrap_random_state,
+            **kwargs,
+        )
+
     def pairwise(
         self,
         adata: AnnData,
         groupby: str,
         groups: list[str] | None = None,
+        bootstrap: bool = False,
+        n_bootstrap: int = 100,
+        bootstrap_random_state: int = 0,
         show_progressbar: bool = True,
         n_jobs: int = -1,
         **kwargs,
-    ) -> pd.DataFrame:
+    ) -> pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]:
         """Get pairwise distances between groups of cells.
 
         Args:
@@ -237,12 +280,16 @@ def pairwise(
             groupby: Column name in adata.obs.
             groups: List of groups to compute pairwise distances for.
                     If None, uses all groups.
+            bootstrap: Whether to bootstrap the distance.
+            n_bootstrap: Number of bootstrap samples.
+            bootstrap_random_state: Random state for bootstrapping.
             show_progressbar: Whether to show progress bar.
             n_jobs: Number of cores to use. Defaults to -1 (all).
             kwargs: Additional keyword arguments passed to the metric function.
 
         Returns:
             pd.DataFrame: Dataframe with pairwise distances.
+            tuple[pd.DataFrame, pd.DataFrame]: Two Dataframes, one for the mean and one for the variance of pairwise distances.
 
         Examples:
             >>> import pertpy as pt
@@ -253,6 +300,8 @@ def pairwise(
         groups = adata.obs[groupby].unique() if groups is None else groups
         grouping = adata.obs[groupby].copy()
         df = pd.DataFrame(index=groups, columns=groups, dtype=float)
+        if bootstrap:
+            df_var = pd.DataFrame(index=groups, columns=groups, dtype=float)
         fct = track if show_progressbar else lambda iterable: iterable
 
         # Some metrics are able to handle precomputed distances. This means that
@@ -268,16 +317,29 @@ def pairwise(
             for index_x, group_x in enumerate(fct(groups)):
                 idx_x = grouping == group_x
                 for group_y in groups[index_x:]:  # type: ignore
-                    if group_x == group_y:
-                        dist = 0.0  # by distance axiom
+                    # subset the pairwise distance matrix to the two groups
+                    idx_y = grouping == group_y
+                    sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y]
+                    sub_idx = grouping[idx_x | idx_y] == group_x
+                    if not bootstrap:
+                        if group_x == group_y:
+                            dist = 0.0
+                        else:
+                            dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs)
+                        df.loc[group_x, group_y] = dist
+                        df.loc[group_y, group_x] = dist
+
                     else:
-                        idx_y = grouping == group_y
-                        # subset the pairwise distance matrix to the two groups
-                        sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y]
-                        sub_idx = grouping[idx_x | idx_y] == group_x
-                        dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs)
-                    df.loc[group_x, group_y] = dist
-                    df.loc[group_y, group_x] = dist
+                        bootstrap_output = self._bootstrap_mode_precomputed(
+                            sub_pwd,
+                            sub_idx,
+                            n_bootstraps=n_bootstrap,
+                            bootstrap_random_state=bootstrap_random_state,
+                            **kwargs,
+                        )
+                        # In the bootstrap case, distance of group to itself is a mean and can be non-zero
+                        df.loc[group_x, group_y] = df.loc[group_y, group_x] = bootstrap_output.mean
+                        df_var.loc[group_x, group_y] = df_var.loc[group_y, group_x] = bootstrap_output.variance
         else:
             if self.layer_key:
                 embedding = adata.layers[self.layer_key]
@@ -286,18 +348,39 @@ def pairwise(
             for index_x, group_x in enumerate(fct(groups)):
                 cells_x = embedding[grouping == group_x].copy()
                 for group_y in groups[index_x:]:  # type: ignore
-                    if group_x == group_y:
-                        dist = 0.0
+                    cells_y = embedding[grouping == group_y].copy()
+                    if not bootstrap:
+                        # By distance axiom, the distance between a group and itself is 0
+                        dist = 0.0 if group_x == group_y else self(cells_x, cells_y, **kwargs)
+
+                        df.loc[group_x, group_y] = dist
+                        df.loc[group_y, group_x] = dist
                     else:
-                        cells_y = embedding[grouping == group_y].copy()
-                        dist = self(cells_x, cells_y, **kwargs)
-                    df.loc[group_x, group_y] = dist
-                    df.loc[group_y, group_x] = dist
+                        bootstrap_output = self.bootstrap(
+                            cells_x,
+                            cells_y,
+                            n_bootstrap=n_bootstrap,
+                            bootstrap_random_state=bootstrap_random_state,
+                            **kwargs,
+                        )
+                        # In the bootstrap case, distance of group to itself is a mean and can be non-zero
+                        df.loc[group_x, group_y] = df.loc[group_y, group_x] = bootstrap_output.mean
+                        df_var.loc[group_x, group_y] = df_var.loc[group_y, group_x] = bootstrap_output.variance
+
         df.index.name = groupby
         df.columns.name = groupby
         df.name = f"pairwise {self.metric}"
 
-        return df
+        if not bootstrap:
+            return df
+        else:
+            df = df.fillna(0)
+            df_var.index.name = groupby
+            df_var.columns.name = groupby
+            df_var = df_var.fillna(0)
+            df_var.name = f"pairwise {self.metric} variance"
+
+            return df, df_var
 
     def onesided_distances(
         self,
@@ -305,10 +388,13 @@ def onesided_distances(
         groupby: str,
         selected_group: str | None = None,
         groups: list[str] | None = None,
+        bootstrap: bool = False,
+        n_bootstrap: int = 100,
+        bootstrap_random_state: int = 0,
         show_progressbar: bool = True,
         n_jobs: int = -1,
         **kwargs,
-    ) -> pd.DataFrame:
+    ) -> pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]:
         """Get distances between one selected cell group and the remaining other cell groups.
 
         Args:
@@ -317,12 +403,17 @@ def onesided_distances(
             selected_group: Group to compute pairwise distances to all other.
             groups: List of groups to compute distances to selected_group for.
                     If None, uses all groups.
+            bootstrap: Whether to bootstrap the distance.
+            n_bootstrap: Number of bootstrap samples.
+            bootstrap_random_state: Random state for bootstrapping.
             show_progressbar: Whether to show progress bar.
             n_jobs: Number of cores to use. Defaults to -1 (all).
             kwargs: Additional keyword arguments passed to the metric function.
 
         Returns:
             pd.DataFrame: Dataframe with distances of groups to selected_group.
+            tuple[pd.DataFrame, pd.DataFrame]: Two Dataframes, one for the mean and one for the variance of distances of groups to selected_group.
+
 
         Examples:
             >>> import pertpy as pt
@@ -331,6 +422,8 @@ def onesided_distances(
             >>> pairwise_df = Distance.onesided_distances(adata, groupby="perturbation", selected_group="control")
         """
         if self.metric == "classifier_cp":
+            if bootstrap:
+                raise NotImplementedError("Currently, ClassifierClassProjection does not support bootstrapping.")
             return self.metric_fct.onesided_distances(  # type: ignore
                 adata,
                 groupby,
@@ -344,13 +437,15 @@ def onesided_distances(
         groups = adata.obs[groupby].unique() if groups is None else groups
         grouping = adata.obs[groupby].copy()
         df = pd.Series(index=groups, dtype=float)
+        if bootstrap:
+            df_var = pd.Series(index=groups, dtype=float)
         fct = track if show_progressbar else lambda iterable: iterable
 
         # Some metrics are able to handle precomputed distances. This means that
         # the pairwise distances between all cells are computed once and then
         # passed to the metric function. This is much faster than computing the
         # pairwise distances for each group separately. Other metrics are not
-        # able to handle precomputed distances such as the PsuedobulkDistance.
+        # able to handle precomputed distances such as the PseudobulkDistance.
         if self.metric_fct.accepts_precomputed:
             # Precompute the pairwise distances if needed
             if f"{self.obsm_key}_{self.cell_wise_metric}_predistances" not in adata.obsp.keys():
@@ -360,14 +455,25 @@ def onesided_distances(
                 idx_x = grouping == group_x
                 group_y = selected_group
                 if group_x == group_y:
-                    dist = 0.0  # by distance axiom
+                    df.loc[group_x] = 0.0  # by distance axiom
                 else:
                     idx_y = grouping == group_y
                     # subset the pairwise distance matrix to the two groups
                     sub_pwd = pwd[idx_x | idx_y, :][:, idx_x | idx_y]
                     sub_idx = grouping[idx_x | idx_y] == group_x
-                    dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs)
-                df.loc[group_x] = dist
+                    if not bootstrap:
+                        dist = self.metric_fct.from_precomputed(sub_pwd, sub_idx, **kwargs)
+                        df.loc[group_x] = dist
+                    else:
+                        bootstrap_output = self._bootstrap_mode_precomputed(
+                            sub_pwd,
+                            sub_idx,
+                            n_bootstraps=n_bootstrap,
+                            bootstrap_random_state=bootstrap_random_state,
+                            **kwargs,
+                        )
+                        df.loc[group_x] = bootstrap_output.mean
+                        df_var.loc[group_x] = bootstrap_output.variance
         else:
             if self.layer_key:
                 embedding = adata.layers[self.layer_key]
@@ -376,15 +482,32 @@ def onesided_distances(
             for group_x in fct(groups):
                 cells_x = embedding[grouping == group_x].copy()
                 group_y = selected_group
-                if group_x == group_y:
-                    dist = 0.0
+                cells_y = embedding[grouping == group_y].copy()
+                if not bootstrap:
+                    # By distance axiom, the distance between a group and itself is 0
+                    dist = 0.0 if group_x == group_y else self(cells_x, cells_y, **kwargs)
+                    df.loc[group_x] = dist
                 else:
-                    cells_y = embedding[grouping == group_y].copy()
-                    dist = self(cells_x, cells_y, **kwargs)
-                df.loc[group_x] = dist
+                    bootstrap_output = self.bootstrap(
+                        cells_x,
+                        cells_y,
+                        n_bootstrap=n_bootstrap,
+                        bootstrap_random_state=bootstrap_random_state,
+                        **kwargs,
+                    )
+                    # In the bootstrap case, distance of group to itself is a mean and can be non-zero
+                    df.loc[group_x] = bootstrap_output.mean
+                    df_var.loc[group_x] = bootstrap_output.variance
         df.index.name = groupby
         df.name = f"{self.metric} to {selected_group}"
-        return df
+        if not bootstrap:
+            return df
+        else:
+            df_var.index.name = groupby
+            df_var = df_var.fillna(0)
+            df_var.name = f"pairwise {self.metric} variance to {selected_group}"
+
+            return df, df_var
 
     def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None:
         """Precompute pairwise distances between all cells, writes to adata.obsp.
@@ -410,6 +533,45 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None:
         pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs)
         adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd
 
+    def _bootstrap_mode(self, X, Y, n_bootstraps=100, bootstrap_random_state=0, **kwargs) -> MeanVar:
+        rng = np.random.default_rng(bootstrap_random_state)
+
+        distances = []
+        for _ in range(n_bootstraps):
+            X_bootstrapped = X[rng.choice(a=X.shape[0], size=X.shape[0], replace=True)]
+            Y_bootstrapped = Y[rng.choice(a=Y.shape[0], size=X.shape[0], replace=True)]
+
+            distance = self(X_bootstrapped, Y_bootstrapped, **kwargs)
+            distances.append(distance)
+
+        mean = np.mean(distances)
+        variance = np.var(distances)
+        return MeanVar(mean=mean, variance=variance)
+
+    def _bootstrap_mode_precomputed(
+        self, sub_pwd, sub_idx, n_bootstraps=100, bootstrap_random_state=0, **kwargs
+    ) -> MeanVar:
+        rng = np.random.default_rng(bootstrap_random_state)
+
+        distances = []
+        for _ in range(n_bootstraps):
+            # in order to maintain the number of cells for both groups (whatever balancing they may have),
+            # we sample the positive and negative indices separately
+            bootstrap_pos_idx = rng.choice(a=sub_idx[sub_idx].index, size=sub_idx[sub_idx].size, replace=True)
+            bootstrap_neg_idx = rng.choice(a=sub_idx[~sub_idx].index, size=sub_idx[~sub_idx].size, replace=True)
+            bootstrap_idx = np.concatenate([bootstrap_pos_idx, bootstrap_neg_idx])
+            bootstrap_idx_nrs = sub_idx.index.get_indexer(bootstrap_idx)
+
+            bootstrap_sub_idx = sub_idx[bootstrap_idx]
+            bootstrap_sub_pwd = sub_pwd[bootstrap_idx_nrs, :][:, bootstrap_idx_nrs]
+
+            distance = self.metric_fct.from_precomputed(bootstrap_sub_pwd, bootstrap_sub_idx, **kwargs)
+            distances.append(distance)
+
+        mean = np.mean(distances)
+        variance = np.var(distances)
+        return MeanVar(mean=mean, variance=variance)
+
 
 class AbstractDistance(ABC):
     """Abstract class of distance metrics between two sets of vectors."""
@@ -913,7 +1075,7 @@ def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
 
 class MeanVarDistributionDistance(AbstractDistance):
     """
-    Distance betweenv mean-var distributions of gene expression.
+    Distance between mean-var distributions of gene expression.
 
     """
 
@@ -922,7 +1084,7 @@ def __init__(self) -> None:
         self.accepts_precomputed = False
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        """Difference of mean-var distibutions in 2 matrices.
+        """Difference of mean-var distributions in 2 matrices.
 
         Args:
             X: Normalized and log transformed cells x genes count matrix.
@@ -996,10 +1158,15 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
+        cov = np.cov(X.T)
+        if cov.shape[0] == cov.shape[1] and np.linalg.matrix_rank(cov) == cov.shape[0]:  # check invertiblity
+            inverse_cov = np.linalg.inv(cov)
+        else:  # if not invertible, compute the (Moore-Penrose) pseudo-inverse of a matrix
+            inverse_cov = np.linalg.pinv(cov)
         return mahalanobis(
             self.aggregation_func(X, axis=0),
             self.aggregation_func(Y, axis=0),
-            np.linalg.inv(np.cov(X.T)),
+            inverse_cov,
         )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
diff --git a/pertpy/tools/_metrics_3g.py b/pertpy/tools/_metrics_3g.py
new file mode 100644
index 00000000..8eb951f5
--- /dev/null
+++ b/pertpy/tools/_metrics_3g.py
@@ -0,0 +1,214 @@
+from collections.abc import Mapping
+from functools import partial
+from types import MappingProxyType
+from typing import TYPE_CHECKING, Any, Literal, Optional
+
+import anndata as ad
+import numpy as np
+import pandas as pd
+import pynndescent
+import scanpy as sc
+from scipy.sparse import issparse
+from scipy.sparse import vstack as sp_vstack
+from sklearn.base import ClassifierMixin
+from sklearn.linear_model import LogisticRegression
+
+from pertpy.tools._distances._distances import Distance
+
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
+
+def compare_de(
+    X: np.ndarray, Y: np.ndarray, C: np.ndarray, shared_top: int = 100, **kwargs
+) -> dict:
+    """Compare DEG across real and simulated perturbations.
+
+    Computes DEG for real and simulated perturbations vs. control and calculates
+    metrics to evaluate similarity of the results.
+
+    Args:
+        X: Real perturbed data.
+        Y: Simulated perturbed data.
+        C: Control data
+        shared_top: The number of top DEG to compute the proportion of their intersection.
+        **kwargs: arguments for `scanpy.tl.rank_genes_groups`.
+    """
+    n_vars = X.shape[1]
+    assert n_vars == Y.shape[1] == C.shape[1]
+
+    shared_top = min(shared_top, n_vars)
+    vars_ranks = np.arange(1, n_vars + 1)
+
+    adatas_xy = {}
+    adatas_xy["x"] = ad.AnnData(X, obs={"label": "comp"})
+    adatas_xy["y"] = ad.AnnData(Y, obs={"label": "comp"})
+    adata_c = ad.AnnData(C, obs={"label": "ctrl"})
+
+    results = pd.DataFrame(index=adata_c.var_names)
+    top_names = []
+    for group in ("x", "y"):
+        adata_joint = ad.concat((adatas_xy[group], adata_c), index_unique="-")
+
+        sc.tl.rank_genes_groups(
+            adata_joint, groupby="label", reference="ctrl", key_added="de", **kwargs
+        )
+
+        srt_idx = np.argsort(adata_joint.uns["de"]["names"]["comp"])
+        results[f"scores_{group}"] = adata_joint.uns["de"]["scores"]["comp"][srt_idx]
+        results[f"pvals_adj_{group}"] = adata_joint.uns["de"]["pvals_adj"]["comp"][
+            srt_idx
+        ]
+        results[f"ranks_{group}"] = vars_ranks[srt_idx]
+
+        top_names.append(adata_joint.uns["de"]["names"]["comp"][:shared_top])
+
+    metrics = {}
+    metrics["shared_top_genes"] = (
+        len(set(top_names[0]).intersection(top_names[1])) / shared_top
+    )
+    metrics["scores_corr"] = results["scores_x"].corr(
+        results["scores_y"], method="pearson"
+    )
+    metrics["pvals_adj_corr"] = results["pvals_adj_x"].corr(
+        results["pvals_adj_y"], method="pearson"
+    )
+    metrics["scores_ranks_corr"] = results["ranks_x"].corr(
+        results["ranks_y"], method="spearman"
+    )
+
+    return metrics
+
+
+def compare_class(
+    X: np.ndarray, Y: np.ndarray, C: np.ndarray, clf: Optional[ClassifierMixin] = None
+) -> float:
+    """Compare classification accuracy between real and simulated perturbations.
+
+    Trains a classifier on the real perturbation data + the control data and reports a normalized
+    classification accuracy on the simulated perturbation.
+
+    Args:
+        X: Real perturbed data.
+        Y: Simulated perturbed data.
+        C: Control data
+        clf: sklearn classifier to use, `sklearn.linear_model.LogisticRegression` if not provided.
+    """
+    assert X.shape[1] == Y.shape[1] == C.shape[1]
+
+    if clf is None:
+        clf = LogisticRegression()
+
+    n_x = X.shape[0]
+    n_xc = n_x + C.shape[0]
+
+    data = sp_vstack((X, C)) if issparse(X) else np.vstack((X, C))
+
+    labels = np.full(n_xc, "ctrl")
+    labels[:n_x] = "comp"
+    clf.fit(data, labels)
+
+    norm_score = clf.score(Y, np.full(Y.shape[0], "comp")) / clf.score(X, labels[:n_x])
+    norm_score = min(1.0, norm_score)
+
+    return norm_score
+
+
+def compare_knn(
+    X: np.ndarray,
+    Y: np.ndarray,
+    C: Optional[np.ndarray] = None,
+    n_neighbors: int = 20,
+    use_Y_knn: bool = False,
+    random_state: int = 0,
+    n_jobs: int = 1,
+) -> dict[str, float]:
+    """Calculate proportions of real perturbed and control data points for simulated data.
+
+    Computes proportions of real perturbed (if provided), control and simulated (if `use_Y_knn=True`)
+    data points for simulated data. If control (`C`) is not provided, builds the knn graph from
+    real perturbed + simulated perturbed.
+
+    Args:
+        X: Real perturbed data.
+        Y: Simulated perturbed data.
+        C: Control data.
+        use_Y_knn: Include simulted perturbed data (`Y`) into the knn graph. Only valid when
+            control (`C`) is provided.
+    """
+    assert X.shape[1] == Y.shape[1]
+    if C is not None:
+        assert X.shape[1] == C.shape[1]
+
+    n_y = Y.shape[0]
+
+    if C is None:
+        index_data = sp_vstack((Y, X)) if issparse(X) else np.vstack((Y, X))
+    else:
+        datas = (Y, X, C) if use_Y_knn else (X, C)
+        index_data = sp_vstack(datas) if issparse(X) else np.vstack(datas)
+
+    y_in_index = use_Y_knn or C is None
+    c_in_index = C is not None
+    label_groups = ["comp"]
+    labels: NDArray[np.str_] = np.full(index_data.shape[0], "comp")
+    if y_in_index:
+        labels[:n_y] = "siml"
+        label_groups.append("siml")
+    if c_in_index:
+        labels[-C.shape[0] :] = "ctrl"
+        label_groups.append("ctrl")
+
+    index = pynndescent.NNDescent(
+        index_data,
+        n_neighbors=max(50, n_neighbors),
+        random_state=random_state,
+        n_jobs=n_jobs,
+    )
+    indices = index.query(Y, k=n_neighbors)[0]
+
+    uq, uq_counts = np.unique(labels[indices], return_counts=True)
+    uq_counts_norm = uq_counts / uq_counts.sum()
+    counts = dict(zip(label_groups, [0.0] * len(label_groups)))
+    for group, count_norm in zip(uq, uq_counts_norm):
+        counts[group] = count_norm
+
+    return counts
+
+
+def compare_dist(
+    pert: np.ndarray,
+    pred: np.ndarray,
+    ctrl: np.ndarray,
+    *,
+    metric: str = "euclidean",
+    mode: Literal["simple", "scaled"] = "simple",
+    metric_kwds: Mapping[str, Any] = MappingProxyType({}),
+    _fit_to_pert_and_ctrl: bool = False,
+) -> float:
+    """Compute the score of simulating a perturbation.
+
+    Args:
+        pert: Real perturbed data.
+        pred: Simulated perturbed data.
+        ctrl: Control data
+        kind: Kind of metric to use.
+    """
+    metric_fct = partial(Distance(metric).metric_fct, **metric_kwds)
+
+    if mode == "simple":
+        pass  # nothing to be done
+    elif mode == "scaled":
+        from sklearn.preprocessing import MinMaxScaler
+
+        scaler = MinMaxScaler().fit(
+            np.vstack((pert, ctrl)) if _fit_to_pert_and_ctrl else ctrl
+        )
+        pred = scaler.transform(pred)
+        pert = scaler.transform(pert)
+    else:
+        raise ValueError(f"Unknown mode {mod}. Please choose simple or scaled.")
+
+    d1 = metric_fct(pert, pred)
+    d2 = metric_fct(ctrl, pred)
+    return d1 / d2
diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py
index 0508d9ce..c093e03c 100644
--- a/tests/tools/_distances/test_distances.py
+++ b/tests/tools/_distances/test_distances.py
@@ -4,6 +4,7 @@
 import pytest
 import scanpy as sc
 from pandas import DataFrame, Series
+from pytest import fixture, mark
 
 actual_distances = [
     # Euclidean distances and related
@@ -28,25 +29,26 @@
 pseudo_counts_distances = ["nb_ll"]
 lognorm_counts_distances = ["mean_var_distribution"]
 all_distances = (
-    actual_distances + semi_distances + non_distances + pseudo_counts_distances + lognorm_counts_distances
+    actual_distances + semi_distances + non_distances + lognorm_counts_distances + pseudo_counts_distances
 )  # + onesided_only
 
 
-@pytest.fixture
-def all_pairwise_distances():
-    all_calulated_distances = {}
-    low_subsample_distances = [
-        "sym_kldiv",
-        "t_test",
-        "ks_test",
-        "classifier_proba",
-        "classifier_cp",
-        "mahalanobis",
-        "mean_var_distribution",
-    ]
-    no_subsample_distances = ["mahalanobis"]  # mahalanobis only works on the full data without subsampling
-
-    for distance in all_distances:
+class TestDistances:
+    @fixture
+    def adata(self, request):
+        low_subsample_distances = [
+            "sym_kldiv",
+            "t_test",
+            "ks_test",
+            "classifier_proba",
+            "classifier_cp",
+            "mahalanobis",
+            "mean_var_distribution",
+        ]
+        no_subsample_distances = ["mahalanobis"]  # mahalanobis only works on the full data without subsampling
+
+        distance = request.node.callspec.params["distance"]
+
         adata = pt.dt.distance_example()
         if distance not in no_subsample_distances:
             if distance in low_subsample_distances:
@@ -62,37 +64,48 @@ def all_pairwise_distances():
             groups = np.unique(adata.obs["perturbation"])
             # KDE is slow, subset to 5 groups for speed up
             adata = adata[adata.obs["perturbation"].isin(groups[0:5])].copy()
+
+        return adata
+
+    @fixture
+    def distance_obj(self, request):
+        distance = request.node.callspec.params["distance"]
+        if distance in lognorm_counts_distances:
             Distance = pt.tl.Distance(distance, layer_key="lognorm")
         elif distance in pseudo_counts_distances:
             Distance = pt.tl.Distance(distance, layer_key="counts")
         else:
             Distance = pt.tl.Distance(distance, obsm_key="X_pca")
-        df = Distance.pairwise(adata, groupby="perturbation", show_progressbar=True)
-        all_calulated_distances[distance] = df
+        return Distance
 
-    return all_calulated_distances
+    @mark.parametrize("distance", actual_distances + semi_distances + non_distances)
+    def test_distance(self, adata, distance):
+        Distance = pt.tl.Distance(distance, obsm_key="X_pca")
+        df = Distance.pairwise(adata, groupby="perturbation", show_progressbar=True)
 
+        assert isinstance(df, DataFrame)
 
-def test_distance_axioms(all_pairwise_distances):
-    for distance in actual_distances + semi_distances:
+    @mark.parametrize("distance", actual_distances + semi_distances)
+    def test_distance_axioms(self, adata, distance):
         # This is equivalent to testing for a semimetric, defined as fulfilling all axioms except triangle inequality.
-        df = all_pairwise_distances[distance]
+        Distance = pt.tl.Distance(distance, obsm_key="X_pca")
+        df = Distance.pairwise(adata, groupby="perturbation", show_progressbar=True)
 
         # (M1) Definiteness
         assert all(np.diag(df.values) == 0)  # distance to self is 0
 
         # (M2) Positivity
-        assert len(df) == np.sum(df.values == 0)  # distance to other is not 0
+        assert len(df) == np.sum(df.values == 0)  # distance to other is not 0 (TODO)
         assert all(df.values.flatten() >= 0)  # distance is non-negative
 
         # (M3) Symmetry
         assert np.sum(df.values - df.values.T) == 0
 
-
-def test_triangle_inequality(all_pairwise_distances):
-    for distance in actual_distances:
+    @mark.parametrize("distance", actual_distances)
+    def test_triangle_inequality(self, adata, distance):
         # Test if distances are well-defined in accordance with metric axioms
-        df = all_pairwise_distances[distance]
+        Distance = pt.tl.Distance(distance, obsm_key="X_pca")
+        df = Distance.pairwise(adata, groupby="perturbation", show_progressbar=True)
 
         # (M4) Triangle inequality (we just probe this for a few random triplets)
         for _i in range(10):
@@ -100,56 +113,88 @@ def test_triangle_inequality(all_pairwise_distances):
             triplet = rng.choice(df.index, size=3, replace=False)
             assert df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] >= df.loc[triplet[0], triplet[2]]
 
+    @mark.parametrize("distance", all_distances)
+    def test_distance_layers(self, adata, distance_obj, distance):
+        df = distance_obj.pairwise(adata, groupby="perturbation")
 
-def test_distance_layers(all_pairwise_distances):
-    for distance in all_distances:
-        df = all_pairwise_distances[distance]
-
-    assert isinstance(df, DataFrame)
-    assert df.columns.equals(df.index)
-    assert np.sum(df.values - df.values.T) == 0  # symmetry
-
+        assert isinstance(df, DataFrame)
+        assert df.columns.equals(df.index)
+        assert np.sum(df.values - df.values.T) == 0  # symmetry
 
-def test_distance_counts(all_pairwise_distances):
-    for distance in actual_distances + pseudo_counts_distances:
-        df = all_pairwise_distances[distance]
+    @mark.parametrize("distance", actual_distances + pseudo_counts_distances)
+    def test_distance_counts(self, adata, distance):
+        Distance = pt.tl.Distance(distance, layer_key="counts")
+        df = Distance.pairwise(adata, groupby="perturbation")
         assert isinstance(df, DataFrame)
         assert df.columns.equals(df.index)
         assert np.sum(df.values - df.values.T) == 0
 
-
-def test_mutually_exclusive_keys():
-    for distance in all_distances:
+    @mark.parametrize("distance", all_distances)
+    def test_mutually_exclusive_keys(self, distance):
         with pytest.raises(ValueError):
             _ = pt.tl.Distance(distance, layer_key="counts", obsm_key="X_pca")
 
+    @mark.parametrize("distance", actual_distances + semi_distances + non_distances)
+    def test_distance_output_type(self, distance):
+        # Test if distances are outputting floats
+        Distance = pt.tl.Distance(distance, obsm_key="X_pca")
+        rng = np.random.default_rng()
+        X = rng.normal(size=(100, 10))
+        Y = rng.normal(size=(100, 10))
+        d = Distance(X, Y)
+        assert isinstance(d, float)
 
-def test_distance_output_type(all_pairwise_distances):
-    # Test if distances are outputting floats
-    for distance in all_distances:
-        df = all_pairwise_distances[distance]
-        assert df.apply(lambda col: pd.api.types.is_float_dtype(col)).all(), "Not all values are floats."
-
-
-def test_distance_pairwise(all_pairwise_distances):
-    # Test consistency of pairwise distance results
-    for distance in all_distances:
-        df = all_pairwise_distances[distance]
+    @mark.parametrize("distance", all_distances)
+    def test_distance_pairwise(self, adata, distance_obj, distance):
+        # Test consistency of pairwise distance results
+        df = distance_obj.pairwise(adata, groupby="perturbation")
 
         assert isinstance(df, DataFrame)
         assert df.columns.equals(df.index)
         assert np.sum(df.values - df.values.T) == 0  # symmetry
 
-
-def test_distance_onesided():
-    # Test consistency of one-sided distance results
-    adata = pt.dt.distance_example()
-    adata = sc.pp.subsample(adata, 0.1, copy=True)
-    selected_group = adata.obs.perturbation.unique()[0]
-
-    for distance in onesided_only:
-        Distance = pt.tl.Distance(distance, obsm_key="X_pca")
-        df = Distance.onesided_distances(adata, groupby="perturbation", selected_group=selected_group)
-
+    @mark.parametrize("distance", all_distances + onesided_only)
+    def test_distance_onesided(self, adata, distance_obj, distance):
+        # Test consistency of one-sided distance results
+        selected_group = adata.obs.perturbation.unique()[0]
+        df = distance_obj.onesided_distances(adata, groupby="perturbation", selected_group=selected_group)
         assert isinstance(df, Series)
         assert df.loc[selected_group] == 0  # distance to self is 0
+
+    @mark.parametrize("distance", actual_distances + semi_distances + non_distances)
+    def test_bootstrap_distance_output_type(self, distance):
+        # Test if distances are outputting floats
+        Distance = pt.tl.Distance(distance, obsm_key="X_pca")
+        rng = np.random.default_rng()
+        X = rng.normal(size=(100, 10))
+        Y = rng.normal(size=(100, 10))
+        d = Distance.bootstrap(X, Y, n_bootstrap=10)
+        assert hasattr(d, "mean")
+        assert hasattr(d, "variance")
+
+    @mark.parametrize("distance", all_distances)
+    def test_bootstrap_distance_pairwise(self, adata, distance_obj, distance):
+        # Test consistency of pairwise distance results
+        bootstrap_output = distance_obj.pairwise(adata, groupby="perturbation", bootstrap=True, n_bootstrap=10)
+        assert isinstance(bootstrap_output, tuple)
+
+        mean = bootstrap_output[0]
+        var = bootstrap_output[1]
+
+        assert mean.columns.equals(mean.index)
+        assert np.sum(mean.values - mean.values.T) == 0  # symmetry
+        assert np.sum(var.values - var.values.T) == 0  # symmetry
+
+    @mark.parametrize("distance", all_distances)
+    def test_bootstrap_distance_onesided(self, adata, distance_obj, distance):
+        # Test consistency of one-sided distance results
+        selected_group = adata.obs.perturbation.unique()[0]
+        bootstrap_output = distance_obj.onesided_distances(
+            adata,
+            groupby="perturbation",
+            selected_group=selected_group,
+            bootstrap=True,
+            n_bootstrap=10,
+        )
+
+        assert isinstance(bootstrap_output, tuple)

From 2c8127c66009898c1483e916de4caf4620a98771 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Fri, 7 Jun 2024 21:42:48 +0200
Subject: [PATCH 21/34] speed up tests,

---
 pertpy/tools/_distances/_distances.py    |  9 +---
 pertpy/tools/_metrics_3g.py              | 46 +++++------------
 tests/tools/_distances/test_distances.py | 65 +++++++++---------------
 tests/tools/test_metrics_3g.py           | 55 ++++++++++++++++++++
 4 files changed, 95 insertions(+), 80 deletions(-)
 create mode 100644 tests/tools/test_metrics_3g.py

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index 815da084..d6a249ac 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -1158,15 +1158,8 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        cov = np.cov(X.T)
-        if cov.shape[0] == cov.shape[1] and np.linalg.matrix_rank(cov) == cov.shape[0]:  # check invertiblity
-            inverse_cov = np.linalg.inv(cov)
-        else:  # if not invertible, compute the (Moore-Penrose) pseudo-inverse of a matrix
-            inverse_cov = np.linalg.pinv(cov)
         return mahalanobis(
-            self.aggregation_func(X, axis=0),
-            self.aggregation_func(Y, axis=0),
-            inverse_cov,
+            self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0), np.linalg.inv(np.cov(X.T))
         )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
diff --git a/pertpy/tools/_metrics_3g.py b/pertpy/tools/_metrics_3g.py
index 8eb951f5..03dc2172 100644
--- a/pertpy/tools/_metrics_3g.py
+++ b/pertpy/tools/_metrics_3g.py
@@ -19,9 +19,7 @@
     from numpy.typing import NDArray
 
 
-def compare_de(
-    X: np.ndarray, Y: np.ndarray, C: np.ndarray, shared_top: int = 100, **kwargs
-) -> dict:
+def compare_de(X: np.ndarray, Y: np.ndarray, C: np.ndarray, shared_top: int = 100, **kwargs) -> dict:
     """Compare DEG across real and simulated perturbations.
 
     Computes DEG for real and simulated perturbations vs. control and calculates
@@ -50,39 +48,25 @@ def compare_de(
     for group in ("x", "y"):
         adata_joint = ad.concat((adatas_xy[group], adata_c), index_unique="-")
 
-        sc.tl.rank_genes_groups(
-            adata_joint, groupby="label", reference="ctrl", key_added="de", **kwargs
-        )
+        sc.tl.rank_genes_groups(adata_joint, groupby="label", reference="ctrl", key_added="de", **kwargs)
 
         srt_idx = np.argsort(adata_joint.uns["de"]["names"]["comp"])
         results[f"scores_{group}"] = adata_joint.uns["de"]["scores"]["comp"][srt_idx]
-        results[f"pvals_adj_{group}"] = adata_joint.uns["de"]["pvals_adj"]["comp"][
-            srt_idx
-        ]
+        results[f"pvals_adj_{group}"] = adata_joint.uns["de"]["pvals_adj"]["comp"][srt_idx]
         results[f"ranks_{group}"] = vars_ranks[srt_idx]
 
         top_names.append(adata_joint.uns["de"]["names"]["comp"][:shared_top])
 
     metrics = {}
-    metrics["shared_top_genes"] = (
-        len(set(top_names[0]).intersection(top_names[1])) / shared_top
-    )
-    metrics["scores_corr"] = results["scores_x"].corr(
-        results["scores_y"], method="pearson"
-    )
-    metrics["pvals_adj_corr"] = results["pvals_adj_x"].corr(
-        results["pvals_adj_y"], method="pearson"
-    )
-    metrics["scores_ranks_corr"] = results["ranks_x"].corr(
-        results["ranks_y"], method="spearman"
-    )
+    metrics["shared_top_genes"] = len(set(top_names[0]).intersection(top_names[1])) / shared_top
+    metrics["scores_corr"] = results["scores_x"].corr(results["scores_y"], method="pearson")
+    metrics["pvals_adj_corr"] = results["pvals_adj_x"].corr(results["pvals_adj_y"], method="pearson")
+    metrics["scores_ranks_corr"] = results["ranks_x"].corr(results["ranks_y"], method="spearman")
 
     return metrics
 
 
-def compare_class(
-    X: np.ndarray, Y: np.ndarray, C: np.ndarray, clf: Optional[ClassifierMixin] = None
-) -> float:
+def compare_class(X: np.ndarray, Y: np.ndarray, C: np.ndarray, clf: ClassifierMixin | None = None) -> float:
     """Compare classification accuracy between real and simulated perturbations.
 
     Trains a classifier on the real perturbation data + the control data and reports a normalized
@@ -117,7 +101,7 @@ def compare_class(
 def compare_knn(
     X: np.ndarray,
     Y: np.ndarray,
-    C: Optional[np.ndarray] = None,
+    C: np.ndarray | None = None,
     n_neighbors: int = 20,
     use_Y_knn: bool = False,
     random_state: int = 0,
@@ -169,8 +153,8 @@ def compare_knn(
 
     uq, uq_counts = np.unique(labels[indices], return_counts=True)
     uq_counts_norm = uq_counts / uq_counts.sum()
-    counts = dict(zip(label_groups, [0.0] * len(label_groups)))
-    for group, count_norm in zip(uq, uq_counts_norm):
+    counts = dict(zip(label_groups, [0.0] * len(label_groups), strict=False))
+    for group, count_norm in zip(uq, uq_counts_norm, strict=False):
         counts[group] = count_norm
 
     return counts
@@ -192,7 +176,7 @@ def compare_dist(
         pert: Real perturbed data.
         pred: Simulated perturbed data.
         ctrl: Control data
-        kind: Kind of metric to use.
+        mode: Mode to use.
     """
     metric_fct = partial(Distance(metric).metric_fct, **metric_kwds)
 
@@ -201,13 +185,11 @@ def compare_dist(
     elif mode == "scaled":
         from sklearn.preprocessing import MinMaxScaler
 
-        scaler = MinMaxScaler().fit(
-            np.vstack((pert, ctrl)) if _fit_to_pert_and_ctrl else ctrl
-        )
+        scaler = MinMaxScaler().fit(np.vstack((pert, ctrl)) if _fit_to_pert_and_ctrl else ctrl)
         pred = scaler.transform(pred)
         pert = scaler.transform(pert)
     else:
-        raise ValueError(f"Unknown mode {mod}. Please choose simple or scaled.")
+        raise ValueError(f"Unknown mode {mode}. Please choose simple or scaled.")
 
     d1 = metric_fct(pert, pred)
     d2 = metric_fct(ctrl, pred)
diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py
index c093e03c..441a46bd 100644
--- a/tests/tools/_distances/test_distances.py
+++ b/tests/tools/_distances/test_distances.py
@@ -78,56 +78,50 @@ def distance_obj(self, request):
             Distance = pt.tl.Distance(distance, obsm_key="X_pca")
         return Distance
 
-    @mark.parametrize("distance", actual_distances + semi_distances + non_distances)
-    def test_distance(self, adata, distance):
-        Distance = pt.tl.Distance(distance, obsm_key="X_pca")
-        df = Distance.pairwise(adata, groupby="perturbation", show_progressbar=True)
-
-        assert isinstance(df, DataFrame)
+    @fixture
+    @mark.parametrize("distance", all_distances)
+    def pairwise_distance(self, adata, distance_obj, distance):
+        return distance_obj.pairwise(adata, groupby="perturbation", show_progressbar=True)
 
     @mark.parametrize("distance", actual_distances + semi_distances)
-    def test_distance_axioms(self, adata, distance):
+    def test_distance_axioms(self, pairwise_distance, distance):
         # This is equivalent to testing for a semimetric, defined as fulfilling all axioms except triangle inequality.
-        Distance = pt.tl.Distance(distance, obsm_key="X_pca")
-        df = Distance.pairwise(adata, groupby="perturbation", show_progressbar=True)
-
         # (M1) Definiteness
-        assert all(np.diag(df.values) == 0)  # distance to self is 0
+        assert all(np.diag(pairwise_distance.values) == 0)  # distance to self is 0
 
         # (M2) Positivity
-        assert len(df) == np.sum(df.values == 0)  # distance to other is not 0 (TODO)
-        assert all(df.values.flatten() >= 0)  # distance is non-negative
+        assert len(pairwise_distance) == np.sum(pairwise_distance.values == 0)  # distance to other is not 0
+        assert all(pairwise_distance.values.flatten() >= 0)  # distance is non-negative
 
         # (M3) Symmetry
-        assert np.sum(df.values - df.values.T) == 0
+        assert np.sum(pairwise_distance.values - pairwise_distance.values.T) == 0
 
     @mark.parametrize("distance", actual_distances)
-    def test_triangle_inequality(self, adata, distance):
+    def test_triangle_inequality(self, pairwise_distance, distance):
         # Test if distances are well-defined in accordance with metric axioms
-        Distance = pt.tl.Distance(distance, obsm_key="X_pca")
-        df = Distance.pairwise(adata, groupby="perturbation", show_progressbar=True)
-
         # (M4) Triangle inequality (we just probe this for a few random triplets)
         for _i in range(10):
             rng = np.random.default_rng()
-            triplet = rng.choice(df.index, size=3, replace=False)
-            assert df.loc[triplet[0], triplet[1]] + df.loc[triplet[1], triplet[2]] >= df.loc[triplet[0], triplet[2]]
+            triplet = rng.choice(pairwise_distance.index, size=3, replace=False)
+            assert (
+                pairwise_distance.loc[triplet[0], triplet[1]] + pairwise_distance.loc[triplet[1], triplet[2]]
+                >= pairwise_distance.loc[triplet[0], triplet[2]]
+            )
 
     @mark.parametrize("distance", all_distances)
-    def test_distance_layers(self, adata, distance_obj, distance):
-        df = distance_obj.pairwise(adata, groupby="perturbation")
-
-        assert isinstance(df, DataFrame)
-        assert df.columns.equals(df.index)
-        assert np.sum(df.values - df.values.T) == 0  # symmetry
+    def test_distance_layers(self, pairwise_distance, distance):
+        assert isinstance(pairwise_distance, DataFrame)
+        assert pairwise_distance.columns.equals(pairwise_distance.index)
+        assert np.sum(pairwise_distance.values - pairwise_distance.values.T) == 0  # symmetry
 
     @mark.parametrize("distance", actual_distances + pseudo_counts_distances)
     def test_distance_counts(self, adata, distance):
-        Distance = pt.tl.Distance(distance, layer_key="counts")
-        df = Distance.pairwise(adata, groupby="perturbation")
-        assert isinstance(df, DataFrame)
-        assert df.columns.equals(df.index)
-        assert np.sum(df.values - df.values.T) == 0
+        if distance != "mahalanobis":  # doesn't work, covariance matrix is a singular matrix, not invertible
+            Distance = pt.tl.Distance(distance, layer_key="counts")
+            df = Distance.pairwise(adata, groupby="perturbation")
+            assert isinstance(df, DataFrame)
+            assert df.columns.equals(df.index)
+            assert np.sum(df.values - df.values.T) == 0
 
     @mark.parametrize("distance", all_distances)
     def test_mutually_exclusive_keys(self, distance):
@@ -144,15 +138,6 @@ def test_distance_output_type(self, distance):
         d = Distance(X, Y)
         assert isinstance(d, float)
 
-    @mark.parametrize("distance", all_distances)
-    def test_distance_pairwise(self, adata, distance_obj, distance):
-        # Test consistency of pairwise distance results
-        df = distance_obj.pairwise(adata, groupby="perturbation")
-
-        assert isinstance(df, DataFrame)
-        assert df.columns.equals(df.index)
-        assert np.sum(df.values - df.values.T) == 0  # symmetry
-
     @mark.parametrize("distance", all_distances + onesided_only)
     def test_distance_onesided(self, adata, distance_obj, distance):
         # Test consistency of one-sided distance results
diff --git a/tests/tools/test_metrics_3g.py b/tests/tools/test_metrics_3g.py
new file mode 100644
index 00000000..3339ac04
--- /dev/null
+++ b/tests/tools/test_metrics_3g.py
@@ -0,0 +1,55 @@
+import pertpy as pt
+import numpy as np
+import pytest
+from pertpy.tools._metrics_3g import (
+    compare_de,
+    compare_class,
+    compare_dist,
+    compare_knn,
+)
+
+
+class TestMetrics3G:
+    @pytest.fixture
+    def test_data(self):
+        rng = np.random.default_rng()
+        X = rng.normal(size=(100, 10))
+        Y = rng.normal(size=(100, 10))
+        C = rng.normal(size=(100, 10))
+        return X, Y, C
+
+    def test_compare_de(self, test_data):
+        X, Y, C = test_data
+        result = compare_de(X, Y, C, shared_top=5)
+        assert isinstance(result, dict)
+        required_keys = {
+            "shared_top_genes",
+            "scores_corr",
+            "pvals_adj_corr",
+            "scores_ranks_corr",
+        }
+        assert all(key in result for key in required_keys)
+
+    def test_compare_class(self, test_data):
+        X, Y, C = test_data
+        result = compare_class(X, Y, C)
+        assert result <= 1
+
+    def test_compare_knn(self, test_data):
+        X, Y, C = test_data
+        result = compare_knn(X, Y, C)
+        assert isinstance(result, dict)
+        assert "comp" in result
+        assert isinstance(result["comp"], float)
+
+        result_no_ctrl = compare_knn(X, Y)
+        assert isinstance(result_no_ctrl, dict)
+
+    def test_compare_dist(self, test_data):
+        X, Y, C = test_data
+        res_simple = compare_dist(X, Y, C, mode="simple")
+        assert isinstance(res_simple, float)
+        res_scaled = compare_dist(X, Y, C, mode="scaled")
+        assert isinstance(res_scaled, float)
+        with pytest.raises(ValueError):
+            compare_dist(X, Y, C, mode="new_mode")

From 57b14c384101d643d1c0f44808f2eedaef83d7ba Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Mon, 10 Jun 2024 11:36:47 +0200
Subject: [PATCH 22/34] remove test classes

---
 tests/tools/test_metrics_3g.py | 95 ++++++++++++++++++----------------
 1 file changed, 49 insertions(+), 46 deletions(-)

diff --git a/tests/tools/test_metrics_3g.py b/tests/tools/test_metrics_3g.py
index 3339ac04..11d6aebf 100644
--- a/tests/tools/test_metrics_3g.py
+++ b/tests/tools/test_metrics_3g.py
@@ -1,55 +1,58 @@
-import pertpy as pt
 import numpy as np
+import pertpy as pt
 import pytest
 from pertpy.tools._metrics_3g import (
-    compare_de,
     compare_class,
+    compare_de,
     compare_dist,
     compare_knn,
 )
 
 
-class TestMetrics3G:
-    @pytest.fixture
-    def test_data(self):
-        rng = np.random.default_rng()
-        X = rng.normal(size=(100, 10))
-        Y = rng.normal(size=(100, 10))
-        C = rng.normal(size=(100, 10))
-        return X, Y, C
-
-    def test_compare_de(self, test_data):
-        X, Y, C = test_data
-        result = compare_de(X, Y, C, shared_top=5)
-        assert isinstance(result, dict)
-        required_keys = {
-            "shared_top_genes",
-            "scores_corr",
-            "pvals_adj_corr",
-            "scores_ranks_corr",
-        }
-        assert all(key in result for key in required_keys)
-
-    def test_compare_class(self, test_data):
-        X, Y, C = test_data
-        result = compare_class(X, Y, C)
-        assert result <= 1
-
-    def test_compare_knn(self, test_data):
-        X, Y, C = test_data
-        result = compare_knn(X, Y, C)
-        assert isinstance(result, dict)
-        assert "comp" in result
-        assert isinstance(result["comp"], float)
-
-        result_no_ctrl = compare_knn(X, Y)
-        assert isinstance(result_no_ctrl, dict)
-
-    def test_compare_dist(self, test_data):
-        X, Y, C = test_data
-        res_simple = compare_dist(X, Y, C, mode="simple")
-        assert isinstance(res_simple, float)
-        res_scaled = compare_dist(X, Y, C, mode="scaled")
-        assert isinstance(res_scaled, float)
-        with pytest.raises(ValueError):
-            compare_dist(X, Y, C, mode="new_mode")
+@pytest.fixture
+def test_data():
+    rng = np.random.default_rng()
+    X = rng.normal(size=(100, 10))
+    Y = rng.normal(size=(100, 10))
+    C = rng.normal(size=(100, 10))
+    return X, Y, C
+
+
+def test_compare_de(test_data):
+    X, Y, C = test_data
+    result = compare_de(X, Y, C, shared_top=5)
+    assert isinstance(result, dict)
+    required_keys = {
+        "shared_top_genes",
+        "scores_corr",
+        "pvals_adj_corr",
+        "scores_ranks_corr",
+    }
+    assert all(key in result for key in required_keys)
+
+
+def test_compare_class(test_data):
+    X, Y, C = test_data
+    result = compare_class(X, Y, C)
+    assert result <= 1
+
+
+def test_compare_knn(test_data):
+    X, Y, C = test_data
+    result = compare_knn(X, Y, C)
+    assert isinstance(result, dict)
+    assert "comp" in result
+    assert isinstance(result["comp"], float)
+
+    result_no_ctrl = compare_knn(X, Y)
+    assert isinstance(result_no_ctrl, dict)
+
+
+def test_compare_dist(test_data):
+    X, Y, C = test_data
+    res_simple = compare_dist(X, Y, C, mode="simple")
+    assert isinstance(res_simple, float)
+    res_scaled = compare_dist(X, Y, C, mode="scaled")
+    assert isinstance(res_scaled, float)
+    with pytest.raises(ValueError):
+        compare_dist(X, Y, C, mode="new_mode")

From 78d00fafa772da8848b89f3defecf82c7745b8d3 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Mon, 10 Jun 2024 12:34:35 +0200
Subject: [PATCH 23/34] drop test classes

---
 tests/tools/_distances/test_distances.py | 301 ++++++++++++-----------
 1 file changed, 156 insertions(+), 145 deletions(-)

diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py
index 441a46bd..b9ffe3a2 100644
--- a/tests/tools/_distances/test_distances.py
+++ b/tests/tools/_distances/test_distances.py
@@ -33,153 +33,164 @@
 )  # + onesided_only
 
 
-class TestDistances:
-    @fixture
-    def adata(self, request):
-        low_subsample_distances = [
-            "sym_kldiv",
-            "t_test",
-            "ks_test",
-            "classifier_proba",
-            "classifier_cp",
-            "mahalanobis",
-            "mean_var_distribution",
-        ]
-        no_subsample_distances = ["mahalanobis"]  # mahalanobis only works on the full data without subsampling
-
-        distance = request.node.callspec.params["distance"]
-
-        adata = pt.dt.distance_example()
-        if distance not in no_subsample_distances:
-            if distance in low_subsample_distances:
-                adata = sc.pp.subsample(adata, 0.1, copy=True)
-            else:
-                adata = sc.pp.subsample(adata, 0.001, copy=True)
-
-        adata.layers["lognorm"] = adata.X.copy()
-        adata.layers["counts"] = np.round(adata.X.toarray()).astype(int)
-        if "X_pca" not in adata.obsm.keys():
-            sc.pp.pca(adata, n_comps=5)
-        if distance in lognorm_counts_distances:
-            groups = np.unique(adata.obs["perturbation"])
-            # KDE is slow, subset to 5 groups for speed up
-            adata = adata[adata.obs["perturbation"].isin(groups[0:5])].copy()
-
-        return adata
-
-    @fixture
-    def distance_obj(self, request):
-        distance = request.node.callspec.params["distance"]
-        if distance in lognorm_counts_distances:
-            Distance = pt.tl.Distance(distance, layer_key="lognorm")
-        elif distance in pseudo_counts_distances:
-            Distance = pt.tl.Distance(distance, layer_key="counts")
+@fixture
+def adata(request):
+    low_subsample_distances = [
+        "sym_kldiv",
+        "t_test",
+        "ks_test",
+        "classifier_proba",
+        "classifier_cp",
+        "mahalanobis",
+        "mean_var_distribution",
+    ]
+    no_subsample_distances = ["mahalanobis"]  # mahalanobis only works on the full data without subsampling
+
+    distance = request.node.callspec.params["distance"]
+
+    adata = pt.dt.distance_example()
+    if distance not in no_subsample_distances:
+        if distance in low_subsample_distances:
+            adata = sc.pp.subsample(adata, 0.1, copy=True)
         else:
-            Distance = pt.tl.Distance(distance, obsm_key="X_pca")
-        return Distance
-
-    @fixture
-    @mark.parametrize("distance", all_distances)
-    def pairwise_distance(self, adata, distance_obj, distance):
-        return distance_obj.pairwise(adata, groupby="perturbation", show_progressbar=True)
-
-    @mark.parametrize("distance", actual_distances + semi_distances)
-    def test_distance_axioms(self, pairwise_distance, distance):
-        # This is equivalent to testing for a semimetric, defined as fulfilling all axioms except triangle inequality.
-        # (M1) Definiteness
-        assert all(np.diag(pairwise_distance.values) == 0)  # distance to self is 0
-
-        # (M2) Positivity
-        assert len(pairwise_distance) == np.sum(pairwise_distance.values == 0)  # distance to other is not 0
-        assert all(pairwise_distance.values.flatten() >= 0)  # distance is non-negative
-
-        # (M3) Symmetry
-        assert np.sum(pairwise_distance.values - pairwise_distance.values.T) == 0
-
-    @mark.parametrize("distance", actual_distances)
-    def test_triangle_inequality(self, pairwise_distance, distance):
-        # Test if distances are well-defined in accordance with metric axioms
-        # (M4) Triangle inequality (we just probe this for a few random triplets)
-        for _i in range(10):
-            rng = np.random.default_rng()
-            triplet = rng.choice(pairwise_distance.index, size=3, replace=False)
-            assert (
-                pairwise_distance.loc[triplet[0], triplet[1]] + pairwise_distance.loc[triplet[1], triplet[2]]
-                >= pairwise_distance.loc[triplet[0], triplet[2]]
-            )
-
-    @mark.parametrize("distance", all_distances)
-    def test_distance_layers(self, pairwise_distance, distance):
-        assert isinstance(pairwise_distance, DataFrame)
-        assert pairwise_distance.columns.equals(pairwise_distance.index)
-        assert np.sum(pairwise_distance.values - pairwise_distance.values.T) == 0  # symmetry
-
-    @mark.parametrize("distance", actual_distances + pseudo_counts_distances)
-    def test_distance_counts(self, adata, distance):
-        if distance != "mahalanobis":  # doesn't work, covariance matrix is a singular matrix, not invertible
-            Distance = pt.tl.Distance(distance, layer_key="counts")
-            df = Distance.pairwise(adata, groupby="perturbation")
-            assert isinstance(df, DataFrame)
-            assert df.columns.equals(df.index)
-            assert np.sum(df.values - df.values.T) == 0
-
-    @mark.parametrize("distance", all_distances)
-    def test_mutually_exclusive_keys(self, distance):
-        with pytest.raises(ValueError):
-            _ = pt.tl.Distance(distance, layer_key="counts", obsm_key="X_pca")
-
-    @mark.parametrize("distance", actual_distances + semi_distances + non_distances)
-    def test_distance_output_type(self, distance):
-        # Test if distances are outputting floats
-        Distance = pt.tl.Distance(distance, obsm_key="X_pca")
-        rng = np.random.default_rng()
-        X = rng.normal(size=(100, 10))
-        Y = rng.normal(size=(100, 10))
-        d = Distance(X, Y)
-        assert isinstance(d, float)
-
-    @mark.parametrize("distance", all_distances + onesided_only)
-    def test_distance_onesided(self, adata, distance_obj, distance):
-        # Test consistency of one-sided distance results
-        selected_group = adata.obs.perturbation.unique()[0]
-        df = distance_obj.onesided_distances(adata, groupby="perturbation", selected_group=selected_group)
-        assert isinstance(df, Series)
-        assert df.loc[selected_group] == 0  # distance to self is 0
-
-    @mark.parametrize("distance", actual_distances + semi_distances + non_distances)
-    def test_bootstrap_distance_output_type(self, distance):
-        # Test if distances are outputting floats
+            adata = sc.pp.subsample(adata, 0.001, copy=True)
+
+    adata.layers["lognorm"] = adata.X.copy()
+    adata.layers["counts"] = np.round(adata.X.toarray()).astype(int)
+    if "X_pca" not in adata.obsm.keys():
+        sc.pp.pca(adata, n_comps=5)
+    if distance in lognorm_counts_distances:
+        groups = np.unique(adata.obs["perturbation"])
+        # KDE is slow, subset to 5 groups for speed up
+        adata = adata[adata.obs["perturbation"].isin(groups[0:5])].copy()
+
+    return adata
+
+
+@fixture
+def distance_obj(request):
+    distance = request.node.callspec.params["distance"]
+    if distance in lognorm_counts_distances:
+        Distance = pt.tl.Distance(distance, layer_key="lognorm")
+    elif distance in pseudo_counts_distances:
+        Distance = pt.tl.Distance(distance, layer_key="counts")
+    else:
         Distance = pt.tl.Distance(distance, obsm_key="X_pca")
+    return Distance
+
+
+@fixture
+@mark.parametrize("distance", all_distances)
+def pairwise_distance(adata, distance_obj, distance):
+    return distance_obj.pairwise(adata, groupby="perturbation", show_progressbar=True)
+
+
+@mark.parametrize("distance", actual_distances + semi_distances)
+def test_distance_axioms(pairwise_distance, distance):
+    # This is equivalent to testing for a semimetric, defined as fulfilling all axioms except triangle inequality.
+    # (M1) Definiteness
+    assert all(np.diag(pairwise_distance.values) == 0)  # distance to self is 0
+
+    # (M2) Positivity
+    assert len(pairwise_distance) == np.sum(pairwise_distance.values == 0)  # distance to other is not 0
+    assert all(pairwise_distance.values.flatten() >= 0)  # distance is non-negative
+
+    # (M3) Symmetry
+    assert np.sum(pairwise_distance.values - pairwise_distance.values.T) == 0
+
+
+@mark.parametrize("distance", actual_distances)
+def test_triangle_inequality(pairwise_distance, distance):
+    # Test if distances are well-defined in accordance with metric axioms
+    # (M4) Triangle inequality (we just probe this for a few random triplets)
+    for _i in range(10):
         rng = np.random.default_rng()
-        X = rng.normal(size=(100, 10))
-        Y = rng.normal(size=(100, 10))
-        d = Distance.bootstrap(X, Y, n_bootstrap=10)
-        assert hasattr(d, "mean")
-        assert hasattr(d, "variance")
-
-    @mark.parametrize("distance", all_distances)
-    def test_bootstrap_distance_pairwise(self, adata, distance_obj, distance):
-        # Test consistency of pairwise distance results
-        bootstrap_output = distance_obj.pairwise(adata, groupby="perturbation", bootstrap=True, n_bootstrap=10)
-        assert isinstance(bootstrap_output, tuple)
-
-        mean = bootstrap_output[0]
-        var = bootstrap_output[1]
-
-        assert mean.columns.equals(mean.index)
-        assert np.sum(mean.values - mean.values.T) == 0  # symmetry
-        assert np.sum(var.values - var.values.T) == 0  # symmetry
-
-    @mark.parametrize("distance", all_distances)
-    def test_bootstrap_distance_onesided(self, adata, distance_obj, distance):
-        # Test consistency of one-sided distance results
-        selected_group = adata.obs.perturbation.unique()[0]
-        bootstrap_output = distance_obj.onesided_distances(
-            adata,
-            groupby="perturbation",
-            selected_group=selected_group,
-            bootstrap=True,
-            n_bootstrap=10,
+        triplet = rng.choice(pairwise_distance.index, size=3, replace=False)
+        assert (
+            pairwise_distance.loc[triplet[0], triplet[1]] + pairwise_distance.loc[triplet[1], triplet[2]]
+            >= pairwise_distance.loc[triplet[0], triplet[2]]
         )
 
-        assert isinstance(bootstrap_output, tuple)
+
+@mark.parametrize("distance", all_distances)
+def test_distance_layers(pairwise_distance, distance):
+    assert isinstance(pairwise_distance, DataFrame)
+    assert pairwise_distance.columns.equals(pairwise_distance.index)
+    assert np.sum(pairwise_distance.values - pairwise_distance.values.T) == 0  # symmetry
+
+
+@mark.parametrize("distance", actual_distances + pseudo_counts_distances)
+def test_distance_counts(adata, distance):
+    if distance != "mahalanobis":  # doesn't work, covariance matrix is a singular matrix, not invertible
+        Distance = pt.tl.Distance(distance, layer_key="counts")
+        df = Distance.pairwise(adata, groupby="perturbation")
+        assert isinstance(df, DataFrame)
+        assert df.columns.equals(df.index)
+        assert np.sum(df.values - df.values.T) == 0
+
+
+@mark.parametrize("distance", all_distances)
+def test_mutually_exclusive_keys(distance):
+    with pytest.raises(ValueError):
+        _ = pt.tl.Distance(distance, layer_key="counts", obsm_key="X_pca")
+
+
+@mark.parametrize("distance", actual_distances + semi_distances + non_distances)
+def test_distance_output_type(distance):
+    # Test if distances are outputting floats
+    Distance = pt.tl.Distance(distance, obsm_key="X_pca")
+    rng = np.random.default_rng()
+    X = rng.normal(size=(100, 10))
+    Y = rng.normal(size=(100, 10))
+    d = Distance(X, Y)
+    assert isinstance(d, float)
+
+
+@mark.parametrize("distance", all_distances + onesided_only)
+def test_distance_onesided(adata, distance_obj, distance):
+    # Test consistency of one-sided distance results
+    selected_group = adata.obs.perturbation.unique()[0]
+    df = distance_obj.onesided_distances(adata, groupby="perturbation", selected_group=selected_group)
+    assert isinstance(df, Series)
+    assert df.loc[selected_group] == 0  # distance to self is 0
+
+
+@mark.parametrize("distance", actual_distances + semi_distances + non_distances)
+def test_bootstrap_distance_output_type(distance):
+    # Test if distances are outputting floats
+    Distance = pt.tl.Distance(distance, obsm_key="X_pca")
+    rng = np.random.default_rng()
+    X = rng.normal(size=(100, 10))
+    Y = rng.normal(size=(100, 10))
+    d = Distance.bootstrap(X, Y, n_bootstrap=10)
+    assert hasattr(d, "mean")
+    assert hasattr(d, "variance")
+
+
+@mark.parametrize("distance", all_distances)
+def test_bootstrap_distance_pairwise(adata, distance_obj, distance):
+    # Test consistency of pairwise distance results
+    bootstrap_output = distance_obj.pairwise(adata, groupby="perturbation", bootstrap=True, n_bootstrap=10)
+    assert isinstance(bootstrap_output, tuple)
+
+    mean = bootstrap_output[0]
+    var = bootstrap_output[1]
+
+    assert mean.columns.equals(mean.index)
+    assert np.sum(mean.values - mean.values.T) == 0  # symmetry
+    assert np.sum(var.values - var.values.T) == 0  # symmetry
+
+
+@mark.parametrize("distance", all_distances)
+def test_bootstrap_distance_onesided(adata, distance_obj, distance):
+    # Test consistency of one-sided distance results
+    selected_group = adata.obs.perturbation.unique()[0]
+    bootstrap_output = distance_obj.onesided_distances(
+        adata,
+        groupby="perturbation",
+        selected_group=selected_group,
+        bootstrap=True,
+        n_bootstrap=10,
+    )
+
+    assert isinstance(bootstrap_output, tuple)

From 052fd000044d01b6154f1c54954d694ae35a45d1 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Wed, 12 Jun 2024 23:36:14 +0200
Subject: [PATCH 24/34] update compare_de

---
 pertpy/tools/_metrics_3g.py    | 107 +++++++++++++++++++++------------
 tests/tools/test_metrics_3g.py |  93 +++++++++++++++++++++++-----
 2 files changed, 148 insertions(+), 52 deletions(-)

diff --git a/pertpy/tools/_metrics_3g.py b/pertpy/tools/_metrics_3g.py
index 03dc2172..c73d719c 100644
--- a/pertpy/tools/_metrics_3g.py
+++ b/pertpy/tools/_metrics_3g.py
@@ -1,13 +1,12 @@
 from collections.abc import Mapping
 from functools import partial
 from types import MappingProxyType
-from typing import TYPE_CHECKING, Any, Literal, Optional
+from typing import TYPE_CHECKING, Any, Literal
 
-import anndata as ad
 import numpy as np
 import pandas as pd
 import pynndescent
-import scanpy as sc
+from anndata import AnnData
 from scipy.sparse import issparse
 from scipy.sparse import vstack as sp_vstack
 from sklearn.base import ClassifierMixin
@@ -19,49 +18,83 @@
     from numpy.typing import NDArray
 
 
-def compare_de(X: np.ndarray, Y: np.ndarray, C: np.ndarray, shared_top: int = 100, **kwargs) -> dict:
-    """Compare DEG across real and simulated perturbations.
+def compare_de(
+    adata: AnnData | None = None,
+    de_key1: str = None,
+    de_key2: str = None,
+    de_df1: pd.DataFrame | None = None,
+    de_df2: pd.DataFrame | None = None,
+    shared_top: int = 100,
+) -> dict:
+    """Compare two differential expression analyses.
 
-    Computes DEG for real and simulated perturbations vs. control and calculates
-    metrics to evaluate similarity of the results.
+    Compare two sets of DE results and evaluate the similarity by the overlap of top DEG and
+    the correlation of their scores and adjusted p-values.
 
     Args:
-        X: Real perturbed data.
-        Y: Simulated perturbed data.
-        C: Control data
+        adata: AnnData object containing DE results in `uns`. Required if `de_key1` and `de_key2` are used.
+        de_key1: Key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
+        de_key2: Another key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
+        de_df1: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
+        de_df2: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
         shared_top: The number of top DEG to compute the proportion of their intersection.
-        **kwargs: arguments for `scanpy.tl.rank_genes_groups`.
-    """
-    n_vars = X.shape[1]
-    assert n_vars == Y.shape[1] == C.shape[1]
-
-    shared_top = min(shared_top, n_vars)
-    vars_ranks = np.arange(1, n_vars + 1)
 
-    adatas_xy = {}
-    adatas_xy["x"] = ad.AnnData(X, obs={"label": "comp"})
-    adatas_xy["y"] = ad.AnnData(Y, obs={"label": "comp"})
-    adata_c = ad.AnnData(C, obs={"label": "ctrl"})
-
-    results = pd.DataFrame(index=adata_c.var_names)
+    """
+    if (de_key1 or de_key2) and (de_df1 is not None or de_df2 is not None):
+        raise ValueError(
+            "Please provide either both `de_key1` and `de_key2` with `adata`, or `de_df1` and `de_df2`, but not both."
+        )
+
+    if de_df1 is None and de_df2 is None:  # use df
+        if not de_key1 or not de_key2:
+            raise ValueError("Both `de_key1` and `de_key2` must be provided together if using `adata`.")
+
+    else:  # use keys
+        if de_df1 is None or de_df2 is None:
+            raise ValueError("Both `de_df1` and `de_df2` must be provided together if using dataframes.")
+
+    if de_key1:
+        if not adata:
+            raise ValueError("`adata` should be provided with `de_key1` and `de_key2`. ")
+        assert all(
+            k in adata.uns for k in [de_key1, de_key2]
+        ), "Provided `de_key1` and `de_key2` must exist in `adata.uns`."
+        vars = adata.var_names
+
+    if de_df1 is not None:
+        for df in (de_df1, de_df2):
+            if not {"variable", "log_fc", "adj_p_value"}.issubset(df.columns):
+                raise ValueError("Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'.")
+
+        assert set(de_df1["variable"]) == set(de_df2["variable"]), "Variables in both dataframes must match."
+        vars = de_df1["variable"].sort_values()
+
+    shared_top = min(shared_top, len(vars))
+    vars_ranks = np.arange(1, len(vars) + 1)
+    results = pd.DataFrame(index=vars)
     top_names = []
-    for group in ("x", "y"):
-        adata_joint = ad.concat((adatas_xy[group], adata_c), index_unique="-")
-
-        sc.tl.rank_genes_groups(adata_joint, groupby="label", reference="ctrl", key_added="de", **kwargs)
 
-        srt_idx = np.argsort(adata_joint.uns["de"]["names"]["comp"])
-        results[f"scores_{group}"] = adata_joint.uns["de"]["scores"]["comp"][srt_idx]
-        results[f"pvals_adj_{group}"] = adata_joint.uns["de"]["pvals_adj"]["comp"][srt_idx]
-        results[f"ranks_{group}"] = vars_ranks[srt_idx]
-
-        top_names.append(adata_joint.uns["de"]["names"]["comp"][:shared_top])
+    if de_key1 and de_key2:
+        for i, k in enumerate([de_key1, de_key2]):
+            label = adata.uns[k]["names"].dtype.names[0]
+            srt_idx = np.argsort(adata.uns[k]["names"][label])
+            results[f"scores_{i}"] = adata.uns[k]["scores"][label][srt_idx]
+            results[f"pvals_adj_{i}"] = adata.uns[k]["pvals_adj"][label][srt_idx]
+            results[f"ranks_{i}"] = vars_ranks[srt_idx]
+            top_names.append(adata.uns[k]["names"][label][:shared_top])
+    else:
+        for i, df in enumerate([de_df1, de_df2]):
+            srt_idx = np.argsort(df["variable"])
+            results[f"scores_{i}"] = df["log_fc"].values[srt_idx]
+            results[f"pvals_adj_{i}"] = df["adj_p_value"].values[srt_idx]
+            results[f"ranks_{i}"] = vars_ranks[srt_idx]
+            top_names.append(df["variable"][:shared_top])
 
     metrics = {}
     metrics["shared_top_genes"] = len(set(top_names[0]).intersection(top_names[1])) / shared_top
-    metrics["scores_corr"] = results["scores_x"].corr(results["scores_y"], method="pearson")
-    metrics["pvals_adj_corr"] = results["pvals_adj_x"].corr(results["pvals_adj_y"], method="pearson")
-    metrics["scores_ranks_corr"] = results["ranks_x"].corr(results["ranks_y"], method="spearman")
+    metrics["scores_corr"] = results["scores_0"].corr(results["scores_1"], method="pearson")
+    metrics["pvals_adj_corr"] = results["pvals_adj_0"].corr(results["pvals_adj_1"], method="pearson")
+    metrics["scores_ranks_corr"] = results["ranks_0"].corr(results["ranks_1"], method="spearman")
 
     return metrics
 
@@ -160,7 +193,7 @@ def compare_knn(
     return counts
 
 
-def compare_dist(
+def compare_distance(
     pert: np.ndarray,
     pred: np.ndarray,
     ctrl: np.ndarray,
diff --git a/tests/tools/test_metrics_3g.py b/tests/tools/test_metrics_3g.py
index 11d6aebf..b8891e71 100644
--- a/tests/tools/test_metrics_3g.py
+++ b/tests/tools/test_metrics_3g.py
@@ -1,10 +1,14 @@
+from typing import TYPE_CHECKING
+
 import numpy as np
+import pandas as pd
 import pertpy as pt
 import pytest
+from anndata import AnnData
 from pertpy.tools._metrics_3g import (
     compare_class,
     compare_de,
-    compare_dist,
+    compare_distance,
     compare_knn,
 )
 
@@ -18,17 +22,76 @@ def test_data():
     return X, Y, C
 
 
-def test_compare_de(test_data):
-    X, Y, C = test_data
-    result = compare_de(X, Y, C, shared_top=5)
-    assert isinstance(result, dict)
-    required_keys = {
-        "shared_top_genes",
-        "scores_corr",
-        "pvals_adj_corr",
-        "scores_ranks_corr",
+@pytest.fixture
+def compare_de_adata():
+    rng = np.random.default_rng()
+    adata = AnnData(rng.normal(size=(100, 10)))
+    genes = np.rec.fromarrays(
+        [np.array([f"gene{i}" for i in range(10)])],
+        names=["group1", "O"],
+    )
+    adata.uns["de_key1"] = {
+        "names": genes,
+        "scores": {"group1": rng.random(10)},
+        "pvals_adj": {"group1": rng.random(10)},
+    }
+    adata.uns["de_key2"] = {
+        "names": genes,
+        "scores": {"group1": rng.random(10)},
+        "pvals_adj": {"group1": rng.random(10)},
     }
-    assert all(key in result for key in required_keys)
+    return adata
+
+
+@pytest.fixture
+def compare_de_dataframe():
+    rng = np.random.default_rng()
+    df1 = pd.DataFrame(
+        {
+            "variable": ["gene" + str(i) for i in range(10)],
+            "log_fc": rng.random(10),
+            "adj_p_value": rng.random(10),
+        }
+    )
+    df2 = pd.DataFrame(
+        {
+            "variable": ["gene" + str(i) for i in range(10)],
+            "log_fc": rng.random(10),
+            "adj_p_value": rng.random(10),
+        }
+    )
+    return df1, df2
+
+
+def test_error_both_keys_and_dfs(compare_de_adata, compare_de_dataframe):
+    with pytest.raises(ValueError):
+        compare_de(adata=compare_de_adata, de_key1="de_key1", de_df1=compare_de_dataframe[0])
+
+
+def test_error_missing_adata():
+    with pytest.raises(ValueError):
+        compare_de(de_key1="de_key1", de_key2="de_key2")
+
+
+def test_error_missing_df(compare_de_dataframe):
+    with pytest.raises(ValueError):
+        compare_de(de_df1=compare_de_dataframe[0])
+
+
+def test_compare_de_key(compare_de_adata):
+    results = compare_de(adata=compare_de_adata, de_key1="de_key1", de_key2="de_key2", shared_top=5)
+    assert "shared_top_genes" in results
+    assert "scores_corr" in results
+    assert "pvals_adj_corr" in results
+    assert "scores_ranks_corr" in results
+
+
+def test_compare_de_df(compare_de_dataframe):
+    results = compare_de(de_df1=compare_de_dataframe[0], de_df2=compare_de_dataframe[1], shared_top=5)
+    assert "shared_top_genes" in results
+    assert "scores_corr" in results
+    assert "pvals_adj_corr" in results
+    assert "scores_ranks_corr" in results
 
 
 def test_compare_class(test_data):
@@ -48,11 +111,11 @@ def test_compare_knn(test_data):
     assert isinstance(result_no_ctrl, dict)
 
 
-def test_compare_dist(test_data):
+def test_compare_distance(test_data):
     X, Y, C = test_data
-    res_simple = compare_dist(X, Y, C, mode="simple")
+    res_simple = compare_distance(X, Y, C, mode="simple")
     assert isinstance(res_simple, float)
-    res_scaled = compare_dist(X, Y, C, mode="scaled")
+    res_scaled = compare_distance(X, Y, C, mode="scaled")
     assert isinstance(res_scaled, float)
     with pytest.raises(ValueError):
-        compare_dist(X, Y, C, mode="new_mode")
+        compare_distance(X, Y, C, mode="new_mode")

From 3a8eac67b3f84f994957e9ea70fb8abe90a5bb05 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Wed, 12 Jun 2024 23:43:08 +0200
Subject: [PATCH 25/34] correct the comments

---
 pertpy/tools/_metrics_3g.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pertpy/tools/_metrics_3g.py b/pertpy/tools/_metrics_3g.py
index c73d719c..d19a875a 100644
--- a/pertpy/tools/_metrics_3g.py
+++ b/pertpy/tools/_metrics_3g.py
@@ -45,11 +45,11 @@ def compare_de(
             "Please provide either both `de_key1` and `de_key2` with `adata`, or `de_df1` and `de_df2`, but not both."
         )
 
-    if de_df1 is None and de_df2 is None:  # use df
+    if de_df1 is None and de_df2 is None:  # use keys
         if not de_key1 or not de_key2:
             raise ValueError("Both `de_key1` and `de_key2` must be provided together if using `adata`.")
 
-    else:  # use keys
+    else:  # use dfs
         if de_df1 is None or de_df2 is None:
             raise ValueError("Both `de_df1` and `de_df2` must be provided together if using dataframes.")
 

From 63ed17a37341cb1ef296510079364030607fd8e4 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Thu, 13 Jun 2024 14:02:19 +0200
Subject: [PATCH 26/34] speed tests

---
 pertpy/metadata/_drug.py                 |  2 +-
 pertpy/tools/_distances/_distances.py    | 83 ++++++++++++++++--------
 pertpy/tools/_metrics_3g.py              | 43 +-----------
 tests/tools/_distances/test_distances.py | 43 +++++++-----
 tests/tools/test_metrics_3g.py           | 22 +------
 5 files changed, 88 insertions(+), 105 deletions(-)

diff --git a/pertpy/metadata/_drug.py b/pertpy/metadata/_drug.py
index 0f6066eb..106ce93b 100644
--- a/pertpy/metadata/_drug.py
+++ b/pertpy/metadata/_drug.py
@@ -45,7 +45,7 @@ def _download_drug_annotation(
                 block_size=4096,
                 is_zip=False,
             )
-        dgidb_df = pd.read_table(dgidb_path, delimiter=",")
+        dgidb_df = pd.read_table(dgidb_path)
         return dgidb_df
 
     else:
diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index d6a249ac..ee0f6e64 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -23,8 +23,6 @@
 from sklearn.neighbors import KernelDensity
 from statsmodels.discrete.discrete_model import NegativeBinomialP
 
-AGG_FCTS = {"mean": np.mean, "median": np.median, "sum": np.sum}
-
 if TYPE_CHECKING:
     from collections.abc import Callable
 
@@ -115,7 +113,7 @@ class Distance:
     def __init__(
         self,
         metric: str = "edistance",
-        agg_fct: Literal["mean", "median", "sum"] = "mean",
+        agg_fct: Callable = np.mean,
         layer_key: str = None,
         obsm_key: str = None,
         cell_wise_metric: str = "euclidean",
@@ -124,7 +122,7 @@ def __init__(
 
         Args:
             metric: Distance metric to use.
-            agg_fct: Name of the aggregation function to generate pseudobulk vectors.
+            agg_fct: Aggregation function to generate pseudobulk vectors.
             layer_key: Name of the counts layer containing raw counts to calculate distances for.
                               Mutually exclusive with 'obsm_key'.
                               Is not used if `None`.
@@ -133,8 +131,8 @@ def __init__(
                       Defaults to None, but is set to "X_pca" if not explicitly set internally.
             cell_wise_metric: Metric from scipy.spatial.distance to use for pairwise distances between single cells.
         """
-        self.aggregation_func = AGG_FCTS[agg_fct]
         metric_fct: AbstractDistance = None
+        self.aggregation_func = agg_fct
         if metric == "edistance":
             metric_fct = Edistance()
         elif metric == "euclidean":
@@ -231,7 +229,7 @@ def bootstrap(
         Y: np.ndarray,
         *,
         n_bootstrap: int = 100,
-        bootstrap_random_state: int = 0,
+        random_state: int = 0,
         **kwargs,
     ) -> MeanVar:
         """Bootstrap computation of mean and variance of the distance between vectors X and Y.
@@ -240,7 +238,7 @@ def bootstrap(
             X: First vector of shape (n_samples, n_features).
             Y: Second vector of shape (n_samples, n_features).
             n_bootstrap: Number of bootstrap samples.
-            bootstrap_random_state: Random state for bootstrapping.
+            random_state: Random state for bootstrapping.
 
         Returns:
             MeanVar: Mean and variance of distance between X and Y.
@@ -257,7 +255,7 @@ def bootstrap(
             X,
             Y,
             n_bootstraps=n_bootstrap,
-            bootstrap_random_state=bootstrap_random_state,
+            random_state=random_state,
             **kwargs,
         )
 
@@ -268,7 +266,7 @@ def pairwise(
         groups: list[str] | None = None,
         bootstrap: bool = False,
         n_bootstrap: int = 100,
-        bootstrap_random_state: int = 0,
+        random_state: int = 0,
         show_progressbar: bool = True,
         n_jobs: int = -1,
         **kwargs,
@@ -282,7 +280,7 @@ def pairwise(
                     If None, uses all groups.
             bootstrap: Whether to bootstrap the distance.
             n_bootstrap: Number of bootstrap samples.
-            bootstrap_random_state: Random state for bootstrapping.
+            random_state: Random state for bootstrapping.
             show_progressbar: Whether to show progress bar.
             n_jobs: Number of cores to use. Defaults to -1 (all).
             kwargs: Additional keyword arguments passed to the metric function.
@@ -334,7 +332,7 @@ def pairwise(
                             sub_pwd,
                             sub_idx,
                             n_bootstraps=n_bootstrap,
-                            bootstrap_random_state=bootstrap_random_state,
+                            random_state=random_state,
                             **kwargs,
                         )
                         # In the bootstrap case, distance of group to itself is a mean and can be non-zero
@@ -360,7 +358,7 @@ def pairwise(
                             cells_x,
                             cells_y,
                             n_bootstrap=n_bootstrap,
-                            bootstrap_random_state=bootstrap_random_state,
+                            random_state=random_state,
                             **kwargs,
                         )
                         # In the bootstrap case, distance of group to itself is a mean and can be non-zero
@@ -390,7 +388,7 @@ def onesided_distances(
         groups: list[str] | None = None,
         bootstrap: bool = False,
         n_bootstrap: int = 100,
-        bootstrap_random_state: int = 0,
+        random_state: int = 0,
         show_progressbar: bool = True,
         n_jobs: int = -1,
         **kwargs,
@@ -405,7 +403,7 @@ def onesided_distances(
                     If None, uses all groups.
             bootstrap: Whether to bootstrap the distance.
             n_bootstrap: Number of bootstrap samples.
-            bootstrap_random_state: Random state for bootstrapping.
+            random_state: Random state for bootstrapping.
             show_progressbar: Whether to show progress bar.
             n_jobs: Number of cores to use. Defaults to -1 (all).
             kwargs: Additional keyword arguments passed to the metric function.
@@ -469,7 +467,7 @@ def onesided_distances(
                             sub_pwd,
                             sub_idx,
                             n_bootstraps=n_bootstrap,
-                            bootstrap_random_state=bootstrap_random_state,
+                            random_state=random_state,
                             **kwargs,
                         )
                         df.loc[group_x] = bootstrap_output.mean
@@ -492,7 +490,7 @@ def onesided_distances(
                         cells_x,
                         cells_y,
                         n_bootstrap=n_bootstrap,
-                        bootstrap_random_state=bootstrap_random_state,
+                        random_state=random_state,
                         **kwargs,
                     )
                     # In the bootstrap case, distance of group to itself is a mean and can be non-zero
@@ -533,8 +531,42 @@ def precompute_distances(self, adata: AnnData, n_jobs: int = -1) -> None:
         pwd = pairwise_distances(cells, cells, metric=self.cell_wise_metric, n_jobs=n_jobs)
         adata.obsp[f"{self.obsm_key}_{self.cell_wise_metric}_predistances"] = pwd
 
-    def _bootstrap_mode(self, X, Y, n_bootstraps=100, bootstrap_random_state=0, **kwargs) -> MeanVar:
-        rng = np.random.default_rng(bootstrap_random_state)
+    def compare_distance(
+        self,
+        pert: np.ndarray,
+        pred: np.ndarray,
+        ctrl: np.ndarray,
+        mode: Literal["simple", "scaled"] = "simple",
+        fit_to_pert_and_ctrl: bool = False,
+        **kwargs,
+    ) -> float:
+        """Compute the score of simulating a perturbation.
+
+        Args:
+            pert: Real perturbed data.
+            pred: Simulated perturbed data.
+            ctrl: Control data
+            mode: Mode to use.
+            fit_to_pert_and_ctrl: Scales data based on both `pert` and `ctrl` if True, otherwise only on `ctrl`.
+            kwargs: Additional keyword arguments passed to the metric function.
+        """
+        if mode == "simple":
+            pass  # nothing to be done
+        elif mode == "scaled":
+            from sklearn.preprocessing import MinMaxScaler
+
+            scaler = MinMaxScaler().fit(np.vstack((pert, ctrl)) if fit_to_pert_and_ctrl else ctrl)
+            pred = scaler.transform(pred)
+            pert = scaler.transform(pert)
+        else:
+            raise ValueError(f"Unknown mode {mode}. Please choose simple or scaled.")
+
+        d1 = self.metric_fct(pert, pred, **kwargs)
+        d2 = self.metric_fct(ctrl, pred, **kwargs)
+        return d1 / d2
+
+    def _bootstrap_mode(self, X, Y, n_bootstraps=100, random_state=0, **kwargs) -> MeanVar:
+        rng = np.random.default_rng(random_state)
 
         distances = []
         for _ in range(n_bootstraps):
@@ -548,10 +580,8 @@ def _bootstrap_mode(self, X, Y, n_bootstraps=100, bootstrap_random_state=0, **kw
         variance = np.var(distances)
         return MeanVar(mean=mean, variance=variance)
 
-    def _bootstrap_mode_precomputed(
-        self, sub_pwd, sub_idx, n_bootstraps=100, bootstrap_random_state=0, **kwargs
-    ) -> MeanVar:
-        rng = np.random.default_rng(bootstrap_random_state)
+    def _bootstrap_mode_precomputed(self, sub_pwd, sub_idx, n_bootstraps=100, random_state=0, **kwargs) -> MeanVar:
+        rng = np.random.default_rng(random_state)
 
         distances = []
         for _ in range(n_bootstraps):
@@ -1074,10 +1104,7 @@ def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
 
 
 class MeanVarDistributionDistance(AbstractDistance):
-    """
-    Distance between mean-var distributions of gene expression.
-
-    """
+    """Distance between mean-var distributions of gene expression."""
 
     def __init__(self) -> None:
         super().__init__()
@@ -1159,7 +1186,9 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
         return mahalanobis(
-            self.aggregation_func(X, axis=0), self.aggregation_func(Y, axis=0), np.linalg.inv(np.cov(X.T))
+            self.aggregation_func(X, axis=0),
+            self.aggregation_func(Y, axis=0),
+            np.linalg.inv(np.cov(X.T)),
         )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
diff --git a/pertpy/tools/_metrics_3g.py b/pertpy/tools/_metrics_3g.py
index d19a875a..ec13ab1a 100644
--- a/pertpy/tools/_metrics_3g.py
+++ b/pertpy/tools/_metrics_3g.py
@@ -1,7 +1,4 @@
-from collections.abc import Mapping
-from functools import partial
-from types import MappingProxyType
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING
 
 import numpy as np
 import pandas as pd
@@ -12,8 +9,6 @@
 from sklearn.base import ClassifierMixin
 from sklearn.linear_model import LogisticRegression
 
-from pertpy.tools._distances._distances import Distance
-
 if TYPE_CHECKING:
     from numpy.typing import NDArray
 
@@ -191,39 +186,3 @@ def compare_knn(
         counts[group] = count_norm
 
     return counts
-
-
-def compare_distance(
-    pert: np.ndarray,
-    pred: np.ndarray,
-    ctrl: np.ndarray,
-    *,
-    metric: str = "euclidean",
-    mode: Literal["simple", "scaled"] = "simple",
-    metric_kwds: Mapping[str, Any] = MappingProxyType({}),
-    _fit_to_pert_and_ctrl: bool = False,
-) -> float:
-    """Compute the score of simulating a perturbation.
-
-    Args:
-        pert: Real perturbed data.
-        pred: Simulated perturbed data.
-        ctrl: Control data
-        mode: Mode to use.
-    """
-    metric_fct = partial(Distance(metric).metric_fct, **metric_kwds)
-
-    if mode == "simple":
-        pass  # nothing to be done
-    elif mode == "scaled":
-        from sklearn.preprocessing import MinMaxScaler
-
-        scaler = MinMaxScaler().fit(np.vstack((pert, ctrl)) if _fit_to_pert_and_ctrl else ctrl)
-        pred = scaler.transform(pred)
-        pert = scaler.transform(pert)
-    else:
-        raise ValueError(f"Unknown mode {mode}. Please choose simple or scaled.")
-
-    d1 = metric_fct(pert, pred)
-    d2 = metric_fct(ctrl, pred)
-    return d1 / d2
diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py
index b9ffe3a2..4dc70219 100644
--- a/tests/tools/_distances/test_distances.py
+++ b/tests/tools/_distances/test_distances.py
@@ -61,8 +61,8 @@ def adata(request):
         sc.pp.pca(adata, n_comps=5)
     if distance in lognorm_counts_distances:
         groups = np.unique(adata.obs["perturbation"])
-        # KDE is slow, subset to 5 groups for speed up
-        adata = adata[adata.obs["perturbation"].isin(groups[0:5])].copy()
+        # KDE is slow, subset to 3 groups for speed up
+        adata = adata[adata.obs["perturbation"].isin(groups[0:3])].copy()
 
     return adata
 
@@ -100,11 +100,10 @@ def test_distance_axioms(pairwise_distance, distance):
 
 
 @mark.parametrize("distance", actual_distances)
-def test_triangle_inequality(pairwise_distance, distance):
+def test_triangle_inequality(pairwise_distance, distance, rng):
     # Test if distances are well-defined in accordance with metric axioms
     # (M4) Triangle inequality (we just probe this for a few random triplets)
     for _i in range(10):
-        rng = np.random.default_rng()
         triplet = rng.choice(pairwise_distance.index, size=3, replace=False)
         assert (
             pairwise_distance.loc[triplet[0], triplet[1]] + pairwise_distance.loc[triplet[1], triplet[2]]
@@ -121,7 +120,7 @@ def test_distance_layers(pairwise_distance, distance):
 
 @mark.parametrize("distance", actual_distances + pseudo_counts_distances)
 def test_distance_counts(adata, distance):
-    if distance != "mahalanobis":  # doesn't work, covariance matrix is a singular matrix, not invertible
+    if distance != "mahalanobis":  # skip, doesn't work because covariance matrix is a singular matrix, not invertible
         Distance = pt.tl.Distance(distance, layer_key="counts")
         df = Distance.pairwise(adata, groupby="perturbation")
         assert isinstance(df, DataFrame)
@@ -136,12 +135,11 @@ def test_mutually_exclusive_keys(distance):
 
 
 @mark.parametrize("distance", actual_distances + semi_distances + non_distances)
-def test_distance_output_type(distance):
+def test_distance_output_type(distance, rng):
     # Test if distances are outputting floats
     Distance = pt.tl.Distance(distance, obsm_key="X_pca")
-    rng = np.random.default_rng()
-    X = rng.normal(size=(100, 10))
-    Y = rng.normal(size=(100, 10))
+    X = rng.normal(size=(50, 10))
+    Y = rng.normal(size=(50, 10))
     d = Distance(X, Y)
     assert isinstance(d, float)
 
@@ -156,13 +154,12 @@ def test_distance_onesided(adata, distance_obj, distance):
 
 
 @mark.parametrize("distance", actual_distances + semi_distances + non_distances)
-def test_bootstrap_distance_output_type(distance):
+def test_bootstrap_distance_output_type(distance, rng):
     # Test if distances are outputting floats
     Distance = pt.tl.Distance(distance, obsm_key="X_pca")
-    rng = np.random.default_rng()
-    X = rng.normal(size=(100, 10))
-    Y = rng.normal(size=(100, 10))
-    d = Distance.bootstrap(X, Y, n_bootstrap=10)
+    X = rng.normal(size=(50, 10))
+    Y = rng.normal(size=(50, 10))
+    d = Distance.bootstrap(X, Y, n_bootstrap=3)
     assert hasattr(d, "mean")
     assert hasattr(d, "variance")
 
@@ -170,7 +167,8 @@ def test_bootstrap_distance_output_type(distance):
 @mark.parametrize("distance", all_distances)
 def test_bootstrap_distance_pairwise(adata, distance_obj, distance):
     # Test consistency of pairwise distance results
-    bootstrap_output = distance_obj.pairwise(adata, groupby="perturbation", bootstrap=True, n_bootstrap=10)
+    bootstrap_output = distance_obj.pairwise(adata, groupby="perturbation", bootstrap=True, n_bootstrap=3)
+
     assert isinstance(bootstrap_output, tuple)
 
     mean = bootstrap_output[0]
@@ -190,7 +188,20 @@ def test_bootstrap_distance_onesided(adata, distance_obj, distance):
         groupby="perturbation",
         selected_group=selected_group,
         bootstrap=True,
-        n_bootstrap=10,
+        n_bootstrap=3,
     )
 
     assert isinstance(bootstrap_output, tuple)
+
+
+def test_compare_distance(rng):
+    X = rng.normal(size=(50, 10))
+    Y = rng.normal(size=(50, 10))
+    C = rng.normal(size=(50, 10))
+    Distance = pt.tl.Distance()
+    res_simple = Distance.compare_distance(X, Y, C, mode="simple")
+    assert isinstance(res_simple, float)
+    res_scaled = Distance.compare_distance(X, Y, C, mode="scaled")
+    assert isinstance(res_scaled, float)
+    with pytest.raises(ValueError):
+        Distance.compare_distance(X, Y, C, mode="new_mode")
diff --git a/tests/tools/test_metrics_3g.py b/tests/tools/test_metrics_3g.py
index b8891e71..d481fc86 100644
--- a/tests/tools/test_metrics_3g.py
+++ b/tests/tools/test_metrics_3g.py
@@ -1,5 +1,3 @@
-from typing import TYPE_CHECKING
-
 import numpy as np
 import pandas as pd
 import pertpy as pt
@@ -8,14 +6,12 @@
 from pertpy.tools._metrics_3g import (
     compare_class,
     compare_de,
-    compare_distance,
     compare_knn,
 )
 
 
 @pytest.fixture
-def test_data():
-    rng = np.random.default_rng()
+def test_data(rng):
     X = rng.normal(size=(100, 10))
     Y = rng.normal(size=(100, 10))
     C = rng.normal(size=(100, 10))
@@ -23,8 +19,7 @@ def test_data():
 
 
 @pytest.fixture
-def compare_de_adata():
-    rng = np.random.default_rng()
+def compare_de_adata(rng):
     adata = AnnData(rng.normal(size=(100, 10)))
     genes = np.rec.fromarrays(
         [np.array([f"gene{i}" for i in range(10)])],
@@ -44,8 +39,7 @@ def compare_de_adata():
 
 
 @pytest.fixture
-def compare_de_dataframe():
-    rng = np.random.default_rng()
+def compare_de_dataframe(rng):
     df1 = pd.DataFrame(
         {
             "variable": ["gene" + str(i) for i in range(10)],
@@ -109,13 +103,3 @@ def test_compare_knn(test_data):
 
     result_no_ctrl = compare_knn(X, Y)
     assert isinstance(result_no_ctrl, dict)
-
-
-def test_compare_distance(test_data):
-    X, Y, C = test_data
-    res_simple = compare_distance(X, Y, C, mode="simple")
-    assert isinstance(res_simple, float)
-    res_scaled = compare_distance(X, Y, C, mode="scaled")
-    assert isinstance(res_scaled, float)
-    with pytest.raises(ValueError):
-        compare_distance(X, Y, C, mode="new_mode")

From f9e0d364f5f6dace2f1ed9a2ade7468c37e56731 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Thu, 13 Jun 2024 16:25:38 +0200
Subject: [PATCH 27/34] speed up tests

---
 tests/tools/_distances/test_distances.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/tests/tools/_distances/test_distances.py b/tests/tools/_distances/test_distances.py
index 4dc70219..0f0e6fc4 100644
--- a/tests/tools/_distances/test_distances.py
+++ b/tests/tools/_distances/test_distances.py
@@ -137,7 +137,7 @@ def test_mutually_exclusive_keys(distance):
 @mark.parametrize("distance", actual_distances + semi_distances + non_distances)
 def test_distance_output_type(distance, rng):
     # Test if distances are outputting floats
-    Distance = pt.tl.Distance(distance, obsm_key="X_pca")
+    Distance = pt.tl.Distance(distance)
     X = rng.normal(size=(50, 10))
     Y = rng.normal(size=(50, 10))
     d = Distance(X, Y)
@@ -153,10 +153,9 @@ def test_distance_onesided(adata, distance_obj, distance):
     assert df.loc[selected_group] == 0  # distance to self is 0
 
 
-@mark.parametrize("distance", actual_distances + semi_distances + non_distances)
-def test_bootstrap_distance_output_type(distance, rng):
+def test_bootstrap_distance_output_type(rng):
     # Test if distances are outputting floats
-    Distance = pt.tl.Distance(distance, obsm_key="X_pca")
+    Distance = pt.tl.Distance(metric="edistance")
     X = rng.normal(size=(50, 10))
     Y = rng.normal(size=(50, 10))
     d = Distance.bootstrap(X, Y, n_bootstrap=3)
@@ -164,10 +163,11 @@ def test_bootstrap_distance_output_type(distance, rng):
     assert hasattr(d, "variance")
 
 
-@mark.parametrize("distance", all_distances)
-def test_bootstrap_distance_pairwise(adata, distance_obj, distance):
+@mark.parametrize("distance", ["edistance"])
+def test_bootstrap_distance_pairwise(adata, distance):
     # Test consistency of pairwise distance results
-    bootstrap_output = distance_obj.pairwise(adata, groupby="perturbation", bootstrap=True, n_bootstrap=3)
+    Distance = pt.tl.Distance(distance, obsm_key="X_pca")
+    bootstrap_output = Distance.pairwise(adata, groupby="perturbation", bootstrap=True, n_bootstrap=3)
 
     assert isinstance(bootstrap_output, tuple)
 
@@ -179,11 +179,12 @@ def test_bootstrap_distance_pairwise(adata, distance_obj, distance):
     assert np.sum(var.values - var.values.T) == 0  # symmetry
 
 
-@mark.parametrize("distance", all_distances)
-def test_bootstrap_distance_onesided(adata, distance_obj, distance):
+@mark.parametrize("distance", ["edistance"])
+def test_bootstrap_distance_onesided(adata, distance):
     # Test consistency of one-sided distance results
     selected_group = adata.obs.perturbation.unique()[0]
-    bootstrap_output = distance_obj.onesided_distances(
+    Distance = pt.tl.Distance(distance, obsm_key="X_pca")
+    bootstrap_output = Distance.onesided_distances(
         adata,
         groupby="perturbation",
         selected_group=selected_group,

From 2e65f9de7f1bd16d7a145711b00f6b895be243bd Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Tue, 18 Jun 2024 21:21:58 +0200
Subject: [PATCH 28/34] split metrics_3g

---
 pertpy/tools/__init__.py                      |  17 +-
 .../_differential_gene_expression/__init__.py |   1 +
 .../_dge_comparison.py                        | 105 ++++++++++
 pertpy/tools/_metrics_3g.py                   | 188 ------------------
 .../tools/_perturbation_space/_comparison.py  | 124 ++++++++++++
 .../test_dge.py}                              |  61 ++----
 .../_perturbation_space/test_comparison.py    |  29 +++
 7 files changed, 293 insertions(+), 232 deletions(-)
 create mode 100644 pertpy/tools/_differential_gene_expression/_dge_comparison.py
 delete mode 100644 pertpy/tools/_metrics_3g.py
 create mode 100644 pertpy/tools/_perturbation_space/_comparison.py
 rename tests/tools/{test_metrics_3g.py => _differential_gene_expression/test_dge.py} (52%)
 create mode 100644 tests/tools/_perturbation_space/test_comparison.py

diff --git a/pertpy/tools/__init__.py b/pertpy/tools/__init__.py
index 4bc976a2..18b11f1d 100644
--- a/pertpy/tools/__init__.py
+++ b/pertpy/tools/__init__.py
@@ -3,18 +3,31 @@
 from pertpy.tools._coda._sccoda import Sccoda
 from pertpy.tools._coda._tasccoda import Tasccoda
 from pertpy.tools._dialogue import Dialogue
-from pertpy.tools._differential_gene_expression import EdgeR, PyDESeq2, Statsmodels, TTest, WilcoxonTest
+from pertpy.tools._differential_gene_expression import (
+    DGE,
+    EdgeR,
+    PyDESeq2,
+    Statsmodels,
+    TTest,
+    WilcoxonTest,
+)
 from pertpy.tools._distances._distance_tests import DistanceTest
 from pertpy.tools._distances._distances import Distance
 from pertpy.tools._enrichment import Enrichment
 from pertpy.tools._milo import Milo
 from pertpy.tools._mixscape import Mixscape
 from pertpy.tools._perturbation_space._clustering import ClusteringSpace
+from pertpy.tools._perturbation_space._comparison import PerturbationComparison
 from pertpy.tools._perturbation_space._discriminator_classifiers import (
     LRClassifierSpace,
     MLPClassifierSpace,
 )
-from pertpy.tools._perturbation_space._simple import CentroidSpace, DBSCANSpace, KMeansSpace, PseudobulkSpace
+from pertpy.tools._perturbation_space._simple import (
+    CentroidSpace,
+    DBSCANSpace,
+    KMeansSpace,
+    PseudobulkSpace,
+)
 from pertpy.tools._scgen import Scgen
 
 __all__ = [
diff --git a/pertpy/tools/_differential_gene_expression/__init__.py b/pertpy/tools/_differential_gene_expression/__init__.py
index 178ecaa3..b349cedb 100644
--- a/pertpy/tools/_differential_gene_expression/__init__.py
+++ b/pertpy/tools/_differential_gene_expression/__init__.py
@@ -1,4 +1,5 @@
 from ._base import ContrastType, LinearModelBase, MethodBase
+from ._dge_comparison import DGE
 from ._edger import EdgeR
 from ._pydeseq2 import PyDESeq2
 from ._simple_tests import SimpleComparisonBase, TTest, WilcoxonTest
diff --git a/pertpy/tools/_differential_gene_expression/_dge_comparison.py b/pertpy/tools/_differential_gene_expression/_dge_comparison.py
new file mode 100644
index 00000000..16c8c9c7
--- /dev/null
+++ b/pertpy/tools/_differential_gene_expression/_dge_comparison.py
@@ -0,0 +1,105 @@
+import numpy as np
+import pandas as pd
+from anndata import AnnData
+from typing import Dict
+
+
+class DGE:
+    def compare(
+        self,
+        adata: AnnData | None = None,
+        de_key1: str = None,
+        de_key2: str = None,
+        de_df1: pd.DataFrame | None = None,
+        de_df2: pd.DataFrame | None = None,
+        shared_top: int = 100,
+    ) -> Dict[str, float]:
+        """Compare two differential expression analyses.
+
+        Compare two sets of DE results and evaluate the similarity by the overlap of top DEG and
+        the correlation of their scores and adjusted p-values.
+
+        Args:
+            adata: AnnData object containing DE results in `uns`. Required if `de_key1` and `de_key2` are used.
+            de_key1: Key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
+            de_key2: Another key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
+            de_df1: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
+            de_df2: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
+            shared_top: The number of top DEG to compute the proportion of their intersection.
+
+        """
+        if (de_key1 or de_key2) and (de_df1 is not None or de_df2 is not None):
+            raise ValueError(
+                "Please provide either both `de_key1` and `de_key2` with `adata`, or `de_df1` and `de_df2`, but not both."
+            )
+
+        if de_df1 is None and de_df2 is None:  # use keys
+            if not de_key1 or not de_key2:
+                raise ValueError(
+                    "Both `de_key1` and `de_key2` must be provided together if using `adata`."
+                )
+
+        else:  # use dfs
+            if de_df1 is None or de_df2 is None:
+                raise ValueError(
+                    "Both `de_df1` and `de_df2` must be provided together if using dataframes."
+                )
+
+        if de_key1:
+            if not adata:
+                raise ValueError(
+                    "`adata` should be provided with `de_key1` and `de_key2`. "
+                )
+            assert all(
+                k in adata.uns for k in [de_key1, de_key2]
+            ), "Provided `de_key1` and `de_key2` must exist in `adata.uns`."
+            vars = adata.var_names
+
+        if de_df1 is not None:
+            for df in (de_df1, de_df2):
+                if not {"variable", "log_fc", "adj_p_value"}.issubset(df.columns):
+                    raise ValueError(
+                        "Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'."
+                    )
+
+            assert set(de_df1["variable"]) == set(
+                de_df2["variable"]
+            ), "Variables in both dataframes must match."
+            vars = de_df1["variable"].sort_values()
+
+        shared_top = min(shared_top, len(vars))
+        vars_ranks = np.arange(1, len(vars) + 1)
+        results = pd.DataFrame(index=vars)
+        top_names = []
+
+        if de_key1 and de_key2:
+            for i, k in enumerate([de_key1, de_key2]):
+                label = adata.uns[k]["names"].dtype.names[0]
+                srt_idx = np.argsort(adata.uns[k]["names"][label])
+                results[f"scores_{i}"] = adata.uns[k]["scores"][label][srt_idx]
+                results[f"pvals_adj_{i}"] = adata.uns[k]["pvals_adj"][label][srt_idx]
+                results[f"ranks_{i}"] = vars_ranks[srt_idx]
+                top_names.append(adata.uns[k]["names"][label][:shared_top])
+        else:
+            for i, df in enumerate([de_df1, de_df2]):
+                srt_idx = np.argsort(df["variable"])
+                results[f"scores_{i}"] = df["log_fc"].values[srt_idx]
+                results[f"pvals_adj_{i}"] = df["adj_p_value"].values[srt_idx]
+                results[f"ranks_{i}"] = vars_ranks[srt_idx]
+                top_names.append(df["variable"][:shared_top])
+
+        metrics = {}
+        metrics["shared_top_genes"] = (
+            len(set(top_names[0]).intersection(top_names[1])) / shared_top
+        )
+        metrics["scores_corr"] = results["scores_0"].corr(
+            results["scores_1"], method="pearson"
+        )
+        metrics["pvals_adj_corr"] = results["pvals_adj_0"].corr(
+            results["pvals_adj_1"], method="pearson"
+        )
+        metrics["scores_ranks_corr"] = results["ranks_0"].corr(
+            results["ranks_1"], method="spearman"
+        )
+
+        return metrics
diff --git a/pertpy/tools/_metrics_3g.py b/pertpy/tools/_metrics_3g.py
deleted file mode 100644
index ec13ab1a..00000000
--- a/pertpy/tools/_metrics_3g.py
+++ /dev/null
@@ -1,188 +0,0 @@
-from typing import TYPE_CHECKING
-
-import numpy as np
-import pandas as pd
-import pynndescent
-from anndata import AnnData
-from scipy.sparse import issparse
-from scipy.sparse import vstack as sp_vstack
-from sklearn.base import ClassifierMixin
-from sklearn.linear_model import LogisticRegression
-
-if TYPE_CHECKING:
-    from numpy.typing import NDArray
-
-
-def compare_de(
-    adata: AnnData | None = None,
-    de_key1: str = None,
-    de_key2: str = None,
-    de_df1: pd.DataFrame | None = None,
-    de_df2: pd.DataFrame | None = None,
-    shared_top: int = 100,
-) -> dict:
-    """Compare two differential expression analyses.
-
-    Compare two sets of DE results and evaluate the similarity by the overlap of top DEG and
-    the correlation of their scores and adjusted p-values.
-
-    Args:
-        adata: AnnData object containing DE results in `uns`. Required if `de_key1` and `de_key2` are used.
-        de_key1: Key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
-        de_key2: Another key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
-        de_df1: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
-        de_df2: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
-        shared_top: The number of top DEG to compute the proportion of their intersection.
-
-    """
-    if (de_key1 or de_key2) and (de_df1 is not None or de_df2 is not None):
-        raise ValueError(
-            "Please provide either both `de_key1` and `de_key2` with `adata`, or `de_df1` and `de_df2`, but not both."
-        )
-
-    if de_df1 is None and de_df2 is None:  # use keys
-        if not de_key1 or not de_key2:
-            raise ValueError("Both `de_key1` and `de_key2` must be provided together if using `adata`.")
-
-    else:  # use dfs
-        if de_df1 is None or de_df2 is None:
-            raise ValueError("Both `de_df1` and `de_df2` must be provided together if using dataframes.")
-
-    if de_key1:
-        if not adata:
-            raise ValueError("`adata` should be provided with `de_key1` and `de_key2`. ")
-        assert all(
-            k in adata.uns for k in [de_key1, de_key2]
-        ), "Provided `de_key1` and `de_key2` must exist in `adata.uns`."
-        vars = adata.var_names
-
-    if de_df1 is not None:
-        for df in (de_df1, de_df2):
-            if not {"variable", "log_fc", "adj_p_value"}.issubset(df.columns):
-                raise ValueError("Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'.")
-
-        assert set(de_df1["variable"]) == set(de_df2["variable"]), "Variables in both dataframes must match."
-        vars = de_df1["variable"].sort_values()
-
-    shared_top = min(shared_top, len(vars))
-    vars_ranks = np.arange(1, len(vars) + 1)
-    results = pd.DataFrame(index=vars)
-    top_names = []
-
-    if de_key1 and de_key2:
-        for i, k in enumerate([de_key1, de_key2]):
-            label = adata.uns[k]["names"].dtype.names[0]
-            srt_idx = np.argsort(adata.uns[k]["names"][label])
-            results[f"scores_{i}"] = adata.uns[k]["scores"][label][srt_idx]
-            results[f"pvals_adj_{i}"] = adata.uns[k]["pvals_adj"][label][srt_idx]
-            results[f"ranks_{i}"] = vars_ranks[srt_idx]
-            top_names.append(adata.uns[k]["names"][label][:shared_top])
-    else:
-        for i, df in enumerate([de_df1, de_df2]):
-            srt_idx = np.argsort(df["variable"])
-            results[f"scores_{i}"] = df["log_fc"].values[srt_idx]
-            results[f"pvals_adj_{i}"] = df["adj_p_value"].values[srt_idx]
-            results[f"ranks_{i}"] = vars_ranks[srt_idx]
-            top_names.append(df["variable"][:shared_top])
-
-    metrics = {}
-    metrics["shared_top_genes"] = len(set(top_names[0]).intersection(top_names[1])) / shared_top
-    metrics["scores_corr"] = results["scores_0"].corr(results["scores_1"], method="pearson")
-    metrics["pvals_adj_corr"] = results["pvals_adj_0"].corr(results["pvals_adj_1"], method="pearson")
-    metrics["scores_ranks_corr"] = results["ranks_0"].corr(results["ranks_1"], method="spearman")
-
-    return metrics
-
-
-def compare_class(X: np.ndarray, Y: np.ndarray, C: np.ndarray, clf: ClassifierMixin | None = None) -> float:
-    """Compare classification accuracy between real and simulated perturbations.
-
-    Trains a classifier on the real perturbation data + the control data and reports a normalized
-    classification accuracy on the simulated perturbation.
-
-    Args:
-        X: Real perturbed data.
-        Y: Simulated perturbed data.
-        C: Control data
-        clf: sklearn classifier to use, `sklearn.linear_model.LogisticRegression` if not provided.
-    """
-    assert X.shape[1] == Y.shape[1] == C.shape[1]
-
-    if clf is None:
-        clf = LogisticRegression()
-
-    n_x = X.shape[0]
-    n_xc = n_x + C.shape[0]
-
-    data = sp_vstack((X, C)) if issparse(X) else np.vstack((X, C))
-
-    labels = np.full(n_xc, "ctrl")
-    labels[:n_x] = "comp"
-    clf.fit(data, labels)
-
-    norm_score = clf.score(Y, np.full(Y.shape[0], "comp")) / clf.score(X, labels[:n_x])
-    norm_score = min(1.0, norm_score)
-
-    return norm_score
-
-
-def compare_knn(
-    X: np.ndarray,
-    Y: np.ndarray,
-    C: np.ndarray | None = None,
-    n_neighbors: int = 20,
-    use_Y_knn: bool = False,
-    random_state: int = 0,
-    n_jobs: int = 1,
-) -> dict[str, float]:
-    """Calculate proportions of real perturbed and control data points for simulated data.
-
-    Computes proportions of real perturbed (if provided), control and simulated (if `use_Y_knn=True`)
-    data points for simulated data. If control (`C`) is not provided, builds the knn graph from
-    real perturbed + simulated perturbed.
-
-    Args:
-        X: Real perturbed data.
-        Y: Simulated perturbed data.
-        C: Control data.
-        use_Y_knn: Include simulted perturbed data (`Y`) into the knn graph. Only valid when
-            control (`C`) is provided.
-    """
-    assert X.shape[1] == Y.shape[1]
-    if C is not None:
-        assert X.shape[1] == C.shape[1]
-
-    n_y = Y.shape[0]
-
-    if C is None:
-        index_data = sp_vstack((Y, X)) if issparse(X) else np.vstack((Y, X))
-    else:
-        datas = (Y, X, C) if use_Y_knn else (X, C)
-        index_data = sp_vstack(datas) if issparse(X) else np.vstack(datas)
-
-    y_in_index = use_Y_knn or C is None
-    c_in_index = C is not None
-    label_groups = ["comp"]
-    labels: NDArray[np.str_] = np.full(index_data.shape[0], "comp")
-    if y_in_index:
-        labels[:n_y] = "siml"
-        label_groups.append("siml")
-    if c_in_index:
-        labels[-C.shape[0] :] = "ctrl"
-        label_groups.append("ctrl")
-
-    index = pynndescent.NNDescent(
-        index_data,
-        n_neighbors=max(50, n_neighbors),
-        random_state=random_state,
-        n_jobs=n_jobs,
-    )
-    indices = index.query(Y, k=n_neighbors)[0]
-
-    uq, uq_counts = np.unique(labels[indices], return_counts=True)
-    uq_counts_norm = uq_counts / uq_counts.sum()
-    counts = dict(zip(label_groups, [0.0] * len(label_groups), strict=False))
-    for group, count_norm in zip(uq, uq_counts_norm, strict=False):
-        counts[group] = count_norm
-
-    return counts
diff --git a/pertpy/tools/_perturbation_space/_comparison.py b/pertpy/tools/_perturbation_space/_comparison.py
new file mode 100644
index 00000000..390c2d9d
--- /dev/null
+++ b/pertpy/tools/_perturbation_space/_comparison.py
@@ -0,0 +1,124 @@
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pynndescent
+from scipy.sparse import issparse
+from scipy.sparse import vstack as sp_vstack
+from sklearn.base import ClassifierMixin
+from sklearn.linear_model import LogisticRegression
+
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
+
+class PerturbationComparison:
+    """Comparison between real and simulated perturbations."""
+
+    def compare_classification(
+        self,
+        real: np.ndarray,
+        simulated: np.ndarray,
+        control: np.ndarray,
+        clf: ClassifierMixin | None = None,
+    ) -> float:
+        """Compare classification accuracy between real and simulated perturbations.
+
+        Trains a classifier on the real perturbation data + the control data and reports a normalized
+        classification accuracy on the simulated perturbation.
+
+        Args:
+            real: Real perturbed data.
+            simulated: Simulated perturbed data.
+            control: Control data
+            clf: sklearn classifier to use, `sklearn.linear_model.LogisticRegression` if not provided.
+        """
+        assert real.shape[1] == simulated.shape[1] == control.shape[1]
+        if clf is None:
+            clf = LogisticRegression()
+        n_x = real.shape[0]
+        data = (
+            sp_vstack((real, control)) if issparse(real) else np.vstack((real, control))
+        )
+        labels = np.concatenate(
+            [np.full(real.shape[0], "comp"), np.full(control.shape[0], "ctrl")]
+        )
+
+        clf.fit(data, labels)
+        norm_score = clf.score(
+            simulated, np.full(simulated.shape[0], "comp")
+        ) / clf.score(real, labels[:n_x])
+        norm_score = min(1.0, norm_score)
+
+        return norm_score
+
+    def compare_knn(
+        self,
+        real: np.ndarray,
+        simulated: np.ndarray,
+        control: np.ndarray | None = None,
+        use_simulated_for_knn: bool = False,
+        n_neighbors: int = 20,
+        random_state: int = 0,
+        n_jobs: int = 1,
+    ) -> dict[str, float]:
+        """Calculate proportions of real perturbed and control data points for simulated data.
+
+        Computes proportions of real perturbed, control and simulated (if `use_simulated_for_knn=True`)
+        data points for simulated data. If control (`C`) is not provided, builds the knn graph from
+        real perturbed + simulated perturbed.
+
+        Args:
+            real: Real perturbed data.
+            simulated: Simulated perturbed data.
+            control: Control data
+            use_simulated_for_knn: Include simulted perturbed data (`simulated`) into the knn graph. Only valid when
+                control (`control`) is provided.
+            n_neighbors: Number of neighbors to use in k-neighbor graph.
+            random_state: Random state used for k-neighbor graph construction.
+            n_jobs: Number of cores to use. Defaults to -1 (all).
+
+        """
+        assert real.shape[1] == simulated.shape[1]
+        if control is not None:
+            assert real.shape[1] == control.shape[1]
+
+        n_y = simulated.shape[0]
+
+        if control is None:
+            index_data = (
+                sp_vstack((simulated, real))
+                if issparse(real)
+                else np.vstack((simulated, real))
+            )
+        else:
+            datas = (
+                (simulated, real, control) if use_simulated_for_knn else (real, control)
+            )
+            index_data = sp_vstack(datas) if issparse(real) else np.vstack(datas)
+
+        y_in_index = use_simulated_for_knn or control is None
+        c_in_index = control is not None
+        label_groups = ["comp"]
+        labels: NDArray[np.str_] = np.full(index_data.shape[0], "comp")
+        if y_in_index:
+            labels[:n_y] = "siml"
+            label_groups.append("siml")
+        if c_in_index:
+            labels[-control.shape[0] :] = "ctrl"
+            label_groups.append("ctrl")
+
+        index = pynndescent.NNDescent(
+            index_data,
+            n_neighbors=max(50, n_neighbors),
+            random_state=random_state,
+            n_jobs=n_jobs,
+        )
+        indices = index.query(simulated, k=n_neighbors)[0]
+
+        uq, uq_counts = np.unique(labels[indices], return_counts=True)
+        uq_counts_norm = uq_counts / uq_counts.sum()
+        counts = dict(zip(label_groups, [0.0] * len(label_groups), strict=False))
+        for group, count_norm in zip(uq, uq_counts_norm, strict=False):
+            counts[group] = count_norm
+
+        return counts
diff --git a/tests/tools/test_metrics_3g.py b/tests/tools/_differential_gene_expression/test_dge.py
similarity index 52%
rename from tests/tools/test_metrics_3g.py
rename to tests/tools/_differential_gene_expression/test_dge.py
index d481fc86..4addf51a 100644
--- a/tests/tools/test_metrics_3g.py
+++ b/tests/tools/_differential_gene_expression/test_dge.py
@@ -1,25 +1,12 @@
+from anndata import AnnData
 import numpy as np
 import pandas as pd
 import pertpy as pt
 import pytest
-from anndata import AnnData
-from pertpy.tools._metrics_3g import (
-    compare_class,
-    compare_de,
-    compare_knn,
-)
-
-
-@pytest.fixture
-def test_data(rng):
-    X = rng.normal(size=(100, 10))
-    Y = rng.normal(size=(100, 10))
-    C = rng.normal(size=(100, 10))
-    return X, Y, C
 
 
 @pytest.fixture
-def compare_de_adata(rng):
+def adata(rng):
     adata = AnnData(rng.normal(size=(100, 10)))
     genes = np.rec.fromarrays(
         [np.array([f"gene{i}" for i in range(10)])],
@@ -39,7 +26,7 @@ def compare_de_adata(rng):
 
 
 @pytest.fixture
-def compare_de_dataframe(rng):
+def dataframe(rng):
     df1 = pd.DataFrame(
         {
             "variable": ["gene" + str(i) for i in range(10)],
@@ -57,49 +44,39 @@ def compare_de_dataframe(rng):
     return df1, df2
 
 
-def test_error_both_keys_and_dfs(compare_de_adata, compare_de_dataframe):
+def test_error_both_keys_and_dfs(adata, dataframe):
     with pytest.raises(ValueError):
-        compare_de(adata=compare_de_adata, de_key1="de_key1", de_df1=compare_de_dataframe[0])
+        pt_DGE = pt.tl.DGE()
+        pt_DGE.compare(adata=adata, de_key1="de_key1", de_df1=dataframe[0])
 
 
 def test_error_missing_adata():
     with pytest.raises(ValueError):
-        compare_de(de_key1="de_key1", de_key2="de_key2")
+        pt_DGE = pt.tl.DGE()
+        pt_DGE.compare(de_key1="de_key1", de_key2="de_key2")
 
 
-def test_error_missing_df(compare_de_dataframe):
+def test_error_missing_df(dataframe):
     with pytest.raises(ValueError):
-        compare_de(de_df1=compare_de_dataframe[0])
+        pt_DGE = pt.tl.DGE()
+        pt_DGE.compare(de_df1=dataframe[0])
 
 
-def test_compare_de_key(compare_de_adata):
-    results = compare_de(adata=compare_de_adata, de_key1="de_key1", de_key2="de_key2", shared_top=5)
+def test_key(adata):
+    pt_DGE = pt.tl.DGE()
+    results = pt_DGE.compare(
+        adata=adata, de_key1="de_key1", de_key2="de_key2", shared_top=5
+    )
     assert "shared_top_genes" in results
     assert "scores_corr" in results
     assert "pvals_adj_corr" in results
     assert "scores_ranks_corr" in results
 
 
-def test_compare_de_df(compare_de_dataframe):
-    results = compare_de(de_df1=compare_de_dataframe[0], de_df2=compare_de_dataframe[1], shared_top=5)
+def test_df(dataframe):
+    pt_DGE = pt.tl.DGE()
+    results = pt_DGE.compare(de_df1=dataframe[0], de_df2=dataframe[1], shared_top=5)
     assert "shared_top_genes" in results
     assert "scores_corr" in results
     assert "pvals_adj_corr" in results
     assert "scores_ranks_corr" in results
-
-
-def test_compare_class(test_data):
-    X, Y, C = test_data
-    result = compare_class(X, Y, C)
-    assert result <= 1
-
-
-def test_compare_knn(test_data):
-    X, Y, C = test_data
-    result = compare_knn(X, Y, C)
-    assert isinstance(result, dict)
-    assert "comp" in result
-    assert isinstance(result["comp"], float)
-
-    result_no_ctrl = compare_knn(X, Y)
-    assert isinstance(result_no_ctrl, dict)
diff --git a/tests/tools/_perturbation_space/test_comparison.py b/tests/tools/_perturbation_space/test_comparison.py
new file mode 100644
index 00000000..7f291f2b
--- /dev/null
+++ b/tests/tools/_perturbation_space/test_comparison.py
@@ -0,0 +1,29 @@
+import pytest
+import pertpy as pt
+
+
+@pytest.fixture
+def test_data(rng):
+    X = rng.normal(size=(100, 10))
+    Y = rng.normal(size=(100, 10))
+    C = rng.normal(size=(100, 10))
+    return X, Y, C
+
+
+def test_compare_class(test_data):
+    X, Y, C = test_data
+    pt_comparison = pt.tl.PerturbationComparison()
+    result = pt_comparison.compare_classification(X, Y, C)
+    assert result <= 1
+
+
+def test_compare_knn(test_data):
+    X, Y, C = test_data
+    pt_comparison = pt.tl.PerturbationComparison()
+    result = pt_comparison.compare_knn(X, Y, C)
+    assert isinstance(result, dict)
+    assert "comp" in result
+    assert isinstance(result["comp"], float)
+
+    result_no_ctrl = pt_comparison.compare_knn(X, Y)
+    assert isinstance(result_no_ctrl, dict)

From 2e7acf393f33523592603d2e30d7cb22262e855b Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Tue, 18 Jun 2024 21:26:12 +0200
Subject: [PATCH 29/34] fix pre-commit

---
 .../_dge_comparison.py                        | 41 ++++++-------------
 .../tools/_perturbation_space/_comparison.py  | 22 +++-------
 .../_differential_gene_expression/test_dge.py |  6 +--
 .../_perturbation_space/test_comparison.py    |  2 +-
 4 files changed, 20 insertions(+), 51 deletions(-)

diff --git a/pertpy/tools/_differential_gene_expression/_dge_comparison.py b/pertpy/tools/_differential_gene_expression/_dge_comparison.py
index 16c8c9c7..bd8c10fe 100644
--- a/pertpy/tools/_differential_gene_expression/_dge_comparison.py
+++ b/pertpy/tools/_differential_gene_expression/_dge_comparison.py
@@ -1,7 +1,8 @@
+from typing import Dict
+
 import numpy as np
 import pandas as pd
 from anndata import AnnData
-from typing import Dict
 
 
 class DGE:
@@ -13,7 +14,7 @@ def compare(
         de_df1: pd.DataFrame | None = None,
         de_df2: pd.DataFrame | None = None,
         shared_top: int = 100,
-    ) -> Dict[str, float]:
+    ) -> dict[str, float]:
         """Compare two differential expression analyses.
 
         Compare two sets of DE results and evaluate the similarity by the overlap of top DEG and
@@ -35,21 +36,15 @@ def compare(
 
         if de_df1 is None and de_df2 is None:  # use keys
             if not de_key1 or not de_key2:
-                raise ValueError(
-                    "Both `de_key1` and `de_key2` must be provided together if using `adata`."
-                )
+                raise ValueError("Both `de_key1` and `de_key2` must be provided together if using `adata`.")
 
         else:  # use dfs
             if de_df1 is None or de_df2 is None:
-                raise ValueError(
-                    "Both `de_df1` and `de_df2` must be provided together if using dataframes."
-                )
+                raise ValueError("Both `de_df1` and `de_df2` must be provided together if using dataframes.")
 
         if de_key1:
             if not adata:
-                raise ValueError(
-                    "`adata` should be provided with `de_key1` and `de_key2`. "
-                )
+                raise ValueError("`adata` should be provided with `de_key1` and `de_key2`. ")
             assert all(
                 k in adata.uns for k in [de_key1, de_key2]
             ), "Provided `de_key1` and `de_key2` must exist in `adata.uns`."
@@ -58,13 +53,9 @@ def compare(
         if de_df1 is not None:
             for df in (de_df1, de_df2):
                 if not {"variable", "log_fc", "adj_p_value"}.issubset(df.columns):
-                    raise ValueError(
-                        "Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'."
-                    )
+                    raise ValueError("Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'.")
 
-            assert set(de_df1["variable"]) == set(
-                de_df2["variable"]
-            ), "Variables in both dataframes must match."
+            assert set(de_df1["variable"]) == set(de_df2["variable"]), "Variables in both dataframes must match."
             vars = de_df1["variable"].sort_values()
 
         shared_top = min(shared_top, len(vars))
@@ -89,17 +80,9 @@ def compare(
                 top_names.append(df["variable"][:shared_top])
 
         metrics = {}
-        metrics["shared_top_genes"] = (
-            len(set(top_names[0]).intersection(top_names[1])) / shared_top
-        )
-        metrics["scores_corr"] = results["scores_0"].corr(
-            results["scores_1"], method="pearson"
-        )
-        metrics["pvals_adj_corr"] = results["pvals_adj_0"].corr(
-            results["pvals_adj_1"], method="pearson"
-        )
-        metrics["scores_ranks_corr"] = results["ranks_0"].corr(
-            results["ranks_1"], method="spearman"
-        )
+        metrics["shared_top_genes"] = len(set(top_names[0]).intersection(top_names[1])) / shared_top
+        metrics["scores_corr"] = results["scores_0"].corr(results["scores_1"], method="pearson")
+        metrics["pvals_adj_corr"] = results["pvals_adj_0"].corr(results["pvals_adj_1"], method="pearson")
+        metrics["scores_ranks_corr"] = results["ranks_0"].corr(results["ranks_1"], method="spearman")
 
         return metrics
diff --git a/pertpy/tools/_perturbation_space/_comparison.py b/pertpy/tools/_perturbation_space/_comparison.py
index 390c2d9d..2ea76cc2 100644
--- a/pertpy/tools/_perturbation_space/_comparison.py
+++ b/pertpy/tools/_perturbation_space/_comparison.py
@@ -36,17 +36,11 @@ def compare_classification(
         if clf is None:
             clf = LogisticRegression()
         n_x = real.shape[0]
-        data = (
-            sp_vstack((real, control)) if issparse(real) else np.vstack((real, control))
-        )
-        labels = np.concatenate(
-            [np.full(real.shape[0], "comp"), np.full(control.shape[0], "ctrl")]
-        )
+        data = sp_vstack((real, control)) if issparse(real) else np.vstack((real, control))
+        labels = np.concatenate([np.full(real.shape[0], "comp"), np.full(control.shape[0], "ctrl")])
 
         clf.fit(data, labels)
-        norm_score = clf.score(
-            simulated, np.full(simulated.shape[0], "comp")
-        ) / clf.score(real, labels[:n_x])
+        norm_score = clf.score(simulated, np.full(simulated.shape[0], "comp")) / clf.score(real, labels[:n_x])
         norm_score = min(1.0, norm_score)
 
         return norm_score
@@ -85,15 +79,9 @@ def compare_knn(
         n_y = simulated.shape[0]
 
         if control is None:
-            index_data = (
-                sp_vstack((simulated, real))
-                if issparse(real)
-                else np.vstack((simulated, real))
-            )
+            index_data = sp_vstack((simulated, real)) if issparse(real) else np.vstack((simulated, real))
         else:
-            datas = (
-                (simulated, real, control) if use_simulated_for_knn else (real, control)
-            )
+            datas = (simulated, real, control) if use_simulated_for_knn else (real, control)
             index_data = sp_vstack(datas) if issparse(real) else np.vstack(datas)
 
         y_in_index = use_simulated_for_knn or control is None
diff --git a/tests/tools/_differential_gene_expression/test_dge.py b/tests/tools/_differential_gene_expression/test_dge.py
index 4addf51a..74e31d61 100644
--- a/tests/tools/_differential_gene_expression/test_dge.py
+++ b/tests/tools/_differential_gene_expression/test_dge.py
@@ -1,8 +1,8 @@
-from anndata import AnnData
 import numpy as np
 import pandas as pd
 import pertpy as pt
 import pytest
+from anndata import AnnData
 
 
 @pytest.fixture
@@ -64,9 +64,7 @@ def test_error_missing_df(dataframe):
 
 def test_key(adata):
     pt_DGE = pt.tl.DGE()
-    results = pt_DGE.compare(
-        adata=adata, de_key1="de_key1", de_key2="de_key2", shared_top=5
-    )
+    results = pt_DGE.compare(adata=adata, de_key1="de_key1", de_key2="de_key2", shared_top=5)
     assert "shared_top_genes" in results
     assert "scores_corr" in results
     assert "pvals_adj_corr" in results
diff --git a/tests/tools/_perturbation_space/test_comparison.py b/tests/tools/_perturbation_space/test_comparison.py
index 7f291f2b..1751a5f1 100644
--- a/tests/tools/_perturbation_space/test_comparison.py
+++ b/tests/tools/_perturbation_space/test_comparison.py
@@ -1,5 +1,5 @@
-import pytest
 import pertpy as pt
+import pytest
 
 
 @pytest.fixture

From 69163ffd5d73974f5d93a2b5fdca2cd5131aa6c3 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Wed, 19 Jun 2024 20:25:53 +0200
Subject: [PATCH 30/34] pin numpy <2

---
 pyproject.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7139813d..728f12b5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,7 +62,8 @@ dependencies = [
     "pubchempy",
     "pyarrow",
     "blitzgsea",
-    "lamin_utils"
+    "lamin_utils",
+    "numpy<2.0.0"
 ]
 
 [project.optional-dependencies]

From 67c54be63f444a5447b543f0a86d5464ee125492 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Thu, 20 Jun 2024 20:21:54 +0200
Subject: [PATCH 31/34] unpin numpy

---
 pyproject.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 728f12b5..7139813d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,8 +62,7 @@ dependencies = [
     "pubchempy",
     "pyarrow",
     "blitzgsea",
-    "lamin_utils",
-    "numpy<2.0.0"
+    "lamin_utils"
 ]
 
 [project.optional-dependencies]

From 6e32f379b32df149e1543813af23f98060f7fb14 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Thu, 20 Jun 2024 22:02:36 +0200
Subject: [PATCH 32/34] speed up mahalanobis distance

---
 pertpy/tools/_distances/_distances.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index ee0f6e64..b7edf15d 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -1185,11 +1185,13 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        return mahalanobis(
-            self.aggregation_func(X, axis=0),
-            self.aggregation_func(Y, axis=0),
-            np.linalg.inv(np.cov(X.T)),
-        )
+        delta = self.aggregation_func(X, axis=0) - self.aggregation_func(Y, axis=0)
+        cov = np.cov(X.T)
+        s, u = np.linalg.eigh(cov)
+        ci = u @ (1 / s[..., None] * u.T)
+        return np.sqrt(np.sum(((delta @ ci) * delta), axis=-1))
+        # ci = np.linalg.pinv(cov)
+        # return np.sqrt(np.sum(((delta @ ci) * delta), axis=-1))
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.")

From 620e6453205c24550e75be83a46419cffea273e6 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Thu, 20 Jun 2024 22:58:33 +0200
Subject: [PATCH 33/34] use scipy to calculate mahalanobis distance

---
 .../_dge_comparison.py                             |  2 +-
 pertpy/tools/_distances/_distances.py              | 14 ++++++--------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/pertpy/tools/_differential_gene_expression/_dge_comparison.py b/pertpy/tools/_differential_gene_expression/_dge_comparison.py
index bd8c10fe..34013aac 100644
--- a/pertpy/tools/_differential_gene_expression/_dge_comparison.py
+++ b/pertpy/tools/_differential_gene_expression/_dge_comparison.py
@@ -40,7 +40,7 @@ def compare(
 
         else:  # use dfs
             if de_df1 is None or de_df2 is None:
-                raise ValueError("Both `de_df1` and `de_df2` must be provided together if using dataframes.")
+                raise ValueError("Both `de_df1` and `de_df2` must be provided together if using DataFrames.")
 
         if de_key1:
             if not adata:
diff --git a/pertpy/tools/_distances/_distances.py b/pertpy/tools/_distances/_distances.py
index b7edf15d..5cde82d5 100644
--- a/pertpy/tools/_distances/_distances.py
+++ b/pertpy/tools/_distances/_distances.py
@@ -585,7 +585,7 @@ def _bootstrap_mode_precomputed(self, sub_pwd, sub_idx, n_bootstraps=100, random
 
         distances = []
         for _ in range(n_bootstraps):
-            # in order to maintain the number of cells for both groups (whatever balancing they may have),
+            # To maintain the number of cells for both groups (whatever balancing they may have),
             # we sample the positive and negative indices separately
             bootstrap_pos_idx = rng.choice(a=sub_idx[sub_idx].index, size=sub_idx[sub_idx].size, replace=True)
             bootstrap_neg_idx = rng.choice(a=sub_idx[~sub_idx].index, size=sub_idx[~sub_idx].size, replace=True)
@@ -1185,13 +1185,11 @@ def __init__(self, aggregation_func: Callable = np.mean) -> None:
         self.aggregation_func = aggregation_func
 
     def __call__(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        delta = self.aggregation_func(X, axis=0) - self.aggregation_func(Y, axis=0)
-        cov = np.cov(X.T)
-        s, u = np.linalg.eigh(cov)
-        ci = u @ (1 / s[..., None] * u.T)
-        return np.sqrt(np.sum(((delta @ ci) * delta), axis=-1))
-        # ci = np.linalg.pinv(cov)
-        # return np.sqrt(np.sum(((delta @ ci) * delta), axis=-1))
+        return mahalanobis(
+            self.aggregation_func(X, axis=0),
+            self.aggregation_func(Y, axis=0),
+            np.linalg.inv(np.cov(X.T)),
+        )
 
     def from_precomputed(self, P: np.ndarray, idx: np.ndarray, **kwargs) -> float:
         raise NotImplementedError("Mahalanobis cannot be called on a pairwise distance matrix.")

From 10d348370c17eeff4cec620c457b626b644a4950 Mon Sep 17 00:00:00 2001
From: wxicu <xichenwu@outlook.com>
Date: Sun, 23 Jun 2024 18:46:37 +0200
Subject: [PATCH 34/34] rename DGE to DGEEVAL

---
 pertpy/tools/__init__.py                               |  2 +-
 pertpy/tools/_differential_gene_expression/__init__.py |  2 +-
 .../_differential_gene_expression/_dge_comparison.py   |  4 +---
 tests/tools/_differential_gene_expression/test_dge.py  | 10 +++++-----
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/pertpy/tools/__init__.py b/pertpy/tools/__init__.py
index 18b11f1d..4e2c709e 100644
--- a/pertpy/tools/__init__.py
+++ b/pertpy/tools/__init__.py
@@ -4,7 +4,7 @@
 from pertpy.tools._coda._tasccoda import Tasccoda
 from pertpy.tools._dialogue import Dialogue
 from pertpy.tools._differential_gene_expression import (
-    DGE,
+    DGEEVAL,
     EdgeR,
     PyDESeq2,
     Statsmodels,
diff --git a/pertpy/tools/_differential_gene_expression/__init__.py b/pertpy/tools/_differential_gene_expression/__init__.py
index b349cedb..35ccef51 100644
--- a/pertpy/tools/_differential_gene_expression/__init__.py
+++ b/pertpy/tools/_differential_gene_expression/__init__.py
@@ -1,5 +1,5 @@
 from ._base import ContrastType, LinearModelBase, MethodBase
-from ._dge_comparison import DGE
+from ._dge_comparison import DGEEVAL
 from ._edger import EdgeR
 from ._pydeseq2 import PyDESeq2
 from ._simple_tests import SimpleComparisonBase, TTest, WilcoxonTest
diff --git a/pertpy/tools/_differential_gene_expression/_dge_comparison.py b/pertpy/tools/_differential_gene_expression/_dge_comparison.py
index 34013aac..0492dcc5 100644
--- a/pertpy/tools/_differential_gene_expression/_dge_comparison.py
+++ b/pertpy/tools/_differential_gene_expression/_dge_comparison.py
@@ -1,11 +1,9 @@
-from typing import Dict
-
 import numpy as np
 import pandas as pd
 from anndata import AnnData
 
 
-class DGE:
+class DGEEVAL:
     def compare(
         self,
         adata: AnnData | None = None,
diff --git a/tests/tools/_differential_gene_expression/test_dge.py b/tests/tools/_differential_gene_expression/test_dge.py
index 74e31d61..1f9d3eb8 100644
--- a/tests/tools/_differential_gene_expression/test_dge.py
+++ b/tests/tools/_differential_gene_expression/test_dge.py
@@ -46,24 +46,24 @@ def dataframe(rng):
 
 def test_error_both_keys_and_dfs(adata, dataframe):
     with pytest.raises(ValueError):
-        pt_DGE = pt.tl.DGE()
+        pt_DGE = pt.tl.DGEEVAL()
         pt_DGE.compare(adata=adata, de_key1="de_key1", de_df1=dataframe[0])
 
 
 def test_error_missing_adata():
     with pytest.raises(ValueError):
-        pt_DGE = pt.tl.DGE()
+        pt_DGE = pt.tl.DGEEVAL()
         pt_DGE.compare(de_key1="de_key1", de_key2="de_key2")
 
 
 def test_error_missing_df(dataframe):
     with pytest.raises(ValueError):
-        pt_DGE = pt.tl.DGE()
+        pt_DGE = pt.tl.DGEEVAL()
         pt_DGE.compare(de_df1=dataframe[0])
 
 
 def test_key(adata):
-    pt_DGE = pt.tl.DGE()
+    pt_DGE = pt.tl.DGEEVAL()
     results = pt_DGE.compare(adata=adata, de_key1="de_key1", de_key2="de_key2", shared_top=5)
     assert "shared_top_genes" in results
     assert "scores_corr" in results
@@ -72,7 +72,7 @@ def test_key(adata):
 
 
 def test_df(dataframe):
-    pt_DGE = pt.tl.DGE()
+    pt_DGE = pt.tl.DGEEVAL()
     results = pt_DGE.compare(de_df1=dataframe[0], de_df2=dataframe[1], shared_top=5)
     assert "shared_top_genes" in results
     assert "scores_corr" in results