diff --git a/Python/phate/phate.py b/Python/phate/phate.py index 216afab..64d9fc0 100644 --- a/Python/phate/phate.py +++ b/Python/phate/phate.py @@ -39,7 +39,9 @@ def calculate_kernel(data, k=15, a=10, alpha_decay=True, knn_dist='euclidean', Parameters ---------- data : array-like [n_samples, n_dimensions] - 2 dimensional input data array with n cells and p dimensions + 2 dimensional input data array with n cells and p dimensions If + `knn_dist` is 'precomputed', `data` should be a n_samples x n_samples + distance matrix k : int, optional, default: 15 used to set epsilon while autotuning kernel bandwidth @@ -51,9 +53,10 @@ def calculate_kernel(data, k=15, a=10, alpha_decay=True, knn_dist='euclidean', If true, use the alpha decaying kernel knn_dist : string, optional, default: 'euclidean' - recommended values: 'euclidean' and 'cosine' - Any metric from scipy.spatial.distance can be used - distance metric for building kNN graph + recommended values: 'euclidean', 'cosine', 'precomputed' + Any metric from `scipy.spatial.distance` can be used + distance metric for building kNN graph. If 'precomputed', + `data` should be an n_samples x n_samples distance matrix verbose : boolean, optional, default: True If true, print status messages @@ -79,9 +82,7 @@ def calculate_kernel(data, k=15, a=10, alpha_decay=True, knn_dist='euclidean', kernel : array-like [n_samples, n_samples] kernel matrix built from the input data """ - precomputed = isinstance(knn_dist, list) or \ - isinstance(knn_dist, np.ndarray) - if not precomputed and ndim < data.shape[1]: + if knn_dist != 'precomputed' and ndim < data.shape[1]: if verbose: print("Calculating PCA...") start = time.time() @@ -102,10 +103,10 @@ def calculate_kernel(data, k=15, a=10, alpha_decay=True, knn_dist='euclidean', # kernel includes self as connection but not in k # actually search for k+1 neighbors including self k = k + 1 - if alpha_decay: + if alpha_decay and a is not None: try: - if precomputed: - pdx = knn_dist + if knn_dist == 'precomputed': + pdx = data else: pdx = squareform(pdist(data, metric=knn_dist)) knn_dist = np.partition(pdx, k, axis=1)[:, :k] @@ -118,7 +119,8 @@ def calculate_kernel(data, k=15, a=10, alpha_decay=True, knn_dist='euclidean', 'Try removing duplicates.') kernel = np.exp(-1 * (pdx ** a)) # not really Gaussian kernel else: - if precomputed: + if knn_dist == 'precomputed': + # we already have pairwise distances pdx = knn_dist knn_idx = np.argpartition(pdx, k, axis=1)[:, :k] ind_ptr = np.arange(knn_idx.shape[0] + 1) * knn_idx.shape[1] @@ -184,7 +186,8 @@ def calculate_landmark_operator(kernel, n_landmark=2000, n_components=n_svd, random_state=random_state) if verbose: - print("SVD complete in {:.2f} seconds".format(time.time() - start)) + print("Calculated SVD in {:.2f} seconds".format( + time.time() - start)) start = time.time() print("Calculating Kmeans...") kmeans = MiniBatchKMeans(n_landmark, @@ -194,7 +197,8 @@ def calculate_landmark_operator(kernel, n_landmark=2000, clusters = kmeans.fit_predict(np.matmul(U, np.diagflat(S))) landmarks = np.unique(clusters) if verbose: - print("Complete in ", time.time() - start) + print("Calculated Kmeans in {:.2f} seconds".format( + time.time() - start)) # transition matrices if is_sparse: @@ -226,7 +230,9 @@ def calculate_operator(data, k=15, a=10, alpha_decay=True, n_landmark=2000, Parameters ---------- data : array-like [n_samples, n_dimensions] - 2 dimensional input data array with n cells and p dimensions + 2 dimensional input data array with n cells and p dimensions. If + `knn_dist` is 'precomputed', `data` should be a n_samples x n_samples + distance or affinity matrix k : int, optional, default: 15 used to set epsilon while autotuning kernel bandwidth @@ -241,9 +247,11 @@ def calculate_operator(data, k=15, a=10, alpha_decay=True, n_landmark=2000, number of landmarks to use in fast PHATE knn_dist : string, optional, default: 'euclidean' - recommended values: 'euclidean' and 'cosine' - Any metric from scipy.spatial.distance can be used - distance metric for building kNN graph + recommended values: 'euclidean', 'cosine', 'precomputed' + Any metric from `scipy.spatial.distance` can be used + distance metric for building kNN graph. If 'precomputed', + `data` should be an n_samples x n_samples distance or + affinity matrix diff_op : array-like, optional shape=[n_samples, n_samples], default: None Precomputed diffusion operator @@ -296,12 +304,26 @@ def calculate_operator(data, k=15, a=10, alpha_decay=True, n_landmark=2000, if diff_op is None: if verbose: print("Building kNN graph and diffusion operator...") - kernel = calculate_kernel(data, a=a, k=k, knn_dist=knn_dist, - ndim=n_pca, - alpha_decay=alpha_decay, - random_state=random_state, - n_jobs=n_jobs, - verbose=verbose) + if knn_dist == 'precomputed' and np.all(np.diagonal(data) != 0): + print("Using precomputed affinity matrix...") + kernel = data + else: + if knn_dist == 'precomputed': + if np.all(np.diagonal(data) == 0): + print("Using precomputed distance matrix...") + else: + raise ValueError( + "Cannot determine precomputed data type. " + "Precomputed affinity matrices should have " + "only non-zero entries on the diagonal, and" + " precomputed distance matrices should have" + " only zero entries on the diagonal.") + kernel = calculate_kernel(data, a=a, k=k, knn_dist=knn_dist, + ndim=n_pca, + alpha_decay=alpha_decay, + random_state=random_state, + n_jobs=n_jobs, + verbose=verbose) diff_op, landmark_transitions = calculate_landmark_operator( kernel, n_landmark=n_landmark, random_state=random_state, verbose=verbose) @@ -382,17 +404,32 @@ def embed_mds(diff_op, t=30, n_components=2, diff_potential=None, X = np.linalg.matrix_power(diff_op, t) # diffused diffusion operator - if potential_method == 'log': + if potential_method == 'log': # or potential_method == 1: # handling small values # X[X <= np.finfo(float).eps] = np.finfo( # float).eps - X = X + 1e-3 + X = X + 1e-7 diff_potential = -1 * np.log(X) # diffusion potential elif potential_method == 'sqrt': diff_potential = np.sqrt(X) # diffusion potential - else: - raise ValueError("Allowable 'potential_method' values: 'log' or " - "'sqrt'. '%s' was passed." % (potential_method)) + else: # if isinstance(potential_method, str): + raise ValueError( + "Allowable 'potential_method' values: 'log' or " + "'sqrt'. '{}' was passed.".format(potential_method)) + # else: + # # gamma + # print("Warning: gamma potential is not stable." + # " Recommended values: 'log' or 'sqrt'") + # if potential_method > 1 or potential_method < -1: + # raise ValueError( + # "Allowable 'potential_method' values between -1 and 1" + # " inclusive. '{}' was passed.".format(potential_method)) + # elif potential_method != -1: + # diff_potential = 2 / (1 - potential_method) * \ + # np.power(X, ((1 - potential_method) / 2)) + # else: + # # gamma = -1 is just MDS on DM + # diff_potential = X if verbose: print("Calculated diffusion potential in " @@ -425,9 +462,9 @@ class PHATE(BaseEstimator): """PHATE operator which performs dimensionality reduction. Potential of Heat-diffusion for Affinity-based Trajectory Embedding - (PHATE).[1]_ Embeds high dimensional single-cell data into two or three + (PHATE) embeds high dimensional single-cell data into two or three dimensions for visualization of biological progressions as described - in . + in Moon et al, 2017 [1]_. Parameters ---------- @@ -466,9 +503,11 @@ class PHATE(BaseEstimator): log(n_samples) time. knn_dist : string, optional, default: 'euclidean' - recommended values: 'euclidean' and 'cosine' + recommended values: 'euclidean', 'cosine', 'precomputed' Any metric from `scipy.spatial.distance` can be used - distance metric for building kNN graph + distance metric for building kNN graph. If 'precomputed', + `data` should be an n_samples x n_samples distance or + affinity matrix mds_dist : string, optional, default: 'euclidean' recommended values: 'euclidean' and 'cosine' @@ -628,7 +667,9 @@ def fit(self, X): X : array, shape=[n_samples, n_features] input data with `n_samples` samples and `n_dimensions` dimensions. Accepted data types: `numpy.ndarray`, - `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData` + `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If + `knn_dist` is 'precomputed', `data` should be a n_samples x + n_samples distance or affinity matrix Returns ------- @@ -673,7 +714,9 @@ def transform(self, X=None, t_max=100, plot_optimal_t=False, ax=None): dimensions. Not required, since PHATE does not currently embed cells not given in the input matrix to `PHATE.fit()`. Accepted data types: `numpy.ndarray`, - `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. + `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If + `knn_dist` is 'precomputed', `data` should be a n_samples x + n_samples distance or affinity matrix t_max : int, optional, default: 100 maximum t to test if `t` is set to 'auto' @@ -733,7 +776,9 @@ def fit_transform(self, X, **kwargs): X : array, shape=[n_samples, n_features] input data with `n_samples` samples and `n_dimensions` dimensions. Accepted data types: `numpy.ndarray`, - `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData` + `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData` If + `knn_dist` is 'precomputed', `data` should be a n_samples x + n_samples distance or affinity matrix kwargs : further arguments for `PHATE.transform()` Keyword arguments as specified in :func:`~phate.PHATE.transform` diff --git a/Python/phate/version.py b/Python/phate/version.py index d31c31e..788da1f 100644 --- a/Python/phate/version.py +++ b/Python/phate/version.py @@ -1 +1 @@ -__version__ = "0.2.3" +__version__ = "0.2.4" diff --git a/test/phate_examples.py b/test/phate_examples.py index fcf03f8..0afb9e8 100644 --- a/test/phate_examples.py +++ b/test/phate_examples.py @@ -7,137 +7,146 @@ python_version = "{}.{}".format(sys.version_info.major, sys.version_info.minor) -# generate DLA tree -M, C = phate.tree.gen_dla(n_dim=100, n_branch=10, branch_length=300, - rand_multiplier=2, seed=37, sigma=4) - -# instantiate phate_operator -phate_operator = phate.PHATE(n_components=2, a=10, k=5, t=30, mds='classic', - knn_dist='euclidean', mds_dist='euclidean', - n_jobs=-2, n_landmark=None) - -# run phate with classic MDS -print("DLA tree, classic MDS") -Y_cmds = phate_operator.fit_transform(M) - -# run phate with metric MDS -# change the MDS embedding without recalculating diffusion potential -phate_operator.reset_mds(mds="metric") -print("DLA tree, metric MDS (log)") -Y_mmds = phate_operator.fit_transform(M) - -# run phate with nonmetric MDS -phate_operator.reset_potential(potential_method="sqrt") -print("DLA tree, metric MDS (sqrt)") -Y_sqrt = phate_operator.fit_transform(M) - -phate_fast_operator = phate.PHATE( - n_components=2, a=10, t=90, k=5, mds='classic', mds_dist='euclidean', - alpha_decay=True, n_landmark=1000) -# run phate with classic MDS -print("DLA tree, fast classic MDS") -Y_cmds_fast = phate_fast_operator.fit_transform(M) - -# run phate with metric MDS -# change the MDS embedding without recalculating diffusion potential -phate_fast_operator.reset_mds(mds="metric") -print("DLA tree, fast metric MDS (log)") -Y_mmds_fast = phate_fast_operator.fit_transform(M) - -# run phate with nonmetric MDS -phate_fast_operator.reset_potential(potential_method="sqrt") -print("DLA tree, fast metric MDS (sqrt)") -Y_sqrt_fast = phate_fast_operator.fit_transform(M) - -fig, axes = plt.subplots(2, 3) - -(ax1, ax2, ax3) = axes[0] -fig.set_size_inches(12, 8) -plt.setp((ax1, ax2, ax3), xticks=[], xticklabels=[], yticks=[]) - -# plotting PHATE - classic MDS -ax1.scatter(Y_cmds[:, 0], Y_cmds[:, 1], s=10, c=C) -ax1.set_xlabel("phate 1") -ax1.set_ylabel("phate 2") -ax1.set_title("PHATE embedding of DLA fractal tree\nClassic MDS") - -# plotting PHATE - metric MDS -ax2.scatter(Y_mmds[:, 0], Y_mmds[:, 1], s=10, c=C) -ax2.set_xlabel("phate 1") -ax2.set_title("PHATE embedding of DLA fractal tree\nMetric MDS, log") - -# plotting PHATE - nonmetric MDS -ax3.scatter(Y_sqrt[:, 0], Y_sqrt[:, 1], s=10, c=C) -ax3.set_xlabel("phate 1") -ax3.set_title("PHATE embedding of DLA fractal tree\nMetric MDS, sqrt") - -(ax1, ax2, ax3) = axes[1] -plt.setp((ax1, ax2, ax3), xticks=[], xticklabels=[], yticks=[]) - -# plotting PHATE - classic MDS -ax1.scatter(Y_cmds_fast[:, 0], Y_cmds_fast[:, 1], s=10, c=C) -ax1.set_xlabel("phate 1") -ax1.set_ylabel("phate 2") -ax1.set_title("PHATE embedding of DLA fractal tree\nFast Classic MDS") - -# plotting PHATE - metric MDS -ax2.scatter(Y_mmds_fast[:, 0], Y_mmds_fast[:, 1], s=10, c=C) -ax2.set_xlabel("phate 1") -ax2.set_title("PHATE embedding of DLA fractal tree\nFast Metric MDS, log") - -# plotting PHATE - nonmetric MDS -ax3.scatter(Y_sqrt_fast[:, 0], Y_sqrt_fast[:, 1], s=10, c=C) -ax3.set_xlabel("phate 1") -ax3.set_title("PHATE embedding of DLA fractal tree\nFast Metric MDS, sqrt") - -plt.tight_layout() -plt.savefig("python{}_tree.png".format(python_version), dpi=100) - -clusters = pd.read_csv("../data/MAP.csv", header=None) -clusters.columns = pd.Index(['wells', 'clusters']) -bmmsc = pd.read_csv("../data/BMMC_myeloid.csv.gz", index_col=0) - -C = clusters['clusters'] # using cluster labels from original publication - -# library_size_normalize performs L1 normalization on each cell -bmmsc_norm = phate.preprocessing.library_size_normalize(bmmsc) -phate_operator = phate.PHATE( - n_components=2, t='auto', a=200, k=10, mds='metric', mds_dist='euclidean', - n_landmark=None) -phate_fast_operator = phate.PHATE( - n_components=2, t='auto', k=10, mds='metric', mds_dist='euclidean', - n_landmark=1000) - - -fig, (ax1, ax2) = plt.subplots(1, 2, sharey='all') -print("BMMSC, exact PHATE") -Y_mmds = phate_operator.fit_transform(bmmsc_norm, t_max=100, - plot_optimal_t=True, ax=ax1) -ax1.set_title("Optimal t selection on 2730 BMMSCs\n" - "Exact PHATE, " + ax1.get_title()) -print("BMMSC, fast PHATE") -Y_mmds_fast = phate_fast_operator.fit_transform(bmmsc_norm, t_max=100, - plot_optimal_t=True, ax=ax2) -ax2.set_title("Optimal t selection on 2730 BMMSCs\n" - "Fast PHATE, " + ax2.get_title()) -plt.tight_layout() -plt.savefig("python{}_bmmsc_optimal_t.png".format(python_version), dpi=100) - - -fig, (ax1, ax2) = plt.subplots(1, 2) -plt.setp((ax1, ax2), xticklabels=[], yticklabels=[]) - -ax1.scatter(Y_mmds[:, 0], Y_mmds[:, 1], s=1, c=C, cmap="Paired") -ax1.set_xlabel("phate 1") -ax1.set_ylabel("phate 2") -ax1.set_title("PHATE embedding of 2730 BMMSCs\nExact PHATE") - -ax2.scatter(Y_mmds_fast[:, 0], Y_mmds_fast[:, 1], - s=1, c=C, cmap="Paired") -ax2.set_xlabel("phate 1") -ax2.set_ylabel("phate 2") -ax2.set_title("PHATE embedding of 2730 BMMSCs\nFast PHATE") - -plt.gcf().set_size_inches(8, 8) -plt.tight_layout() -plt.savefig("python{}_bmmsc.png".format(python_version), dpi=100) + +def test_tree(): + # generate DLA tree + M, C = phate.tree.gen_dla(n_dim=100, n_branch=10, branch_length=300, + rand_multiplier=2, seed=37, sigma=4) + + # instantiate phate_operator + phate_operator = phate.PHATE(n_components=2, a=10, k=5, t=30, + mds='classic', knn_dist='euclidean', + mds_dist='euclidean', n_jobs=-2, + n_landmark=None) + + # run phate with classic MDS + print("DLA tree, classic MDS") + Y_cmds = phate_operator.fit_transform(M) + + # run phate with metric MDS + # change the MDS embedding without recalculating diffusion potential + phate_operator.reset_mds(mds="metric") + print("DLA tree, metric MDS (log)") + Y_mmds = phate_operator.fit_transform(M) + + # run phate with nonmetric MDS + phate_operator.reset_potential(potential_method="sqrt") + print("DLA tree, metric MDS (sqrt)") + Y_sqrt = phate_operator.fit_transform(M) + + phate_fast_operator = phate.PHATE( + n_components=2, a=10, t=90, k=5, mds='classic', mds_dist='euclidean', + alpha_decay=True, n_landmark=1000) + # run phate with classic MDS + print("DLA tree, fast classic MDS") + Y_cmds_fast = phate_fast_operator.fit_transform(M) + + # run phate with metric MDS + # change the MDS embedding without recalculating diffusion potential + phate_fast_operator.reset_mds(mds="metric") + print("DLA tree, fast metric MDS (log)") + Y_mmds_fast = phate_fast_operator.fit_transform(M) + + # run phate with nonmetric MDS + phate_fast_operator.reset_potential(potential_method="sqrt") + print("DLA tree, fast metric MDS (sqrt)") + Y_sqrt_fast = phate_fast_operator.fit_transform(M) + + fig, axes = plt.subplots(2, 3) + + (ax1, ax2, ax3) = axes[0] + fig.set_size_inches(12, 8) + plt.setp((ax1, ax2, ax3), xticks=[], xticklabels=[], yticks=[]) + + # plotting PHATE - classic MDS + ax1.scatter(Y_cmds[:, 0], Y_cmds[:, 1], s=10, c=C) + ax1.set_xlabel("phate 1") + ax1.set_ylabel("phate 2") + ax1.set_title("PHATE embedding of DLA fractal tree\nClassic MDS") + + # plotting PHATE - metric MDS + ax2.scatter(Y_mmds[:, 0], Y_mmds[:, 1], s=10, c=C) + ax2.set_xlabel("phate 1") + ax2.set_title("PHATE embedding of DLA fractal tree\nMetric MDS, log") + + # plotting PHATE - nonmetric MDS + ax3.scatter(Y_sqrt[:, 0], Y_sqrt[:, 1], s=10, c=C) + ax3.set_xlabel("phate 1") + ax3.set_title("PHATE embedding of DLA fractal tree\nMetric MDS, sqrt") + + (ax1, ax2, ax3) = axes[1] + plt.setp((ax1, ax2, ax3), xticks=[], xticklabels=[], yticks=[]) + + # plotting PHATE - classic MDS + ax1.scatter(Y_cmds_fast[:, 0], Y_cmds_fast[:, 1], s=10, c=C) + ax1.set_xlabel("phate 1") + ax1.set_ylabel("phate 2") + ax1.set_title("PHATE embedding of DLA fractal tree\nFast Classic MDS") + + # plotting PHATE - metric MDS + ax2.scatter(Y_mmds_fast[:, 0], Y_mmds_fast[:, 1], s=10, c=C) + ax2.set_xlabel("phate 1") + ax2.set_title("PHATE embedding of DLA fractal tree\nFast Metric MDS, log") + + # plotting PHATE - nonmetric MDS + ax3.scatter(Y_sqrt_fast[:, 0], Y_sqrt_fast[:, 1], s=10, c=C) + ax3.set_xlabel("phate 1") + ax3.set_title("PHATE embedding of DLA fractal tree\nFast Metric MDS, sqrt") + + plt.tight_layout() + plt.savefig("python{}_tree.png".format(python_version), dpi=100) + + +def test_bmmsc(): + clusters = pd.read_csv("../data/MAP.csv", header=None) + clusters.columns = pd.Index(['wells', 'clusters']) + bmmsc = pd.read_csv("../data/BMMC_myeloid.csv.gz", index_col=0) + + C = clusters['clusters'] # using cluster labels from original publication + + # library_size_normalize performs L1 normalization on each cell + bmmsc_norm = phate.preprocessing.library_size_normalize(bmmsc) + phate_operator = phate.PHATE( + n_components=2, t='auto', a=200, k=10, mds='metric', + mds_dist='euclidean', n_landmark=None) + phate_fast_operator = phate.PHATE( + n_components=2, t='auto', k=10, mds='metric', mds_dist='euclidean', + n_landmark=1000) + + fig, (ax1, ax2) = plt.subplots(1, 2, sharey='all') + print("BMMSC, exact PHATE") + Y_mmds = phate_operator.fit_transform(bmmsc_norm, t_max=100, + plot_optimal_t=True, ax=ax1) + ax1.set_title("Optimal t selection on 2730 BMMSCs\n" + "Exact PHATE, " + ax1.get_title()) + print("BMMSC, fast PHATE") + Y_mmds_fast = phate_fast_operator.fit_transform(bmmsc_norm, t_max=100, + plot_optimal_t=True, + ax=ax2) + ax2.set_title("Optimal t selection on 2730 BMMSCs\n" + "Fast PHATE, " + ax2.get_title()) + plt.tight_layout() + plt.savefig("python{}_bmmsc_optimal_t.png".format(python_version), dpi=100) + + fig, (ax1, ax2) = plt.subplots(1, 2) + plt.setp((ax1, ax2), xticklabels=[], yticklabels=[]) + + ax1.scatter(Y_mmds[:, 0], Y_mmds[:, 1], s=1, c=C, cmap="Paired") + ax1.set_xlabel("phate 1") + ax1.set_ylabel("phate 2") + ax1.set_title("PHATE embedding of 2730 BMMSCs\nExact PHATE") + + ax2.scatter(Y_mmds_fast[:, 0], Y_mmds_fast[:, 1], + s=1, c=C, cmap="Paired") + ax2.set_xlabel("phate 1") + ax2.set_ylabel("phate 2") + ax2.set_title("PHATE embedding of 2730 BMMSCs\nFast PHATE") + + plt.gcf().set_size_inches(8, 8) + plt.tight_layout() + plt.savefig("python{}_bmmsc.png".format(python_version), dpi=100) + + +if __name__ == '__main__': + test_tree() + test_bmmsc() diff --git a/test/python3.6_bmmsc.png b/test/python3.6_bmmsc.png index d569354..ba2254c 100644 Binary files a/test/python3.6_bmmsc.png and b/test/python3.6_bmmsc.png differ diff --git a/test/python3.6_bmmsc_optimal_t.png b/test/python3.6_bmmsc_optimal_t.png index 45f7c60..bc47c83 100644 Binary files a/test/python3.6_bmmsc_optimal_t.png and b/test/python3.6_bmmsc_optimal_t.png differ diff --git a/test/python3.6_tree.png b/test/python3.6_tree.png index b6dc217..5db2ef2 100644 Binary files a/test/python3.6_tree.png and b/test/python3.6_tree.png differ