From 5107b106825f4d26d8bb31c6e65cf94aa859a67e Mon Sep 17 00:00:00 2001 From: Martin Royer Date: Thu, 22 Feb 2018 16:02:26 +0100 Subject: [PATCH] clean-up and new test file --- pecok/admm.py | 9 +++-- pecok/pecok_clustering.py | 26 ------------ tests/test_clustering.py | 74 +++++++++++++++++++++++++++++++++++ tests/test_pointclustering.py | 24 ------------ 4 files changed, 79 insertions(+), 54 deletions(-) delete mode 100644 pecok/pecok_clustering.py create mode 100644 tests/test_clustering.py delete mode 100644 tests/test_pointclustering.py diff --git a/pecok/admm.py b/pecok/admm.py index 5a9eb90..16277e2 100644 --- a/pecok/admm.py +++ b/pecok/admm.py @@ -48,7 +48,7 @@ def proj_Snp_imp(Y): return Y -def pecok_admm(relational_data, K, n_iter_max=-1): +def pecok_admm(relational_data, K, n_iter_max=-1, rho=1, mat_init=None): """Implementation of Alternating Direction Method of Multipliers Parameters @@ -58,10 +58,11 @@ def pecok_admm(relational_data, K, n_iter_max=-1): n_samples,_ = relational_data.shape if n_iter_max < 0: n_iter_max = np.max((1000,2*n_samples)) - rho = 10.0 relational_data = relational_data / np.linalg.norm(relational_data) X, Y, Z = np.identity(n_samples), np.identity(n_samples), np.identity(n_samples) + if mat_init is not None: + X, Y, Z = mat_init, mat_init, mat_init X = X + 0.2*np.random.random((n_samples, n_samples)) U, V, W = np.zeros((n_samples,n_samples)), np.zeros((n_samples,n_samples)), np.zeros((n_samples,n_samples)) Xbar = (X + Y + Z)/3 @@ -90,8 +91,8 @@ def pecok_admm(relational_data, K, n_iter_max=-1): def is_primal_high(res_primal, X, Y, Z): - return res_primal > 1e-4 * np.max((np.linalg.norm(X), np.linalg.norm(Y), np.linalg.norm(Z))) + return res_primal > 1e-3 * np.max((np.linalg.norm(X), np.linalg.norm(Y), np.linalg.norm(Z))) def is_dual_high(res_dual, Y, Z): - return res_dual > 1e-4 * (np.sqrt(Y.shape[0]) + np.linalg.norm(Y) + np.linalg.norm(Z)) + return res_dual > 1e-3 * (np.sqrt(Y.shape[0]) + np.linalg.norm(Y) + np.linalg.norm(Z)) diff --git a/pecok/pecok_clustering.py b/pecok/pecok_clustering.py deleted file mode 100644 index edf0aa7..0000000 --- a/pecok/pecok_clustering.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Gamma estimation""" - -# author: Martin Royer -# License: MIT - -from .gamma import gamma_hat4 -from .admm import pecok_admm - -############################################################################### -def cluster(X, K): - """Implementation of PECOK estimator of B* - - Parameters - ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) - Training instances to cluster.""" - return pecok_admm(X.dot(X.T)-gamma_hat4(X), K) - -def cluster_sbm(A, K): - """Implementation of PECOK estimator of B* - - Parameters - ---------- - A : adjacency matrix for network, shape=(n_samples, n_samples) - Training instances to cluster.""" - return pecok_admm(A.dot(A), K) diff --git a/tests/test_clustering.py b/tests/test_clustering.py new file mode 100644 index 0000000..e35b708 --- /dev/null +++ b/tests/test_clustering.py @@ -0,0 +1,74 @@ +"""Test for variable and point clustering""" + +# author: Martin Royer +# License: MIT + +import numpy as np +from sklearn import cluster +import scipy.sparse.linalg as ssl + +from pecok import gamma, admm + + +def hierarchial_clustering(obs, n_struct): + hclustering = cluster.AgglomerativeClustering(linkage='ward', n_clusters=n_struct) + return hclustering.fit(obs, n_struct) + + +def kmeans_clustering(obs, n_struct): + k_means = cluster.KMeans(n_clusters=n_struct, init='k-means++', n_init=100) + return k_means.fit(obs) + + +def spectral_clustering(obs, n_struct): + approx, _, _ = ssl.svds(obs, k=n_struct) + return hierarchial_clustering(approx, n_struct) + + +def pecok_clustering(obs, n_struct, rho=5): + gram_corrected = (obs.T.dot(obs) - np.diag(gamma.gamma_hat4(obs.T))) / obs.shape[0] + U, _, V = ssl.svds(gram_corrected, k=n_struct) + Bhat = admm.pecok_admm(gram_corrected, K=n_struct, rho=rho, mat_init=U.dot(V)) + return hierarchial_clustering(Bhat, n_struct=n_struct) + + +seed = 432 +np.random.seed(seed) +print("seed is %i" % seed) + +n_var = 10 +n_obs = 100 + +print("\nVAR CLUSTERING\n\n") + +truth = np.asmatrix(np.concatenate((np.repeat(0, n_var//2), np.repeat(1, n_var//2)))) +membership = truth.T.dot(np.matrix([1, 0])) + (1-truth).T.dot(np.matrix([0, 1])) +stds = np.ones(n_var) +stds[:(n_var//2)] = 0.1 +sigma = membership.dot(0.1*np.identity(2)).dot(membership.T) + np.diag(stds) +mat_data = np.random.multivariate_normal(mean=np.zeros(n_var), cov=sigma, size=n_obs) +gram_data = mat_data.T.dot(mat_data) / mat_data.shape[0] + +print("truth:".ljust(15), truth) +print("hierarchical:".ljust(15), hierarchial_clustering(mat_data.T, n_struct=2).labels_) +print("kmeans:".ljust(15), kmeans_clustering(mat_data.T, n_struct=2).labels_) +print("spectral:".ljust(15), spectral_clustering(gram_data, n_struct=2).labels_) +print("pecok:".ljust(15), pecok_clustering(mat_data, n_struct=2).labels_) + +print("\nPOINT CLUSTERING\n\n") + +n_var = 100 +n_obs = 10 + +truth = np.asmatrix(np.concatenate((np.repeat(0, n_obs//2), np.repeat(1, n_obs//2)))) +X = np.zeros((n_obs, n_var)) +snr = 0.3 +X[:n_obs//2, :] = np.ones(n_var)*snr + np.random.normal(scale=1, size=(n_obs//2, n_var)) +X[n_obs//2:, :] = -np.ones(n_var)*snr + np.random.normal(scale=0.1, size=(n_obs//2, n_var)) +gram = X.dot(X.T) / X.shape[1] + +print("truth:".ljust(15), truth) +print("hierarchical:".ljust(15), hierarchial_clustering(X, n_struct=2).labels_) +print("kmeans:".ljust(15), kmeans_clustering(X, n_struct=2).labels_) +print("spectral:".ljust(15), spectral_clustering(gram, n_struct=2).labels_) +print("pecok:".ljust(15), pecok_clustering(X.T, n_struct=2).labels_) diff --git a/tests/test_pointclustering.py b/tests/test_pointclustering.py deleted file mode 100644 index ceafa39..0000000 --- a/tests/test_pointclustering.py +++ /dev/null @@ -1,24 +0,0 @@ -import os -import sys -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) - -import numpy as np -from sklearn import cluster -from pecok import pecok_clustering - -seed = 432 -np.random.seed(seed) -print("seed is %i" % seed) - -n_samples = 10 -n_features = 100 -truth = np.concatenate((np.repeat(0, n_samples//2), np.repeat(1, n_samples//2))) -X = np.zeros((n_samples, n_features)) -X[:n_samples//2, :] = np.ones(n_features)*0.1 + np.random.normal(scale=1, size=(n_samples//2, n_features)) -X[n_samples//2:, :] = -np.ones(n_features)*0.1 + np.random.normal(scale=0.1, size=(n_samples//2, n_features)) - -Bhat = pecok_clustering.cluster(X, 2) -kMeans = cluster.KMeans(n_clusters=2, init='k-means++', n_init=100, copy_x=True) -print("truth:".ljust(10), truth) -print("pecok:".ljust(10), kMeans.fit(Bhat).labels_) -print("kmeans:".ljust(10), kMeans.fit(X).labels_)