clean-up and new test file

martinroyer · Apr 14, 2018 · 5107b10 · 5107b10
1 parent 10ee2db
commit 5107b10
Show file tree

Hide file tree

Showing 4 changed files with 79 additions and 54 deletions.
diff --git a/pecok/admm.py b/pecok/admm.py
@@ -48,7 +48,7 @@ def proj_Snp_imp(Y):
     return Y
 
 
-def pecok_admm(relational_data, K, n_iter_max=-1):
+def pecok_admm(relational_data, K, n_iter_max=-1, rho=1, mat_init=None):
     """Implementation of Alternating Direction Method of Multipliers
 
     Parameters
@@ -58,10 +58,11 @@ def pecok_admm(relational_data, K, n_iter_max=-1):
     n_samples,_ = relational_data.shape
     if n_iter_max < 0:
         n_iter_max = np.max((1000,2*n_samples))
-    rho = 10.0
     relational_data = relational_data / np.linalg.norm(relational_data)
 
     X, Y, Z = np.identity(n_samples), np.identity(n_samples), np.identity(n_samples)
+    if mat_init is not None:
+        X, Y, Z = mat_init, mat_init, mat_init
     X = X + 0.2*np.random.random((n_samples, n_samples))
     U, V, W = np.zeros((n_samples,n_samples)), np.zeros((n_samples,n_samples)), np.zeros((n_samples,n_samples))
     Xbar = (X + Y + Z)/3
@@ -90,8 +91,8 @@ def pecok_admm(relational_data, K, n_iter_max=-1):
 
 
 def is_primal_high(res_primal, X, Y, Z):
-    return res_primal > 1e-4 * np.max((np.linalg.norm(X), np.linalg.norm(Y), np.linalg.norm(Z)))
+    return res_primal > 1e-3 * np.max((np.linalg.norm(X), np.linalg.norm(Y), np.linalg.norm(Z)))
 
 
 def is_dual_high(res_dual, Y, Z):
-    return res_dual > 1e-4 * (np.sqrt(Y.shape[0]) + np.linalg.norm(Y) + np.linalg.norm(Z))
+    return res_dual > 1e-3 * (np.sqrt(Y.shape[0]) + np.linalg.norm(Y) + np.linalg.norm(Z))
diff --git a/pecok/pecok_clustering.py b/pecok/pecok_clustering.py
diff --git a/tests/test_clustering.py b/tests/test_clustering.py
@@ -0,0 +1,74 @@
+"""Test for variable and point clustering"""
+
+# author: Martin Royer <[email protected]>
+# License: MIT
+
+import numpy as np
+from sklearn import cluster
+import scipy.sparse.linalg as ssl
+
+from pecok import gamma, admm
+
+
+def hierarchial_clustering(obs, n_struct):
+    hclustering = cluster.AgglomerativeClustering(linkage='ward', n_clusters=n_struct)
+    return hclustering.fit(obs, n_struct)
+
+
+def kmeans_clustering(obs, n_struct):
+    k_means = cluster.KMeans(n_clusters=n_struct, init='k-means++', n_init=100)
+    return k_means.fit(obs)
+
+
+def spectral_clustering(obs, n_struct):
+    approx, _, _ = ssl.svds(obs, k=n_struct)
+    return hierarchial_clustering(approx, n_struct)
+
+
+def pecok_clustering(obs, n_struct, rho=5):
+    gram_corrected = (obs.T.dot(obs) - np.diag(gamma.gamma_hat4(obs.T))) / obs.shape[0]
+    U, _, V = ssl.svds(gram_corrected, k=n_struct)
+    Bhat = admm.pecok_admm(gram_corrected, K=n_struct, rho=rho, mat_init=U.dot(V))
+    return hierarchial_clustering(Bhat, n_struct=n_struct)
+
+
+seed = 432
+np.random.seed(seed)
+print("seed is %i" % seed)
+
+n_var = 10
+n_obs = 100
+
+print("\nVAR CLUSTERING\n\n")
+
+truth = np.asmatrix(np.concatenate((np.repeat(0, n_var//2), np.repeat(1, n_var//2))))
+membership = truth.T.dot(np.matrix([1, 0])) + (1-truth).T.dot(np.matrix([0, 1]))
+stds = np.ones(n_var)
+stds[:(n_var//2)] = 0.1
+sigma = membership.dot(0.1*np.identity(2)).dot(membership.T) + np.diag(stds)
+mat_data = np.random.multivariate_normal(mean=np.zeros(n_var), cov=sigma, size=n_obs)
+gram_data = mat_data.T.dot(mat_data) / mat_data.shape[0]
+
+print("truth:".ljust(15), truth)
+print("hierarchical:".ljust(15), hierarchial_clustering(mat_data.T, n_struct=2).labels_)
+print("kmeans:".ljust(15), kmeans_clustering(mat_data.T, n_struct=2).labels_)
+print("spectral:".ljust(15), spectral_clustering(gram_data, n_struct=2).labels_)
+print("pecok:".ljust(15), pecok_clustering(mat_data, n_struct=2).labels_)
+
+print("\nPOINT CLUSTERING\n\n")
+
+n_var = 100
+n_obs = 10
+
+truth = np.asmatrix(np.concatenate((np.repeat(0, n_obs//2), np.repeat(1, n_obs//2))))
+X = np.zeros((n_obs, n_var))
+snr = 0.3
+X[:n_obs//2, :] = np.ones(n_var)*snr + np.random.normal(scale=1, size=(n_obs//2, n_var))
+X[n_obs//2:, :] = -np.ones(n_var)*snr + np.random.normal(scale=0.1, size=(n_obs//2, n_var))
+gram = X.dot(X.T) / X.shape[1]
+
+print("truth:".ljust(15), truth)
+print("hierarchical:".ljust(15), hierarchial_clustering(X, n_struct=2).labels_)
+print("kmeans:".ljust(15), kmeans_clustering(X, n_struct=2).labels_)
+print("spectral:".ljust(15), spectral_clustering(gram, n_struct=2).labels_)
+print("pecok:".ljust(15), pecok_clustering(X.T, n_struct=2).labels_)
diff --git a/tests/test_pointclustering.py b/tests/test_pointclustering.py