From 5107b106825f4d26d8bb31c6e65cf94aa859a67e Mon Sep 17 00:00:00 2001
From: Martin Royer <martin.royer@m4x.org>
Date: Thu, 22 Feb 2018 16:02:26 +0100
Subject: [PATCH] clean-up and new test file

---
 pecok/admm.py                 |  9 +++--
 pecok/pecok_clustering.py     | 26 ------------
 tests/test_clustering.py      | 74 +++++++++++++++++++++++++++++++++++
 tests/test_pointclustering.py | 24 ------------
 4 files changed, 79 insertions(+), 54 deletions(-)
 delete mode 100644 pecok/pecok_clustering.py
 create mode 100644 tests/test_clustering.py
 delete mode 100644 tests/test_pointclustering.py

diff --git a/pecok/admm.py b/pecok/admm.py
index 5a9eb90..16277e2 100644
--- a/pecok/admm.py
+++ b/pecok/admm.py
@@ -48,7 +48,7 @@ def proj_Snp_imp(Y):
     return Y
 
 
-def pecok_admm(relational_data, K, n_iter_max=-1):
+def pecok_admm(relational_data, K, n_iter_max=-1, rho=1, mat_init=None):
     """Implementation of Alternating Direction Method of Multipliers
 
     Parameters
@@ -58,10 +58,11 @@ def pecok_admm(relational_data, K, n_iter_max=-1):
     n_samples,_ = relational_data.shape
     if n_iter_max < 0:
         n_iter_max = np.max((1000,2*n_samples))
-    rho = 10.0
     relational_data = relational_data / np.linalg.norm(relational_data)
 
     X, Y, Z = np.identity(n_samples), np.identity(n_samples), np.identity(n_samples)
+    if mat_init is not None:
+        X, Y, Z = mat_init, mat_init, mat_init
     X = X + 0.2*np.random.random((n_samples, n_samples))
     U, V, W = np.zeros((n_samples,n_samples)), np.zeros((n_samples,n_samples)), np.zeros((n_samples,n_samples))
     Xbar = (X + Y + Z)/3
@@ -90,8 +91,8 @@ def pecok_admm(relational_data, K, n_iter_max=-1):
 
 
 def is_primal_high(res_primal, X, Y, Z):
-    return res_primal > 1e-4 * np.max((np.linalg.norm(X), np.linalg.norm(Y), np.linalg.norm(Z)))
+    return res_primal > 1e-3 * np.max((np.linalg.norm(X), np.linalg.norm(Y), np.linalg.norm(Z)))  # True while res_primal exceeds 1e-3 times the largest of the three norms
 
 
 def is_dual_high(res_dual, Y, Z):
-    return res_dual > 1e-4 * (np.sqrt(Y.shape[0]) + np.linalg.norm(Y) + np.linalg.norm(Z))
+    return res_dual > 1e-3 * (np.sqrt(Y.shape[0]) + np.linalg.norm(Y) + np.linalg.norm(Z))  # True while res_dual exceeds 1e-3 * (sqrt(row count of Y) + ||Y|| + ||Z||)
diff --git a/pecok/pecok_clustering.py b/pecok/pecok_clustering.py
deleted file mode 100644
index edf0aa7..0000000
--- a/pecok/pecok_clustering.py
+++ /dev/null
@@ -1,26 +0,0 @@
-"""Gamma estimation"""
-
-# author: Martin Royer <martin.royer@math.u-psud.fr>
-# License: MIT
-
-from .gamma import gamma_hat4
-from .admm import pecok_admm
-
-###############################################################################
-def cluster(X, K):
-    """Implementation of PECOK estimator of B*
-
-    Parameters
-    ----------
-    X : array-like or sparse matrix, shape=(n_samples, n_features)
-        Training instances to cluster."""
-    return pecok_admm(X.dot(X.T)-gamma_hat4(X), K)
-
-def cluster_sbm(A, K):
-    """Implementation of PECOK estimator of B*
-
-    Parameters
-    ----------
-    A : adjacency matrix for network, shape=(n_samples, n_samples)
-        Training instances to cluster."""
-    return pecok_admm(A.dot(A), K)
diff --git a/tests/test_clustering.py b/tests/test_clustering.py
new file mode 100644
index 0000000..e35b708
--- /dev/null
+++ b/tests/test_clustering.py
@@ -0,0 +1,74 @@
+"""Test for variable and point clustering"""
+
+# author: Martin Royer <martin.royer@math.u-psud.fr>
+# License: MIT
+
+import numpy as np
+from sklearn import cluster
+import scipy.sparse.linalg as ssl
+
+from pecok import gamma, admm
+
+
+def hierarchial_clustering(obs, n_struct):  # fit Ward-linkage agglomerative clustering on the rows of obs; returns the fitted estimator
+    hclustering = cluster.AgglomerativeClustering(linkage='ward', n_clusters=n_struct)
+    return hclustering.fit(obs)  # fit(X, y=None) ignores y, so the cluster count is not passed here
+
+
+def kmeans_clustering(obs, n_struct):  # fit k-means with n_struct clusters on the rows of obs; returns the fitted estimator
+    k_means = cluster.KMeans(n_clusters=n_struct, init='k-means++', n_init=100)  # n_init=100: keep the best of 100 k-means++ initialisations
+    return k_means.fit(obs)
+
+
+def spectral_clustering(obs, n_struct):  # Ward clustering applied to the n_struct leading left singular vectors of obs
+    approx, _, _ = ssl.svds(obs, k=n_struct)  # svds returns (U, s, Vt); only U, one row per row of obs, is kept
+    return hierarchial_clustering(approx, n_struct)
+
+
+def pecok_clustering(obs, n_struct, rho=5):  # clusters the columns of obs: the Gram matrix below is obs.T.dot(obs)
+    gram_corrected = (obs.T.dot(obs) - np.diag(gamma.gamma_hat4(obs.T))) / obs.shape[0]  # Gram matrix minus np.diag(gamma_hat4(...)); presumably a diagonal noise correction -- confirm in pecok.gamma
+    U, _, V = ssl.svds(gram_corrected, k=n_struct)  # V is the third output of svds, i.e. already transposed (Vt)
+    Bhat = admm.pecok_admm(gram_corrected, K=n_struct, rho=rho, mat_init=U.dot(V))  # start ADMM from U.dot(Vt), the rank-n_struct factors without the singular values
+    return hierarchial_clustering(Bhat, n_struct=n_struct)  # NOTE(review): assumes Bhat has one row per column of obs -- confirm against pecok_admm's return value
+
+
+seed = 432
+np.random.seed(seed)  # fixed seed: every random draw below is reproducible
+print("seed is %i" % seed)
+
+n_var = 10
+n_obs = 100
+
+print("\nVAR CLUSTERING\n\n")
+# Two groups of n_var//2 variables each; membership is the (n_var, 2) one-hot group assignment.
+truth = np.asmatrix(np.concatenate((np.repeat(0, n_var//2), np.repeat(1, n_var//2))))
+membership = truth.T.dot(np.matrix([1, 0])) + (1-truth).T.dot(np.matrix([0, 1]))
+stds = np.ones(n_var)
+stds[:(n_var//2)] = 0.1
+sigma = membership.dot(0.1*np.identity(2)).dot(membership.T) + np.diag(stds)  # 0.1 for every same-group pair, plus stds on the diagonal (added as variances, not squared)
+mat_data = np.random.multivariate_normal(mean=np.zeros(n_var), cov=sigma, size=n_obs)  # shape (n_obs, n_var)
+gram_data = mat_data.T.dot(mat_data) / mat_data.shape[0]  # (n_var, n_var) empirical second-moment matrix
+
+print("truth:".ljust(15), truth)
+print("hierarchical:".ljust(15), hierarchial_clustering(mat_data.T, n_struct=2).labels_)  # transpose: variables become the rows being clustered
+print("kmeans:".ljust(15), kmeans_clustering(mat_data.T, n_struct=2).labels_)
+print("spectral:".ljust(15), spectral_clustering(gram_data, n_struct=2).labels_)
+print("pecok:".ljust(15), pecok_clustering(mat_data, n_struct=2).labels_)  # not transposed: pecok_clustering groups the columns itself
+
+print("\nPOINT CLUSTERING\n\n")
+# n_obs points in n_var dimensions: first half has mean +snr and noise scale 1, second half mean -snr and noise scale 0.1.
+n_var = 100
+n_obs = 10
+
+truth = np.asmatrix(np.concatenate((np.repeat(0, n_obs//2), np.repeat(1, n_obs//2))))
+X = np.zeros((n_obs, n_var))
+snr = 0.3
+X[:n_obs//2, :] = np.ones(n_var)*snr + np.random.normal(scale=1, size=(n_obs//2, n_var))
+X[n_obs//2:, :] = -np.ones(n_var)*snr + np.random.normal(scale=0.1, size=(n_obs//2, n_var))
+gram = X.dot(X.T) / X.shape[1]  # (n_obs, n_obs) Gram matrix between points
+
+print("truth:".ljust(15), truth)
+print("hierarchical:".ljust(15), hierarchial_clustering(X, n_struct=2).labels_)
+print("kmeans:".ljust(15), kmeans_clustering(X, n_struct=2).labels_)
+print("spectral:".ljust(15), spectral_clustering(gram, n_struct=2).labels_)
+print("pecok:".ljust(15), pecok_clustering(X.T, n_struct=2).labels_)  # transpose so the points are the columns that pecok_clustering groups
diff --git a/tests/test_pointclustering.py b/tests/test_pointclustering.py
deleted file mode 100644
index ceafa39..0000000
--- a/tests/test_pointclustering.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import os
-import sys
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-
-import numpy as np
-from sklearn import cluster
-from pecok import pecok_clustering
-
-seed = 432
-np.random.seed(seed)
-print("seed is %i" % seed)
-
-n_samples = 10
-n_features = 100
-truth = np.concatenate((np.repeat(0, n_samples//2), np.repeat(1, n_samples//2)))
-X = np.zeros((n_samples, n_features))
-X[:n_samples//2, :] = np.ones(n_features)*0.1 + np.random.normal(scale=1, size=(n_samples//2, n_features))
-X[n_samples//2:, :] = -np.ones(n_features)*0.1 + np.random.normal(scale=0.1, size=(n_samples//2, n_features))
-
-Bhat = pecok_clustering.cluster(X, 2)
-kMeans = cluster.KMeans(n_clusters=2, init='k-means++', n_init=100, copy_x=True)
-print("truth:".ljust(10), truth)
-print("pecok:".ljust(10), kMeans.fit(Bhat).labels_)
-print("kmeans:".ljust(10), kMeans.fit(X).labels_)