-
Notifications
You must be signed in to change notification settings - Fork 6
/
kkmedoid.m
129 lines (94 loc) · 4.04 KB
/
kkmedoid.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
%
% This function applies a (kernel) k-medoid algorithm (PAM)
% in the Feature Space defined by the kernel matrix K.
% Author: Celine Scheidt
% Date: April 2009
% Updated: February 2013
function Clustering = kkmedoid(Xd,nbclusters,nbrep,K)
%% Input Parameters
% - Xd: location of the points to be plotted (from MDS). Rows of Xd are
%       the coordinates in p-dimensional space.
%       Dimension: Number of models (n) x p
% - nbclusters: number of clusters to construct
% - nbrep: (optional, default 1). Number of clustering replicates performed.
%          The configuration with the lowest total distance to the medoids
%          is returned.
% - K: (optional). Kernel matrix. If not provided, plain k-medoid with
%      Euclidean distance is performed instead of kernel k-medoid.
%% Output Parameters
% - Clustering: Results of the clustering, which contains:
%       - label: vector of length n containing the cluster index each model belongs to
%       - medoids: vector of length nbclusters containing the indices of the medoids
%       - weights: vector of length nbclusters containing the number of models in each cluster
%
% Reference: http://en.wikipedia.org/wiki/K-medoids
% Kaufman, L. and Rousseeuw, P.J. (1987), Clustering by means of Medoids,
% in Statistical Data Analysis Based on the L1-Norm and Related Methods,
% edited by Y. Dodge, North-Holland, 405-416.

maxIterations = 50;
npoints = size(Xd,1);
minDistBest = Inf;

if nargin < 3
    nbrep = 1;
end

% Pairwise distances between models
if nargin < 4
    D = squareform(pdist(Xd)); % kernel matrix not provided: Euclidean distance
else
    D = dist_feature(K,1:npoints,1:npoints); % distance in Feature Space
end

for iter = 1:nbrep % nbrep clusterings are performed, the best one is returned

    % 1. Initialize: randomly select nbclusters of the npoints data points as medoids
    initMedoids = randsample(npoints,nbclusters);

    % 2. Associate each data point to the closest medoid
    [minDistCurrent, label] = min(D(initMedoids,:));
    currentMedoids = initMedoids;

    label_prev = NaN(1,npoints);
    nbIter = 0;
    while any(label ~= label_prev) && nbIter < maxIterations % while cluster configuration is changing and maxIterations not reached
        label_prev = label;

        % 3. For each medoid m, try swapping it with each non-medoid point o
        for m = 1:nbclusters
            NoMedoid = setdiff(1:npoints,currentMedoids);
            NewMedoids = currentMedoids;
            for o = 1:length(NoMedoid)
                % Swap m and o and compute the cost of the new configuration
                NewMedoids(m) = NoMedoid(o);
                minDist = min(D(NewMedoids,:)); % do NOT capture label here: the candidate may be rejected
                cost = sum(minDist) - sum(minDistCurrent);
                if cost < 0 % 4. Keep the configuration with the lowest cost
                    currentMedoids(m) = NoMedoid(o);
                    minDistCurrent = min(D(currentMedoids,:));
                end
            end
        end

        % Recompute the assignment from the ACCEPTED medoids only.
        % (Bug fix: previously `label` could be left over from a rejected
        % swap, so the convergence test above compared against labels of a
        % configuration that was never adopted.)
        [minDistCurrent, label] = min(D(currentMedoids,:));
        nbIter = nbIter+1;
    end

    currentMedoids = sort(currentMedoids);
    [minDist, label] = min(D(currentMedoids,:));

    % Keep the best clustering configuration among the nbrep tested
    if sum(minDist) < sum(minDistBest)
        minDistBest = minDist;
        labelBest = label;
        currentMedoidsBest = currentMedoids;
    end
end

%% Once the medoids are defined, store the outputs
weights = zeros(nbclusters,1);
for i = 1:nbclusters
    weights(i) = sum(labelBest == i); % number of models assigned to cluster i
end

Clustering.label = labelBest;
Clustering.medoids = currentMedoidsBest;
Clustering.weights = weights;
end
%% Computes pairwise squared distances in Feature Space from the kernel matrix K:
% d_f(x,y) = K(x,x) - 2*K(x,y) + K(y,y)
function d_f = dist_feature(K,x,y) % x and y can be index vectors
kxx = diag(K(x,x)); % self-similarities of the x points (column vector)
kyy = diag(K(y,y)); % self-similarities of the y points (column vector)
% Expand both self-similarity vectors to length(x)-by-length(y) matrices
% and combine them with the cross-similarity term.
d_f = repmat(kxx,1,length(y)) - 2*K(x,y) + repmat(kyy',length(x),1);
end