#winecluster.R
#Custom k-means algorithm for the data at http://archive.ics.uci.edu/ml/datasets/Wine
#Alternative functionality is left in as commented-out code.
#We used this code both for building a predictive model and for whole-data clustering.
results = matrix(nrow=100,ncol=3) #a matrix of our different clustering results
for (i in 1:100) { #we comment out this for loop when running with a single seed we pick
#wine_data <- read.csv(file="wine_data_train3.csv", header=TRUE, sep=",") #for training
wine_data <- read.csv(file="winedata.csv", header=TRUE, sep=",") #for the whole set
#randomize the row order
wine_data = wine_data[sample(nrow(wine_data)),]
#a few different normalization/standardization options:
#option 1: scale just the ash, alcalinity of ash, proanthocyanin, and hue attributes
#wine_data_norm = wine_data
#wine_data_norm$ash = scale(wine_data$ash)
#wine_data_norm$alcal = scale(wine_data$alcal)
#wine_data_norm$proant = scale(wine_data$proant)
#wine_data_norm$hue = scale(wine_data$hue)
#wine_data_norm = apply(wine_data_norm,2,function (x) (x-min(x)) / (max(x) - min(x) ) ) #then min/max normalize as well
#option 2: merely normalize by min/max (what we used most of the time)
wine_data_norm = apply(wine_data,2,function (x) (x-min(x)) / (max(x) - min(x) ) )
#option 3: scale the whole thing (make each attribute mean 0 with a standard deviation of 1)
#wine_data_norm = scale(wine_data)
#wine_data_norm = apply(wine_data_norm,2,function (x) (x-min(x)) / (max(x) - min(x) ) ) #optionally min/max normalize afterwards
#option 4, everyone's favorite: do nothing
#wine_data_norm = wine_data
#after normalizing/standardizing, convert back into a data frame and restore the class
#field (which the steps above erroneously normalized/scaled along with everything else) to 1, 2, or 3
wine_data_norm = as.data.frame(wine_data_norm)
wine_data_norm$class = wine_data$class
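#A sketch of a more direct fix: leave the class column out of the normalization entirely
#(a hypothetical alternative to the two lines above; we did not end up doing it this way):
#wine_data_norm = as.data.frame(apply(wine_data[,2:14], 2, function (x) (x-min(x)) / (max(x) - min(x))))
#wine_data_norm = cbind(class = wine_data$class, wine_data_norm)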
#get three random centroids.
#Copying rows was the easy way to keep the column names matching; there is probably a better way to deal with this.
#We didn't want a random centroid that was necessarily a point in the data set,
#so after copying a random row we overwrite all of its relevant attributes with random values.
set.seed(15) #the argument can be i when brute-forcing seeds with the for loop;
#a fixed number (here 15) reproduces our best result from that brute-force search.
k1 = wine_data_norm[sample(nrow(wine_data_norm),1),]
k2 = wine_data_norm[sample(nrow(wine_data_norm),1),]
k3 = wine_data_norm[sample(nrow(wine_data_norm),1),]
#note that these random values range from 0 to 1, which is slightly incorrect for
#non-normalized data. But since we ended up sticking to normalized data most of the time
#we decided it wasn't worth it to fix this.
k1[2:14] = sapply(k1[2:14], function(x) runif(1,0.0,1.0))
k2[2:14] = sapply(k2[2:14], function(x) runif(1,0.0,1.0))
k3[2:14] = sapply(k3[2:14], function(x) runif(1,0.0,1.0))
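#A tidier initialization we could have used instead is sketched below (random_centroid is
#a hypothetical helper, not used elsewhere in this script):
#random_centroid = function(df) {
#  cent = df[1,]                    #copy a row purely for its shape and column names
#  cent[2:14] = runif(13, 0.0, 1.0) #overwrite every attribute with a random value in [0,1]
#  cent
#}
#k1 = random_centroid(wine_data_norm)
#k2 = random_centroid(wine_data_norm)
#k3 = random_centroid(wine_data_norm)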
#two options for distance, euclidean and manhattan:
#vectors of euclidean distances; note we leave out the class column
k1dist = apply(wine_data_norm[,2:14],1, function(x) sqrt(sum((x - k1[2:14])^2)))
k2dist = apply(wine_data_norm[,2:14],1, function(x) sqrt(sum((x - k2[2:14])^2)))
k3dist = apply(wine_data_norm[,2:14],1, function(x) sqrt(sum((x - k3[2:14])^2)))
#vectors of manhattan distance
#k1dist = apply(wine_data_norm[,2:14],1, function(x) sum(abs(x-k1[2:14])))
#k2dist = apply(wine_data_norm[,2:14],1, function(x) sum(abs(x-k2[2:14])))
#k3dist = apply(wine_data_norm[,2:14],1, function(x) sum(abs(x-k3[2:14])))
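#Both metrics could also share one helper instead of three copied apply calls; a sketch
#(dist_to is a hypothetical name, not used elsewhere in this script):
#dist_to = function(centroid, method="euclidean") {
#  apply(wine_data_norm[,2:14], 1, function(x) {
#    d = x - unlist(centroid[2:14])
#    if (method == "euclidean") sqrt(sum(d^2)) else sum(abs(d))
#  })
#}
#k1dist = dist_to(k1); k2dist = dist_to(k2); k3dist = dist_to(k3)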
dists = cbind(k1dist,k2dist,k3dist)
#we compare the distances, adding a new column "cluster" to wine_data_norm;
#cluster holds the index of the closest centroid (the smallest distance)
wine_data_norm$cluster = apply(dists,1, which.min) #which.min breaks ties by taking the first minimum
#we want "old" centroids, for comparing in the main kmeans loop. When the new centroids match the old centroids
#kmeans stops.
k1old = k1
k2old = k2
k3old = k3
#compute new centroids for each cluster
k1[2:14] = colMeans(subset(wine_data_norm[2:14],wine_data_norm$cluster == 1))
k2[2:14] = colMeans(subset(wine_data_norm[2:14],wine_data_norm$cluster == 2))
k3[2:14] = colMeans(subset(wine_data_norm[2:14],wine_data_norm$cluster == 3))
#if a cluster is empty, we want to make its centroid the point with the greatest SSE
class1samples = subset(wine_data_norm, wine_data_norm$cluster == 1)
class2samples = subset(wine_data_norm, wine_data_norm$cluster == 2)
class3samples = subset(wine_data_norm, wine_data_norm$cluster == 3)
#we have to find every sample's SSE
class1samples$sse = apply(class1samples[,2:14],1, function(x) sum( (x-k1[2:14])^2 ))
class2samples$sse = apply(class2samples[,2:14],1, function(x) sum( (x-k2[2:14])^2 ))
class3samples$sse = apply(class3samples[,2:14],1, function(x) sum( (x-k3[2:14])^2 ))
sse = rbind(class1samples,class2samples,class3samples)
sse = sse[order(sse$sse),] #sort by sse
#if a centroid is not a number (its cluster was empty), make that centroid the point with the highest SSE;
#we then remove that point from our sse list so we don't reuse it if more than one
#centroid is empty.
if(any(is.na(k1))) {
#print(c("EMPTY",k1[1,]))
k1 = sse[nrow(sse),] #the point with the greatest SSE becomes the new centroid
k1 = k1[,-16] #drop the sse column (16); the leftover cluster column is harmless since only columns 2:14 are used
sse = sse[-nrow(sse),]
#print(c("FIXED",k1[1,]))
}
if(any(is.na(k2))) {
#print(c("EMPTY",k2[1,]))
k2 = sse[nrow(sse),]
k2 = k2[,-16]
sse = sse[-nrow(sse),]
#print(c("FIXED",k2[1,]))
}
if(any(is.na(k3))) {
#print(c("EMPTY",k3[1,]))
k3 = sse[nrow(sse),]
k3 = k3[,-16]
sse = sse[-nrow(sse),]
#print(c("FIXED",k3[1,]))
}
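#The same empty-cluster repair could be written once instead of three times; a sketch
#(fix_empty is a hypothetical helper, not used elsewhere in this script):
#fix_empty = function(k, sse) {
#  if (any(is.na(k))) {
#    k = sse[nrow(sse),]    #point with the greatest SSE becomes the new centroid
#    k = k[,-16]            #drop the sse column
#    sse = sse[-nrow(sse),] #don't let another empty centroid reuse this point
#  }
#  list(k=k, sse=sse)       #R copies function arguments, so both must be returned
#}
#tmp = fix_empty(k1, sse); k1 = tmp$k; sse = tmp$sse
#tmp = fix_empty(k2, sse); k2 = tmp$k; sse = tmp$sse
#tmp = fix_empty(k3, sse); k3 = tmp$k; sse = tmp$sse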
finaltotalerror = 99999 #initialized out here so it is still in scope after the while loop
#now we loop until the centroids match the previous iteration's (meaning they haven't moved)
#while (!all(k1==k1old) || !all(k2==k2old) || !all(k3=k3old)){ kept giving errors
#(note the typo there: k3=k3old uses a single = instead of ==, which is likely why it errored)
#comparing sums of squares of the attribute columns is probably still good enough in most instances:
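#A cleaner convergence test would compare the centroid coordinates directly with a numeric
#tolerance; a sketch using base R's all.equal (isTRUE guards against the string it returns
#on mismatch):
#converged = isTRUE(all.equal(unlist(k1[2:14]), unlist(k1old[2:14]), check.attributes=FALSE)) &&
#  isTRUE(all.equal(unlist(k2[2:14]), unlist(k2old[2:14]), check.attributes=FALSE)) &&
#  isTRUE(all.equal(unlist(k3[2:14]), unlist(k3old[2:14]), check.attributes=FALSE))
#while (!converged) { ... } #converged would also need recomputing at the end of each pass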
while ( !(sum(k1[2:14]^2)==sum(k1old[2:14]^2) && sum(k2[2:14]^2)==sum(k2old[2:14]^2) && sum(k3[2:14]^2)==sum(k3old[2:14]^2)) ){
#recompute the distances (euclidean), just as before the loop
k1dist = apply(wine_data_norm[,2:14],1, function(x) sqrt(sum((x - k1[2:14])^2)))
k2dist = apply(wine_data_norm[,2:14],1, function(x) sqrt(sum((x - k2[2:14])^2)))
k3dist = apply(wine_data_norm[,2:14],1, function(x) sqrt(sum((x - k3[2:14])^2)))
#manhattan
#k1dist = apply(wine_data_norm[,2:14],1, function(x) sum(abs(x-k1[2:14])))
#k2dist = apply(wine_data_norm[,2:14],1, function(x) sum(abs(x-k2[2:14])))
#k3dist = apply(wine_data_norm[,2:14],1, function(x) sum(abs(x-k3[2:14])))
dists = cbind(k1dist,k2dist,k3dist)
#assign each point to its closest cluster
wine_data_norm$cluster = apply(dists,1, which.min)
#save the old centroids for the convergence test
k1old = k1
k2old = k2
k3old = k3
#compute new centroids for each cluster
k1[2:14] = colMeans(subset(wine_data_norm[2:14],wine_data_norm$cluster == 1))
k2[2:14] = colMeans(subset(wine_data_norm[2:14],wine_data_norm$cluster == 2))
k3[2:14] = colMeans(subset(wine_data_norm[2:14],wine_data_norm$cluster == 3))
#if a cluster is empty, we want to make its centroid the point with the greatest SSE
class1samples = subset(wine_data_norm, wine_data_norm$cluster == 1)
class2samples = subset(wine_data_norm, wine_data_norm$cluster == 2)
class3samples = subset(wine_data_norm, wine_data_norm$cluster == 3)
#we have to find every sample's SSE
class1samples$sse = apply(class1samples[,2:14],1, function(x) sum( (x-k1[2:14])^2 ))
class2samples$sse = apply(class2samples[,2:14],1, function(x) sum( (x-k2[2:14])^2 ))
class3samples$sse = apply(class3samples[,2:14],1, function(x) sum( (x-k3[2:14])^2 ))
sse = rbind(class1samples,class2samples,class3samples)
sse = sse[order(sse$sse),] #sort by sse
#if a centroid is not a number (its cluster was empty), make that centroid the point with the highest SSE;
#we then remove that point from our sse list so we don't reuse it if more than one
#centroid is empty.
if(any(is.na(k1))) {
#print(c("EMPTY",k1[1,]))
k1 = sse[nrow(sse),]
k1 = k1[,-16]
sse = sse[-nrow(sse),]
#print(c("FIXED",k1[1,]))
}
if(any(is.na(k2))) {
#print(c("EMPTY",k2[1,]))
k2 = sse[nrow(sse),]
k2 = k2[,-16]
sse = sse[-nrow(sse),]
#print(c("FIXED",k2[1,]))
}
if(any(is.na(k3))) {
#print(c("EMPTY",k3[1,]))
k3 = sse[nrow(sse),]
k3 = k3[,-16]
sse = sse[-nrow(sse),]
#print(c("FIXED",k3[1,]))
}
#We want to measure the quality of our clustering. One way is SSE:
#print(c("Current SSE = ",sum(sse$sse)))
#Another measure, which we call total errors, is the number of misclassified examples.
#Note that this measurement isn't perfect and will give strange results
#when clusters aren't relatively homogeneous, but it is still quite useful for comparing
#different clusterings.
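#For reference, the SSE we report is the usual k-means objective,
#SSE = sum over clusters j of sum over points x in cluster j of ||x - k_j||^2,
#computed over the 13 attribute columns.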
k1subset = subset(wine_data_norm, wine_data_norm$cluster == 1)
k2subset = subset(wine_data_norm, wine_data_norm$cluster == 2)
k3subset = subset(wine_data_norm, wine_data_norm$cluster == 3)
#we compute total errors by taking the mean of each cluster's class column (usually a number
#close to the cluster's majority class), rounding it to get the cluster's nominal class,
#and then counting how many samples in the cluster disagree with that class.
#There is probably a better way to do this, such as an explicit majority vote (sketched below).
totalerror = 0
totalerror = totalerror + sum(k1subset$class != round(mean(k1subset$class)))
totalerror = totalerror + sum(k2subset$class != round(mean(k2subset$class)))
totalerror = totalerror + sum(k3subset$class != round(mean(k3subset$class)))
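#The majority-vote alternative mentioned above, sketched with a hypothetical helper that
#uses table() to find each cluster's most common class instead of rounding the mean:
#majority = function(classes) as.numeric(names(which.max(table(classes))))
#totalerror = sum(k1subset$class != majority(k1subset$class)) +
#  sum(k2subset$class != majority(k2subset$class)) +
#  sum(k3subset$class != majority(k3subset$class))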
#print(c("Total Errors = ", totalerror))
finaltotalerror = totalerror
}
#print(k1)
#print(k2)
#print(k3)
#print(c("Mean Class Cluster 1 = ", mean(subset(wine_data_norm, wine_data_norm$cluster == 1)$class)))
#print(c("Mean Class Cluster 2 = ", mean(subset(wine_data_norm, wine_data_norm$cluster == 2)$class)))
#print(c("Mean Class Cluster 3 = ", mean(subset(wine_data_norm, wine_data_norm$cluster == 3)$class)))
print(c("Final SSE = ",sum(sse$sse)))
print(c("Final Total Errors = ", finaltotalerror))
#store the results, so we can reference them later to find a good seed
results[i,1] = i
results[i,2] = sum(sse$sse)
results[i,3] = finaltotalerror
}
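#After a brute-force run with set.seed(i), the best seed can be read straight off the
#results matrix, e.g.:
#best = results[which.min(results[,3]),] #the run with the fewest total errors
#print(c("Best seed = ", best[1], "SSE = ", best[2], "Errors = ", best[3]))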