Update kmeans.py #52

Open · wants to merge 1 commit into master
256 changes: 115 additions & 141 deletions KMeans/kmeans.py
@@ -1,170 +1,144 @@
#coding=utf-8

'''
@author: wepon, http://2hwp.com

Reference:
    Book: <<Machine Learning in Action>>
    Software: sklearn.cluster.KMeans
'''
import numpy as np

class KMeans:
    """
    Parameters:
        n_clusters: number of clusters, i.e. k.
        initCent: centroid initialization, either "random" (the default,
            random initialization within the data bounds) or an explicit
            array of initial centroids.
        max_iter: maximum number of iterations.
    """
    def __init__(self, n_clusters=5, initCent='random', max_iter=300):
        # If an explicit array of centroids is given, use it and infer k from its shape.
        if hasattr(initCent, '__array__'):
            n_clusters = initCent.shape[0]
            self.centroids = np.asarray(initCent, dtype=float)
        else:
            self.centroids = None
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.initCent = initCent
        self.cluster_assessment = None
        self.labels = None
        self.sse = None

    # Euclidean distance between two points.
    def _dist_euclidean(self, vecA, vecB):
        return np.linalg.norm(vecA - vecB)

    # Pick k random centroids, each drawn within the bounds of the data.
    def _rand_centroids(self, X):
        n = X.shape[1]  # number of features
        centroids = np.empty((self.n_clusters, n))  # k*n matrix of centroids
        for j in range(n):  # initialize one feature dimension at a time
            min_j = min(X[:, j])
            range_j = float(max(X[:, j]) - min_j)
            centroids[:, j] = min_j + range_j * np.random.rand(self.n_clusters)
        return centroids

    def fit(self, X):
        # Type check: accept anything convertible to a numpy array.
        try:
            if not isinstance(X, np.ndarray):
                X = np.asarray(X)
        except Exception:
            raise TypeError("numpy.ndarray required for X")

        m = X.shape[0]  # number of samples
        # m*2 matrix: column 0 stores the index of the cluster each sample belongs to,
        # column 1 stores the squared error between the sample and its cluster centroid.
        self.cluster_assessment = np.empty((m, 2))
        if isinstance(self.initCent, str) and self.initCent == 'random':
            self.centroids = self._rand_centroids(X)

        for _ in range(self.max_iter):
            cluster_changed = False
            for i in range(m):  # assign each sample to its nearest centroid
                min_dist = np.inf
                min_index = -1
                for j in range(self.n_clusters):
                    dist_ji = self._dist_euclidean(self.centroids[j, :], X[i, :])
                    if dist_ji < min_dist:
                        min_dist = dist_ji
                        min_index = j
                if self.cluster_assessment[i, 0] != min_index:
                    cluster_changed = True
                self.cluster_assessment[i, :] = min_index, min_dist ** 2

            if not cluster_changed:  # converged: no sample changed cluster
                break
            for i in range(self.n_clusters):  # move each centroid to the mean of its cluster
                pts_in_cluster = X[np.where(self.cluster_assessment[:, 0] == i)[0]]
                self.centroids[i, :] = np.mean(pts_in_cluster, axis=0)

        self.labels = self.cluster_assessment[:, 0]
        self.sse = sum(self.cluster_assessment[:, 1])

    def predict(self, X):  # assign new samples to the nearest learned centroid
        # Type check: accept anything convertible to a numpy array.
        try:
            if not isinstance(X, np.ndarray):
                X = np.asarray(X)
        except Exception:
            raise TypeError("numpy.ndarray required for X")

        m = X.shape[0]  # number of samples
        preds = np.empty((m,))
        for i in range(m):  # find the nearest centroid for each sample
            min_dist = np.inf
            for j in range(self.n_clusters):
                dist_ji = self._dist_euclidean(self.centroids[j, :], X[i, :])
                if dist_ji < min_dist:
                    min_dist = dist_ji
                    preds[i] = j
        return preds
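
# Example usage (an illustrative sketch: the data and variable names below are
# made up for demonstration and are not part of this module):
#
#     X = np.random.rand(100, 2)        # 100 random 2-D points
#     clf = KMeans(n_clusters=3)
#     clf.fit(X)
#     clf.centroids                     # (3, 2) array of learned centroids
#     clf.labels[:10]                   # cluster index of the first ten samples
#     clf.sse                           # sum of squared errors over all samples
#     clf.predict([[0.1, 0.2]])         # nearest centroid for a new point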


class BiKMeans:
    """Bisecting k-means: repeatedly split the cluster whose split gives the lowest total SSE."""
    def __init__(self, n_clusters=5):
        self.n_clusters = n_clusters
        self.centroids = None
        self.cluster_assessment = None
        self.labels = None
        self.sse = None

    # Euclidean distance between two points.
    def _dist_euclidean(self, vecA, vecB):
        return np.linalg.norm(vecA - vecB)

    def fit(self, X):
        # Type check: accept anything convertible to a numpy array.
        try:
            if not isinstance(X, np.ndarray):
                X = np.asarray(X)
        except Exception:
            raise TypeError("numpy.ndarray required for X")

        m = X.shape[0]  # number of samples
        self.cluster_assessment = np.zeros((m, 2))
        # Start with a single cluster whose centroid is the mean of all samples.
        centroid0 = np.mean(X, axis=0).tolist()
        cent_list = [centroid0]

        # Initial squared error of every sample with respect to the single centroid.
        for j in range(m):
            self.cluster_assessment[j, 1] = self._dist_euclidean(np.asarray(centroid0), X[j, :]) ** 2

        while len(cent_list) < self.n_clusters:
            lowest_sse = np.inf
            # Try splitting each cluster and keep the split that gives the lowest total SSE.
            for i in range(len(cent_list)):
                pts_in_curr_cluster = X[np.where(self.cluster_assessment[:, 0] == i)[0], :]
                clf = KMeans(n_clusters=2)
                clf.fit(pts_in_curr_cluster)
                # Centroids and assignment matrix obtained by splitting this cluster.
                centroid_mat, split_cluster_assessment = clf.centroids, clf.cluster_assessment
                sse_split = sum(split_cluster_assessment[:, 1])
                sse_not_split = sum(self.cluster_assessment[np.where(self.cluster_assessment[:, 0] != i)[0], 1])
                if (sse_split + sse_not_split) < lowest_sse:
                    best_cent_to_split = i
                    best_new_cents = centroid_mat
                    best_cluster_assessment = split_cluster_assessment.copy()
                    lowest_sse = sse_split + sse_not_split
            # After the split, one child keeps the index of the parent cluster and the
            # other gets index len(cent_list); store the two new centroids accordingly.
            best_cluster_assessment[np.where(best_cluster_assessment[:, 0] == 1)[0], 0] = len(cent_list)
            best_cluster_assessment[np.where(best_cluster_assessment[:, 0] == 0)[0], 0] = best_cent_to_split
            cent_list[best_cent_to_split] = best_new_cents[0, :].tolist()
            cent_list.append(best_new_cents[1, :].tolist())
            self.cluster_assessment[np.where(self.cluster_assessment[:, 0] == best_cent_to_split)[0], :] = best_cluster_assessment

        self.labels = self.cluster_assessment[:, 0]
        self.sse = sum(self.cluster_assessment[:, 1])
        self.centroids = np.asarray(cent_list)

    def predict(self, X):  # assign new samples to the nearest learned centroid
        # Type check: accept anything convertible to a numpy array.
        try:
            if not isinstance(X, np.ndarray):
                X = np.asarray(X)
        except Exception:
            raise TypeError("numpy.ndarray required for X")

        m = X.shape[0]  # number of samples
        preds = np.empty((m,))
        for i in range(m):  # find the nearest centroid for each sample
            min_dist = np.inf
            for j in range(self.n_clusters):
                dist_ji = self._dist_euclidean(self.centroids[j, :], X[i, :])
                if dist_ji < min_dist:
                    min_dist = dist_ji
                    preds[i] = j
        return preds
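

# Minimal runnable demo (an illustrative sketch: the random data below is an
# assumption for demonstration, not data from the original project).
if __name__ == '__main__':
    np.random.seed(0)
    data = np.random.rand(200, 2)  # 200 random points in the unit square

    bkm = BiKMeans(n_clusters=3)
    bkm.fit(data)
    print("BiKMeans centroids:\n", bkm.centroids)
    print("BiKMeans SSE:", bkm.sse)
    print("First ten labels:", bkm.labels[:10])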