From 26cab825f63f1e795056c785268752df80567de3 Mon Sep 17 00:00:00 2001 From: explorer12345 <39146150+explorer12345@users.noreply.github.com> Date: Wed, 18 Dec 2019 08:36:54 +0530 Subject: [PATCH] Update Clustering.py --- Clustering.py | 98 ++++++++++++++++++++------------------------------- 1 file changed, 39 insertions(+), 59 deletions(-) diff --git a/Clustering.py b/Clustering.py index 627bc90..37bbd9a 100644 --- a/Clustering.py +++ b/Clustering.py @@ -1,14 +1,10 @@ - -# coding: utf-8 - - - # importing dependencies import numpy as np import pandas as pd import matplotlib.pyplot as plt import matplotlib.cm as cm import sys +import statistics # creating data mean_01 = np.array([0.0, 0.0]) @@ -75,47 +71,49 @@ def initialize(data, no_of_clusters): return centroids def classify_a_point(point, groups, k): - index=-1 + # k should be given as an odd number + index_data=[-1]*len(data) dist=[] for i in range(len(groups)): for j in range(len(groups[i])): - dist.append((distance(point,groups[i][j]),i)) - if len(dist)>k: - dist=sorted(dist)[:k] - else: - dist=sorted(dist) + if point.tolist()==groups[i][j]: + return i + index_data[data.tolist().index(groups[i][j])]=i + for i in range(len(data)): + dist.append((distance(point,data[i,:]),index_data[i])) + dist=sorted(dist) + dist.remove(dist[0]) #removing the distance of point with itself + dist=dist[:k] + #calculating frequencies of different groups - freq=[0]*len(groups) + freq=[0]*(len(groups)+1) #use if loops for no_of_clusters times for e in dist: - if e[1]==0: - freq[0]+=1 - if e[1]==1: - freq[1]+=1 - if e[1]==2: - freq[2]+=1 - if e[1]==3: - freq[3]+=1 - index = freq.index(max(freq)) - return index - -def cluster(data, no_of_clusters, k, max_iterations): + if e[1]==-1: + freq[-1]+=1 + continue + for i in range(len(groups)): + if e[1]==i: + freq[i]+=1 + continue + while(True): + if freq[-1]==k: + return -1 + elif freq[-1]<(k+1)/2: + return freq.index(max(freq[:-1])) + elif freq[-1]>=(k+1)/2: + for i in range(len(groups)): + if freq[-1]+freq[i]==k: + return i + return -1 + +def cluster(data, no_of_clusters, k): groups=initialize(data, no_of_clusters) - #print('groups are',groups) groups=[[element] for element in groups] - #print('groups converted as follows', groups) - #plot_clusters(groups) - #plt.show() - for n in range(max_iterations): - for i in range(data.shape[0]): - group_no = classify_a_point(data[i,:], groups,k) - #print('point chosen: ', data[i,:], 'classified in group no. :', group_no) - print('data point is: ', data[i,:]) - print('initialized point is: ',groups[group_no][0] ) - if groups[group_no][0].all()!=data[i,:].all(): #to prevent the initialized points from gettind re-added - groups[group_no].append(data[i,:]) - #plot_clusters(groups) - #plt.show() + for i in range(data.shape[0]): + group_no = classify_a_point(data[i,:], groups,k) + if groups[group_no][0]!=data[i,:].tolist(): #to prevent the initialized points from gettind re-added + groups[group_no].append(data[i,:].tolist()) return groups def plot_clusters(groups): @@ -124,28 +122,10 @@ def plot_clusters(groups): plt.scatter(*zip(*groups[1]),[6], 'b') plt.scatter(*zip(*groups[2]),[6], 'g') plt.scatter(*zip(*groups[3]),[6], 'c') - plt.show() - - - -l=initialize(data,4) -colors = cm.rainbow(np.linspace(0, 1, len(l))) -#cluster(data,4,5,1) -plt.scatter(*zip(*data),[6],'r') -for i in range(len(l)): - plt.scatter(l[i][0],l[i][1],[6],color=colors) -plt.show() - - - - - - - - - - + plt.show() +# iteration not taken into consideration +plot_clusters(cluster(data,4,7))