import random
import subprocess

import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
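# If the external consensus script (run.py, invoked inside fxn below) is not
# available, a plain majority vote over the classifier "workers" is a simple
# stand-in. A minimal sketch of such a fallback; this helper is not used by
# the pipeline below and assumes integer predictions:
from collections import Counter

def majority_vote(answers):
    """answers: iterable of (question, worker, prediction) rows.
    Returns {question: most frequent prediction}."""
    votes = {}
    for question, _worker, prediction in answers:
        votes.setdefault(question, []).append(int(prediction))
    return {q: Counter(v).most_common(1)[0][0] for q, v in votes.items()}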
def fxn():
    """One full experiment: clean the data, generate noisy labels with a
    classifier ensemble plus consensus aggregation, then compare AdaBoost
    trained on the noisy labels against AdaBoost trained on true labels.
    Returns (noisy_error_rate, supervised_error_rate)."""
    # In[1218]:
    # read in the data
    df = pd.read_csv('data.csv')
    # drop the id column (no predictive value)
    df = df.drop(['id'], axis=1)
    df = df.sample(frac=1)  # shuffle the rows
    # drop rows containing '?' (missing values); keep at most 1000 clean rows
    index = []
    count = 0
    for val in range(len(df.iloc[:, 0])):
        flag = False
        for column in df:
            if df[column][val] == '?':
                flag = True
                break
        if flag:
            continue
        if count < 1000:
            index.append(val)
            count += 1
    df = df[df.index.isin(index)]
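    # An equivalent vectorized filter (a sketch, assuming '?' is the only
    # missing-value marker; keeps the first 1000 clean rows):
    #   df = df[~df.isin(['?']).any(axis=1)].head(1000)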
    # In[1219]:
    # integer-encode every non-numeric (object) column
    obj_df = df.select_dtypes(include=['object']).copy()
    for column in obj_df:
        le = preprocessing.LabelEncoder()
        le.fit(df[column])
        df[column] = le.transform(df[column])
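    # LabelEncoder assigns integers by sorted class order, e.g. a diagnosis
    # column holding 'B'/'M' becomes 0/1.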
    # In[1220]:
    # normalize every column to [0, 1]
    x = df.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled)
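    # MinMaxScaler computes x' = (x - x_min) / (x_max - x_min) per column.
    # The label column is scaled too, which is harmless here: a binary 0/1
    # label is unchanged by min-max scaling.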
    # In[1221]:
    # 60/40 train/test split, then split train into an unlabeled pool and a
    # small labeled set
    train, test = np.split(df.sample(frac=1), [int(.6 * len(df))])
    unlabel, label = np.split(train.sample(frac=1), [int(.8 * len(train))])
    test = test.values.tolist()
    nolabels1 = unlabel.values.tolist()
    del unlabel[0]  # drop column 0 (the class label) to make the pool unlabeled
    nolabels = unlabel.values.tolist()
    labels = label.values.tolist()
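    # Net proportions: 40% test, 48% unlabeled pool (0.8 * 0.6), and 12%
    # labeled training data (0.2 * 0.6).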
    # In[1222]:
    # remember the true label for every row in the unlabeled pool
    true_labels = {}
    for row in nolabels1:
        true_labels[tuple(row[1:])] = row[0]
# In[1223]:
#train f1...fn classifiers on labelled data, will use 6 types: decision stumps, knn, svm, guassian mixture mode
# native bayes, logistic regression
x = []
y = []
x_true = []
y_true = []
for row in labels:
x.append(row[1:])
y.append(row[:1][0])
x_true.append(row[1:])
y_true.append(row[:1][0])
clf1 = svm.SVC(kernel='linear', gamma='scale').fit(x, y)
clf2 = KNeighborsClassifier(n_neighbors=3).fit(x, y)
clf3 = DecisionTreeClassifier(splitter= 'random').fit(x, y)
clf4 = DecisionTreeClassifier(splitter= 'random').fit(x, y)
clf5 = DecisionTreeClassifier(splitter= 'random').fit(x, y)
clf6 = GaussianMixture(n_components = 2, init_params= 'random').fit(x,y)
clf7 = GaussianMixture(n_components = 2, init_params= 'random').fit(x,y)
clf8 = GaussianMixture(n_components = 2, init_params= 'random').fit(x,y)
clf9 = GaussianNB().fit(x,y)
clf10 = LogisticRegression(solver='liblinear').fit(x,y)
classifiers = [clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, clf10]
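    # These ten fitted models act as noisy "workers". Note that
    # GaussianMixture.fit ignores y (it is unsupervised), so its predictions
    # are cluster ids that may not align with the class labels, making those
    # three especially noisy annotators.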
    # In[1224]:
    # build a csv of (row number, classifier number, prediction on that row)
    answers = []
    for point in range(len(nolabels)):
        for clf in range(len(classifiers)):
            answers.append([point, clf, classifiers[clf].predict([nolabels[point]])])
    f = open("answer_file.csv", "w")
    f.write('question,worker,answer;\n')
    for answer in answers:
        f.write(str(answer[0]) + ',' + str(answer[1]) + ',' + str(int(answer[2][0])) + '\n')
    f.close()
    # create (or truncate) the result file for the consensus script to fill
    p = open("result_file.csv", "w")
    p.close()
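    # answer_file.csv ends up with one row per (point, classifier) pair,
    # for example:
    #   question,worker,answer;
    #   0,0,1
    #   0,1,0
    #   ...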
    # In[1225]:
    # run the external consensus aggregation (run.py with the EM method;
    # its results are referred to as VI-BP further down)
    subprocess.call(["python", "run.py", "methods/c_EM/method.py",
                     "answer_file.csv", "result_file.csv", "decision-making"])
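    # run.py and methods/c_EM/method.py are external files from the
    # crowd-consensus tool and are expected to sit next to this script; if
    # they are missing, the majority_vote helper at the top of the file is a
    # crude substitute.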
    # In[1226]:
    # extract the consensus results (the noisy labels) from the result file
    filepath = "result_file.csv"
    noisy_labels = []
    with open(filepath) as fp:
        for line in fp:
            questionAnswer = line.split(',')
            noisy_labels.append(questionAnswer)
    # In[1227]:
    # assign each noisy label to its proper row, skipping the header line
    df_noise_x = []
    df_noise_y = []
    for question in noisy_labels:
        if question[0].rstrip() == 'question':
            continue
        df_noise_x += [nolabels[int(question[0].rstrip())]]
        df_noise_y.append(int(question[1].rstrip()))
    # count consensus labels that disagree with the ground truth
    count_vi = 0
    for el in range(len(df_noise_x)):
        if true_labels[tuple(df_noise_x[el])] != df_noise_y[el]:
            count_vi += 1
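    # count_vi / len(df_noise_x) is the consensus error rate on the pool:
    # the fraction of aggregated labels that disagree with the ground truth,
    # i.e. the label-noise level fed into the noisy AdaBoost below.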
# In[1228]:
df_noise_x += x
df_noise_y += y
df_noise = []
for el in range(len(df_noise_x)):
new = df_noise_x[el]
new.append(df_noise_y[el])
df_noise.append(new)
#need to shuffle the data
random.shuffle(df_noise)
df_noise_x = []
df_noise_y = []
for row in df_noise:
df_noise_x.append(row[:-1])
df_noise_y.append(row[-1:][0])
    # In[1229]:
    # AdaBoost (decision stumps, SAMME, 20 rounds) trained on the noisy data
    bdt2 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                              algorithm="SAMME",
                              n_estimators=20)
    bdt2.fit(df_noise_x, df_noise_y)
    # In[1230]:
    # the same AdaBoost trained on the small truly labeled set
    bdt1 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                              algorithm="SAMME",
                              n_estimators=20)
    bdt1.fit(x_true, y_true)
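    # SAMME reweights training points after every round, concentrating on
    # misclassified ones, so persistent label noise in df_noise tends to pull
    # weight toward wrong labels; comparing bdt2 against bdt1 measures that
    # cost relative to training on clean labels.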
    # In[1231]:
    # error count for AdaBoost trained on noisy labels
    # (the class label sits in column 0, the features in columns 1..n)
    errors = []
    count1 = 0
    for point in test:
        est = int(bdt2.predict([point[1:]])[0])
        actual = int(point[0])
        if est == actual:
            errors.append([point[1:], 0])
        else:
            count1 += 1
            errors.append([point[1:], 1])
    # error count for AdaBoost trained on the small supervised set
    errors = []
    count = 0
    for point in test:
        est = int(bdt1.predict([point[1:]])[0])
        actual = int(point[0])
        if est == actual:
            errors.append([point[1:], 0])
        else:
            count += 1
            errors.append([point[1:], 1])
    # error rates: noisy-trained model vs. supervised baseline
    # print('\nBreast-cancer: errors of VI-BP:', count_vi, 'out of:', len(df_noise_x),
    #       'errors of noisy AdaBoost:', count1, 'errors of supervised AdaBoost:', count,
    #       'error rate of noisy AdaBoost:', count1 / len(test),
    #       'error rate of supervised AdaBoost:', count / len(test))
    return (count1 / len(test), count / len(test))
# In[ ]:
# average both error rates over 50 independent runs
avg1 = 0
avg2 = 0
for _ in range(50):
    ret = fxn()
    avg1 += ret[0]
    avg2 += ret[1]
avg1 = avg1 / 50
avg2 = avg2 / 50
print('avg error, AdaBoost on noisy labels:', avg1,
      '| avg error, AdaBoost on true labels:', avg2)
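# For reproducible averages one could seed the RNGs up front (a sketch; the
# seed value is arbitrary, and the sklearn estimators would additionally need
# fixed random_state arguments):
#   random.seed(0)
#   np.random.seed(0)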