-
Notifications
You must be signed in to change notification settings - Fork 0
/
iGEMDataAnalysis.jl
215 lines (195 loc) · 5.68 KB
/
iGEMDataAnalysis.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
using Flux
using CSV
using DataFrames
using Random
using BSON: @save
#Counts number of lines in the cvs, this is the number of feature vectors
#Input: data, a CSV file
#Output: n,Integer number of rows
function countcsvlines(data)
#counter
n = 0
#iterate over rows and count
for row in eachrow(data)
n += 1
end
return n
end
#Gets the number of rows which is the number of features in each feature vector
#-1
#Input: data, a CSV file
#Output: n,Integer number of columns -1
function getNrFeatures(data)
#counter
n=0
#iterate over the last row and count
for i in data[end]
n +=1
end
return n-1
end
#Normalize data such that all elements are on [0,1] for each feature
#Input: data, a multidimensional array
#Output: data, a multidimensional array
function minMaxNormalization(data)
#for each colmn in data
for i in 1:size(data)[2]
#find max in column
max=maximum(data[:,i])
#find min in column
min=minimum(data[:,i])
#scale column
data[:,i]=(data[:,i].-min)/(max-min)
end
#return scaled data
return data
end
#=
Evaluates are model with the test data and returns
Input:
1. model, Flux model our nerual net
2. test_data, multidimensional array
Output:
1. Array of integers. [positive,negative,false positive,false negative]
=#
function getEvaluation(model,test_data)
#variables to store results
FP=0
FN=0
P=0
N=0
#for all rows in test data
for i in 1:size(test_data)[1]
#Evalute feature vector i with model
res=model(test_data[i][1])
#remove extra brackets
res=res[1]
#if target is 1
if test_data[i][2]==1
#rightly classified as positive
if res>0.5
P+=1
#wrongly classified as negative
else
FN+=1
end
#if target is 0
else
#rightly classified as negative
if res<=0.5
N+=1
#wrongly classified as positive
else
FP+=1
end
end
end
#return values in an array
return [P,N,FP,FN]
end
#=
Calculate accuracy based on output from evaluation function
Input:
1. eval, Array of integers [positive,negative,false positive,false negative]
Output:
1. Float64, (rightly classified)/(wrongly classified+rightly classified)
=#
function getAccuracy(eval)
return (eval[1]+eval[2])/(sum(eval))
end
#=
#Splits data as close as possible to the train_percentage then returns an array
#of the form [data_train, data_test]
Input:
1. train_percentage, Float 64,
2. data, Multidimensional array, data used of traning and testing the classifier
Output:
1. D_train,D_test multidimensional array
=#
function splitDataTraningAndTest(train_percentage,data)
#find cutoff index
cutoff=Int(ceil(length(data)*train_percentage))
#split data
D_test=data[1:cutoff]
D_train=data[cutoff+1:end]
return D_train,D_test
end
#Main script
let
#saves the neural net (Flux model) as bson file if set true
savemodel=true
#Imports CSV file, each line contains a target value of 0 or 1 followed by
#a feature vector
data=CSV.File("dataforAnalysis.CSV",header=false)
#percentage of data that is for traning, the rest is for testing
train_percentage=0.9
#get number of feature vectors
N_vec=countcsvlines(data)
#get number of features in each vector
N_feat=getNrFeatures(data)
#Initialization of matrix to hold feature vectors
Xs=zeros(N_vec,N_feat)
#initialization of vector to hold target values
Ys=zeros(N_vec,1)
#counter
i=1
#Convert CSV into feature vectors and targets
#for each row
for row in eachrow(data)
temp_row=[]
#add each element of that row to the temporary vector
for entry in row
append!(temp_row,entry)
end
#add all features to feature matrix s.t. each row is a feature
Xs[i,:]=temp_row[2:N_feat+1]
#add the target to the target vector
Ys[i]=Int(temp_row[1])
#increment
i+=1
end
#Normalize data
Xs=minMaxNormalization(Xs)
#Convert to Float32 to save memmory
Xs=convert(Array{Float32},Xs)
Ys=convert(Array{Float32},Ys)
#Create nerual net model, example: Flux.Chain(Dense(N_feat,20,σ),Dense(20,1))
#has N feature inputs, with one hidde layer that has 20 nodes and one
#output layer with one node
model=Flux.Chain(Dense(N_feat,40,σ),Dense(40,30),Dense(30,15),Dense(15,10),Dense(10,5),Dense(5,1))
#Create our loss function, mse = mean square error
L(x,y)=Flux.Losses.mse(model(x), y)
#model parameters
par=Flux.params(model)
#eta our learning rate
η=0.001
#The optimizer, decides how we change our parameters to minimize loss
#Use classic gradient decent
opt = ADAM(η, (0.9, 0.999))
#Combine feature vector and target into a vector of touples
D=Vector(undef,N_vec)
for i in 1:N_vec
dat=(Xs[i,:],Ys[i])
D[i,:]=[dat]
end
#Shoufle data randomly
shuffle!(D)
D_test,D_train=splitDataTraningAndTest(train_percentage,D)
#Number of traning steps
echos=1000
for e in 1:echos
#train our model (update weights)
Flux.train!(L,par,D_train,opt)
end
#evalute model
eval=getEvaluation(model,D_test)
#get accuracy given output from getEvaluation
acc=getAccuracy(eval)
#print results
println("Accuracy: ",acc," ","Positive: ",eval[1]," ","Negative: ",
eval[2]," ","False positive: ",eval[3]," ","False negative: ",eval[4])
#save model as "FluxModel.bson" in project directory
if savemodel
@save "FluxModel.bson" model
end
end