-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPlain_DNN.py
584 lines (434 loc) · 18.8 KB
/
Plain_DNN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 29 12:20:22 2018
@author: Esmaeil Seraj <[email protected]>
@website: https://github.com/EsiSeraj/
Plain Deep Neural Network (Without any improvement technique, a plain
implementation of deep neural networks)
- All required helper functions
- tanh, relu and sigmoid activations for hidden layers
- Sigmoid output for binary classification
- A comprehensive but plain DNN model generator
Reguired Packages
- numpy
- matplotlib.pyplot
# NOTE: this function gets regular updats; for now, it only includes equations
and computations for sigmoid, tanh and ReLU non-linearities, additional
non-linearities are to be added in the future
Copyright (C) <2018> <Esmaeil Seraj>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
"""
# The main steps for building a Neural Network are:
1. Define the model structure (such as number of input features)
2. Initialize the model's parameters and set the hyperparameters
3. Loop:
- Calculate current loss (forward propagation)
- Calculate current gradient (backward propagation)
- Update parameters (gradient descent)
4. Use learnt parameters to predict the labels on train data (forward)
5. Use learnt parameters to predict the labels on test data
"""
# In[0]: loading packages
import numpy as np
import matplotlib.pyplot as plt
# In[1]: activation functions
def sigmoid(Z):
"""
This function Computes the sigmoid activation of z in numpy
Arguments:
Z -- A scalar or numpy array of any shape
Returns:
A -- output of sigmoid(z), same shape as Z
cache -- returns Z as well, useful during backpropagation
"""
A = 1/(1 + np.exp(-Z))
cache = Z
assert(A.shape == Z.shape)
return A, cache
def relu(Z):
"""
This function implements the RELU function in numoy
Arguments:
Z -- A scalar or numpy array of any shape
Returns:
A -- Post-activation parameter, of the same shape as Z
cache -- returns Z as well, useful during backpropagation
"""
A = np.maximum(0, Z)
cache = Z
assert(A.shape == Z.shape)
return A, cache
def tanh(Z):
"""
This function implements (uses) the tanh function in numoy for hidden-layer
activation function
Arguments:
Z -- A scalar or numpy array of any shape
Returns:
A -- Post-activation parameter, of the same shape as Z
cache -- returns Z as well, useful during backpropagation
"""
A = np.tanh(Z)
cache = Z
assert(A.shape == Z.shape)
return A, cache
def sigmoid_backward(dA, cache):
"""
This function implements the backward propagation for a single SIGMOID unit
Arguments:
dA -- post-activation gradient, of any shape
cache -- 'Z' where we store for computing backward propagation
Returns:
dZ -- Gradient of the cost with respect to Z
"""
Z = cache
s = 1/(1 + np.exp(-Z))
dZ = dA * s * (1-s)
assert (dZ.shape == Z.shape)
return dZ
def relu_backward(dA, cache):
"""
This function implements the backward propagation for a single RELU unit
Arguments:
dA -- post-activation gradient, of any shape
cache -- 'Z' where we store for computing backward propagation
Returns:
dZ -- Gradient of the cost with respect to Z
"""
Z = cache
dZ = np.array(dA, copy = True) # just converting dz to a correct object
# setting dz = 0, When z <= 0
dZ[Z <= 0] = 0
assert (dZ.shape == Z.shape)
return dZ
def tanh_backward(dA, cache):
"""
This function implements the backward propagation for a single tanh unit
Arguments:
dA -- post-activation gradient, of any shape
cache -- 'Z' where we store for computing backward propagation
Returns:
dZ -- Gradient of the cost with respect to Z
"""
Z = cache
s = np.tanh(Z)
dZ = dA * (1 - np.power(s, 2))
assert (dZ.shape == Z.shape)
return dZ
# In[2]: initialize parameters
def initialize_parameters_deep(layer_dims):
"""
This function initializes the parameters w and b for an L-layer NN
Arguments:
layer_dims -- python array (list) containing the dimensions of each layer
in our network
Returns:
parameters -- python dictionary containing all parameters "W1..l", "b1..l",
where:
Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
bl -- bias vector of shape (layer_dims[l], 1)
"""
parameters = {}
L = len(layer_dims) # number of layers in the network
for l in range(1, L):
parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.01
parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
assert(parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l-1]))
assert(parameters['b' + str(l)].shape == (layer_dims[l], 1))
return parameters
# In[3]: forward propagation module
def linear_forward(A_prev, W, b):
"""
This function implements the linear part of a layer's forward propagation
Arguments:
A_prev -- activations from previous layer (or input data) of shape:
(dimension of previous layer, number of examples)
W -- weights matrix: numpy array of shape:
(dimension of current layer, size of previous layer)
b -- bias vector, numpy array of shape:
(dimension of the current layer, 1)
Returns:
Z -- pre-activation, the input of the activation function (non-linear part)
cache -- a python dictionary containing "A", "W" and "b"; stored for
computing the backward pass efficiently
"""
Z = np.dot(W, A_prev) + b
cache = (A_prev, W, b)
assert(Z.shape == (W.shape[0], A_prev.shape[1]))
return Z, cache
def activation_forward(A_prev, W, b, activation):
"""
This function implements the activation function applied to the linear
forward (pre-activation)
Arguments:
A_prev -- activations from previous layer (or input data) of shape:
(dimension of previous layer, number of examples)
W -- weights matrix: numpy array of shape:
(size of current layer, size of previous layer)
b -- bias vector, numpy array of shape:
(size of the current layer, 1)
activation -- the activation to be used in this layer, stored as a text
string: "sigmoid", "relu" or "tanh"
Returns:
A -- post-activation, the output of the activation function
cache -- python dictionary containing "linear_cache" & "activation_cache";
stored for computing the backward pass efficiently
just remember that, linear_cache contains "activation from
previous layer" + "W" + "b" and activation_cache contains "Z", all
corresponding to the same block (layer)
"""
if activation == "sigmoid":
Z, linear_cache = linear_forward(A_prev, W, b)
A, activation_cache = sigmoid(Z)
elif activation == "relu":
Z, linear_cache = linear_forward(A_prev, W, b)
A, activation_cache = relu(Z)
elif activation == "tanh":
Z, linear_cache = linear_forward(A_prev, W, b)
A, activation_cache = tanh(Z)
assert (A.shape == (W.shape[0], A_prev.shape[1]))
cache = (linear_cache, activation_cache)
return A, cache
def forward_propagation_deep(X, parameters, activation):
"""
This function implements forward propagation for an L-layer deep NN with
sigmoid output layer
Arguments:
X -- data, numpy array of shape (input dimension, number of examples)
parameters -- output of initialize_parameters_deep(), containing all model
parameters
activation -- the activation function to be used in forward path, stored
as a string "sigmoid", "tanh", "relu"
Returns:
AL -- last post-activation value (or y_hat, a.k.a probabilities)
caches -- list of caches containing every cache of activation_forward()
(there are L-1 of them, indexed from 0 to L-1)
"""
caches = []
A = X
L = len(parameters) // 2 # number of layers in the neural network
# forward path from input to layer L-1
for l in range(1, L):
A_prev = A
A, cache = activation_forward(A_prev, parameters['W' + str(l)],
parameters['b' + str(l)], activation)
caches.append(cache)
# last part of the forward path (L-th layer), or output layer
AL, cache = activation_forward(A, parameters['W' + str(L)],
parameters['b'+str(L)], activation="sigmoid")
caches.append(cache)
assert(AL.shape == (1,X.shape[1]))
return AL, caches
# In[4]: cost function
def compute_cost(AL, Y):
"""
This function Computes the cross-entropy cost function
Arguments:
AL -- probability vector corresponding to all label predictions of shape:
(1, number of examples)
Y -- "true" binary labels vector (i.e. containing 0 if non-cat, 1 if cat)
of shape (1, number of examples)
Returns:
cost -- cross-entropy cost
"""
m = Y.shape[1]
cost = -(1/m)*np.sum((np.multiply(Y, np.log(AL))) + (np.multiply(1-Y, np.log(1-AL))))
cost = np.squeeze(cost)
assert(cost.shape == ())
return cost
# In[5]: backward propagation module
def linear_backward(dZ, cache):
"""
This function implements the linear portion of backward propagation for a
single layer (layer l)
Arguments:
dZ -- Gradient of the cost with respect to the linear output
(of current layer l)
cache -- tuple of values (A_prev, W, b) coming from the forward propagation
in the current layer
Returns:
dA_prev -- Gradient of the cost with respect to the activation
(of the previous layer l-1), same shape as A_prev
dW -- Gradient of the cost with respect to W (current layer l),
same shape as W
db -- Gradient of the cost with respect to b (current layer l),
same shape as b
"""
A_prev, W, b = cache
m = A_prev.shape[1]
dW = (1/m)*np.dot(dZ, A_prev.T)
db = (1/m)*np.sum(dZ, axis=1, keepdims = True)
dA_prev = np.dot(W.T, dZ)
assert (dA_prev.shape == A_prev.shape)
assert (dW.shape == W.shape)
assert (db.shape == b.shape)
return dA_prev, dW, db
def activation_backward(dA, cache, activation):
"""
This function implements the backward propagation for the activation
function applied to the linear forward during forward pass
Arguments:
dA -- post-activation gradient for current layer l
cache -- tuple of values (linear_cache, activation_cache) that we stored in
forward path
activation -- the activation that was used in this layer during forward
path, stored as a text string: "sigmoid", "relu" or "tanh"
Returns:
dA_prev -- Gradient of the cost with respect to the activation
(of the previous layer l-1), same shape as A_prev
dW -- Gradient of the cost with respect to W (current layer l),
same shape as W
db -- Gradient of the cost with respect to b (current layer l),
same shape as b
"""
linear_cache, activation_cache = cache
if activation == "relu":
dZ = relu_backward(dA, activation_cache)
dA_prev, dW, db = linear_backward(dZ, linear_cache)
elif activation == "tanh":
dZ = tanh_backward(dA, activation_cache)
dA_prev, dW, db = linear_backward(dZ, linear_cache)
elif activation == "sigmoid":
dZ = sigmoid_backward(dA, activation_cache)
dA_prev, dW, db = linear_backward(dZ, linear_cache)
return dA_prev, dW, db
def backward_propagation_deep(AL, Y, caches, activation):
"""
This function implements the backward propagation for an L-layer deep NN
with sigmoid output layer
Arguments:
AL -- probability vector, output of the forward propagation
Y -- "true" binary labels vector (i.e. containing 0 if non-cat, 1 if cat)
caches -- list of caches we stored in forward path
activation -- the activation function that was used in forward path, stored
as a string "sigmoid", "tanh", "relu"
Returns:
grads -- A dictionary with all the gradients, including: "dA", "dW" & "db"
"""
grads = {}
L = len(caches) # the number of layers
# m = AL.shape[1]
Y = Y.reshape(AL.shape) # just to make sure!
# Initializing the backpropagation with derivative of cost with respect to AL
dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
current_cache = caches[L-1] # corresponding to the last layer
grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = activation_backward(dAL,
current_cache, activation = "sigmoid")
# Loop from l=L-2 to l=0
for l in reversed(range(L-1)):
current_cache = caches[l]
dA_prev_temp, dW_temp, db_temp = activation_backward(grads["dA" + str(l+1)],
current_cache, activation)
grads["dA" + str(l)] = dA_prev_temp
grads["dW" + str(l + 1)] = dW_temp
grads["db" + str(l + 1)] = db_temp
return grads
# In[6]: parameter update
def update_parameters_deep(parameters, grads, learning_rate = 0.005):
"""
This function updates parameters in an L-layer DNN using gradient descent
Arguments:
parameters -- python dictionary containing all model parameters
grads -- python dictionary containing all gradients
learning_rate -- learning rate of the gradient descent update rule,
(default = 0.005)
Returns:
parameters -- python dictionary containing your updated parameters
"""
L = len(parameters) // 2 # number of layers in the neural network
for l in range(L):
parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate*grads["dW" + str(l+1)]
parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate*grads["db" + str(l+1)]
return parameters
# In[7]: Plain deep neural network model generator
def plain_nn_model_deep(X, Y, layer_dims, activation, learning_rate = 0.005,
num_iterations = 3000, print_cost = True, plot_lrn_curve = True):
"""
This function implements an L-layer deep neural network in the format of:
["linear"->"activation"]*(L-1) --> "linear"->"sigmoid"
Arguments:
X -- data, numpy array of shape (dimension, number of examples)
Y -- "true" binary labels vector (i.e. 0 if cat, 1 if non-cat), of shape:
(1, number of examples)
layer_dims -- list containing the input and each other layer's dimension,
of length (number of layers + 1)
activation -- the activation function to be used in forward path, stored
as a string "sigmoid", "tanh", "relu"
learning_rate -- learning rate of the gradient descent update rule,
(default = 0.005)
num_iterations -- number of iterations of the optimization loop
print_cost -- if True, it prints the cost every 100 steps
plot_lrn_curve -- if True, it plots the learning curve
Returns:
parameters -- parameters learnt by the model by which prediction can occur
costs -- list of all the costs computed during the optimization, this will
be used to plot the learning curve
"""
costs = []
# Parameters initialization
parameters = initialize_parameters_deep(layer_dims)
# Loop (gradient descent)
for i in range(0, num_iterations):
# Forward propagation
AL, caches = forward_propagation_deep(X, parameters, activation)
# Compute cost
cost = compute_cost(AL, Y)
# Backward propagation
grads = backward_propagation_deep(AL, Y, caches, activation)
# Update parameters
parameters = update_parameters_deep(parameters, grads, learning_rate)
# Print the cost every 100 training example
if print_cost and i % 100 == 0:
print ("Cost after iteration %i: %f" % (i, cost))
# Record the cost value every 100 iterations
if i % 100 == 0:
costs.append(cost)
# plot the cost
if plot_lrn_curve == True:
plt.plot(np.squeeze(costs))
plt.ylabel('cost')
plt.xlabel('iterations (per hundreds)')
plt.title("Learning rate =" + str(learning_rate))
plt.show()
return parameters, costs
# In[8]: prediction
def predict_deep(X, Y, parameters, activation, print_accuracy = True):
"""
This function is used to predict the results of an L-layer deep neural net
Arguments:
X -- data, numpy array of shape (dimension, number of examples)
Y -- "true" binary labels vector (i.e. 0 if cat, 1 if non-cat), of shape:
(1, number of examples)
parameters -- parameters of the trained model
activation -- the activation function that was used to train the model in
its hidden layers
print_accuracy -- if True, it prints the value of accuracy
Returns:
predictions -- predictions for the given dataset X
"""
m = X.shape[1]
# n = len(parameters) // 2 # number of layers in the neural network
predictions = np.zeros((1, m))
# Forward propagation
probabilities, caches = forward_propagation_deep(X, parameters, activation)
# convert the probabilities to 0/1 predictions
for i in range(0, probabilities.shape[1]):
if probabilities[0, i] > 0.5:
predictions[0, i] = 1
else:
predictions[0, i] = 0
#print results
if print_accuracy == True:
print("Accuracy on this dataset is: " + str(np.sum((predictions == Y)/m)*100) + "%")
return predictions
# In[]: