import torch
import numpy as np
# ################################################################### #
# Implementing a 2 layer network with 1 hidden layer using only numpy #
# ################################################################### #
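# The network below computes y_pred = relu(x . w1) . w2 and is trained to
# minimise the sum-of-squares loss sum((y_pred - y)^2) by applying the chain
# rule by hand and doing plain gradient-descent updates.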
N = 64 # batch size
D_in = 1000 # input dimension
H = 100 # hidden dimension
D_out = 10 # output dimension
# create a random input of shape [batch_size, input_dim]
x = np.random.randn(N, D_in)
# create a random output of shape [batch_size, output_dim]
y = np.random.randn(N, D_out)
# randomly initialize the weights of the network
# w1 of shape [input_dim, hidden_dim]
w1 = np.random.randn(D_in, H)
# w2 of shape [hidden_dim, output_dim]
w2 = np.random.randn(H, D_out)
# learning rate
learning_rate = 1e-6
print('--------------------------------')
print("Training the network using numpy")
print('--------------------------------')
for epoch in range(500):
    # forward pass
    i2h = x.dot(w1)
    relu_activation = np.maximum(i2h, 0)
    y_pred = relu_activation.dot(w2)
    # loss
    loss = np.square(y_pred - y).sum()
    if epoch % 50 == 0:
        print(f"epoch : {epoch}, loss : {loss}")
    # backpropagation
    # compute the gradients of w1, w2 w.r.t. the loss
    # dL/dw2 = dL/dy_pred * dy_pred/dw2
    #        = relu_activation^T . (2 * (y_pred - y))
    grad_y_pred = 2.0 * (y_pred - y)
    # transpose relu_activation so that grad_w2 matches w2's shape [H, D_out]
    grad_w2 = relu_activation.T.dot(grad_y_pred)
    # dL/dw1 = dL/dy_pred * dy_pred/drelu * drelu/di2h * di2h/dw1
    #        = x^T . ((2 * (y_pred - y)) . w2^T, zeroed where i2h < 0)
    grad_h_relu = grad_y_pred.dot(w2.T)  # shape is [N, H]
    grad_h = grad_h_relu.copy()
    grad_h[i2h < 0] = 0  # zero the gradient where the ReLU was inactive
    grad_w1 = x.T.dot(grad_h)  # shape is [D_in, H]
    # update the weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
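# A quick sanity check for the hand-derived gradients (a minimal sketch added
# for illustration; the helper and the underscore-prefixed names are not part
# of the original script): compare the analytic dL/dw2[0, 0] at the final
# weights against a central finite difference.
def _np_loss(w1_, w2_):
    return np.square(np.maximum(x.dot(w1_), 0).dot(w2_) - y).sum()
_relu = np.maximum(x.dot(w1), 0)
_analytic = _relu.T.dot(2.0 * (_relu.dot(w2) - y))[0, 0]
_eps = 1e-4
_w2_plus, _w2_minus = w2.copy(), w2.copy()
_w2_plus[0, 0] += _eps
_w2_minus[0, 0] -= _eps
_numeric = (_np_loss(w1, _w2_plus) - _np_loss(w1, _w2_minus)) / (2 * _eps)
print(f"grad check dL/dw2[0, 0] -> analytic: {_analytic:.4f}, numeric: {_numeric:.4f}")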
# ################################################################################ #
# Implementing the above 2 layer network with 1 hidden layer using pytorch tensors #
# ################################################################################ #
print('-----------------------------------------')
print("Training the network using pytorch tensor")
print('-----------------------------------------')
# define the default dtype and device to use
dtype = torch.float
device = torch.device("cpu") # can use gpu also with torch.device("cuda:0")
# create a random input of shape [batch_size, input_dim]
x = torch.randn(N, D_in, device=device, dtype=dtype)
# create a random output of shape [batch_size, output_dim]
y = torch.randn(N, D_out, device=device, dtype=dtype)
# randomly initialize the weights of the network
# w1 of shape [input_dim, hidden_dim]
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
# w2 of shape [hidden_dim, output_dim]
w2 = torch.randn(H, D_out, device=device, dtype=dtype)
for epoch in range(500):
    # forward pass
    i2h = x.mm(w1)
    relu_activation = i2h.clamp(min=0)
    y_pred = relu_activation.mm(w2)
    # loss
    loss = (y_pred - y).pow(2).sum().item()
    if epoch % 50 == 0:
        print(f"epoch : {epoch}, loss : {loss}")
    # backpropagation
    # compute the gradients of w1, w2 w.r.t. the loss
    # dL/dw2 = dL/dy_pred * dy_pred/dw2
    #        = relu_activation^T . (2 * (y_pred - y))
    grad_y_pred = 2.0 * (y_pred - y)
    # transpose relu_activation so that grad_w2 matches w2's shape [H, D_out]
    grad_w2 = relu_activation.t().mm(grad_y_pred)
    # dL/dw1 = dL/dy_pred * dy_pred/drelu * drelu/di2h * di2h/dw1
    #        = x^T . ((2 * (y_pred - y)) . w2^T, zeroed where i2h < 0)
    grad_h_relu = grad_y_pred.mm(w2.t())  # shape is [N, H]
    grad_h = grad_h_relu.clone()
    grad_h[i2h < 0] = 0  # zero the gradient where the ReLU was inactive
    grad_w1 = x.t().mm(grad_h)  # shape is [D_in, H]
    # update the weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
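# Side note (an added check, not in the original flow): clamp(min=0) used above
# is the same operation as torch.relu; this assertion confirms it on the last
# hidden pre-activation computed in the loop.
assert torch.equal(i2h.clamp(min=0), torch.relu(i2h))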
# ################################################################################# #
# Implementing the above 2 layer network with 1 hidden layer using pytorch autograd #
# ################################################################################# #
print('-------------------------------------------')
print("Training the network using pytorch autograd")
print('-------------------------------------------')
# create a random input of shape [batch_size, input_dim]
x = torch.randn(N, D_in, device=device, dtype=dtype)
# create a random output of shape [batch_size, output_dim]
y = torch.randn(N, D_out, device=device, dtype=dtype)
# randomly initialize the weights of the network
# setting requires_grad=True indicates that we want gradients computed with
# respect to these tensors during the backward pass
# w1 of shape [input_dim, hidden_dim]
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
# w2 of shape [hidden_dim, output_dim]
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)
for epoch in range(500):
    # forward pass
    i2h = x.mm(w1)
    relu_activation = i2h.clamp(min=0)
    y_pred = relu_activation.mm(w2)
    # loss
    loss = (y_pred - y).pow(2).sum()
    if epoch % 50 == 0:
        print(f"epoch : {epoch}, loss : {loss.item()}")
    # this call computes the gradients of the loss w.r.t. all tensors
    # with requires_grad=True
    # after the call, w1.grad and w2.grad hold the gradient of the loss
    # w.r.t. w1 and w2
    loss.backward()
    # update the weights
    # the weight updates don't need to be tracked by autograd, so wrap the
    # update code in torch.no_grad()
    # the updates can also be done with torch.optim.SGD
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # manually zero the gradients after updating the weights
        w1.grad.zero_()
        w2.grad.zero_()
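# A small illustration of what requires_grad=True buys us (added sketch; the
# _probe name is just for demonstration): operations on w1/w2 are recorded in
# the autograd graph, so a fresh forward pass yields an output with a grad_fn.
_probe = x.mm(w1).clamp(min=0).mm(w2)
print("output has grad_fn attached:", _probe.grad_fn is not None)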
# ################################################################################### #
# Implementing the above 2 layer network with 1 hidden layer using pytorch nn package #
# ################################################################################### #
print('--------------------------------------------')
print("Training the network using pytorch nn module")
print('--------------------------------------------')
# create a random input of shape [batch_size, input_dim]
x = torch.randn(N, D_in, device=device, dtype=dtype)
# create a random output of shape [batch_size, output_dim]
y = torch.randn(N, D_out, device=device, dtype=dtype)
# define the model using torch.nn. nn.Sequential is a Module that contains other
# Modules and applies them in sequence to produce its output.
# nn.Linear computes its output from the input using a linear (affine) function
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out)
)
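# The Linear layers own their weight and bias tensors; model.parameters() (used
# below for the manual update) yields all four of them. A quick inspection,
# added here for illustration:
for name, p in model.named_parameters():
    print(name, tuple(p.shape))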
# loss function
loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-4
for epoch in range(500):
    # forward pass
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if epoch % 50 == 0:
        print(f"epoch : {epoch}, loss : {loss.item()}")
    # zero the gradients before the backward pass
    model.zero_grad()
    # backward pass
    loss.backward()
    # weight update
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
# ############################################################################## #
# Implementing the above 2 layer network with 1 hidden layer using pytorch optim #
# ############################################################################## #
print('----------------------------------------')
print("Training the network using pytorch optim")
print('----------------------------------------')
# create a random input of shape [batch_size, input_dim]
x = torch.randn(N, D_in, device=device, dtype=dtype)
# create a random output of shape [batch_size, output_dim]
y = torch.randn(N, D_out, device=device, dtype=dtype)
# define the model using torch.nn. nn.Sequential is a Module that contains other
# Modules and applies them in sequence to produce its output.
# nn.Linear computes its output from the input using a linear (affine) function
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out)
)
# loss function
loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
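# Adam keeps per-parameter running statistics and adapts the step size, unlike
# the plain SGD updates written out by hand above. optimizer.zero_grad() could
# be used below in place of model.zero_grad() with the same effect, since the
# optimizer was given all of the model's parameters.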
for epoch in range(500):
    # forward pass
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if epoch % 50 == 0:
        print(f"epoch : {epoch}, loss : {loss.item()}")
    # zero the gradients before the backward pass
    model.zero_grad()
    # backward pass
    loss.backward()
    # weight update using the optimizer
    optimizer.step()
# ######################################################################## #
# Implementing the above 2 layer network with 1 hidden layer using classes #
# ######################################################################## #
print('------------------------------------------')
print("Training the network using pytorch classes")
print('------------------------------------------')
class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
    def forward(self, x):
        h = self.linear1(x)
        h_relu = h.clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred
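# This module is equivalent to the nn.Sequential model used above: clamp(min=0)
# in forward applies the ReLU non-linearity between the two Linear layers.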
learning_rate = 1e-4
model = TwoLayerNet(D_in, H, D_out)
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# create a random input of shape [batch_size, input_dim]
x = torch.randn(N, D_in, device=device, dtype=dtype)
# create a random output of shape [batch_size, output_dim]
y = torch.randn(N, D_out, device=device, dtype=dtype)
for epoch in range(500):
    # forward pass
    y_pred = model(x)
    loss = criterion(y_pred, y)
    if epoch % 50 == 0:
        print(f"epoch : {epoch}, loss : {loss.item()}")
    # zero the gradients, perform the backward pass, and update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
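# A short usage sketch (added for illustration; `sample` is a made-up input):
# run the trained model on a new input without tracking gradients.
with torch.no_grad():
    sample = torch.randn(1, D_in, device=device, dtype=dtype)
    prediction = model(sample)
print("prediction shape:", tuple(prediction.shape))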