CustomOptimizers.py
import math
import torch
from torch.optim.optimizer import Optimizer
class SGD(Optimizer):
    def __init__(self, parameters, lr_min, lr_max, period, weight_decay=0, momentum=0, dampening=0):
        """
        :param parameters: iterable of parameters to optimize or dicts defining
            parameter groups
        :param lr_min: lower bound for the cosine rate decay (crd) schedule
        :param lr_max: upper bound for crd
        :param period: number of iterations the cosine schedule is stretched over (used in crd)
        :param weight_decay: weight decay (L2 penalty on the weights)
        :param momentum: momentum factor
        :param dampening: dampening for momentum
        """
        if lr_min < 0.0 or lr_max < 0.0:
            raise ValueError("Invalid learning rates: {}, {} - should be >= 0.0".format(lr_min, lr_max))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if dampening < 0:
            raise ValueError("Invalid dampening value: {}".format(dampening))
        # per-group defaults handed to the base Optimizer
        defaults = {"lr_min": lr_min, "lr_max": lr_max, "period": period, "weight_decay": weight_decay,
                    "momentum": momentum, "dampening": dampening}
        super(SGD, self).__init__(parameters, defaults)

    def __setstate__(self, state):
        super(SGD, self).__setstate__(state)

    def step(self, curr_iter, closure=None):
        """
        Perform a single optimization step.

        :param curr_iter: current iteration index, passed to the cosine schedule
        :param closure: optional closure that re-evaluates the model and returns the loss
        """
        loss = None
        if closure is not None:
            loss = closure()
        for group in self.param_groups:
            momentum_list = []
            param_list = []
            # learning rate for this step, taken from the cosine schedule
            lr = self.rateDecay(group['lr_min'], group['lr_max'], curr_iter, group['period'])
            for p in group['params']:
                # skip parameters that did not receive a gradient
                if p.grad is None:
                    continue
                d_p = p.grad
                param_list.append(p)
                # momentum buffer saved on the previous step, if any
                state = self.state[p]
                momentum_buf = state.get('prev_momentum', None)
                # fold the L2 penalty into the gradient
                if group['weight_decay'] != 0:
                    d_p = d_p.add(p, alpha=group['weight_decay'])
                # classical momentum with dampening
                if group['momentum'] != 0:
                    if momentum_buf is None:
                        momentum_buf = torch.clone(d_p).detach()
                    else:
                        momentum_buf.mul_(group['momentum']).add_(d_p, alpha=1 - group['dampening'])
                    d_p = momentum_buf
                momentum_list.append(momentum_buf)
                # gradient step with the scheduled learning rate
                p.data.add_(d_p, alpha=-lr)
            # persist the momentum buffers for the next step
            for p, momentum in zip(param_list, momentum_list):
                self.state[p]['prev_momentum'] = momentum
        return loss

    def rateDecay(self, lr_min, lr_max, curr_iter, period):
        # cosine-shaped schedule: the learning rate moves between lr_max
        # (when the cosine is 1) and lr_min (when the cosine is -1) as
        # curr_iter / period sweeps through radians
        new_lr = lr_min + 1/2 * (lr_max - lr_min) * (1 + math.cos(curr_iter / period))
        return new_lr
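

# Minimal usage sketch (not part of the original module): assumes a toy linear
# model and synthetic data purely for illustration; the lr_min/lr_max/period
# values below are arbitrary example settings, not recommendations.
if __name__ == "__main__":
    torch.manual_seed(0)
    model = torch.nn.Linear(10, 1)
    criterion = torch.nn.MSELoss()
    optimizer = SGD(model.parameters(), lr_min=0.001, lr_max=0.1, period=100,
                    weight_decay=1e-4, momentum=0.9)
    x = torch.randn(32, 10)
    y = torch.randn(32, 1)
    for curr_iter in range(200):
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        # step() takes the iteration index so it can evaluate the cosine schedule
        optimizer.step(curr_iter)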