from utils import itemlist
import numpy as np
import theano
import theano.tensor as tensor
# Optimizers.
# Each optimizer follows the same contract:
#   name(lr, tparams, grads, inp, cost, ...) -> (f_grad_shared, f_update)
# where `lr` is a symbolic learning-rate scalar, `tparams` is an (ordered) dict
# of Theano shared parameters, `grads` are the symbolic gradients of `cost`
# w.r.t. the parameters, and `inp` is the list of input variables.
# `f_grad_shared(*inputs)` computes the cost (plus any `extra` outputs) and
# stores the gradients in shared variables; `f_update(lrate)` then applies the
# parameter update.
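#
# A typical training loop would use the two functions as sketched below; the
# names `train_iterator` and `lrate` are illustrative only and are not defined
# in this module:
#
#   f_grad_shared, f_update = adadelta(lr, tparams, grads, inp, cost, extra)
#   for batch in train_iterator:
#       outs = f_grad_shared(*batch)   # forward/backward pass, cache gradients
#       f_update(lrate)                # apply the parameter update
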
def adadelta(lr, tparams, grads, inp, cost, extra):
    # shared storage for the raw gradients and the two running averages
    zipped_grads = [theano.shared(p.get_value() * np.float32(0.), name='%s_grad' % k) for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * np.float32(0.), name='%s_rup2' % k) for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * np.float32(0.), name='%s_rgrad2' % k) for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]
    f_grad_shared = theano.function(inp, [cost] + extra, updates=zgup + rg2up,
                                    profile=False, on_unused_input='ignore')

    # step scaled by the ratio of the running RMS of past updates and gradients
    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]
    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore', profile=False)

    return f_grad_shared, f_update
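
# For reference, the rule implemented above (Adadelta, Zeiler 2012), with decay
# 0.95 and epsilon 1e-6; g_t is the gradient and dx_t the applied step:
#   E[g^2]_t  = 0.95 * E[g^2]_{t-1}  + 0.05 * g_t^2
#   dx_t      = - sqrt(E[dx^2]_{t-1} + 1e-6) / sqrt(E[g^2]_t + 1e-6) * g_t
#   E[dx^2]_t = 0.95 * E[dx^2]_{t-1} + 0.05 * dx_t^2
#   p_t       = p_{t-1} + dx_t
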
def adam(lr, tparams, grads, inp, cost):
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inp, cost, updates=gsup)

    # note: the step size lr0 is hard-coded; the `lr` argument is unused
    # (hence on_unused_input='ignore' below)
    lr0 = 0.0002
    b1 = 0.1
    b2 = 0.001
    e = 1e-8

    updates = []

    # timestep counter and bias-correction factors
    i = theano.shared(np.float32(0.))
    i_t = i + 1.
    fix1 = 1. - b1**(i_t)
    fix2 = 1. - b2**(i_t)
    lr_t = lr0 * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        # first and second moment estimates
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore')

    return f_grad_shared, f_update
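
# For reference, the rule implemented above (Adam, Kingma & Ba 2014, in the
# parameterisation used here with b1 = 0.1, b2 = 0.001, e = 1e-8):
#   m_t  = b1 * g_t   + (1 - b1) * m_{t-1}
#   v_t  = b2 * g_t^2 + (1 - b2) * v_{t-1}
#   lr_t = lr0 * sqrt(1 - b2^t) / (1 - b1^t)
#   p_t  = p_{t-1} - lr_t * m_t / (sqrt(v_t) + e)
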
def rmsprop(lr, tparams, grads, inp, cost):
    # shared storage for the raw gradients and running first/second moments
    zipped_grads = [theano.shared(p.get_value() * np.float32(0.), name='%s_grad' % k) for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * np.float32(0.), name='%s_rgrad' % k) for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * np.float32(0.), name='%s_rgrad2' % k) for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]
    f_grad_shared = theano.function(inp, cost, updates=zgup + rgup + rg2up, profile=False)

    # momentum on the update direction, normalised by the (centred) running RMS
    updir = [theano.shared(p.get_value() * np.float32(0.), name='%s_updir' % k) for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(itemlist(tparams), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore', profile=False)

    return f_grad_shared, f_update
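
# For reference, the rule implemented above (a centred RMSProp with momentum;
# fixed step 1e-4, decay 0.95, momentum 0.9; the `lr` argument is unused):
#   E[g]_t   = 0.95 * E[g]_{t-1}   + 0.05 * g_t
#   E[g^2]_t = 0.95 * E[g^2]_{t-1} + 0.05 * g_t^2
#   u_t      = 0.9 * u_{t-1} - 1e-4 * g_t / sqrt(E[g^2]_t - E[g]_t^2 + 1e-4)
#   p_t      = p_{t-1} + u_t
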
def sgd(lr, tparams, grads, inp, cost, extra):
    '''
    SGD with classical momentum: accumulate the gradients into a momentum
    cache, then apply the parameter update.
    '''
    momentum = 0.9

    # the cache is local to this function and cannot be reached from outside
    grad_cache = [theano.shared(p.get_value() * np.float32(0.),
                                name='%s_grad_cache' % k)
                  for k, p in tparams.iteritems()]

    # update the caches: v <- momentum * v + g
    grad_cache_update = [(g_c, g_c * momentum + g)
                         for g_c, g in zip(grad_cache, grads)]
    param_update = [(p, p - lr * g_c)
                    for k, p, g_c in zip(tparams.keys(),
                                         tparams.values(),
                                         grad_cache)]

    # two functions: one to refresh the gradient cache, one to apply the update
    f_grad_cache_update = theano.function(inp, [cost] + extra,
                                          updates=grad_cache_update,
                                          name='f_grad_cache_update')
    f_param_update = theano.function([lr], [],
                                     updates=param_update,
                                     name='f_param_update')

    return f_grad_cache_update, f_param_update
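
# A minimal smoke test, as a sketch only: it assumes a working Theano
# installation with floatX=float32 and follows the Python 2 conventions of the
# rest of this file (the optimizers above rely on dict.iteritems()). The toy
# least-squares model below is illustrative and not part of this repository.
if __name__ == '__main__':
    from collections import OrderedDict

    rng = np.random.RandomState(0)
    tparams = OrderedDict()
    tparams['W'] = theano.shared(rng.randn(5, 1).astype('float32'), name='W')

    # single linear layer fit by mean squared error
    x = tensor.matrix('x', dtype='float32')
    y = tensor.matrix('y', dtype='float32')
    cost = tensor.mean((tensor.dot(x, tparams['W']) - y) ** 2)
    grads = tensor.grad(cost, wrt=list(tparams.values()))

    lr = tensor.scalar(name='lr', dtype='float32')
    f_grad_shared, f_update = sgd(lr, tparams, grads, [x, y], cost, extra=[])

    # synthetic data generated from a fixed random weight vector
    data_x = rng.randn(100, 5).astype('float32')
    data_y = np.dot(data_x, rng.randn(5, 1)).astype('float32')
    for epoch in range(100):
        f_grad_shared(data_x, data_y)
        f_update(np.float32(0.1))
    print('final cost: %f' % f_grad_shared(data_x, data_y)[0])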