import numpy as np

def initialize_parameters(n_a, n_x, n_y):
    """
    Initialize parameters with small random values.

    Returns:
    parameters -- python dictionary containing:
        Wax -- Weight matrix multiplying the input, of shape (n_a, n_x)
        Waa -- Weight matrix multiplying the hidden state, of shape (n_a, n_a)
        Wya -- Weight matrix relating the hidden state to the output, of shape (n_y, n_a)
        ba -- Bias of the hidden state, numpy array of shape (n_a, 1)
        by -- Bias relating the hidden state to the output, of shape (n_y, 1)
    """
    np.random.seed(1)
    Wax = np.random.randn(n_a, n_x) * 0.01  # input to hidden
    Waa = np.random.randn(n_a, n_a) * 0.01  # hidden to hidden
    Wya = np.random.randn(n_y, n_a) * 0.01  # hidden to output
    ba = np.zeros((n_a, 1))                 # hidden bias
    by = np.zeros((n_y, 1))                 # output bias
    parameters = {"Wax": Wax, "Waa": Waa, "Wya": Wya, "ba": ba, "by": by}
    return parameters

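# A minimal shape check for the initializer (an illustrative sketch, not part of the original
# assignment code; n_a=50 and vocab_size=27 mirror the defaults used further down in this file).
def _demo_parameter_shapes():
    params = initialize_parameters(n_a=50, n_x=27, n_y=27)
    assert params["Wax"].shape == (50, 27)  # input -> hidden
    assert params["Waa"].shape == (50, 50)  # hidden -> hidden
    assert params["Wya"].shape == (27, 50)  # hidden -> output
    assert params["ba"].shape == (50, 1) and params["by"].shape == (27, 1)
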
def softmax(x):
    # Subtract the max to avoid overflow for large inputs; because softmax is invariant
    # to adding a constant to every logit, the result is unchanged.
    e_x = np.exp(x - np.max(x))
    return e_x / np.sum(e_x, axis=0)

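# Quick numerical sanity check for softmax (illustrative sketch): the output column sums to 1,
# and shifting the logits by a constant leaves the distribution unchanged.
def _demo_softmax():
    z = np.array([[1.0], [2.0], [3.0]])
    p = softmax(z)
    assert np.isclose(p.sum(), 1.0)
    assert np.allclose(p, softmax(z + 100.0))  # the shift invariance used above
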
def rnn_step_forward(xt, a_prev, parameters):
    """
    Implements a single forward step of the RNN cell that uses a tanh
    activation function.

    Arguments:
    xt -- input data at timestep "t", of shape (n_x, m)
    a_prev -- hidden state at timestep "t-1", of shape (n_a, m)
        Here n_x is the dimension of the input (word/character) vector and n_a is the
        number of hidden units in the RNN cell; in this character-level model m = 1.
    parameters -- python dictionary containing:
        Wax -- Weight matrix multiplying the input, of shape (n_a, n_x)
        Waa -- Weight matrix multiplying the hidden state, of shape (n_a, n_a)
        Wya -- Weight matrix relating the hidden state to the output, of shape (n_y, n_a)
        ba -- Bias, of shape (n_a, 1)
        by -- Bias relating the hidden state to the output, of shape (n_y, 1)

    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", of shape (n_y, m)
    """
    # Retrieve parameters from "parameters"
    Wax = parameters["Wax"]  # (n_a, n_x)
    Waa = parameters["Waa"]  # (n_a, n_a)
    Wya = parameters["Wya"]  # (n_y, n_a)
    ba = parameters["ba"]    # (n_a, 1)
    by = parameters["by"]    # (n_y, 1)
    a_next = np.tanh(np.dot(Waa, a_prev) + np.dot(Wax, xt) + ba)  # (n_a, m)
    yt_pred = softmax(np.dot(Wya, a_next) + by)                   # (n_y, m)
    return a_next, yt_pred

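# Single-step usage sketch (illustrative values; the one-hot input matches how rnn_forward
# builds x[t] below):
def _demo_rnn_step_forward():
    params = initialize_parameters(n_a=50, n_x=27, n_y=27)
    xt = np.zeros((27, 1))
    xt[3] = 1                    # one-hot encoding of an input character
    a_prev = np.zeros((50, 1))   # initial hidden state
    a_next, yt_pred = rnn_step_forward(xt, a_prev, params)
    assert a_next.shape == (50, 1) and yt_pred.shape == (27, 1)
    assert np.isclose(yt_pred.sum(), 1.0)  # softmax output is a probability distribution
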
def rnn_forward(X, Y, a0, parameters, vocab_size=27):
    """
    Run the RNN forward over a sequence of character indices X with targets Y,
    starting from hidden state a0, and accumulate the cross-entropy loss.
    Returns the loss and a cache (y_hat, a, x) for backpropagation.
    """
    x, a, y_hat = {}, {}, {}
    a[-1] = np.copy(a0)
    # Initialize the loss to 0
    loss = 0
    for t in range(len(X)):
        # Set x[t] to be the one-hot vector representation of the t'th character in X.
        # If X[t] is None, x[t] stays all zeros; this makes the input for the first
        # timestep the zero vector.
        x[t] = np.zeros((vocab_size, 1))
        if X[t] is not None:
            x[t][X[t]] = 1
        # Run one step forward of the RNN
        a[t], y_hat[t] = rnn_step_forward(x[t], a[t - 1], parameters)  # a[t]: (n_a, 1), y_hat[t]: (n_y, 1)
        # Update the loss by subtracting the cross-entropy term of this timestep. Because the
        # true label is one-hot, this is just the log of the predicted probability at the
        # position of the 1 in the label vector.
        loss -= np.log(y_hat[t][Y[t], 0])
    cache = (y_hat, a, x)
    return loss, cache

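# Forward-pass sketch over a toy sequence (illustrative indices, not real training data).
# X uses None as the start token; Y is X shifted left, ending with a terminating index.
def _demo_rnn_forward():
    params = initialize_parameters(n_a=50, n_x=27, n_y=27)
    a0 = np.zeros((50, 1))
    X = [None, 3, 5, 7]
    Y = [3, 5, 7, 0]   # index 0 would be '\n' in the dinosaur-name setup below
    loss, (y_hat, a, x) = rnn_forward(X, Y, a0, params)
    assert len(y_hat) == len(X) and a[len(X) - 1].shape == (50, 1)
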
def rnn_step_backward(dy, gradients, parameters, x, a, a_prev):
    gradients['dWya'] += np.dot(dy, a.T)
    gradients['dby'] += dy
    Wya = parameters['Wya']
    Waa = parameters['Waa']
    # Each cell receives two upstream gradients: one flowing in through da_next
    # (from the next timestep) and one through y_hat (this timestep's output).
    da = np.dot(Wya.T, dy) + gradients['da_next']
    dtanh = (1 - a * a) * da  # backprop through the tanh nonlinearity: d/dz tanh(z) = 1 - tanh(z)^2
    gradients['dba'] += dtanh
    gradients['dWax'] += np.dot(dtanh, x.T)
    gradients['dWaa'] += np.dot(dtanh, a_prev.T)
    gradients['da_next'] = np.dot(Waa.T, dtanh)
    return gradients

def rnn_backward(X, Y, parameters, cache):
    # Initialize gradients as an empty dictionary
    gradients = {}
    # Retrieve from cache and parameters
    (y_hat, a, x) = cache
    Waa, Wax, Wya, by, ba = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['ba']
    # Each gradient is initialized to zeros of the same shape as its corresponding parameter
    gradients['dWax'], gradients['dWaa'], gradients['dWya'] = np.zeros_like(Wax), np.zeros_like(Waa), np.zeros_like(Wya)
    gradients['dba'], gradients['dby'] = np.zeros_like(ba), np.zeros_like(by)
    gradients['da_next'] = np.zeros_like(a[0])
    # Backpropagate through time
    for t in reversed(range(len(X))):
        dy = np.copy(y_hat[t])
        # Compute y_hat - y (prediction minus target). Because the target is a one-hot
        # vector with a single 1, subtracting it only subtracts 1 at the true index;
        # every other entry is unchanged.
        dy[Y[t]] -= 1
        gradients = rnn_step_backward(dy, gradients, parameters, x[t], a[t], a[t - 1])
    return gradients, a

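# Backward-pass sketch chained onto a toy forward pass (illustrative): each gradient
# has the same shape as the parameter it will later update.
def _demo_rnn_backward():
    params = initialize_parameters(n_a=50, n_x=27, n_y=27)
    X, Y = [None, 3, 5], [3, 5, 0]
    loss, cache = rnn_forward(X, Y, np.zeros((50, 1)), params)
    grads, a = rnn_backward(X, Y, params, cache)
    assert grads["dWax"].shape == params["Wax"].shape
    assert grads["dWaa"].shape == params["Waa"].shape and grads["dby"].shape == params["by"].shape
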
# Gradient clipping
def clip(gradients, maxValue):
    '''
    Clips the gradients' values between -maxValue and maxValue.

    Arguments:
    gradients -- a dictionary containing the gradients "dWaa", "dWax", "dWya", "dba", "dby"
    maxValue -- everything above this number is set to this number, and everything
                below -maxValue is set to -maxValue

    Returns:
    gradients -- a dictionary with the clipped gradients.
    '''
    dWaa, dWax, dWya, dba, dby = gradients['dWaa'], gradients['dWax'], gradients['dWya'], gradients['dba'], gradients['dby']
    # Clip in place to mitigate exploding gradients, looping over [dWax, dWaa, dWya, dba, dby]
    for gradient in [dWax, dWaa, dWya, dba, dby]:
        np.clip(gradient, -maxValue, maxValue, out=gradient)
    gradients = {"dWaa": dWaa, "dWax": dWax, "dWya": dWya, "dba": dba, "dby": dby}
    return gradients

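# Clipping sketch (illustrative values): np.clip with out= modifies the arrays in place,
# so the entries of the returned dictionary are the clipped originals.
def _demo_clip():
    grads = {"dWaa": np.array([[10.0, -10.0]]), "dWax": np.zeros((1, 1)),
             "dWya": np.zeros((1, 1)), "dba": np.zeros((1, 1)), "dby": np.zeros((1, 1))}
    clipped = clip(grads, 5)
    assert clipped["dWaa"].max() == 5.0 and clipped["dWaa"].min() == -5.0
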
def update_parameters(parameters, gradients, lr):
    parameters['Wax'] += -lr * gradients['dWax']
    parameters['Waa'] += -lr * gradients['dWaa']
    parameters['Wya'] += -lr * gradients['dWya']
    parameters['ba'] += -lr * gradients['dba']
    parameters['by'] += -lr * gradients['dby']
    return parameters

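# One plain gradient-descent step (sketch): the update is in place, so the returned
# dictionary is the same object that was passed in.
def _demo_update_parameters():
    params = initialize_parameters(n_a=2, n_x=3, n_y=3)
    grads = {"d" + name: np.ones_like(value) for name, value in params.items()}
    before = params["Waa"].copy()
    update_parameters(params, grads, lr=0.1)
    assert np.allclose(params["Waa"], before - 0.1)
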
def sample(parameters, char_to_ix, seed):
    """
    Sample a sequence of characters according to the sequence of probability
    distributions output by the RNN.

    Arguments:
    parameters -- python dictionary containing the parameters Waa, Wax, Wya, by, and ba.
    char_to_ix -- python dictionary mapping each character to an index.
    seed -- used for grading purposes. Do not worry about it.

    Returns:
    indices -- a list of length n containing the indices of the sampled characters.
    """
    # Retrieve parameters and relevant shapes from the "parameters" dictionary
    Waa, Wax, Wya, by, ba = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['ba']
    vocab_size = by.shape[0]
    n_a = Waa.shape[1]
    # Step 1: Create the one-hot vector x for the first character (initializing the sequence generation).
    x = np.zeros((vocab_size, 1))
    # Step 1': Initialize a_prev as zeros
    a_prev = np.zeros((n_a, 1))
    # Create an empty list that will contain the indices of the characters to generate
    indices = []
    # idx is a flag used to detect the newline character; initialize it to -1
    idx = -1
    # Loop over time-steps t. At each time-step, sample a character from a probability
    # distribution and append its index to "indices". Stop after 50 characters (which
    # should be very unlikely with a well-trained model); this helps with debugging and
    # prevents an infinite loop.
    counter = 0
    newline_character = char_to_ix['\n']
    while idx != newline_character and counter != 50:
        # Step 2: Forward propagate x using the equations (1), (2) and (3)
        a = np.tanh(np.dot(Wax, x) + np.dot(Waa, a_prev) + ba)
        z = np.dot(Wya, a) + by
        y = softmax(z)
        # for grading purposes
        np.random.seed(counter + seed)
        # Step 3: Sample the index of a character within the vocabulary from the probability
        # distribution y. Passing an int as the first argument is equivalent to
        # np.random.choice(np.arange(vocab_size), p=y.ravel()), i.e. sampling from the
        # indices 0, 1, ..., vocab_size - 1.
        idx = np.random.choice(vocab_size, p=y.ravel())
        # Append the index to "indices"
        indices.append(idx)
        # Step 4: Overwrite the input with the one-hot vector of the sampled index;
        # the character generated at this step becomes the input of the next time-step.
        x = np.zeros((vocab_size, 1))
        x[idx] = 1
        # Update "a_prev" to be "a"
        a_prev = a
        # for grading purposes
        seed += 1
        counter += 1
    if counter == 50:
        indices.append(char_to_ix['\n'])
    return indices

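# Sampling sketch (illustrative vocabulary of 'a'-'z' plus '\n', matching the setup in the
# main block below): the returned index list always ends with the newline index.
def _demo_sample():
    chars = sorted(list("abcdefghijklmnopqrstuvwxyz") + ["\n"])
    char_to_ix = {ch: i for i, ch in enumerate(chars)}
    params = initialize_parameters(n_a=50, n_x=27, n_y=27)
    indices = sample(params, char_to_ix, seed=0)
    assert indices[-1] == char_to_ix["\n"]
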
def optimize(X, Y, a_prev, parameters, learning_rate=0.01):
    """
    Execute one step of the optimization to train the model.

    Arguments:
    X -- list of integers, where each integer is a number that maps to a character in the vocabulary.
    Y -- list of integers, exactly the same as X but shifted one index to the left.
    a_prev -- previous hidden state.
    parameters -- python dictionary containing:
        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
        Wya -- Weight matrix relating the hidden state to the output, numpy array of shape (n_y, n_a)
        ba -- Bias, numpy array of shape (n_a, 1)
        by -- Bias relating the hidden state to the output, numpy array of shape (n_y, 1)
    learning_rate -- learning rate for the model.

    Returns:
    loss -- value of the loss function (cross-entropy)
    parameters -- the updated parameters
    a[len(X)-1] -- the last hidden state, of shape (n_a, 1)
    """
    # Forward propagate through time
    loss, cache = rnn_forward(X, Y, a_prev, parameters)
    # Backpropagate through time
    gradients, a = rnn_backward(X, Y, parameters, cache)
    # Clip the gradients between -5 (min) and 5 (max)
    gradients = clip(gradients, 5)
    # Update parameters
    parameters = update_parameters(parameters, gradients, learning_rate)
    return loss, parameters, a[len(X) - 1]

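# End-to-end sketch of one optimization step on a toy index sequence (illustrative):
def _demo_optimize():
    params = initialize_parameters(n_a=50, n_x=27, n_y=27)
    a_prev = np.zeros((50, 1))
    X = [None, 3, 5, 7]
    Y = [3, 5, 7, 0]
    loss, params, a_last = optimize(X, Y, a_prev, params, learning_rate=0.01)
    assert a_last.shape == (50, 1) and loss > 0
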
def get_initial_loss(vocab_size, seq_length):
    return -np.log(1.0 / vocab_size) * seq_length


def smooth(loss, cur_loss):
    return loss * 0.999 + cur_loss * 0.001

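# get_initial_loss is the loss of a model that predicts uniformly over the vocabulary,
# summed over seq_length characters: -log(1/vocab_size) * seq_length. smooth() is an
# exponential moving average (decay 0.999) that damps step-to-step noise in the reported loss.
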
def print_sample(sample_ix, ix_to_char):
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    txt = txt[0].upper() + txt[1:]  # capitalize the first character
    print('%s' % (txt,), end='')

def model(data, ix_to_char, char_to_ix, num_iterations=35000, n_a=50, dino_names=7, vocab_size=27):
    """
    Trains the model and generates dinosaur names.

    Arguments:
    data -- text corpus
    ix_to_char -- dictionary that maps an index to a character
    char_to_ix -- dictionary that maps a character to an index
    num_iterations -- number of iterations to train the model for
    n_a -- number of units of the RNN cell
    dino_names -- number of dinosaur names to sample at each sampling step
    vocab_size -- number of unique characters found in the text, size of the vocabulary

    Returns:
    parameters -- learned parameters
    """
    # Retrieve n_x and n_y from vocab_size
    n_x, n_y = vocab_size, vocab_size
    # Initialize parameters
    parameters = initialize_parameters(n_a, n_x, n_y)
    # Initialize the loss (required because we smooth the loss across iterations)
    loss = get_initial_loss(vocab_size, dino_names)
    # Initialize the hidden state of the RNN
    a_prev = np.zeros((n_a, 1))
    # Optimization loop
    for j in range(num_iterations):
        # Define one training example (X, Y): a single dinosaur name, with Y shifted one
        # position to the left and terminated by the newline character
        index = j % len(data)
        X = [None] + [char_to_ix[ch] for ch in data[index]]
        Y = X[1:] + [char_to_ix["\n"]]
        # Perform one optimization step: forward-prop -> backward-prop -> clip -> update
        # parameters, with a learning rate of 0.01
        curr_loss, parameters, a_prev = optimize(X, Y, a_prev, parameters, learning_rate=0.01)
        # Keep an exponentially smoothed running loss so the reported value is less noisy
        loss = smooth(loss, curr_loss)
        # Every 2000 iterations, generate dino_names names with sample() to check whether
        # the model is learning properly
        if j % 2000 == 0:
            print('Iteration: %d, Loss: %f' % (j, loss) + '\n')
            seed = 0
            for name in range(dino_names):
                # Sample indices and print them
                sampled_indices = sample(parameters, char_to_ix, seed)
                print_sample(sampled_indices, ix_to_char)
                seed += 1  # increment the seed to get the same results for grading purposes
            print('\n')
    return parameters

if __name__ == "__main__":
    with open('dinos.txt', 'r') as f:
        data = f.read()
    data = data.lower()
    chars = list(set(data))  # str -> set, e.g. '123' becomes an unordered, de-duplicated set like {'1', '2', '3'}
    print(chars)
    data_size, vocab_size = len(data), len(chars)
    print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))
    char_to_ix = {ch: i for i, ch in enumerate(sorted(chars))}
    ix_to_char = {i: ch for i, ch in enumerate(sorted(chars))}
    print(ix_to_char)
    # Build the list of all dinosaur names (training examples).
    with open("dinos.txt") as f:
        examples = f.readlines()
    examples = [x.lower().strip() for x in examples]
    # Shuffle the list of all dinosaur names
    np.random.seed(0)
    np.random.shuffle(examples)
    parameters = model(examples, ix_to_char, char_to_ix)