From 80a79a3c4ce46679a11335667069d4fb04eaafe4 Mon Sep 17 00:00:00 2001
From: Alexey Kruglov
Date: Sun, 19 Nov 2017 17:46:02 +0300
Subject: [PATCH 1/3] Speed up training

... by feeding the whole sequence to CuDNN RNN, as opposed to
character-by-character.
---
 generate.py | 11 +++++------
 model.py    | 18 ++++++++----------
 train.py    | 12 +++++++-----
 3 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/generate.py b/generate.py
index 0fdf414..45d9ed3 100755
--- a/generate.py
+++ b/generate.py
@@ -18,14 +18,13 @@ def generate(decoder, prime_str='A', predict_len=100, temperature=0.8, cuda=Fals
     predicted = prime_str
 
     # Use priming string to "build up" hidden state
-    for p in range(len(prime_str) - 1):
-        _, hidden = decoder(prime_input[:,p], hidden)
-
-    inp = prime_input[:,-1]
-
+    _, hidden = decoder(prime_input, hidden)
+
+    inp = prime_input[0,-1].unsqueeze(0)
+
     for p in range(predict_len):
         output, hidden = decoder(inp, hidden)
-
+
         # Sample from the network as a multinomial distribution
         output_dist = output.data.view(-1).div(temperature).exp()
         top_i = torch.multinomial(output_dist, 1)[0]
diff --git a/model.py b/model.py
index b619634..1768352 100644
--- a/model.py
+++ b/model.py
@@ -15,22 +15,20 @@ def __init__(self, input_size, hidden_size, output_size, model="gru", n_layers=1
 
         self.encoder = nn.Embedding(input_size, hidden_size)
         if self.model == "gru":
-            self.rnn = nn.GRU(hidden_size, hidden_size, n_layers)
+            self.rnn = nn.GRU(hidden_size, hidden_size, n_layers, batch_first=True)
         elif self.model == "lstm":
-            self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers)
+            self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers, batch_first=True)
         self.decoder = nn.Linear(hidden_size, output_size)
 
     def forward(self, input, hidden):
+        """
+        input: shape=(batch_size, seq_size)
+        output: shape=(batch_size, seq_size, output_size)
+        """
         batch_size = input.size(0)
         encoded = self.encoder(input)
-        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
-        output = self.decoder(output.view(batch_size, -1))
-        return output, hidden
-
-    def forward2(self, input, hidden):
-        encoded = self.encoder(input.view(1, -1))
-        output, hidden = self.rnn(encoded.view(1, 1, -1), hidden)
-        output = self.decoder(output.view(1, -1))
+        output, hidden = self.rnn(encoded, hidden)
+        output = self.decoder(output)
         return output, hidden
 
     def init_hidden(self, batch_size):
diff --git a/train.py b/train.py
index 9050e57..a02a864 100755
--- a/train.py
+++ b/train.py
@@ -49,20 +49,22 @@ def random_training_set(chunk_len, batch_size):
     return inp, target
 
 def train(inp, target):
+    """
+    inp: (batch_size, seq_size)
+    target: (batch_size, seq_size)
+    """
     hidden = decoder.init_hidden(args.batch_size)
     if args.cuda:
         hidden = hidden.cuda()
     decoder.zero_grad()
 
-    loss = 0
-    for c in range(args.chunk_len):
-        output, hidden = decoder(inp[:,c], hidden)
-        loss += criterion(output.view(args.batch_size, -1), target[:,c])
+    output, hidden = decoder(inp, hidden)
+    loss = criterion(output.view(-1, output.size(-1)), target.view(-1))
 
     loss.backward()
     decoder_optimizer.step()
 
-    return loss.data[0] / args.chunk_len
+    return loss.data[0]
 
 def save():
     save_filename = os.path.splitext(os.path.basename(args.filename))[0] + '.pt'
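The speedup in PATCH 1/3 comes from replacing chunk_len separate decoder calls per batch with one RNN call over the whole chunk: with batch_first=True, nn.GRU and nn.LSTM take the full (batch, seq, features) tensor and iterate the time steps inside cuDNN rather than in a Python loop. Below is a minimal sketch of the two code paths, assuming the PyTorch 0.3-era Variable API this series targets; the sizes are illustrative and not taken from the patch.

    import torch
    from torch import nn
    from torch.autograd import Variable

    # Illustrative sizes (assumed, not from the patch).
    batch_size, seq_len, hidden_size = 16, 200, 100
    rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
    x = Variable(torch.randn(batch_size, seq_len, hidden_size))
    h0 = Variable(torch.zeros(1, batch_size, hidden_size))

    # Old path: one RNN call per character. Each step pays kernel-launch
    # and Python-loop overhead for a tiny amount of work.
    outs, h = [], h0
    for t in range(seq_len):
        out, h = rnn(x[:, t:t+1, :], h)
        outs.append(out)
    out_loop = torch.cat(outs, dim=1)   # (batch, seq, hidden)

    # New path: one call over the whole sequence. cuDNN iterates the
    # time steps internally, so the per-step overhead disappears.
    out_seq, h_seq = rnn(x, h0)         # (batch, seq, hidden)

The same layout is why forward can drop its per-step view reshapes: nn.Linear applies over the last dimension, so decoding all time steps of the (batch, seq, hidden) output is a single matrix multiply, and train computes the cross-entropy over the flattened (batch * seq, output_size) logits in one call.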
From 3e0a865fe39c2c69b4e1997dcc90716cb23222ea Mon Sep 17 00:00:00 2001
From: Alexey Kruglov
Date: Sun, 19 Nov 2017 17:58:51 +0300
Subject: [PATCH 2/3] Fix bug with index out of range

---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index a02a864..766b0de 100755
--- a/train.py
+++ b/train.py
@@ -36,7 +36,7 @@ def random_training_set(chunk_len, batch_size):
     inp = torch.LongTensor(batch_size, chunk_len)
     target = torch.LongTensor(batch_size, chunk_len)
     for bi in range(batch_size):
-        start_index = random.randint(0, file_len - chunk_len)
+        start_index = random.randint(0, file_len - chunk_len - 1)
         end_index = start_index + chunk_len + 1
         chunk = file[start_index:end_index]
         inp[bi] = char_tensor(chunk[:-1])

From 601f1ed08abfd815353afdae436a858d6488b2e8 Mon Sep 17 00:00:00 2001
From: Alexey Kruglov
Date: Sun, 19 Nov 2017 18:28:36 +0300
Subject: [PATCH 3/3] Save GPU memory by not storing history during inference

---
 generate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/generate.py b/generate.py
index 45d9ed3..d1057fc 100755
--- a/generate.py
+++ b/generate.py
@@ -10,7 +10,7 @@
 
 def generate(decoder, prime_str='A', predict_len=100, temperature=0.8, cuda=False):
     hidden = decoder.init_hidden(1)
-    prime_input = Variable(char_tensor(prime_str).unsqueeze(0))
+    prime_input = Variable(char_tensor(prime_str).unsqueeze(0), volatile=True)
 
     if cuda:
         hidden = hidden.cuda()
@@ -32,7 +32,7 @@ def generate(decoder, prime_str='A', predict_len=100, temperature=0.8, cuda=Fals
     # Add predicted character to string and use as next input
     predicted_char = all_characters[top_i]
     predicted += predicted_char
-    inp = Variable(char_tensor(predicted_char).unsqueeze(0))
+    inp = Variable(char_tensor(predicted_char).unsqueeze(0), volatile=True)
 
     if cuda:
         inp = inp.cuda()
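Two notes on the fixes above. In PATCH 2/3, random_training_set needs chunk_len + 1 characters (inputs plus one-character-shifted targets), and random.randint is inclusive on both ends, so the old bound file_len - chunk_len could put end_index one past the end of the file; the slice then came back a character short and the copy into the preallocated chunk_len-wide tensors failed. In PATCH 3/3, under the pre-0.4 autograd API used throughout this series, volatile=True propagates from an input Variable to every result computed from it, so the generation loop keeps no intermediate activations for a backward pass. A minimal sketch under that assumption follows (decoder and hidden stand in for the objects in generate.py); PyTorch 0.4 later removed volatile in favor of the torch.no_grad() context manager.

    import torch
    from torch.autograd import Variable

    # Hypothetical one-character input of shape (1, 1); index 5 is made up.
    inp = Variable(torch.LongTensor([[5]]), volatile=True)

    # Every Variable derived from a volatile input is itself volatile, so
    # a call like decoder(inp, hidden) builds no autograd history and each
    # step's activations are freed as soon as the step finishes.

    # PyTorch >= 0.4 equivalent, after volatile was removed:
    #   with torch.no_grad():
    #       output, hidden = decoder(inp, hidden)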