diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6d63ec3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__/
+.idea/
+*.pt
+Save/
diff --git a/README.md b/README.md
index 21a04b8..e5182bc 100644
--- a/README.md
+++ b/README.md
@@ -9,28 +9,36 @@ Download [this Shakespeare dataset](https://raw.githubusercontent.com/karpathy/c
 Run `train.py` with the dataset filename to train and save the network:
 
 ```
-> python train.py shakespeare.txt
+> python train.py --train shakespeare.txt
 Training for 2000 epochs...
 (... 10 minutes later ...)
 Saved as shakespeare.pt
 ```
 
 After training the model will be saved as `[filename].pt`.
+The trained model and its training/validation losses are written to the `Save/` folder, created in the directory from which `train.py` is run; if a validation set (`--valid`) and a model name (`--modelname`) are supplied, a checkpoint is also saved there every time the validation loss improves.
 
 ### Training options
 
 ```
-Usage: train.py [filename] [options]
+Usage: train.py [options]
 
 Options:
---model            Whether to use LSTM or GRU units       gru
---n_epochs         Number of epochs to train              2000
---print_every      Log learning rate at this interval     100
---hidden_size      Hidden size of GRU                     50
---n_layers         Number of GRU layers                   2
---learning_rate    Learning rate                          0.01
---chunk_len        Length of training chunks              200
---batch_size       Number of examples per batch           100
+--train            Training data file
+--valid            Validation data file
+--model            Whether to use LSTM or GRU units               gru
+--n_epochs         Number of epochs to train                      2000
+--print_every      Print progress and a sample at this interval   100
+--hidden_size      Hidden size of GRU                             100
+--n_layers         Number of GRU layers                           2
+--learning_rate    Learning rate                                  0.01
+--chunk_len        Length of training chunks                      200
+--batch_size       Number of examples per batch                   100
+--batch_type       Batch sampling: random (0) or sequential (1)   0
+--dropout          Dropout rate between recurrent layers          0.3
+--early_stopping   Epochs without validation improvement          10
+--modelname        Model (session) name, used for checkpoints
+
 --cuda             Use CUDA
 ```
@@ -59,3 +67,43 @@ Options:
 --cuda             Use CUDA
 ```
+
+### Grid search
+```
+Usage: search_params.py --train <training file> [options]
+
+Hard-coded params:
+   -learning_rate : [0.001, 0.01]
+   -max_epochs    : [500]
+   -n_batch_size  : [32, 1024] (should be changed according to available memory)
+   -batch_type    : [0, 1] (random vs. sequential sampling)
+   -model_type    : [lstm, gru]
+
+Options:
+--train             Training file
+--valid             Validation file
+--hidden_size_init  50
+--hidden_size_end   800
+--hidden_size_step  200
+--n_layers_init     1
+--n_layers_end      4
+--n_layers_step     1
+--chunk_len_init    20
+--chunk_len_end     90
+--chunk_len_step    10
+--early_stopping    10
+--optimizer         adam
+--cuda
+```
+
+### TODO
+
+* [ ] Grid search (needs improvement)
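+
+### Loading a saved checkpoint
+
+Both `train.py` and `search_params.py` save the whole model object with `torch.save(decoder, path)`, so a saved network can be reloaded and sampled from along these lines (a minimal sketch; the checkpoint path below is only an example, point it at your own `.pt` file):
+
+```
+import torch
+from generate import generate  # generate(decoder, prime_str, predict_len, temperature, cuda)
+
+# Example path: checkpoints are written to Save/<modelname>/<dataset>_<modelname>_Checkpoint.pt
+decoder = torch.load('Save/mymodel/shakespeare_mymodel_Checkpoint.pt')
+
+# Pass cuda=True if the model was trained with --cuda
+print(generate(decoder, prime_str='Wh', predict_len=200, temperature=0.8, cuda=False))
+```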
+ +## DONE +* [x] Early stopping +* [x] Add Dropout (p) +* [x] Add Validation set to monitor overfitting +* [x] Saving model at checkpoint +* [x] Saving train and validation error, with training params to file +* [x] Refact to more OO paradigm diff --git a/generate.py b/generate.py index 0fdf414..4731141 100755 --- a/generate.py +++ b/generate.py @@ -1,37 +1,49 @@ #!/usr/bin/env python -# https://github.com/spro/char-rnn.pytorch +# https://github.com/zutotonno/char-rnn.pytorch import torch import os import argparse +import string from helpers import * from model import * +all_characters = string.printable +n_characters = len(all_characters) + def generate(decoder, prime_str='A', predict_len=100, temperature=0.8, cuda=False): hidden = decoder.init_hidden(1) prime_input = Variable(char_tensor(prime_str).unsqueeze(0)) if cuda: - hidden = hidden.cuda() + if isinstance(hidden, tuple): + hidden = (hidden[0].cuda(), hidden[1].cuda()) + else: + hidden = hidden.cuda() prime_input = prime_input.cuda() predicted = prime_str # Use priming string to "build up" hidden state for p in range(len(prime_str) - 1): - _, hidden = decoder(prime_input[:,p], hidden) + _, hidden = decoder(prime_input[:, p], hidden) - inp = prime_input[:,-1] + inp = prime_input[:, -1] for p in range(predict_len): + + output, hidden = decoder(inp, hidden) - + # Sample from the network as a multinomial distribution output_dist = output.data.view(-1).div(temperature).exp() top_i = torch.multinomial(output_dist, 1)[0] # Add predicted character to string and use as next input predicted_char = all_characters[top_i] + # if(predicted_char=='\n'): + # break + # else: predicted += predicted_char inp = Variable(char_tensor(predicted_char).unsqueeze(0)) if cuda: diff --git a/helpers.py b/helpers.py index abbd56a..6169ac5 100644 --- a/helpers.py +++ b/helpers.py @@ -1,19 +1,19 @@ # https://github.com/spro/char-rnn.pytorch import unidecode -import string import random import time import math import torch +import string # Reading and un-unicode-encoding data - all_characters = string.printable + n_characters = len(all_characters) def read_file(filename): - file = unidecode.unidecode(open(filename).read()) + file = unidecode.unidecode(open(filename, encoding="utf8").read()) return file, len(file) # Turning a string into a tensor diff --git a/model.py b/model.py index b619634..f6f4e85 100644 --- a/model.py +++ b/model.py @@ -1,24 +1,36 @@ -# https://github.com/spro/char-rnn.pytorch +# https://github.com/zutotonno/char-rnn.pytorch import torch import torch.nn as nn from torch.autograd import Variable class CharRNN(nn.Module): - def __init__(self, input_size, hidden_size, output_size, model="gru", n_layers=1): + def __init__(self, input_size, hidden_size, output_size, model="gru", n_layers=1, + dropout = 0, gpu = True, batch_size = 32, chunk_len = 30, learning_rate = 0.001, optimizer = "adam"): super(CharRNN, self).__init__() self.model = model.lower() self.input_size = input_size self.hidden_size = hidden_size self.output_size = output_size self.n_layers = n_layers + self.gpu = gpu + self.batch_size = batch_size + self.chunk_len = chunk_len + self.optimizer = optimizer self.encoder = nn.Embedding(input_size, hidden_size) if self.model == "gru": - self.rnn = nn.GRU(hidden_size, hidden_size, n_layers) + self.rnn = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout) elif self.model == "lstm": - self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers) + self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers, dropout=dropout) self.decoder = 
nn.Linear(hidden_size, output_size)
+        if self.optimizer == "adam":
+            self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
+        elif self.optimizer == "rms":
+            self.optimizer = torch.optim.RMSprop(self.parameters(), lr=learning_rate)
+        self.criterion = nn.CrossEntropyLoss()
+        if self.gpu:
+            self.cuda()
 
     def forward(self, input, hidden):
         batch_size = input.size(0)
@@ -35,7 +47,30 @@ def forward2(self, input, hidden):
 
     def init_hidden(self, batch_size):
         if self.model == "lstm":
-            return (Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size)),
+            return (Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size)),
                     Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size)))
         return Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))
+
+    # NOTE: this shadows nn.Module.train(); it is the per-batch training/validation step.
+    def train(self, inp, target, validation):
+        self.zero_grad()
+        loss = 0
+        hidden = self.init_hidden(self.batch_size)
+        if self.gpu:  # move the hidden state to the GPU only when the model itself is on the GPU
+            if self.model == "gru":
+                hidden = hidden.cuda()
+            else:
+                hidden = (hidden[0].cuda(), hidden[1].cuda())
+        for c in range(self.chunk_len):
+            output, hidden = self(inp[:, c], hidden)
+            loss += self.criterion(output.view(self.batch_size, -1), target[:, c])
+        ### The losses are averaged across observations for each minibatch (see doc CrossEntropyLoss)
+        if not validation:
+            loss.backward()
+            self.optimizer.step()
+        currentLoss = loss.item() / self.chunk_len
+        return currentLoss
+
+
+
+
diff --git a/search_params.py b/search_params.py
new file mode 100755
index 0000000..029d49d
--- /dev/null
+++ b/search_params.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python
+# https://github.com/zutotonno/char-rnn.pytorch
+
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+import argparse
+import os
+import json
+import numpy as np
+from tqdm import tqdm
+import string
+import itertools
+
+from helpers import *
+from model import *
+from generate import *
+
+def random_dataset(program_args,args,file,file_len):
+    inp = torch.LongTensor(args['batch_size'], args['chunk_len'])
+    target = torch.LongTensor(args['batch_size'], args['chunk_len'])
+    for bi in range(args['batch_size']):
+        start_index = random.randint(0, file_len - args['chunk_len'])
+
+        # while(file[start_index]!='\n'): # first word should be the actual start of a sentence.
+ # start_index = start_index+1 + + end_index = start_index + args['chunk_len'] + 1 + + if(end_index>file_len): # if we ended after the last char of the file, come back to get a correct chunk len + start_index = file_len-args['chunk_len']-1 + + chunk = file[start_index:end_index] + + inp[bi] = char_tensor(chunk[:-1]) + target[bi] = char_tensor(chunk[1:]) + inp = Variable(inp) + target = Variable(target) + if program_args.cuda: + inp = inp.cuda() + target = target.cuda() + return inp, target + + +def consequent_dataset(program_args,args, num_batches, file, file_len): + inp = torch.LongTensor(args['batch_size'], args['chunk_len']) + target = torch.LongTensor(args['batch_size'], args['chunk_len']) + end_index = args['chunk_len']*num_batches*args['batch_size'] + (args['batch_size']*num_batches) + end_reached = False + for bi in range(args['batch_size']): + start_index = end_index + + if (end_reached == True): + start_index = random.randint(0, file_len - args['chunk_len'] - 1) + + if (start_index + args['chunk_len'] + 1 > file_len): # if we ended after the last char of the file, come back to get a correct chunk len + start_index = file_len - args['chunk_len'] - 1 + end_reached = True + + end_index = start_index + args['chunk_len'] + 1 # Adding 1 to create target + chunk = file[start_index:end_index] + + inp[bi] = char_tensor(chunk[:-1]) + target[bi] = char_tensor(chunk[1:]) + inp = Variable(inp) + target = Variable(target) + if program_args.cuda: + inp = inp.cuda() + target = target.cuda() + return inp, target + + +def savemodel(modelName,args): + save_filename = 'Save/' + directoryCheckpoint = 'Save/'+modelName + if not os.path.exists(directoryCheckpoint): + os.makedirs(directoryCheckpoint) + if modelName is not None: + directoryCheckpoint +='/'+ os.path.splitext(os.path.basename(args.train))[0] +'_'+modelName+ '_Checkpoint' +'.pt' + else: + directoryCheckpoint +='/'+ os.path.splitext(os.path.basename(args.train))[0] + '_Checkpoint'+'.pt' + + torch.save(decoder, directoryCheckpoint) + + +def save(modelName,params,train_losses,valid_losses): + save_filename = 'Save/' + save_filename += modelName + + jsonName = save_filename + '.json' + with open(jsonName, 'w') as json_file: + json.dump(params, json_file) + saveLossesName = save_filename+'.csv' + if(valid_losses is not None): + np.savetxt(saveLossesName, np.column_stack((train_losses, valid_losses)), delimiter=",", fmt='%s', header='Train,Valid') + else: + np.savetxt(saveLossesName, train_losses, delimiter=",", fmt='%s', header='Train') + print('Saved as %s' % save_filename) + + +# Initialize models and start training + +if __name__ == '__main__': + + # Parse command line arguments + argparser = argparse.ArgumentParser() + argparser.add_argument('--train', type=str) + argparser.add_argument('--valid', type=str) + + argparser.add_argument('--hidden_size_init', type=int, default=50) + argparser.add_argument('--hidden_size_end', type=int, default=800) + argparser.add_argument('--hidden_size_step', type=int, default=200) + + argparser.add_argument('--n_layers_init', type=int, default=1) + argparser.add_argument('--n_layers_end', type=int, default=4) + argparser.add_argument('--n_layers_step', type=int, default=1) + + + argparser.add_argument('--chunk_len_init', type=int, default=20) + argparser.add_argument('--chunk_len_end', type=int, default=90) + argparser.add_argument('--chunk_len_step', type=int, default=10) + + argparser.add_argument('--early_stopping', type=int, default=10) + + argparser.add_argument('--cuda', action='store_true') + 
argparser.add_argument('--optimizer', type=str, default="adam") + argparser.add_argument('--print_every', type=int, default=10) + args = argparser.parse_args() + + if args.cuda: + print("Using CUDA") + + fileTrain, file_lenTrain = read_file(args.train) + try: + fileValid, file_lenValid = read_file(args.valid) + early_stopping_patience = args.early_stopping + except: + print('No validation data supplied') + + all_characters = string.printable + n_characters = len(all_characters) + + params_list = [] + + ##0 + n_epochs_list = [500] + params_list.append(n_epochs_list) + ##1 + n_hidden_list = list(range(args.hidden_size_init,args.hidden_size_end,args.hidden_size_step)) + params_list.append(n_hidden_list) + ##2 + n_layers_list = list(range(args.n_layers_init,args.n_layers_end,args.n_layers_step)) + params_list.append(n_layers_list) + ### TODO: + # n_dropout_list = [0,0.3] + # params_list.append(n_dropout_list) + + ##3 + n_chunk_len_list = list(range(args.chunk_len_init,args.chunk_len_end,args.chunk_len_step)) + params_list.append(n_chunk_len_list) + ##4 + n_batch_size_list = [32,1024] + params_list.append(n_batch_size_list) + ##5 + n_learning_rate_list = [0.001,0.01] + params_list.append(n_learning_rate_list) + ##6 + batch_type = [0,1] + params_list.append(batch_type) + ##7 + model_type = ['lstm','gru'] + params_list.append(model_type) + + param_combinations = list(itertools.product(*params_list)) + + currentCombination = 1 + for params in param_combinations : + param_dict = dict() + param_dict['model'] = params[-1] + param_dict['hidden_size'] = params[1] + param_dict['n_layers'] = params[2] + param_dict['learning_rate'] = params[5] + param_dict['chunk_len'] = params[3] + param_dict['batch_size'] = params[4] + + decoder = CharRNN( + input_size = n_characters, + output_size = n_characters, + **param_dict + ) + + + param_dict['batch_type'] = params[6] + param_dict['epochs'] = params[0] + train_losses = [] + valid_losses = [] + loss_avg = 0 + valid_loss_avg = 0 + start = time.time() + valid_loss_best = np.inf + patience = 1 + + try: + print("Training for %d epochs..." 
% param_dict['epochs'])
+            modelName = str(currentCombination)
+            print(param_dict)
+            numFileBatches = math.ceil(file_lenTrain/((param_dict['batch_size']*param_dict['chunk_len'])+param_dict['batch_size']))
+            numValidBatches = math.ceil(file_lenValid/((param_dict['batch_size']*param_dict['chunk_len'])+param_dict['batch_size']))
+            for epoch in tqdm(range(1, param_dict['epochs'] + 1)):
+                # end_index = 0
+                numBatches = 0
+                numBatchesValid = 0
+                loss_avg = 0
+                while(numBatches < numFileBatches) :
+                    if(param_dict['batch_type'] == 0): ### Sampling batches at random
+                        loss = decoder.train(*random_dataset(args,param_dict,fileTrain,file_lenTrain),validation=False)
+                    elif(param_dict['batch_type'] == 1): ### Get consequent batches of chars without replacement
+                        loss = decoder.train(*consequent_dataset(args, param_dict, numBatches,fileTrain, file_lenTrain),validation=False)
+                    loss_avg += loss
+                    numBatches += 1
+                loss_avg /= numFileBatches
+                train_losses.append(loss_avg)
+                if args.valid is not None:
+                    valid_loss_avg = 0
+                    while(numBatchesValid < numValidBatches) :
+                        valid_loss_avg += decoder.train(*consequent_dataset(args,param_dict,numBatchesValid,fileValid,file_lenValid),validation=True)
+                        numBatchesValid += 1
+                    valid_loss_avg /= numValidBatches
+                    valid_losses.append(valid_loss_avg)
+                    if valid_loss_avg < valid_loss_best:
+                        print("New best checkpoint: %.4f, old: %.4f" % (valid_loss_avg,valid_loss_best))
+                        savemodel(modelName, args)
+                        valid_loss_best = valid_loss_avg
+                        args.early_stopping = epoch
+                        patience = 1
+                    else:
+                        patience += 1
+                        if(patience >= early_stopping_patience):
+                            break
+                if epoch % args.print_every == 0:
+                    print('[%s (%d %d%%) Train: %.4f Valid: %.4f]' % (time_since(start), epoch, epoch / param_dict['epochs'] * 100, loss_avg, valid_loss_avg))
+                    print(generate(decoder, 'Renzi', 200, cuda=args.cuda), '\n')
+
+            print("Saving...")
+            param_dict['early_stopping'] = args.early_stopping
+            save(modelName,param_dict,train_losses,valid_losses)
+            currentCombination += 1
+        except KeyboardInterrupt:
+            print("Saving before quit...")
+            save(modelName, param_dict, train_losses, valid_losses)
+
diff --git a/train.py b/train.py
index 4d47bba..b77f44a 100755
--- a/train.py
+++ b/train.py
@@ -1,45 +1,36 @@
 #!/usr/bin/env python
-# https://github.com/spro/char-rnn.pytorch
+# https://github.com/zutotonno/char-rnn.pytorch
 
 import torch
 import torch.nn as nn
 from torch.autograd import Variable
 import argparse
 import os
-
+import json
+import numpy as np
 from tqdm import tqdm
+import string
 
 from helpers import *
 from model import *
 from generate import *
 
-# Parse command line arguments
-argparser = argparse.ArgumentParser()
-argparser.add_argument('filename', type=str)
-argparser.add_argument('--model', type=str, default="gru")
-argparser.add_argument('--n_epochs', type=int, default=2000)
-argparser.add_argument('--print_every', type=int, default=100)
-argparser.add_argument('--hidden_size', type=int, default=100)
-argparser.add_argument('--n_layers', type=int, default=2)
-argparser.add_argument('--learning_rate', type=float, default=0.01)
-argparser.add_argument('--chunk_len', type=int, default=200)
-argparser.add_argument('--batch_size', type=int, default=100)
-argparser.add_argument('--shuffle', action='store_true')
-argparser.add_argument('--cuda', action='store_true')
-args = argparser.parse_args()
-
-if args.cuda:
-    print("Using CUDA")
-
-file, file_len = read_file(args.filename)
-
-def random_training_set(chunk_len, batch_size):
-    inp = torch.LongTensor(batch_size, chunk_len)
-    target = torch.LongTensor(batch_size, chunk_len)
-    for bi in 
range(batch_size): - start_index = random.randint(0, file_len - chunk_len) - end_index = start_index + chunk_len + 1 +def random_dataset(args,file,file_len): + inp = torch.LongTensor(args.batch_size, args.chunk_len) + target = torch.LongTensor(args.batch_size, args.chunk_len) + for bi in range(args.batch_size): + start_index = random.randint(0, file_len - args.chunk_len) + + # while(file[start_index]!='\n'): # first word should be the actual start of a sentence. + # start_index = start_index+1 + + end_index = start_index + args.chunk_len + 1 + + if(end_index>file_len): # if we ended after the last char of the file, come back to get a correct chunk len + start_index = file_len-args.chunk_len-1 + chunk = file[start_index:end_index] + inp[bi] = char_tensor(chunk[:-1]) target[bi] = char_tensor(chunk[1:]) inp = Variable(inp) @@ -49,60 +40,180 @@ def random_training_set(chunk_len, batch_size): target = target.cuda() return inp, target -def train(inp, target): - hidden = decoder.init_hidden(args.batch_size) - if args.cuda: - hidden = hidden.cuda() - decoder.zero_grad() - loss = 0 - for c in range(args.chunk_len): - output, hidden = decoder(inp[:,c], hidden) - loss += criterion(output.view(args.batch_size, -1), target[:,c]) +def consequent_dataset(args, num_batches, file, file_len): + inp = torch.LongTensor(args.batch_size, args.chunk_len) + target = torch.LongTensor(args.batch_size, args.chunk_len) + end_index = args.chunk_len*num_batches*args.batch_size + (args.batch_size*num_batches) + end_reached = False + for bi in range(args.batch_size): + start_index = end_index + + if (end_reached == True): + start_index = random.randint(0, file_len - args.chunk_len - 1) + + if (start_index + args.chunk_len + 1 > file_len): # if we ended after the last char of the file, come back to get a correct chunk len + start_index = file_len - args.chunk_len - 1 + end_reached = True - loss.backward() - decoder_optimizer.step() + end_index = start_index + args.chunk_len + 1 # Adding 1 to create target + chunk = file[start_index:end_index] - return loss.data[0] / args.chunk_len + inp[bi] = char_tensor(chunk[:-1]) + target[bi] = char_tensor(chunk[1:]) + inp = Variable(inp) + target = Variable(target) + if args.cuda: + inp = inp.cuda() + target = target.cuda() + return inp, target -def save(): - save_filename = os.path.splitext(os.path.basename(args.filename))[0] + '.pt' +def save(args): + save_filename = 'Save/' + if modelName is not None: + save_filename += os.path.splitext(os.path.basename(args.train))[0] +'_'+modelName+ '.pt' + else: + save_filename += os.path.splitext(os.path.basename(args.train))[0] + '.pt' + + jsonName = save_filename + '.json' + with open(jsonName, 'w') as json_file: + json.dump(vars(args), json_file) + saveLossesName = save_filename+'.csv' + if(args.valid is not None): + np.savetxt(saveLossesName, np.column_stack((train_losses, valid_losses)), delimiter=",", fmt='%s', header='Train,Valid') + else: + np.savetxt(saveLossesName, train_losses, delimiter=",", fmt='%s', header='Train') torch.save(decoder, save_filename) print('Saved as %s' % save_filename) +def savemodel(args): + save_filename = 'Save/' + directoryCheckpoint = 'Save/'+modelName + if not os.path.exists(directoryCheckpoint): + os.makedirs(directoryCheckpoint) + if modelName is not None: + directoryCheckpoint +='/'+ os.path.splitext(os.path.basename(args.train))[0] +'_'+modelName+ '_Checkpoint' +'.pt' + else: + directoryCheckpoint +='/'+ os.path.splitext(os.path.basename(args.train))[0] + '_Checkpoint'+'.pt' + + torch.save(decoder, 
directoryCheckpoint) + + + + # Initialize models and start training -decoder = CharRNN( - n_characters, - args.hidden_size, - n_characters, - model=args.model, - n_layers=args.n_layers, -) -decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=args.learning_rate) -criterion = nn.CrossEntropyLoss() - -if args.cuda: - decoder.cuda() - -start = time.time() -all_losses = [] -loss_avg = 0 - -try: - print("Training for %d epochs..." % args.n_epochs) - for epoch in tqdm(range(1, args.n_epochs + 1)): - loss = train(*random_training_set(args.chunk_len, args.batch_size)) - loss_avg += loss - - if epoch % args.print_every == 0: - print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / args.n_epochs * 100, loss)) - print(generate(decoder, 'Wh', 100, cuda=args.cuda), '\n') - - print("Saving...") - save() - -except KeyboardInterrupt: - print("Saving before quit...") - save() +if __name__ == '__main__': + + # Parse command line arguments + argparser = argparse.ArgumentParser() + argparser.add_argument('--train', type=str) + argparser.add_argument('--valid', type=str) + argparser.add_argument('--model', type=str, default="gru") + argparser.add_argument('--n_epochs', type=int, default=2000) + argparser.add_argument('--print_every', type=int, default=100) + argparser.add_argument('--hidden_size', type=int, default=100) + argparser.add_argument('--n_layers', type=int, default=2) + argparser.add_argument('--dropout', type=float, default=0.3) + argparser.add_argument('--learning_rate', type=float, default=0.01) + argparser.add_argument('--chunk_len', type=int, default=200) + argparser.add_argument('--batch_size', type=int, default=100) + argparser.add_argument('--batch_type', type=int, default=0) + argparser.add_argument('--early_stopping', type=int, default=10) + argparser.add_argument('--optimizer', type=str, default="adam") + argparser.add_argument('--cuda', action='store_true') + argparser.add_argument('--modelname', type=str, default=None) + args = argparser.parse_args() + if args.cuda: + print("Using CUDA") + + fileTrain, file_lenTrain = read_file(args.train) + + numFileBatches = math.ceil(file_lenTrain/((args.batch_size*args.chunk_len)+args.batch_size)) + try: + fileValid, file_lenValid = read_file(args.valid) + numValidBatches = math.ceil(file_lenValid/((args.batch_size*args.chunk_len)+args.batch_size)) + early_stopping_patience = args.early_stopping + except: + print('No validation data supplied') + if(args.modelname is None): + print('No model name supplied -> Model checkpoint disabled') + modelName = args.modelname + + all_characters = string.printable + n_characters = len(all_characters) + + decoder = CharRNN( + n_characters, + args.hidden_size, + n_characters, + model=args.model, + n_layers=args.n_layers, + dropout=args.dropout, + learning_rate=args.learning_rate, + chunk_len= args.chunk_len, + batch_size=args.batch_size, + gpu = args.cuda, + optimizer = args.optimizer + ) + + + + + batch_type = args.batch_type + + start = time.time() + train_losses = [] + valid_losses = [] + valid_loss_best = np.inf + patience = 1 + try: + print("Training for %d epochs..." 
% args.n_epochs) + + for epoch in tqdm(range(1, args.n_epochs + 1)): + # end_index = 0 + numBatches = 0 + numBatchesValid = 0 + loss_avg = 0 + while(numBatches < numFileBatches) : + if(batch_type == 0): ### Sampling batches at random + loss = decoder.train(*random_dataset(args,fileTrain,file_lenTrain),validation=False) + elif(batch_type == 1): ### Get consequent batches of chars without replacement + loss = decoder.train(*consequent_dataset(args, numBatches,fileTrain, file_lenTrain),validation=False) + loss_avg += loss + numBatches += 1 + loss_avg /= numFileBatches + train_losses.append(loss_avg) + if args.valid is not None: + valid_loss_avg = 0 + while(numBatchesValid < numValidBatches) : + valid_loss_avg += decoder.train(*consequent_dataset(args,numBatchesValid,fileValid,file_lenValid),validation=True) + numBatchesValid += 1 + valid_loss_avg /= numValidBatches + valid_losses.append(valid_loss_avg) + if valid_loss_avg < valid_loss_best: + if(args.modelname is not None): + print("New best checkpoint: %.4f, old: %.4f" % (valid_loss_avg,valid_loss_best)) + savemodel(args) + valid_loss_best = valid_loss_avg + args.early_stopping = epoch + patience = 1 + else: + patience += 1 + if(patience >= early_stopping_patience): + break + + if epoch % args.print_every == 0: + if args.valid is not None: + print('[%s (%d %d%%) Train: %.4f Valid: %.4f]' % (time_since(start), epoch, epoch / args.n_epochs * 100, loss_avg, valid_loss_avg)) + else: + print('[%s (%d %d%%) Train: %.4f]' % (time_since(start), epoch, epoch / args.n_epochs * 100, loss_avg)) + print(generate(decoder, 'Renzi', 200, cuda=args.cuda), '\n') + + print("Saving...") + save(args) + + except KeyboardInterrupt: + print("Saving before quit...") + save(args)