More modular approach and some novelties in params and training #16

Open · wants to merge 27 commits into master
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
__pycache__/
.idea/
*.pt
Save/
68 changes: 58 additions & 10 deletions README.md
@@ -9,28 +9,36 @@ Download [this Shakespeare dataset](https://raw.githubusercontent.com/karpathy/c
Run `train.py` with the dataset filename to train and save the network:

```
> python train.py shakespeare.txt
> python train.py --train shakespeare.txt

Training for 2000 epochs...
(... 10 minutes later ...)
Saved as shakespeare.pt
```
After training, the model will be saved as `[filename].pt`.
In addition, every `--print_every` epochs a model checkpoint is saved to the `Save/` folder, which must exist in the directory from which `train.py` is run.
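As a rough sketch of what that checkpointing amounts to (illustrative only, not necessarily the exact code in `train.py`):

```
# Illustrative checkpointing sketch; the names epoch, print_every,
# model_name and decoder are hypothetical stand-ins for what train.py uses.
import os
import torch

if epoch % print_every == 0:
    checkpoint_path = os.path.join('Save', '%s_epoch_%d.pt' % (model_name, epoch))
    torch.save(decoder, checkpoint_path)  # Save/ must already exist
```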

### Training options

```
Usage: train.py [filename] [options]
Usage: train.py [options]

Options:
--model Whether to use LSTM or GRU units gru
--n_epochs Number of epochs to train 2000
--print_every Log learning rate at this interval 100
--hidden_size Hidden size of GRU 50
--n_layers Number of GRU layers 2
--learning_rate Learning rate 0.01
--chunk_len Length of training chunks 200
--batch_size Number of examples per batch 100
--train Train data
--valid Validation data
--model Whether to use LSTM or GRU units gru
--n_epochs Number of epochs to train 10
--print_every Log learning rate at this interval 100
--hidden_size Hidden size of GRU 50
--n_layers Number of GRU layers 2
--learning_rate Learning rate 0.01
--chunk_len Length of training chunks 200
--batch_size Number of examples per batch 100
--batch_type Batch random (0) or sequential (1) 0
--drop_out Dropout rate between recurrent layers 0
--early_stopping Number of validation steps with no improvement 10
--model_name Model (session) name, used in checkpoint files

--cuda Use CUDA
```
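The `--valid` and `--early_stopping` options work together: training stops once the validation loss has not improved for the given number of validation steps. A minimal sketch of that logic (an assumption, not necessarily the PR's exact implementation):

```
# Patience-based early stopping as suggested by --early_stopping.
# decoder.train(...) follows the CharRNN.train signature in model.py;
# the *_inp/*_target tensors and early_stopping value are hypothetical.
best_val_loss = float('inf')
steps_without_improvement = 0

for epoch in range(1, n_epochs + 1):
    train_loss = decoder.train(train_inp, train_target, validation=False)
    val_loss = decoder.train(valid_inp, valid_target, validation=True)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        steps_without_improvement = 0
    else:
        steps_without_improvement += 1
        if steps_without_improvement >= early_stopping:
            print('Stopping early at epoch %d' % epoch)
            break
```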

@@ -59,3 +67,43 @@ Options:
--cuda Use CUDA
```


### Grid search
```
Usage: search_params.py --train [options]

Hard-coded params:
 -learning_rate : [0.001, 0.01]
 -max_epochs : [500]
 -n_batch_size : [32, 1024] (should be changed according to available memory)
 -batch_type : [0, 1] (random vs. sequential sampling)
 -model_type : [lstm, gru]

Options:
--train training file
--valid validation file
--hidden_size_init 50
--hidden_size_end 800
--hidden_size_step 200
--n_layer_init 1
--n_layer_end 4
--n_layer_step 1
--chunk_len_init 20
--chunk_len_end 90
--chunk_len_step 10
--early_stopping 10
--optimizer adam
--cuda
```
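The search itself is essentially a nested sweep over the hard-coded lists and the ranged options above; a minimal sketch (not the exact code in `search_params.py`) looks like this:

```
# Illustrative grid over the parameters listed above; the real
# search_params.py may structure this differently.
import itertools

learning_rates = [0.001, 0.01]
batch_sizes = [32, 1024]
batch_types = [0, 1]            # 0 = random chunks, 1 = sequential chunks
model_types = ['lstm', 'gru']
hidden_sizes = range(50, 800 + 1, 200)
layer_counts = range(1, 4 + 1, 1)
chunk_lens = range(20, 90 + 1, 10)

for lr, bs, bt, mt, hs, nl, cl in itertools.product(
        learning_rates, batch_sizes, batch_types, model_types,
        hidden_sizes, layer_counts, chunk_lens):
    # Build a CharRNN with this configuration, train it with early
    # stopping on the validation file, and log the losses to file.
    print('lr=%s batch=%s type=%s model=%s hidden=%s layers=%s chunk=%s'
          % (lr, bs, bt, mt, hs, nl, cl))
```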

### TODO

* [ ] Grid search (needs improvement)

## DONE
* [x] Early stopping
* [x] Add Dropout (p)
* [x] Add Validation set to monitor overfitting
* [x] Saving model at checkpoint
* [x] Saving train and validation error, with training params to file
* [x] Refactor to a more object-oriented paradigm
22 changes: 17 additions & 5 deletions generate.py
@@ -1,37 +1,49 @@
#!/usr/bin/env python
# https://github.com/spro/char-rnn.pytorch
# https://github.com/zutotonno/char-rnn.pytorch

import torch
import os
import argparse
import string

from helpers import *
from model import *

all_characters = string.printable
n_characters = len(all_characters)

def generate(decoder, prime_str='A', predict_len=100, temperature=0.8, cuda=False):
hidden = decoder.init_hidden(1)
prime_input = Variable(char_tensor(prime_str).unsqueeze(0))

if cuda:
hidden = hidden.cuda()
if isinstance(hidden, tuple):
hidden = (hidden[0].cuda(), hidden[1].cuda())
else:
hidden = hidden.cuda()
prime_input = prime_input.cuda()
predicted = prime_str

# Use priming string to "build up" hidden state
for p in range(len(prime_str) - 1):
_, hidden = decoder(prime_input[:,p], hidden)
_, hidden = decoder(prime_input[:, p], hidden)

inp = prime_input[:,-1]
inp = prime_input[:, -1]

for p in range(predict_len):


output, hidden = decoder(inp, hidden)

# Sample from the network as a multinomial distribution
output_dist = output.data.view(-1).div(temperature).exp()
top_i = torch.multinomial(output_dist, 1)[0]

# Add predicted character to string and use as next input
predicted_char = all_characters[top_i]
# if(predicted_char=='\n'):
# break
# else:
predicted += predicted_char
inp = Variable(char_tensor(predicted_char).unsqueeze(0))
if cuda:
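For reference, a checkpoint produced by `train.py` can be fed back into `generate()` roughly like this (the checkpoint name and priming string are illustrative):

```
# Hypothetical usage of generate() with a saved model.
import torch
from generate import generate

decoder = torch.load('shakespeare.pt')
print(generate(decoder, prime_str='Wh', predict_len=200,
               temperature=0.8, cuda=False))
```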
6 changes: 3 additions & 3 deletions helpers.py
@@ -1,19 +1,19 @@
# https://github.com/spro/char-rnn.pytorch

import unidecode
import string
import random
import time
import math
import torch
import string

# Reading and un-unicode-encoding data

all_characters = string.printable

n_characters = len(all_characters)

def read_file(filename):
file = unidecode.unidecode(open(filename).read())
file = unidecode.unidecode(open(filename, encoding="utf8").read())
return file, len(file)

# Turning a string into a tensor
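For reference, `read_file()` returns the whole ASCII-transliterated corpus as a single string together with its length:

```
# Assumed usage of read_file(); the filename is illustrative.
from helpers import read_file

file, file_len = read_file('shakespeare.txt')
print('loaded %d characters' % file_len)
```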
45 changes: 40 additions & 5 deletions model.py
@@ -1,24 +1,36 @@
# https://github.com/spro/char-rnn.pytorch
# https://github.com/zutotonno/char-rnn.pytorch

import torch
import torch.nn as nn
from torch.autograd import Variable

class CharRNN(nn.Module):
def __init__(self, input_size, hidden_size, output_size, model="gru", n_layers=1):
def __init__(self, input_size, hidden_size, output_size, model="gru", n_layers=1,
dropout = 0, gpu = True, batch_size = 32, chunk_len = 30, learning_rate = 0.001, optimizer = "adam"):
super(CharRNN, self).__init__()
self.model = model.lower()
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.n_layers = n_layers
self.gpu = gpu
self.batch_size = batch_size
self.chunk_len = chunk_len
self.optimizer = optimizer

self.encoder = nn.Embedding(input_size, hidden_size)
if self.model == "gru":
self.rnn = nn.GRU(hidden_size, hidden_size, n_layers)
self.rnn = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout)
elif self.model == "lstm":
self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers)
self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers, dropout=dropout)
self.decoder = nn.Linear(hidden_size, output_size)
if self.optimizer == "adam":
self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
elif self.optimizer == "rms":
self.optimizer = torch.optim.RMSprop(self.parameters(), lr=learning_rate)
self.criterion = nn.CrossEntropyLoss()
if self.gpu:
self.cuda()

def forward(self, input, hidden):
batch_size = input.size(0)
@@ -35,7 +47,30 @@ def forward2(self, input, hidden):

def init_hidden(self, batch_size):
if self.model == "lstm":
return (Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size)),
return (Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size)),
Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size)))
return Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))


def train(self, inp, target, validation):
self.zero_grad()
loss = 0
hidden = self.init_hidden(self.batch_size)
if self.gpu:
if self.model == "gru":
hidden = hidden.cuda()
else:
hidden = (hidden[0].cuda(), hidden[1].cuda())
for c in range(self.chunk_len):
output, hidden = self(inp[:, c], hidden)
loss += self.criterion(output.view(self.batch_size, -1), target[:, c])
### The losses are averaged across observations for each minibatch (see doc CrossEntropyLoss)
if not validation:
loss.backward()
self.optimizer.step()
currentLoss = loss.item()/ self.chunk_len
return currentLoss




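To make the new `train()` method concrete, a minimal driver loop might look like the following (the `random_training_set` helper and the hyperparameter values are assumptions, not taken verbatim from the PR):

```
# Hypothetical driver for CharRNN.train(); random_training_set is assumed
# to return (input, target) LongTensors of shape (batch_size, chunk_len),
# as in the original char-rnn.pytorch.
import string

n_characters = len(string.printable)   # as in helpers.py

decoder = CharRNN(n_characters, hidden_size=50, output_size=n_characters,
                  model='gru', n_layers=2, dropout=0.1, gpu=False,
                  batch_size=100, chunk_len=200, learning_rate=0.01,
                  optimizer='adam')

for epoch in range(1, 11):
    inp, target = random_training_set(chunk_len=200, batch_size=100)
    loss = decoder.train(inp, target, validation=False)  # backward + optimizer step
    print('epoch %d  train loss %.4f' % (epoch, loss))
```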