
Commit cd055d3

Conflicts:
	main.py
vansky committed Sep 10, 2019
2 parents ab444ab + 1b90823 commit cd055d3
Showing 4 changed files with 68 additions and 43 deletions.
23 changes: 16 additions & 7 deletions data.py
@@ -34,17 +34,23 @@ class SentenceCorpus(object):
""" Loads train/dev/test corpora and dictionary """
def __init__(self, path, vocab_file, test_flag=False, interact_flag=False,
checkpoint_flag=False, predefined_vocab_flag=False, lower_flag=False,
collapse_nums_flag=False,
collapse_nums_flag=False,multisentence_test_flag=False,generate_flag=False,
trainfname='train.txt',
validfname='valid.txt',
testfname='test.txt'):
self.lower = lower_flag
self.collapse_nums = collapse_nums_flag
if not (test_flag or interact_flag or checkpoint_flag or predefined_vocab_flag):
if not (test_flag or interact_flag or checkpoint_flag or predefined_vocab_flag or generate_flag):
# training mode
self.dictionary = Dictionary()
self.train = self.tokenize(os.path.join(path, trainfname))
self.valid = self.tokenize_with_unks(os.path.join(path, validfname))
try:
# don't require a test set at train time,
# but if there is one, get a sense of whether unks will be required
self.test = self.tokenize_with_unks(os.path.join(path, testfname))
except:
pass
self.save_dict(vocab_file)
else:
# load pretrained model
@@ -55,7 +61,10 @@ def __init__(self, path, vocab_file, test_flag=False, interact_flag=False,
self.load_dict(vocab_file)
if test_flag:
# test mode
self.test = self.sent_tokenize_with_unks(os.path.join(path, testfname))
if multisentence_test_flag:
self.test = self.tokenize_with_unks(os.path.join(path, testfname))
else:
self.test = self.sent_tokenize_with_unks(os.path.join(path, testfname))
elif checkpoint_flag or predefined_vocab_flag:
# load from a checkpoint
self.train = self.tokenize_with_unks(os.path.join(path, trainfname))
@@ -78,7 +87,7 @@ def save_dict(self, path):

def load_dict(self, path):
""" Loads dictionary from disk """
assert os.path.exists(path)
assert os.path.exists(path), "Bad path: %s" % path
if path[-3:] == 'bin':
# This check actually seems to be faster than passing in a binary flag
# Assume dict is binarized
@@ -97,7 +106,7 @@ def load_dict(self, path):

def tokenize(self, path):
""" Tokenizes a text file. """
assert os.path.exists(path)
assert os.path.exists(path), "Bad path: %s" % path
# Add words to the dictionary
if path[-2:] == 'gz':
with gzip.open(path, 'rb') as file_handle:
@@ -217,7 +226,7 @@ def tokenize(self, path):

def tokenize_with_unks(self, path):
""" Tokenizes a text file, adding unks if needed. """
assert os.path.exists(path)
assert os.path.exists(path), "Bad path: %s" % path
if path[-2:] == 'gz':
# Determine the length of the corpus
with gzip.open(path, 'rb') as file_handle:
@@ -326,7 +335,7 @@ def tokenize_with_unks(self, path):

def sent_tokenize_with_unks(self, path):
""" Tokenizes a text file into sentences, adding unks if needed. """
assert os.path.exists(path)
assert os.path.exists(path), "Bad path: %s" % path
all_ids = []
sents = []
if path[-2:] == 'gz':
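
For orientation, a minimal sketch of how the reworked constructor might be invoked for stream-style testing. Only flags visible in this diff are used; the directory and file names are placeholders:

    import data

    # hypothetical paths; substitute a real corpus directory and vocab file
    corpus = data.SentenceCorpus('corpus_dir', 'vocab.bin',
                                 test_flag=True,
                                 multisentence_test_flag=True,  # tokenize the test file as one stream
                                 testfname='test.txt')
    stream_ids = corpus.test  # tokenize_with_unks output, not (sentences, ids) pairs
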
2 changes: 1 addition & 1 deletion generate.py
@@ -61,7 +61,7 @@
model.rnn.flatten_parameters()
model.eval()

corpus = data.SentenceCorpus(args.data_dir, args.vocab_file, True)
corpus = data.SentenceCorpus(args.data_dir, args.vocab_file, generate_flag=True)

ntokens = len(corpus.dictionary)
if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
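
The keyword argument matters here: with the widened __init__ signature, a bare positional True still binds to test_flag and tokenizes the test set sentence by sentence. As far as the data.py change above shows, generate_flag=True instead skips every tokenization pass:

    # generate_flag=True only loads the saved dictionary; no train, valid, or
    # test file is read, which is all that sampling from a trained LM needs.
    corpus = data.SentenceCorpus(args.data_dir, args.vocab_file, generate_flag=True)
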
74 changes: 43 additions & 31 deletions main.py
@@ -90,8 +90,12 @@
help='test a trained LM')
parser.add_argument('--load_checkpoint', action='store_true',
help='continue training a pre-trained LM')
parser.add_argument('--freeze_embedding', action='store_true',
help='do not train embedding weights')
parser.add_argument('--single', action='store_true',
help='use only a single GPU (even if more are available)')
parser.add_argument('--multisentence_test', action='store_true',
help='treat multiple sentences as a single stream at test time')

parser.add_argument('--adapt', action='store_true',
help='adapt model weights during evaluation')
@@ -101,6 +105,8 @@
help='which layer should output cell states')
parser.add_argument('--view_hidden', action='store_true',
help='output the hidden state rather than the cell state')
parser.add_argument('--verbose_view_layer', action='store_true',
help='output the input observation followed by the vector activations')

parser.add_argument('--words', action='store_true',
help='evaluate word-level complexities (instead of sentence-level loss)')
@@ -146,6 +152,10 @@
# If adapting, we must be in test mode
args.test = True

if args.view_layer != -1:
# There shouldn't be a cheader if we're looking at model internals
args.nocheader = True

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
@@ -197,14 +207,18 @@ def batchify(data, bsz):
checkpoint_flag=args.load_checkpoint,
predefined_vocab_flag=args.predefined_vocab_flag,
collapse_nums_flag=args.collapse_nums_flag,
multisentence_test_flag=args.multisentence_test,
lower_flag=args.lowercase,
trainfname=args.trainfname,
validfname=args.validfname,
testfname=args.testfname)

if not args.interact:
if args.test:
test_sents, test_data = corpus.test
if args.multisentence_test:
test_data = [corpus.test]
else:
test_sents, test_data = corpus.test
else:
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, args.batch_size)
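
For readers tracing the data shapes, an annotated restatement of the branch above; the shapes are inferred from data.py rather than guaranteed by this diff:

    if args.multisentence_test:
        # corpus.test is a single tensor of token ids covering the whole test
        # file, so wrap it in a list for the per-item loop in test_evaluate
        test_data = [corpus.test]
    else:
        # corpus.test is a (sentences, id_tensors) pair, one entry per sentence
        test_sents, test_data = corpus.test
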
@@ -226,18 +240,18 @@ def batchify(data, bsz):
ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
args.nlayers, embedding_file=args.embedding_file,
dropout=args.dropout, tie_weights=args.tied).to(device)
dropout=args.dropout, tie_weights=args.tied,
freeze_embedding=args.freeze_embedding).to(device)

# after load the rnn params are not a continuous chunk of memory
# this makes them a continuous chunk, and will speed up forward pass
# if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
# model.module.rnn.flatten_parameters()
# else:
if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
# If applicable, use multi-gpu for training
# Scatters minibatches (in dim=1) across available GPUs
model = nn.DataParallel(model, dim=1)
if isinstance(model, torch.nn.DataParallel):
# if multi-gpu, access real model for training
model = model.module
# after load the rnn params are not a continuous chunk of memory
# this makes them a continuous chunk, and will speed up forward pass
model.rnn.flatten_parameters()

criterion = nn.CrossEntropyLoss()
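
The unwrapping above exists because nn.DataParallel only forwards the call interface, not arbitrary attributes of the wrapped module; fields like rnn have to be reached through .module. A small helper sketch of that access pattern (the helper name is mine, and .rnn matches RNNModel in model.py):

    import torch.nn as nn

    def unwrap_and_flatten(model):
        """Reach the underlying module, then make its RNN weights contiguous
        so the forward pass runs faster after a reload."""
        rnn_model = model.module if isinstance(model, nn.DataParallel) else model
        rnn_model.rnn.flatten_parameters()
        return rnn_model
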
@@ -418,11 +432,8 @@ def test_evaluate(test_sentences, data_source):
for i in range(len(data_source)):
sent_ids = data_source[i].to(device)
# We predict all words but the first, so determine loss for those
sent = test_sentences[i]
# if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
# # "module" is necessary when using DataParallel
# hidden = model.module.init_hidden(1) # number of parallel sentences being processed
# else:
if test_sentences:
sent = test_sentences[i]
hidden = model.init_hidden(1) # number of parallel sentences being processed
data, targets = test_get_batch(sent_ids)
if args.view_layer >= 0:
@@ -437,9 +448,12 @@
output_flat = output.view(-1, ntokens)
loss = criterion(output_flat, target)
total_loss += loss.item()
input_word = corpus.dictionary.idx2word[int(word_input.data)]
targ_word = corpus.dictionary.idx2word[int(target.data)]
nwords += 1
if targ_word != '<eos>':
if input_word != '<eos>': # not in (input_word,targ_word):
if args.verbose_view_layer:
print(input_word,end=" ")
# don't output <eos> markers to align with input
# output raw activations
if args.view_hidden:
@@ -451,15 +465,22 @@
else:
data = data.unsqueeze(1) # only needed when a single sentence is being processed
output, hidden = model(data, hidden)
output_flat = output.view(-1, ntokens)
try:
output_flat = output.view(-1, ntokens)
except RuntimeError:
print("Vocabulary Error! Most likely there weren't unks in training and unks are now needed for testing")
raise
loss = criterion(output_flat, targets)
total_loss += loss.item()
if args.words:
# output word-level complexity metrics
get_complexity(output_flat, targets, i)
else:
# output sentence-level loss
print(str(sent)+":"+str(loss.item()))
if test_sentences:
print(str(sent)+":"+str(loss.item()))
else:
print(str(loss.item()))

if args.adapt:
loss.backward()
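
The try/except around view() above is effectively a shape check: view(-1, ntokens) only succeeds when the model's output width matches len(corpus.dictionary). A self-contained illustration of the failure mode, with made-up sizes:

    import torch

    ntokens_train = 10000                        # decoder built over the training vocabulary
    ntokens_test = 10001                         # dictionary after an <unk> entry was added
    output = torch.zeros(7, 1, ntokens_train)    # (seq_len, batch, vocab) logits

    output.view(-1, ntokens_train)               # fine: 70000 elements split evenly
    try:
        output.view(-1, ntokens_test)            # RuntimeError: 70000 is not divisible by 10001
    except RuntimeError as err:
        print("vocabulary/shape mismatch:", err)
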
@@ -488,10 +509,6 @@ def evaluate(data_source):
model.eval()
total_loss = 0.
ntokens = len(corpus.dictionary)
# if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
# # "module" is necessary when using DataParallel
# hidden = model.module.init_hidden(args.batch_size)
# else:
hidden = model.init_hidden(args.batch_size)
with torch.no_grad():
for i in range(0, data_source.size(0) - 1, args.bptt):
@@ -509,10 +526,6 @@ def train():
total_loss = 0.
start_time = time.time()
ntokens = len(corpus.dictionary)
#if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
# "module" is necessary when using DataParallel
# hidden = model.module.init_hidden(args.batch_size)
#else:
hidden = model.init_hidden(args.batch_size)
for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
data, targets = get_batch(train_data, i)
@@ -556,9 +569,9 @@ def train():
train()
val_loss = evaluate(val_data)
print('-' * 89)
print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
print('| end of epoch {:3d} | time: {:5.2f}s | lr: {:4.8f} | '
'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
val_loss, math.exp(val_loss)))
lr, math.exp(val_loss)))
print('-' * 89)
# Save the model if the validation loss is the best we've seen so far.
if not best_val_loss or val_loss < best_val_loss:
@@ -593,12 +606,8 @@ def train():
# after load the rnn params are not a continuous chunk of memory
# this makes them a continuous chunk, and will speed up forward pass
if isinstance(model, torch.nn.DataParallel):
# if multi-gpu, access real model for testing
model = model.module
# if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
# model.module.rnn.flatten_parameters()
# else:
# if isinstance(model, torch.nn.DataParallel):
# model = model.module
model.rnn.flatten_parameters()

# Run on test data.
@@ -639,7 +648,10 @@ def train():
except KeyboardInterrupt:
print(' ')
else:
test_loss = test_evaluate(test_sents, test_data)
if args.multisentence_test:
test_loss = test_evaluate(None, test_data)
else:
test_loss = test_evaluate(test_sents, test_data)
if args.adapt:
with open(args.adapted_model, 'wb') as f:
torch.save(model, f)
12 changes: 8 additions & 4 deletions model.py
@@ -8,7 +8,7 @@ class RNNModel(nn.Module):
"""Container module with an encoder, a recurrent module, and a decoder."""

def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers,
embedding_file=None, dropout=0.5, tie_weights=False):
embedding_file=None, dropout=0.5, tie_weights=False, freeze_embedding=False):
super(RNNModel, self).__init__()
self.drop = nn.Dropout(dropout)
if embedding_file:
@@ -28,7 +28,10 @@ def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers,
self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
self.decoder = nn.Linear(nhid, ntoken)

self.init_weights()
self.init_weights(freeze_embedding)
if freeze_embedding:
for param in self.encoder.parameters():
param.requires_grad = False

# Optionally tie weights as in:
# "Using the Output Embedding to Improve Language Models" (Press & Wolf 2017)
@@ -46,10 +49,11 @@ def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers,
self.nhid = nhid
self.nlayers = nlayers

def init_weights(self):
def init_weights(self, freeze_embedding):
""" Initialize encoder and decoder weights """
initrange = 0.1
self.encoder.weight.data.uniform_(-initrange, initrange)
if not freeze_embedding:
self.encoder.weight.data.uniform_(-initrange, initrange)
self.decoder.bias.data.fill_(0)
self.decoder.weight.data.uniform_(-initrange, initrange)

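
Taken together, the model.py changes freeze a pretrained embedding by skipping its random re-initialization and switching off its gradients. A compact sketch of the same idea with a stand-in embedding layer (sizes are illustrative):

    import torch
    import torch.nn as nn

    encoder = nn.Embedding(10000, 200)   # stands in for the model's encoder

    # keep whatever weights were loaded and exclude them from training
    for param in encoder.parameters():
        param.requires_grad = False

    # a frozen layer still works in the forward pass
    ids = torch.tensor([[1, 2, 3]])
    vectors = encoder(ids)               # shape: (1, 3, 200)
    print(vectors.requires_grad)         # False: no gradient flows into the embedding
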
