
Commit cd055d3

Conflicts:
	main.py
vansky committed Sep 10, 2019
2 parents ab444ab + 1b90823 commit cd055d3
Showing 4 changed files with 68 additions and 43 deletions.
23 changes: 16 additions & 7 deletions data.py
@@ -34,17 +34,23 @@ class SentenceCorpus(object):
""" Loads train/dev/test corpora and dictionary """
def __init__(self, path, vocab_file, test_flag=False, interact_flag=False,
checkpoint_flag=False, predefined_vocab_flag=False, lower_flag=False,
collapse_nums_flag=False,
collapse_nums_flag=False,multisentence_test_flag=False,generate_flag=False,
trainfname='train.txt',
validfname='valid.txt',
testfname='test.txt'):
self.lower = lower_flag
self.collapse_nums = collapse_nums_flag
if not (test_flag or interact_flag or checkpoint_flag or predefined_vocab_flag):
if not (test_flag or interact_flag or checkpoint_flag or predefined_vocab_flag or generate_flag):
# training mode
self.dictionary = Dictionary()
self.train = self.tokenize(os.path.join(path, trainfname))
self.valid = self.tokenize_with_unks(os.path.join(path, validfname))
try:
# don't require a test set at train time,
# but if there is one, get a sense of whether unks will be required
self.test = self.tokenize_with_unks(os.path.join(path, testfname))
except:
pass
self.save_dict(vocab_file)
else:
# load pretrained model
@@ -55,7 +61,10 @@ def __init__(self, path, vocab_file, test_flag=False, interact_flag=False,
self.load_dict(vocab_file)
if test_flag:
# test mode
self.test = self.sent_tokenize_with_unks(os.path.join(path, testfname))
if multisentence_test_flag:
self.test = self.tokenize_with_unks(os.path.join(path, testfname))
else:
self.test = self.sent_tokenize_with_unks(os.path.join(path, testfname))
elif checkpoint_flag or predefined_vocab_flag:
# load from a checkpoint
self.train = self.tokenize_with_unks(os.path.join(path, trainfname))
@@ -78,7 +87,7 @@ def save_dict(self, path):

def load_dict(self, path):
""" Loads dictionary from disk """
assert os.path.exists(path)
assert os.path.exists(path), "Bad path: %s" % path
if path[-3:] == 'bin':
# This check actually seems to be faster than passing in a binary flag
# Assume dict is binarized
@@ -97,7 +106,7 @@ def load_dict(self, path):

def tokenize(self, path):
""" Tokenizes a text file. """
assert os.path.exists(path)
assert os.path.exists(path), "Bad path: %s" % path
# Add words to the dictionary
if path[-2:] == 'gz':
with gzip.open(path, 'rb') as file_handle:
@@ -217,7 +226,7 @@ def tokenize(self, path):

def tokenize_with_unks(self, path):
""" Tokenizes a text file, adding unks if needed. """
assert os.path.exists(path)
assert os.path.exists(path), "Bad path: %s" % path
if path[-2:] == 'gz':
# Determine the length of the corpus
with gzip.open(path, 'rb') as file_handle:
@@ -326,7 +335,7 @@ def tokenize_with_unks(self, path):

def sent_tokenize_with_unks(self, path):
""" Tokenizes a text file into sentences, adding unks if needed. """
assert os.path.exists(path)
assert os.path.exists(path), "Bad path: %s" % path
all_ids = []
sents = []
if path[-2:] == 'gz':
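
For orientation, a minimal sketch of how the reworked constructor might be invoked for stream-style testing. Only flags visible in this diff are used; the directory and file names are placeholders:

    import data

    # hypothetical paths; substitute a real corpus directory and vocab file
    corpus = data.SentenceCorpus('corpus_dir', 'vocab.bin',
                                 test_flag=True,
                                 multisentence_test_flag=True,  # tokenize the test file as one stream
                                 testfname='test.txt')
    stream_ids = corpus.test  # tokenize_with_unks output, not (sentences, ids) pairs
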
2 changes: 1 addition & 1 deletion generate.py
@@ -61,7 +61,7 @@
model.rnn.flatten_parameters()
model.eval()

corpus = data.SentenceCorpus(args.data_dir, args.vocab_file, True)
corpus = data.SentenceCorpus(args.data_dir, args.vocab_file, generate_flag=True)

ntokens = len(corpus.dictionary)
if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
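
The keyword argument matters here: with the widened __init__ signature, a bare positional True still binds to test_flag and tokenizes the test set sentence by sentence. As far as the data.py change above shows, generate_flag=True instead skips every tokenization pass:

    # generate_flag=True only loads the saved dictionary; no train, valid, or
    # test file is read, which is all that sampling from a trained LM needs.
    corpus = data.SentenceCorpus(args.data_dir, args.vocab_file, generate_flag=True)
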
74 changes: 43 additions & 31 deletions main.py
@@ -90,8 +90,12 @@
help='test a trained LM')
parser.add_argument('--load_checkpoint', action='store_true',
help='continue training a pre-trained LM')
parser.add_argument('--freeze_embedding', action='store_true',
help='do not train embedding weights')
parser.add_argument('--single', action='store_true',
help='use only a single GPU (even if more are available)')
parser.add_argument('--multisentence_test', action='store_true',
help='treat multiple sentences as a single stream at test time')

parser.add_argument('--adapt', action='store_true',
help='adapt model weights during evaluation')
@@ -101,6 +105,8 @@
help='which layer should output cell states')
parser.add_argument('--view_hidden', action='store_true',
help='output the hidden state rather than the cell state')
parser.add_argument('--verbose_view_layer', action='store_true',
help='output the input observation followed by the vector activations')

parser.add_argument('--words', action='store_true',
help='evaluate word-level complexities (instead of sentence-level loss)')
@@ -146,6 +152,10 @@
# If adapting, we must be in test mode
args.test = True

if args.view_layer != -1:
# There shouldn't be a cheader if we're looking at model internals
args.nocheader = True

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
@@ -197,14 +207,18 @@ def batchify(data, bsz):
checkpoint_flag=args.load_checkpoint,
predefined_vocab_flag=args.predefined_vocab_flag,
collapse_nums_flag=args.collapse_nums_flag,
multisentence_test_flag=args.multisentence_test,
lower_flag=args.lowercase,
trainfname=args.trainfname,
validfname=args.validfname,
testfname=args.testfname)

if not args.interact:
if args.test:
test_sents, test_data = corpus.test
if args.multisentence_test:
test_data = [corpus.test]
else:
test_sents, test_data = corpus.test
else:
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, args.batch_size)
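
For readers tracing the data shapes, an annotated restatement of the branch above; the shapes are inferred from data.py rather than guaranteed by this diff:

    if args.multisentence_test:
        # corpus.test is a single tensor of token ids covering the whole test
        # file, so wrap it in a list for the per-item loop in test_evaluate
        test_data = [corpus.test]
    else:
        # corpus.test is a (sentences, id_tensors) pair, one entry per sentence
        test_sents, test_data = corpus.test
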
@@ -226,18 +240,18 @@ def batchify(data, bsz):
ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
args.nlayers, embedding_file=args.embedding_file,
dropout=args.dropout, tie_weights=args.tied).to(device)
dropout=args.dropout, tie_weights=args.tied,
freeze_embedding=args.freeze_embedding).to(device)

# after load the rnn params are not a continuous chunk of memory
# this makes them a continuous chunk, and will speed up forward pass
# if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
# model.module.rnn.flatten_parameters()
# else:
if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
# If applicable, use multi-gpu for training
# Scatters minibatches (in dim=1) across available GPUs
model = nn.DataParallel(model, dim=1)
if isinstance(model, torch.nn.DataParallel):
# if multi-gpu, access real model for training
model = model.module
# after load the rnn params are not a continuous chunk of memory
# this makes them a continuous chunk, and will speed up forward pass
model.rnn.flatten_parameters()

criterion = nn.CrossEntropyLoss()
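
The unwrapping above exists because nn.DataParallel only forwards the call interface, not arbitrary attributes of the wrapped module; fields like rnn have to be reached through .module. A small helper sketch of that access pattern (the helper name is mine, and .rnn matches RNNModel in model.py):

    import torch.nn as nn

    def unwrap_and_flatten(model):
        """Reach the underlying module, then make its RNN weights contiguous
        so the forward pass runs faster after a reload."""
        rnn_model = model.module if isinstance(model, nn.DataParallel) else model
        rnn_model.rnn.flatten_parameters()
        return rnn_model
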
@@ -418,11 +432,8 @@ def test_evaluate(test_sentences, data_source):
for i in range(len(data_source)):
sent_ids = data_source[i].to(device)
# We predict all words but the first, so determine loss for those
sent = test_sentences[i]
# if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
# # "module" is necessary when using DataParallel
# hidden = model.module.init_hidden(1) # number of parallel sentences being processed
# else:
if test_sentences:
sent = test_sentences[i]
hidden = model.init_hidden(1) # number of parallel sentences being processed
data, targets = test_get_batch(sent_ids)
if args.view_layer >= 0:
@@ -437,9 +448,12 @@
output_flat = output.view(-1, ntokens)
loss = criterion(output_flat, target)
total_loss += loss.item()
input_word = corpus.dictionary.idx2word[int(word_input.data)]
targ_word = corpus.dictionary.idx2word[int(target.data)]
nwords += 1
if targ_word != '<eos>':
if input_word != '<eos>': # not in (input_word,targ_word):
if args.verbose_view_layer:
print(input_word,end=" ")
# don't output <eos> markers to align with input
# output raw activations
if args.view_hidden:
@@ -451,15 +465,22 @@
else:
data = data.unsqueeze(1) # only needed when a single sentence is being processed
output, hidden = model(data, hidden)
output_flat = output.view(-1, ntokens)
try:
output_flat = output.view(-1, ntokens)
except RuntimeError:
print("Vocabulary Error! Most likely there weren't unks in training and unks are now needed for testing")
raise
loss = criterion(output_flat, targets)
total_loss += loss.item()
if args.words:
# output word-level complexity metrics
get_complexity(output_flat, targets, i)
else:
# output sentence-level loss
print(str(sent)+":"+str(loss.item()))
if test_sentences:
print(str(sent)+":"+str(loss.item()))
else:
print(str(loss.item()))

if args.adapt:
loss.backward()
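
The try/except around view() above is effectively a shape check: view(-1, ntokens) only succeeds when the model's output width matches len(corpus.dictionary). A self-contained illustration of the failure mode, with made-up sizes:

    import torch

    ntokens_train = 10000                        # decoder built over the training vocabulary
    ntokens_test = 10001                         # dictionary after an <unk> entry was added
    output = torch.zeros(7, 1, ntokens_train)    # (seq_len, batch, vocab) logits

    output.view(-1, ntokens_train)               # fine: 70000 elements split evenly
    try:
        output.view(-1, ntokens_test)            # RuntimeError: 70000 is not divisible by 10001
    except RuntimeError as err:
        print("vocabulary/shape mismatch:", err)
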
@@ -488,10 +509,6 @@ def evaluate(data_source):
model.eval()
total_loss = 0.
ntokens = len(corpus.dictionary)
# if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
# # "module" is necessary when using DataParallel
# hidden = model.module.init_hidden(args.batch_size)
# else:
hidden = model.init_hidden(args.batch_size)
with torch.no_grad():
for i in range(0, data_source.size(0) - 1, args.bptt):
@@ -509,10 +526,6 @@ def train():
total_loss = 0.
start_time = time.time()
ntokens = len(corpus.dictionary)
#if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
# "module" is necessary when using DataParallel
# hidden = model.module.init_hidden(args.batch_size)
#else:
hidden = model.init_hidden(args.batch_size)
for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
data, targets = get_batch(train_data, i)
@@ -556,9 +569,9 @@ def train():
train()
val_loss = evaluate(val_data)
print('-' * 89)
print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
print('| end of epoch {:3d} | time: {:5.2f}s | lr: {:4.8f} | '
'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
val_loss, math.exp(val_loss)))
lr, math.exp(val_loss)))
print('-' * 89)
# Save the model if the validation loss is the best we've seen so far.
if not best_val_loss or val_loss < best_val_loss:
@@ -593,12 +606,8 @@ def train():
# after load the rnn params are not a continuous chunk of memory
# this makes them a continuous chunk, and will speed up forward pass
if isinstance(model, torch.nn.DataParallel):
# if multi-gpu, access real model for testing
model = model.module
# if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
# model.module.rnn.flatten_parameters()
# else:
# if isinstance(model, torch.nn.DataParallel):
# model = model.module
model.rnn.flatten_parameters()

# Run on test data.
@@ -639,7 +648,10 @@ def train():
except KeyboardInterrupt:
print(' ')
else:
test_loss = test_evaluate(test_sents, test_data)
if args.multisentence_test:
test_loss = test_evaluate(None, test_data)
else:
test_loss = test_evaluate(test_sents, test_data)
if args.adapt:
with open(args.adapted_model, 'wb') as f:
torch.save(model, f)
12 changes: 8 additions & 4 deletions model.py
@@ -8,7 +8,7 @@ class RNNModel(nn.Module):
"""Container module with an encoder, a recurrent module, and a decoder."""

def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers,
embedding_file=None, dropout=0.5, tie_weights=False):
embedding_file=None, dropout=0.5, tie_weights=False, freeze_embedding=False):
super(RNNModel, self).__init__()
self.drop = nn.Dropout(dropout)
if embedding_file:
@@ -28,7 +28,10 @@ def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers,
self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
self.decoder = nn.Linear(nhid, ntoken)

self.init_weights()
self.init_weights(freeze_embedding)
if freeze_embedding:
for param in self.encoder.parameters():
param.requires_grad = False

# Optionally tie weights as in:
# "Using the Output Embedding to Improve Language Models" (Press & Wolf 2017)
@@ -46,10 +49,11 @@ def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers,
self.nhid = nhid
self.nlayers = nlayers

def init_weights(self):
def init_weights(self, freeze_embedding):
""" Initialize encoder and decoder weights """
initrange = 0.1
self.encoder.weight.data.uniform_(-initrange, initrange)
if not freeze_embedding:
self.encoder.weight.data.uniform_(-initrange, initrange)
self.decoder.bias.data.fill_(0)
self.decoder.weight.data.uniform_(-initrange, initrange)

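
Taken together, the model.py changes freeze a pretrained embedding by skipping its random re-initialization and switching off its gradients. A compact sketch of the same idea with a stand-in embedding layer (sizes are illustrative):

    import torch
    import torch.nn as nn

    encoder = nn.Embedding(10000, 200)   # stands in for the model's encoder

    # keep whatever weights were loaded and exclude them from training
    for param in encoder.parameters():
        param.requires_grad = False

    # a frozen layer still works in the forward pass
    ids = torch.tensor([[1, 2, 3]])
    vectors = encoder(ids)               # shape: (1, 3, 200)
    print(vectors.requires_grad)         # False: no gradient flows into the embedding
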
