From 786a68a64fb3a608b317560c89af2905f90c88d6 Mon Sep 17 00:00:00 2001
From: rluo
Date: Thu, 7 Sep 2017 14:13:42 -0700
Subject: [PATCH 01/42] Only changing AttModel; other models may break.
 Thinking of a better way.

---
 dataloader.py           |  8 +++++++-
 eval_utils.py           | 13 +++++++------
 models/AttModel.py      | 38 +++++++++++++++++++++++---------------
 scripts/make_bu_data.py | 30 ++++++++++++++++++++++++++++++
 train.py                |  6 +++---
 5 files changed, 70 insertions(+), 25 deletions(-)
 create mode 100644 scripts/make_bu_data.py

diff --git a/dataloader.py b/dataloader.py
index f1175356..510a7225 100644
--- a/dataloader.py
+++ b/dataloader.py
@@ -160,7 +160,13 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
 
         data = {}
         data['fc_feats'] = np.stack(fc_batch)
-        data['att_feats'] = np.stack(att_batch)
+        max_att_len = max([_.shape[0] for _ in att_batch])
+        data['att_feats'] = np.zeros([len(att_batch), max_att_len, att_batch[0].shape[1]], dtype = 'float32')
+        for i in range(len(att_batch)):
+            data['att_feats'][i][:att_batch[i].shape[0]] = att_batch[i]
+        data['att_masks'] = np.zeros(data['att_feats'].shape[:2], dtype='float32')
+        for i in range(len(att_batch)):
+            data['att_masks'][i][:att_batch[i].shape[0]] = 1
         data['labels'] = label_batch
         data['gts'] = gts
         data['masks'] = mask_batch

diff --git a/eval_utils.py b/eval_utils.py
index ab0abd06..75cf3c18 100644
--- a/eval_utils.py
+++ b/eval_utils.py
@@ -84,22 +84,23 @@ def eval_split(model, crit, loader, eval_kwargs={}):
 
         if data.get('labels', None) is not None:
             # forward the model to get loss
-            tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']]
+            tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks'], data['att_masks']]
             tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp]
-            fc_feats, att_feats, labels, masks = tmp
+            fc_feats, att_feats, labels, masks, att_masks = tmp
 
-            loss = crit(model(fc_feats, att_feats, labels), labels[:,1:], masks[:,1:]).data[0]
+            loss = crit(model(fc_feats, att_feats, labels, att_masks), labels[:,1:], masks[:,1:]).data[0]
            loss_sum = loss_sum + loss
            loss_evals = loss_evals + 1

        # forward the model to also get generated samples for each image
        # Only leave one feature for each image, in case of duplicate samples
        tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
-            data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
+            data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
+            data['att_masks'][np.arange(loader.batch_size) * loader.seq_per_img]]
        tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp]
-        fc_feats, att_feats = tmp
+        fc_feats, att_feats, att_masks = tmp
        # forward the model to also get generated samples for each image
-        seq, _ = model.sample(fc_feats, att_feats, eval_kwargs)
+        seq, _ = model.sample(fc_feats, att_feats, att_masks, eval_kwargs)
        #set_trace()
        sents = utils.decode_sequence(loader.get_vocab(), seq)

diff --git a/models/AttModel.py b/models/AttModel.py
index 3382c644..f995872f 100644
--- a/models/AttModel.py
+++ b/models/AttModel.py
@@ -55,7 +55,7 @@ def init_hidden(self, bsz):
         return (Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_()),
                 Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_()))
 
-    def forward(self, fc_feats, att_feats, seq):
+    def forward(self, fc_feats, att_feats, seq, att_masks=None):
         batch_size = fc_feats.size(0)
         state = self.init_hidden(batch_size)
 
@@ -92,13 +92,13 @@ def forward(self, fc_feats, att_feats, seq):
 
             xt = self.embed(it)
 
-            output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state)
+            output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state, att_masks)
             output = F.log_softmax(self.logit(output))
             outputs.append(output)
 
         return torch.cat([_.unsqueeze(1) for _ in outputs], 1)
 
-    def sample_beam(self, fc_feats, att_feats, opt={}):
+    def sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}):
         beam_size = opt.get('beam_size', 10)
         batch_size = fc_feats.size(0)
 
@@ -189,7 +189,7 @@ def sample_beam(self, fc_feats, att_feats, opt={}):
             if t >= 1:
                 state = new_state
 
-            output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, state)
+            output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, state, att_masks)
             logprobs = F.log_softmax(self.logit(output))
 
         self.done_beams[k] = sorted(self.done_beams[k], key=lambda x: -x['p'])
@@ -198,12 +198,12 @@ def sample_beam(self, fc_feats, att_feats, opt={}):
         # return the samples and their log likelihoods
         return seq.transpose(0, 1), seqLogprobs.transpose(0, 1)
 
-    def sample(self, fc_feats, att_feats, opt={}):
+    def sample(self, fc_feats, att_feats, att_masks=None, opt={}):
         sample_max = opt.get('sample_max', 1)
         beam_size = opt.get('beam_size', 1)
         temperature = opt.get('temperature', 1.0)
         if beam_size > 1:
-            return self.sample_beam(fc_feats, att_feats, opt)
+            return self.sample_beam(fc_feats, att_feats, att_masks, opt)
         batch_size = fc_feats.size(0)
         state = self.init_hidden(batch_size)
@@ -250,7 +250,7 @@ def sample(self, fc_feats, att_feats, opt={}):
 
             seqLogprobs.append(sampleLogprobs.view(-1))
 
-            output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state)
+            output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state, att_masks)
             logprobs = F.log_softmax(self.logit(output))
 
         return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1)
@@ -369,7 +369,7 @@ def __init__(self, opt):
         self.alpha_net = nn.Linear(self.att_hid_size, 1)
         self.att2h = nn.Linear(self.rnn_size, self.rnn_size)
 
-    def forward(self, h_out, fake_region, conv_feat, conv_feat_embed):
+    def forward(self, h_out, fake_region, conv_feat, conv_feat_embed, att_masks=None):
 
         # View into three dimensions
         att_size = conv_feat.numel() // conv_feat.size(0) // self.rnn_size
@@ -394,6 +394,11 @@ def forward(self, h_out, fake_region, conv_feat, conv_feat_embed):
         hAflat = self.alpha_net(hA.view(-1, self.att_hid_size))
         PI = F.softmax(hAflat.view(-1, att_size + 1))
 
+        if att_masks is not None:
+            att_masks = att_masks.view(-1, att_size)
+            PI = PI * torch.cat([att_masks[:,:1], att_masks], 1) # assume a one at the first time step.
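+            # the mask zeroes PI over padded attention positions; the extra
+            # column prepended above covers the sentinel (fake region) slot,
+            # and the division below renormalizes each row back to sum to 1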
+            PI = PI / PI.sum(1, keepdim=True)
+
         visAtt = torch.bmm(PI.unsqueeze(1), img_all)
         visAttdim = visAtt.squeeze(1)
 
@@ -409,9 +414,9 @@ def __init__(self, opt, use_maxout=False):
         self.lstm = AdaAtt_lstm(opt, use_maxout)
         self.attention = AdaAtt_attention(opt)
 
-    def forward(self, xt, fc_feats, att_feats, p_att_feats, state):
+    def forward(self, xt, fc_feats, att_feats, p_att_feats, state, att_masks=None):
         h_out, p_out, state = self.lstm(xt, fc_feats, state)
-        atten_out = self.attention(h_out, p_out, att_feats, p_att_feats)
+        atten_out = self.attention(h_out, p_out, att_feats, p_att_feats, att_masks)
         return atten_out, state
 
 class TopDownCore(nn.Module):
@@ -423,13 +428,13 @@ def __init__(self, opt, use_maxout=False):
         self.lang_lstm = nn.LSTMCell(opt.rnn_size * 2, opt.rnn_size) # h^1_t, \hat v
         self.attention = Attention(opt)
 
-    def forward(self, xt, fc_feats, att_feats, p_att_feats, state):
+    def forward(self, xt, fc_feats, att_feats, p_att_feats, state, att_masks=None):
         prev_h = state[0][-1]
         att_lstm_input = torch.cat([prev_h, fc_feats, xt], 1)
 
         h_att, c_att = self.att_lstm(att_lstm_input, (state[0][0], state[1][0]))
 
-        att = self.attention(h_att, att_feats, p_att_feats)
+        att = self.attention(h_att, att_feats, p_att_feats, att_masks)
 
         lang_lstm_input = torch.cat([att, h_att], 1)
         # lang_lstm_input = torch.cat([att, F.dropout(h_att, self.drop_prob_lm, self.training)], 1) ?????
 
@@ -450,7 +455,7 @@ def __init__(self, opt):
         self.h2att = nn.Linear(self.rnn_size, self.att_hid_size)
         self.alpha_net = nn.Linear(self.att_hid_size, 1)
 
-    def forward(self, h, att_feats, p_att_feats):
+    def forward(self, h, att_feats, p_att_feats, att_masks=None):
         # The p_att_feats here is already projected
         att_size = att_feats.numel() // att_feats.size(0) // self.rnn_size
         att = p_att_feats.view(-1, att_size, self.att_hid_size)
@@ -464,6 +469,9 @@ def forward(self, h, att_feats, p_att_feats):
         dot = dot.view(-1, att_size)                        # batch * att_size
 
         weight = F.softmax(dot)                             # batch * att_size
+        if att_masks is not None:
+            weight = weight * att_masks.view(-1, att_size).float()
+            weight = weight / weight.sum(1, keepdim=True) # normalize to 1
         att_feats_ = att_feats.view(-1, att_size, self.rnn_size) # batch * att_size * att_feat_size
         att_res = torch.bmm(weight.unsqueeze(1), att_feats_).squeeze(1) # batch * att_feat_size
 
@@ -490,8 +498,8 @@ def __init__(self, opt):
 
         self.attention = Attention(opt)
 
-    def forward(self, xt, fc_feats, att_feats, p_att_feats, state):
-        att_res = self.attention(state[0][-1], att_feats, p_att_feats)
+    def forward(self, xt, fc_feats, att_feats, p_att_feats, state, att_masks=None):
+        att_res = self.attention(state[0][-1], att_feats, p_att_feats, att_masks)
 
         all_input_sums = self.i2h(xt) + self.h2h(state[0][-1])
         sigmoid_chunk = all_input_sums.narrow(1, 0, 3 * self.rnn_size)

diff --git a/scripts/make_bu_data.py b/scripts/make_bu_data.py
new file mode 100644
index 00000000..5dd8b0f4
--- /dev/null
+++ b/scripts/make_bu_data.py
@@ -0,0 +1,30 @@
+import os
+import base64
+import numpy as np
+import csv
+import sys
+import zlib
+import time
+import mmap
+
+csv.field_size_limit(sys.maxsize)
+
+
+FIELDNAMES = ['image_id', 'image_w','image_h','num_boxes', 'boxes', 'features']
+#infiles = ['trainval/karpathy_test_resnet101_faster_rcnn_genome.tsv',
+#           'trainval/karpathy_val_resnet101_faster_rcnn_genome.tsv',
+infiles = ['trainval/karpathy_train_resnet101_faster_rcnn_genome.tsv.0', \
+           'trainval/karpathy_train_resnet101_faster_rcnn_genome.tsv.1']
+
+for infile in infiles:
+    with open(infile, "r+b") as tsv_in_file:
+        reader = csv.DictReader(tsv_in_file, delimiter='\t', fieldnames = FIELDNAMES)
+        for item in reader:
+            item['image_id'] = int(item['image_id'])
+            item['num_boxes'] = int(item['num_boxes'])
+            for field in ['boxes', 'features']:
+                item[field] = np.frombuffer(base64.decodestring(item[field]),
+                        dtype=np.float32).reshape((item['num_boxes'],-1))
+            np.savez_compressed(os.path.join('../cocobu_att', str(item['image_id'])), feat=item['features'])
+            np.save(os.path.join('../cocobu_fc', str(item['image_id'])), item['features'].mean(0))
+

diff --git a/train.py b/train.py
index 7a626977..9e44c6ea 100644
--- a/train.py
+++ b/train.py
@@ -105,12 +105,12 @@ def train(opt):
         torch.cuda.synchronize()
         start = time.time()
 
-        tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']]
+        tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks'], data['att_masks']]
         tmp = [Variable(torch.from_numpy(_), requires_grad=False).cuda() for _ in tmp]
-        fc_feats, att_feats, labels, masks = tmp
+        fc_feats, att_feats, labels, masks, att_masks = tmp
 
         optimizer.zero_grad()
-        loss = crit(model(fc_feats, att_feats, labels), labels[:,1:], masks[:,1:])
+        loss = crit(model(fc_feats, att_feats, labels, att_masks), labels[:,1:], masks[:,1:])
         loss.backward()
         utils.clip_gradient(optimizer, opt.grad_clip)
         optimizer.step()

From 77edf62ea1f8af6cd7b8f0d82860ae8463c2b63a Mon Sep 17 00:00:00 2001
From: rluo
Date: Fri, 15 Sep 2017 18:07:09 -0700
Subject: [PATCH 02/42] Fix a bug in sample_beam.

---
 models/AttModel.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/models/AttModel.py b/models/AttModel.py
index f995872f..ffdfb605 100644
--- a/models/AttModel.py
+++ b/models/AttModel.py
@@ -122,6 +122,7 @@ def sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}):
             tmp_fc_feats = fc_feats[k:k+1].expand(beam_size, fc_feats.size(1))
             tmp_att_feats = att_feats[k:k+1].expand(*((beam_size,)+att_feats.size()[1:])).contiguous()
             tmp_p_att_feats = p_att_feats[k:k+1].expand(*((beam_size,)+p_att_feats.size()[1:])).contiguous()
+            tmp_att_masks = att_masks[k:k+1].expand(*((beam_size,)+att_masks.size()[1:])).contiguous()
 
             beam_seq = torch.LongTensor(self.seq_length, beam_size).zero_()
             beam_seq_logprobs = torch.FloatTensor(self.seq_length, beam_size).zero_()
@@ -189,7 +190,7 @@ def sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}):
             if t >= 1:
                 state = new_state
 
-            output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, state, att_masks)
+            output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, state, tmp_att_masks)
             logprobs = F.log_softmax(self.logit(output))
 
         self.done_beams[k] = sorted(self.done_beams[k], key=lambda x: -x['p'])

From 83fc69c411cee4e6f87ae4a72ddf8b9f4619ee24 Mon Sep 17 00:00:00 2001
From: rluo
Date: Tue, 26 Sep 2017 11:31:40 -0500
Subject: [PATCH 03/42] Add diverse beam search.

---
 eval.py           |   4 +
 eval_utils.py     |   5 +-
 models/FCModel.py | 198 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 205 insertions(+), 2 deletions(-)

diff --git a/eval.py b/eval.py
index 9d26932b..839b1989 100644
--- a/eval.py
+++ b/eval.py
@@ -46,6 +46,10 @@
                 help='1 = sample argmax words. 0 = sample from distributions.')
 parser.add_argument('--beam_size', type=int, default=2,
                 help='used when sample_max = 1, indicates number of beams in beam search. Usually 2 or 3 works well. More is not better. Set this to 1 for faster runtime but a bit worse performance.')
+parser.add_argument('--group_size', type=int, default=1,
+                help='used for diverse beam search. if group_size is 1, then it\'s normal beam search')
+parser.add_argument('--diversity_lambda', type=float, default=0.5,
+                help='used for diverse beam search. Usually from 0.2 to 0.8. Higher value of lambda produces a more diverse list')
 parser.add_argument('--temperature', type=float, default=1.0,
                 help='temperature when sampling from distributions (i.e. when sample_max = 0). Lower = "safer" predictions.')
 # For evaluation on a folder of images:

diff --git a/eval_utils.py b/eval_utils.py
index ab0abd06..02abf867 100644
--- a/eval_utils.py
+++ b/eval_utils.py
@@ -101,7 +101,10 @@ def eval_split(model, crit, loader, eval_kwargs={}):
 
         # forward the model to also get generated samples for each image
         seq, _ = model.sample(fc_feats, att_feats, eval_kwargs)
-        #set_trace()
+        # Print beam search
+        # for i in range(loader.batch_size):
+        #     print('\n'.join([utils.decode_sequence(loader.get_vocab(), _['seq'].unsqueeze(0))[0] for _ in model.done_beams[i]]))
+        #     print('--' * 10)
         sents = utils.decode_sequence(loader.get_vocab(), seq)
 
         for k, sent in enumerate(sents):

diff --git a/models/FCModel.py b/models/FCModel.py
index 88467285..93b7fa3e 100644
--- a/models/FCModel.py
+++ b/models/FCModel.py
@@ -204,9 +204,12 @@ def sample_beam(self, fc_feats, att_feats, opt={}):
     def sample(self, fc_feats, att_feats, opt={}):
         sample_max = opt.get('sample_max', 1)
         beam_size = opt.get('beam_size', 1)
+        group_size = opt.get('group_size', 1)
         temperature = opt.get('temperature', 1.0)
-        if beam_size > 1:
+        if beam_size > 1 and group_size == 1:
             return self.sample_beam(fc_feats, att_feats, opt)
+        elif beam_size > 1:
+            return self.sample_diverse_beam(fc_feats, att_feats, opt)
         batch_size = fc_feats.size(0)
         state = self.init_hidden(batch_size)
@@ -250,3 +253,196 @@ def sample(self, fc_feats, att_feats, opt={}):
 
         return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1)
+
+# implements beam search
+# calls beam_step and returns the final set of beams
+# augments log-probabilities with diversity terms when number of groups > 1
+
+    def diverse_beam_search(self, init_state, init_logprobs, opt={}):
+
+        # function computes the similarity score to be augmented
+        def add_diversity(beam_seq_table,logprobsf,t,divm,diversity_lambda,bdash):
+            local_time = t - divm
+            unaug_logprobsf = logprobsf.clone()
+            for prev_choice in range(divm):
+                prev_decisions = beam_seq_table[prev_choice][local_time]
+                for sub_beam in range(bdash):
+                    for prev_labels in range(bdash):
+                        logprobsf[sub_beam][prev_decisions[prev_labels]] = logprobsf[sub_beam][prev_decisions[prev_labels]] - diversity_lambda
+            return unaug_logprobsf
+
+        # does one step of classical beam search
+
+        def beam_step(logprobsf,unaug_logprobsf,beam_size,t,beam_seq,beam_seq_logprobs,beam_logprobs_sum,state):
+            #INPUTS:
+            #logprobsf: probabilities augmented after diversity
+            #beam_size: obvious
+            #t        : time instant
+            #beam_seq : tensor containing the beams
+            #beam_seq_logprobs: tensor containing the beam logprobs
+            #beam_logprobs_sum: tensor containing joint logprobs
+            #OUTPUTS:
+            #beam_seq : tensor containing the word indices of the decoded captions
+            #beam_seq_logprobs : log-probability of each decision made, same size as beam_seq
+            #beam_logprobs_sum : joint log-probability of each beam
+
+            ys,ix = torch.sort(logprobsf,1,True)
+            candidates = []
+            cols = min(beam_size, ys.size(1))
+            rows = beam_size
+            if t == 0:
+                rows = 1
+            for c in range(cols): # for each column (word, essentially)
+                for q in range(rows): # for each beam expansion
+                    #compute logprob of expanding beam q with word in (sorted) position c
+                    local_logprob = ys[q,c]
+                    candidate_logprob = beam_logprobs_sum[q] + local_logprob
+                    local_unaug_logprob = unaug_logprobsf[q,ix[q,c]]
+                    candidates.append({'c':ix[q,c], 'q':q, 'p':candidate_logprob, 'r':local_unaug_logprob})
+            candidates = sorted(candidates, key=lambda x: -x['p'])
+
+            new_state = [_.clone() for _ in state]
+            #beam_seq_prev, beam_seq_logprobs_prev
+            if t >= 1:
+                #we'll need these as reference when we fork beams around
+                beam_seq_prev = beam_seq[:t].clone()
+                beam_seq_logprobs_prev = beam_seq_logprobs[:t].clone()
+            for vix in range(beam_size):
+                v = candidates[vix]
+                #fork beam index q into index vix
+                if t >= 1:
+                    beam_seq[:t, vix] = beam_seq_prev[:, v['q']]
+                    beam_seq_logprobs[:t, vix] = beam_seq_logprobs_prev[:, v['q']]
+                #rearrange recurrent states
+                for state_ix in range(len(new_state)):
+                    # copy over state in previous beam q to new beam at vix
+                    new_state[state_ix][:, vix] = state[state_ix][:, v['q']] # dimension one is time step
+                #append new end terminal at the end of this beam
+                beam_seq[t, vix] = v['c'] # c'th word is the continuation
+                beam_seq_logprobs[t, vix] = v['r'] # the raw logprob here
+                beam_logprobs_sum[vix] = v['p'] # the new (sum) logprob along this beam
+            state = new_state
+            return beam_seq,beam_seq_logprobs,beam_logprobs_sum,state,candidates
+
+        # Start diverse_beam_search
+        beam_size = opt.get('beam_size', 10)
+        group_size = opt.get('group_size', 5)
+        diversity_lambda = opt.get('diversity_lambda', 0.5)
+
+        # init
+        bdash = beam_size // group_size
+        # init_state = init_params[1]
+        # init_logprobs = init_params[2]
+        state_table = []
+        beam_seq_table = []
+        beam_seq_logprobs_table = []
+        beam_logprobs_sum_table = []
+
+        # INITIALIZATIONS
+        beam_seq_table = [torch.LongTensor(self.seq_length, bdash).zero_() for _ in range(group_size)]
+        beam_seq_logprobs_table = [torch.FloatTensor(self.seq_length, bdash).zero_() for _ in range(group_size)]
+        beam_logprobs_sum_table = [torch.zeros(bdash) for _ in range(group_size)]
+
+        # logprobs # logprobs predicted in last time step, shape (beam_size, vocab_size+1)
+        done_beams_table = [[] for _ in range(group_size)]
+        # state = self.init_hidden(bdash)
+        # state = [_.unsqueeze(0).repeat(bdash, 1) for _ in init_state]
+        # state_table = [[_.clone() for _ in state] for __ in range(group_size)]
+        state_table = [list(torch.unbind(_)) for _ in torch.stack(init_state).chunk(group_size, 2)]
+        logprobs_table = list(init_logprobs.chunk(group_size, 0))
+        # logprobs_table = [torch.zeros(bdash, init_logprobs.size(1)) for _ in range(group_size)]
+        # for i in range(group_size):
+        #     for j in range(bdash):
+        #         logprobs_table[i][j] = init_logprobs.clone()
+        # END INIT
+
+        for t in range(self.seq_length + group_size - 1):
+            for divm in range(group_size):
+                if t >= divm and t <= self.seq_length + divm - 1:
+                    # add diversity
+                    logprobsf = logprobs_table[divm].data.float()
+                    # suppress UNK tokens in the decoding
+                    logprobsf[:,logprobsf.size(1)-1] = logprobsf[:, logprobsf.size(1)-1] - 1000
+                    # diversity is added here
+                    # the function directly modifies the logprobsf values and hence, we need to return
+                    # the unaugmented ones for sorting the candidates in the end. # for historical
+                    # reasons :-)
+                    unaug_logprobsf = add_diversity(beam_seq_table,logprobsf,t,divm,diversity_lambda,bdash)
+
+                    # infer new beams
+                    beam_seq_table[divm],\
+                    beam_seq_logprobs_table[divm],\
+                    beam_logprobs_sum_table[divm],\
+                    state_table[divm],\
+                    candidates_divm = beam_step(logprobsf,
+                                                unaug_logprobsf,
+                                                bdash,
+                                                t-divm,
+                                                beam_seq_table[divm],
+                                                beam_seq_logprobs_table[divm],
+                                                beam_logprobs_sum_table[divm],
+                                                state_table[divm])
+
+                    # if time's up... or if end token is reached then copy beams
+                    for vix in range(bdash):
+                        # is_first_end_token = ((beam_seq_table[divm][:,vix][t-divm] == 0) and (torch.eq(beam_seq_table[divm][:,vix],0).sum()==1))
+                        # final_time_without_end_token = ((t == self.seq_length+divm-1) and (torch.eq(beam_seq_table[divm][:,vix],0).sum()==0))
+                        if beam_seq_table[divm][t-divm,vix] == 0 or t == self.seq_length+divm-1: # if is_first_end_token or final_time_without_end_token:
+                            final_beam = {
+                                'seq': beam_seq_table[divm][:, vix].clone(),
+                                'logps': beam_seq_logprobs_table[divm][:, vix].clone(),
+                                'unaug_logp': beam_seq_logprobs_table[divm][:, vix].sum(),
+                                'logp': beam_logprobs_sum_table[divm][vix]
+                            }
+                            done_beams_table[divm].append(final_beam)
+                            # # don't continue beams from finished sequences
+                            # if is_first_end_token:
+                            # make continuation of already completed sequences improbable
+                            beam_logprobs_sum_table[divm][vix] = -1000
+
+                    # move the current group one step forward in time
+
+                    it = beam_seq_table[divm][t-divm]
+                    xt = self.embed(Variable(it.cuda()))
+                    output, state = self.core(xt, state_table[divm])
+                    logprobs_table[divm] = F.log_softmax(self.logit(output))
+                    state_table[divm] = state
+
+        for i in range(group_size):
+            # all beams are sorted by their log-probabilities
+            done_beams_table[i] = sorted(done_beams_table[i], key=lambda x: -x['logp'])[:bdash]
+        return done_beams_table
+
+
+    def sample_diverse_beam(self, fc_feats, att_feats, opt={}):
+        beam_size = opt.get('beam_size', 10)
+        batch_size = fc_feats.size(0)
+
+        assert beam_size <= self.vocab_size + 1, 'lets assume this for now, otherwise this corner case causes a few headaches down the road. can be dealt with in future if needed'
+        seq = torch.LongTensor(self.seq_length, batch_size).zero_()
+        seqLogprobs = torch.FloatTensor(self.seq_length, batch_size)
+        # lets process every image independently for now, for simplicity
+
+        self.done_beams = [[] for _ in range(batch_size)]
+        for k in range(batch_size):
+            state = self.init_hidden(beam_size)
+
+            beam_seq = torch.LongTensor(self.seq_length, beam_size).zero_()
+            beam_seq_logprobs = torch.FloatTensor(self.seq_length, beam_size).zero_()
+            beam_logprobs_sum = torch.zeros(beam_size) # running sum of logprobs for each beam
+            for t in range(2):
+                if t == 0:
+                    xt = self.img_embed(fc_feats[k:k+1]).expand(beam_size, self.input_encoding_size)
+                elif t == 1: # input <bos>
+                    it = fc_feats.data.new(beam_size).long().zero_()
+                    xt = self.embed(Variable(it, requires_grad=False))
+
+                output, state = self.core(xt, state)
+                logprobs = F.log_softmax(self.logit(output))
+
+            done_beams_table = self.diverse_beam_search(state, logprobs, opt)
+
+            self.done_beams[k] = reduce(lambda a,b:a+b, done_beams_table)
+            seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score
+            seqLogprobs[:, k] = self.done_beams[k][0]['logps']
+        # return the samples and their log likelihoods
+        return seq.transpose(0, 1), seqLogprobs.transpose(0, 1)
\ No newline at end of file

From 49d5a00b0afbf4390c2fd427bef454d470a1d19f Mon Sep 17 00:00:00 2001
From: rluo
Date: Tue, 26 Sep 2017 15:32:23 -0500
Subject: [PATCH 04/42] Replace normal beam search with diverse beam search.
 (Normal beam search is a special case of diverse beam search.)

---
 models/CaptionModel.py | 133 ++++++++++++++++++----------
 models/FCModel.py      | 196 +----------------------------------------
 2 files changed, 86 insertions(+), 243 deletions(-)

diff --git a/models/CaptionModel.py b/models/CaptionModel.py
index c92c195f..b3271175 100644
--- a/models/CaptionModel.py
+++ b/models/CaptionModel.py
@@ -20,11 +20,26 @@ class CaptionModel(nn.Module):
     def __init__(self):
         super(CaptionModel, self).__init__()
 
-    def beam_search(self, state, logprobs, *args, **kwargs):
-        # args are the miscellaneous inputs to the core in addition to embedded word and state
-        # kwargs only accept opt
+    # implements beam search
+    # calls beam_step and returns the final set of beams
+    # augments log-probabilities with diversity terms when number of groups > 1
 
-        def beam_step(logprobsf, beam_size, t, beam_seq, beam_seq_logprobs, beam_logprobs_sum, state):
+    def beam_search(self, init_state, init_logprobs, *args, **kwargs):
+
+        # function computes the similarity score to be augmented
+        def add_diversity(beam_seq_table, logprobsf, t, divm, diversity_lambda, bdash):
+            local_time = t - divm
+            unaug_logprobsf = logprobsf.clone()
+            for prev_choice in range(divm):
+                prev_decisions = beam_seq_table[prev_choice][local_time]
+                for sub_beam in range(bdash):
+                    for prev_labels in range(bdash):
+                        logprobsf[sub_beam][prev_decisions[prev_labels]] = logprobsf[sub_beam][prev_decisions[prev_labels]] - diversity_lambda
+            return unaug_logprobsf
+
+        # does one step of classical beam search
+
+        def beam_step(logprobsf, unaug_logprobsf, beam_size, t, beam_seq, beam_seq_logprobs, beam_logprobs_sum, state):
             #INPUTS:
             #logprobsf: probabilities augmented after diversity
             #beam_size: obvious
@@ -48,7 +63,8 @@ def beam_step(logprobsf, beam_size, t, beam_seq, beam_seq_logprobs, beam_logprob
                     #compute logprob of expanding beam q with word in (sorted) position c
                     local_logprob = ys[q,c]
                     candidate_logprob = beam_logprobs_sum[q] + local_logprob
-                    candidates.append({'c':ix[q,c], 'q':q, 'p':candidate_logprob, 'r':local_logprob})
+                    local_unaug_logprob = unaug_logprobsf[q,ix[q,c]]
+                    candidates.append({'c':ix[q,c], 'q':q, 'p':candidate_logprob, 'r':local_unaug_logprob})
             candidates = sorted(candidates, key=lambda x: -x['p'])
 
             new_state = [_.clone() for _ in state]
@@ -72,51 +88,72 @@ def beam_step(logprobsf, beam_size, t, beam_seq, beam_seq_logprobs, beam_logprob
                 beam_seq_logprobs[t, vix] = v['r'] # the raw logprob here
                 beam_logprobs_sum[vix] = v['p'] # the new (sum) logprob along this beam
             state = new_state
-            return beam_seq, beam_seq_logprobs, beam_logprobs_sum, state, candidates
+            return beam_seq,beam_seq_logprobs,beam_logprobs_sum,state,candidates
 
-        # start beam search
+        # Start diverse_beam_search
         opt = kwargs['opt']
         beam_size = opt.get('beam_size', 10)
+        group_size = opt.get('group_size', 1)
+        diversity_lambda = opt.get('diversity_lambda', 0.5)
+        bdash = beam_size // group_size # beam per group
 
-        beam_seq = torch.LongTensor(self.seq_length, beam_size).zero_()
-        beam_seq_logprobs = torch.FloatTensor(self.seq_length, beam_size).zero_()
-        beam_logprobs_sum = torch.zeros(beam_size) # running sum of logprobs for each beam
-        done_beams = []
-
-        for t in range(self.seq_length):
-            """perform a beam merge. that is,
-            for every previous beam we now have many new possibilities to branch out
-            we need to resort our beams to maintain the loop invariant of keeping
-            the top beam_size most likely sequences."""
-            logprobsf = logprobs.data.float() # lets go to CPU for more efficiency in indexing operations
-
-            beam_seq,\
-            beam_seq_logprobs,\
-            beam_logprobs_sum,\
-            state,\
-            candidates_divm = beam_step(logprobsf,
-                                        beam_size,
-                                        t,
-                                        beam_seq,
-                                        beam_seq_logprobs,
-                                        beam_logprobs_sum,
-                                        state)
+        # INITIALIZATIONS
+        beam_seq_table = [torch.LongTensor(self.seq_length, bdash).zero_() for _ in range(group_size)]
+        beam_seq_logprobs_table = [torch.FloatTensor(self.seq_length, bdash).zero_() for _ in range(group_size)]
+        beam_logprobs_sum_table = [torch.zeros(bdash) for _ in range(group_size)]
 
-            for vix in range(beam_size):
-                # if time's up... or if end token is reached then copy beams
-                if beam_seq[t, vix] == 0 or t == self.seq_length - 1:
-                    final_beam = {
-                        'seq': beam_seq[:, vix].clone(),
-                        'logps': beam_seq_logprobs[:, vix].clone(),
-                        'p': beam_logprobs_sum[vix]
-                    }
-                    done_beams.append(final_beam)
-                    # don't continue beams from finished sequences
-                    beam_logprobs_sum[vix] = -1000
-
-            # encode as vectors
-            it = beam_seq[t]
-            logprobs, state = self.get_logprobs_state(Variable(it.cuda()), *(args + (state,)))
-
-        done_beams = sorted(done_beams, key=lambda x: -x['p'])[:beam_size]
-        return done_beams
+        # logprobs # logprobs predicted in last time step, shape (beam_size, vocab_size+1)
+        done_beams_table = [[] for _ in range(group_size)]
+        state_table = [list(torch.unbind(_)) for _ in torch.stack(init_state).chunk(group_size, 2)]
+        logprobs_table = list(init_logprobs.chunk(group_size, 0))
+        # END INIT
+
+        for t in range(self.seq_length + group_size - 1):
+            for divm in range(group_size):
+                if t >= divm and t <= self.seq_length + divm - 1:
+                    # add diversity
+                    logprobsf = logprobs_table[divm].data.float()
+                    # suppress UNK tokens in the decoding
+                    logprobsf[:,logprobsf.size(1)-1] = logprobsf[:, logprobsf.size(1)-1] - 1000
+                    # diversity is added here
+                    # the function directly modifies the logprobsf values and hence, we need to return
+                    # the unaugmented ones for sorting the candidates in the end. # for historical
+                    # reasons :-)
+                    unaug_logprobsf = add_diversity(beam_seq_table,logprobsf,t,divm,diversity_lambda,bdash)
+
+                    # infer new beams
+                    beam_seq_table[divm],\
+                    beam_seq_logprobs_table[divm],\
+                    beam_logprobs_sum_table[divm],\
+                    state_table[divm],\
+                    candidates_divm = beam_step(logprobsf,
+                                                unaug_logprobsf,
+                                                bdash,
+                                                t-divm,
+                                                beam_seq_table[divm],
+                                                beam_seq_logprobs_table[divm],
+                                                beam_logprobs_sum_table[divm],
+                                                state_table[divm])
+
+                    # if time's up... or if end token is reached then copy beams
+                    for vix in range(bdash):
+                        if beam_seq_table[divm][t-divm,vix] == 0 or t == self.seq_length + divm - 1:
+                            final_beam = {
+                                'seq': beam_seq_table[divm][:, vix].clone(),
+                                'logps': beam_seq_logprobs_table[divm][:, vix].clone(),
+                                'unaug_p': beam_seq_logprobs_table[divm][:, vix].sum(),
+                                'p': beam_logprobs_sum_table[divm][vix]
+                            }
+                            done_beams_table[divm].append(final_beam)
+                            # don't continue beams from finished sequences
+                            beam_logprobs_sum_table[divm][vix] = -1000
+
+                    # move the current group one step forward in time
+
+                    it = beam_seq_table[divm][t-divm]
+                    logprobs_table[divm], state_table[divm] = self.get_logprobs_state(Variable(it.cuda()), *(args + (state_table[divm],)))
+
+        # all beams are sorted by their log-probabilities
+        done_beams_table = [sorted(done_beams_table[i], key=lambda x: -x['p'])[:bdash] for i in range(group_size)]
+        done_beams = reduce(lambda a,b:a+b, done_beams_table)
+        return done_beams
\ No newline at end of file

diff --git a/models/FCModel.py b/models/FCModel.py
index 9cf38dae..d609e9db 100644
--- a/models/FCModel.py
+++ b/models/FCModel.py
@@ -200,198 +200,4 @@ def sample(self, fc_feats, att_feats, opt={}):
             output, state = self.core(xt, state)
             logprobs = F.log_softmax(self.logit(output))
 
-        return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1)
-
-# implements beam search
-# calls beam_step and returns the final set of beams
-# augments log-probabilities with diversity terms when number of groups > 1
-
-    def diverse_beam_search(self, init_state, init_logprobs, opt={}):
-
-        # function computes the similarity score to be augmented
-        def add_diversity(beam_seq_table,logprobsf,t,divm,diversity_lambda,bdash):
-            local_time = t - divm
-            unaug_logprobsf = logprobsf.clone()
-            for prev_choice in range(divm):
-                prev_decisions = beam_seq_table[prev_choice][local_time]
-                for sub_beam in range(bdash):
-                    for prev_labels in range(bdash):
-                        logprobsf[sub_beam][prev_decisions[prev_labels]] = logprobsf[sub_beam][prev_decisions[prev_labels]] - diversity_lambda
-            return unaug_logprobsf
-
-        # does one step of classical beam search
-
-        def beam_step(logprobsf,unaug_logprobsf,beam_size,t,beam_seq,beam_seq_logprobs,beam_logprobs_sum,state):
-            #INPUTS:
-            #logprobsf: probabilities augmented after diversity
-            #beam_size: obvious
-            #t        : time instant
-            #beam_seq : tensor containing the beams
-            #beam_seq_logprobs: tensor containing the beam logprobs
-            #beam_logprobs_sum: tensor containing joint logprobs
-            #OUTPUTS:
-            #beam_seq : tensor containing the word indices of the decoded captions
-            #beam_seq_logprobs : log-probability of each decision made, same size as beam_seq
-            #beam_logprobs_sum : joint log-probability of each beam
-
-            ys,ix = torch.sort(logprobsf,1,True)
-            candidates = []
-            cols = min(beam_size, ys.size(1))
-            rows = beam_size
-            if t == 0:
-                rows = 1
-            for c in range(cols): # for each column (word, essentially)
-                for q in range(rows): # for each beam expansion
-                    #compute logprob of expanding beam q with word in (sorted) position c
-                    local_logprob = ys[q,c]
-                    candidate_logprob = beam_logprobs_sum[q] + local_logprob
-                    local_unaug_logprob = unaug_logprobsf[q,ix[q,c]]
-                    candidates.append({'c':ix[q,c], 'q':q, 'p':candidate_logprob, 'r':local_unaug_logprob})
-            candidates = sorted(candidates, key=lambda x: -x['p'])
-
-            new_state = [_.clone() for _ in state]
-            #beam_seq_prev, beam_seq_logprobs_prev
-            if t >= 1:
-                #we'll need these as reference when we fork beams around
-                beam_seq_prev = beam_seq[:t].clone()
-                beam_seq_logprobs_prev = beam_seq_logprobs[:t].clone()
-            for vix in range(beam_size):
-                v = candidates[vix]
-                #fork beam index q into index vix
-                if t >= 1:
-                    beam_seq[:t, vix] = beam_seq_prev[:, v['q']]
-                    beam_seq_logprobs[:t, vix] = beam_seq_logprobs_prev[:, v['q']]
-                #rearrange recurrent states
-                for state_ix in range(len(new_state)):
-                    # copy over state in previous beam q to new beam at vix
-                    new_state[state_ix][:, vix] = state[state_ix][:, v['q']] # dimension one is time step
-                #append new end terminal at the end of this beam
-                beam_seq[t, vix] = v['c'] # c'th word is the continuation
-                beam_seq_logprobs[t, vix] = v['r'] # the raw logprob here
-                beam_logprobs_sum[vix] = v['p'] # the new (sum) logprob along this beam
-            state = new_state
-            return beam_seq,beam_seq_logprobs,beam_logprobs_sum,state,candidates
-
-        # Start diverse_beam_search
-        beam_size = opt.get('beam_size', 10)
-        group_size = opt.get('group_size', 5)
-        diversity_lambda = opt.get('diversity_lambda', 0.5)
-
-        # init
-        bdash = beam_size // group_size
-        # init_state = init_params[1]
-        # init_logprobs = init_params[2]
-        state_table = []
-        beam_seq_table = []
-        beam_seq_logprobs_table = []
-        beam_logprobs_sum_table = []
-
-        # INITIALIZATIONS
-        beam_seq_table = [torch.LongTensor(self.seq_length, bdash).zero_() for _ in range(group_size)]
-        beam_seq_logprobs_table = [torch.FloatTensor(self.seq_length, bdash).zero_() for _ in range(group_size)]
-        beam_logprobs_sum_table = [torch.zeros(bdash) for _ in range(group_size)]
-
-        # logprobs # logprobs predicted in last time step, shape (beam_size, vocab_size+1)
-        done_beams_table = [[] for _ in range(group_size)]
-        # state = self.init_hidden(bdash)
-        # state = [_.unsqueeze(0).repeat(bdash, 1) for _ in init_state]
-        # state_table = [[_.clone() for _ in state] for __ in range(group_size)]
-        state_table = [list(torch.unbind(_)) for _ in torch.stack(init_state).chunk(group_size, 2)]
-        logprobs_table = list(init_logprobs.chunk(group_size, 0))
-        # logprobs_table = [torch.zeros(bdash, init_logprobs.size(1)) for _ in range(group_size)]
-        # for i in range(group_size):
-        #     for j in range(bdash):
-        #         logprobs_table[i][j] = init_logprobs.clone()
-        # END INIT
-
-        for t in range(self.seq_length + group_size - 1):
-            for divm in range(group_size):
-                if t >= divm and t <= self.seq_length + divm - 1:
-                    # add diversity
-                    logprobsf = logprobs_table[divm].data.float()
-                    # suppress UNK tokens in the decoding
-                    logprobsf[:,logprobsf.size(1)-1] = logprobsf[:, logprobsf.size(1)-1] - 1000
-                    # diversity is added here
-                    # the function directly modifies the logprobsf values and hence, we need to return
-                    # the unaugmented ones for sorting the candidates in the end. # for historical
-                    # reasons :-)
-                    unaug_logprobsf = add_diversity(beam_seq_table,logprobsf,t,divm,diversity_lambda,bdash)
-
-                    # infer new beams
-                    beam_seq_table[divm],\
-                    beam_seq_logprobs_table[divm],\
-                    beam_logprobs_sum_table[divm],\
-                    state_table[divm],\
-                    candidates_divm = beam_step(logprobsf,
-                                                unaug_logprobsf,
-                                                bdash,
-                                                t-divm,
-                                                beam_seq_table[divm],
-                                                beam_seq_logprobs_table[divm],
-                                                beam_logprobs_sum_table[divm],
-                                                state_table[divm])
-
-                    # if time's up... or if end token is reached then copy beams
-                    for vix in range(bdash):
-                        # is_first_end_token = ((beam_seq_table[divm][:,vix][t-divm] == 0) and (torch.eq(beam_seq_table[divm][:,vix],0).sum()==1))
-                        # final_time_without_end_token = ((t == self.seq_length+divm-1) and (torch.eq(beam_seq_table[divm][:,vix],0).sum()==0))
-                        if beam_seq_table[divm][t-divm,vix] == 0 or t == self.seq_length+divm-1: # if is_first_end_token or final_time_without_end_token:
-                            final_beam = {
-                                'seq': beam_seq_table[divm][:, vix].clone(),
-                                'logps': beam_seq_logprobs_table[divm][:, vix].clone(),
-                                'unaug_logp': beam_seq_logprobs_table[divm][:, vix].sum(),
-                                'logp': beam_logprobs_sum_table[divm][vix]
-                            }
-                            done_beams_table[divm].append(final_beam)
-                            # # don't continue beams from finished sequences
-                            # if is_first_end_token:
-                            # make continuation of already completed sequences improbable
-                            beam_logprobs_sum_table[divm][vix] = -1000
-
-                    # move the current group one step forward in time
-
-                    it = beam_seq_table[divm][t-divm]
-                    xt = self.embed(Variable(it.cuda()))
-                    output, state = self.core(xt, state_table[divm])
-                    logprobs_table[divm] = F.log_softmax(self.logit(output))
-                    state_table[divm] = state
-
-        for i in range(group_size):
-            # all beams are sorted by their log-probabilities
-            done_beams_table[i] = sorted(done_beams_table[i], key=lambda x: -x['logp'])[:bdash]
-        return done_beams_table
-
-
-    def sample_diverse_beam(self, fc_feats, att_feats, opt={}):
-        beam_size = opt.get('beam_size', 10)
-        batch_size = fc_feats.size(0)
-
-        assert beam_size <= self.vocab_size + 1, 'lets assume this for now, otherwise this corner case causes a few headaches down the road. can be dealt with in future if needed'
-        seq = torch.LongTensor(self.seq_length, batch_size).zero_()
-        seqLogprobs = torch.FloatTensor(self.seq_length, batch_size)
-        # lets process every image independently for now, for simplicity
-
-        self.done_beams = [[] for _ in range(batch_size)]
-        for k in range(batch_size):
-            state = self.init_hidden(beam_size)
-
-            beam_seq = torch.LongTensor(self.seq_length, beam_size).zero_()
-            beam_seq_logprobs = torch.FloatTensor(self.seq_length, beam_size).zero_()
-            beam_logprobs_sum = torch.zeros(beam_size) # running sum of logprobs for each beam
-            for t in range(2):
-                if t == 0:
-                    xt = self.img_embed(fc_feats[k:k+1]).expand(beam_size, self.input_encoding_size)
-                elif t == 1: # input <bos>
-                    it = fc_feats.data.new(beam_size).long().zero_()
-                    xt = self.embed(Variable(it, requires_grad=False))
-
-                output, state = self.core(xt, state)
-                logprobs = F.log_softmax(self.logit(output))
-
-            done_beams_table = self.diverse_beam_search(state, logprobs, opt)
-
-            self.done_beams[k] = reduce(lambda a,b:a+b, done_beams_table)
-            seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score
-            seqLogprobs[:, k] = self.done_beams[k][0]['logps']
-        # return the samples and their log likelihoods
-        return seq.transpose(0, 1), seqLogprobs.transpose(0, 1)
\ No newline at end of file
+        return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1)
\ No newline at end of file

From 5d8c72f4d08ab21110b692e9bdaf96691d67fb6f Mon Sep 17 00:00:00 2001
From: rluo
Date: Thu, 28 Sep 2017 11:53:52 -0500
Subject: [PATCH 05/42] Fix some bugs.

---
 models/CaptionModel.py | 9 +++++++--
 models/FCModel.py      | 5 +----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/models/CaptionModel.py b/models/CaptionModel.py
index b3271175..a4e5e10d 100644
--- a/models/CaptionModel.py
+++ b/models/CaptionModel.py
@@ -108,13 +108,18 @@ def beam_step(logprobsf, unaug_logprobsf, beam_size, t, beam_seq, beam_seq_logpr
         logprobs_table = list(init_logprobs.chunk(group_size, 0))
         # END INIT
 
+        # Chunk elements in the args
+        args = list(args)
+        args = [_.chunk(group_size) for _ in args]
+        args = [[args[i][j] for i in range(len(args))] for j in range(group_size)]
+
         for t in range(self.seq_length + group_size - 1):
             for divm in range(group_size):
                 if t >= divm and t <= self.seq_length + divm - 1:
                     # add diversity
                     logprobsf = logprobs_table[divm].data.float()
                     # suppress UNK tokens in the decoding
                     logprobsf[:,logprobsf.size(1)-1] = logprobsf[:, logprobsf.size(1)-1] - 1000
                     # diversity is added here
                     # the function directly modifies the logprobsf values and hence, we need to return
                     # the unaugmented ones for sorting the candidates in the end. # for historical
                     # reasons :-)
                     unaug_logprobsf = add_diversity(beam_seq_table,logprobsf,t,divm,diversity_lambda,bdash)
@@ -151,7 +156,7 @@ def beam_step(logprobsf, unaug_logprobsf, beam_size, t, beam_seq, beam_seq_logpr
                     # move the current group one step forward in time
 
                     it = beam_seq_table[divm][t-divm]
-                    logprobs_table[divm], state_table[divm] = self.get_logprobs_state(Variable(it.cuda()), *(args + (state_table[divm],)))
+                    logprobs_table[divm], state_table[divm] = self.get_logprobs_state(Variable(it.cuda()), *(args[divm] + [state_table[divm]]))
 
         # all beams are sorted by their log-probabilities
         done_beams_table = [sorted(done_beams_table[i], key=lambda x: -x['p'])[:bdash] for i in range(group_size)]

diff --git a/models/FCModel.py b/models/FCModel.py
index d609e9db..7a51f02e 100644
--- a/models/FCModel.py
+++ b/models/FCModel.py
@@ -153,12 +153,9 @@ def sample_beam(self, fc_feats, att_feats, opt={}):
     def sample(self, fc_feats, att_feats, opt={}):
         sample_max = opt.get('sample_max', 1)
         beam_size = opt.get('beam_size', 1)
-        group_size = opt.get('group_size', 1)
         temperature = opt.get('temperature', 1.0)
-        if beam_size > 1 and group_size == 1:
+        if beam_size > 1:
             return self.sample_beam(fc_feats, att_feats, opt)
-        elif beam_size > 1:
-            return self.sample_diverse_beam(fc_feats, att_feats, opt)
         batch_size = fc_feats.size(0)
         state = self.init_hidden(batch_size)

From a8e81f4369ac41473d13969d4d3d43752df0a420 Mon Sep 17 00:00:00 2001
From: rluo
Date: Thu, 28 Sep 2017 12:03:11 -0500
Subject: [PATCH 06/42] Add verbose_beam option, print all beams.

---
 eval.py       | 2 ++
 eval_utils.py | 8 +++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/eval.py b/eval.py
index 839b1989..738c8867 100644
--- a/eval.py
+++ b/eval.py
@@ -73,6 +73,8 @@
 # misc
 parser.add_argument('--id', type=str, default='',
                 help='an id identifying this run/job. used only if language_eval = 1 for appending to intermediate files')
+parser.add_argument('--verbose_beam', type=int, default=1,
+                help='if we need to print out all beam search beams.')
 
 opt = parser.parse_args()

diff --git a/eval_utils.py b/eval_utils.py
index 02abf867..0025b453 100644
--- a/eval_utils.py
+++ b/eval_utils.py
@@ -62,6 +62,7 @@ def language_eval(dataset, preds, model_id, split):
 
 def eval_split(model, crit, loader, eval_kwargs={}):
     verbose = eval_kwargs.get('verbose', True)
+    verbose_beam = eval_kwargs.get('verbose_beam', 1)
     num_images = eval_kwargs.get('num_images', eval_kwargs.get('val_images_use', -1))
     split = eval_kwargs.get('split', 'val')
     lang_eval = eval_kwargs.get('language_eval', 0)
@@ -102,9 +103,10 @@ def eval_split(model, crit, loader, eval_kwargs={}):
         seq, _ = model.sample(fc_feats, att_feats, eval_kwargs)
         # Print beam search
-        # for i in range(loader.batch_size):
-        #     print('\n'.join([utils.decode_sequence(loader.get_vocab(), _['seq'].unsqueeze(0))[0] for _ in model.done_beams[i]]))
-        #     print('--' * 10)
+        if beam_size > 1 and verbose_beam:
+            for i in range(loader.batch_size):
+                print('\n'.join([utils.decode_sequence(loader.get_vocab(), _['seq'].unsqueeze(0))[0] for _ in model.done_beams[i]]))
+                print('--' * 10)
         sents = utils.decode_sequence(loader.get_vocab(), seq)
 
         for k, sent in enumerate(sents):

From f4777c4e039dbf175d75a6579bd82b123e176537 Mon Sep 17 00:00:00 2001
From: rluo
Date: Mon, 16 Oct 2017 13:27:44 -0500
Subject: [PATCH 07/42] Add Bleu4 self critical.
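
The reward used for self-critical training is now a weighted mix of metric
scores instead of CIDEr-D alone; get_self_critical_reward combines them as

    scores = opt.cider_reward_weight * cider_scores
             + opt.bleu_reward_weight * bleu_scores

with both weights exposed as options below, so e.g. --cider_reward_weight 0
--bleu_reward_weight 1 optimizes BLEU-4 alone.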
---
 misc/rewards.py | 30 ++++++++++++++++++++++--------
 opts.py         |  7 +++++++
 train.py        |  6 +++---
 3 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/misc/rewards.py b/misc/rewards.py
index c82ee729..873e799a 100644
--- a/misc/rewards.py
+++ b/misc/rewards.py
@@ -12,13 +12,18 @@
 import sys
 sys.path.append("cider")
 from pyciderevalcap.ciderD.ciderD import CiderD
+sys.path.append("coco-caption")
+from pycocoevalcap.bleu.bleu import Bleu
 
 CiderD_scorer = None
+Bleu_scorer = None
 #CiderD_scorer = CiderD(df='corpus')
 
-def init_cider_scorer(cached_tokens):
+def init_scorer(cached_tokens):
     global CiderD_scorer
-    CiderD_scorer = CiderD(df=cached_tokens)
+    CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
+    global Bleu_scorer
+    Bleu_scorer = Bleu_scorer or Bleu(4)
 
 def array_to_str(arr):
     out = ''
@@ -28,7 +33,7 @@ def array_to_str(arr):
             break
     return out.strip()
 
-def get_self_critical_reward(model, fc_feats, att_feats, data, gen_result):
+def get_self_critical_reward(model, fc_feats, att_feats, data, gen_result, opt):
     batch_size = gen_result.size(0)# batch_size = sample_size * seq_per_img
     seq_per_img = batch_size // len(data['gts'])
 
@@ -48,12 +53,21 @@ def get_self_critical_reward(model, fc_feats, att_feats, data, gen_result):
     for i in range(len(data['gts'])):
         gts[i] = [array_to_str(data['gts'][i][j]) for j in range(len(data['gts'][i]))]
 
-    #_, scores = Bleu(4).compute_score(gts, res)
-    #scores = np.array(scores[3])
-    res = [{'image_id':i, 'caption': res[i]} for i in range(2 * batch_size)]
+    res_ = [{'image_id':i, 'caption': res[i]} for i in range(2 * batch_size)]
+    res__ = {i: res[i] for i in range(2 * batch_size)}
     gts = {i: gts[i % batch_size // seq_per_img] for i in range(2 * batch_size)}
-    _, scores = CiderD_scorer.compute_score(gts, res)
-    print('Cider scores:', _)
+    if opt.cider_reward_weight > 0:
+        _, cider_scores = CiderD_scorer.compute_score(gts, res_)
+        print('Cider scores:', _)
+    else:
+        cider_scores = 0
+    if opt.bleu_reward_weight > 0:
+        _, bleu_scores = Bleu_scorer.compute_score(gts, res__)
+        bleu_scores = np.array(bleu_scores[3])
+        print('Bleu scores:', _[3])
+    else:
+        bleu_scores = 0
+    scores = opt.cider_reward_weight * cider_scores + opt.bleu_reward_weight * bleu_scores
 
     scores = scores[:batch_size] - scores[batch_size:]

diff --git a/opts.py b/opts.py
index 326da6ab..3b2c0796 100644
--- a/opts.py
+++ b/opts.py
@@ -105,6 +105,13 @@ def parse_opt():
     parser.add_argument('--train_only', type=int, default=0,
                     help='if true then use 80k, else use 110k')
 
+
+    # Reward
+    parser.add_argument('--cider_reward_weight', type=float, default=1,
+                    help='The reward weight from cider')
+    parser.add_argument('--bleu_reward_weight', type=float, default=0,
+                    help='The reward weight from bleu4')
+
     args = parser.parse_args()
 
     # Check if args are valid

diff --git a/train.py b/train.py
index 1b6dc705..ebbc4af9 100644
--- a/train.py
+++ b/train.py
@@ -18,7 +18,7 @@
 from dataloader import *
 import eval_utils
 import misc.utils as utils
-from misc.rewards import init_cider_scorer, get_self_critical_reward
+from misc.rewards import init_scorer, get_self_critical_reward
 
 try:
     import tensorflow as tf
@@ -101,7 +101,7 @@ def train(opt):
             # If start self critical training
             if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                 sc_flag = True
-                init_cider_scorer(opt.cached_tokens)
+                init_scorer(opt.cached_tokens)
             else:
                 sc_flag = False
@@ -124,7 +124,7 @@ def train(opt):
             loss = crit(model(fc_feats, att_feats, labels, att_masks), labels[:,1:], masks[:,1:])
         else:
             gen_result, sample_logprobs = model.sample(fc_feats, att_feats, att_masks, {'sample_max':0})
-            reward = get_self_critical_reward(model, fc_feats, att_feats, data, gen_result)
+            reward = get_self_critical_reward(model, fc_feats, att_feats, data, gen_result, opt)
             loss = rl_crit(sample_logprobs, gen_result, Variable(torch.from_numpy(reward).float().cuda(), requires_grad=False))
 
         loss.backward()

From af8b7d760f3b3fbb8dbc58810f2cc8d50422e771 Mon Sep 17 00:00:00 2001
From: rluo
Date: Mon, 16 Oct 2017 13:28:38 -0500
Subject: [PATCH 08/42] Add att2all.

---
 models/AttModel.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 models/__init__.py |  2 ++
 opts.py            |  2 +-
 3 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/models/AttModel.py b/models/AttModel.py
index 1be16583..93ac1aa0 100644
--- a/models/AttModel.py
+++ b/models/AttModel.py
@@ -471,6 +471,47 @@ def forward(self, xt, fc_feats, att_feats, p_att_feats, state, att_masks=None):
         state = (next_h.unsqueeze(0), next_c.unsqueeze(0))
         return output, state
 
+class Att2all2Core(nn.Module):
+    def __init__(self, opt):
+        super(Att2all2Core, self).__init__()
+        self.input_encoding_size = opt.input_encoding_size
+        #self.rnn_type = opt.rnn_type
+        self.rnn_size = opt.rnn_size
+        #self.num_layers = opt.num_layers
+        self.drop_prob_lm = opt.drop_prob_lm
+        self.fc_feat_size = opt.fc_feat_size
+        self.att_feat_size = opt.att_feat_size
+        self.att_hid_size = opt.att_hid_size
+
+        # Build a LSTM
+        self.a2h = nn.Linear(self.rnn_size, 5 * self.rnn_size)
+        self.i2h = nn.Linear(self.input_encoding_size, 5 * self.rnn_size)
+        self.h2h = nn.Linear(self.rnn_size, 5 * self.rnn_size)
+        self.dropout = nn.Dropout(self.drop_prob_lm)
+
+        self.attention = Attention(opt)
+
+    def forward(self, xt, fc_feats, att_feats, p_att_feats, state, att_masks=None):
+        att_res = self.attention(state[0][-1], att_feats, p_att_feats, att_masks)
+
+        all_input_sums = self.i2h(xt) + self.h2h(state[0][-1]) + self.a2h(att_res)
+        sigmoid_chunk = all_input_sums.narrow(1, 0, 3 * self.rnn_size)
+        sigmoid_chunk = F.sigmoid(sigmoid_chunk)
+        in_gate = sigmoid_chunk.narrow(1, 0, self.rnn_size)
+        forget_gate = sigmoid_chunk.narrow(1, self.rnn_size, self.rnn_size)
+        out_gate = sigmoid_chunk.narrow(1, self.rnn_size * 2, self.rnn_size)
+
+        in_transform = all_input_sums.narrow(1, 3 * self.rnn_size, 2 * self.rnn_size)
+        in_transform = torch.max(\
+            in_transform.narrow(1, 0, self.rnn_size),
+            in_transform.narrow(1, self.rnn_size, self.rnn_size))
+        next_c = forget_gate * state[1][-1] + in_gate * in_transform
+        next_h = out_gate * F.tanh(next_c)
+
+        output = self.dropout(next_h)
+        state = (next_h.unsqueeze(0), next_c.unsqueeze(0))
+        return output, state
+
 class AdaAttModel(AttModel):
     def __init__(self, opt):
         super(AdaAttModel, self).__init__(opt)
@@ -489,6 +530,13 @@ def __init__(self, opt):
         delattr(self, 'fc_embed')
         self.fc_embed = lambda x : x
 
+class Att2all2Model(AttModel):
+    def __init__(self, opt):
+        super(Att2all2Model, self).__init__(opt)
+        self.core = Att2all2Core(opt)
+        delattr(self, 'fc_embed')
+        self.fc_embed = lambda x : x
+
 class TopDownModel(AttModel):
     def __init__(self, opt):
         super(TopDownModel, self).__init__(opt)

diff --git a/models/__init__.py b/models/__init__.py
index 2467622f..5db63943 100644
--- a/models/__init__.py
+++ b/models/__init__.py
@@ -25,6 +25,8 @@ def setup(opt):
     # Att2in model with two-layer MLP img embedding and word embedding
     elif opt.caption_model == 'att2in2':
         model = Att2in2Model(opt)
+    elif opt.caption_model == 'att2all2':
+        model = Att2all2Model(opt)
     # Adaptive Attention model from Knowing when to look
elif opt.caption_model == 'adaatt': model = AdaAttModel(opt) diff --git a/opts.py b/opts.py index 3b2c0796..2eac84e0 100644 --- a/opts.py +++ b/opts.py @@ -23,7 +23,7 @@ def parse_opt(): # Model settings parser.add_argument('--caption_model', type=str, default="show_tell", - help='show_tell, show_attend_tell, all_img, fc, att2in, att2in2, adaatt, adaattmo, topdown') + help='show_tell, show_attend_tell, all_img, fc, att2in, att2in2, att2all2, adaatt, adaattmo, topdown') parser.add_argument('--rnn_size', type=int, default=512, help='size of the rnn in number of hidden nodes in each layer') parser.add_argument('--num_layers', type=int, default=1, From 7e72bea839547db89cf555768d209f468f7be3ce Mon Sep 17 00:00:00 2001 From: rluo Date: Tue, 17 Oct 2017 20:31:11 -0500 Subject: [PATCH 09/42] Simplify language model criterion. --- misc/utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/misc/utils.py b/misc/utils.py index fd676b0c..cf03ed92 100644 --- a/misc/utils.py +++ b/misc/utils.py @@ -58,10 +58,8 @@ def forward(self, input, target, mask): # truncate to the same size target = target[:, :input.size(1)] mask = mask[:, :input.size(1)] - input = to_contiguous(input).view(-1, input.size(2)) - target = to_contiguous(target).view(-1, 1) - mask = to_contiguous(mask).view(-1, 1) - output = - input.gather(1, target) * mask + + output = -input.gather(2, target.unsqueeze(2)).squeeze(2) * mask output = torch.sum(output) / torch.sum(mask) return output From ff3cfae418fc645b35c745752845dff28061a69a Mon Sep 17 00:00:00 2001 From: rluo Date: Tue, 17 Oct 2017 20:18:59 -0500 Subject: [PATCH 10/42] Allow parallel training. --- eval_utils.py | 2 +- misc/rewards.py | 6 +++--- models/AttModel.py | 43 +++++++++++++++++++++++++----------------- models/CaptionModel.py | 6 ++++++ train.py | 16 ++++++++-------- 5 files changed, 44 insertions(+), 29 deletions(-) diff --git a/eval_utils.py b/eval_utils.py index 7cf1ac3a..f5e3a423 100644 --- a/eval_utils.py +++ b/eval_utils.py @@ -97,7 +97,7 @@ def eval_split(model, crit, loader, eval_kwargs={}): tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp] fc_feats, att_feats, att_masks = tmp # forward the model to also get generated samples for each image - seq, _ = model.sample(fc_feats, att_feats, att_masks, eval_kwargs) + seq = model(fc_feats, att_feats, att_masks, opt=eval_kwargs, mode='sample')[0].data # Print beam search if beam_size > 1 and verbose_beam: diff --git a/misc/rewards.py b/misc/rewards.py index 873e799a..000f80ab 100644 --- a/misc/rewards.py +++ b/misc/rewards.py @@ -38,12 +38,12 @@ def get_self_critical_reward(model, fc_feats, att_feats, data, gen_result, opt): seq_per_img = batch_size // len(data['gts']) # get greedy decoding baseline - greedy_res, _ = model.sample(Variable(fc_feats.data, volatile=True), Variable(att_feats.data, volatile=True)) + greedy_res, _ = model(Variable(fc_feats.data, volatile=True), Variable(att_feats.data, volatile=True), mode='sample') res = OrderedDict() - gen_result = gen_result.cpu().numpy() - greedy_res = greedy_res.cpu().numpy() + gen_result = gen_result.data.cpu().numpy() + greedy_res = greedy_res.data.cpu().numpy() for i in range(batch_size): res[i] = [array_to_str(gen_result[i])] for i in range(batch_size): diff --git a/models/AttModel.py b/models/AttModel.py index 93ac1aa0..0e88ccfb 100644 --- a/models/AttModel.py +++ b/models/AttModel.py @@ -57,11 +57,12 @@ def init_hidden(self, bsz): return (Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_()), 
Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_())) - def forward(self, fc_feats, att_feats, seq, att_masks=None): + def _forward(self, fc_feats, att_feats, seq, att_masks=None): batch_size = fc_feats.size(0) state = self.init_hidden(batch_size) - outputs = [] + # outputs = [] + outputs = Variable(fc_feats.data.new(batch_size, seq.size(1) - 1, self.vocab_size+1).zero_()) # embed fc and att feats fc_feats = self.fc_embed(fc_feats) @@ -83,7 +84,8 @@ def forward(self, fc_feats, att_feats, seq, att_masks=None): it = seq[:, i].data.clone() #prob_prev = torch.exp(outputs[-1].data.index_select(0, sample_ind)) # fetch prev distribution: shape Nx(M+1) #it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1)) - prob_prev = torch.exp(outputs[-1].data) # fetch prev distribution: shape Nx(M+1) + # prob_prev = torch.exp(outputs[-1].data) # fetch prev distribution: shape Nx(M+1) + prob_prev = torch.exp(outputs[:, i-1].data) # fetch prev distribution: shape Nx(M+1) it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1).index_select(0, sample_ind)) it = Variable(it, requires_grad=False) else: @@ -96,9 +98,11 @@ def forward(self, fc_feats, att_feats, seq, att_masks=None): output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state, att_masks) output = F.log_softmax(self.logit(output)) - outputs.append(output) + outputs[:, i] = output + # outputs.append(output) - return torch.cat([_.unsqueeze(1) for _ in outputs], 1) + return outputs + # return torch.cat([_.unsqueeze(1) for _ in outputs], 1) def get_logprobs_state(self, it, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, tmp_att_masks, state): # 'it' is Variable contraining a word index @@ -109,7 +113,7 @@ def get_logprobs_state(self, it, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, t return logprobs, state - def sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}): + def _sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}): beam_size = opt.get('beam_size', 10) batch_size = fc_feats.size(0) @@ -147,14 +151,14 @@ def sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}): seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score seqLogprobs[:, k] = self.done_beams[k][0]['logps'] # return the samples and their log likelihoods - return seq.transpose(0, 1), seqLogprobs.transpose(0, 1) + return Variable(seq.transpose(0, 1)), Variable(seqLogprobs.transpose(0, 1)) - def sample(self, fc_feats, att_feats, att_masks=None, opt={}): + def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): sample_max = opt.get('sample_max', 1) beam_size = opt.get('beam_size', 1) temperature = opt.get('temperature', 1.0) if beam_size > 1: - return self.sample_beam(fc_feats, att_feats, att_masks, opt) + return self._sample_beam(fc_feats, att_feats, att_masks, opt) batch_size = fc_feats.size(0) state = self.init_hidden(batch_size) @@ -168,8 +172,10 @@ def sample(self, fc_feats, att_feats, att_masks=None, opt={}): p_att_feats = self.ctx2att(att_feats.view(-1, self.rnn_size)) p_att_feats = p_att_feats.view(*(att_feats.size()[:-1] + (self.att_hid_size,))) - seq = [] - seqLogprobs = [] + # seq = [] + # seqLogprobs = [] + seq = Variable(fc_feats.data.new(batch_size, self.seq_length).long().zero_()) + seqLogprobs = Variable(fc_feats.data.new(batch_size, self.seq_length).zero_()) for t in range(self.seq_length + 1): if t == 0: # input it = fc_feats.data.new(batch_size).long().zero_() @@ -178,11 +184,11 @@ def sample(self, fc_feats, att_feats, att_masks=None, opt={}): it = 
it.view(-1).long() else: if temperature == 1.0: - prob_prev = torch.exp(logprobs.data).cpu() # fetch prev distribution: shape Nx(M+1) + prob_prev = torch.exp(logprobs.data) # fetch prev distribution: shape Nx(M+1) else: # scale logprobs by temperature - prob_prev = torch.exp(torch.div(logprobs.data, temperature)).cpu() - it = torch.multinomial(prob_prev, 1).cuda() + prob_prev = torch.exp(torch.div(logprobs.data, temperature)) + it = torch.multinomial(prob_prev, 1) sampleLogprobs = logprobs.gather(1, Variable(it, requires_grad=False)) # gather the logprobs at sampled positions it = it.view(-1).long() # and flatten indices for downstream processing @@ -197,14 +203,17 @@ def sample(self, fc_feats, att_feats, att_masks=None, opt={}): if unfinished.sum() == 0: break it = it * unfinished.type_as(it) - seq.append(it) #seq[t] the input of t+2 time step + seq[:,t-1] = it + # seq.append(it) #seq[t] the input of t+2 time step - seqLogprobs.append(sampleLogprobs.view(-1)) + # seqLogprobs.append(sampleLogprobs.view(-1)) + seqLogprobs[:,t-1] = sampleLogprobs.view(-1) output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state, att_masks) logprobs = F.log_softmax(self.logit(output)) - return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1) + return seq, seqLogprobs + # return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1) class AdaAtt_lstm(nn.Module): def __init__(self, opt, use_maxout=True): diff --git a/models/CaptionModel.py b/models/CaptionModel.py index a4e5e10d..ef35fe21 100644 --- a/models/CaptionModel.py +++ b/models/CaptionModel.py @@ -24,6 +24,12 @@ def __init__(self): # calls beam_step and returns the final set of beams # augments log-probabilities with diversity terms when number of groups > 1 + def forward(self, *args, **kwargs): + mode = kwargs.get('mode', 'forward') + if 'mode' in kwargs: + del kwargs['mode'] + return getattr(self, '_'+mode)(*args, **kwargs) + def beam_search(self, init_state, init_logprobs, *args, **kwargs): # function computes the similarity score to be augmented diff --git a/train.py b/train.py index ebbc4af9..c490a47f 100644 --- a/train.py +++ b/train.py @@ -66,12 +66,12 @@ def train(opt): if opt.load_best_score == 1: best_val_score = infos.get('best_val_score', None) - model = models.setup(opt) - model.cuda() + model = models.setup(opt).cuda() + dp_model = torch.nn.DataParallel(model) update_lr_flag = True # Assure in training mode - model.train() + dp_model.train() crit = utils.LanguageModelCriterion() rl_crit = utils.RewardCriterion() @@ -121,11 +121,11 @@ def train(opt): optimizer.zero_grad() if not sc_flag: - loss = crit(model(fc_feats, att_feats, labels, att_masks), labels[:,1:], masks[:,1:]) + loss = crit(dp_model(fc_feats, att_feats, labels, att_masks), labels[:,1:], masks[:,1:]) else: - gen_result, sample_logprobs = model.sample(fc_feats, att_feats, att_masks, {'sample_max':0}) - reward = get_self_critical_reward(model, fc_feats, att_feats, data, gen_result, opt) - loss = rl_crit(sample_logprobs, gen_result, Variable(torch.from_numpy(reward).float().cuda(), requires_grad=False)) + gen_result, sample_logprobs = dp_model(fc_feats, att_feats, att_masks, opt={'sample_max':0}, mode='sample') + reward = get_self_critical_reward(dp_model, fc_feats, att_feats, data, gen_result, opt) + loss = rl_crit(sample_logprobs, gen_result.data, Variable(torch.from_numpy(reward).float().cuda(), requires_grad=False)) loss.backward() utils.clip_gradient(optimizer, 
opt.grad_clip) @@ -166,7 +166,7 @@ def train(opt): eval_kwargs = {'split': 'val', 'dataset': opt.input_json} eval_kwargs.update(vars(opt)) - val_loss, predictions, lang_stats = eval_utils.eval_split(model, crit, loader, eval_kwargs) + val_loss, predictions, lang_stats = eval_utils.eval_split(dp_model, crit, loader, eval_kwargs) # Write validation result into summary if tf is not None: From 829773f1ee49640c7e3e5ac8bb77bc32b73b258a Mon Sep 17 00:00:00 2001 From: rluo Date: Wed, 25 Oct 2017 19:22:15 -0500 Subject: [PATCH 11/42] Add an option during evaluation to not report loss. --- eval.py | 2 ++ eval_utils.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/eval.py b/eval.py index 738c8867..d2f774e2 100644 --- a/eval.py +++ b/eval.py @@ -75,6 +75,8 @@ help='an id identifying this run/job. used only if language_eval = 1 for appending to intermediate files') parser.add_argument('--verbose_beam', type=int, default=1, help='if we need to print out all beam search beams.') +parser.add_argument('--verbose_loss', type=int, default=0, + help='if we need to calculate loss.') opt = parser.parse_args() diff --git a/eval_utils.py b/eval_utils.py index f5e3a423..2eec7214 100644 --- a/eval_utils.py +++ b/eval_utils.py @@ -59,6 +59,7 @@ def language_eval(dataset, preds, model_id, split): def eval_split(model, crit, loader, eval_kwargs={}): verbose = eval_kwargs.get('verbose', True) verbose_beam = eval_kwargs.get('verbose_beam', 1) + verbose_loss = eval_kwargs.get('verbose_loss', 1) num_images = eval_kwargs.get('num_images', eval_kwargs.get('val_images_use', -1)) split = eval_kwargs.get('split', 'val') lang_eval = eval_kwargs.get('language_eval', 0) @@ -79,7 +80,7 @@ def eval_split(model, crit, loader, eval_kwargs={}): data = loader.get_batch(split) n = n + loader.batch_size - if data.get('labels', None) is not None: + if data.get('labels', None) is not None and verbose_loss: # forward the model to get loss tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks'], data['att_masks']] tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp] From bb3bc8ad0e116126fd8624900fcac0b9ad34e26a Mon Sep 17 00:00:00 2001 From: rluo Date: Wed, 25 Oct 2017 19:26:04 -0500 Subject: [PATCH 12/42] Add ensemble. 
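
The ensemble decodes with an averaged distribution: at every step each member
model produces its own logits, the softmaxes are averaged across models, and
the log of the mean becomes that step's log-probabilities. A minimal sketch of
the scoring rule, matching the AttEnsemble code below (`output` is the list of
per-model core outputs):

    # average the per-model softmax distributions, then take the log
    logprobs = torch.stack([F.softmax(m.logit(output[i]))
                            for i, m in enumerate(self.models)], 2).mean(2).log()

Averaging in probability space (rather than averaging log-probabilities) keeps
a word alive as long as any member assigns it noticeable mass.
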
--- eval_ensemble.py | 144 ++++++++++++++++++ models/AttEnsemble.py | 340 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 484 insertions(+) create mode 100644 eval_ensemble.py create mode 100644 models/AttEnsemble.py diff --git a/eval_ensemble.py b/eval_ensemble.py new file mode 100644 index 00000000..dfdfddf8 --- /dev/null +++ b/eval_ensemble.py @@ -0,0 +1,144 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json +import numpy as np + +import time +import os +from six.moves import cPickle + +import opts +import models +from dataloader import * +from dataloaderraw import * +import eval_utils +import argparse +import misc.utils as utils +import torch + +# Input arguments and options +parser = argparse.ArgumentParser() +# Input paths +parser.add_argument('--ids', nargs='+', required=True, help='id of the models to ensemble') +# parser.add_argument('--models', nargs='+', required=True +# help='path to model to evaluate') +# parser.add_argument('--infos_paths', nargs='+', required=True, help='path to infos to evaluate') +# Basic options +parser.add_argument('--batch_size', type=int, default=0, + help='if > 0 then overrule, otherwise load from checkpoint.') +parser.add_argument('--num_images', type=int, default=-1, + help='how many images to use when periodically evaluating the loss? (-1 = all)') +parser.add_argument('--language_eval', type=int, default=0, + help='Evaluate language as well (1 = yes, 0 = no)? BLEU/CIDEr/METEOR/ROUGE_L? requires coco-caption code from Github.') +parser.add_argument('--dump_images', type=int, default=1, + help='Dump images into vis/imgs folder for vis? (1=yes,0=no)') +parser.add_argument('--dump_json', type=int, default=1, + help='Dump json with predictions into vis folder? (1=yes,0=no)') +parser.add_argument('--dump_path', type=int, default=0, + help='Write image paths along with predictions into vis json? (1=yes,0=no)') + +# Sampling options +parser.add_argument('--sample_max', type=int, default=1, + help='1 = sample argmax words. 0 = sample from distributions.') +parser.add_argument('--beam_size', type=int, default=2, + help='used when sample_max = 1, indicates number of beams in beam search. Usually 2 or 3 works well. More is not better. Set this to 1 for faster runtime but a bit worse performance.') +parser.add_argument('--group_size', type=int, default=1, + help='used for diverse beam search. if group_size is 1, then it\'s normal beam search') +parser.add_argument('--diversity_lambda', type=float, default=0.5, + help='used for diverse beam search. Usually from 0.2 to 0.8. Higher value of lambda produces a more diverse list') +parser.add_argument('--temperature', type=float, default=1.0, + help='temperature when sampling from distributions (i.e. when sample_max = 0). 
Lower = "safer" predictions.') +# For evaluation on a folder of images: +parser.add_argument('--image_folder', type=str, default='', + help='If this is nonempty then will predict on the images in this folder path') +parser.add_argument('--image_root', type=str, default='', + help='In case the image paths have to be preprended with a root path to an image folder') +# For evaluation on MSCOCO images from some split: +parser.add_argument('--input_fc_dir', type=str, default='', + help='path to the h5file containing the preprocessed dataset') +parser.add_argument('--input_att_dir', type=str, default='', + help='path to the h5file containing the preprocessed dataset') +parser.add_argument('--input_label_h5', type=str, default='', + help='path to the h5file containing the preprocessed dataset') +parser.add_argument('--input_json', type=str, default='', + help='path to the json file containing additional info and vocab. empty = fetch from model checkpoint.') +parser.add_argument('--split', type=str, default='test', + help='if running on MSCOCO images, which split to use: val|test|train') +parser.add_argument('--coco_json', type=str, default='', + help='if nonempty then use this file in DataLoaderRaw (see docs there). Used only in MSCOCO test evaluation, where we have a specific json file of only test set images.') +parser.add_argument('--seq_length', type=int, default=40, + help='maximum sequence length during sampling') +# misc +parser.add_argument('--id', type=str, default='', + help='an id identifying this run/job. used only if language_eval = 1 for appending to intermediate files') +parser.add_argument('--verbose_beam', type=int, default=1, + help='if we need to print out all beam search beams.') +parser.add_argument('--verbose_loss', type=int, default=0, + help='If calculate loss using ground truth during evaluation') + +opt = parser.parse_args() + +model_infos = [cPickle.load(open('log_%s/infos_%s-best.pkl' %(id, id))) for id in opt.ids] +model_paths = ['log_%s/model-best.pth' %(id) for id in opt.ids] + +# Load one infos +infos = model_infos[0] + +# override and collect parameters +if len(opt.input_fc_dir) == 0: + opt.input_fc_dir = infos['opt'].input_fc_dir + opt.input_att_dir = infos['opt'].input_att_dir + opt.input_label_h5 = infos['opt'].input_label_h5 +if len(opt.input_json) == 0: + opt.input_json = infos['opt'].input_json +if opt.batch_size == 0: + opt.batch_size = infos['opt'].batch_size +if len(opt.id) == 0: + opt.id = infos['opt'].id +opt.seq_per_img = infos['opt'].seq_per_img + +vocab = infos['vocab'] # ix -> word mapping + +# Setup the model +from models.AttEnsemble import AttEnsemble + +_models = [] +for i in range(len(model_infos)): + model_infos[i]['opt'].start_from = None + tmp = models.setup(model_infos[i]['opt']) + tmp.load_state_dict(torch.load(model_paths[i])) + tmp.cuda() + tmp.eval() + _models.append(tmp) + +model = AttEnsemble(_models) +model.seq_length = opt.seq_length +model.eval() +crit = utils.LanguageModelCriterion() + +# Create the Data Loader instance +if len(opt.image_folder) == 0: + loader = DataLoader(opt) +else: + loader = DataLoaderRaw({'folder_path': opt.image_folder, + 'coco_json': opt.coco_json, + 'batch_size': opt.batch_size, + 'cnn_model': opt.cnn_model}) +# When eval using provided pretrained model, the vocab may be different from what you have in your cocotalk.json +# So make sure to use the vocab in infos file. 
+loader.ix_to_word = infos['vocab'] + + +# Set sample options +loss, split_predictions, lang_stats = eval_utils.eval_split(model, crit, loader, + vars(opt)) + +print('loss: ', loss) +if lang_stats: + print(lang_stats) + +if opt.dump_json == 1: + # dump the json + json.dump(split_predictions, open('vis/vis.json', 'w')) diff --git a/models/AttEnsemble.py b/models/AttEnsemble.py new file mode 100644 index 00000000..71d3ef4d --- /dev/null +++ b/models/AttEnsemble.py @@ -0,0 +1,340 @@ +# This file contains Att2in2, AdaAtt, AdaAttMO, TopDown model + +# AdaAtt is from Knowing When to Look: Adaptive Attention via A Visual Sentinel for Image Captioning +# https://arxiv.org/abs/1612.01887 +# AdaAttMO is a modified version with maxout lstm + +# Att2in is from Self-critical Sequence Training for Image Captioning +# https://arxiv.org/abs/1612.00563 +# In this file we only have Att2in2, which is a slightly different version of att2in, +# in which the img feature embedding and word embedding is the same as what in adaatt. + +# TopDown is from Bottom-Up and Top-Down Attention for Image Captioning and VQA +# https://arxiv.org/abs/1707.07998 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import * +import misc.utils as utils + +from .CaptionModel import CaptionModel + +class AttEnsemble(CaptionModel): + def __init__(self, models): + super(AttEnsemble, self).__init__() + + self.models = nn.ModuleList(models) + self.vocab_size = models[0].vocab_size + self.seq_length = models[0].seq_length + self.ss_prob = 0 + + def init_hidden(self, batch_size): + return [m.init_hidden(batch_size) for m in self.models] + + def embed(self, it): + return [m.embed(it) for m in self.models] + + def core(self, *args): + return zip(*[m.core(*_) for m, _ in zip(self.models, zip(*args))]) + + def get_logprobs_state(self, it, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, tmp_att_masks, state): + # 'it' is Variable contraining a word index + xt = self.embed(Variable(it, requires_grad=False)) + + output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, state, tmp_att_masks) + logprobs = torch.stack([F.softmax(m.logit(output[i])) for i,m in enumerate(self.models)], 2).mean(2).log() + + return logprobs, state + + def _forward(self, fc_feats, att_feats, seq, att_masks=None): + batch_size = fc_feats.size(0) + state = self.init_hidden(batch_size) + + # outputs = [] + outputs = Variable(fc_feats.data.new(batch_size, seq.size(1) - 1, self.vocab_size+1).zero_()) + + # embed fc and att feats + fc_feats = [m.fc_embed(fc_feats) for m in self.models] + _att_feats = [m.att_embed(att_feats.view(-1, m.att_feat_size)) for m in self.models] + att_feats = [_att_feats[i].view(*(att_feats.size()[:-1] + (m.rnn_size,))) for i,m in enumerate(self.models)] + + # Project the attention feats first to reduce memory and computation comsumptions. 
+ p_att_feats = [m.ctx2att(att_feats[i].view(-1, m.rnn_size)) for i, m in enumerate(self.models)] + p_att_feats = [p_att_feats[i].view(*(att_feats[i].size()[:-1] + (m.att_hid_size,))) for i,m in enumerate(self.models)] + + for i in range(seq.size(1) - 1): + if self.training and i >= 1 and self.ss_prob > 0.0: # otherwiste no need to sample + sample_prob = fc_feats.data.new(batch_size).uniform_(0, 1) + sample_mask = sample_prob < self.ss_prob + if sample_mask.sum() == 0: + it = seq[:, i].clone() + else: + sample_ind = sample_mask.nonzero().view(-1) + it = seq[:, i].data.clone() + #prob_prev = torch.exp(outputs[-1].data.index_select(0, sample_ind)) # fetch prev distribution: shape Nx(M+1) + #it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1)) + # prob_prev = torch.exp(outputs[-1].data) # fetch prev distribution: shape Nx(M+1) + prob_prev = torch.exp(outputs[:, i-1].data) # fetch prev distribution: shape Nx(M+1) + it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1).index_select(0, sample_ind)) + it = Variable(it, requires_grad=False) + else: + it = seq[:, i].clone() + # break if all the sequences end + if i >= 1 and seq[:, i].data.sum() == 0: + break + + xt = self.embed(it) + + output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state, att_masks) + output = torch.stack([F.softmax(m.logit(output[i])) for i,m in enumerate(self.models)], 2).mean(2).log() + outputs[:, i] = output + # outputs.append(output) + + return outputs + # return torch.cat([_.unsqueeze(1) for _ in outputs], 1) + + def _sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}): + beam_size = opt.get('beam_size', 10) + batch_size = fc_feats.size(0) + + # embed fc and att feats + fc_feats = [m.fc_embed(fc_feats) for m in self.models] + _att_feats = [m.att_embed(att_feats.view(-1, m.att_feat_size)) for m in self.models] + att_feats = [_att_feats[i].view(*(att_feats.size()[:-1] + (m.rnn_size,))) for i,m in enumerate(self.models)] + + # Project the attention feats first to reduce memory and computation comsumptions. + p_att_feats = [m.ctx2att(att_feats[i].view(-1, m.rnn_size)) for i, m in enumerate(self.models)] + p_att_feats = [p_att_feats[i].view(*(att_feats[i].size()[:-1] + (m.att_hid_size,))) for i,m in enumerate(self.models)] + + assert beam_size <= self.vocab_size + 1, 'lets assume this for now, otherwise this corner case causes a few headaches down the road. 
can be dealt with in future if needed' + seq = torch.LongTensor(self.seq_length, batch_size).zero_() + seqLogprobs = torch.FloatTensor(self.seq_length, batch_size) + # lets process every image independently for now, for simplicity + + self.done_beams = [[] for _ in range(batch_size)] + for k in range(batch_size): + state = self.init_hidden(beam_size) + tmp_fc_feats = [fc_feats[i][k:k+1].expand(beam_size, fc_feats[i].size(1)) for i,m in enumerate(self.models)] + tmp_att_feats = [att_feats[i][k:k+1].expand(*((beam_size,)+att_feats[i].size()[1:])).contiguous() for i,m in enumerate(self.models)] + tmp_p_att_feats = [p_att_feats[i][k:k+1].expand(*((beam_size,)+p_att_feats[i].size()[1:])).contiguous() for i,m in enumerate(self.models)] + tmp_att_masks = [att_masks[k:k+1].expand(*((beam_size,)+att_masks.size()[1:])).contiguous() if att_masks is not None else None] * len(self.models) + + for t in range(1): + if t == 0: # input + it = fc_feats[0].data.new(beam_size).long().zero_() + xt = self.embed(Variable(it, requires_grad=False)) + + output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, state, tmp_att_masks) + logprobs = torch.stack([F.softmax(m.logit(output[i])) for i,m in enumerate(self.models)], 2).mean(2).log() + + self.done_beams[k] = self.beam_search(state, logprobs, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, tmp_att_masks, opt=opt) + seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score + seqLogprobs[:, k] = self.done_beams[k][0]['logps'] + # return the samples and their log likelihoods + return Variable(seq.transpose(0, 1)), Variable(seqLogprobs.transpose(0, 1)) + + def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): + sample_max = opt.get('sample_max', 1) + beam_size = opt.get('beam_size', 1) + temperature = opt.get('temperature', 1.0) + if beam_size > 1: + return self._sample_beam(fc_feats, att_feats, att_masks, opt) + + batch_size = fc_feats.size(0) + state = self.init_hidden(batch_size) + + # embed fc and att feats + fc_feats = [m.fc_embed(fc_feats) for m in self.models] + _att_feats = [m.att_embed(att_feats.view(-1, m.att_feat_size)) for m in self.models] + att_feats = [_att_feats[i].view(*(att_feats.size()[:-1] + (m.rnn_size,))) for i,m in enumerate(self.models)] + + # Project the attention feats first to reduce memory and computation comsumptions. 
+ p_att_feats = [m.ctx2att(att_feats[i].view(-1, m.rnn_size)) for i, m in enumerate(self.models)] + p_att_feats = [p_att_feats[i].view(*(att_feats[i].size()[:-1] + (m.att_hid_size,))) for i,m in enumerate(self.models)] + + # seq = [] + # seqLogprobs = [] + seq = Variable(fc_feats.data.new(batch_size, self.seq_length).long().zero_()) + seqLogprobs = Variable(fc_feats.data.new(batch_size, self.seq_length).zero_()) + for t in range(self.seq_length + 1): + if t == 0: # input + it = fc_feats[0].data.new(batch_size).long().zero_() + elif sample_max: + sampleLogprobs, it = torch.max(logprobs.data, 1) + it = it.view(-1).long() + else: + if temperature == 1.0: + prob_prev = torch.exp(logprobs.data) # fetch prev distribution: shape Nx(M+1) + else: + # scale logprobs by temperature + prob_prev = torch.exp(torch.div(logprobs.data, temperature)) + it = torch.multinomial(prob_prev, 1) + sampleLogprobs = logprobs.gather(1, Variable(it, requires_grad=False)) # gather the logprobs at sampled positions + it = it.view(-1).long() # and flatten indices for downstream processing + + xt = self.embed(Variable(it, requires_grad=False)) + + if t >= 1: + # stop when all finished + if t == 1: + unfinished = it > 0 + else: + unfinished = unfinished * (it > 0) + if unfinished.sum() == 0: + break + it = it * unfinished.type_as(it) + seq[:,t-1] = it + # seq.append(it) #seq[t] the input of t+2 time step + + # seqLogprobs.append(sampleLogprobs.view(-1)) + seqLogprobs[:,t-1] = sampleLogprobs.view(-1) + + output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state, [att_masks] * len(self.models)) + logprobs = torch.stack([F.softmax(m.logit(output[i])) for i,m in enumerate(self.models)], 2).mean(2).log() + + return seq, seqLogprobs + # return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1) + + def beam_search(self, init_state, init_logprobs, *args, **kwargs): + + # function computes the similarity score to be augmented + def add_diversity(beam_seq_table, logprobsf, t, divm, diversity_lambda, bdash): + local_time = t - divm + unaug_logprobsf = logprobsf.clone() + for prev_choice in range(divm): + prev_decisions = beam_seq_table[prev_choice][local_time] + for sub_beam in range(bdash): + for prev_labels in range(bdash): + logprobsf[sub_beam][prev_decisions[prev_labels]] = logprobsf[sub_beam][prev_decisions[prev_labels]] - diversity_lambda + return unaug_logprobsf + + # does one step of classical beam search + + def beam_step(logprobsf, unaug_logprobsf, beam_size, t, beam_seq, beam_seq_logprobs, beam_logprobs_sum, state): + #INPUTS: + #logprobsf: probabilities augmented after diversity + #beam_size: obvious + #t : time instant + #beam_seq : tensor contanining the beams + #beam_seq_logprobs: tensor contanining the beam logprobs + #beam_logprobs_sum: tensor contanining joint logprobs + #OUPUTS: + #beam_seq : tensor containing the word indices of the decoded captions + #beam_seq_logprobs : log-probability of each decision made, same size as beam_seq + #beam_logprobs_sum : joint log-probability of each beam + + ys,ix = torch.sort(logprobsf,1,True) + candidates = [] + cols = min(beam_size, ys.size(1)) + rows = beam_size + if t == 0: + rows = 1 + for c in range(cols): # for each column (word, essentially) + for q in range(rows): # for each beam expansion + #compute logprob of expanding beam q with word in (sorted) position c + local_logprob = ys[q,c] + candidate_logprob = beam_logprobs_sum[q] + local_logprob + local_unaug_logprob = unaug_logprobsf[q,ix[q,c]] + 
candidates.append({'c':ix[q,c], 'q':q, 'p':candidate_logprob, 'r':local_unaug_logprob}) + candidates = sorted(candidates, key=lambda x: -x['p']) + + new_state = [[_.clone() for _ in state_] for state_ in state] + #beam_seq_prev, beam_seq_logprobs_prev + if t >= 1: + #we''ll need these as reference when we fork beams around + beam_seq_prev = beam_seq[:t].clone() + beam_seq_logprobs_prev = beam_seq_logprobs[:t].clone() + for vix in range(beam_size): + v = candidates[vix] + #fork beam index q into index vix + if t >= 1: + beam_seq[:t, vix] = beam_seq_prev[:, v['q']] + beam_seq_logprobs[:t, vix] = beam_seq_logprobs_prev[:, v['q']] + #rearrange recurrent states + for ii in range(len(new_state)): + for state_ix in range(len(new_state[ii])): + # copy over state in previous beam q to new beam at vix + new_state[ii][state_ix][:, vix] = state[ii][state_ix][:, v['q']] # dimension one is time step + #append new end terminal at the end of this beam + beam_seq[t, vix] = v['c'] # c'th word is the continuation + beam_seq_logprobs[t, vix] = v['r'] # the raw logprob here + beam_logprobs_sum[vix] = v['p'] # the new (sum) logprob along this beam + state = new_state + return beam_seq,beam_seq_logprobs,beam_logprobs_sum,state,candidates + + # Start diverse_beam_search + opt = kwargs['opt'] + beam_size = opt.get('beam_size', 10) + group_size = opt.get('group_size', 1) + diversity_lambda = opt.get('diversity_lambda', 0.5) + bdash = beam_size // group_size # beam per group + + # INITIALIZATIONS + beam_seq_table = [torch.LongTensor(self.seq_length, bdash).zero_() for _ in range(group_size)] + beam_seq_logprobs_table = [torch.FloatTensor(self.seq_length, bdash).zero_() for _ in range(group_size)] + beam_logprobs_sum_table = [torch.zeros(bdash) for _ in range(group_size)] + + # logprobs # logprobs predicted in last time step, shape (beam_size, vocab_size+1) + done_beams_table = [[] for _ in range(group_size)] + state_table = zip(*[[list(torch.unbind(_)) for _ in torch.stack(init_state_).chunk(group_size, 2)] for init_state_ in init_state]) + logprobs_table = list(init_logprobs.chunk(group_size, 0)) + # END INIT + + # Chunk elements in the args + args = [[_.chunk(group_size) for _ in args_] for args_ in args] # arg_name, model_name, group_name + args = [[[args[j][i][k] for i in range(len(self.models))] for j in range(len(args))] for k in range(group_size)] # group_name, arg_name, model_name + + for t in range(self.seq_length + group_size - 1): + for divm in range(group_size): + if t >= divm and t <= self.seq_length + divm - 1: + # add diversity + logprobsf = logprobs_table[divm].data.float() + # suppress UNK tokens in the decoding + logprobsf[:,logprobsf.size(1)-1] = logprobsf[:, logprobsf.size(1)-1] - 1000 + # diversity is added here + # the function directly modifies the logprobsf values and hence, we need to return + # the unaugmented ones for sorting the candidates in the end. # for historical + # reasons :-) + unaug_logprobsf = add_diversity(beam_seq_table,logprobsf,t,divm,diversity_lambda,bdash) + + # infer new beams + beam_seq_table[divm],\ + beam_seq_logprobs_table[divm],\ + beam_logprobs_sum_table[divm],\ + state_table[divm],\ + candidates_divm = beam_step(logprobsf, + unaug_logprobsf, + bdash, + t-divm, + beam_seq_table[divm], + beam_seq_logprobs_table[divm], + beam_logprobs_sum_table[divm], + state_table[divm]) + + # if time's up... 
or if end token is reached then copy beams + for vix in range(bdash): + if beam_seq_table[divm][t-divm,vix] == 0 or t == self.seq_length + divm - 1: + final_beam = { + 'seq': beam_seq_table[divm][:, vix].clone(), + 'logps': beam_seq_logprobs_table[divm][:, vix].clone(), + 'unaug_p': beam_seq_logprobs_table[divm][:, vix].sum(), + 'p': beam_logprobs_sum_table[divm][vix] + } + done_beams_table[divm].append(final_beam) + # don't continue beams from finished sequences + beam_logprobs_sum_table[divm][vix] = -1000 + + # move the current group one step forward in time + + it = beam_seq_table[divm][t-divm] + logprobs_table[divm], state_table[divm] = self.get_logprobs_state(it.cuda(), *(args[divm] + [state_table[divm]])) + + # all beams are sorted by their log-probabilities + done_beams_table = [sorted(done_beams_table[i], key=lambda x: -x['p'])[:bdash] for i in range(group_size)] + done_beams = reduce(lambda a,b:a+b, done_beams_table) + return done_beams From e8bd5e57f8a0c8e570de239fcfe07317a254284b Mon Sep 17 00:00:00 2001 From: rluo Date: Wed, 25 Oct 2017 19:26:23 -0500 Subject: [PATCH 13/42] Add two models. --- models/AttModel.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++ models/__init__.py | 6 ++++ opts.py | 2 +- 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/models/AttModel.py b/models/AttModel.py index 0e88ccfb..72b846d9 100644 --- a/models/AttModel.py +++ b/models/AttModel.py @@ -406,6 +406,77 @@ def forward(self, xt, fc_feats, att_feats, p_att_feats, state, att_masks=None): return output, state + +from .FCModel import LSTMCore +class StackAttCore(nn.Module): + def __init__(self, opt, use_maxout=False): + super(StackAttCore, self).__init__() + self.drop_prob_lm = opt.drop_prob_lm + + # self.att0 = Attention(opt) + self.att1 = Attention(opt) + self.att2 = Attention(opt) + + opt_input_encoding_size = opt.input_encoding_size + opt.input_encoding_size = opt.input_encoding_size + opt.rnn_size + self.lstm0 = LSTMCore(opt) # att_feat + word_embedding + opt.input_encoding_size = opt.rnn_size * 2 + self.lstm1 = LSTMCore(opt) + self.lstm2 = LSTMCore(opt) + opt.input_encoding_size = opt_input_encoding_size + + # self.emb1 = nn.Linear(opt.rnn_size, opt.rnn_size) + self.emb2 = nn.Linear(opt.rnn_size, opt.rnn_size) + + def forward(self, xt, fc_feats, att_feats, p_att_feats, state, att_masks=None): + # att_res_0 = self.att0(state[0][-1], att_feats, p_att_feats, att_masks) + h_0, state_0 = self.lstm0(torch.cat([xt,fc_feats],1), [state[0][0:1], state[1][0:1]]) + att_res_1 = self.att1(h_0, att_feats, p_att_feats, att_masks) + h_1, state_1 = self.lstm1(torch.cat([h_0,att_res_1],1), [state[0][1:2], state[1][1:2]]) + att_res_2 = self.att2(h_1 + self.emb2(att_res_1), att_feats, p_att_feats, att_masks) + h_2, state_2 = self.lstm2(torch.cat([h_1,att_res_2],1), [state[0][2:3], state[1][2:3]]) + + return h_2, [torch.cat(_, 0) for _ in zip(state_0, state_1, state_2)] + +class DenseAttCore(nn.Module): + def __init__(self, opt, use_maxout=False): + super(DenseAttCore, self).__init__() + self.drop_prob_lm = opt.drop_prob_lm + + # self.att0 = Attention(opt) + self.att1 = Attention(opt) + self.att2 = Attention(opt) + + opt_input_encoding_size = opt.input_encoding_size + opt.input_encoding_size = opt.input_encoding_size + opt.rnn_size + self.lstm0 = LSTMCore(opt) # att_feat + word_embedding + opt.input_encoding_size = opt.rnn_size * 2 + self.lstm1 = LSTMCore(opt) + self.lstm2 = LSTMCore(opt) + opt.input_encoding_size = opt_input_encoding_size + + # self.emb1 = nn.Linear(opt.rnn_size, opt.rnn_size) + 
self.emb2 = nn.Linear(opt.rnn_size, opt.rnn_size) + + # fuse h_0 and h_1 + self.fusion1 = nn.Sequential(nn.Linear(opt.rnn_size*2, opt.rnn_size), + nn.ReLU(), + nn.Dropout(opt.drop_prob_lm)) + # fuse h_0, h_1 and h_2 + self.fusion2 = nn.Sequential(nn.Linear(opt.rnn_size*3, opt.rnn_size), + nn.ReLU(), + nn.Dropout(opt.drop_prob_lm)) + + def forward(self, xt, fc_feats, att_feats, p_att_feats, state, att_masks=None): + # att_res_0 = self.att0(state[0][-1], att_feats, p_att_feats, att_masks) + h_0, state_0 = self.lstm0(torch.cat([xt,fc_feats],1), [state[0][0:1], state[1][0:1]]) + att_res_1 = self.att1(h_0, att_feats, p_att_feats, att_masks) + h_1, state_1 = self.lstm1(torch.cat([h_0,att_res_1],1), [state[0][1:2], state[1][1:2]]) + att_res_2 = self.att2(h_1 + self.emb2(att_res_1), att_feats, p_att_feats, att_masks) + h_2, state_2 = self.lstm2(torch.cat([self.fusion1(torch.cat([h_0, h_1], 1)),att_res_2],1), [state[0][2:3], state[1][2:3]]) + + return self.fusion2(torch.cat([h_0, h_1, h_2], 1)), [torch.cat(_, 0) for _ in zip(state_0, state_1, state_2)] + class Attention(nn.Module): def __init__(self, opt): super(Attention, self).__init__() @@ -551,3 +622,15 @@ def __init__(self, opt): super(TopDownModel, self).__init__(opt) self.num_layers = 2 self.core = TopDownCore(opt) + +class StackAttModel(AttModel): + def __init__(self, opt): + super(StackAttModel, self).__init__(opt) + self.num_layers = 3 + self.core = StackAttCore(opt) + +class DenseAttModel(AttModel): + def __init__(self, opt): + super(DenseAttModel, self).__init__(opt) + self.num_layers = 3 + self.core = DenseAttCore(opt) diff --git a/models/__init__.py b/models/__init__.py index 5db63943..ef11b7d4 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -36,6 +36,12 @@ def setup(opt): # Top-down attention model elif opt.caption_model == 'topdown': model = TopDownModel(opt) + # StackAtt + elif opt.caption_model == 'stackatt': + model = StackAttModel(opt) + # DenseAtt + elif opt.caption_model == 'denseatt': + model = DenseAttModel(opt) else: raise Exception("Caption model not supported: {}".format(opt.caption_model)) diff --git a/opts.py b/opts.py index 2eac84e0..cf4a2135 100644 --- a/opts.py +++ b/opts.py @@ -23,7 +23,7 @@ def parse_opt(): # Model settings parser.add_argument('--caption_model', type=str, default="show_tell", - help='show_tell, show_attend_tell, all_img, fc, att2in, att2in2, att2all2, adaatt, adaattmo, topdown') + help='show_tell, show_attend_tell, all_img, fc, att2in, att2in2, att2all2, adaatt, adaattmo, topdown, stackatt, denseatt') parser.add_argument('--rnn_size', type=int, default=512, help='size of the rnn in number of hidden nodes in each layer') parser.add_argument('--num_layers', type=int, default=1, From 1eb84f792ea8a18839b53e8673163d646de78e2b Mon Sep 17 00:00:00 2001 From: rluo Date: Sun, 5 Nov 2017 22:32:36 -0600 Subject: [PATCH 14/42] Add decoding constraint of not generating same word in a row (doesn't seem to help.) --- eval.py | 2 ++ eval_ensemble.py | 2 ++ models/AttEnsemble.py | 12 +++++++++++- models/AttModel.py | 8 +++++++- models/CaptionModel.py | 4 ++++ 5 files changed, 26 insertions(+), 2 deletions(-) diff --git a/eval.py b/eval.py index d2f774e2..045d3355 100644 --- a/eval.py +++ b/eval.py @@ -52,6 +52,8 @@ help='used for diverse beam search. Usually from 0.2 to 0.8. Higher value of lambda produces a more diverse list') parser.add_argument('--temperature', type=float, default=1.0, help='temperature when sampling from distributions (i.e. when sample_max = 0). 
Lower = "safer" predictions.') +parser.add_argument('--decoding_constraint', type=int, default=0, + help='If 1, not allowing same word in a row') # For evaluation on a folder of images: parser.add_argument('--image_folder', type=str, default='', help='If this is nonempty then will predict on the images in this folder path') diff --git a/eval_ensemble.py b/eval_ensemble.py index dfdfddf8..28d84b50 100644 --- a/eval_ensemble.py +++ b/eval_ensemble.py @@ -50,6 +50,8 @@ help='used for diverse beam search. Usually from 0.2 to 0.8. Higher value of lambda produces a more diverse list') parser.add_argument('--temperature', type=float, default=1.0, help='temperature when sampling from distributions (i.e. when sample_max = 0). Lower = "safer" predictions.') +parser.add_argument('--decoding_constraint', type=int, default=0, + help='If 1, not allowing same word in a row') # For evaluation on a folder of images: parser.add_argument('--image_folder', type=str, default='', help='If this is nonempty then will predict on the images in this folder path') diff --git a/models/AttEnsemble.py b/models/AttEnsemble.py index 71d3ef4d..84753c87 100644 --- a/models/AttEnsemble.py +++ b/models/AttEnsemble.py @@ -142,6 +142,7 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): sample_max = opt.get('sample_max', 1) beam_size = opt.get('beam_size', 1) temperature = opt.get('temperature', 1.0) + decoding_constraint = opt.get('decoding_constraint', 0) if beam_size > 1: return self._sample_beam(fc_feats, att_feats, att_masks, opt) @@ -195,7 +196,12 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): seqLogprobs[:,t-1] = sampleLogprobs.view(-1) output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state, [att_masks] * len(self.models)) - logprobs = torch.stack([F.softmax(m.logit(output[i])) for i,m in enumerate(self.models)], 2).mean(2).log() + if decoding_constraint and t > 0: + tmp = output.data.new(output.size(0), self.vocab_size + 1).zero_() + tmp.scatter_(1, seq[:,t-1].data.unsqueeze(1), float('-inf')) + logprobs = torch.stack([F.softmax(m.logit(output[i]+Variable(tmp))) for i,m in enumerate(self.models)], 2).mean(2).log() + else: + logprobs = torch.stack([F.softmax(m.logit(output[i])) for i,m in enumerate(self.models)], 2).mean(2).log() return seq, seqLogprobs # return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1) @@ -272,6 +278,7 @@ def beam_step(logprobsf, unaug_logprobsf, beam_size, t, beam_seq, beam_seq_logpr beam_size = opt.get('beam_size', 10) group_size = opt.get('group_size', 1) diversity_lambda = opt.get('diversity_lambda', 0.5) + decoding_constraint = opt.get('decoding_constraint', 0) bdash = beam_size // group_size # beam per group # INITIALIZATIONS @@ -294,6 +301,9 @@ def beam_step(logprobsf, unaug_logprobsf, beam_size, t, beam_seq, beam_seq_logpr if t >= divm and t <= self.seq_length + divm - 1: # add diversity logprobsf = logprobs_table[divm].data.float() + # suppress previous word + if decoding_constraint and t-divm > 0: + logprobsf.scatter_(1, beam_seq_table[divm][t-divm-1].unsqueeze(1).cuda(), float('-inf')) # suppress UNK tokens in the decoding logprobsf[:,logprobsf.size(1)-1] = logprobsf[:, logprobsf.size(1)-1] - 1000 # diversity is added here diff --git a/models/AttModel.py b/models/AttModel.py index 72b846d9..7ef61299 100644 --- a/models/AttModel.py +++ b/models/AttModel.py @@ -157,6 +157,7 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): sample_max = opt.get('sample_max', 1) beam_size = 
opt.get('beam_size', 1) temperature = opt.get('temperature', 1.0) + decoding_constraint = opt.get('decoding_constraint', 0) if beam_size > 1: return self._sample_beam(fc_feats, att_feats, att_masks, opt) @@ -210,7 +211,12 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): seqLogprobs[:,t-1] = sampleLogprobs.view(-1) output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state, att_masks) - logprobs = F.log_softmax(self.logit(output)) + if decoding_constraint and t > 0: + tmp = output.data.new(output.size(0), self.vocab_size + 1).zero_() + tmp.scatter_(1, seq[:,t-1].data.unsqueeze(1), float('-inf')) + logprobs = F.log_softmax(self.logit(output)+Variable(tmp)) + else: + logprobs = F.log_softmax(self.logit(output)) return seq, seqLogprobs # return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1) diff --git a/models/CaptionModel.py b/models/CaptionModel.py index ef35fe21..1a54ec36 100644 --- a/models/CaptionModel.py +++ b/models/CaptionModel.py @@ -101,6 +101,7 @@ def beam_step(logprobsf, unaug_logprobsf, beam_size, t, beam_seq, beam_seq_logpr beam_size = opt.get('beam_size', 10) group_size = opt.get('group_size', 1) diversity_lambda = opt.get('diversity_lambda', 0.5) + decoding_constraint = opt.get('decoding_constraint', 0) bdash = beam_size // group_size # beam per group # INITIALIZATIONS @@ -124,6 +125,9 @@ def beam_step(logprobsf, unaug_logprobsf, beam_size, t, beam_seq, beam_seq_logpr if t >= divm and t <= self.seq_length + divm - 1: # add diversity logprobsf = logprobs_table[divm].data.float() + # suppress previous word + if decoding_constraint and t-divm > 0: + logprobsf.scatter_(1, beam_seq_table[divm][t-divm-1].unsqueeze(1).cuda(), float('-inf')) # suppress UNK tokens in the decoding logprobsf[:,logprobsf.size(1)-1] = logprobsf[:, logprobsf.size(1)-1] - 1000 # diversity is added here From 020aff852d3cc8161ce1b22a1dac83115d0a7a20 Mon Sep 17 00:00:00 2001 From: rluo Date: Sun, 5 Nov 2017 22:39:56 -0600 Subject: [PATCH 15/42] Dump image width and height into image info files. --- scripts/prepro_labels.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/prepro_labels.py b/scripts/prepro_labels.py index e85cef8d..ffde89f0 100644 --- a/scripts/prepro_labels.py +++ b/scripts/prepro_labels.py @@ -39,6 +39,7 @@ import torchvision.models as models from torch.autograd import Variable import skimage.io +from PIL import Image def build_vocab(imgs, params): count_thr = params['word_count_threshold'] @@ -171,6 +172,9 @@ def main(params): if 'filename' in img: jimg['file_path'] = os.path.join(img['filepath'], img['filename']) # copy it over, might need if 'cocoid' in img: jimg['id'] = img['cocoid'] # copy over & mantain an id, if present (e.g. 
coco ids, useful) + with Image.open(os.path.join(params['images_root'], img['filepath'], img['filename'])) as _img: + jimg['width'], jimg['height'] = _img.size + out['images'].append(jimg) json.dump(out, open(params['output_json'], 'w')) @@ -184,6 +188,7 @@ def main(params): parser.add_argument('--input_json', required=True, help='input json file to process into hdf5') parser.add_argument('--output_json', default='data.json', help='output json file') parser.add_argument('--output_h5', default='data', help='output h5 file') + parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json') # options parser.add_argument('--max_length', default=16, type=int, help='max length of a caption, in number of words. captions longer than this get clipped.') From 7d049432c201090fa58948e3ec492729beeb4170 Mon Sep 17 00:00:00 2001 From: rluo Date: Mon, 6 Nov 2017 01:40:33 -0600 Subject: [PATCH 16/42] Remove get_npy_data function. --- dataloader.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/dataloader.py b/dataloader.py index 510a7225..fc03b121 100644 --- a/dataloader.py +++ b/dataloader.py @@ -13,12 +13,6 @@ import multiprocessing -def get_npy_data(ix, fc_file, att_file, use_att): - if use_att == True: - return (np.load(fc_file), np.load(att_file)['feat'], ix) - else: - return (np.load(fc_file), np.zeros((1,1,1)), ix) - class DataLoader(data.Dataset): def reset_iterator(self, split): @@ -182,11 +176,9 @@ def __getitem__(self, index): """This function returns a tuple that is further passed to collate_fn """ ix = index #self.split_ix[index] - return get_npy_data(ix, \ - os.path.join(self.input_fc_dir, str(self.info['images'][ix]['id']) + '.npy'), - os.path.join(self.input_att_dir, str(self.info['images'][ix]['id']) + '.npz'), - self.use_att - ) + return (np.load(os.path.join(self.input_fc_dir, str(self.info['images'][ix]['id']) + '.npy')), + np.load(os.path.join(self.input_att_dir, str(self.info['images'][ix]['id']) + '.npz'))['feat'] if self.use_att else np.zeros((1,1,1)), + ix) def __len__(self): return len(self.info['images']) From 1103ff3e171d4916b0a2f3cf292830f789f4531f Mon Sep 17 00:00:00 2001 From: rluo Date: Mon, 6 Nov 2017 17:50:06 -0600 Subject: [PATCH 17/42] Add box feature. 
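
This relies on the width/height fields added to the image info json in the
previous commit. Each region's bounding box becomes a five-dimensional
geometry feature (corners normalized by image size, plus relative area) that
is concatenated onto its attention feature, so att_feat_size grows by 5 when
--use_box is on. A sketch of the computation, mirroring the dataloader change
below (`box_feat` starts as the raw (N, 4) box array):

    x1, y1, x2, y2 = np.hsplit(box_feat, 4)
    box_feat = np.hstack((x1/w, y1/h, x2/w, y2/h,
                          (x2 - x1) * (y2 - y1) / (w * h)))   # relative area
    att_feat = np.hstack([att_feat, box_feat])                # (N, 2048) -> (N, 2053)
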
---
 dataloader.py | 19 +++++++++++++++++--
 opts.py       |  4 ++++
 train.py      |  3 +++
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/dataloader.py b/dataloader.py
index fc03b121..47ec4cd0 100644
--- a/dataloader.py
+++ b/dataloader.py
@@ -33,7 +33,10 @@ def __init__(self, opt):
         self.opt = opt
         self.batch_size = self.opt.batch_size
         self.seq_per_img = opt.seq_per_img
+
+        # feature related options
         self.use_att = getattr(opt, 'use_att', True)
+        self.use_box = getattr(opt, 'use_box', 0)
 
         # load the json file which contains additional information about the dataset
         print('DataLoader loading json file: ', opt.input_json)
@@ -43,11 +46,12 @@ def __init__(self, opt):
         print('vocab size is ', self.vocab_size)
 
         # open the hdf5 file
-        print('DataLoader loading h5 file: ', opt.input_fc_dir, opt.input_att_dir, opt.input_label_h5)
+        print('DataLoader loading h5 file: ', opt.input_fc_dir, opt.input_att_dir, opt.input_box_dir, opt.input_label_h5)
         self.h5_label_file = h5py.File(self.opt.input_label_h5, 'r', driver='core')
 
         self.input_fc_dir = self.opt.input_fc_dir
         self.input_att_dir = self.opt.input_att_dir
+        self.input_box_dir = self.opt.input_box_dir
 
         # load in the sequence data
         seq_size = self.h5_label_file['labels'].shape
@@ -176,8 +180,19 @@ def __getitem__(self, index):
         """This function returns a tuple that is further passed to collate_fn
         """
         ix = index #self.split_ix[index]
+        if self.use_att:
+            att_feat = np.load(os.path.join(self.input_att_dir, str(self.info['images'][ix]['id']) + '.npz'))['feat']
+            if self.use_box:
+                box_feat = np.load(os.path.join(self.input_box_dir, str(self.info['images'][ix]['id']) + '.npy'))
+                # divided by image width and height
+                x1,y1,x2,y2 = np.hsplit(box_feat, 4)
+                h,w = self.info['images'][ix]['height'], self.info['images'][ix]['width']
+                box_feat = np.hstack((x1/w, y1/h, x2/w, y2/h, (x2-x1)*(y2-y1)/(w*h))) # question? x2-x1+1??
+ att_feat = np.hstack([att_feat, box_feat]) + else: + att_feat = np.zeros((1,1,1)) return (np.load(os.path.join(self.input_fc_dir, str(self.info['images'][ix]['id']) + '.npy')), - np.load(os.path.join(self.input_att_dir, str(self.info['images'][ix]['id']) + '.npz'))['feat'] if self.use_att else np.zeros((1,1,1)), + att_feat, ix) def __len__(self): diff --git a/opts.py b/opts.py index cf4a2135..f4cc80e2 100644 --- a/opts.py +++ b/opts.py @@ -9,6 +9,8 @@ def parse_opt(): help='path to the directory containing the preprocessed fc feats') parser.add_argument('--input_att_dir', type=str, default='data/cocotalk_att', help='path to the directory containing the preprocessed att feats') + parser.add_argument('--input_box_dir', type=str, default='data/cocotalk_box', + help='path to the directory containing the boxes of att feats') parser.add_argument('--input_label_h5', type=str, default='data/coco_label.h5', help='path to the h5file containing the preprocessed dataset') parser.add_argument('--start_from', type=str, default=None, @@ -39,6 +41,8 @@ def parse_opt(): parser.add_argument('--att_feat_size', type=int, default=2048, help='2048 for resnet, 512 for vgg') + parser.add_argument('--use_box', type=int, default=0, + help='If use box features') # Optimization: General parser.add_argument('--max_epochs', type=int, default=-1, help='number of epochs') diff --git a/train.py b/train.py index c490a47f..d105d509 100644 --- a/train.py +++ b/train.py @@ -31,7 +31,10 @@ def add_summary_value(writer, key, value, iteration): writer.add_summary(summary, iteration) def train(opt): + # Deal with feature things before anything opt.use_att = utils.if_use_att(opt.caption_model) + if opt.use_box: opt.att_feat_size = opt.att_feat_size + 5 + loader = DataLoader(opt) opt.vocab_size = loader.vocab_size opt.seq_length = loader.seq_length From 2274e3973b17a25504ac518989754ef97ee1cc35 Mon Sep 17 00:00:00 2001 From: rluo Date: Mon, 6 Nov 2017 19:02:13 -0600 Subject: [PATCH 18/42] Clean up the dataloader a little bit. --- dataloader.py | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/dataloader.py b/dataloader.py index 47ec4cd0..0e10bcc6 100644 --- a/dataloader.py +++ b/dataloader.py @@ -94,6 +94,25 @@ def cleanup(): import atexit atexit.register(cleanup) + def get_captions(self, ix, seq_per_img): + # fetch the sequence labels + ix1 = self.label_start_ix[ix] - 1 #label_start_ix starts from 1 + ix2 = self.label_end_ix[ix] - 1 + ncap = ix2 - ix1 + 1 # number of captions available for this image + assert ncap > 0, 'an image does not have any label. 
this can be handled but right now isn\'t' + + if ncap < seq_per_img: + # we need to subsample (with replacement) + seq = np.zeros([seq_per_img, self.seq_length], dtype = 'int') + for q in range(seq_per_img): + ixl = random.randint(ix1,ix2) + seq[q, :] = self.h5_label_file['labels'][ixl, :self.seq_length] + else: + ixl = random.randint(ix1, ix2 - seq_per_img + 1) + seq = self.h5_label_file['labels'][ixl: ixl + seq_per_img, :self.seq_length] + + return seq + def get_batch(self, split, batch_size=None, seq_per_img=None): batch_size = batch_size or self.batch_size seq_per_img = seq_per_img or self.seq_per_img @@ -109,31 +128,13 @@ def get_batch(self, split, batch_size=None, seq_per_img=None): gts = [] for i in range(batch_size): - import time - t_start = time.time() # fetch image tmp_fc, tmp_att,\ ix, tmp_wrapped = self._prefetch_process[split].get() fc_batch += [tmp_fc] * seq_per_img att_batch += [tmp_att] * seq_per_img - - # fetch the sequence labels - ix1 = self.label_start_ix[ix] - 1 #label_start_ix starts from 1 - ix2 = self.label_end_ix[ix] - 1 - ncap = ix2 - ix1 + 1 # number of captions available for this image - assert ncap > 0, 'an image does not have any label. this can be handled but right now isn\'t' - - if ncap < seq_per_img: - # we need to subsample (with replacement) - seq = np.zeros([seq_per_img, self.seq_length], dtype = 'int') - for q in range(seq_per_img): - ixl = random.randint(ix1,ix2) - seq[q, :] = self.h5_label_file['labels'][ixl, :self.seq_length] - else: - ixl = random.randint(ix1, ix2 - seq_per_img + 1) - seq = self.h5_label_file['labels'][ixl: ixl + seq_per_img, :self.seq_length] - label_batch[i * seq_per_img : (i + 1) * seq_per_img, 1 : self.seq_length + 1] = seq + label_batch[i * seq_per_img : (i + 1) * seq_per_img, 1 : self.seq_length + 1] = self.get_captions(ix, seq_per_img) if tmp_wrapped: wrapped = True @@ -147,14 +148,11 @@ def get_batch(self, split, batch_size=None, seq_per_img=None): info_dict['id'] = self.info['images'][ix]['id'] info_dict['file_path'] = self.info['images'][ix]['file_path'] infos.append(info_dict) - #print(i, time.time() - t_start) # generate mask - t_start = time.time() nonzeros = np.array(list(map(lambda x: (x != 0).sum()+2, label_batch))) for ix, row in enumerate(mask_batch): row[:nonzeros[ix]] = 1 - #print('mask', time.time() - t_start) data = {} data['fc_feats'] = np.stack(fc_batch) @@ -211,17 +209,17 @@ def __init__(self, split, dataloader, if_shuffle=False): # Add more in the queue def reset(self): """ - Two cases: + Two cases for this function to be triggered: 1. not hasattr(self, 'split_loader'): Resume from previous training. Create the dataset given the saved split_ix and iterator 2. wrapped: a new epoch, the split_ix and iterator have been updated in the get_minibatch_inds already. """ - # batch_size is 0, the merge is done in DataLoader class + # batch_size is 1, the merge is done in DataLoader class self.split_loader = iter(data.DataLoader(dataset=self.dataloader, batch_size=1, sampler=self.dataloader.split_ix[self.split][self.dataloader.iterators[self.split]:], shuffle=False, pin_memory=True, - num_workers=multiprocessing.cpu_count(), + num_workers=4, # 4 is usually enough collate_fn=lambda x: x[0])) def _get_next_minibatch_inds(self): From f73a2baf9cb2bdb83561859fec6e1f087c0187d9 Mon Sep 17 00:00:00 2001 From: rluo Date: Mon, 6 Nov 2017 19:17:52 -0600 Subject: [PATCH 19/42] Allow other options of optimizer. 
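
train.py no longer hard-codes Adam; utils.build_optimizer dispatches on
opt.optim ('rmsprop', 'adagrad', 'sgd', 'sgdm', 'sgdmom', 'adam') and raises
on anything else. Usage sketch, matching the train.py change below:

    # replaces the previous hard-coded optim.Adam(...) call
    optimizer = utils.build_optimizer(model.parameters(), opt)
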
--- misc/utils.py | 20 +++++++++++++++++++- train.py | 3 +-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/misc/utils.py b/misc/utils.py index cf03ed92..f9eb81a8 100644 --- a/misc/utils.py +++ b/misc/utils.py @@ -7,6 +7,7 @@ import torch.nn as nn from torch.autograd import Variable import numpy as np +import torch.optim as optim def if_use_att(caption_model): # Decide if load attention feature according to caption model @@ -71,4 +72,21 @@ def set_lr(optimizer, lr): def clip_gradient(optimizer, grad_clip): for group in optimizer.param_groups: for param in group['params']: - param.grad.data.clamp_(-grad_clip, grad_clip) \ No newline at end of file + param.grad.data.clamp_(-grad_clip, grad_clip) + +def build_optimizer(params, opt): + if opt.optim == 'rmsprop': + return optim.RMSprop(params, opt.learning_rate, opt.optim_alpha, opt.optim_epsilon, weight_decay=opt.weight_decay) + elif opt.optim == 'adagrad': + return optim.Adagrad(params, opt.learning_rate, weight_decay=opt.weight_decay) + elif opt.optim == 'sgd': + return optim.SGD(params, opt.learning_rate, weight_decay=opt.weight_decay) + elif opt.optim == 'sgdm': + return optim.SGD(params, opt.learning_rate, opt.optim_alpha, weight_decay=opt.weight_decay) + elif opt.optim == 'sgdmom': + return optim.SGD(params, opt.learning_rate, opt.optim_alpha, weight_decay=opt.weight_decay, nesterov=True) + elif opt.optim == 'adam': + return optim.Adam(params, opt.learning_rate, (opt.optim_alpha, opt.optim_beta), opt.optim_epsilon, weight_decay=opt.weight_decay) + else: + raise Exception("bad option opt.optim: {}".format(opt.optim)) + \ No newline at end of file diff --git a/train.py b/train.py index d105d509..5a61bbf2 100644 --- a/train.py +++ b/train.py @@ -79,8 +79,7 @@ def train(opt): crit = utils.LanguageModelCriterion() rl_crit = utils.RewardCriterion() - optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate, weight_decay=opt.weight_decay) - + optimizer = utils.build_optimizer(model.parameters(), opt) # Load the optimizer if vars(opt).get('start_from', None) is not None and os.path.isfile(os.path.join(opt.start_from,"optimizer.pth")): optimizer.load_state_dict(torch.load(os.path.join(opt.start_from, 'optimizer.pth'))) From 3a33158c24f438705c2b0a2618d6e512bb61c3d6 Mon Sep 17 00:00:00 2001 From: rluo Date: Mon, 6 Nov 2017 19:29:36 -0600 Subject: [PATCH 20/42] Add norm_att_feat and norm_box_feat options. 
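
Both switches apply a per-row L2 normalization to the loaded features, e.g.
for the attention features (a sketch of the np.linalg.norm call added below,
where att_feat is (N, D)):

    # scale every region vector to unit L2 norm along the channel axis
    att_feat = att_feat / np.linalg.norm(att_feat, 2, 1, keepdims=True)
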
--- dataloader.py | 6 ++++++ opts.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/dataloader.py b/dataloader.py index 0e10bcc6..18ed9025 100644 --- a/dataloader.py +++ b/dataloader.py @@ -37,6 +37,8 @@ def __init__(self, opt): # feature related options self.use_att = getattr(opt, 'use_att', True) self.use_box = getattr(opt, 'use_box', 0) + self.norm_att_feat = getattr(opt, 'norm_att_feat', 0) + self.norm_box_feat = getattr(opt, 'norm_box_feat', 0) # load the json file which contains additional information about the dataset print('DataLoader loading json file: ', opt.input_json) @@ -180,12 +182,16 @@ def __getitem__(self, index): ix = index #self.split_ix[index] if self.use_att: att_feat = np.load(os.path.join(self.input_att_dir, str(self.info['images'][ix]['id']) + '.npz'))['feat'] + if self.norm_att_feat: + att_feat = att_feat / np.linalg.norm(att_feat, 2, 1, keepdims=True) if self.use_box: box_feat = np.load(os.path.join(self.input_box_dir, str(self.info['images'][ix]['id']) + '.npy')) # devided by image width and height x1,y1,x2,y2 = np.hsplit(box_feat, 4) h,w = self.info['images'][ix]['height'], self.info['images'][ix]['width'] box_feat = np.hstack((x1/w, y1/h, x2/w, y2/h, (x2-x1)*(y2-y1)/(w*h))) # question? x2-x1+1?? + if self.norm_box_feat: + box_feat = box_feat / np.linalg.norm(box_feat, 2, 1, keepdims=True) att_feat = np.hstack([att_feat, box_feat]) else: att_feat = np.zeros((1,1,1)) diff --git a/opts.py b/opts.py index f4cc80e2..9fb4c2a2 100644 --- a/opts.py +++ b/opts.py @@ -41,8 +41,14 @@ def parse_opt(): parser.add_argument('--att_feat_size', type=int, default=2048, help='2048 for resnet, 512 for vgg') + # feature manipulation + parser.add_argument('--norm_att_feat', type=int, default=0, + help='If normalize attention features') parser.add_argument('--use_box', type=int, default=0, help='If use box features') + parser.add_argument('--norm_box_feat', type=int, default=0, + help='If use box, do we normalize box feature') + # Optimization: General parser.add_argument('--max_epochs', type=int, default=-1, help='number of epochs') From 82ec707d8acbe54355a6a267c30aa9adf8a91201 Mon Sep 17 00:00:00 2001 From: rluo Date: Fri, 10 Nov 2017 02:22:43 -0600 Subject: [PATCH 21/42] Fix a fatal bug: not using att_masks when calculating greedy decoding baseline. 
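
The greedy decoding baseline in self-critical training must see the same
inputs as the sampled rollouts; without att_masks, padded regions leaked into
the baseline captions and skewed the reward (score of the sample minus score
of the greedy result). The fix passes the masks and, roughly, wraps the
greedy pass like this:

    model.eval()   # disable dropout for the deterministic baseline
    greedy_res, _ = model(Variable(fc_feats.data, volatile=True),
                          Variable(att_feats.data, volatile=True),
                          att_masks=Variable(att_masks.data, volatile=True),
                          mode='sample')
    model.train()
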
--- misc/rewards.py | 8 ++++++-- train.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/misc/rewards.py b/misc/rewards.py index 000f80ab..0a29267e 100644 --- a/misc/rewards.py +++ b/misc/rewards.py @@ -33,12 +33,16 @@ def array_to_str(arr): break return out.strip() -def get_self_critical_reward(model, fc_feats, att_feats, data, gen_result, opt): +def get_self_critical_reward(model, fc_feats, att_feats, att_masks, data, gen_result, opt): batch_size = gen_result.size(0)# batch_size = sample_size * seq_per_img seq_per_img = batch_size // len(data['gts']) # get greedy decoding baseline - greedy_res, _ = model(Variable(fc_feats.data, volatile=True), Variable(att_feats.data, volatile=True), mode='sample') + model.eval() + greedy_res, _ = model(Variable(fc_feats.data, volatile=True), + Variable(att_feats.data, volatile=True), + att_masks=Variable(att_masks.data, volatile=True), mode='sample') + model.train() res = OrderedDict() diff --git a/train.py b/train.py index 5a61bbf2..145a4cb1 100644 --- a/train.py +++ b/train.py @@ -126,7 +126,7 @@ def train(opt): loss = crit(dp_model(fc_feats, att_feats, labels, att_masks), labels[:,1:], masks[:,1:]) else: gen_result, sample_logprobs = dp_model(fc_feats, att_feats, att_masks, opt={'sample_max':0}, mode='sample') - reward = get_self_critical_reward(dp_model, fc_feats, att_feats, data, gen_result, opt) + reward = get_self_critical_reward(dp_model, fc_feats, att_feats, att_masks, data, gen_result, opt) loss = rl_crit(sample_logprobs, gen_result.data, Variable(torch.from_numpy(reward).float().cuda(), requires_grad=False)) loss.backward() From d1f352a71ba7056a4dc3adb6698e0e792d78b0b2 Mon Sep 17 00:00:00 2001 From: rluo Date: Fri, 10 Nov 2017 22:25:08 -0600 Subject: [PATCH 22/42] 1. Sort the feature according to number of attention features. 2. Use pack_padded_sequence to do att_embed. (The first step is for this.) 3. Clean some useless view change. 
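
pack_padded_sequence only accepts batches sorted by decreasing length, hence
the new sort of each minibatch by attention-feature count (and, when boxes are
used, the sort of an image's regions by box area in __getitem__). The
pack_wrapper imported into AttEnsemble below is assumed to look roughly like
this sketch (pack_padded_sequence, pad_packed_sequence and PackedSequence from
torch.nn.utils.rnn), applying a module to the unpadded entries only:

    def pack_wrapper(module, att_feats, att_masks):
        if att_masks is not None:
            # lengths come from the per-image attention masks
            packed = pack_padded_sequence(att_feats,
                                          list(att_masks.data.long().sum(1)),
                                          batch_first=True)
            # run the module on the packed data, then re-pad
            return pad_packed_sequence(PackedSequence(module(packed[0]),
                                                      packed[1]),
                                       batch_first=True)[0]
        else:
            return module(att_feats)
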
--- dataloader.py | 35 +++++++++++++++++++++-------------- eval.py | 3 +++ eval_ensemble.py | 7 +++++++ models/AttEnsemble.py | 19 +++++++------------ models/AttModel.py | 26 ++++++++++++++------------ 5 files changed, 52 insertions(+), 38 deletions(-) diff --git a/dataloader.py b/dataloader.py index 18ed9025..dfe8d1bb 100644 --- a/dataloader.py +++ b/dataloader.py @@ -133,8 +133,8 @@ def get_batch(self, split, batch_size=None, seq_per_img=None): # fetch image tmp_fc, tmp_att,\ ix, tmp_wrapped = self._prefetch_process[split].get() - fc_batch += [tmp_fc] * seq_per_img - att_batch += [tmp_att] * seq_per_img + fc_batch.append(tmp_fc) + att_batch.append(tmp_att) label_batch[i * seq_per_img : (i + 1) * seq_per_img, 1 : self.seq_length + 1] = self.get_captions(ix, seq_per_img) @@ -151,23 +151,28 @@ def get_batch(self, split, batch_size=None, seq_per_img=None): info_dict['file_path'] = self.info['images'][ix]['file_path'] infos.append(info_dict) - # generate mask - nonzeros = np.array(list(map(lambda x: (x != 0).sum()+2, label_batch))) - for ix, row in enumerate(mask_batch): - row[:nonzeros[ix]] = 1 - + #sort by att_feat length + fc_batch, att_batch, label_batch, gts, infos = \ + zip(*sorted(zip(fc_batch, att_batch, np.vsplit(label_batch, batch_size), gts, infos), key=lambda x: len(x[1]), reverse=True)) data = {} - data['fc_feats'] = np.stack(fc_batch) + data['fc_feats'] = np.stack(reduce(lambda x,y:x+y, [[_]*seq_per_img for _ in fc_batch])) + # merge att_feats max_att_len = max([_.shape[0] for _ in att_batch]) - data['att_feats'] = np.zeros([len(att_batch), max_att_len, att_batch[0].shape[1]], dtype = 'float32') + data['att_feats'] = np.zeros([len(att_batch)*seq_per_img, max_att_len, att_batch[0].shape[1]], dtype = 'float32') for i in range(len(att_batch)): - data['att_feats'][i][:att_batch[i].shape[0]] = att_batch[i] + data['att_feats'][i*seq_per_img:(i+1)*seq_per_img, :att_batch[i].shape[0]] = att_batch[i] data['att_masks'] = np.zeros(data['att_feats'].shape[:2], dtype='float32') for i in range(len(att_batch)): - data['att_masks'][i][:att_batch[i].shape[0]] = 1 - data['labels'] = label_batch - data['gts'] = gts - data['masks'] = mask_batch + data['att_masks'][i*seq_per_img:(i+1)*seq_per_img, :att_batch[i].shape[0]] = 1 + + data['labels'] = np.vstack(label_batch) + # generate mask + nonzeros = np.array(list(map(lambda x: (x != 0).sum()+2, data['labels']))) + for ix, row in enumerate(mask_batch): + row[:nonzeros[ix]] = 1 + data['masks'] = mask_batch + + data['gts'] = gts # all ground truth captions of each images data['bounds'] = {'it_pos_now': self.iterators[split], 'it_max': len(self.split_ix[split]), 'wrapped': wrapped} data['infos'] = infos @@ -193,6 +198,8 @@ def __getitem__(self, index): if self.norm_box_feat: box_feat = box_feat / np.linalg.norm(box_feat, 2, 1, keepdims=True) att_feat = np.hstack([att_feat, box_feat]) + # sort the features by the size of boxes + att_feat = np.stack(sorted(att_feat, key=lambda x:x[-1], reverse=True)) else: att_feat = np.zeros((1,1,1)) return (np.load(os.path.join(self.input_fc_dir, str(self.info['images'][ix]['id']) + '.npy')), diff --git a/eval.py b/eval.py index 045d3355..59c99911 100644 --- a/eval.py +++ b/eval.py @@ -64,6 +64,8 @@ help='path to the h5file containing the preprocessed dataset') parser.add_argument('--input_att_dir', type=str, default='', help='path to the h5file containing the preprocessed dataset') +parser.add_argument('--input_box_dir', type=str, default='', + help='path to the h5file containing the preprocessed dataset') 
parser.add_argument('--input_label_h5', type=str, default='', help='path to the h5file containing the preprocessed dataset') parser.add_argument('--input_json', type=str, default='', @@ -90,6 +92,7 @@ if len(opt.input_fc_dir) == 0: opt.input_fc_dir = infos['opt'].input_fc_dir opt.input_att_dir = infos['opt'].input_att_dir + opt.input_att_dir = infos['opt'].input_box_dir opt.input_label_h5 = infos['opt'].input_label_h5 if len(opt.input_json) == 0: opt.input_json = infos['opt'].input_json diff --git a/eval_ensemble.py b/eval_ensemble.py index 28d84b50..34ac61ad 100644 --- a/eval_ensemble.py +++ b/eval_ensemble.py @@ -62,6 +62,8 @@ help='path to the h5file containing the preprocessed dataset') parser.add_argument('--input_att_dir', type=str, default='', help='path to the h5file containing the preprocessed dataset') +parser.add_argument('--input_box_dir', type=str, default='', + help='path to the h5file containing the preprocessed dataset') parser.add_argument('--input_label_h5', type=str, default='', help='path to the h5file containing the preprocessed dataset') parser.add_argument('--input_json', type=str, default='', @@ -92,6 +94,7 @@ if len(opt.input_fc_dir) == 0: opt.input_fc_dir = infos['opt'].input_fc_dir opt.input_att_dir = infos['opt'].input_att_dir + opt.input_box_dir = infos['opt'].input_box_dir opt.input_label_h5 = infos['opt'].input_label_h5 if len(opt.input_json) == 0: opt.input_json = infos['opt'].input_json @@ -101,6 +104,10 @@ opt.id = infos['opt'].id opt.seq_per_img = infos['opt'].seq_per_img +opt.use_box = max([getattr(infos['opt'], 'use_box', 0) for infos in model_infos]) +assert max([getattr(infos['opt'], 'norm_att_feat', 0) for infos in model_infos]) == max([getattr(infos['opt'], 'norm_att_feat', 0) for infos in model_infos]), 'Not support different norm_att_feat' +assert max([getattr(infos['opt'], 'norm_box_feat', 0) for infos in model_infos]) == max([getattr(infos['opt'], 'norm_box_feat', 0) for infos in model_infos]), 'Not support different norm_box_feat' + vocab = infos['vocab'] # ix -> word mapping # Setup the model diff --git a/models/AttEnsemble.py b/models/AttEnsemble.py index 84753c87..c5d951fe 100644 --- a/models/AttEnsemble.py +++ b/models/AttEnsemble.py @@ -23,6 +23,7 @@ import misc.utils as utils from .CaptionModel import CaptionModel +from .AttModel import pack_wrapper class AttEnsemble(CaptionModel): def __init__(self, models): @@ -60,12 +61,10 @@ def _forward(self, fc_feats, att_feats, seq, att_masks=None): # embed fc and att feats fc_feats = [m.fc_embed(fc_feats) for m in self.models] - _att_feats = [m.att_embed(att_feats.view(-1, m.att_feat_size)) for m in self.models] - att_feats = [_att_feats[i].view(*(att_feats.size()[:-1] + (m.rnn_size,))) for i,m in enumerate(self.models)] + att_feats = [pack_wrapper(m.att_embed, att_feats[...,:m.att_feat_size], att_masks) for m in self.models] # Project the attention feats first to reduce memory and computation comsumptions. 
- p_att_feats = [m.ctx2att(att_feats[i].view(-1, m.rnn_size)) for i, m in enumerate(self.models)] - p_att_feats = [p_att_feats[i].view(*(att_feats[i].size()[:-1] + (m.att_hid_size,))) for i,m in enumerate(self.models)] + p_att_feats = [m.ctx2att(att_feats[i]) for i,m in enumerate(self.models)] for i in range(seq.size(1) - 1): if self.training and i >= 1 and self.ss_prob > 0.0: # otherwiste no need to sample @@ -104,12 +103,10 @@ def _sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}): # embed fc and att feats fc_feats = [m.fc_embed(fc_feats) for m in self.models] - _att_feats = [m.att_embed(att_feats.view(-1, m.att_feat_size)) for m in self.models] - att_feats = [_att_feats[i].view(*(att_feats.size()[:-1] + (m.rnn_size,))) for i,m in enumerate(self.models)] + att_feats = [pack_wrapper(m.att_embed, att_feats[...,:m.att_feat_size], att_masks) for m in self.models] # Project the attention feats first to reduce memory and computation comsumptions. - p_att_feats = [m.ctx2att(att_feats[i].view(-1, m.rnn_size)) for i, m in enumerate(self.models)] - p_att_feats = [p_att_feats[i].view(*(att_feats[i].size()[:-1] + (m.att_hid_size,))) for i,m in enumerate(self.models)] + p_att_feats = [m.ctx2att(att_feats[i]) for i,m in enumerate(self.models)] assert beam_size <= self.vocab_size + 1, 'lets assume this for now, otherwise this corner case causes a few headaches down the road. can be dealt with in future if needed' seq = torch.LongTensor(self.seq_length, batch_size).zero_() @@ -151,12 +148,10 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): # embed fc and att feats fc_feats = [m.fc_embed(fc_feats) for m in self.models] - _att_feats = [m.att_embed(att_feats.view(-1, m.att_feat_size)) for m in self.models] - att_feats = [_att_feats[i].view(*(att_feats.size()[:-1] + (m.rnn_size,))) for i,m in enumerate(self.models)] + att_feats = [pack_wrapper(m.att_embed, att_feats[...,:m.att_feat_size], att_masks) for m in self.models] # Project the attention feats first to reduce memory and computation comsumptions. 
- p_att_feats = [m.ctx2att(att_feats[i].view(-1, m.rnn_size)) for i, m in enumerate(self.models)] - p_att_feats = [p_att_feats[i].view(*(att_feats[i].size()[:-1] + (m.att_hid_size,))) for i,m in enumerate(self.models)] + p_att_feats = [m.ctx2att(att_feats[i]) for i,m in enumerate(self.models)] # seq = [] # seqLogprobs = [] diff --git a/models/AttModel.py b/models/AttModel.py index 7ef61299..046404eb 100644 --- a/models/AttModel.py +++ b/models/AttModel.py @@ -21,9 +21,17 @@ import torch.nn.functional as F from torch.autograd import * import misc.utils as utils +from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence from .CaptionModel import CaptionModel +def pack_wrapper(module, att_feats, att_masks): + if att_masks is not None: + packed = pack_padded_sequence(att_feats, list(att_masks.data.long().sum(1)), batch_first=True) + return pad_packed_sequence(PackedSequence(module(packed[0]), packed[1]), batch_first=True)[0] + else: + return module(att_feats) + class AttModel(CaptionModel): def __init__(self, opt): super(AttModel, self).__init__() @@ -66,12 +74,10 @@ def _forward(self, fc_feats, att_feats, seq, att_masks=None): # embed fc and att feats fc_feats = self.fc_embed(fc_feats) - _att_feats = self.att_embed(att_feats.view(-1, self.att_feat_size)) - att_feats = _att_feats.view(*(att_feats.size()[:-1] + (self.rnn_size,))) + att_feats = pack_wrapper(self.att_embed, att_feats, att_masks) # Project the attention feats first to reduce memory and computation comsumptions. - p_att_feats = self.ctx2att(att_feats.view(-1, self.rnn_size)) - p_att_feats = p_att_feats.view(*(att_feats.size()[:-1] + (self.att_hid_size,))) + p_att_feats = self.ctx2att(att_feats) for i in range(seq.size(1) - 1): if self.training and i >= 1 and self.ss_prob > 0.0: # otherwiste no need to sample @@ -119,12 +125,10 @@ def _sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}): # embed fc and att feats fc_feats = self.fc_embed(fc_feats) - _att_feats = self.att_embed(att_feats.view(-1, self.att_feat_size)) - att_feats = _att_feats.view(*(att_feats.size()[:-1] + (self.rnn_size,))) + att_feats = pack_wrapper(self.att_embed, att_feats, att_masks) # Project the attention feats first to reduce memory and computation comsumptions. - p_att_feats = self.ctx2att(att_feats.view(-1, self.rnn_size)) - p_att_feats = p_att_feats.view(*(att_feats.size()[:-1] + (self.att_hid_size,))) + p_att_feats = self.ctx2att(att_feats) assert beam_size <= self.vocab_size + 1, 'lets assume this for now, otherwise this corner case causes a few headaches down the road. can be dealt with in future if needed' seq = torch.LongTensor(self.seq_length, batch_size).zero_() @@ -166,12 +170,10 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): # embed fc and att feats fc_feats = self.fc_embed(fc_feats) - _att_feats = self.att_embed(att_feats.view(-1, self.att_feat_size)) - att_feats = _att_feats.view(*(att_feats.size()[:-1] + (self.rnn_size,))) + att_feats = pack_wrapper(self.att_embed, att_feats, att_masks) # Project the attention feats first to reduce memory and computation comsumptions. - p_att_feats = self.ctx2att(att_feats.view(-1, self.rnn_size)) - p_att_feats = p_att_feats.view(*(att_feats.size()[:-1] + (self.att_hid_size,))) + p_att_feats = self.ctx2att(att_feats) # seq = [] # seqLogprobs = [] From e7cb8e2da07f62549cc6dbb1241c5dbe6f5622dd Mon Sep 17 00:00:00 2001 From: rluo Date: Fri, 10 Nov 2017 22:25:29 -0600 Subject: [PATCH 23/42] Add batch normalization layer in att_embed. 
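Because att_embed is applied through pack_wrapper, the BatchNorm1d added here only ever sees
the valid (unpadded) feature rows, so padding cannot pollute the batch statistics. A
standalone sketch of the embed stack this option builds (sizes are illustrative):

    import torch.nn as nn

    use_bn, att_feat_size, rnn_size, drop_prob_lm = 1, 2048, 512, 0.5
    att_embed = nn.Sequential(*(
        ((nn.BatchNorm1d(att_feat_size),) if use_bn else ()) +
        (nn.Linear(att_feat_size, rnn_size),
         nn.ReLU(),
         nn.Dropout(drop_prob_lm))))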
--- models/AttModel.py | 8 ++++++-- opts.py | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/models/AttModel.py b/models/AttModel.py index 046404eb..293f822a 100644 --- a/models/AttModel.py +++ b/models/AttModel.py @@ -46,6 +46,8 @@ def __init__(self, opt): self.att_feat_size = opt.att_feat_size self.att_hid_size = opt.att_hid_size + self.use_bn = getattr(opt, 'use_bn', 0) + self.ss_prob = 0.0 # Schedule sampling probability self.embed = nn.Sequential(nn.Embedding(self.vocab_size + 1, self.input_encoding_size), @@ -54,9 +56,11 @@ def __init__(self, opt): self.fc_embed = nn.Sequential(nn.Linear(self.fc_feat_size, self.rnn_size), nn.ReLU(), nn.Dropout(self.drop_prob_lm)) - self.att_embed = nn.Sequential(nn.Linear(self.att_feat_size, self.rnn_size), + self.att_embed = nn.Sequential(*( + ((nn.BatchNorm1d(self.att_feat_size),) if self.use_bn else ())+ + (nn.Linear(self.att_feat_size, self.rnn_size), nn.ReLU(), - nn.Dropout(self.drop_prob_lm)) + nn.Dropout(self.drop_prob_lm)))) self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1) self.ctx2att = nn.Linear(self.rnn_size, self.att_hid_size) diff --git a/opts.py b/opts.py index 9fb4c2a2..ce865df7 100644 --- a/opts.py +++ b/opts.py @@ -41,6 +41,10 @@ def parse_opt(): parser.add_argument('--att_feat_size', type=int, default=2048, help='2048 for resnet, 512 for vgg') + + parser.add_argument('--use_bn', type=int, default=0, + help='If 1, then do batch_normalization first in att_embed') + # feature manipulation parser.add_argument('--norm_att_feat', type=int, default=0, help='If normalize attention features') From 742c37bb746fc2dc65545929048a08bd6168696f Mon Sep 17 00:00:00 2001 From: rluo Date: Mon, 20 Nov 2017 02:52:23 -0600 Subject: [PATCH 24/42] Allow new ways of computing (using pack sequence) capable of using dataparallel. --- models/AttModel.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/models/AttModel.py b/models/AttModel.py index 293f822a..d667f9ce 100644 --- a/models/AttModel.py +++ b/models/AttModel.py @@ -69,7 +69,17 @@ def init_hidden(self, bsz): return (Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_()), Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_())) + def clip_att(self, att_feats, att_masks): + # Clip the length of att_masks and att_feats to the maximum length + if att_masks is not None: + max_len = att_masks.data.long().sum(1).max() + att_feats = att_feats[:, :max_len].contiguous() + att_masks = att_masks[:, :max_len].contiguous() + return att_feats, att_masks + def _forward(self, fc_feats, att_feats, seq, att_masks=None): + att_feats, att_masks = self.clip_att(att_feats, att_masks) + batch_size = fc_feats.size(0) state = self.init_hidden(batch_size) @@ -162,6 +172,8 @@ def _sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}): return Variable(seq.transpose(0, 1)), Variable(seqLogprobs.transpose(0, 1)) def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): + att_feats, att_masks = self.clip_att(att_feats, att_masks) + sample_max = opt.get('sample_max', 1) beam_size = opt.get('beam_size', 1) temperature = opt.get('temperature', 1.0) From 997c908a6cd71bfa01032afe1b4e92e055865feb Mon Sep 17 00:00:00 2001 From: rluo Date: Mon, 20 Nov 2017 02:52:40 -0600 Subject: [PATCH 25/42] Add logit layers option. 
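The reduce-based construction in the hunk below is compact but dense; unrolled, logit_layers = n
builds the following head (an equivalent sketch, not code from this patch):

    import torch.nn as nn

    def build_logit(rnn_size, vocab_size, n):
        layers = []
        for _ in range(n - 1):  # n - 1 hidden blocks
            layers += [nn.Linear(rnn_size, rnn_size), nn.ReLU(), nn.Dropout(0.5)]
        layers.append(nn.Linear(rnn_size, vocab_size + 1))  # final projection over the vocabulary
        return nn.Sequential(*layers)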
(haven't rigorously tested whether it works)
---
 models/AttModel.py | 11 +++++++++--
 opts.py | 4 +++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/models/AttModel.py b/models/AttModel.py
index d667f9ce..710ade67 100644
--- a/models/AttModel.py
+++ b/models/AttModel.py
@@ -60,8 +60,15 @@ def __init__(self, opt):
             ((nn.BatchNorm1d(self.att_feat_size),) if self.use_bn else ())+
             (nn.Linear(self.att_feat_size, self.rnn_size),
             nn.ReLU(),
-            nn.Dropout(self.drop_prob_lm))))
-        self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1)
+            nn.Dropout(self.drop_prob_lm))+
+            ((nn.BatchNorm1d(self.rnn_size),) if self.use_bn==2 else ())))
+
+        self.logit_layers = getattr(opt, 'logit_layers', 1)
+        if self.logit_layers == 1:
+            self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1)
+        else:
+            self.logit = [[nn.Linear(self.rnn_size, self.rnn_size), nn.ReLU(), nn.Dropout(0.5)] for _ in range(opt.logit_layers - 1)]
+            self.logit = nn.Sequential(*(reduce(lambda x,y:x+y, self.logit) + [nn.Linear(self.rnn_size, self.vocab_size + 1)]))
         self.ctx2att = nn.Linear(self.rnn_size, self.att_hid_size)
 
     def init_hidden(self, bsz):
diff --git a/opts.py b/opts.py
index ce865df7..6d519fab 100644
--- a/opts.py
+++ b/opts.py
@@ -40,10 +40,12 @@ def parse_opt():
                     help='2048 for resnet, 4096 for vgg')
     parser.add_argument('--att_feat_size', type=int, default=2048,
                     help='2048 for resnet, 512 for vgg')
+    parser.add_argument('--logit_layers', type=int, default=1,
+                    help='number of layers in the output (logit) head')
 
 
     parser.add_argument('--use_bn', type=int, default=0,
-                    help='If 1, then do batch_normalization first in att_embed')
+                    help='If 1, then do batch_normalization first in att_embed, if 2 then do bn both in the beginning and the end of att_embed')
 
     # feature manipulation
     parser.add_argument('--norm_att_feat', type=int, default=0,

From df7269d89d90bbe8597e5ccda59a08b06ec49623 Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Thu, 7 Dec 2017 01:12:23 -0600
Subject: [PATCH 26/42] Fix a bug in ensemble sample.

---
 models/AttEnsemble.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/models/AttEnsemble.py b/models/AttEnsemble.py
index c5d951fe..3fe51a74 100644
--- a/models/AttEnsemble.py
+++ b/models/AttEnsemble.py
@@ -155,8 +155,8 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}):
 
         # seq = []
         # seqLogprobs = []
-        seq = Variable(fc_feats.data.new(batch_size, self.seq_length).long().zero_())
-        seqLogprobs = Variable(fc_feats.data.new(batch_size, self.seq_length).zero_())
+        seq = Variable(fc_feats[0].data.new(batch_size, self.seq_length).long().zero_())
+        seqLogprobs = Variable(fc_feats[0].data.new(batch_size, self.seq_length).zero_())
         for t in range(self.seq_length + 1):
             if t == 0: # input
                 it = fc_feats[0].data.new(batch_size).long().zero_()

From e533d49d18ce64ce3eaae642981dfd65dbebf597 Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Thu, 7 Dec 2017 11:48:41 -0600
Subject: [PATCH 27/42] Add max ppl option (beam search sorted by perplexity instead of logprob) (it doesn't seem to change much)

---
 eval.py | 2 ++
 eval_ensemble.py | 2 ++
 models/AttEnsemble.py | 3 +++
 models/CaptionModel.py | 3 +++
 4 files changed, 10 insertions(+)

diff --git a/eval.py b/eval.py
index 59c99911..033a32af 100644
--- a/eval.py
+++ b/eval.py
@@ -44,6 +44,8 @@
 # Sampling options
 parser.add_argument('--sample_max', type=int, default=1,
                 help='1 = sample argmax words.
0 = sample from distributions.') +parser.add_argument('--max_ppl', type=int, default=0, + help='beam search by max perplexity or max probability.') parser.add_argument('--beam_size', type=int, default=2, help='used when sample_max = 1, indicates number of beams in beam search. Usually 2 or 3 works well. More is not better. Set this to 1 for faster runtime but a bit worse performance.') parser.add_argument('--group_size', type=int, default=1, diff --git a/eval_ensemble.py b/eval_ensemble.py index 34ac61ad..412f0b11 100644 --- a/eval_ensemble.py +++ b/eval_ensemble.py @@ -42,6 +42,8 @@ # Sampling options parser.add_argument('--sample_max', type=int, default=1, help='1 = sample argmax words. 0 = sample from distributions.') +parser.add_argument('--max_ppl', type=int, default=0, + help='beam search by max perplexity or max probability.') parser.add_argument('--beam_size', type=int, default=2, help='used when sample_max = 1, indicates number of beams in beam search. Usually 2 or 3 works well. More is not better. Set this to 1 for faster runtime but a bit worse performance.') parser.add_argument('--group_size', type=int, default=1, diff --git a/models/AttEnsemble.py b/models/AttEnsemble.py index 3fe51a74..6d0ebb06 100644 --- a/models/AttEnsemble.py +++ b/models/AttEnsemble.py @@ -274,6 +274,7 @@ def beam_step(logprobsf, unaug_logprobsf, beam_size, t, beam_seq, beam_seq_logpr group_size = opt.get('group_size', 1) diversity_lambda = opt.get('diversity_lambda', 0.5) decoding_constraint = opt.get('decoding_constraint', 0) + max_ppl = opt.get('max_ppl', 0) bdash = beam_size // group_size # beam per group # INITIALIZATIONS @@ -330,6 +331,8 @@ def beam_step(logprobsf, unaug_logprobsf, beam_size, t, beam_seq, beam_seq_logpr 'unaug_p': beam_seq_logprobs_table[divm][:, vix].sum(), 'p': beam_logprobs_sum_table[divm][vix] } + if max_ppl: + final_beam['p'] = final_beam['p'] / (t-divm+1) done_beams_table[divm].append(final_beam) # don't continue beams from finished sequences beam_logprobs_sum_table[divm][vix] = -1000 diff --git a/models/CaptionModel.py b/models/CaptionModel.py index 1a54ec36..f9c10e8a 100644 --- a/models/CaptionModel.py +++ b/models/CaptionModel.py @@ -102,6 +102,7 @@ def beam_step(logprobsf, unaug_logprobsf, beam_size, t, beam_seq, beam_seq_logpr group_size = opt.get('group_size', 1) diversity_lambda = opt.get('diversity_lambda', 0.5) decoding_constraint = opt.get('decoding_constraint', 0) + max_ppl = opt.get('max_ppl', 0) bdash = beam_size // group_size # beam per group # INITIALIZATIONS @@ -159,6 +160,8 @@ def beam_step(logprobsf, unaug_logprobsf, beam_size, t, beam_seq, beam_seq_logpr 'unaug_p': beam_seq_logprobs_table[divm][:, vix].sum(), 'p': beam_logprobs_sum_table[divm][vix] } + if max_ppl: + final_beam['p'] = final_beam['p'] / (t-divm+1) done_beams_table[divm].append(final_beam) # don't continue beams from finished sequences beam_logprobs_sum_table[divm][vix] = -1000 From ec6d4ec35fac268e96e1daca72af8c3914f3dde0 Mon Sep 17 00:00:00 2001 From: Ruotian Luo Date: Fri, 19 Jan 2018 20:31:04 -0600 Subject: [PATCH 28/42] Move set_lr to the right place in train.py --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 145a4cb1..18aecfc6 100644 --- a/train.py +++ b/train.py @@ -91,9 +91,9 @@ def train(opt): frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every decay_factor = opt.learning_rate_decay_rate ** frac opt.current_lr = opt.learning_rate * decay_factor - utils.set_lr(optimizer, opt.current_lr) # set the 
decayed rate
         else:
             opt.current_lr = opt.learning_rate
+        utils.set_lr(optimizer, opt.current_lr) # set the decayed rate
         # Assign the scheduled sampling prob
         if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
             frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every

From d8164b08ef0ec8438c98b0bf34077732cd74919f Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Wed, 11 Apr 2018 14:55:30 -0500
Subject: [PATCH 29/42] Update FC Model to the compatible version (previously FC Model was deprecated and not adapted to the new structure.)

---
 models/FCModel.py | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/models/FCModel.py b/models/FCModel.py
index ff6321a5..3275824b 100644
--- a/models/FCModel.py
+++ b/models/FCModel.py
@@ -37,9 +37,7 @@ def forward(self, xt, state):
         next_c = forget_gate * state[1][-1] + in_gate * in_transform
         next_h = out_gate * F.tanh(next_c)
 
-        next_h = self.dropout(next_h)
-
-        output = next_h
+        output = self.dropout(next_h)
         state = (next_h.unsqueeze(0), next_c.unsqueeze(0))
         return output, state
 
@@ -78,7 +76,7 @@ def init_hidden(self, bsz):
         else:
             return Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_())
 
-    def forward(self, fc_feats, att_feats, seq):
+    def _forward(self, fc_feats, att_feats, seq, att_masks=None):
         batch_size = fc_feats.size(0)
         state = self.init_hidden(batch_size)
         outputs = []
@@ -122,7 +120,7 @@ def get_logprobs_state(self, it, state):
 
         return logprobs, state
 
-    def sample_beam(self, fc_feats, att_feats, opt={}):
+    def _sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}):
         beam_size = opt.get('beam_size', 10)
         batch_size = fc_feats.size(0)
 
@@ -148,9 +146,9 @@ def sample_beam(self, fc_feats, att_feats, opt={}):
             seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score
             seqLogprobs[:, k] = self.done_beams[k][0]['logps']
         # return the samples and their log likelihoods
-        return seq.transpose(0, 1), seqLogprobs.transpose(0, 1)
+        return Variable(seq.transpose(0, 1)), Variable(seqLogprobs.transpose(0, 1))
 
-    def sample(self, fc_feats, att_feats, opt={}):
+    def _sample(self, fc_feats, att_feats, att_masks=None, opt={}):
         sample_max = opt.get('sample_max', 1)
         beam_size = opt.get('beam_size', 1)
         temperature = opt.get('temperature', 1.0)
@@ -159,8 +157,8 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}):
         batch_size = fc_feats.size(0)
         state = self.init_hidden(batch_size)
 
-        seq = []
-        seqLogprobs = []
+        seq = Variable(fc_feats.data.new(batch_size, self.seq_length).long().zero_())
+        seqLogprobs = Variable(fc_feats.data.new(batch_size, self.seq_length).zero_())
         for t in range(self.seq_length + 2):
             if t == 0:
                 xt = self.img_embed(fc_feats)
@@ -191,10 +189,10 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}):
             if unfinished.sum() == 0:
                 break
             it = it * unfinished.type_as(it)
-            seq.append(it) #seq[t] the input of t+2 time step
-            seqLogprobs.append(sampleLogprobs.view(-1))
+            seq[:,t-2] = it #seq[t] the input of t+2 time step
+            seqLogprobs[:,t-2] = sampleLogprobs.view(-1)
 
             output, state = self.core(xt, state)
             logprobs = F.log_softmax(self.logit(output))
 
-        return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1)
+        return seq, seqLogprobs

From 5f4f3d1aa7f3e5ac7bfd1f950454bcf7ad672f8e Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Sat, 14 Apr 2018 15:40:20 -0500
Subject: [PATCH 30/42] Simplify AttModel.
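The fc/att embedding and the ctx2att projection, previously duplicated across _forward,
_sample_beam and _sample, move into a single _prepare_feature, and every decoding step now
goes through get_logprobs_state. The per-step decode contract then reads (a sketch using this
file's names, with the Variable wrapping of this era elided):

    import torch

    fc, att, p_att = model._prepare_feature(fc_feats, att_feats, att_masks)
    state = model.init_hidden(batch_size)
    it = fc_feats.data.new(batch_size).long().zero_()  # start token is index 0
    for t in range(model.seq_length):
        logprobs, state = model.get_logprobs_state(it, fc, att, p_att, att_masks, state)
        _, it = torch.max(logprobs.data, 1)            # greedy step; sampling works the same way
        it = it.view(-1).long()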
--- models/AttModel.py | 56 ++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/models/AttModel.py b/models/AttModel.py index 710ade67..45a8603a 100644 --- a/models/AttModel.py +++ b/models/AttModel.py @@ -84,6 +84,17 @@ def clip_att(self, att_feats, att_masks): att_masks = att_masks[:, :max_len].contiguous() return att_feats, att_masks + def _prepare_feature(self, fc_feats, att_feats, att_masks): + + # embed fc and att feats + fc_feats = self.fc_embed(fc_feats) + att_feats = pack_wrapper(self.att_embed, att_feats, att_masks) + + # Project the attention feats first to reduce memory and computation comsumptions. + p_att_feats = self.ctx2att(att_feats) + + return fc_feats, att_feats, p_att_feats + def _forward(self, fc_feats, att_feats, seq, att_masks=None): att_feats, att_masks = self.clip_att(att_feats, att_masks) @@ -93,12 +104,7 @@ def _forward(self, fc_feats, att_feats, seq, att_masks=None): # outputs = [] outputs = Variable(fc_feats.data.new(batch_size, seq.size(1) - 1, self.vocab_size+1).zero_()) - # embed fc and att feats - fc_feats = self.fc_embed(fc_feats) - att_feats = pack_wrapper(self.att_embed, att_feats, att_masks) - - # Project the attention feats first to reduce memory and computation comsumptions. - p_att_feats = self.ctx2att(att_feats) + fc_feats, att_feats, p_att_feats = self._prepare_feature(fc_feats, att_feats, att_masks) for i in range(seq.size(1) - 1): if self.training and i >= 1 and self.ss_prob > 0.0: # otherwiste no need to sample @@ -121,21 +127,18 @@ def _forward(self, fc_feats, att_feats, seq, att_masks=None): if i >= 1 and seq[:, i].data.sum() == 0: break - xt = self.embed(it) - - output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state, att_masks) - output = F.log_softmax(self.logit(output)) + output, state = self.get_logprobs_state(it, fc_feats, att_feats, p_att_feats, att_masks, state) outputs[:, i] = output # outputs.append(output) return outputs # return torch.cat([_.unsqueeze(1) for _ in outputs], 1) - def get_logprobs_state(self, it, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, tmp_att_masks, state): + def get_logprobs_state(self, it, fc_feats, att_feats, p_att_feats, att_masks, state): # 'it' is Variable contraining a word index xt = self.embed(it) - output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, state, tmp_att_masks) + output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state, att_masks) logprobs = F.log_softmax(self.logit(output)) return logprobs, state @@ -144,12 +147,7 @@ def _sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}): beam_size = opt.get('beam_size', 10) batch_size = fc_feats.size(0) - # embed fc and att feats - fc_feats = self.fc_embed(fc_feats) - att_feats = pack_wrapper(self.att_embed, att_feats, att_masks) - - # Project the attention feats first to reduce memory and computation comsumptions. - p_att_feats = self.ctx2att(att_feats) + fc_feats, att_feats, p_att_feats = self._prepare_feature(fc_feats, att_feats, att_masks) assert beam_size <= self.vocab_size + 1, 'lets assume this for now, otherwise this corner case causes a few headaches down the road. 
can be dealt with in future if needed' seq = torch.LongTensor(self.seq_length, batch_size).zero_() @@ -166,11 +164,9 @@ def _sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}): for t in range(1): if t == 0: # input - it = fc_feats.data.new(beam_size).long().zero_() - xt = self.embed(Variable(it, requires_grad=False)) + it = Variable(fc_feats.data.new(beam_size).long().zero_()) - output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, state, tmp_att_masks) - logprobs = F.log_softmax(self.logit(output)) + logprobs, state = self.get_logprobs_state(it, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, tmp_att_masks, state) self.done_beams[k] = self.beam_search(state, logprobs, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, tmp_att_masks, opt=opt) seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score @@ -191,12 +187,7 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): batch_size = fc_feats.size(0) state = self.init_hidden(batch_size) - # embed fc and att feats - fc_feats = self.fc_embed(fc_feats) - att_feats = pack_wrapper(self.att_embed, att_feats, att_masks) - - # Project the attention feats first to reduce memory and computation comsumptions. - p_att_feats = self.ctx2att(att_feats) + fc_feats, att_feats, p_att_feats = self._prepare_feature(fc_feats, att_feats, att_masks) # seq = [] # seqLogprobs = [] @@ -218,8 +209,6 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): sampleLogprobs = logprobs.gather(1, Variable(it, requires_grad=False)) # gather the logprobs at sampled positions it = it.view(-1).long() # and flatten indices for downstream processing - xt = self.embed(Variable(it, requires_grad=False)) - if t >= 1: # stop when all finished if t == 1: @@ -235,13 +224,12 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): # seqLogprobs.append(sampleLogprobs.view(-1)) seqLogprobs[:,t-1] = sampleLogprobs.view(-1) - output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state, att_masks) + it = Variable(it) + logprobs, state = self.get_logprobs_state(it, fc_feats, att_feats, p_att_feats, att_masks, state) if decoding_constraint and t > 0: tmp = output.data.new(output.size(0), self.vocab_size + 1).zero_() tmp.scatter_(1, seq[:,t-1].data.unsqueeze(1), float('-inf')) - logprobs = F.log_softmax(self.logit(output)+Variable(tmp)) - else: - logprobs = F.log_softmax(self.logit(output)) + logprobs = logprobs + Variable(tmp) return seq, seqLogprobs # return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1) From 32503c2f85ed5cc7d345a9f83f1e83c87a1f2c88 Mon Sep 17 00:00:00 2001 From: Ruotian Luo Date: Sat, 14 Apr 2018 16:03:39 -0500 Subject: [PATCH 31/42] Fix some in evals. 
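Both are one-line fixes. The first corrects a copy-paste from PATCH 22: the box-feature
directory restored from a checkpoint's saved options was being written into input_att_dir.
The intended restore logic (a sketch of the corrected eval.py block):

    # restore data paths from the checkpoint's saved opt when not given on the command line
    if len(opt.input_fc_dir) == 0:
        opt.input_fc_dir = infos['opt'].input_fc_dir
        opt.input_att_dir = infos['opt'].input_att_dir
        opt.input_box_dir = infos['opt'].input_box_dir  # previously clobbered input_att_dir
        opt.input_label_h5 = infos['opt'].input_label_h5

The second comments out the json FLOAT_REPR hack in language_eval, which recent versions of
the json module ignore.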
---
 eval.py | 2 +-
 eval_utils.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/eval.py b/eval.py
index 033a32af..5d4e31fe 100644
--- a/eval.py
+++ b/eval.py
@@ -94,7 +94,7 @@
 if len(opt.input_fc_dir) == 0:
     opt.input_fc_dir = infos['opt'].input_fc_dir
     opt.input_att_dir = infos['opt'].input_att_dir
-    opt.input_att_dir = infos['opt'].input_box_dir
+    opt.input_box_dir = infos['opt'].input_box_dir
     opt.input_label_h5 = infos['opt'].input_label_h5
 if len(opt.input_json) == 0:
     opt.input_json = infos['opt'].input_json
diff --git a/eval_utils.py b/eval_utils.py
index 2eec7214..3ca270af 100644
--- a/eval_utils.py
+++ b/eval_utils.py
@@ -23,7 +23,7 @@ def language_eval(dataset, preds, model_id, split):
     from pycocotools.coco import COCO
     from pycocoevalcap.eval import COCOEvalCap
 
-    encoder.FLOAT_REPR = lambda o: format(o, '.3f')
+    # encoder.FLOAT_REPR = lambda o: format(o, '.3f')
 
     if not os.path.isdir('eval_results'):
         os.mkdir('eval_results')

From 68f970aca1f95cddbc1c5fc4e4c7e6b6bffb2293 Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Thu, 26 Apr 2018 15:19:58 -0500
Subject: [PATCH 32/42] Update to pytorch 0.4

---
 dataloader.py | 17 ++++++++++++++++-
 eval_utils.py | 11 ++++++-----
 misc/rewards.py | 6 ++----
 misc/utils.py | 4 ++--
 models/Att2inModel.py | 27 +++++++++++++--------------
 models/AttEnsemble.py | 29 ++++++++++++++---------------
 models/AttModel.py | 41 +++++++++++++++++++----------------------
 models/CaptionModel.py | 2 +-
 models/FCModel.py | 33 ++++++++++++++++-----------------
 train.py | 7 +++----
 10 files changed, 92 insertions(+), 85 deletions(-)

diff --git a/dataloader.py b/dataloader.py
index dfe8d1bb..fd76ec77 100644
--- a/dataloader.py
+++ b/dataloader.py
@@ -209,6 +209,21 @@ def __getitem__(self, index):
     def __len__(self):
         return len(self.info['images'])
 
+class SubsetSampler(torch.utils.data.sampler.Sampler):
+    r"""Samples elements sequentially from a given list of indices, without replacement.
+ Arguments: + indices (list): a list of indices + """ + + def __init__(self, indices): + self.indices = indices + + def __iter__(self): + return (self.indices[i] for i in range(len(self.indices))) + + def __len__(self): + return len(self.indices) + class BlobFetcher(): """Experimental class for prefetching blobs in a separate process.""" def __init__(self, split, dataloader, if_shuffle=False): @@ -229,7 +244,7 @@ def reset(self): # batch_size is 1, the merge is done in DataLoader class self.split_loader = iter(data.DataLoader(dataset=self.dataloader, batch_size=1, - sampler=self.dataloader.split_ix[self.split][self.dataloader.iterators[self.split]:], + sampler=SubsetSampler(self.dataloader.split_ix[self.split][self.dataloader.iterators[self.split]:]), shuffle=False, pin_memory=True, num_workers=4, # 4 is usually enough diff --git a/eval_utils.py b/eval_utils.py index 3ca270af..8fd2a709 100644 --- a/eval_utils.py +++ b/eval_utils.py @@ -4,7 +4,6 @@ import torch import torch.nn as nn -from torch.autograd import Variable import numpy as np import json @@ -83,10 +82,11 @@ def eval_split(model, crit, loader, eval_kwargs={}): if data.get('labels', None) is not None and verbose_loss: # forward the model to get loss tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks'], data['att_masks']] - tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp] + tmp = [torch.from_numpy(_).cuda() for _ in tmp] fc_feats, att_feats, labels, masks, att_masks = tmp - loss = crit(model(fc_feats, att_feats, labels, att_masks), labels[:,1:], masks[:,1:]).data[0] + with torch.no_grad(): + loss = crit(model(fc_feats, att_feats, labels, att_masks), labels[:,1:], masks[:,1:]).item() loss_sum = loss_sum + loss loss_evals = loss_evals + 1 @@ -95,10 +95,11 @@ def eval_split(model, crit, loader, eval_kwargs={}): tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img], data['att_masks'][np.arange(loader.batch_size) * loader.seq_per_img]] - tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp] + tmp = [torch.from_numpy(_).cuda() for _ in tmp] fc_feats, att_feats, att_masks = tmp # forward the model to also get generated samples for each image - seq = model(fc_feats, att_feats, att_masks, opt=eval_kwargs, mode='sample')[0].data + with torch.no_grad(): + seq = model(fc_feats, att_feats, att_masks, opt=eval_kwargs, mode='sample')[0].data # Print beam search if beam_size > 1 and verbose_beam: diff --git a/misc/rewards.py b/misc/rewards.py index 0a29267e..0935336b 100644 --- a/misc/rewards.py +++ b/misc/rewards.py @@ -7,7 +7,6 @@ import misc.utils as utils from collections import OrderedDict import torch -from torch.autograd import Variable import sys sys.path.append("cider") @@ -39,9 +38,8 @@ def get_self_critical_reward(model, fc_feats, att_feats, att_masks, data, gen_re # get greedy decoding baseline model.eval() - greedy_res, _ = model(Variable(fc_feats.data, volatile=True), - Variable(att_feats.data, volatile=True), - att_masks=Variable(att_masks.data, volatile=True), mode='sample') + with torch.no_grad(): + greedy_res, _ = model(fc_feats, att_feats, att_masks=att_masks, mode='sample') model.train() res = OrderedDict() diff --git a/misc/utils.py b/misc/utils.py index f9eb81a8..2f49bf8d 100644 --- a/misc/utils.py +++ b/misc/utils.py @@ -5,7 +5,6 @@ import collections import torch import torch.nn as nn -from torch.autograd import Variable import numpy as np import torch.optim as optim @@ 
-26,7 +25,7 @@ def decode_sequence(ix_to_word, seq): if ix > 0 : if j >= 1: txt = txt + ' ' - txt = txt + ix_to_word[str(ix)] + txt = txt + ix_to_word[str(ix.item())] else: break out.append(txt) @@ -51,6 +50,7 @@ def forward(self, input, seq, reward): output = torch.sum(output) / torch.sum(mask) return output + class LanguageModelCriterion(nn.Module): def __init__(self): super(LanguageModelCriterion, self).__init__() diff --git a/models/Att2inModel.py b/models/Att2inModel.py index 9af2d6e2..daf3481b 100644 --- a/models/Att2inModel.py +++ b/models/Att2inModel.py @@ -51,7 +51,7 @@ def forward(self, xt, fc_feats, att_feats, p_att_feats, state): dot = self.alpha_net(dot) # (batch * att_size) * 1 dot = dot.view(-1, att_size) # batch * att_size - weight = F.softmax(dot) # batch * att_size + weight = F.softmax(dot, dim=1) # batch * att_size att_feats_ = att_feats.view(-1, att_size, self.att_feat_size) # batch * att_size * att_feat_size att_res = torch.bmm(weight.unsqueeze(1), att_feats_).squeeze(1) # batch * att_feat_size @@ -104,9 +104,9 @@ def init_weights(self): self.logit.weight.data.uniform_(-initrange, initrange) def init_hidden(self, bsz): - weight = next(self.parameters()).data - return (Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_()), - Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_())) + weight = next(self.parameters()) + return (weight.new_zeros(self.num_layers, bsz, self.rnn_size), + weight.new_zeros(self.num_layers, bsz, self.rnn_size)) def forward(self, fc_feats, att_feats, seq): batch_size = fc_feats.size(0) @@ -131,27 +131,26 @@ def forward(self, fc_feats, att_feats, seq): #it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1)) prob_prev = torch.exp(outputs[-1].data) # fetch prev distribution: shape Nx(M+1) it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1).index_select(0, sample_ind)) - it = Variable(it, requires_grad=False) else: it = seq[:, i].clone() # break if all the sequences end - if i >= 1 and seq[:, i].data.sum() == 0: + if i >= 1 and seq[:, i].sum() == 0: break xt = self.embed(it) output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state) - output = F.log_softmax(self.logit(output)) + output = F.log_softmax(self.logit(output), dim=1) outputs.append(output) return torch.cat([_.unsqueeze(1) for _ in outputs], 1) def get_logprobs_state(self, it, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, state): - # 'it' is Variable contraining a word index + # 'it' contains a word index xt = self.embed(it) output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, state) - logprobs = F.log_softmax(self.logit(output)) + logprobs = F.log_softmax(self.logit(output), dim=1) return logprobs, state @@ -178,10 +177,10 @@ def sample_beam(self, fc_feats, att_feats, opt={}): for t in range(1): if t == 0: # input it = fc_feats.data.new(beam_size).long().zero_() - xt = self.embed(Variable(it, requires_grad=False)) + xt = self.embed(it) output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, state) - logprobs = F.log_softmax(self.logit(output)) + logprobs = F.log_softmax(self.logit(output), dim=1) self.done_beams[k] = self.beam_search(state, logprobs, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, opt=opt) seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score @@ -218,10 +217,10 @@ def sample(self, fc_feats, att_feats, opt={}): # scale logprobs by temperature prob_prev = torch.exp(torch.div(logprobs.data, temperature)).cpu() it = 
torch.multinomial(prob_prev, 1).cuda()
-            sampleLogprobs = logprobs.gather(1, Variable(it, requires_grad=False)) # gather the logprobs at sampled positions
+            sampleLogprobs = logprobs.gather(1, it) # gather the logprobs at sampled positions
             it = it.view(-1).long() # and flatten indices for downstream processing
 
-            xt = self.embed(Variable(it, requires_grad=False))
+            xt = self.embed(it)
 
             if t >= 1:
                 # stop when all finished
@@ -237,6 +236,6 @@ def sample(self, fc_feats, att_feats, opt={}):
             seqLogprobs.append(sampleLogprobs.view(-1))
 
         output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state)
-        logprobs = F.log_softmax(self.logit(output))
+        logprobs = F.log_softmax(self.logit(output), dim=1)
 
     return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1)
\ No newline at end of file
diff --git a/models/AttEnsemble.py b/models/AttEnsemble.py
index 6d0ebb06..a50e3e32 100644
--- a/models/AttEnsemble.py
+++ b/models/AttEnsemble.py
@@ -44,11 +44,11 @@ def core(self, *args):
         return zip(*[m.core(*_) for m, _ in zip(self.models, zip(*args))])
 
     def get_logprobs_state(self, it, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, tmp_att_masks, state):
-        # 'it' is Variable contraining a word index
-        xt = self.embed(Variable(it, requires_grad=False))
+        # 'it' contains a word index
+        xt = self.embed(it)
 
         output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, state, tmp_att_masks)
-        logprobs = torch.stack([F.softmax(m.logit(output[i])) for i,m in enumerate(self.models)], 2).mean(2).log()
+        logprobs = torch.stack([F.softmax(m.logit(output[i]), dim=1) for i,m in enumerate(self.models)], 2).mean(2).log()
 
         return logprobs, state
 
@@ -57,7 +57,7 @@ def _forward(self, fc_feats, att_feats, seq, att_masks=None):
         state = self.init_hidden(batch_size)
 
         # outputs = []
-        outputs = Variable(fc_feats.data.new(batch_size, seq.size(1) - 1, self.vocab_size+1).zero_())
+        outputs = fc_feats.new_zeros(batch_size, seq.size(1) - 1, self.vocab_size+1)
 
         # embed fc and att feats
         fc_feats = [m.fc_embed(fc_feats) for m in self.models]
@@ -80,7 +80,6 @@ def _forward(self, fc_feats, att_feats, seq, att_masks=None):
                     # prob_prev = torch.exp(outputs[-1].data) # fetch prev distribution: shape Nx(M+1)
                     prob_prev = torch.exp(outputs[:, i-1].data) # fetch prev distribution: shape Nx(M+1)
                     it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1).index_select(0, sample_ind))
-                    it = Variable(it, requires_grad=False)
             else:
                 it = seq[:, i].clone()
             # break if all the sequences end
@@ -90,7 +89,7 @@ def _forward(self, fc_feats, att_feats, seq, att_masks=None):
             xt = self.embed(it)
 
             output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state, att_masks)
-            output = torch.stack([F.softmax(m.logit(output[i])) for i,m in enumerate(self.models)], 2).mean(2).log()
+            output = torch.stack([F.softmax(m.logit(output[i]), dim=1) for i,m in enumerate(self.models)], 2).mean(2).log()
             outputs[:, i] = output
             # outputs.append(output)
 
@@ -124,16 +123,16 @@ def _sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}):
             for t in range(1):
                 if t == 0: # input
                     it = fc_feats[0].data.new(beam_size).long().zero_()
-                    xt = self.embed(Variable(it, requires_grad=False))
+                    xt = self.embed(it)
 
                 output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, state, tmp_att_masks)
-                logprobs = torch.stack([F.softmax(m.logit(output[i])) for i,m in enumerate(self.models)], 2).mean(2).log()
+                logprobs = torch.stack([F.softmax(m.logit(output[i]), dim=1) for i,m in
enumerate(self.models)], 2).mean(2).log() self.done_beams[k] = self.beam_search(state, logprobs, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, tmp_att_masks, opt=opt) seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score seqLogprobs[:, k] = self.done_beams[k][0]['logps'] # return the samples and their log likelihoods - return Variable(seq.transpose(0, 1)), Variable(seqLogprobs.transpose(0, 1)) + return seq.transpose(0, 1), seqLogprobs.transpose(0, 1) def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): sample_max = opt.get('sample_max', 1) @@ -155,8 +154,8 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): # seq = [] # seqLogprobs = [] - seq = Variable(fc_feats[0].data.new(batch_size, self.seq_length).long().zero_()) - seqLogprobs = Variable(fc_feats[0].data.new(batch_size, self.seq_length).zero_()) + seq = fc_feats[0].new_zeros((batch_size, self.seq_length), dtype=torch.long) + seqLogprobs = fc_feats[0].new_zeros(batch_size, self.seq_length) for t in range(self.seq_length + 1): if t == 0: # input it = fc_feats[0].data.new(batch_size).long().zero_() @@ -170,10 +169,10 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): # scale logprobs by temperature prob_prev = torch.exp(torch.div(logprobs.data, temperature)) it = torch.multinomial(prob_prev, 1) - sampleLogprobs = logprobs.gather(1, Variable(it, requires_grad=False)) # gather the logprobs at sampled positions + sampleLogprobs = logprobs.gather(1, it) # gather the logprobs at sampled positions it = it.view(-1).long() # and flatten indices for downstream processing - xt = self.embed(Variable(it, requires_grad=False)) + xt = self.embed(it) if t >= 1: # stop when all finished @@ -194,9 +193,9 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): if decoding_constraint and t > 0: tmp = output.data.new(output.size(0), self.vocab_size + 1).zero_() tmp.scatter_(1, seq[:,t-1].data.unsqueeze(1), float('-inf')) - logprobs = torch.stack([F.softmax(m.logit(output[i]+Variable(tmp))) for i,m in enumerate(self.models)], 2).mean(2).log() + logprobs = torch.stack([F.softmax(m.logit(output[i]+tmp), dim=1) for i,m in enumerate(self.models)], 2).mean(2).log() else: - logprobs = torch.stack([F.softmax(m.logit(output[i])) for i,m in enumerate(self.models)], 2).mean(2).log() + logprobs = torch.stack([F.softmax(m.logit(output[i]), dim=1) for i,m in enumerate(self.models)], 2).mean(2).log() return seq, seqLogprobs # return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1) diff --git a/models/AttModel.py b/models/AttModel.py index 45a8603a..0d1cbf12 100644 --- a/models/AttModel.py +++ b/models/AttModel.py @@ -19,7 +19,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -from torch.autograd import * import misc.utils as utils from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence @@ -72,9 +71,9 @@ def __init__(self, opt): self.ctx2att = nn.Linear(self.rnn_size, self.att_hid_size) def init_hidden(self, bsz): - weight = next(self.parameters()).data - return (Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_()), - Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_())) + weight = next(self.parameters()) + return (weight.new_zeros(self.num_layers, bsz, self.rnn_size), + weight.new_zeros(self.num_layers, bsz, self.rnn_size)) def clip_att(self, att_feats, att_masks): # Clip the length of att_masks and att_feats to the maximum length @@ -102,13 +101,13 @@ def 
_forward(self, fc_feats, att_feats, seq, att_masks=None): state = self.init_hidden(batch_size) # outputs = [] - outputs = Variable(fc_feats.data.new(batch_size, seq.size(1) - 1, self.vocab_size+1).zero_()) + outputs = fc_feats.new_zeros(batch_size, seq.size(1) - 1, self.vocab_size+1) fc_feats, att_feats, p_att_feats = self._prepare_feature(fc_feats, att_feats, att_masks) for i in range(seq.size(1) - 1): if self.training and i >= 1 and self.ss_prob > 0.0: # otherwiste no need to sample - sample_prob = fc_feats.data.new(batch_size).uniform_(0, 1) + sample_prob = fc_feats.new(batch_size).uniform_(0, 1) sample_mask = sample_prob < self.ss_prob if sample_mask.sum() == 0: it = seq[:, i].clone() @@ -118,13 +117,12 @@ def _forward(self, fc_feats, att_feats, seq, att_masks=None): #prob_prev = torch.exp(outputs[-1].data.index_select(0, sample_ind)) # fetch prev distribution: shape Nx(M+1) #it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1)) # prob_prev = torch.exp(outputs[-1].data) # fetch prev distribution: shape Nx(M+1) - prob_prev = torch.exp(outputs[:, i-1].data) # fetch prev distribution: shape Nx(M+1) + prob_prev = torch.exp(outputs[:, i-1].detach()) # fetch prev distribution: shape Nx(M+1) it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1).index_select(0, sample_ind)) - it = Variable(it, requires_grad=False) else: it = seq[:, i].clone() # break if all the sequences end - if i >= 1 and seq[:, i].data.sum() == 0: + if i >= 1 and seq[:, i].sum() == 0: break output, state = self.get_logprobs_state(it, fc_feats, att_feats, p_att_feats, att_masks, state) @@ -135,11 +133,11 @@ def _forward(self, fc_feats, att_feats, seq, att_masks=None): # return torch.cat([_.unsqueeze(1) for _ in outputs], 1) def get_logprobs_state(self, it, fc_feats, att_feats, p_att_feats, att_masks, state): - # 'it' is Variable contraining a word index + # 'it' contains a word index xt = self.embed(it) output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state, att_masks) - logprobs = F.log_softmax(self.logit(output)) + logprobs = F.log_softmax(self.logit(output), dim=1) return logprobs, state @@ -164,7 +162,7 @@ def _sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}): for t in range(1): if t == 0: # input - it = Variable(fc_feats.data.new(beam_size).long().zero_()) + it = fc_feats.new_zeros([beam_size], dtype=torch.long) logprobs, state = self.get_logprobs_state(it, tmp_fc_feats, tmp_att_feats, tmp_p_att_feats, tmp_att_masks, state) @@ -172,7 +170,7 @@ def _sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}): seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score seqLogprobs[:, k] = self.done_beams[k][0]['logps'] # return the samples and their log likelihoods - return Variable(seq.transpose(0, 1)), Variable(seqLogprobs.transpose(0, 1)) + return seq.transpose(0, 1), seqLogprobs.transpose(0, 1) def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): att_feats, att_masks = self.clip_att(att_feats, att_masks) @@ -191,11 +189,11 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): # seq = [] # seqLogprobs = [] - seq = Variable(fc_feats.data.new(batch_size, self.seq_length).long().zero_()) - seqLogprobs = Variable(fc_feats.data.new(batch_size, self.seq_length).zero_()) + seq = fc_feats.new_zeros((batch_size, self.seq_length), dtype=torch.long) + seqLogprobs = fc_feats.new_zeros(batch_size, self.seq_length) for t in range(self.seq_length + 1): if t == 0: # input - it = 
fc_feats.data.new(batch_size).long().zero_() + it = fc_feats.new_zeros(batch_size, dtype=torch.long) elif sample_max: sampleLogprobs, it = torch.max(logprobs.data, 1) it = it.view(-1).long() @@ -206,7 +204,7 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): # scale logprobs by temperature prob_prev = torch.exp(torch.div(logprobs.data, temperature)) it = torch.multinomial(prob_prev, 1) - sampleLogprobs = logprobs.gather(1, Variable(it, requires_grad=False)) # gather the logprobs at sampled positions + sampleLogprobs = logprobs.gather(1, it) # gather the logprobs at sampled positions it = it.view(-1).long() # and flatten indices for downstream processing if t >= 1: @@ -224,12 +222,11 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}): # seqLogprobs.append(sampleLogprobs.view(-1)) seqLogprobs[:,t-1] = sampleLogprobs.view(-1) - it = Variable(it) logprobs, state = self.get_logprobs_state(it, fc_feats, att_feats, p_att_feats, att_masks, state) if decoding_constraint and t > 0: - tmp = output.data.new(output.size(0), self.vocab_size + 1).zero_() + tmp = output.new_zeros(output.size(0), self.vocab_size + 1) tmp.scatter_(1, seq[:,t-1].data.unsqueeze(1), float('-inf')) - logprobs = logprobs + Variable(tmp) + logprobs = logprobs + tmp return seq, seqLogprobs # return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1) @@ -371,7 +368,7 @@ def forward(self, h_out, fake_region, conv_feat, conv_feat_embed, att_masks=None hA = F.dropout(hA,self.drop_prob_lm, self.training) hAflat = self.alpha_net(hA.view(-1, self.att_hid_size)) - PI = F.softmax(hAflat.view(-1, att_size + 1)) + PI = F.softmax(hAflat.view(-1, att_size + 1), dim=1) if att_masks is not None: att_masks = att_masks.view(-1, att_size) @@ -518,7 +515,7 @@ def forward(self, h, att_feats, p_att_feats, att_masks=None): dot = self.alpha_net(dot) # (batch * att_size) * 1 dot = dot.view(-1, att_size) # batch * att_size - weight = F.softmax(dot) # batch * att_size + weight = F.softmax(dot, dim=1) # batch * att_size if att_masks is not None: weight = weight * att_masks.view(-1, att_size).float() weight = weight / weight.sum(1, keepdim=True) # normalize to 1 diff --git a/models/CaptionModel.py b/models/CaptionModel.py index f9c10e8a..35ce100f 100644 --- a/models/CaptionModel.py +++ b/models/CaptionModel.py @@ -169,7 +169,7 @@ def beam_step(logprobsf, unaug_logprobsf, beam_size, t, beam_seq, beam_seq_logpr # move the current group one step forward in time it = beam_seq_table[divm][t-divm] - logprobs_table[divm], state_table[divm] = self.get_logprobs_state(Variable(it.cuda()), *(args[divm] + [state_table[divm]])) + logprobs_table[divm], state_table[divm] = self.get_logprobs_state(it.cuda(), *(args[divm] + [state_table[divm]])) # all beams are sorted by their log-probabilities done_beams_table = [sorted(done_beams_table[i], key=lambda x: -x['p'])[:bdash] for i in range(group_size)] diff --git a/models/FCModel.py b/models/FCModel.py index 3275824b..e25b923b 100644 --- a/models/FCModel.py +++ b/models/FCModel.py @@ -69,12 +69,12 @@ def init_weights(self): self.logit.weight.data.uniform_(-initrange, initrange) def init_hidden(self, bsz): - weight = next(self.parameters()).data + weight = next(self.parameters()) if self.rnn_type == 'lstm': - return (Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_()), - Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_())) + return (weight.new_zeros(self.num_layers, bsz, self.rnn_size), + weight.new_zeros(self.num_layers, bsz, 
self.rnn_size))
         else:
-            return Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_())
+            return weight.new_zeros(self.num_layers, bsz, self.rnn_size)
 
-    def forward(self, fc_feats, att_feats, seq):
+    def _forward(self, fc_feats, att_feats, seq, att_masks=None):
         batch_size = fc_feats.size(0)
         state = self.init_hidden(batch_size)
         outputs = []
@@ -97,26 +97,25 @@ def _forward(self, fc_feats, att_feats, seq, att_masks=None):
                     #it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1))
                     prob_prev = torch.exp(outputs[-1].data) # fetch prev distribution: shape Nx(M+1)
                     it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1).index_select(0, sample_ind))
-                it = Variable(it, requires_grad=False)
             else:
                 it = seq[:, i-1].clone()
             # break if all the sequences end
-            if i >= 2 and seq[:, i-1].data.sum() == 0:
+            if i >= 2 and seq[:, i-1].sum() == 0:
                 break
 
             xt = self.embed(it)
 
             output, state = self.core(xt, state)
-            output = F.log_softmax(self.logit(output))
+            output = F.log_softmax(self.logit(output), dim=1)
             outputs.append(output)
 
         return torch.cat([_.unsqueeze(1) for _ in outputs[1:]], 1).contiguous()
 
     def get_logprobs_state(self, it, state):
-        # 'it' is Variable contraining a word index
+        # 'it' contains a word index
         xt = self.embed(it)
 
         output, state = self.core(xt, state)
-        logprobs = F.log_softmax(self.logit(output))
+        logprobs = F.log_softmax(self.logit(output), dim=1)
 
         return logprobs, state
 
@@ -137,16 +136,16 @@ def _sample_beam(self, fc_feats, att_feats, att_masks=None, opt={}):
                     xt = self.img_embed(fc_feats[k:k+1]).expand(beam_size, self.input_encoding_size)
                 elif t == 1: # input
                     it = fc_feats.data.new(beam_size).long().zero_()
-                    xt = self.embed(Variable(it, requires_grad=False))
+                    xt = self.embed(it)
 
                 output, state = self.core(xt, state)
-                logprobs = F.log_softmax(self.logit(output))
+                logprobs = F.log_softmax(self.logit(output), dim=1)
 
             self.done_beams[k] = self.beam_search(state, logprobs, opt=opt)
             seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score
             seqLogprobs[:, k] = self.done_beams[k][0]['logps']
         # return the samples and their log likelihoods
-        return Variable(seq.transpose(0, 1)), Variable(seqLogprobs.transpose(0, 1))
+        return seq.transpose(0, 1), seqLogprobs.transpose(0, 1)
 
     def _sample(self, fc_feats, att_feats, att_masks=None, opt={}):
         sample_max = opt.get('sample_max', 1)
@@ -157,8 +156,8 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}):
         batch_size = fc_feats.size(0)
         state = self.init_hidden(batch_size)
 
-        seq = Variable(fc_feats.data.new(batch_size, self.seq_length).long().zero_())
-        seqLogprobs = Variable(fc_feats.data.new(batch_size, self.seq_length).zero_())
+        seq = fc_feats.new_zeros(batch_size, self.seq_length, dtype=torch.long)
+        seqLogprobs = fc_feats.new_zeros(batch_size, self.seq_length)
         for t in range(self.seq_length + 2):
             if t == 0:
                 xt = self.img_embed(fc_feats)
@@ -175,10 +174,10 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}):
                 # scale logprobs by temperature
                 prob_prev = torch.exp(torch.div(logprobs.data, temperature)).cpu()
                 it = torch.multinomial(prob_prev, 1).cuda()
-                sampleLogprobs = logprobs.gather(1, Variable(it, requires_grad=False)) # gather the logprobs at sampled positions
+                sampleLogprobs = logprobs.gather(1, it) # gather the logprobs at sampled positions
                 it = it.view(-1).long() # and flatten indices for downstream processing
 
-            xt = self.embed(Variable(it, requires_grad=False))
+            xt = self.embed(it)
 
             if t >= 2:
                 # stop when all finished
@@ -193,6 +192,6 @@ def _sample(self, fc_feats, att_feats, att_masks=None, opt={}):
                 seqLogprobs[:,t-2] = sampleLogprobs.view(-1)
 
             output,
diff --git a/train.py b/train.py
index 18aecfc6..c3dc2e5d 100644
--- a/train.py
+++ b/train.py
@@ -4,7 +4,6 @@
 
 import torch
 import torch.nn as nn
-from torch.autograd import Variable
 import torch.optim as optim
 
 import numpy as np
@@ -118,7 +117,7 @@ def train(opt):
         start = time.time()
 
         tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks'], data['att_masks']]
-        tmp = [Variable(torch.from_numpy(_), requires_grad=False).cuda() for _ in tmp]
+        tmp = [torch.from_numpy(_).cuda() for _ in tmp]
         fc_feats, att_feats, labels, masks, att_masks = tmp
 
         optimizer.zero_grad()
@@ -127,12 +126,12 @@ def train(opt):
         else:
             gen_result, sample_logprobs = dp_model(fc_feats, att_feats, att_masks, opt={'sample_max':0}, mode='sample')
             reward = get_self_critical_reward(dp_model, fc_feats, att_feats, att_masks, data, gen_result, opt)
-            loss = rl_crit(sample_logprobs, gen_result.data, Variable(torch.from_numpy(reward).float().cuda(), requires_grad=False))
+            loss = rl_crit(sample_logprobs, gen_result.data, torch.from_numpy(reward).float().cuda())
 
         loss.backward()
         utils.clip_gradient(optimizer, opt.grad_clip)
         optimizer.step()
-        train_loss = loss.data[0]
+        train_loss = loss.item()
         torch.cuda.synchronize()
         end = time.time()
         if not sc_flag:

From c41264ad3116ead592c7ccb87352d0333d7b0177 Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Sat, 28 Apr 2018 17:05:30 -0500
Subject: [PATCH 33/42] Update more to 0.4 version.

---
 dataloaderraw.py         |  6 +++---
 misc/resnet_utils.py     |  1 -
 misc/utils.py            |  2 +-
 models/OldModel.py       | 21 ++++++++++-----------
 models/ShowTellModel.py  | 23 +++++++++++------------
 scripts/prepro_feats.py  |  6 +++---
 scripts/prepro_labels.py |  1 -
 7 files changed, 28 insertions(+), 32 deletions(-)

diff --git a/dataloaderraw.py b/dataloaderraw.py
index d2180770..fbfe1557 100644
--- a/dataloaderraw.py
+++ b/dataloaderraw.py
@@ -8,7 +8,6 @@
 import numpy as np
 import random
 import torch
-from torch.autograd import Variable
 import skimage
 import skimage.io
 import scipy.misc
@@ -109,8 +108,9 @@ def get_batch(self, split, batch_size=None):
             img = img.astype('float32')/255.0
             img = torch.from_numpy(img.transpose([2,0,1])).cuda()
-            img = Variable(preprocess(img), volatile=True)
-            tmp_fc, tmp_att = self.my_resnet(img)
+            img = preprocess(img)
+            with torch.no_grad():
+                tmp_fc, tmp_att = self.my_resnet(img)
 
             fc_batch[i] = tmp_fc.data.cpu().float().numpy()
             att_batch[i] = tmp_att.data.cpu().float().numpy()
diff --git a/misc/resnet_utils.py b/misc/resnet_utils.py
index 6e76bbb3..e1df171a 100644
--- a/misc/resnet_utils.py
+++ b/misc/resnet_utils.py
@@ -1,6 +1,5 @@
 import torch
 import torch.nn as nn
-from torch.autograd import Variable
 import torch.nn.functional as F
 
 class myResnet(nn.Module):
diff --git a/misc/utils.py b/misc/utils.py
index 2f49bf8d..95b227cf 100644
--- a/misc/utils.py
+++ b/misc/utils.py
@@ -46,7 +46,7 @@ def forward(self, input, seq, reward):
         reward = to_contiguous(reward).view(-1)
         mask = (seq>0).float()
         mask = to_contiguous(torch.cat([mask.new(mask.size(0), 1).fill_(1), mask[:, :-1]], 1)).view(-1)
-        output = - input * reward * Variable(mask)
+        output = - input * reward * mask
         output = torch.sum(output) / torch.sum(mask)
         return output
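In the `misc/utils.py` hunk above, `RewardCriterion` weights each token's log-probability by the sequence reward, masked so that every word up to and including the first end token counts: the `(seq > 0)` mask is shifted right by one column with a leading column of ones. A toy check of that mask construction (hypothetical sequence values, not part of the patch):

```python
# Toy check of the mask built in RewardCriterion.forward (hypothetical values).
import torch

seq = torch.tensor([[5, 9, 0, 0]])   # sampled words; 0 marks end/padding
mask = (seq > 0).float()             # -> [[1., 1., 0., 0.]]
mask = torch.cat([mask.new(mask.size(0), 1).fill_(1), mask[:, :-1]], 1)
print(mask)                          # -> [[1., 1., 1., 0.]]: the end token itself is still scored
```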
diff --git a/models/OldModel.py b/models/OldModel.py
index 91e66ea0..351e8164 100644
--- a/models/OldModel.py
+++ b/models/OldModel.py
@@ -71,27 +71,26 @@ def forward(self, fc_feats, att_feats, seq):
                     #it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1))
                     prob_prev = torch.exp(outputs[-1].data) # fetch prev distribution: shape Nx(M+1)
                     it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1).index_select(0, sample_ind))
-                it = Variable(it, requires_grad=False)
             else:
                 it = seq[:, i].clone()
             # break if all the sequences end
-            if i >= 1 and seq[:, i].data.sum() == 0:
+            if i >= 1 and seq[:, i].sum() == 0:
                 break
 
             xt = self.embed(it)
 
             output, state = self.core(xt, fc_feats, att_feats, state)
-            output = F.log_softmax(self.logit(self.dropout(output)))
+            output = F.log_softmax(self.logit(self.dropout(output)), dim=1)
             outputs.append(output)
 
         return torch.cat([_.unsqueeze(1) for _ in outputs], 1)
 
     def get_logprobs_state(self, it, tmp_fc_feats, tmp_att_feats, state):
-        # 'it' is Variable contraining a word index
+        # 'it' contains a word index
         xt = self.embed(it)
 
         output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, state)
-        logprobs = F.log_softmax(self.logit(self.dropout(output)))
+        logprobs = F.log_softmax(self.logit(self.dropout(output)), dim=1)
 
         return logprobs, state
 
@@ -118,10 +117,10 @@ def sample_beam(self, fc_feats, att_feats, opt={}):
             for t in range(1):
                 if t == 0: # input <bos>
                     it = fc_feats.data.new(beam_size).long().zero_()
-                    xt = self.embed(Variable(it, requires_grad=False))
+                    xt = self.embed(it)
 
                 output, state = self.core(xt, tmp_fc_feats, tmp_att_feats, state)
-                logprobs = F.log_softmax(self.logit(self.dropout(output)))
+                logprobs = F.log_softmax(self.logit(self.dropout(output)), dim=1)
 
             self.done_beams[k] = self.beam_search(state, logprobs, tmp_fc_feats, tmp_att_feats, opt=opt)
             seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score
@@ -154,10 +153,10 @@ def sample(self, fc_feats, att_feats, opt={}):
                     # scale logprobs by temperature
                     prob_prev = torch.exp(torch.div(logprobs.data, temperature)).cpu()
                 it = torch.multinomial(prob_prev, 1).cuda()
-                sampleLogprobs = logprobs.gather(1, Variable(it, requires_grad=False)) # gather the logprobs at sampled positions
+                sampleLogprobs = logprobs.gather(1, it) # gather the logprobs at sampled positions
                 it = it.view(-1).long() # and flatten indices for downstream processing
-                xt = self.embed(Variable(it, requires_grad=False))
+                xt = self.embed(it)
 
             if t >= 1:
                 # stop when all finished
@@ -172,7 +171,7 @@ def sample(self, fc_feats, att_feats, opt={}):
             seqLogprobs.append(sampleLogprobs.view(-1))
 
             output, state = self.core(xt, fc_feats, att_feats, state)
-            logprobs = F.log_softmax(self.logit(self.dropout(output)))
+            logprobs = F.log_softmax(self.logit(self.dropout(output)), dim=1)
 
         return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1)
 
@@ -220,7 +219,7 @@ def forward(self, xt, fc_feats, att_feats, state):
             att_h = att_h.expand_as(att)                        # batch * att_size
             dot = att_h + att                                   # batch * att_size
 
-        weight = F.softmax(dot)
+        weight = F.softmax(dot, dim=1)
         att_feats_ = att_feats.view(-1, att_size, self.att_feat_size) # batch * att_size * att_feat_size
         att_res = torch.bmm(weight.unsqueeze(1), att_feats_).squeeze(1) # batch * att_feat_size
 
diff --git a/models/ShowTellModel.py b/models/ShowTellModel.py
index c82885e0..93ffa85a 100644
--- a/models/ShowTellModel.py
+++ b/models/ShowTellModel.py
@@ -41,10 +41,10 @@ def init_weights(self):
     def init_hidden(self, bsz):
         weight = next(self.parameters()).data
         if self.rnn_type == 'lstm':
-            return (Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_()),
-                    Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_()))
+            return (weight.new_zeros(self.num_layers, bsz, self.rnn_size),
+                    weight.new_zeros(self.num_layers, bsz, self.rnn_size))
         else:
-            return Variable(weight.new(self.num_layers, bsz, self.rnn_size).zero_())
+            return weight.new_zeros(self.num_layers, bsz, self.rnn_size)
 
     def forward(self, fc_feats, att_feats, seq):
         batch_size = fc_feats.size(0)
@@ -67,7 +67,6 @@ def forward(self, fc_feats, att_feats, seq):
                     #it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1))
                     prob_prev = torch.exp(outputs[-1].data) # fetch prev distribution: shape Nx(M+1)
                     it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1).index_select(0, sample_ind))
-                it = Variable(it, requires_grad=False)
             else:
                 it = seq[:, i-1].clone()
             # break if all the sequences end
@@ -76,17 +75,17 @@ def forward(self, fc_feats, att_feats, seq):
             xt = self.embed(it)
 
             output, state = self.core(xt.unsqueeze(0), state)
-            output = F.log_softmax(self.logit(self.dropout(output.squeeze(0))))
+            output = F.log_softmax(self.logit(self.dropout(output.squeeze(0))), dim=1)
             outputs.append(output)
 
         return torch.cat([_.unsqueeze(1) for _ in outputs[1:]], 1).contiguous()
 
     def get_logprobs_state(self, it, state):
-        # 'it' is Variable contraining a word index
+        # 'it' contains a word index
        xt = self.embed(it)
 
         output, state = self.core(xt.unsqueeze(0), state)
-        logprobs = F.log_softmax(self.logit(self.dropout(output.squeeze(0))))
+        logprobs = F.log_softmax(self.logit(self.dropout(output.squeeze(0))), dim=1)
 
         return logprobs, state
 
@@ -107,10 +106,10 @@ def sample_beam(self, fc_feats, att_feats, opt={}):
                 xt = self.img_embed(fc_feats[k:k+1]).expand(beam_size, self.input_encoding_size)
             elif t == 1: # input <bos>
                 it = fc_feats.data.new(beam_size).long().zero_()
-                xt = self.embed(Variable(it, requires_grad=False))
+                xt = self.embed(it)
 
             output, state = self.core(xt.unsqueeze(0), state)
-            logprobs = F.log_softmax(self.logit(self.dropout(output.squeeze(0))))
+            logprobs = F.log_softmax(self.logit(self.dropout(output.squeeze(0))), dim=1)
 
         self.done_beams[k] = self.beam_search(state, logprobs, opt=opt)
         seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score
@@ -145,10 +144,10 @@ def sample(self, fc_feats, att_feats, opt={}):
                     # scale logprobs by temperature
                     prob_prev = torch.exp(torch.div(logprobs.data, temperature)).cpu()
                 it = torch.multinomial(prob_prev, 1).cuda()
-                sampleLogprobs = logprobs.gather(1, Variable(it, requires_grad=False)) # gather the logprobs at sampled positions
+                sampleLogprobs = logprobs.gather(1, it) # gather the logprobs at sampled positions
                 it = it.view(-1).long() # and flatten indices for downstream processing
-                xt = self.embed(Variable(it, requires_grad=False))
+                xt = self.embed(it)
 
             if t >= 2:
                 # stop when all finished
@@ -163,6 +162,6 @@ def sample(self, fc_feats, att_feats, opt={}):
             seqLogprobs.append(sampleLogprobs.view(-1))
 
             output, state = self.core(xt.unsqueeze(0), state)
-            logprobs = F.log_softmax(self.logit(self.dropout(output.squeeze(0))))
+            logprobs = F.log_softmax(self.logit(self.dropout(output.squeeze(0))), dim=1)
 
         return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1)
\ No newline at end of file
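The next hunk replaces the 0.3-era `volatile=True` flag with a `torch.no_grad()` block for feature extraction, the same change made in `dataloaderraw.py` earlier. A minimal sketch of the idiom, with a stand-in module in place of the ResNet extractor (illustrative only):

```python
# volatile -> no_grad sketch (stand-in module, not the actual my_resnet).
import torch
import torch.nn as nn

extractor = nn.Linear(8, 4)      # stand-in for the ResNet feature extractor
img = torch.randn(1, 8)

# 0.3: img = Variable(preprocess(img), volatile=True); feats = extractor(img)
# 0.4: disable autograd bookkeeping with a context manager instead
with torch.no_grad():
    feats = extractor(img)

print(feats.requires_grad)       # False -- no graph was recorded
```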
diff --git a/scripts/prepro_feats.py b/scripts/prepro_feats.py
index 6489e49f..3f1e793c 100644
--- a/scripts/prepro_feats.py
+++ b/scripts/prepro_feats.py
@@ -38,7 +38,6 @@
 import numpy as np
 import torch
 import torchvision.models as models
-from torch.autograd import Variable
 import skimage.io
 
 from torchvision import transforms as trn
@@ -80,8 +79,9 @@ def main(params):
     I = I.astype('float32')/255.0
     I = torch.from_numpy(I.transpose([2,0,1])).cuda()
-    I = Variable(preprocess(I), volatile=True)
-    tmp_fc, tmp_att = my_resnet(I, params['att_size'])
+    I = preprocess(I)
+    with torch.no_grad():
+        tmp_fc, tmp_att = my_resnet(I, params['att_size'])
     # write to pkl
     np.save(os.path.join(dir_fc, str(img['cocoid'])), tmp_fc.data.cpu().float().numpy())
     np.savez_compressed(os.path.join(dir_att, str(img['cocoid'])), feat=tmp_att.data.cpu().float().numpy())
diff --git a/scripts/prepro_labels.py b/scripts/prepro_labels.py
index ffde89f0..d3db2b5b 100644
--- a/scripts/prepro_labels.py
+++ b/scripts/prepro_labels.py
@@ -37,7 +37,6 @@
 import numpy as np
 import torch
 import torchvision.models as models
-from torch.autograd import Variable
 import skimage.io
 from PIL import Image

From e0ffc77a7a7cb30006444977cd4123c57b7d245b Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Sat, 28 Apr 2018 18:02:26 -0500
Subject: [PATCH 34/42] Simplify resnet code.

---
 misc/resnet.py | 157 +++------------------------------------------------
 1 file changed, 8 insertions(+), 149 deletions(-)

diff --git a/misc/resnet.py b/misc/resnet.py
index 07a9c994..e8aaff42 100644
--- a/misc/resnet.py
+++ b/misc/resnet.py
@@ -1,156 +1,15 @@
+import torch
 import torch.nn as nn
-import math
-import torch.utils.model_zoo as model_zoo
+import torchvision.models.resnet
+from torchvision.models.resnet import BasicBlock, Bottleneck
 
-
-__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
-           'resnet152']
-
-
-model_urls = {
-    'resnet18': 'https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth',
-    'resnet34': 'https://s3.amazonaws.com/pytorch/models/resnet34-333f7ec4.pth',
-    'resnet50': 'https://s3.amazonaws.com/pytorch/models/resnet50-19c8e357.pth',
-    'resnet101': 'https://s3.amazonaws.com/pytorch/models/resnet101-5d3b4d8f.pth',
-    'resnet152': 'https://s3.amazonaws.com/pytorch/models/resnet152-b121ed2d.pth',
-}
-
-
-def conv3x3(in_planes, out_planes, stride=1):
-    "3x3 convolution with padding"
-    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
-                     padding=1, bias=False)
-
-
-class BasicBlock(nn.Module):
-    expansion = 1
-
-    def __init__(self, inplanes, planes, stride=1, downsample=None):
-        super(BasicBlock, self).__init__()
-        self.conv1 = conv3x3(inplanes, planes, stride)
-        self.bn1 = nn.BatchNorm2d(planes)
-        self.relu = nn.ReLU(inplace=True)
-        self.conv2 = conv3x3(planes, planes)
-        self.bn2 = nn.BatchNorm2d(planes)
-        self.downsample = downsample
-        self.stride = stride
-
-    def forward(self, x):
-        residual = x
-
-        out = self.conv1(x)
-        out = self.bn1(out)
-        out = self.relu(out)
-
-        out = self.conv2(out)
-        out = self.bn2(out)
-
-        if self.downsample is not None:
-            residual = self.downsample(x)
-
-        out += residual
-        out = self.relu(out)
-
-        return out
-
-
-class Bottleneck(nn.Module):
-    expansion = 4
-
-    def __init__(self, inplanes, planes, stride=1, downsample=None):
-        super(Bottleneck, self).__init__()
-        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False) # change
-        self.bn1 = nn.BatchNorm2d(planes)
-        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, # change
-                               padding=1, bias=False)
-        self.bn2 = nn.BatchNorm2d(planes)
-        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
-        self.bn3 = nn.BatchNorm2d(planes * 4)
-        self.relu = nn.ReLU(inplace=True)
-        self.downsample = downsample
-        self.stride = stride
-
-    def forward(self, x):
-        residual = x
-
-        out = self.conv1(x)
-        out = self.bn1(out)
-        out = self.relu(out)
-
-        out = self.conv2(out)
-        out = self.bn2(out)
-        out = self.relu(out)
-
-        out = self.conv3(out)
-        out = self.bn3(out)
-
-        if self.downsample is not None:
-            residual = self.downsample(x)
-
-        out += residual
-        out = self.relu(out)
-
-        return out
-
-
-class ResNet(nn.Module):
+class ResNet(torchvision.models.resnet.ResNet):
     def __init__(self, block, layers, num_classes=1000):
-        self.inplanes = 64
-        super(ResNet, self).__init__()
-        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
-                               bias=False)
-        self.bn1 = nn.BatchNorm2d(64)
-        self.relu = nn.ReLU(inplace=True)
+        super(ResNet, self).__init__(block, layers, num_classes)
         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True) # change
-        self.layer1 = self._make_layer(block, 64, layers[0])
-        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
-        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
-        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
-        self.avgpool = nn.AvgPool2d(7)
-        self.fc = nn.Linear(512 * block.expansion, num_classes)
-
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
-                m.weight.data.normal_(0, math.sqrt(2. / n))
-            elif isinstance(m, nn.BatchNorm2d):
-                m.weight.data.fill_(1)
-                m.bias.data.zero_()
-
-    def _make_layer(self, block, planes, blocks, stride=1):
-        downsample = None
-        if stride != 1 or self.inplanes != planes * block.expansion:
-            downsample = nn.Sequential(
-                nn.Conv2d(self.inplanes, planes * block.expansion,
-                          kernel_size=1, stride=stride, bias=False),
-                nn.BatchNorm2d(planes * block.expansion),
-            )
-
-        layers = []
-        layers.append(block(self.inplanes, planes, stride, downsample))
-        self.inplanes = planes * block.expansion
-        for i in range(1, blocks):
-            layers.append(block(self.inplanes, planes))
-
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        x = self.conv1(x)
-        x = self.bn1(x)
-        x = self.relu(x)
-        x = self.maxpool(x)
-
-        x = self.layer1(x)
-        x = self.layer2(x)
-        x = self.layer3(x)
-        x = self.layer4(x)
-
-        x = self.avgpool(x)
-        x = x.view(x.size(0), -1)
-        x = self.fc(x)
-
-        return x
-
+        for i in range(2, 5):
+            getattr(self, 'layer%d'%i)[0].conv1.stride = (2,2)
+            getattr(self, 'layer%d'%i)[0].conv2.stride = (1,1)
 
 def resnet18(pretrained=False):
     """Constructs a ResNet-18 model.

From 21cf0614165ca6f0a3ffd2ca4cfc4c26015e176a Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Sat, 28 Apr 2018 18:03:20 -0500
Subject: [PATCH 35/42] Add cider submodule

---
 .gitmodules | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 .gitmodules

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..0468205f
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "cider"]
+	path = cider
+	url = https://github.com/ruotianluo/cider.git
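The rewritten `ResNet` above now subclasses torchvision's implementation and only re-applies the two local changes: a ceil-mode max-pool, and moving each stage's downsampling stride off the 3×3 `conv2` (torchvision's placement) onto the 1×1 `conv1` (the Caffe-style placement the deleted hand-written blocks marked with `# change`). A sketch of what the stride loop does, assuming a stock torchvision model:

```python
# Stride relocation sketch, assuming torchvision's Bottleneck layout
# (stride on conv2 by default).
import torchvision

model = torchvision.models.resnet101()
for i in range(2, 5):
    block = getattr(model, 'layer%d' % i)[0]   # first block of layers 2-4
    block.conv1.stride = (2, 2)                # downsample in the 1x1 conv...
    block.conv2.stride = (1, 1)                # ...instead of the 3x3 conv

print(model.layer3[0].conv1.stride, model.layer3[0].conv2.stride)  # (2, 2) (1, 1)
```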
From 19d8d616d0ed99ae9879ad0a7ed5950b330cbe86 Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Sat, 28 Apr 2018 18:06:38 -0500
Subject: [PATCH 36/42] Add options and verbose for make_bu_data.

---
 scripts/make_bu_data.py | 36 +++++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/scripts/make_bu_data.py b/scripts/make_bu_data.py
index 5dd8b0f4..ee30a5f8 100644
--- a/scripts/make_bu_data.py
+++ b/scripts/make_bu_data.py
@@ -1,3 +1,7 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
 import os
 import base64
 import numpy as np
@@ -6,25 +10,43 @@
 import zlib
 import time
 import mmap
+import argparse
+
+parser = argparse.ArgumentParser()
+
+# output_dir
+parser.add_argument('--downloaded_feats', default='data/bu_data', help='downloaded feature directory')
+parser.add_argument('--output_dir', default='data/cocobu', help='output feature files')
+
+args = parser.parse_args()
 
 csv.field_size_limit(sys.maxsize)
 
 FIELDNAMES = ['image_id', 'image_w','image_h','num_boxes', 'boxes', 'features']
-#infiles = ['trainval/karpathy_test_resnet101_faster_rcnn_genome.tsv',
-#          'trainval/karpathy_val_resnet101_faster_rcnn_genome.tsv',
-infiles = ['trainval/karpathy_train_resnet101_faster_rcnn_genome.tsv.0', \
+infiles = ['trainval/karpathy_test_resnet101_faster_rcnn_genome.tsv',
+           'trainval/karpathy_val_resnet101_faster_rcnn_genome.tsv',\
+           'trainval/karpathy_train_resnet101_faster_rcnn_genome.tsv.0', \
            'trainval/karpathy_train_resnet101_faster_rcnn_genome.tsv.1']
 
+os.makedirs(args.output_dir+'_att')
+os.makedirs(args.output_dir+'_fc')
+os.makedirs(args.output_dir+'_box')
+
 for infile in infiles:
-    with open(infile, "r+b") as tsv_in_file:
+    print('Reading ' + infile)
+    with open(os.path.join(args.downloaded_feats, infile), "r+b") as tsv_in_file:
         reader = csv.DictReader(tsv_in_file, delimiter='\t', fieldnames = FIELDNAMES)
         for item in reader:
             item['image_id'] = int(item['image_id'])
             item['num_boxes'] = int(item['num_boxes'])
             for field in ['boxes', 'features']:
                 item[field] = np.frombuffer(base64.decodestring(item[field]),
-                        dtype=np.float32).reshape((item['num_boxes'],-1))
-            np.savez_compressed(os.path.join('../cocobu_att', str(item['image_id'])), feat=item['features'])
-            np.save(os.path.join('../cocobu_fc', str(item['image_id'])), item['features'].mean(0))
+                        dtype=np.float32).reshape((item['num_boxes'],-1))
+            np.savez_compressed(os.path.join(args.output_dir+'_att', str(item['image_id'])), feat=item['features'])
+            np.save(os.path.join(args.output_dir+'_fc', str(item['image_id'])), item['features'].mean(0))
+            np.save(os.path.join(args.output_dir+'_box', str(item['image_id'])), item['boxes'])
+
+
+
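Each row of the bottom-up TSV files stores its box features base64-encoded; the loop above decodes them into a `(num_boxes, 2048)` float32 array, then writes the per-box features (`_att`), their mean-pooled vector (`_fc`), and the box coordinates (`_box`). A toy round-trip of the decoding step (synthetic values; `decodestring` is the Python-2-era spelling the script uses, `b64decode` is the modern equivalent):

```python
# Toy round-trip of the base64 feature decoding above (synthetic data).
import base64
import numpy as np

num_boxes, feat_dim = 3, 2048                  # bottom-up features are 2048-d
feats = np.random.rand(num_boxes, feat_dim).astype(np.float32)
encoded = base64.b64encode(feats.tobytes())    # what a TSV 'features' column holds

decoded = np.frombuffer(base64.decodestring(encoded),
                        dtype=np.float32).reshape((num_boxes, -1))
assert np.array_equal(decoded, feats)
print(decoded.shape, decoded.mean(0).shape)    # (3, 2048) (2048,)
```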
From 7603d03fce192063b70b9bd8ecd25e73f58cefb7 Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Sat, 28 Apr 2018 18:07:01 -0500
Subject: [PATCH 37/42] Make image_root an optional option when prepro_label.

---
 scripts/prepro_labels.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/prepro_labels.py b/scripts/prepro_labels.py
index d3db2b5b..ced5bb7b 100644
--- a/scripts/prepro_labels.py
+++ b/scripts/prepro_labels.py
@@ -171,8 +171,9 @@ def main(params):
 
     if 'filename' in img: jimg['file_path'] = os.path.join(img['filepath'], img['filename']) # copy it over, might need
     if 'cocoid' in img: jimg['id'] = img['cocoid'] # copy over & maintain an id, if present (e.g. coco ids, useful)
-    with Image.open(os.path.join(params['images_root'], img['filepath'], img['filename'])) as _img:
-        jimg['width'], jimg['height'] = _img.size
+    if params['images_root'] != '':
+        with Image.open(os.path.join(params['images_root'], img['filepath'], img['filename'])) as _img:
+            jimg['width'], jimg['height'] = _img.size
 
     out['images'].append(jimg)
 

From 5f8f7e7e0680694ce493831650e20786b62cadd6 Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Sat, 28 Apr 2018 18:08:47 -0500
Subject: [PATCH 38/42] Add comments in Attmodel.

---
 models/AttModel.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/models/AttModel.py b/models/AttModel.py
index 0d1cbf12..ba7c8df5 100644
--- a/models/AttModel.py
+++ b/models/AttModel.py
@@ -11,6 +11,7 @@
 
 # TopDown is from Bottom-Up and Top-Down Attention for Image Captioning and VQA
 # https://arxiv.org/abs/1707.07998
+# However, it may not be identical to the author's architecture.
 
 from __future__ import absolute_import
 from __future__ import division
@@ -423,6 +424,12 @@ def forward(self, xt, fc_feats, att_feats, p_att_feats, state, att_masks=None):
 
         return output, state
 
+############################################################################
+# Notice:
+# StackAtt and DenseAtt are models that I randomly designed.
+# They are not related to any paper.
+############################################################################
+
 from .FCModel import LSTMCore
 class StackAttCore(nn.Module):
     def __init__(self, opt, use_maxout=False):
@@ -567,6 +574,11 @@ def forward(self, xt, fc_feats, att_feats, p_att_feats, state, att_masks=None):
         state = (next_h.unsqueeze(0), next_c.unsqueeze(0))
         return output, state
 
+
+"""
+Note: this is my attempt to replicate the att2all model in the self-critical paper.
+However, it is not an exact replication; will fix it.
+"""
 class Att2all2Core(nn.Module):
     def __init__(self, opt):
         super(Att2all2Core, self).__init__()

From 9ebae0a1957a001d05d7873ae00190ce4d97983a Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Sat, 28 Apr 2018 18:17:28 -0500
Subject: [PATCH 39/42] Add compatibility to resnet features.
---
 dataloader.py | 5 +++++
 train.py      | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/dataloader.py b/dataloader.py
index fd76ec77..79f7fbac 100644
--- a/dataloader.py
+++ b/dataloader.py
@@ -164,6 +164,9 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
         data['att_masks'] = np.zeros(data['att_feats'].shape[:2], dtype='float32')
         for i in range(len(att_batch)):
             data['att_masks'][i*seq_per_img:(i+1)*seq_per_img, :att_batch[i].shape[0]] = 1
+        # set att_masks to None if attention features have same length
+        if data['att_masks'].sum() == data['att_masks'].size:
+            data['att_masks'] = None
 
         data['labels'] = np.vstack(label_batch)
         # generate mask
@@ -187,6 +190,8 @@ def __getitem__(self, index):
         ix = index #self.split_ix[index]
         if self.use_att:
             att_feat = np.load(os.path.join(self.input_att_dir, str(self.info['images'][ix]['id']) + '.npz'))['feat']
+            # Reshape to K x C
+            att_feat = att_feat.reshape(-1, att_feat.shape[-1])
             if self.norm_att_feat:
                 att_feat = att_feat / np.linalg.norm(att_feat, 2, 1, keepdims=True)
             if self.use_box:
diff --git a/train.py b/train.py
index c3dc2e5d..8b0335b8 100644
--- a/train.py
+++ b/train.py
@@ -117,7 +117,7 @@ def train(opt):
         start = time.time()
 
         tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks'], data['att_masks']]
-        tmp = [torch.from_numpy(_).cuda() for _ in tmp]
+        tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
         fc_feats, att_feats, labels, masks, att_masks = tmp
 
         optimizer.zero_grad()

From 8c1b8aa211a70213aacd939516baf07735e4484a Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Sat, 28 Apr 2018 19:43:06 -0500
Subject: [PATCH 40/42] Sort the features in the forwarding instead of dataloader.

---
 dataloader.py      |  6 ++++--
 models/AttModel.py | 16 ++++++++++++++--
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/dataloader.py b/dataloader.py
index 79f7fbac..d90cea71 100644
--- a/dataloader.py
+++ b/dataloader.py
@@ -151,9 +151,11 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
             info_dict['file_path'] = self.info['images'][ix]['file_path']
             infos.append(info_dict)
 
-        #sort by att_feat length
+        # #sort by att_feat length
+        # fc_batch, att_batch, label_batch, gts, infos = \
+        #     zip(*sorted(zip(fc_batch, att_batch, np.vsplit(label_batch, batch_size), gts, infos), key=lambda x: len(x[1]), reverse=True))
         fc_batch, att_batch, label_batch, gts, infos = \
-            zip(*sorted(zip(fc_batch, att_batch, np.vsplit(label_batch, batch_size), gts, infos), key=lambda x: len(x[1]), reverse=True))
+            zip(*sorted(zip(fc_batch, att_batch, np.vsplit(label_batch, batch_size), gts, infos), key=lambda x: 0, reverse=True))
 
         data = {}
         data['fc_feats'] = np.stack(reduce(lambda x,y:x+y, [[_]*seq_per_img for _ in fc_batch]))
         # merge att_feats
diff --git a/models/AttModel.py b/models/AttModel.py
index ba7c8df5..40184f9d 100644
--- a/models/AttModel.py
+++ b/models/AttModel.py
@@ -25,10 +25,22 @@
 
 from .CaptionModel import CaptionModel
 
+def sort_pack_padded_sequence(input, lengths):
+    sorted_lengths, indices = torch.sort(lengths, descending=True)
+    tmp = pack_padded_sequence(input[indices], sorted_lengths, batch_first=True)
+    inv_ix = indices.clone()
+    inv_ix[indices] = torch.arange(0,len(indices)).type_as(inv_ix)
+    return tmp, inv_ix
+
+def pad_unsort_packed_sequence(input, inv_ix):
+    tmp, _ = pad_packed_sequence(input, batch_first=True)
+    tmp = tmp[inv_ix]
+    return tmp
+
 def pack_wrapper(module, att_feats, att_masks):
     if att_masks is not None:
-        packed = pack_padded_sequence(att_feats, list(att_masks.data.long().sum(1)), batch_first=True)
-        return pad_packed_sequence(PackedSequence(module(packed[0]), packed[1]), batch_first=True)[0]
+        packed, inv_ix = sort_pack_padded_sequence(att_feats, att_masks.data.long().sum(1))
+        return pad_unsort_packed_sequence(PackedSequence(module(packed[0]), packed[1]), inv_ix)
     else:
         return module(att_feats)
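`pack_padded_sequence` requires lengths sorted in descending order, so the helpers above sort the batch, pack it, and record the inverse permutation `inv_ix` that restores the original order after unpacking. A self-contained round-trip sketch (toy shapes, no module applied in between):

```python
# Round-trip sketch of sort_pack_padded_sequence / pad_unsort_packed_sequence.
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

feats = torch.randn(3, 5, 4)          # batch of 3, padded to 5 timesteps
lengths = torch.tensor([2, 5, 3])     # true lengths, deliberately unsorted

sorted_lengths, indices = torch.sort(lengths, descending=True)
packed = pack_padded_sequence(feats[indices], sorted_lengths, batch_first=True)

inv_ix = indices.clone()
inv_ix[indices] = torch.arange(0, len(indices)).type_as(inv_ix)

unpacked, _ = pad_packed_sequence(packed, batch_first=True)
restored = unpacked[inv_ix]           # rows back in the original batch order
assert torch.equal(restored[2, :3], feats[2, :3])
```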
From b8d55de881b7c9ede9b73a80e2a0475a121d7709 Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Sat, 28 Apr 2018 19:43:41 -0500
Subject: [PATCH 41/42] Update readme.

---
 README.md | 81 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 65 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 5d5b0442..6ecc9d99 100644
--- a/README.md
+++ b/README.md
@@ -1,46 +1,78 @@
-# Self-critical Sequence Training for Image Captioning
+# Self-critical Sequence Training for Image Captioning (+ misc.)
 
-This is an unofficial implementation for [Self-critical Sequence Training for Image Captioning](https://arxiv.org/abs/1612.00563). The result of FC model can be replicated. (Not able to replicate Att2in result.)
+This repository includes unofficial implementations of [Self-critical Sequence Training for Image Captioning](https://arxiv.org/abs/1612.00563) and [Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering](https://arxiv.org/abs/1707.07998).
 
-The author helped me a lot when I tried to replicate the result. Great thanks. The latest topdown and att2in2 model can achieve 1.12 Cider score on Karpathy's test split after self-critical training.
+The author of SCST helped me a lot when I tried to replicate the result. Great thanks. The att2in2 model can achieve more than 1.20 Cider score on Karpathy's test split (with self-critical training, bottom-up features, a large rnn hidden size, and without ensemble).
 
-This is based on my [neuraltalk2.pytorch](https://github.com/ruotianluo/neuraltalk2.pytorch) repository. The modifications is:
-- Add self critical training.
+This is based on my [ImageCaptioning.pytorch](https://github.com/ruotianluo/ImageCaptioning.pytorch) repository. The modifications are:
+- Self-critical training.
+- Bottom-up feature support from [ref](https://arxiv.org/abs/1707.07998). (Evaluation on arbitrary images is not supported.)
+- Ensemble
+- Multi-GPU training
 
 ## Requirements
 Python 2.7 (because there is no [coco-caption](https://github.com/tylin/coco-caption) version for python 3)
-PyTorch 0.2 (along with torchvision)
+PyTorch 0.4 (along with torchvision)
+cider (already added as a submodule)
 
-You need to download pretrained resnet model for both training and evaluation. The models can be downloaded from [here](https://drive.google.com/open?id=0B7fNdx_jAqhtbVYzOURMdDNHSGM), and should be placed in `data/imagenet_weights`.
+(**Skip if you are using bottom-up features**): If you want to use resnet to extract image features, you need to download the pretrained resnet model for both training and evaluation. The models can be downloaded from [here](https://drive.google.com/open?id=0B7fNdx_jAqhtbVYzOURMdDNHSGM), and should be placed in `data/imagenet_weights`.
 
-## Pretrained models
+## Pretrained models (using resnet101 features)
 Pretrained models are provided [here](https://drive.google.com/open?id=0B7fNdx_jAqhtdE1JRXpmeGJudTg). And the performance of each model will be maintained in this [issue](https://github.com/ruotianluo/neuraltalk2.pytorch/issues/10).
 
-If you want to do evaluation only, then you can follow [this section](#generate-image-captions) after downloading the pretrained models.
+If you want to do evaluation only, you can then follow [this section](#generate-image-captions) after downloading the pretrained models (and also the pretrained resnet101).
 
 ## Train your own network on COCO
 
-### Download COCO dataset and preprocessing
-
-First, download the coco images from [link](http://mscoco.org/dataset/#download). We need 2014 training images and 2014 val. images. You should put the `train2014/` and `val2014/` in the same directory, denoted as `$IMAGE_ROOT`.
+### Download COCO captions and preprocess them
 
 Download preprocessed coco captions from [link](http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip) from Karpathy's homepage. Extract `dataset_coco.json` from the zip file and copy it into `data/`. This file provides preprocessed captions and also standard train-val-test splits.
 
-Once we have these, we can now invoke the `prepro_*.py` script, which will read all of this in and create a dataset (two feature folders, a hdf5 label file and a json file).
+Then do:
 
 ```bash
 $ python scripts/prepro_labels.py --input_json data/dataset_coco.json --output_json data/cocotalk.json --output_h5 data/cocotalk
-$ python scripts/prepro_feats.py --input_json data/dataset_coco.json --output_dir data/cocotalk --images_root $IMAGE_ROOT
 ```
 
 `prepro_labels.py` will map all words that occur <= 5 times to a special `UNK` token, and create a vocabulary for all the remaining words. The image information and vocabulary are dumped into `data/cocotalk.json` and discretized caption data are dumped into `data/cocotalk_label.h5`.
 
+### Download COCO dataset and pre-extract the image features (Skip if you are using bottom-up features)
+
+Download the coco images from [link](http://mscoco.org/dataset/#download). We need 2014 training images and 2014 val. images. You should put the `train2014/` and `val2014/` in the same directory, denoted as `$IMAGE_ROOT`.
+
+Then:
+
+```
+$ python scripts/prepro_feats.py --input_json data/dataset_coco.json --output_dir data/cocotalk --images_root $IMAGE_ROOT
+```
+
+`prepro_feats.py` extracts the resnet101 features (both the fc feature and the last conv feature) of each image. The features are saved in `data/cocotalk_fc` and `data/cocotalk_att`, and the resulting files are about 200GB. (Check the prepro scripts for more options, like other resnet models or other attention sizes.)
 
 **Warning**: the prepro script will fail with the default MSCOCO data because one of their images is corrupted. See [this issue](https://github.com/karpathy/neuraltalk2/issues/4) for the fix, it involves manually replacing one image in the dataset.
 
+### Download Bottom-up features (Skip if you are using resnet features)
+
+Download the pre-extracted features from [link](https://github.com/peteanderson80/bottom-up-attention). You can download either the adaptive or the fixed version.
+
+For example:
+```
+mkdir data/bu_data; cd data/bu_data
+wget https://storage.googleapis.com/bottom-up-attention/trainval.zip
+unzip trainval.zip
+
+```
+
+Then:
+
+```bash
+python scripts/make_bu_data.py --output_dir data/cocobu
+```
+
+This will create `data/cocobu_fc`, `data/cocobu_att` and `data/cocobu_box`. If you want to use the bottom-up features, just follow the steps below and replace every `cocotalk` with `cocobu`.
+
 ### Start training
 
 ```bash
@@ -68,8 +100,6 @@ First you should preprocess the dataset and get the cache for calculating cider
 $ python scripts/prepro_ngrams.py --input_json .../dataset_coco.json --dict_json data/cocotalk.json --output_pkl data/coco-train --split train
 ```
 
-And also you need to clone my forked [cider](https://github.com/ruotianluo/cider) repository.
-
 Then, copy the model from the pretrained model using cross entropy. (It's not mandatory to copy the model, just for back-up)
 ```
 $ bash scripts/copy_model.sh fc fc_rl
 ```
 
@@ -122,6 +152,25 @@ The default split to evaluate is test. The default inference method is greedy de
 
 **Live demo**. Not supported now. Pull requests are welcome.
 
+## For more advanced features:
+
+Check out `ADVANCED.md`.
+
+## Reference
+
+If you find this repo useful, please consider citing (no obligation at all):
+
+```
+@article{luo2018discriminability,
+  title={Discriminability objective for training descriptive captions},
+  author={Luo, Ruotian and Price, Brian and Cohen, Scott and Shakhnarovich, Gregory},
+  journal={arXiv preprint arXiv:1803.04376},
+  year={2018}
+}
+```
+
+Of course, please cite the original papers of the models you are using (you can find references in the model files).
+
 ## Acknowledgements
 
 Thanks to the original [neuraltalk2](https://github.com/karpathy/neuraltalk2) and the awesome PyTorch team.
\ No newline at end of file

From 403141d2e9c29a8817a75fdaa951430712405cd0 Mon Sep 17 00:00:00 2001
From: Ruotian Luo
Date: Sat, 28 Apr 2018 19:59:57 -0500
Subject: [PATCH 42/42] Add advanced. (Still nothing in it.)

---
 ADVANCED.md | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 ADVANCED.md

diff --git a/ADVANCED.md b/ADVANCED.md
new file mode 100644
index 00000000..aab996c6
--- /dev/null
+++ b/ADVANCED.md
@@ -0,0 +1,7 @@
+# Advanced
+
+## Ensemble
+
+## Batch normalization
+
+## Box feature
\ No newline at end of file