Skip to content


Assignment xixiaoyao#5
Browse files Browse the repository at this point in the history
hw5.ipynb - for running sanity checks.
colab gpu training time ~ 6 hours
BLEU 36.38
  • Loading branch information
Herais committed May 22, 2020
1 parent cec6d3f commit 1cb4689
Show file tree
Hide file tree
Showing 25 changed files with 169,396 additions and 0 deletions.
20,030 changes: 20,030 additions & 0 deletions Assignments/assignment5/Herais/Ans 1 Coding.mht

Large diffs are not rendered by default.

20,397 changes: 20,397 additions & 0 deletions Assignments/assignment5/Herais/Ans 1 Written.mht

Large diffs are not rendered by default.

8,833 changes: 8,833 additions & 0 deletions Assignments/assignment5/Herais/Ans 2 Coding.mht

Large diffs are not rendered by default.

12,322 changes: 12,322 additions & 0 deletions Assignments/assignment5/Herais/Ans 3.mht

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions Assignments/assignment5/Herais/
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Assignment #5 Written and Coding
- Ans 1 Written.mht
- Ans 1 Coding.mht
- Ans 2 Coding.mht
- Ans 3.mht
- hw5 jupyter
Empty file.
186 changes: 186 additions & 0 deletions Assignments/assignment5/Herais/
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""CS224N 2019-20: Homework 5"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from vocab import VocabEntry
import numpy as np
import re

class CharDecoder(nn.Module):
    """Character-level LSTM decoder.

    Embeds character indices, runs them through a single-layer LSTM and
    projects every hidden state onto the target character vocabulary.
    """

    def __init__(self, hidden_size, char_embedding_size=50, target_vocab=None):
        """ Init Character Decoder.

        @param hidden_size (int): Hidden size of the decoder LSTM
        @param char_embedding_size (int): dimensionality of character embeddings
        @param target_vocab (VocabEntry): vocabulary for the target language.
            Although it defaults to None, it is required: its `char2id`
            mapping is read here to size the embedding and projection layers.
        """
        super(CharDecoder, self).__init__()
        self.target_vocab = target_vocab
        # Layers are created in the same order as before so that seeded
        # parameter initialisation stays reproducible.
        self.charDecoder = nn.LSTM(char_embedding_size, hidden_size)
        self.char_output_projection = nn.Linear(hidden_size, len(self.target_vocab.char2id))
        # NOTE(review): the tail of this call was missing from the source;
        # `padding_idx` is reconstructed from the `char_pad` attribute that
        # train_forward relies on -- confirm against the original file.
        self.decoderCharEmb = nn.Embedding(
            len(self.target_vocab.char2id),
            char_embedding_size,
            padding_idx=self.target_vocab.char_pad,
        )

    def forward(self, input, dec_hidden=None):
        """ Forward pass of character decoder.

        @param input (Tensor): tensor of integers, shape (length, batch_size)
        @param dec_hidden (tuple(Tensor, Tensor)): internal state of the LSTM before reading the input characters. A tuple of two tensors of shape (1, batch, hidden_size)

        @returns scores (Tensor): called s_t in the PDF, shape (length, batch_size, self.vocab_size)
        @returns dec_hidden (tuple(Tensor, Tensor)): internal state of the LSTM after reading the input characters. A tuple of two tensors of shape (1, batch, hidden_size)
        """
        # (length, b) -> (length, b, char_embedding_size)
        embedded = self.decoderCharEmb(input)

        # (length, b, char_embedding_size) -> (length, b, hidden_size)
        hidden_states, dec_hidden = self.charDecoder(embedded, dec_hidden)

        # (length, b, hidden_size) -> (length, b, vocab_size)
        scores = self.char_output_projection(hidden_states)

        return scores, dec_hidden

    def train_forward(self, char_sequence, dec_hidden=None):
        """ Forward computation during training.

        @param char_sequence (Tensor): tensor of integers, shape (length, batch_size). Note that "length" here and in forward() need not be the same.
        @param dec_hidden (tuple(Tensor, Tensor)): initial internal state of the LSTM, obtained from the output of the word-level decoder. A tuple of two tensors of shape (1, batch_size, hidden_size)

        @returns The cross-entropy loss (Tensor), computed as the *sum* of cross-entropy losses of all the words in the batch.
        """
        # Inputs are x_1..x_n (final token dropped), targets are x_2..x_{n+1}
        # (first token dropped); both have shape (length - 1, b).
        inputs = char_sequence[:-1]
        targets = char_sequence[1:]

        # scores: (length - 1, b, vocab_size)
        scores, _ = self.forward(inputs, dec_hidden)

        # Use the vocabulary this decoder was built with rather than
        # constructing a new VocabEntry on every call.
        pad_idx = self.target_vocab.char_pad

        # cross_entropy applies log-softmax internally; flatten to
        # (N, C) scores and (N,) targets. Padding targets are ignored.
        return F.cross_entropy(
            scores.reshape(-1, scores.shape[-1]),
            targets.reshape(-1),
            ignore_index=pad_idx,
            reduction='sum',
        )

    def decode_greedy(self, initialStates, device, max_length=21):
        """ Greedy decoding

        @param initialStates (tuple(Tensor, Tensor)): initial internal state of the LSTM, a tuple of two tensors of size (1, batch_size, hidden_size)
        @param device: torch.device (indicates whether the model is on CPU or GPU)
        @param max_length (int): maximum length of words to decode

        @returns decodedWords (List[str]): a list (of length batch_size) of strings, each of which has length <= max_length.
                              The decoded strings should NOT contain the start-of-word and end-of-word characters.
        """
        vocab = self.target_vocab
        end_id = vocab.end_of_word
        batch_size = initialStates[0].shape[1]

        dec_hidden = initialStates
        # Every word starts from the start-of-word character: shape (1, b).
        current_char = torch.full(
            (1, batch_size), vocab.start_of_word, dtype=torch.long, device=device
        )

        # Run exactly max_length steps so that words of max_length characters
        # are reachable (the previous bound of max_length - 1 was off by one).
        steps = []
        for _ in range(max_length):
            # scores: (1, b, vocab_size)
            scores, dec_hidden = self.forward(current_char, dec_hidden)
            # argmax over raw scores equals argmax over softmax(scores).
            current_char = scores.argmax(dim=2)
            steps.append(current_char)

        if steps:
            # (max_length, b) -> b lists of max_length character ids
            char_ids = torch.cat(steps, dim=0).t().tolist()
        else:
            char_ids = [[] for _ in range(batch_size)]

        decodedWords = []
        for ids in char_ids:
            # Truncate at the first end-of-word token, if one was produced.
            if end_id in ids:
                ids = ids[:ids.index(end_id)]
            decodedWords.append(''.join(vocab.id2char[cid] for cid in ids))

        return decodedWords

87 changes: 87 additions & 0 deletions Assignments/assignment5/Herais/
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""CS224N 2019-20: Homework 5"""

import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    """Convolutional layer of the character-based word encoder.

    Applies a 1-D convolution over x_reshaped, followed by ReLU and a max
    over the character positions, to compute x_conv_out.
    """

    def __init__(self, word_embed_size, char_embed_size=50, k=5, padding=1):
        """Init CNN layers.

        @param word_embed_size (int): size of the word embedding; also the
            number of convolution output channels
        @param char_embed_size (int) = 50: size of the character embedding;
            also the number of convolution input channels
        @param k (int) = 5: kernel size for the convolution
        @param padding (int) = 1: size of padding applied to x_reshaped bilaterally
        """
        super(CNN, self).__init__()  # Initialize self._modules as OrderedDict

        self.word_embed_size = word_embed_size
        self.char_embed_size = char_embed_size
        self.k = k
        self.padding = padding

        # Unused placeholder, retained only so existing attribute access
        # (`cnn.apply_maxpool`) keeps working; pooling happens in forward().
        self.apply_maxpool = None

        # NOTE(review): the tail of this call was missing from the source;
        # kernel_size/padding are reconstructed from the constructor
        # arguments and the shape arithmetic used in forward() -- confirm.
        self.apply_conv = nn.Conv1d(
            in_channels=self.char_embed_size,
            out_channels=self.word_embed_size,
            kernel_size=self.k,
            padding=self.padding,
        )

    def forward(self, x):
        """Map x_reshaped to x_conv_out.

        @param x (tensor): x_reshaped in shape (b, char_embed_size, m_word),
            where b = batch size,
            char_embed_size = size of the character embedding, and
            m_word = length of longest word in the batch.

        @return x_conv_out (tensor): tensor of shape (b, word_embed_size)
        """
        # (b, word_embed_size, m_word + 2*padding - k + 1)
        x_conv = self.apply_conv(x)

        # Max over the position axis; equivalent to a MaxPool1d whose kernel
        # spans the whole sequence, without rebuilding a module per call.
        x_conv_out = torch.max(F.relu(x_conv), dim=2)[0]

        return x_conv_out


2 changes: 2 additions & 0 deletions Assignments/assignment5/Herais/
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Build the submission archive from the Python sources, data and outputs.
# NOTE(review): the archive name was missing from both commands in the source
# (which would make zip treat the first *.py match as the archive);
# "assignment5.zip" is an assumed name -- confirm before use.
ARCHIVE="assignment5.zip"
rm -f "$ARCHIVE"
zip -r "$ARCHIVE" *.py ./en_es_data ./sanity_check_en_es_data ./outputs
3 changes: 3 additions & 0 deletions Assignments/assignment5/Herais/gpu_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
55 changes: 55 additions & 0 deletions Assignments/assignment5/Herais/
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""CS224N 2019-20: Homework 5"""

import torch
import torch.nn as nn
import torch.nn.functional as F

class Highway(nn.Module):
    """Highway layer that computes x_highway from x_conv_out.

    A learned sigmoid gate mixes a ReLU projection of the input with the
    unchanged input.
    """

    def __init__(self, word_embed_size):
        """Init Highway layer.

        @param word_embed_size (int): Embedding size (dimensionality) of word
        """
        super(Highway, self).__init__()  # Initialize self._modules as OrderedDict
        self.word_embed_size = word_embed_size

        # Both maps keep the dimensionality so their outputs can be combined
        # element-wise with the input.
        self.w_proj = nn.Linear(word_embed_size, word_embed_size, bias=True)  # W_project
        self.w_gate = nn.Linear(word_embed_size, word_embed_size, bias=True)  # W_gate

    def forward(self, x):
        """Maps x_conv_out to x_highway.

        @param x (tensor): x_conv_out tensor of shape (b, word_embed_size),
            where b = batch size

        @returns x_highway (tensor): tensor of shape (b, word_embed_size)
        """
        x_proj = F.relu(self.w_proj(x))  # (b, word_embed_size)
        x_gate = torch.sigmoid(self.w_gate(x))  # (b, word_embed_size)

        # Element-wise gate: near 1 passes the projection, near 0 passes x.
        x_highway = x_gate * x_proj + (1 - x_gate) * x  # (b, word_embed_size)

        return x_highway

1 change: 1 addition & 0 deletions Assignments/assignment5/Herais/hw5.ipynb

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions Assignments/assignment5/Herais/local_env.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
name: local_nmt
channels:
  - soumith
  - defaults
dependencies:
  - pytorch=1.0.0
  - python=3.5
  - numpy
  - scipy
  - tqdm
  - docopt
  - pytorch
  - nltk
  - torchvision

0 comments on commit 1cb4689

Please sign in to comment.