# %%
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.nn import functional as F
import random
from tqdm import tqdm
# %% md
## Data for task #1: classify whether the first element of the sequence equals the last one
## Data for task #2: reverse the sequence
# %%
"""
Assume data in the form of a list with N elements, where each element indexed by i has the dimensionality of (E_i, F), with
E_i - number of elements in the sequence (we assume variable number to simulate real-world data and make use of masking)
D - dimensionality of each token, this is one-hot encoded
Throughout we assume that maximum sequence length is 128 elements, there are N = 1,000 sequences,
and we deal with input_dim = 10 dimensional tokens (in one-hot encoding).
Data is generated by first picking a number of elements, and with probability p picking the
last element to be the equal to the first one (otherwise the sequence isn't further modified)
"""
def get_data():
    N = 1000
    E_min, E_max = 5, 128
    feature_dim = 10
    input_dim = feature_dim + 2  # two extra slots: SOS (index 0) and EOS (index feature_dim + 1)
    p = 0.3
    data_x, data_y_t1, data_y_t2 = [], [], []
    data_x_decoder_t2 = []
    sequence_lengths = []
    idx_to_one_hot_map = np.eye(input_dim)
    for i in range(N):
        # Draw the number of elements uniformly
        E_i = random.randint(E_min, E_max)
        # Generate E_i tokens drawn uniformly from {1, ..., feature_dim}
        data_i = np.random.randint(0, feature_dim, size=E_i) + 1
        if random.random() < p:
            data_i[-2] = data_i[0]
        # Overwrite the last symbol with the EOS token
        data_i[-1] = feature_dim + 1
        # This sequence is used as the input to the model
        data_x.append(idx_to_one_hot_map[data_i])
        # Task 1 label: whether the first element equals the last content element
        # (the one just before EOS), as a one-hot vector of length 2
        data_y_t1.append(np.eye(2)[int(data_i[0] == data_i[-2])])
        # Task 2 target: the sequence reversed (excluding EOS), with the EOS
        # token appended at the end
        data_y_t2.append(
            np.concatenate(
                [
                    data_x[-1][:-1, :][::-1, :],
                    idx_to_one_hot_map[feature_dim + 1][None, :],
                ]
            )
        )
        # The sequence the decoder sees: the target shifted right by one,
        # with an SOS token prepended
        data_x_decoder_t2_i = np.concatenate(
            [idx_to_one_hot_map[0][None, :], np.copy(data_y_t2[-1])[:-1, :]]
        )
        data_x_decoder_t2.append(data_x_decoder_t2_i)
        sequence_lengths.append(len(data_x[-1]))
    # Pad every sequence to E_max and convert it to a PyTorch tensor; the
    # task-2 target and decoder-input sequences are padded the same way
    # (a sketch for recovering the padding mask follows after the function)
    padded_data_x = []
    padded_data_y_t2 = []
    padded_data_x_decoder_t2 = []
    for i in range(len(data_x)):
        padded_data_x.append(
            np.concatenate(
                (data_x[i], np.zeros((E_max - data_x[i].shape[0], input_dim)))
            )
        )
        padded_data_y_t2.append(
            np.concatenate(
                (
                    data_y_t2[i],
                    np.zeros((E_max - data_y_t2[i].shape[0], input_dim)),
                )
            )
        )
        padded_data_x_decoder_t2.append(
            np.concatenate(
                (
                    data_x_decoder_t2[i],
                    np.zeros((E_max - data_x_decoder_t2[i].shape[0], input_dim)),
                )
            )
        )
    padded_data_np = np.array(padded_data_x)
    padded_data_y_t2_np = np.array(padded_data_y_t2)
    padded_data_x_decoder_t2_np = np.array(padded_data_x_decoder_t2)
    data_x_torch = torch.from_numpy(padded_data_np).float()
    data_x_decoder_torch_t2 = torch.from_numpy(padded_data_x_decoder_t2_np).float()
    data_y_torch_t1 = torch.from_numpy(np.array(data_y_t1)).float()
    data_y_torch_t2 = torch.from_numpy(padded_data_y_t2_np).float()
    return data_x_torch, data_y_torch_t1, data_y_torch_t2, data_x_decoder_torch_t2
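# %% md
## Sanity check: a minimal sketch (not part of the original pipeline) that calls get_data
## and verifies the tensor shapes implied by the constants above (N = 1000, E_max = 128, input_dim = 12)
# %%
data_x_torch, data_y_torch_t1, data_y_torch_t2, data_x_decoder_torch_t2 = get_data()
# Expected: (1000, 128, 12), (1000, 2), (1000, 128, 12), (1000, 128, 12)
print(data_x_torch.shape, data_y_torch_t1.shape)
print(data_y_torch_t2.shape, data_x_decoder_torch_t2.shape)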
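# %% md
## Padding-mask sketch: real tokens are one-hot rows while padded positions are all-zero
## rows, so a boolean mask (True = real token) can be recovered directly from the padded
## inputs; this is one illustrative way to rebuild it, since sequence_lengths is not returned
# %%
padding_mask = data_x_torch.abs().sum(dim=-1) > 0  # shape (N, E_max), True at real tokens
print(padding_mask.sum(dim=1)[:5])  # effective lengths of the first five sequences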