# %%
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.nn import functional as F
import random
from tqdm import tqdm
# %% md
## Data for task #1: classify whether the first element of the sequence equals the last one
## Data for task #2: reverse the sequence
# %%
"""
Assume data in the form of a list with N elements, where each element indexed by i has the dimensionality of (E_i, F), with
E_i - number of elements in the sequence (we assume variable number to simulate real-world data and make use of masking)
D - dimensionality of each token, this is one-hot encoded
Throughout we assume that maximum sequence length is 128 elements, there are N = 1,000 sequences,
and we deal with input_dim = 10 dimensional tokens (in one-hot encoding).
Data is generated by first picking a number of elements, and with probability p picking the
last element to be the equal to the first one (otherwise the sequence isn't further modified)
"""
def get_data():
    N = 1000
    E_min, E_max = 5, 128
    feature_dim = 10
    input_dim = feature_dim + 2  # two extra slots: SOS (index 0) and EOS (index feature_dim + 1)
    p = 0.3
    data_x, data_y_t1, data_y_t2 = [], [], []
    data_x_decoder_t2 = []
    sequence_lengths = []
    idx_to_one_hot_map = np.eye(input_dim)
    for i in range(N):
        # Draw the number of elements uniformly
        E_i = random.randint(E_min, E_max)
        # Generate E_i tokens drawn uniformly from {1, ..., feature_dim}
        data_i = np.random.randint(0, feature_dim, size=E_i) + 1
        if random.random() < p:
            data_i[-2] = data_i[0]
        # Overwrite the last symbol with the EOS token
        data_i[-1] = feature_dim + 1
        # This sequence is used as the input to the model
        data_x.append(idx_to_one_hot_map[data_i])
        # Task 1 label: whether the first element equals the last content element
        # (the one just before EOS), as a one-hot vector of length 2
        data_y_t1.append(np.eye(2)[int(data_i[0] == data_i[-2])])
        # Task 2 target: the sequence reversed (excluding EOS), with the EOS
        # token appended at the end
        data_y_t2.append(
            np.concatenate(
                [
                    data_x[-1][:-1, :][::-1, :],
                    idx_to_one_hot_map[feature_dim + 1][None, :],
                ]
            )
        )
        # The sequence the decoder sees: the target shifted right by one,
        # with an SOS token prepended
        data_x_decoder_t2_i = np.concatenate(
            [idx_to_one_hot_map[0][None, :], np.copy(data_y_t2[-1])[:-1, :]]
        )
        data_x_decoder_t2.append(data_x_decoder_t2_i)
        sequence_lengths.append(len(data_x[-1]))
    # Pad every sequence to E_max and convert it to a PyTorch tensor; the
    # task-2 target and decoder-input sequences are padded the same way
    # (a sketch for recovering the padding mask follows after the function)
    padded_data_x = []
    padded_data_y_t2 = []
    padded_data_x_decoder_t2 = []
    for i in range(len(data_x)):
        padded_data_x.append(
            np.concatenate(
                (data_x[i], np.zeros((E_max - data_x[i].shape[0], input_dim)))
            )
        )
        padded_data_y_t2.append(
            np.concatenate(
                (
                    data_y_t2[i],
                    np.zeros((E_max - data_y_t2[i].shape[0], input_dim)),
                )
            )
        )
        padded_data_x_decoder_t2.append(
            np.concatenate(
                (
                    data_x_decoder_t2[i],
                    np.zeros((E_max - data_x_decoder_t2[i].shape[0], input_dim)),
                )
            )
        )
    padded_data_np = np.array(padded_data_x)
    padded_data_y_t2_np = np.array(padded_data_y_t2)
    padded_data_x_decoder_t2_np = np.array(padded_data_x_decoder_t2)
    data_x_torch = torch.from_numpy(padded_data_np).float()
    data_x_decoder_torch_t2 = torch.from_numpy(padded_data_x_decoder_t2_np).float()
    data_y_torch_t1 = torch.from_numpy(np.array(data_y_t1)).float()
    data_y_torch_t2 = torch.from_numpy(padded_data_y_t2_np).float()
    return data_x_torch, data_y_torch_t1, data_y_torch_t2, data_x_decoder_torch_t2
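# %% md
## Sanity check: a minimal sketch (not part of the original pipeline) that calls get_data
## and verifies the tensor shapes implied by the constants above (N = 1000, E_max = 128, input_dim = 12)
# %%
data_x_torch, data_y_torch_t1, data_y_torch_t2, data_x_decoder_torch_t2 = get_data()
# Expected: (1000, 128, 12), (1000, 2), (1000, 128, 12), (1000, 128, 12)
print(data_x_torch.shape, data_y_torch_t1.shape)
print(data_y_torch_t2.shape, data_x_decoder_torch_t2.shape)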
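# %% md
## Padding-mask sketch: real tokens are one-hot rows while padded positions are all-zero
## rows, so a boolean mask (True = real token) can be recovered directly from the padded
## inputs; this is one illustrative way to rebuild it, since sequence_lengths is not returned
# %%
padding_mask = data_x_torch.abs().sum(dim=-1) > 0  # shape (N, E_max), True at real tokens
print(padding_mask.sum(dim=1)[:5])  # effective lengths of the first five sequences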