tokenization.py

import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import BertTokenizer

KG_EMBEDDING_SIZE = 200

bert_model_name = "bert-base-cased"
chunk_size = 128
batch_size = 100

max_length = 512
truncation = 'only_second'
padding = 'max_length'

nsp_1 = 0
num_qid = 0
num_kg_qid = 0

qid_dict = {} # initialized in main()

linked_wikitext_2 = "linked-wikitext-2/"
train = linked_wikitext_2+"train.jsonl"
valid = linked_wikitext_2+"valid.jsonl"
test = linked_wikitext_2+"test.jsonl"
data_files = {"train": train, "valid": valid, "test": test}

class BertTokenizerModified(BertTokenizer):

    kg_MASK_id = -100
    kg_PAD_id = -4
    kg_SEP_id = -3
    kg_CLS_id = -2
    kg_0_id = -1
    special_tokens = ["[MASK]","[PAD]","[SEP]","[CLS]","0"]

    def __init__(self, vocab_file,**kwargs):
        super().__init__(vocab_file, **kwargs)
        self.tokenized_list = []

    def _tokenize(self, text):
        token_list = text.split()
        split_tokens = []
        tokenized_list = []
        
        if self.do_basic_tokenize:
            for token in token_list:

                # If the token is part of the never_split set
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                    tokenized_list.append(1)
                else:
                    word_tokenized = self.wordpiece_tokenizer.tokenize(token)
                    split_tokens += word_tokenized
                    tokenized_list.append(len(word_tokenized))

        self.tokenized_list.append(tokenized_list)
        return split_tokens

my_tokenizer = BertTokenizerModified.from_pretrained(bert_model_name)

def set_qid(data):
    """
    use 'annotaions' to set a qid for each word in 'tokens'.
    applies to linked-wikitext-2
    """
    
    tokens_list = data["tokens"]
    annotations_list = data['annotations']
    qids_list = []
    
    # loop through each sample in the batch
    for tokens, annotations in zip(tokens_list, annotations_list):
        # initialize qid list
        qids = ['0']*len(tokens)
        
        for annotation in annotations:
            start_ix, end_ix = annotation['span']
            qid = annotation['id']
            
            # set qid wrt span
            qids[start_ix:end_ix] = [qid]*(end_ix-start_ix)
        
        qids_list.append(qids)
            
    return {
        "qid": qids_list
    }

def remove_start_end_tokens(data):
    new_data = {k:[] for k in data}
    
    tokens_list = data["tokens"]

    indices_list = [[i for i,token in enumerate(tokens) if token!="@@START@@" and token!="@@END@@"]
                        for tokens in tokens_list]
    
    for k in data:
        for indices, data_list in zip(indices_list, data[k]):
            new_data[k].append([data_list[ind] for ind in indices])
        
    return new_data

def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in ["tokens", "qid"]}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    return result

def make_sent_pair(data):
    # the second sentence in the pair always follows the first sentence (as was in the document)
    # except the last sentence in the batch: the second sentence for it is a random sentence in the batch
    
    qids_list = data["qid"]
    tokens_list = data["tokens"]
    batch_size = len(qids_list)  # len(tokens_list) same
    
    nsp_labels = []
    new_tokens_list = []
    new_qids_list = []
    
    for i in range(batch_size):
        if i < batch_size-1: # till second last sentence in batch
            j = i+1
            nsp_labels.append(1)
        else:
            j = np.random.randint(i)
            nsp_labels.append(0)
            
        new_qids_list.append((qids_list[i], qids_list[j]))
        new_tokens_list.append((tokens_list[i], tokens_list[j]))
    
    return {
        "tokens": new_tokens_list,
        "qid": new_qids_list,
        "nsp_labels": nsp_labels
    }

# randomly shuffle second sentence for nsp
def shuffle_sent_pair(data):
    global nsp_1

    qids_list = data["qid"]
    tokens_list = data["tokens"]
    nsp_labels = data["nsp_labels"]
    batch_size = len(qids_list)
    
    new_nsp_labels = []
    new_tokens_list = []
    new_qids_list = []
    
    for i in range(batch_size):
        qids, tokens, nsp = qids_list[i], tokens_list[i], nsp_labels[i]

        # randomly set nsp 0 or 1, for 1 everything remains the same
        if np.random.randint(2) == 0:
            j = np.random.randint(batch_size)
            while j!=i:
                # make sure j!=i, i.e. ensure shuffle
                j = np.random.randint(batch_size)
                
            new_qids_list.append((qids[0], qids_list[j][0]))
            new_tokens_list.append((tokens[0], tokens_list[j][0]))
            new_nsp_labels.append(0)
        else:
            new_qids_list.append(qids)
            new_tokens_list.append(tokens)
            new_nsp_labels.append(nsp)
            
    nsp_1 += sum(new_nsp_labels)
    return {
        "tokens": new_tokens_list,
        "qid": new_qids_list,
        "nsp_labels": new_nsp_labels
    }

def listofdict_to_dictoflist(data):
    keys = data[0].keys()
    new_dict = {key:[] for key in keys}
    
    for datum in data:
        for k, v in datum.items():
            new_dict[k].append(v)
            
    return new_dict

def my_tokenize_function(data):
    
    result_set = []
    
    for sample in data["tokens"]:
        sent_1 = " ".join(sample[0])
        sent_2 = " ".join(sample[1])
        
        # tokenize sentence pair
        my_tokenizer.tokenized_list = []
        result = my_tokenizer(sent_1, sent_2, max_length=max_length, truncation=truncation, padding=padding)
        result["word_tokens"] = (my_tokenizer.tokenized_list[0], my_tokenizer.tokenized_list[1]) # len(tokenized_list) == 2 for the two sentences
        
        result_set.append(result)
        
    return listofdict_to_dictoflist(result_set)

def match_qid_to_input_ids(data):
    qids_list = data["qid"]
    word_tokens_list = data["word_tokens"]
    
    new_qids_list = []
    
    for qids, word_tokens in zip(qids_list, word_tokens_list):
        # handle first sentence
        new_qid_1 = []
        for qid, word in zip(qids[0], word_tokens[0]):
            new_qid_1 += [qid]*word
        
        # handle second sentence
        new_qid_2 = []
        for qid, word in zip(qids[1], word_tokens[1]):
            new_qid_2 += [qid]*word
        
        new_qids_list.append(['[CLS]'] + new_qid_1 + ['[SEP]'] + new_qid_2 + ['[SEP]'])
    
    return {
        "qid": new_qids_list,
    }

def truncate_pad_qid(data):
    qids_list = data["qid"]
    input_ids_list = data["input_ids"]
    
    new_qids_list = []
    
    for qids, input_ids in zip(qids_list, input_ids_list):
        len_input_ids = len(input_ids)
        len_qids = len(qids)
        
        # QID TRUNCATION
        # assumption: tokenization truncated from 'second only'
        # assumption: length of second sentence is long enough to retain tokens after truncation
        if len_qids > len_input_ids:
            qids = qids[:len_input_ids-1] + ['SEP']
            
        ## QID PADDING
        if len_qids < len_input_ids:
            qids += ['[PAD]']*(len_input_ids-len_qids)
            
        new_qids_list.append(qids)
        
    return {
        "qid": new_qids_list,
    }

def get_kg_index(data):
    global num_qid, num_kg_qid
    """
    When you specify batched=True the function receives a dictionary with the fields of the dataset, 
    but each value is now a list of values, and not just a single value. 
    """
    qid_list = data["qid"]
    
    qid_mask_list = [] ## store a masking array that says whether or not an item has kg embedding
    qid_index_list = [] ## the index of qid, acts as label for qid classification
    
    for qids in qid_list:
        seq_len = len(qids)
        
        mask = [0]*seq_len
        mask_index = [my_tokenizer.kg_0_id]*seq_len
          
        for i, qid in enumerate(qids):

            if qid == '0':
                continue
            
            is_special_token = qid in my_tokenizer.special_tokens

            if not is_special_token:
                num_qid += 1 # Consider only Q-IDs
            
            if qid in qid_dict:
                mask_index[i] = qid_dict[qid]
                # Consider only Q-IDs
                if not is_special_token:
                    num_kg_qid += 1
                    mask[i] = 1
          
        # kg_embeds_list.append(embeds)
        qid_mask_list.append(mask)        
        qid_index_list.append(mask_index)

    return {
        "qid_mask": qid_mask_list,
        "qid_index": qid_index_list
    }

def my_tokenize_function_single_sentence(data, max_length=100):
    my_tokenizer.tokenized_list = []
    result = my_tokenizer([" ".join(eg) for eg in data["tokens"]], max_length=max_length, 
                          truncation='only_second', padding='max_length')
    result["word_tokens"] = my_tokenizer.tokenized_list
    return result

def match_qid_to_input_ids_single_sentence(data):
    qids_list = data["qid"]
    word_tokens_list = data["word_tokens"]
    
    new_qids_list = []
    
    for qids, word_tokens in zip(qids_list, word_tokens_list):
        new_qid = []
        for qid, word in zip(qids, word_tokens):
            new_qid += [qid]*word
        
        new_qids_list.append(['[CLS]'] + new_qid + ['[SEP]'])
    
    return {
        "qid": new_qids_list,
    }

def tokenize_save_wikitext2(filename):
    wikitest2_dataset = load_dataset("json", data_files=data_files)
    # here we do 'group_text' before 'tokenization'. So all 'chunk_size' length sentences
    # break into longer sequence due to each word breaking into multiple tokens.
    # we rely on tokenizer for padding and truncation.
    tokenized_dataset = wikitest2_dataset\
        .map(set_qid, batched=True, batch_size=batch_size, keep_in_memory=False)\
        .remove_columns(['annotations', 'title'])\
        .map(remove_start_end_tokens, batched=True, batch_size=batch_size, keep_in_memory=False)\
        .map(group_texts, batched=True, batch_size=batch_size, keep_in_memory=False)\
        .map(make_sent_pair, batched=True, batch_size=batch_size, keep_in_memory=False)\
        .shuffle()\
        .map(shuffle_sent_pair, batched=True, batch_size=batch_size, keep_in_memory=False)\
        .map(my_tokenize_function, batched=True, batch_size=batch_size, keep_in_memory=False)\
        .map(match_qid_to_input_ids, batched=True, batch_size=batch_size, keep_in_memory=False)\
        .remove_columns(['tokens', 'word_tokens'])\
        .map(truncate_pad_qid, batched=True, batch_size=batch_size, keep_in_memory=False)\
        .map(get_kg_index, batched=True, batch_size=batch_size, keep_in_memory=False)
    
    tokenized_dataset.save_to_disk(filename)

    dataset_size = sum(tokenized_dataset.num_rows.values())
    print("Number of NSP=1", nsp_1, nsp_1*100/dataset_size)
    print("Number of qids that have embeds", num_kg_qid, num_kg_qid*100/num_qid)

    return tokenized_dataset

def tokenize_save_synthetic(input_filename, output_filename):
    synthetic_dataset = load_dataset("json", data_files={"synthetic": input_filename})
    tokenized_synthetic_dataset = synthetic_dataset\
        .map(set_qid, batched=True, batch_size=batch_size, keep_in_memory=False)\
        .remove_columns(['annotations'])\
        .map(my_tokenize_function_single_sentence, batched=True, batch_size=batch_size, keep_in_memory=False)\
        .map(match_qid_to_input_ids_single_sentence, batched=True, batch_size=batch_size, keep_in_memory=False)\
        .remove_columns(['tokens', 'word_tokens'])\
        .map(truncate_pad_qid, batched=True, batch_size=batch_size, keep_in_memory=False)\
        .map(get_kg_index, batched=True, batch_size=batch_size, keep_in_memory=False)

    tokenized_synthetic_dataset.save_to_disk(output_filename)
    return tokenized_synthetic_dataset

def initialize_kg_dict():
    global qid_dict
    relevant_qid = pd.read_csv("relevant_qids.csv")
    qid_dict = {row["id"]:ix for ix, row in relevant_qid.iterrows()}
    qid_dict["[MASK]"] = my_tokenizer.kg_MASK_id
    qid_dict["[PAD]"] = my_tokenizer.kg_PAD_id
    qid_dict["[SEP]"] = my_tokenizer.kg_SEP_id
    qid_dict["[CLS]"] = my_tokenizer.kg_CLS_id
    qid_dict["0"] = my_tokenizer.kg_0_id

def main():
    initialize_kg_dict()
    tokenize_save_wikitext2(filename="wikitext2_dataset_tokenized_v2")
    tokenize_save_synthetic(input_filename="sythetic_dataset_w_negative_samples.jsonl", 
                            output_filename="synthetic_dataset_tokenized_v2")

if __name__ == "__main__":
    main()

# Number of NSP=1 9457 49.18348242146869
# Number of qids that have embeds 739093 99.06337247564272