dataloader.py
import torch
import transformers

# Per-word eye-tracking feature columns: number of fixations (nFix), first fixation
# duration (FFD), go-past time (GPT), total reading time (TRT), and fixation proportion (fixProp)
FEATURES_NAMES = ['nFix', 'FFD', 'GPT', 'TRT', 'fixProp']

class EyeTrackingCSV(torch.utils.data.Dataset):
    """Tokenize sentences and load them into tensors. Assumes the dataframe has a sentence_id column."""

    def __init__(self, df, model_name='roberta-base'):
        self.model_name = model_name
        self.df = df.copy()

        # Re-number the sentence ids to start at 0, assuming they are consecutive [N, N+1, ...] for some N
        self.df.sentence_id = self.df.sentence_id - self.df.sentence_id.min()
        self.num_sentences = self.df.sentence_id.max() + 1
        assert self.num_sentences == self.df.sentence_id.nunique()

        # Collect each sentence as a list of words, stripping the <EOS> marker from the last word
        self.texts = []
        for i in range(self.num_sentences):
            rows = self.df[self.df.sentence_id == i]
            text = rows.word.tolist()
            text[-1] = text[-1].replace('<EOS>', '')
            self.texts.append(text)

        # Tokenize all sentences at once, padding to a common length
        if 'roberta' in model_name:
            self.tokenizer = transformers.RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=True)
        elif 'distil' in model_name:
            self.tokenizer = transformers.DistilBertTokenizerFast.from_pretrained(model_name)
        else:  # for BERT models
            self.tokenizer = transformers.BertTokenizerFast.from_pretrained(model_name)
        self.ids = self.tokenizer(self.texts, padding=True, is_split_into_words=True, return_offsets_mapping=True)

    def __len__(self):
        return self.num_sentences

    def __getitem__(self, ix):
        input_ids = self.ids['input_ids'][ix]
        offset_mapping = self.ids['offset_mapping'][ix]
        attention_mask = self.ids['attention_mask'][ix]
        input_tokens = [self.tokenizer.convert_ids_to_tokens(x) for x in input_ids]

        # Locate the first subword of each word. For RoBERTa (with add_prefix_space=True) the
        # first subword of every word starts with the special 'Ġ' character; for BERT-style
        # tokenizers it is the subword whose offset starts at 0 within a non-empty span.
        if 'roberta' in self.model_name:
            is_first_subword = [t[0] == 'Ġ' for t in input_tokens]
        else:  # for BERT and DistilBERT models
            is_first_subword = [t0 == 0 and t1 > 0 for t0, t1 in offset_mapping]

        # One feature row per subword position; -1 marks positions that are not the first
        # subword of a word (special tokens, padding, word continuations).
        features = -torch.ones((len(input_ids), len(FEATURES_NAMES)))
        features[is_first_subword] = torch.Tensor(
            self.df[self.df.sentence_id == ix][FEATURES_NAMES].to_numpy()
        )

        return (
            input_tokens,
            torch.LongTensor(input_ids),
            torch.LongTensor(attention_mask),
            features,
        )
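

# Minimal usage sketch: builds a tiny dataframe in the column layout the class expects
# and reads one item back. The sentences, sentence ids, and feature values below are
# hypothetical illustration data, not taken from any real eye-tracking corpus.
if __name__ == '__main__':
    import pandas as pd

    df = pd.DataFrame({
        'sentence_id': [1, 1, 1, 2, 2],
        'word': ['The', 'cat', 'sat.<EOS>', 'It', 'slept.<EOS>'],
        'nFix': [1.0, 2.0, 1.0, 1.0, 1.0],
        'FFD': [0.2, 0.3, 0.1, 0.2, 0.2],
        'GPT': [0.2, 0.4, 0.1, 0.2, 0.3],
        'TRT': [0.3, 0.5, 0.2, 0.2, 0.3],
        'fixProp': [0.8, 0.9, 0.5, 0.7, 0.6],
    })

    dataset = EyeTrackingCSV(df, model_name='roberta-base')
    tokens, input_ids, attention_mask, features = dataset[0]
    print(tokens)          # subword tokens, e.g. ['<s>', 'ĠThe', 'Ġcat', ...]
    print(features.shape)  # (padded sequence length, 5); rows of -1 mark positions
                           # that are not the first subword of a word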