import math
import string
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import torch

from patents4IPPC.embedders.base_embedder import BaseEmbedder


def load_dataset(
    path_to_dataset,
    train_portion=None,
    valid_portion=None,
    test_portion=None,
    seed=None,
    normalize_labels=True
):
    dataset = pd.read_csv(path_to_dataset).dropna(subset=["query", "response"])
    if normalize_labels:
        # Rescale labels to the [0, 1] range
        dataset.loc[:, "label"] = dataset["label"] - dataset["label"].min()
        dataset.loc[:, "label"] = dataset["label"] / dataset["label"].max()

    if "split" in dataset.columns:
        # Use the predefined train/dev/test split
        dataset_train = dataset[dataset["split"] == "train"]
        dataset_valid = dataset[dataset["split"] == "dev"]
        dataset_test = dataset[dataset["split"] == "test"]
    else:
        # Use isclose to tolerate floating-point rounding in the portions
        assert math.isclose(train_portion + valid_portion + test_portion, 1.0), \
            "Fractions of train, validation and test do not sum up to 1."
        # Get the unique query ids
        query_ids = dataset["query_id"].unique()
        # Shuffle them
        np.random.seed(seed)
        np.random.shuffle(query_ids)
        # Pick train, validation and test portions
        n_queries = len(query_ids)
        n_train_queries = math.ceil(n_queries * train_portion)
        nontrain_portion = 1 - train_portion
        rescaled_valid_portion = (0 if nontrain_portion == 0
                                  else valid_portion / nontrain_portion)
        rescaled_test_portion = (0 if nontrain_portion == 0
                                 else test_portion / nontrain_portion)
        n_nontrain_queries = math.floor(n_queries * nontrain_portion)
        n_valid_queries = math.ceil(n_nontrain_queries * rescaled_valid_portion)
        n_test_queries = math.floor(n_nontrain_queries * rescaled_test_portion)
        # Slice the shuffled query ids into contiguous train/valid/test chunks
        train_end_idx = n_train_queries
        train_ids = query_ids[:train_end_idx]
        valid_end_idx = train_end_idx + n_valid_queries
        valid_ids = query_ids[train_end_idx:valid_end_idx]
        test_end_idx = valid_end_idx + n_test_queries
        test_ids = query_ids[valid_end_idx:test_end_idx]
        # Split by query id so that all responses to a given query end up
        # in the same subset
        dataset_train = dataset[dataset["query_id"].isin(train_ids)]
        dataset_valid = dataset[dataset["query_id"].isin(valid_ids)]
        dataset_test = dataset[dataset["query_id"].isin(test_ids)]

    return dataset_train, dataset_valid, dataset_test
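
# Illustrative usage of `load_dataset` (a sketch, kept as a comment so it does
# not run on import; the CSV path and the 80/10/10 split are assumptions). The
# CSV is expected to contain at least "query", "response", "label" and
# "query_id" columns, plus an optional "split" column:
#
#     train_df, valid_df, test_df = load_dataset(
#         "path/to/dataset.csv",  # hypothetical path
#         train_portion=0.8,
#         valid_portion=0.1,
#         test_portion=0.1,
#         seed=42,
#         normalize_labels=True
#     )
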

def index_documents_as_python_dictionary(
    documents,
    ids,
    embedder: BaseEmbedder,
    batch_size=64,
    do_lowercase=False,
    store_on_disk=False,
    filename=None
):
    # Embed the documents
    embeddings = embedder.embed_documents(
        documents,
        batch_size=batch_size,
        do_lowercase=do_lowercase,
        show_progress=True
    )
    index = dict(zip(ids, embeddings))
    # NOTE: The embeddings are not normalized in this case. That's
    # because the dictionary mode is used to train a DualTransformer
    # model, meaning that the normalization step is embedded in the
    # computation of the cosine similarity between query and
    # response embeddings within the loss function
    # TODO: You need to explicitly normalize embeddings if using
    # another loss function that doesn't involve computing the
    # cosine similarity
    if not store_on_disk:
        return index
    # Write the index to disk (can be loaded again later)
    if filename is None:
        raise ValueError(
            "A filename must be provided when you want to store an index on "
            "disk."
        )
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(index, filename)
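
# Illustrative usage of `index_documents_as_python_dictionary` (a sketch;
# `my_embedder` stands for any concrete BaseEmbedder subclass and the output
# path is an assumption). When `store_on_disk=True` the index is written with
# joblib and nothing is returned:
#
#     index = index_documents_as_python_dictionary(
#         documents=["first abstract ...", "second abstract ..."],
#         ids=["DOC-001", "DOC-002"],
#         embedder=my_embedder,  # hypothetical BaseEmbedder instance
#         batch_size=32
#     )
#     first_embedding = index["DOC-001"]
#
#     index_documents_as_python_dictionary(
#         documents=["first abstract ..."],
#         ids=["DOC-001"],
#         embedder=my_embedder,
#         store_on_disk=True,
#         filename="indexes/my_index.joblib"  # hypothetical path
#     )
#     index = joblib.load("indexes/my_index.joblib")
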

# Taken from Huggingface Hub
def max_pool_embeddings_with_attention_mask(embeddings, attention_mask):
    input_mask_expanded = (
        attention_mask
        .unsqueeze(-1)
        .expand(embeddings.size())
        .float()
    )
    # Set padded positions to a large negative value so they never win the max
    embeddings[input_mask_expanded == 0] = -1e9
    return torch.max(embeddings, 1)[0]

# Taken from Huggingface Hub
def mean_pool_embeddings_with_attention_mask(embeddings, attention_mask):
    input_mask_expanded = (
        attention_mask
        .unsqueeze(-1)
        .expand(embeddings.size())
        .float()
    )
    # Average only over non-padded positions; the clamp avoids division by zero
    return (torch.sum(embeddings * input_mask_expanded, axis=1)
            / torch.clamp(input_mask_expanded.sum(axis=1), min=1e-9))
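
# Quick hand-checkable example of mean pooling (illustrative only): with a
# single sequence of two token embeddings and a mask marking the second token
# as padding, only the first embedding contributes to the average:
#
#     embeddings = torch.tensor([[[1.0, 2.0], [3.0, 4.0]]])  # (1, 2, 2)
#     attention_mask = torch.tensor([[1, 0]])                # (1, 2)
#     mean_pool_embeddings_with_attention_mask(embeddings, attention_mask)
#     # -> tensor([[1., 2.]])
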

def pool_embeddings_with_attention_mask(embeddings, attention_mask, mode="mean"):
    if mode == "cls":
        CLS_TOKEN_POSITION = 0
        return embeddings[:, CLS_TOKEN_POSITION]
        # ^ NOTE: If the [CLS] token is not the first token of the
        #         sequence, then all this obviously doesn't make
        #         any sense
    if mode == "max":
        return max_pool_embeddings_with_attention_mask(
            embeddings, attention_mask
        )
    if mode == "mean":
        return mean_pool_embeddings_with_attention_mask(
            embeddings, attention_mask
        )
    raise ValueError(
        f"Pooling mode '{mode}' is not supported."
    )
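
# Illustrative usage of the pooling dispatcher (a sketch using random data).
# `embeddings` has shape (batch_size, sequence_length, hidden_size) and
# `attention_mask` has shape (batch_size, sequence_length); the result has
# shape (batch_size, hidden_size) for every supported mode:
#
#     embeddings = torch.randn(2, 4, 8)
#     attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
#     pooled_mean = pool_embeddings_with_attention_mask(embeddings, attention_mask, mode="mean")
#     pooled_max = pool_embeddings_with_attention_mask(embeddings, attention_mask, mode="max")
#     pooled_cls = pool_embeddings_with_attention_mask(embeddings, attention_mask, mode="cls")
#     assert pooled_mean.shape == pooled_max.shape == pooled_cls.shape == (2, 8)
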

def ucid_to_int(ucid):
    capital_letters = string.ascii_uppercase
    ucid_no_dashes = ucid.replace("-", "")
    characters = list(ucid_no_dashes)
    # Replace each capital letter with its 0-based position in the alphabet,
    # leave digits untouched, then parse the concatenation as an integer
    encoded_characters = list(map(
        lambda c: str(capital_letters.index(c)) if c in capital_letters else c,
        characters
    ))
    return int("".join(encoded_characters))
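
# Worked example (illustrative; the UCID below is made up): in
# "EP-1234567-A1", "E" maps to 4, "P" to 15 and "A" to 0, so
#
#     ucid_to_int("EP-1234567-A1")  # -> 415123456701
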

def unique_values_in_order_of_appearance(values):
    unique_values = []
    for v in values:
        if v not in unique_values:
            unique_values.append(v)
    return unique_values
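
# Illustrative behaviour: unlike `set`, the order of first appearance is
# preserved, e.g.
#
#     unique_values_in_order_of_appearance([3, 1, 3, 2, 1])  # -> [3, 1, 2]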