-
Notifications
You must be signed in to change notification settings - Fork 12
/
utils.py
187 lines (154 loc) · 5.24 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
from typing import List, Dict
from pytorch_lightning import Callback
import pandas as pd
class MetricTracker(Callback):
    """Lightning callback that dumps the trainer's logged metrics to a CSV file.

    Each time validation ends, the current ``trainer.logged_metrics`` are
    written to ``csv_out/<run_name>.csv``.  The file is overwritten on every
    call, so it only ever holds the most recent metrics.
    """

    def __init__(self, run_name):
        """Store ``run_name``; it is used as the CSV file name (without extension)."""
        super().__init__()
        # Placeholder for an accumulated frame; nothing in this class fills it yet.
        self.df = None
        self.run_name = run_name

    def on_validation_end(self, trainer, module):
        """Write ``trainer.logged_metrics`` as a one-row CSV.

        Args:
            trainer: object exposing ``logged_metrics``, a mapping whose values
                provide ``.item()`` (presumably scalar tensors -- confirm
                against the Lightning version in use).
            module: unused here.
        """
        # Function-scope import so the block does not depend on a file-level one.
        import os

        print(trainer.logged_metrics)
        # One single-element column per metric -> a one-row DataFrame.
        elogs = {k: [v.item()] for k, v in trainer.logged_metrics.items()}
        new_df = pd.DataFrame(elogs)

        out_path = f'csv_out/{self.run_name}.csv'
        # to_csv does not create missing parent directories; without this the
        # first validation run fails on a fresh checkout.  dirname() also
        # covers run names that themselves contain a path separator.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        new_df.to_csv(out_path)
        # NOTE(review): leftover debug output, kept so console output is unchanged.
        print('Hello World')
# def on_validation_epoch_end(self, trainer, module):
# if isinstance(module, GPT2Valid):
# elogs = trainer.logged_metrics # access it here
# elogs = {k: [v.item()] for k, v in elogs.items()}
# self.df = pd.DataFrame(elogs)
# Dataset-name groups.  The grouping is taken from the constant names only;
# how each group is consumed is not visible in this file.
DIALOG_DATASETS = [
    'wizard_of_wikipedia',
    'empathetic_dialogues',
    'blended_skill_talk',
    'wizard_of_internet',
]

CLASSIFICATION_DATASETS = [
    'piqa',
    'hellaswag',
    'ai2_arc',
    'winogrande',
    'math_qa',
    'pubmed_qa',
    'copa',
]

PPL_DATASETS = ['wikitext', 'pile']

COMPLETION_DATASETS = ['lambada']
class Trie(object):
    """Prefix tree over integer token sequences, stored as nested dicts.

    Every node is a dict mapping a token id to its child node; a leaf is an
    empty dict.  No end-of-sequence marker is stored, so a sequence that is a
    strict prefix of another stored sequence cannot be told apart when
    iterating.

    A second trie can be chained with :meth:`append`; lookups then fall
    through to it as described in :meth:`_get_from_trie`.
    """

    def __init__(self, sequences: List[List[int]] = None):
        """Build the trie from ``sequences``; ``None`` or empty gives an empty trie."""
        self.trie_dict = {}
        self.len = 0
        # Populated by append(); None means no chained trie.
        self.append_trie = None
        self.bos_token_id = None
        # `or ()` replaces the former mutable `[]` default argument.
        for sequence in sequences or ():
            Trie._add_to_trie(sequence, self.trie_dict)
            self.len += 1

    def append(self, trie, bos_token_id):
        """Chain ``trie`` behind this one, entered via ``bos_token_id``."""
        self.append_trie = trie
        self.bos_token_id = bos_token_id

    def add(self, sequence: List[int]):
        """Insert one sequence and bump the stored length."""
        Trie._add_to_trie(sequence, self.trie_dict)
        self.len += 1

    def get(self, prefix_sequence: List[int]):
        """Return the token ids that may follow ``prefix_sequence``."""
        return Trie._get_from_trie(
            prefix_sequence, self.trie_dict, self.append_trie, self.bos_token_id
        )

    @staticmethod
    def load_from_dict(trie_dict):
        """Wrap an existing nested dict; length is the number of root-to-leaf paths."""
        trie = Trie()
        trie.trie_dict = trie_dict
        trie.len = sum(1 for _ in trie)
        return trie

    @staticmethod
    def _add_to_trie(sequence: List[int], trie_dict: Dict):
        """Insert ``sequence`` below ``trie_dict``, creating nodes as needed.

        Iterative on purpose: the recursive form sliced the sequence at every
        step and hit the interpreter recursion limit on long sequences.
        """
        node = trie_dict
        for token in sequence:
            node = node.setdefault(token, {})

    @staticmethod
    def _get_from_trie(
        prefix_sequence: List[int],
        trie_dict: Dict,
        append_trie=None,
        bos_token_id: int = None,
    ):
        """Return the allowed next tokens after ``prefix_sequence``.

        * Prefix fully matched: the children of the reached node.  If a chained
          trie is set and ``bos_token_id`` is among those children, it is
          replaced by the root tokens of the chained trie.
        * Prefix leaves this trie: the unmatched remainder is looked up in the
          chained trie, or ``[]`` is returned when there is none.

        ``append_trie`` is tested by truthiness; because ``Trie`` defines
        ``__len__``, a chained trie of length 0 is treated as absent.
        """
        node = trie_dict
        for depth, token in enumerate(prefix_sequence):
            if token not in node:
                if append_trie:
                    # Hand over only the part this trie could not consume.
                    return append_trie.get(prefix_sequence[depth:])
                return []
            node = node[token]

        output = list(node.keys())
        if append_trie and bos_token_id in output:
            output.remove(bos_token_id)
            output += list(append_trie.trie_dict.keys())
        return output

    def __iter__(self):
        """Yield every root-to-leaf path as a new list, depth-first in insertion order.

        An empty trie yields a single empty list, matching the earlier
        recursive implementation.
        """
        if not self.trie_dict:
            yield []
            return

        path = []
        # Explicit stack of child iterators instead of recursion, so depth is
        # not bounded by the interpreter recursion limit.
        stack = [iter(self.trie_dict.items())]
        while stack:
            try:
                token, child = next(stack[-1])
            except StopIteration:
                stack.pop()
                if path:
                    path.pop()
                continue
            path.append(token)
            if child:
                stack.append(iter(child.items()))
            else:
                yield list(path)
                path.pop()

    def __len__(self):
        """Return the stored sequence count."""
        return self.len

    def __getitem__(self, value):
        """``trie[prefix]`` is shorthand for ``trie.get(prefix)``."""
        return self.get(value)
def normalize_reply(text: str, version=2) -> str:
    """
    Standardize the capitalization and punctuation spacing of the input text.

    Version 1: Fix sentence start casing, and punctuation.
    Version 2: Add trailing period, if missing.

    The text is lower-cased, punctuation is detached into separate tokens,
    the first token, the word "i" and any token following '.', '?' or '!'
    are capitalized, and the punctuation is re-attached.

    Example: ``"hello . how are you ?"`` becomes ``"Hello. How are you?"``.
    """
    # (detached, attached) spelling of each punctuation mark.
    switch_list = [(' .', '.'), (' ,', ','), (' ?', '?'), (' !', '!'), (" ' ", "'")]

    def collapse_spaces(value: str) -> str:
        # Reduce every run of spaces to a single space.  The previous
        # `.replace(' ', ' ')` was a no-op, so input that was already
        # tokenized ("hello . how") ended up with doubled spaces and
        # empty tokens.
        while '  ' in value:
            value = value.replace('  ', ' ')
        return value

    # add spaces so that words and punctuation can be separated
    new_text = text.lower()

    # normalize in case of human:
    for detached, attached in switch_list:
        new_text = collapse_spaces(new_text.replace(attached, detached))

    # split on punctuation to find sentence boundaries
    # capitalize stuff
    tokens = new_text.split(' ')
    last_index = len(tokens) - 1
    for i, token in enumerate(tokens):
        if i == 0:
            tokens[i] = uppercase(token)
        elif token in ('i', "i'm", "i've", "i'll", "i'd"):
            tokens[i] = uppercase(token)
        # Exact match: the old substring test `in '?.!'` was also true for ''.
        elif token in ('?', '.', '!') and i < last_index:
            tokens[i + 1] = uppercase(tokens[i + 1])

    # Pad so that the " ' " pattern can also match at either end of the text.
    new_text = ' ' + ' '.join(tokens) + ' '

    for detached, attached in switch_list:
        new_text = new_text.replace(detached, attached)

    # get rid of surrounding whitespace
    new_text = collapse_spaces(new_text.strip())

    if version > 1 and new_text and new_text[-1] not in '!.?)"\'':
        new_text += '.'

    return new_text
def uppercase(string: str) -> str:
    """
    Return ``string`` with its first character upper-cased.

    An empty string is returned as-is; the rest of the string is left untouched.
    """
    if not string:
        return string
    head, tail = string[0], string[1:]
    return head.upper() + tail