"""Main evaluation script.
Consider making a copy and customizing it yourself if you want to use a custom class that is not already
defined in the library.

### Usage

```bash
a2t.evaluation [-h] [--config CONFIG]

optional arguments:
  -h, --help       show this help message and exit
  --config CONFIG  Config with task (schema) and data information.
```
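For example, assuming a configuration like the one described in the next section is saved as `config.json` (a hypothetical path), the script can be launched as a module:

```bash
python -m a2t.evaluation --config config.json
```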
### Configuration file
A configuration file containing the task and evaluation information should look like this:
```json
{
    "name": "BabelDomains",
    "task_name": "topic-classification",
    "features_class": "a2t.tasks.text_classification.TopicClassificationFeatures",
    "hypothesis_template": "The domain of the sentence is about {label}.",
    "nli_models": [
        "roberta-large-mnli"
    ],
    "labels": [
        "Animals",
        "Art, architecture, and archaeology",
        "Biology",
        "Business, economics, and finance",
        "Chemistry and mineralogy",
        "Computing",
        "Culture and society",
        ...
        "Royalty and nobility",
        "Sport and recreation",
        "Textile and clothing",
        "Transport and travel",
        "Warfare and defense"
    ],
    "preprocess_labels": true,
    "dataset": "babeldomains",
    "test_path": "data/babeldomains.domain.gloss.tsv",
    "use_cuda": true,
    "half": true
}
```
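
If you need more control than the CLI offers (for instance a custom features class), the same pipeline can be reproduced programmatically. The following is a minimal sketch, not a definitive recipe: it only reuses the calls already made by this script and assumes the configuration above is saved as `config.json` (a hypothetical path).

```python
import json
from types import SimpleNamespace

from a2t.base import EntailmentClassifier
from a2t.data import PREDEFINED_DATASETS
from a2t.tasks import PREDEFINED_TASKS

# Load the configuration file (hypothetical path).
with open("config.json", "rt") as f:
    config = SimpleNamespace(**json.load(f))

# Instantiate the task and the test dataset, mirroring main() below.
task_class, _ = PREDEFINED_TASKS[config.task_name]
task = task_class.from_config("config.json")
test_dataset = PREDEFINED_DATASETS[config.dataset](config.test_path, task.labels)

# Run the first NLI model from the config and compute metrics with a fixed 0.5 threshold.
nlp = EntailmentClassifier(config.nli_models[0], **vars(config))
_, output = nlp(task=task, features=test_dataset, negative_threshold=0.0, return_raw_output=True, **vars(config))
print(task.compute_metrics(test_dataset.labels, output, threshold=0.5))
```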
"""
import argparse
import json
import os
from types import SimpleNamespace
import numpy as np
import torch
from a2t.tasks import PREDEFINED_TASKS
from a2t.data import PREDEFINED_DATASETS
from a2t.base import EntailmentClassifier
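
# Overview of main(): for every NLI model listed in the config, run zero-shot
# entailment-based evaluation on the dev and/or test split, save the raw model outputs
# and gold labels as .npy files under experiments/<name>/<model>/{dev,test}/, and write
# the aggregated metrics to experiments/<name>/results.json. The classification
# threshold is optimized on the dev split when one is provided; otherwise a default of
# 0.5 is used for the test split.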
def main(args):
    with open(args.config, "rt") as f:
        config = SimpleNamespace(**json.load(f))

    os.makedirs(f"experiments/{config.name}", exist_ok=True)

    task_class, _ = PREDEFINED_TASKS[config.task_name]
    task = task_class.from_config(args.config)  # (**vars(config))

    dataset_class = PREDEFINED_DATASETS[config.dataset]

    assert hasattr(config, "dev_path") or hasattr(config, "test_path"), "At least a test or dev path must be provided."

    # Run dev evaluation
    if hasattr(config, "dev_path"):
        dev_dataset = dataset_class(config.dev_path, task.labels)
    else:
        dev_dataset = None

    if hasattr(config, "test_path"):
        test_dataset = dataset_class(config.test_path, task.labels)
    else:
        test_dataset = None

    results = {}
    for pretrained_model in config.nli_models:
        nlp = EntailmentClassifier(pretrained_model, **vars(config))
        results[pretrained_model] = {}

        if dev_dataset:
            _, output = nlp(task=task, features=dev_dataset, negative_threshold=0.0, return_raw_output=True, **vars(config))
            dev_labels = dev_dataset.labels

            # Save the output
            os.makedirs(
                f"experiments/{config.name}/{pretrained_model}/dev",
                exist_ok=True,
            )
            np.save(
                f"experiments/{config.name}/{pretrained_model}/dev/output.npy",
                output,
            )
            np.save(
                f"experiments/{config.name}/{pretrained_model}/dev/labels.npy",
                dev_labels,
            )

            # If dev data then optimize the threshold on it
            dev_results = task.compute_metrics(dev_labels, output, threshold="optimize")
            results[pretrained_model]["dev"] = dev_results

            with open(f"experiments/{config.name}/results.json", "wt") as f:
                json.dump(results, f, indent=4)

        if test_dataset:
            _, output = nlp(task=task, features=test_dataset, negative_threshold=0.0, return_raw_output=True, **vars(config))
            test_labels = test_dataset.labels

            # Save the output
            os.makedirs(
                f"experiments/{config.name}/{pretrained_model}/test",
                exist_ok=True,
            )
            np.save(
                f"experiments/{config.name}/{pretrained_model}/test/output.npy",
                output,
            )
            np.save(
                f"experiments/{config.name}/{pretrained_model}/test/labels.npy",
                test_labels,
            )

            optimal_threshold = 0.5 if not dev_dataset else dev_results["optimal_threshold"]
            test_results = task.compute_metrics(test_labels, output, threshold=optimal_threshold)
            results[pretrained_model]["test"] = test_results

            with open(f"experiments/{config.name}/results.json", "wt") as f:
                json.dump(results, f, indent=4)

        nlp.clear_gpu_memory()
        del nlp
        torch.cuda.empty_cache()


if __name__ == "__main__":
    parser = argparse.ArgumentParser("a2t.evaluation")
    parser.add_argument("--config", type=str, help="Config with task (schema) and data information.")

    args = parser.parse_args()
    main(args)