"""Train a model using active learning on a dataset."""
from copy import deepcopy
import datetime
import os
import shutil
import sys
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from chemprop.parsing import parse_train_args
from chemprop.train.run_training import run_training, get_dataset_splits, get_atomistic_splits, evaluate_models
from chemprop.utils import create_logger
def ordered_list_diff(a, b):
""" Returns the elements of a without any elements of b, while preserving
order.
:param a: list or array to remove elements from
:param b: list or array to find and remove
"""
list_diff = a[~np.in1d(a, b)]
return list_diff
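

# Overall flow (per trial and per acquisition strategy):
#   1. Split the dataset into a training pool, a validation set, and a test set.
#   2. Start from a random subset of the pool (a fraction args.al_init_ratio of it).
#   3. For args.num_al_loops steps: train models on the current subset, score the
#      remaining pool (uncertainty, error, or predicted value depending on the
#      strategy), acquire the next batch, and log test performance.
# Args read in this script: data_path, save_dir, confidence, num_folds, atomistic,
# slurm_job, al_end_ratio, al_init_ratio, task_inds, al_step_scale, num_al_loops,
# al_strategy, use_std, acquire_min, al_std_mult, al_topk, metric, quiet.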
if __name__ == '__main__':
args = parse_train_args()
logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet)
method = args.confidence
dataset = os.path.basename(os.path.splitext(args.data_path)[0])
results_root = Path(args.save_dir).parent #f"./al_results/{dataset}/{method}/"
Path(results_root).mkdir(parents=True, exist_ok=True)
for i_trial in range(args.num_folds):
df = pd.DataFrame(
columns=["Trial", "Train Data Ratio", "Score", "Uncertainty", "Entropy"])
### Load the data
## Atomistic network
if args.atomistic:
# Copy atomistic db to a temp folder
if args.slurm_job and os.environ.get("TMPDIR") is not None:
tmp_dir = os.environ.get("TMPDIR")
_, file_name = os.path.split(args.data_path)
old_loc = args.data_path
new_loc = os.path.join(tmp_dir, file_name)
shutil.copy2(args.data_path, new_loc)
args.data_path = new_loc
                clean_up = lambda: os.remove(new_loc)
(all_train_data, val_data, test_data), features_scaler, scaler = \
get_atomistic_splits(args.data_path, args, logger)
else:
(all_train_data, val_data, test_data), features_scaler, scaler = \
get_dataset_splits(args.data_path, args, logger)
### Define active learning step variables and subsample the tasks
n_total = len(all_train_data)
n_sample = n_total
n_loops = args.num_al_loops
### Change active learning n_sample for early stopping
if args.al_end_ratio is not None:
if args.al_end_ratio > 1:
raise ValueError("Arg al_end_ratio must be less than train size")
total_data = len(all_train_data) + len(val_data) + len(test_data)
early_stop_num = int(n_total * args.al_end_ratio)
n_sample = early_stop_num
n_start = int(n_total * args.al_init_ratio)
if args.task_inds != []:
all_train_data.sample_task_ind(args.task_inds)
val_data.sample_task_ind(args.task_inds)
test_data.sample_task_ind(args.task_inds)
if scaler is not None:
scaler.means = scaler.means[args.task_inds]
scaler.stds = scaler.stds[args.task_inds]
print(f"Ratio targets 0/1: {np.nanmean(np.array(all_train_data.targets(), dtype=np.float), axis=0)}")
### Compute the number of samples to use at each step of active learning
if args.al_step_scale == "linear":
n_samples_per_run = np.linspace(n_start, n_sample, n_loops)
elif args.al_step_scale == "log":
n_samples_per_run = np.logspace(np.log10(n_start), np.log10(n_sample), n_loops)
else:
raise ValueError(f"unknown args.al_step_scale = {args.al_step_scale}")
n_samples_per_run = np.round(n_samples_per_run).astype(int)
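        # n_samples_per_run[i] is the cumulative training-set size at AL step i;
        # the batch acquired after step i is n_samples_per_run[i+1] - n_samples_per_run[i].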
        ### Sample the initial training subset outside the strategy loop so every
        # strategy starts from the same random batch
train_subset_inds_start = np.random.choice(n_total, n_start, replace=False)
for strategy in args.al_strategy:
train_subset_inds = np.copy(train_subset_inds_start)
tic_time = time.time() # grab the current time for logging
### Main active learning loop
for i in range(n_loops):
print(f"===> [{strategy}] Running trial {i_trial} with {n_samples_per_run[i]} samples")
train_data = all_train_data.sample_inds(train_subset_inds)
### Train with the data subset, return the best models
models = run_training(
train_data, val_data, scaler, features_scaler, args, logger)
### Sample according to a strategy
if "explorative" in strategy or "score" in strategy or "exploit" in strategy:
### Evaluate confidences on entire training set
all_train_data_unscaled = deepcopy(all_train_data)
if scaler is not None:
all_train_data_unscaled.set_targets(scaler.inverse_transform(all_train_data.targets()))
                    # Call with export_std=True, which returns:
                    # ensemble_scores, ensemble_predictions, confidence, std, entropy
all_train_scores, all_train_preds, all_train_conf, all_train_std, all_train_entropy = evaluate_models(
models, train_data, all_train_data_unscaled, scaler, args, logger, export_std=True)
                    ### Find the lowest-confidence (highest-uncertainty) samples and add
                    # them to the training inds; uncertainty and entropy are averaged
                    # across tasks
sq_error = np.square(
np.array(all_train_data_unscaled.targets()) - all_train_preds)
                    rmse = np.sqrt(np.mean(sq_error.astype(np.float32), axis=1))
# for evidence, either use std or conf for the uncertainty
if args.use_std:
mean_uncertainty = np.mean(all_train_std, axis=1)
else:
mean_uncertainty = np.mean(all_train_conf, axis=1)
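                    # mean_uncertainty now holds one value per pool molecule: the per-task
                    # std (when args.use_std is set) or confidence averaged across tasks,
                    # used to weight acquisition below.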
if "explorative" in strategy:
per_sample_weight = mean_uncertainty
elif "score" in strategy:
per_sample_weight = rmse
elif "exploit" in strategy:
scaled_preds = scaler.transform(all_train_preds)
per_sample_weight = np.mean(scaled_preds, 1).astype(np.float32)
                        # Flip the sign when acquiring minima; weights are shifted to be
                        # non-negative further below
if args.acquire_min:
per_sample_weight *= -1
std_mult = args.al_std_mult
if "_lcb" in strategy: # lower confidence bound
per_sample_weight += -std_mult * mean_uncertainty
elif "_ucb" in strategy: # upper confidence bound
per_sample_weight += +std_mult * mean_uncertainty
elif "_ts" in strategy: # thompson sampling
per_sample_weight = np.random.normal(
per_sample_weight, mean_uncertainty)
per_sample_weight -= per_sample_weight.min()
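                        # Shift so the exploit weights are non-negative; they are later
                        # normalized into sampling probabilities (or argsorted for greedy picks).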
### Save all the smiles along with their uncertainties/errors
train_subset_mask = np.zeros((n_total,))
train_subset_mask[train_subset_inds] = 1
df_scores = pd.DataFrame(data={
"Smiles": all_train_data.smiles(),
"Uncertainty": mean_uncertainty,
"Error": rmse,
"TrainInds": train_subset_mask
})
Path(os.path.join(results_root, "tracks")).mkdir(
parents=True, exist_ok=True)
df_scores.to_csv(os.path.join(results_root, "tracks",
f"{strategy}_step_{i}_{tic_time}.csv"))
elif strategy == "random":
per_sample_weight = np.ones((n_total,)) # uniform
else:
raise ValueError(f"Unknown active learning strategy {strategy}")
### Evaluate performance on test set and save
evals_results = evaluate_models(models, train_data, test_data,
scaler, args, logger,
export_std=True, export_single_model_preds=True)
if args.confidence:
test_scores, test_preds, test_conf, test_std, test_entropy, test_single_scores, test_single_preds = evals_results
else:
test_scores, test_preds, test_conf, test_entropy, test_single_scores, test_single_preds = evals_results
test_std = test_conf = test_entropy = np.zeros_like(test_preds)
### Compute the top-k percent acquired
                # Grab the top-k target values (averaged over tasks) in the full training pool
top_k_scores_in_pool = np.sort(
np.mean(all_train_data.targets(), 1))
top_k_scores_in_pool = top_k_scores_in_pool[:args.al_topk] \
if args.acquire_min else \
top_k_scores_in_pool[-args.al_topk:]
top_k_scores_in_selection = np.sort(
np.mean(train_data.targets(), 1))
top_k_scores_in_selection = top_k_scores_in_selection[:args.al_topk] \
if args.acquire_min else \
top_k_scores_in_selection[-args.al_topk:]
                # Find which of the selection's top-k values appear among the pool's top-k values
selection_overlap = np.in1d(top_k_scores_in_selection,
top_k_scores_in_pool)
# Compute the percent overlap
percent_top_k_overlap = np.mean(selection_overlap) * 100
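                # This value-based overlap is a recall-style measure of how much of the
                # pool's top-k has already been acquired; it is logged as "TopK" in the
                # per-step scores CSV below.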
                ### Evaluate performance with the complementary metric
                # (MAE if the primary metric is RMSE, and vice versa)
args_other = deepcopy(args)
args_other.metric = "mae" if args.metric == "rmse" else "rmse"
if args.confidence:
test_scores_other, _, _, _, _, test_single_scores_other, test_single_preds_other = evaluate_models(
models, train_data, test_data, scaler, args_other, logger, export_std=True, export_single_model_preds=True)
else:
test_scores_other, _, _, _, test_single_scores_other, test_single_preds_other = evaluate_models(
models, train_data, test_data, scaler, args_other, logger, export_std=True, export_single_model_preds=True)
if args.confidence == "ensemble":
test_scores = test_single_scores
test_preds = test_single_preds
test_scores_other = test_single_scores_other
test_preds_other = test_single_preds_other
                # DataFrame.append was removed in pandas 2.0; build a one-row frame and concat
                df = pd.concat([df, pd.DataFrame([{
                    'Train Data Ratio': n_samples_per_run[i]/float(n_total),
                    'Score': np.mean(test_scores),
                    'Score_' + args_other.metric: np.mean(test_scores_other),
                    'Uncertainty': np.mean(test_conf),
                    'Standard Deviation': np.mean(test_std),
                    'Entropy': np.mean(test_entropy),
                    'Trial': i_trial,
                    'Strategy': strategy,
                    'Tasks': args.task_inds,
                }])], ignore_index=True)
### Save the complete test performance (including uncs) to log
test_error = test_preds - np.array(test_data.targets())
log_data_dict = {
f"Error_{t}": test_error[:,t]
for t in range(test_error.shape[1])}
log_data_dict.update({
"Smiles": test_data.smiles(),
"Uncertainty": np.mean(test_conf, 1),
"Entropy": np.mean(test_entropy, 1),
"Std": np.mean(test_std, 1),
"TopK": percent_top_k_overlap,
"Train Data Ratio": n_samples_per_run[i]/float(n_total),
})
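                # One row per test molecule: signed error per task plus the mean predicted
                # uncertainty/entropy/std, tagged with this step's top-k overlap and train ratio.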
df_test_log = pd.DataFrame(data=log_data_dict)
Path(os.path.join(results_root, "scores")).mkdir(
parents=True, exist_ok=True)
df_test_log.to_csv(os.path.join(results_root, "scores",
f"{strategy}_step_{i}_{tic_time}.csv"))
logger.info("Percent top-k = {}".format(round(percent_top_k_overlap, 2)))
### Add new samples to training set
n_add = n_samples_per_run[min(i+1, n_loops-1)] - n_samples_per_run[i]
if n_add > 0: # n_add = 0 on the last iteration, when we are done
# Probability of sampling a new point, depends on the weight
per_sample_prob = deepcopy(per_sample_weight)
# Exclude data we've already trained with, and normalize to probability
per_sample_prob[train_subset_inds] = 0.0
per_sample_prob = per_sample_prob / per_sample_prob.sum()
# Sample accordingly and add to our training inds
if "sample" in strategy:
train_inds_to_add = np.random.choice(n_total, size=n_add, p=per_sample_prob, replace=False)
else:
                        # greedy: just pick the highest-probability indices
inds_sorted = np.argsort(per_sample_prob) # smallest to largest
train_inds_to_add = inds_sorted[-n_add:] # grab the last k inds
# Add the indices to the training set
train_subset_inds = np.append(train_subset_inds, train_inds_to_add)
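                # Free the trained models and release cached GPU memory before the next
                # AL step retrains from scratch on the enlarged subset.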
del models
torch.cuda.empty_cache()
# END SINGLE FOLD
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S.%f")
df.to_csv(os.path.join(results_root, f"{timestamp}_{args.task_inds}.csv"))
print(f"Done with all folds and saved into {results_root}")
os._exit(1)