-
Notifications
You must be signed in to change notification settings - Fork 31
/
run_derive_ckpts.py
70 lines (56 loc) · 2.15 KB
/
run_derive_ckpts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# pylint: disable=invalid-name,missing-docstring,redefined-outer-name
# -*- coding: utf-8 -*-
import os
import copy
import argparse
import subprocess
import multiprocessing
from io import StringIO
import yaml
# Default GPU ids; one worker process is started per entry. Override with --gpus.
GPUs = [4, 5, 6, 7]

# Usage: [--epochs E1,E2,...] [--arch-file FILE] [--gpus G1,G2,...] BASE_DIR
parser = argparse.ArgumentParser()
parser.add_argument("--epochs", default=None)
parser.add_argument("--arch-file", default=None)
parser.add_argument("--gpus", default=None, type=str)
# parse_known_args returns the unrecognized arguments as a list; its first
# item is taken as the base directory.
args, base_dir = parser.parse_known_args()
if args.gpus is not None:
    GPUs = [int(g) for g in args.gpus.split(",")]
# Fail with a usage message instead of an IndexError / AttributeError traceback.
if not base_dir:
    parser.error("the base directory must be given as a positional argument")
base_dir = base_dir[0]
if args.epochs is None:
    parser.error("--epochs is required (comma-separated list)")
epochs = args.epochs.split(",")

# Load the config stored in the base directory.
cfg_file = os.path.join(base_dir, "config.yaml")
with open(cfg_file, "r") as cfg_stream:
    # yaml.load() without an explicit Loader is rejected by PyYAML >= 6.
    # NOTE(review): safe_load refuses python-specific YAML tags -- confirm the
    # configs handled here are plain YAML.
    cfg = yaml.safe_load(cfg_stream)

# Force the "eval_arch" switch on, but only when the config already defines it.
if "eval_arch" in cfg["weights_manager_cfg"]:
    cfg["weights_manager_cfg"]["eval_arch"] = True

# From here on cfg_file refers to the derived config written next to the original.
cfg_file = os.path.join(base_dir, "config_derive.yaml")
with open(cfg_file, "w") as cfg_stream:
    yaml.dump(cfg, cfg_stream)

# One worker per GPU; the bounded queue makes the producer block when it is full.
num_processes = len(GPUs)
queue = multiprocessing.Queue(maxsize=num_processes)
def _worker(p_id, gpu_id, queue):
while 1:
token = queue.get()
if token is None:
break
# ckpt_dir, res_file = token
cfg_file, arch_file, ckpt_dir, out_file, derive_log = token
# call eval-arch
cmd = (
"awnas eval-arch {} {} --load {} --dump-rollouts {} --gpu {} --seed 123 "
">{} 2>&1"
).format(cfg_file, arch_file, ckpt_dir, out_file, gpu_id, derive_log)
print("Process #{}: ckpt {}; CMD: {}".format(p_id, ckpt_dir, cmd))
subprocess.check_call(cmd, shell=True)
print("Process #{} end".format(p_id))
# Guard the process-spawning part: under the "spawn" start method every child
# re-imports this module, and starting processes at import time would then fail
# or recurse.
if __name__ == "__main__":
    # Start one worker per GPU and keep the handles so they can be joined.
    workers = []
    for p_id, gpu_id in enumerate(GPUs):
        p = multiprocessing.Process(target=_worker, args=(p_id, gpu_id, queue))
        p.start()
        workers.append(p)

    # Directory receiving one log file per epoch.
    derive_logs = os.path.join(base_dir, "derive_logs")
    if not os.path.exists(derive_logs):
        os.makedirs(derive_logs)

    # Enqueue one job per epoch; put() blocks while the bounded queue is full.
    for epoch in epochs:
        ckpt_dir = os.path.join(base_dir, epoch)
        out_file = os.path.join(base_dir, "derive_results_{}.pkl".format(epoch))
        derive_log = os.path.join(derive_logs, "{}.log".format(epoch))
        queue.put((cfg_file, args.arch_file, ckpt_dir, out_file, derive_log))

    # close all the workers: one None token per started process
    for _ in workers:
        queue.put(None)

    # Wait until every worker has drained its tokens and exited.
    for p in workers:
        p.join()