{
# MinimalConfig
# Base config class providing general settings for non-mutability and json serialization options
# Settings for launching the training job on the available hosts
"runner": {
# RunnerConfig
# Base config class providing general settings for non-mutability and json serialization options
# Type of the runner to be invoked.
"runner_type": "pdsh",
# Hostsfile path (in MPI style) that defines the resource pool available to the job (e.g., worker-0 slots=4)
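# Illustrative hostsfile contents (placeholder node names), one MPI-style entry per host:
#   worker-0 slots=4
#   worker-1 slots=4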
"hostsfile": null,
# List of hosts as an alternative to hostsfile (e.g., worker-0 slots=4)
"hosts": null,
# (optional) Port used by PyTorch distributed for communication during training.
"master_port": 29501,
# (optional) IP address of node 0; will be inferred via 'hostname -I' if not specified.
"master_addr": null,
# User script to launch
"script": "examples/mlp_example/train.py",
# Number of GPUs per node; used if the slot count is not defined in the hostsfile/hosts entries
"default_gpu_count": 8,
# Docker configuration, used with a Docker-based runner type
"docker_config": {
# RunnerDockerConfig
# Base config class providing general settings for non-mutability and json serialization options
# Name of the docker container to be started
"docker_container": null,
# Run docker command with sudo
"docker_sudo": false,
# List of directories to be mounted in the Docker container under the same path
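# An illustrative (hypothetical) value: ["/data", "/checkpoints"], each path mounted
# at the same location inside the container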
"docker_mounts": null
}
},
# Parallelism and batch size topology of the training job
"topology": {
# TopologyConfig
# Base config class providing general settings for non-mutability and json serialization options
# Global rank of the current process
"global_rank": null,
# Local slot of the current process on its node
"local_slot": null,
# Degree of model parallelism
"model_parallel_size": 1,
# Degree of pipeline parallelism
"pipe_parallel_size": 1,
# Degree of data parallelism
"data_parallel_size": 1,
# Global train batch size, including all gradient accumulation steps
"global_batch_size": null,
# Batch size for one training micro step. Used to determine the number of gradient accumulation steps when the global_batch_size cannot fit in GPU memory.
"micro_batch_size": 256,
# Number of gradient accumulation steps. Used when the global_batch_size cannot fit in GPU memory in a single micro step.
"gradient_accumulation_steps": 1
},
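# Note: under the usual data-parallel convention (a sketch, not taken from this codebase)
# the topology values above are related by
#   global_batch_size = micro_batch_size * gradient_accumulation_steps * data_parallel_size
# e.g. 256 * 1 * 1 = 256 with the values above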
# Optimizer settings (AdamW)
"optimizer": {
# AdamWOptimizerConfig
# Base config class providing general settings for non-mutability and json serialization options
# First coefficient used for computing running averages of gradient and its square
"beta1": 0.9,
# Second coefficient used for computing running averages of gradient and its square
"beta2": 0.95,
# Term added to the denominator to improve numerical stability (default: 1e-8)
"eps": 1e-08,
# Clip the global L2 norm of the gradients to this value; deactivated if 0.0
"gradient_clipping": 1.0,
# Number of floating point values to allreduce in one go
"allreduce_bucket_size": 500000000
},
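# For reference, in a plain PyTorch setup the optimizer settings above (together with
# the learning rate and weight decay configured below) would roughly map to
#   torch.optim.AdamW(params, lr=learning_rate, betas=(beta1, beta2), eps=eps,
#                     weight_decay=weight_decay)
# with gradient_clipping applied via e.g. torch.nn.utils.clip_grad_norm_;
# illustrative only, the actual wiring lives in the training code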
# Learning rate schedule settings
"learning_rate_scheduler": {
# LearningRateSchedulerConfig
# Base config class providing general settings for non-mutability and json serialization options
# Base learning rate; this is also the maximum learning rate.
"learning_rate": 0.01,
# Minimum learning rate below which a step's learning rate will never drop. This is the final learning rate after the schedule has been applied.
"learning_rate_minimum": 0.0,
# Shape of the learning rate decay after warm up
"learning_rate_decay_style": "cosine",
# Number of iterations within which the learning rate follows the schedule. Warmup iterations are included.
"learning_rate_decay_iters": 1000,
# Number of warmup steps during which the learning rate is linearly increased to the maximum learning rate. The actual schedule starts after the warmup steps.
"learning_rate_warmup_steps": 0
},
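# Rough shape of the schedule configured above (a sketch assuming standard linear
# warmup followed by cosine decay; the exact implementation may differ):
#   step <  warmup_steps: lr = learning_rate * step / warmup_steps
#   step >= warmup_steps: lr = learning_rate_minimum
#     + 0.5 * (learning_rate - learning_rate_minimum)
#     * (1 + cos(pi * (step - warmup_steps) / (learning_rate_decay_iters - warmup_steps)))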
# General training settings
"training": {
# TrainingConfig
# Base config class providing general settings for non-mutability and json serialization options
# Weight decay coefficient applied by the optimizer
"weight_decay": 0.0001
},
# Trainer settings: checkpointing, run length and evaluation
"trainer": {
# Directory for saving checkpoints
"save_dir": ".checkpoints",
# Save a checkpoint to save_dir every 'save_interval' steps, iff save_dir is defined
"save_interval": 101,
# Directory for loading checkpoints
"load_dir": null,
# Total number of training iterations
"train_iterations": 100,
# Run an evaluation every 'eval_interval' training iterations
"eval_interval": 10,
# Number of iterations per evaluation run
"eval_iterations": 1000,
# Random seed for reproducibility
"seed": 42,
# Error out if a checkpoint could not be loaded
"assert_checkpoint_loaded": false
},
"logger": {
# Directory for saving log files
"log_dir": "debug_logs",
# Log metrics to Weights & Biases
"use_wandb": false,
# Log metrics to TensorBoard
"use_tensorboard": false
},
"architecture": {
# Number of hidden layers in the MLP
"n_hidden_layers": 3,
# Hidden layer dimension
"hidden_dim": 128
}
}