{
"summary_flush_interval": 16,
"time_patch": 1,
"n_ctx": 1024,
"chunk_size": 16384,
"language_token_per_frame": 1024,
"n_head": 8,
"storage_dtype": "bfloat16",
"n_embd": 8192,
"n_blocks": 16,
"intermediate_feed_forward_multiplier": 0.0625,
"group_linear_factor": 32,
"buffer_size": 16,
"interleaved_datasets": 8,
"shuffle_buffer": 64,
"is_training": true,
"num_parallel_readers": 8,
"num_parallel_calls": 64,
"cache_decoded": true,
"cache": false,
"num_cores": 8,
"prefetch_depth_auto_tune": true,
"input_fn": "gpt_random_input",
"memory_token": [0, 0],
"token_patch_size": 1,
"z_loss": 0.0001,
"label_smoothing": 0,
"optimizer": "sm3",
"debug_train_step": false,
"learning_rate": 0.0005,
"recompute_grad": true,
"three_axes": false,
"use_language": true,
"use_video": false,
"padding_token": 4,
"model_mode": "gpt",
"mesh_shape": "b:1,y:8",
"layout": "batch:b,heads:y",
"dataset_configs": [{"type": "text", "path": "gs://jannet/the-char-pile/*", "weight": 1}],
"block_config": [{"layer": ["norm-group-instance", "activation", "feed_forward-group", "rezero"]},
{"layer": ["norm-group-instance", "activation", "attention-context", "rezero"]}],
"model_path": "gs://mlpublic-euw4/char-runs/video-head=8-embd=8192-blocks=16-ctx=1024-gll=32-iffm=0.0625-decay=0-z=1e-5-lr=5e-5-batch=1-sm3-storage=bfp16-3",
"vocab_size": 256,
"embed_dropout": 0,
"lr_decay": "cosine",
"opt_beta1": 0.9,
"opt_beta2": 0.95,
"opt_epsilon": 1e-8,
"ada_epsilon1": 1e-30,
"ada_epsilon2": 1e-3,
"opt_name": "adam",
"weight_decay": 0,
"attn_dropout": 0,
"train_steps": 572300,
"eval_steps": 0,
"predict_steps": 1,
"res_dropout": 0,
"eval_batch_size": 64,
"predict_batch_size": 1,
"datasets": [],
"shuffle": false,
"residual": true,
"scale_by_depth": true,
"scale_by_in": false,
"activation_function": "gelu",
"gradient_clipping": 1.0,
"train_batch_size": 32,
"iterations": 10,
"warmup_steps": 0
}