generate_checkpoint_config.py (forked from NVIDIA/TensorRT-LLM)
"""Generate a TensorRT-LLM checkpoint config.json from command-line arguments."""

import argparse
import json
import os

from tensorrt_llm.quantization import KV_CACHE_QUANT_ALGO_LIST, QUANT_ALGO_LIST


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output_path',
        type=str,
        default='config.json',
        help='The path to save the TensorRT-LLM checkpoint config.json file')
    parser.add_argument('--architecture', type=str, default='GPTForCausalLM')
    parser.add_argument('--dtype',
                        type=str,
                        default='float16',
                        choices=['float32', 'bfloat16', 'float16'])
    parser.add_argument('--vocab_size', type=int, default=32000)
    parser.add_argument('--max_position_embeddings', type=int, default=1024)
    parser.add_argument('--hidden_size', type=int, default=768)
    parser.add_argument('--intermediate_size', type=int, default=None)
    parser.add_argument('--num_hidden_layers', type=int, default=12)
    parser.add_argument('--num_attention_heads', type=int, default=12)
    parser.add_argument('--num_key_value_heads', type=int, default=None)
    parser.add_argument('--hidden_act', type=str, default='gelu')
    parser.add_argument('--norm_epsilon', type=float, default=1e-5)
    parser.add_argument('--position_embedding_type',
                        type=str,
                        default='learned_absolute')
    parser.add_argument(
        '--use_parallel_embedding',
        action='store_true',
        default=False,
        help=
        'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled.'
    )
    parser.add_argument(
        '--embedding_sharding_dim',
        type=int,
        default=0,
        choices=[0, 1],
        help=
        'By default the embedding lookup table is sharded along the vocab dimension (embedding_sharding_dim=0). '
        'To shard it along the hidden dimension, set embedding_sharding_dim=1. '
        'Note: embedding sharing is only enabled when embedding_sharding_dim = 0.'
    )
    parser.add_argument(
        '--share_embedding_table',
        action='store_true',
        default=False,
        help=
        'Try to reduce the engine size by sharing the embedding lookup table between two layers. '
        'Note: the flag might not take effect when the criteria are not met.')
    parser.add_argument('--tp_size',
                        type=int,
                        default=1,
                        help='N-way tensor parallelism size')
    parser.add_argument('--pp_size',
                        type=int,
                        default=1,
                        help='N-way pipeline parallelism size')
    parser.add_argument('--quant_algo',
                        type=str,
                        default=None,
                        choices=[None] + QUANT_ALGO_LIST)
    parser.add_argument('--kv_cache_quant_algo',
                        type=str,
                        default=None,
                        choices=[None] + KV_CACHE_QUANT_ALGO_LIST)
    parser.add_argument('--group_size', type=int, default=64)
    parser.add_argument('--smoothquant_val', type=float, default=None)
    parser.add_argument('--has_zero_point', default=False, action='store_true')
    parser.add_argument('--pre_quant_scale',
                        default=False,
                        action='store_true')
    parser.add_argument('--exclude_modules', nargs='+', default=None)
    parser.add_argument('--bias', default=False, action='store_true')
    parser.add_argument('--apply_query_key_layer_scaling',
                        default=False,
                        action='store_true')
    parser.add_argument('--rotary_pct', type=float, default=1.0)
    parser.add_argument('--rotary_base', type=float, default=10000.0)
    parser.add_argument('--rotary_scaling', nargs=2, type=str, default=None)

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_arguments()
    world_size = args.tp_size * args.pp_size

    assert args.output_path.endswith('.json')
    output_dir = os.path.dirname(args.output_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    config = {
        'architecture': args.architecture,
        'dtype': args.dtype,
        'vocab_size': args.vocab_size,
        'max_position_embeddings': args.max_position_embeddings,
        'hidden_size': args.hidden_size,
        'intermediate_size': args.intermediate_size,
        'num_hidden_layers': args.num_hidden_layers,
        'num_attention_heads': args.num_attention_heads,
        'num_key_value_heads': args.num_key_value_heads,
        'hidden_act': args.hidden_act,
        'norm_epsilon': args.norm_epsilon,
        'position_embedding_type': args.position_embedding_type,
        'use_parallel_embedding': args.use_parallel_embedding,
        'embedding_sharding_dim': args.embedding_sharding_dim,
        'share_embedding_table': args.share_embedding_table,
        'quantization': {
            'quant_algo': args.quant_algo,
            'kv_cache_quant_algo': args.kv_cache_quant_algo,
            'exclude_modules': args.exclude_modules,
        },
        'mapping': {
            'world_size': world_size,
            'tp_size': args.tp_size,
            'pp_size': args.pp_size,
        },
        'bias': args.bias,
        'apply_query_key_layer_scaling': args.apply_query_key_layer_scaling,
        'rotary_pct': args.rotary_pct,
        'rotary_base': args.rotary_base,
        'rotary_scaling': args.rotary_scaling,
    }

    # Fill in derived defaults when they are not given explicitly.
    if args.intermediate_size is None:
        config['intermediate_size'] = args.hidden_size * 4
    if args.num_key_value_heads is None:
        config['num_key_value_heads'] = args.num_attention_heads

    # Weight-only (AWQ/GPTQ) and SmoothQuant algorithms carry extra fields.
    if args.quant_algo is not None:
        if 'AWQ' in args.quant_algo or 'GPTQ' in args.quant_algo:
            config['quantization'].update({
                'group_size': args.group_size,
                'has_zero_point': args.has_zero_point,
                'pre_quant_scale': args.pre_quant_scale,
            })
        if 'SQ' in args.quant_algo:
            config['quantization'].update({
                'smoothquant_val': args.smoothquant_val,
            })

    with open(args.output_path, 'w') as f:
        json.dump(config, f, indent=4)
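
A minimal usage sketch, assuming the script is run from its own directory; the output path 'tmp_ckpt/config.json' and the overridden flag values are illustrative, not part of the script:

# Example invocation (hypothetical path and values):
#   python generate_checkpoint_config.py \
#       --output_path tmp_ckpt/config.json \
#       --architecture GPTForCausalLM \
#       --dtype float16 --hidden_size 768 --tp_size 2
#
# Reading back the generated file shows the derived fields: 'world_size'
# is tp_size * pp_size (2 * 1 = 2 here), and 'intermediate_size' falls back
# to 4 * hidden_size (3072) because it was not passed explicitly.
import json

with open('tmp_ckpt/config.json') as f:
    generated = json.load(f)

print(generated['mapping']['world_size'])  # 2
print(generated['intermediate_size'])      # 3072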