quantize.py (forked from dusty-nv/jetson-containers)

#!/usr/bin/env python3
# quantize a HuggingFace CausalLM .pt model with AWQ
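#
# example usage (the model path here is illustrative, not part of the original script):
#   python3 quantize.py --model /data/models/llama-2-7b --w_bit 4 --q_group_size 128
#
# by default, the quantized weights are written under /data/models/awq/<model name>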
import os
import argparse
import subprocess

parser = argparse.ArgumentParser()

parser.add_argument('--model', type=str, required=True, help="model name or path to model directory (should be a .pt model)")
parser.add_argument('--output', type=str, default='', help="directory to output the quantized model to (default is /data/models/awq/$MODEL)")
parser.add_argument('--load_awq', type=str, default='', help="load a model that's already had AWQ search performed on it (e.g. from the AWQ Model Zoo), and skip the search step")
parser.add_argument('--w_bit', type=int, default=4, choices=[3,4], help="the number of bits to quantize the weights to (3 or 4)")
parser.add_argument('--q_group_size', type=int, default=128, help="the quantization group size (default 128)")
parser.add_argument('--no_cache', action='store_true', help="re-dump the quantized AWQ weights even if the output file already exists")
parser.add_argument('--skip_eval', action='store_true', help="skip evaluating the quantized model on wikitext")
parser.add_argument('--simulate', action='store_true', help="print out the commands without actually running them")

args = parser.parse_args()

if not args.output:
    args.output = f"/data/models/awq/{os.path.basename(args.model)}"

print(args)
os.makedirs(args.output, exist_ok=True)

prefix_awq = f"w{args.w_bit}-g{args.q_group_size}"

model_search = os.path.join(args.output, f"{prefix_awq}.pt") if not args.load_awq else args.load_awq
model_quant = os.path.join(args.output, f"{prefix_awq}-awq-v2.pt")
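# with the defaults, model_search is <output>/w4-g128.pt (the AWQ search results)
# and model_quant is <output>/w4-g128-awq-v2.pt (the real quantized INT4 weights)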

def run_cmd(cmd):
    print("\nRunning command:\n")
    print(cmd, "\n")

    if not args.simulate:
        subprocess.run(cmd, executable='/bin/bash', shell=True, check=True)
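
# arguments common to every awq.entry invocation below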
cmd_prefix = f"python3 -m awq.entry --model_path {args.model} --w_bit {args.w_bit} --q_group_size {args.q_group_size}"

# Perform AWQ search
if not args.load_awq:
    run_cmd(f"{cmd_prefix} --run_awq --dump_awq {model_search}")

# Evaluate the AWQ quantized model on WikiText-2 (simulated pseudo quantization)
if not args.load_awq and not args.skip_eval:
    run_cmd(f"{cmd_prefix} --tasks wikitext --load_awq {model_search} --q_backend fake")

# Generate real quantized weights (INT4)
if args.no_cache or not os.path.isfile(model_quant):
    run_cmd(f"{cmd_prefix} --load_awq {model_search} --q_backend real --dump_quant {model_quant}")

# Load and evaluate the real quantized model
if not args.skip_eval:
    run_cmd(f"{cmd_prefix} --tasks wikitext --load_quant {model_quant}")