When I run the script:

```
CUDA_VISIBLE_DEVICES=0 python llama.py ${MODEL_DIR} c4 --wbits 4 --true-sequential --act-order --groupsize 128 --eval --save llama7b-4bit-128g.pt &>baseline.txt &
```
I get the same perplexity (ppl) as in the README. But when I run inference with the saved int4 weights:

```
CUDA_VISIBLE_DEVICES=0 python llama_inference.py decapoda-research/llama-7b-hf --wbits 4 --groupsize 128 --load llama7b-4bit-128g.pt --text "this is llama"
```

I get the following error:
```
Loading model ...
Found 3 unique KN Linear values.
Warming up autotune cache ...
0%| | 0/12 [00:00<?, ?it/s]/usr/bin/ld: cannot find -lcuda
collect2: error: ld returned 1 exit status
0%| | 0/12 [00:00<?, ?it/s]
Traceback (most recent call last):
File "", line 21, in matmul_248_kernel
KeyError: ('2-.-0-.-0-37ce7529e37ca1a0b8a47b63bc5fd4b0-d6252949da17ceb5f3a278a70250af13-3b85c7bef5f0a641282f3b73af50f599-2d732a2488b7ed996facc3e641ee56bf-3498c340fd4b6ee7805fd54b882a04f5-e1f133f98d04093da2078dfc51c36b72-b26258bf01f839199e39d64851821f26-d7c06e3b46e708006c15224aac7a1378-f585402118c8a136948ce0a49cfe122c', (torch.float16, torch.int32, torch.float16, torch.float16, torch.int32, torch.int32, 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32'), (16, 256, 32, 8), (True, True, True, True, True, True, (False, True), (True, False), (True, False), (False, False), (False, False), (True, False), (False, True), (True, False), (False, True), (True, False), (False, True), (True, False), (True, False)))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/jingming.guo/Mars/GPTQ-for-LLaMa/llama.py", line 483, in
model = load_quant(args.model, args.load, args.wbits, args.groupsize)
File "/home/jingming.guo/Mars/GPTQ-for-LLaMa/llama.py", line 319, in load_quant
quant.autotune_warmup_linear(model, transpose=not (eval))
File "/home/jingming.guo/Mars/GPTQ-for-LLaMa/quant/quant_linear.py", line 419, in autotune_warmup_linear
matmul248(a, qweight, scales, qzeros, g_idx, bits, maxq)
File "/home/jingming.guo/Mars/GPTQ-for-LLaMa/quant/quant_linear.py", line 267, in matmul248
matmul_248_kernel[grid](input, qweight, output, scales, qzeros, g_idx, input.shape[0], qweight.shape[1], input.shape[1], bits, maxq, input.stride(0), input.stride(1), qweight.stride(0),
File "/home/jingming.guo/Mars/GPTQ-for-LLaMa/quant/custom_autotune.py", line 90, in run
timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
File "/home/jingming.guo/Mars/GPTQ-for-LLaMa/quant/custom_autotune.py", line 90, in
timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
File "/home/jingming.guo/Mars/GPTQ-for-LLaMa/quant/custom_autotune.py", line 72, in _bench
return triton.testing.do_bench(kernel_call, percentiles=(0.5, 0.2, 0.8), rep=40)
File "/home/anaconda3/envs/py39_ptq_jingming/lib/python3.9/site-packages/triton/testing.py", line 143, in do_bench
fn()
File "/home/jingming.guo/Mars/GPTQ-for-LLaMa/quant/custom_autotune.py", line 67, in kernel_call
self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current)
File "", line 41, in matmul_248_kernel
File "/home/anaconda3/envs/py39_ptq_jingming/lib/python3.9/site-packages/triton/compiler.py", line 1588, in compile
so_path = make_stub(name, signature, constants)
File "/home/anaconda3/envs/py39_ptq_jingming/lib/python3.9/site-packages/triton/compiler.py", line 1477, in make_stub
so = _build(name, src_path, tmpdir)
File "/home/anaconda3/envs/py39_ptq_jingming/lib/python3.9/site-packages/triton/compiler.py", line 1392, in _build
ret = subprocess.check_call(cc_cmd)
File "/home/anaconda3/envs/py39_ptq_jingming/lib/python3.9/subprocess.py", line 373, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmpaxcp747n/main.c', '-O3', '-I/usr/local/cuda/include', '-I/home/anaconda3/envs/py39_ptq_jingming/include/python3.9', '-I/tmp/tmpaxcp747n', '-shared', '-fPIC', '-lcuda', '-o', '/tmp/tmpaxcp747n/matmul_248_kernel.cpython-39-x86_64-linux-gnu.so', '-L/usr/share/man/man7']' returned non-zero exit status 1.
```
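For context: the underlying failure is visible earlier in the log — gcc is invoked with `-lcuda`, but the linker cannot find `libcuda.so` (`/usr/bin/ld: cannot find -lcuda`). The `-L/usr/share/man/man7` search path in the failing command also suggests Triton's library lookup matched something other than the actual driver library. Below is a minimal workaround sketch, assuming common Linux CUDA paths; the exact locations are assumptions and may differ on your system:

```bash
# Workaround sketch, assuming common Linux CUDA paths (verify on your machine).

# 1. Check whether the linker can see the CUDA driver library at all:
ldconfig -p | grep libcuda

# 2. If only libcuda.so.1 is listed, linking with -lcuda fails; point the
#    linker at the stubs copy shipped with the CUDA toolkit:
export LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LIBRARY_PATH

#    ...or create a libcuda.so symlink next to the real driver library
#    (the path below is an assumption; use the directory ldconfig reported):
sudo ln -s /usr/lib/x86_64-linux-gnu/libcuda.so.1 /usr/lib/x86_64-linux-gnu/libcuda.so

# 3. Re-run inference:
CUDA_VISIBLE_DEVICES=0 python llama_inference.py decapoda-research/llama-7b-hf \
    --wbits 4 --groupsize 128 --load llama7b-4bit-128g.pt --text "this is llama"
```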