This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

[CI]add int3&int5 test #275

Merged · 1 commit · Jun 3, 2024

6 changes: 5 additions & 1 deletion .github/workflows/scripts/models/cpp_graph_inference.sh
@@ -26,7 +26,7 @@ function main() {
 quant_script="./build/bin/quant_llama"
 infer_cmd="./build/bin/run_llama"
 input_model="/tf_dataset2/models/pytorch/Meta-Llama-3-8B"
-precision_list=("q4_j_b128" "q4_j_b32" "q4_0")
+precision_list=("q4_j_b128" "q4_j_b32" "q4_0" "q5_j_i8_pc_asym" "q3_j_i8_b128_asym")
 elif [[ "${model}" == "gpt-neox-20b" ]]; then
 convert_script="${scripts_dir}/convert_gptneox.py"
 quant_script="./build/bin/quant_gptneox"
@@ -129,6 +129,10 @@ function main() {
 ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int4 --group_size 32 --scale_dtype fp32 --compute_dtype fp32 --alg sym
 elif [[ ${precision} == "q4_j_b128" ]]; then
 ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int4 --group_size 128 --scale_dtype fp32 --compute_dtype fp32 --alg sym
+elif [[ ${precision} == "q3_j_i8_b128_asym" ]]; then
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int3 --group_size 128 --scale_dtype fp32 --compute_dtype int8 --alg asym
+elif [[ ${precision} == "q5_j_i8_pc_asym" ]]; then
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int5 --group_size -1 --scale_dtype fp32 --compute_dtype int8 --alg asym
 elif [[ ${precision} == "q4_j_b128_asym" ]]; then
 ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int4 --group_size 128 --scale_dtype fp32 --compute_dtype fp32 --alg asym
 elif [[ ${precision} == "q4_0" ]]; then
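The new tags follow the precision naming convention already used in these scripts: the leading qN is the weight bit-width (int3/int5 here), "i8" marks int8 compute_dtype, "b128" a group size of 128, "pc" per-channel scales (passed as --group_size -1), and "asym" the asymmetric quantization algorithm; the "j" infix presumably distinguishes this quantization path from the ggml-style q4_0. A minimal sketch of how one of these tags expands into a quant command, using a hypothetical precision_to_flags helper (flag values copied from the diff above; binary path, model file names, and thread count are illustrative):

# Hypothetical helper, illustration only (not part of this PR): map a precision
# tag to the quantization flags used in the CI script above.
precision_to_flags() {
    case "$1" in
        q4_j_b128)          echo "--weight_dtype int4 --group_size 128 --scale_dtype fp32 --compute_dtype fp32 --alg sym" ;;
        q3_j_i8_b128_asym)  echo "--weight_dtype int3 --group_size 128 --scale_dtype fp32 --compute_dtype int8 --alg asym" ;;
        q5_j_i8_pc_asym)    echo "--weight_dtype int5 --group_size -1 --scale_dtype fp32 --compute_dtype int8 --alg asym" ;;
        *)                  return 1 ;;
    esac
}

# Example: quantize a Llama checkpoint with the new int3 configuration.
./build/bin/quant_llama --model_file llama-fp32.bin \
    --out_file llama-q3_j_i8_b128_asym.bin --nthread 8 \
    $(precision_to_flags q3_j_i8_b128_asym)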
19 changes: 18 additions & 1 deletion tests/model-test/cpp_graph_inference.sh
@@ -206,11 +206,16 @@ function main() {
 quant_script="./build/bin/quant_llama"
 convert_script="${convert_script}/convert_llama.py"
 infer_cmd="./build/bin/run_llama"
+precision_list+=("q5_j_i8_g128" "q3_j_i8_g128" "q5_j_i8_g128_asym" "q3_j_i8_g128_asym"
+    "q3_j_i8_pc_asym" "q5_j_i8_pc_asym"
+)
 elif [[ "${model}" == "gptj-6b" ]]; then
 quant_script="./build/bin/quant_gptj"
 convert_script="${convert_script}/convert_gptj.py"
 infer_cmd="./build/bin/run_gptj"
-precision_list+=("q4_j1_i8_g128" "q4_j1_bf16_pc")
+precision_list+=("q4_j1_i8_g128" "q4_j1_bf16_pc" "q5_j_i8_g128" "q3_j_i8_g128" "q5_j_i8_g128_asym" "q3_j_i8_g128_asym"
+    "q3_j_i8_pc_asym" "q5_j_i8_pc_asym"
+)
 elif [[ "${model}" == "gpt-neox-20b" ]]; then
 quant_script="./build/bin/quant_gptneox"
 convert_script="${convert_script}/convert_gptneox.py"
@@ -421,6 +426,18 @@ function main() {
 eval "$quant_script_prologue --weight_dtype int4 --group_size 32 --compute_dtype int8 --scale_dtype fp32 --alg sym"
 elif [[ ${precision} == "q4_j_f32_g128" ]]; then
 eval "$quant_script_prologue --weight_dtype int4 --group_size 128 --compute_dtype fp32 --scale_dtype fp32 --alg sym"
+elif [[ ${precision} == "q3_j_i8_g128" ]]; then
+eval "$quant_script_prologue --weight_dtype int3 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg sym"
+elif [[ ${precision} == "q5_j_i8_g128" ]]; then
+eval "$quant_script_prologue --weight_dtype int5 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg sym"
+elif [[ ${precision} == "q3_j_i8_g128_asym" ]]; then
+eval "$quant_script_prologue --weight_dtype int3 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg asym"
+elif [[ ${precision} == "q5_j_i8_g128_asym" ]]; then
+eval "$quant_script_prologue --weight_dtype int5 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg asym"
+elif [[ ${precision} == "q3_j_i8_pc_asym" ]]; then
+eval "$quant_script_prologue --weight_dtype int3 --group_size -1 --compute_dtype int8 --scale_dtype fp32 --alg asym"
+elif [[ ${precision} == "q5_j_i8_pc_asym" ]]; then
+eval "$quant_script_prologue --weight_dtype int5 --group_size -1 --compute_dtype int8 --scale_dtype fp32 --alg asym"
 elif [[ ${precision} == "q4_j1_i8_g128" ]]; then
 eval "$quant_script_prologue --weight_dtype int4 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg asym"
 elif [[ ${precision} == "q4_j1_bf16_pc" ]]; then
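All six new branches share one shape: only --weight_dtype (int3 or int5), --group_size (128, or -1 for the per-channel "_pc_" variants), and --alg (sym or asym) change, while --compute_dtype int8 and --scale_dtype fp32 stay fixed. A table-driven sketch of the same mapping (illustration only; the script itself keeps the explicit elif chain shown above, consistent with its existing branches):

# Illustration only (not part of this PR): the same six configurations expressed
# as a lookup table instead of an elif chain (requires bash 4+ associative arrays).
declare -A precision_flags=(
    [q3_j_i8_g128]="--weight_dtype int3 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg sym"
    [q5_j_i8_g128]="--weight_dtype int5 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg sym"
    [q3_j_i8_g128_asym]="--weight_dtype int3 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg asym"
    [q5_j_i8_g128_asym]="--weight_dtype int5 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg asym"
    [q3_j_i8_pc_asym]="--weight_dtype int3 --group_size -1 --compute_dtype int8 --scale_dtype fp32 --alg asym"
    [q5_j_i8_pc_asym]="--weight_dtype int5 --group_size -1 --compute_dtype int8 --scale_dtype fp32 --alg asym"
)
# Look up the flags for the current precision tag and run the quant step.
if [[ -n "${precision_flags[$precision]+set}" ]]; then
    eval "$quant_script_prologue ${precision_flags[$precision]}"
fi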