From 23f3b0ae3f0c4876f55392fe643091263c23995e Mon Sep 17 00:00:00 2001
From: luoyu-intel
Date: Tue, 30 Apr 2024 15:21:56 +0800
Subject: [PATCH] refactor quantization data process in python

---
 neural_speed/convert/common.py                 | 18 +++++++++---------
 neural_speed/convert/convert_quantized_gptj.py | 12 ++++++------
 .../convert/convert_quantized_llama.py         |  6 +++---
 .../convert/convert_quantized_mistral.py       |  6 +++---
 .../convert/convert_quantized_mixtral.py       |  6 +++---
 5 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/neural_speed/convert/common.py b/neural_speed/convert/common.py
index 7ca30291a..9231da4b8 100644
--- a/neural_speed/convert/common.py
+++ b/neural_speed/convert/common.py
@@ -684,9 +684,9 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head,
         write_header(fout, shape[::-1], dst_name, GGML_QJBLAS_TYPE)

     if q_config['bits'] == 4:
-        int_weight = (int_weight - 8) * 16
-        gptq_scales = gptq_scales / 16
-        gptq_zeros = (gptq_zeros - 8) * 16
+        int_weight = (int_weight - 8)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 8)
     dst = np.zeros((int_weight.shape[0], int_weight.shape[1] * 4), dtype=np.int8)
     int_weight = np.ascontiguousarray(int_weight.numpy())
     gptq_scales = np.ascontiguousarray((gptq_scales.float()).numpy())
@@ -758,14 +758,14 @@ def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config):
     # Int3 is the same as int4, but offset=4, mul scale==32.
     weight_dtype = "int8"
     if q_config['bits'] == 4:
-        int_weight = (int_weight - 8) * 16
-        gptq_scales = gptq_scales / 16
-        gptq_zeros = (gptq_zeros - 8) * 16
+        int_weight = (int_weight - 8)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 8)
         weight_dtype = "int4"
     elif q_config['bits'] == 3:
-        int_weight = (int_weight - 4) * 32
-        gptq_scales = gptq_scales / 32
-        gptq_zeros = (gptq_zeros - 4) * 32
+        int_weight = (int_weight - 4)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 4)
         weight_dtype = "int3"
     else:
         ValueError(f"Unsupported q_config[bits]: {q_config['bits']}")
diff --git a/neural_speed/convert/convert_quantized_gptj.py b/neural_speed/convert/convert_quantized_gptj.py
index 85a1e769b..4db43af24 100644
--- a/neural_speed/convert/convert_quantized_gptj.py
+++ b/neural_speed/convert/convert_quantized_gptj.py
@@ -62,14 +62,14 @@ def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config):
     # Int3 is the same as int4, but offset=4, mul scale==32.
     weight_dtype = "int8"
     if q_config['bits'] == 4:
-        int_weight = (int_weight - 8) * 16
-        gptq_scales = gptq_scales / 16
-        gptq_zeros = (gptq_zeros - 8) * 16
+        int_weight = (int_weight - 8)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 8)
         weight_dtype = "int4"
     elif q_config['bits'] == 3:
-        int_weight = (int_weight - 4) * 32
-        gptq_scales = gptq_scales / 32
-        gptq_zeros = (gptq_zeros - 4) * 32
+        int_weight = (int_weight - 4)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 4)
         weight_dtype = "int3"

     dst = np.zeros((int_weight.shape[0], int_weight.shape[1] * 4), dtype=np.int8)
diff --git a/neural_speed/convert/convert_quantized_llama.py b/neural_speed/convert/convert_quantized_llama.py
index 9475f1adb..bebfed383 100644
--- a/neural_speed/convert/convert_quantized_llama.py
+++ b/neural_speed/convert/convert_quantized_llama.py
@@ -69,9 +69,9 @@ def convert_to_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_hea
     weight_dtype = "int8"
     if q_config['bits'] == 4:
-        int_weight = (int_weight - 8) * 16
-        gptq_scales = gptq_scales / 16
-        gptq_zeros = (gptq_zeros - 8) * 16
+        int_weight = (int_weight - 8)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 8)
         weight_dtype = "int4"

     dst = np.zeros((int_weight.shape[0], int_weight.shape[1] * 4), dtype=np.int8)
diff --git a/neural_speed/convert/convert_quantized_mistral.py b/neural_speed/convert/convert_quantized_mistral.py
index c89bcec41..c5e85c8e8 100644
--- a/neural_speed/convert/convert_quantized_mistral.py
+++ b/neural_speed/convert/convert_quantized_mistral.py
@@ -71,9 +71,9 @@ def convert_to_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_hea
     weight_dtype = "int8"
     if q_config['bits'] == 4:
-        int_weight = (int_weight - 8) * 16
-        gptq_scales = gptq_scales / 16
-        gptq_zeros = (gptq_zeros - 8) * 16
+        int_weight = (int_weight - 8)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 8)
         weight_dtype = "int4"

     dst = np.zeros((int_weight.shape[0], int_weight.shape[1] * 4), dtype=np.int8)
diff --git a/neural_speed/convert/convert_quantized_mixtral.py b/neural_speed/convert/convert_quantized_mixtral.py
index df793cda6..77ce866be 100644
--- a/neural_speed/convert/convert_quantized_mixtral.py
+++ b/neural_speed/convert/convert_quantized_mixtral.py
@@ -77,9 +77,9 @@ def convert_to_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_hea
     weight_dtype = "int8"
     if q_config['bits'] == 4:
-        int_weight = (int_weight - 8) * 16
-        gptq_scales = gptq_scales / 16
-        gptq_zeros = (gptq_zeros - 8) * 16
+        int_weight = (int_weight - 8)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 8)
        weight_dtype = "int4"

     dst = np.zeros((int_weight.shape[0], int_weight.shape[1] * 4), dtype=np.int8)
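Notes: across all five converters the patch removes the extra bit-shift that the Python
conversion step used to bake into the quantized data. Previously int4 weights and zeros
were multiplied by 16 (int3 by 32) with the scales divided by the same factor to
pre-shift values into the upper bits; after the patch only the signed re-centering
offset (-8 for int4, -4 for int3) is applied and the scales pass through unchanged,
presumably leaving any bit-layout packing to the BesTLA backend. A minimal sketch of
the post-patch transform, assuming numpy arrays; to_signed_weights is a hypothetical
helper for illustration, not part of neural_speed's API:

    import numpy as np

    def to_signed_weights(int_weight: np.ndarray, gptq_scales: np.ndarray,
                          gptq_zeros: np.ndarray, bits: int):
        """Re-center unsigned GPTQ integers around zero; scales stay untouched.

        Hypothetical illustration of the post-patch behaviour only.
        """
        if bits == 4:
            offset = 8   # int4: unsigned 0..15 -> signed -8..7
        elif bits == 3:
            offset = 4   # int3: unsigned 0..7  -> signed -4..3
        else:
            raise ValueError(f"Unsupported bits: {bits}")
        # Pre-patch, the converters additionally multiplied weights/zeros by 16
        # (int4) or 32 (int3) and divided the scales by the same factor.
        return int_weight - offset, gptq_scales, gptq_zeros - offset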