From 23f3b0ae3f0c4876f55392fe643091263c23995e Mon Sep 17 00:00:00 2001
From: luoyu-intel
Date: Tue, 30 Apr 2024 15:21:56 +0800
Subject: [PATCH] refactor quantization data process in python

---
 neural_speed/convert/common.py                 | 18 +++++++++---------
 neural_speed/convert/convert_quantized_gptj.py | 12 ++++++------
 .../convert/convert_quantized_llama.py         |  6 +++---
 .../convert/convert_quantized_mistral.py       |  6 +++---
 .../convert/convert_quantized_mixtral.py       |  6 +++---
 5 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/neural_speed/convert/common.py b/neural_speed/convert/common.py
index 7ca30291a..9231da4b8 100644
--- a/neural_speed/convert/common.py
+++ b/neural_speed/convert/common.py
@@ -684,9 +684,9 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head,
         write_header(fout, shape[::-1], dst_name, GGML_QJBLAS_TYPE)

     if q_config['bits'] == 4:
-        int_weight = (int_weight - 8) * 16
-        gptq_scales = gptq_scales / 16
-        gptq_zeros = (gptq_zeros - 8) * 16
+        int_weight = (int_weight - 8)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 8)
     dst = np.zeros((int_weight.shape[0], int_weight.shape[1] * 4), dtype=np.int8)
     int_weight = np.ascontiguousarray(int_weight.numpy())
     gptq_scales = np.ascontiguousarray((gptq_scales.float()).numpy())
@@ -758,14 +758,14 @@ def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config):
     # Int3 is the same as int4, but offset=4, mul scale==32.
     weight_dtype = "int8"
     if q_config['bits'] == 4:
-        int_weight = (int_weight - 8) * 16
-        gptq_scales = gptq_scales / 16
-        gptq_zeros = (gptq_zeros - 8) * 16
+        int_weight = (int_weight - 8)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 8)
         weight_dtype = "int4"
     elif q_config['bits'] == 3:
-        int_weight = (int_weight - 4) * 32
-        gptq_scales = gptq_scales / 32
-        gptq_zeros = (gptq_zeros - 4) * 32
+        int_weight = (int_weight - 4)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 4)
         weight_dtype = "int3"
     else:
         ValueError(f"Unsupported q_config[bits]: {q_config['bits']}")
diff --git a/neural_speed/convert/convert_quantized_gptj.py b/neural_speed/convert/convert_quantized_gptj.py
index 85a1e769b..4db43af24 100644
--- a/neural_speed/convert/convert_quantized_gptj.py
+++ b/neural_speed/convert/convert_quantized_gptj.py
@@ -62,14 +62,14 @@ def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config):
     # Int3 is the same as int4, but offset=4, mul scale==32.
     weight_dtype = "int8"
     if q_config['bits'] == 4:
-        int_weight = (int_weight - 8) * 16
-        gptq_scales = gptq_scales / 16
-        gptq_zeros = (gptq_zeros - 8) * 16
+        int_weight = (int_weight - 8)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 8)
         weight_dtype = "int4"
     elif q_config['bits'] == 3:
-        int_weight = (int_weight - 4) * 32
-        gptq_scales = gptq_scales / 32
-        gptq_zeros = (gptq_zeros - 4) * 32
+        int_weight = (int_weight - 4)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 4)
         weight_dtype = "int3"

     dst = np.zeros((int_weight.shape[0], int_weight.shape[1] * 4), dtype=np.int8)
diff --git a/neural_speed/convert/convert_quantized_llama.py b/neural_speed/convert/convert_quantized_llama.py
index 9475f1adb..bebfed383 100644
--- a/neural_speed/convert/convert_quantized_llama.py
+++ b/neural_speed/convert/convert_quantized_llama.py
@@ -69,9 +69,9 @@ def convert_to_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_hea
     weight_dtype = "int8"
     if q_config['bits'] == 4:
-        int_weight = (int_weight - 8) * 16
-        gptq_scales = gptq_scales / 16
-        gptq_zeros = (gptq_zeros - 8) * 16
+        int_weight = (int_weight - 8)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 8)
         weight_dtype = "int4"

     dst = np.zeros((int_weight.shape[0], int_weight.shape[1] * 4), dtype=np.int8)
diff --git a/neural_speed/convert/convert_quantized_mistral.py b/neural_speed/convert/convert_quantized_mistral.py
index c89bcec41..c5e85c8e8 100644
--- a/neural_speed/convert/convert_quantized_mistral.py
+++ b/neural_speed/convert/convert_quantized_mistral.py
@@ -71,9 +71,9 @@ def convert_to_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_hea
     weight_dtype = "int8"
     if q_config['bits'] == 4:
-        int_weight = (int_weight - 8) * 16
-        gptq_scales = gptq_scales / 16
-        gptq_zeros = (gptq_zeros - 8) * 16
+        int_weight = (int_weight - 8)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 8)
         weight_dtype = "int4"

     dst = np.zeros((int_weight.shape[0], int_weight.shape[1] * 4), dtype=np.int8)
diff --git a/neural_speed/convert/convert_quantized_mixtral.py b/neural_speed/convert/convert_quantized_mixtral.py
index df793cda6..77ce866be 100644
--- a/neural_speed/convert/convert_quantized_mixtral.py
+++ b/neural_speed/convert/convert_quantized_mixtral.py
@@ -77,9 +77,9 @@ def convert_to_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_hea
     weight_dtype = "int8"
     if q_config['bits'] == 4:
-        int_weight = (int_weight - 8) * 16
-        gptq_scales = gptq_scales / 16
-        gptq_zeros = (gptq_zeros - 8) * 16
+        int_weight = (int_weight - 8)
+        gptq_scales = gptq_scales
+        gptq_zeros = (gptq_zeros - 8)
        weight_dtype = "int4"

     dst = np.zeros((int_weight.shape[0], int_weight.shape[1] * 4), dtype=np.int8)
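Notes: across all five converters the patch removes the extra bit-shift that the Python
conversion step used to bake into the quantized data. Previously int4 weights and zeros
were multiplied by 16 (int3 by 32) with the scales divided by the same factor to
pre-shift values into the upper bits; after the patch only the signed re-centering
offset (-8 for int4, -4 for int3) is applied and the scales pass through unchanged,
presumably leaving any bit-layout packing to the BesTLA backend. A minimal sketch of
the post-patch transform, assuming numpy arrays; to_signed_weights is a hypothetical
helper for illustration, not part of neural_speed's API:

    import numpy as np

    def to_signed_weights(int_weight: np.ndarray, gptq_scales: np.ndarray,
                          gptq_zeros: np.ndarray, bits: int):
        """Re-center unsigned GPTQ integers around zero; scales stay untouched.

        Hypothetical illustration of the post-patch behaviour only.
        """
        if bits == 4:
            offset = 8   # int4: unsigned 0..15 -> signed -8..7
        elif bits == 3:
            offset = 4   # int3: unsigned 0..7  -> signed -4..3
        else:
            raise ValueError(f"Unsupported bits: {bits}")
        # Pre-patch, the converters additionally multiplied weights/zeros by 16
        # (int4) or 32 (int3) and divided the scales by the same factor.
        return int_weight - offset, gptq_scales, gptq_zeros - offset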