W4Afp8 export #378

Draft · wants to merge 14 commits into base: main
1 change: 1 addition & 0 deletions auto_round/__main__.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.insert(0, '../')

def run():
from auto_round.script.llm import setup_parser, tune, eval
2 changes: 2 additions & 0 deletions auto_round/auto_quantizer.py
@@ -545,6 +545,8 @@ def remove_device_str(s, device_str):
"via `pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@b8b4127`")

QuantLinear = dynamic_import_inference_linear(layer_backend, bits, group_size, sym)
# from auto_round_extension.cuda.qlinear_exllamav2_gptq import QuantLinear


layer_device = get_device(layer)

21 changes: 15 additions & 6 deletions auto_round/autoround.py
@@ -203,7 +203,6 @@ def __init__(
all_blocks = get_block_names(model)
self.quant_block_list = find_matching_blocks(model, all_blocks, self.to_quant_block_names)
self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device


##activation
self.act_group_size = act_group_size if not (act_group_size is None) else self.group_size
@@ -373,6 +372,8 @@ def dump_qinfo_to_layer_config(self):
for n, m in self.model.named_modules():
if n not in self.layer_config.keys():
continue
if hasattr(m, "orig_layer"):
m = m.orig_layer
if hasattr(m, "scale"):
self.layer_config[n]["scale"] = m.scale
self.layer_config[n]["zp"] = m.zp
@@ -862,8 +863,13 @@ def quant_layer(self, layer_name, inputs, q_inputs=None, device=torch.device("cp
if q_inputs is not None:
q_inputs[i] = q_inputs[i].to(layer.weight.dtype)

wrapper_linear = WrapperLinear(layer, enable_minmax_tuning=self.enable_minmax_tuning, device=device).to(
device)
wrapper_linear = WrapperLinear(
layer,
enable_minmax_tuning=self.enable_minmax_tuning,
device=device,
_inner_layer_name=layer_name,
).to(device)

round_params = []
minmax_params = []
for key in wrapper_linear.params.keys():
@@ -1029,10 +1035,11 @@ def quant_block(self, block, input_ids, input_others, q_input=None, device=torch
if hasattr(m, "orig_layer"):
for key in m.params.keys():
if "min" in key or "max" in key:
logger.info(f"add minmax param {key} in module{n} to optimizer, shape: {m.params[key].shape}")
minmax_params.append(m.params[key])
else:
logger.info(f"add minmax param {key} in module{n} to optimizer, shape {m.params[key].shape}")
round_params.append(m.params[key])

if self.enable_minmax_tuning:
optimizer = self.optimizer(
[{"params": round_params}, {"params": minmax_params, "lr": self.minmax_lr}], lr=self.lr, weight_decay=0
@@ -1073,6 +1080,7 @@ def quant_block(self, block, input_ids, input_others, q_input=None, device=torch
total_loss = 0

for i in range(self.iters):
logger.info(f"iter {i} / {self.iters}")
total_loss = 0
if self.sampler == "rand":
whole_indices = torch.randperm(nsamples)[:pick_samples]
@@ -1209,6 +1217,8 @@ def quant_blocks(
pbar = tqdm(range(0, len(block_names), nblocks))
# for i in pbar:
for i in range(len(block_names)):
if os.getenv("DEBUG_QUANT_BLOCK", "0") == "1" and i > 2:
break
if nblocks == 1:
n = block_names[i]
pbar.set_description(f"Quantizing {n}")
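
The early exit added in the hunk above is driven purely by an environment variable. A minimal usage sketch (not part of the diff), assuming an already-constructed AutoRound instance named autoround:

# Hypothetical debugging run: with the flag set, quant_blocks stops after block index 2.
import os
os.environ["DEBUG_QUANT_BLOCK"] = "1"  # read via os.getenv(...) == "1" in the hunk above
autoround.quantize()  # `autoround` is assumed to be an existing AutoRound object
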
@@ -1258,7 +1268,7 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
if not self.quantized:
logger.warning("please run autoround.quantize first")
return
if format == "fake" or format == "qdq" or self.act_bits <= 8: ##TODO fix act quantizaiton later
if format == "fake" or format == "qdq": ##TODO fix act quantizaiton later
self.model = self.model.to("cpu")
self.model.save_pretrained(output_dir)
if self.tokenizer is not None:
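
A hedged call sketch for the export path changed above, assuming a quantized AutoRound instance named autoround; the output directory and format value are illustrative, and only the save_quantized signature shown in the hunk header comes from the diff:

# After this change only format="fake" or format="qdq" takes the fake-export branch,
# so a model with act_bits <= 8 can be exported in a packed format instead.
autoround.quantize()
autoround.save_quantized(output_dir="./tmp_autoround", format="auto_round")  # illustrative values
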
@@ -1740,4 +1750,3 @@ def __init__(
optimizer=optimizer,
**kwargs,
)

36 changes: 36 additions & 0 deletions auto_round/config.py
@@ -0,0 +1,36 @@
import os

from dataclasses import dataclass


@dataclass
class GlobalConfig:
FP8_INPUT_BACKOFF: float = 1.0
FP8_WEIGHT_BACKOFF: float = 1.0
# enable weight_fp8_max_scale
ENABLE_WEIGHT_FP8_MAX_SCALE: bool = False
W4A8_PC: bool = False
W4A8_DYNAMIC: bool = False

def __repr__(self):
return (
f"GlobalConfig(FP8_INPUT_BACKOFF={self.FP8_INPUT_BACKOFF}, "
f"FP8_WEIGHT_BACKOFF={self.FP8_WEIGHT_BACKOFF}, "
f"ENABLE_WEIGHT_FP8_MAX_SCALE={self.ENABLE_WEIGHT_FP8_MAX_SCALE}, "
f"W4A8_PC={self.W4A8_PC}, "
f"W4A8_DYNAMIC={self.W4A8_DYNAMIC}"
f")"

)


# https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html?highlight=fp8#configuring-backoff-factors
# The default values are input_backoff=0.25 and weight_backoff=0.5
global_config = GlobalConfig()
global_config.FP8_INPUT_BACKOFF = float(os.environ.get("AR_FP8_INPUT_BACKOFF", 1.0))
global_config.FP8_WEIGHT_BACKOFF = float(os.environ.get("AR_FP8_WEIGHT_BACKOFF", 1.0))
global_config.ENABLE_WEIGHT_FP8_MAX_SCALE = os.environ.get("AR_ENABLE_WEIGHT_FP8_MAX_SCALE", "0") == "1"
global_config.W4A8_PC = os.environ.get("W4A8_PC", "0") == "1"
global_config.W4A8_DYNAMIC = os.environ.get("W4A8_DYNAMIC", "0") == "1"

print(global_config)
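
A brief usage sketch for the new config module; the backoff values below are illustrative (the Habana defaults cited in the comment above), and only the environment-variable names come from config.py:

# Set the variables before the first import of auto_round.config,
# since they are read once at module import time.
import os
os.environ["AR_FP8_INPUT_BACKOFF"] = "0.25"   # illustrative: Habana's documented input backoff
os.environ["AR_FP8_WEIGHT_BACKOFF"] = "0.5"   # illustrative: Habana's documented weight backoff
os.environ["W4A8_PC"] = "1"

from auto_round.config import global_config
print(global_config)  # reflects the overridden values
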