fix qbits issue
wenhuach21 committed Jun 8, 2024
1 parent 34274fb commit 65d73fc
Showing 2 changed files with 19 additions and 16 deletions.
31 changes: 17 additions & 14 deletions auto_round/auto_quantizer.py
@@ -316,7 +316,9 @@ def convert_model(self, model: nn.Module):
self._replace_by_quant_layers(model, layer_configs, backend)
return model

def _dynamic_import_inference_linear(self, bits, backend):
def _dynamic_import_inference_linear(self, bits, backend, device):
if (str(device) == "cpu" and not torch.cuda.is_available()) or "qbits" in backend:
return qlinear_qbits.QuantLinear
if bits == 4 and self.exllama2_available and "exllamav2" in backend:
from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear
else:
@@ -341,9 +343,10 @@ def _replace_by_quant_layers(self, module: nn.Module, layer_configs, backend):
data_type = config["data_type"]
if not (bits <= 8 and data_type == "int"):
continue
QuantLinear = self._dynamic_import_inference_linear(bits, backend)

layer = get_module(module, layer_name)
device = get_device(layer)
QuantLinear = self._dynamic_import_inference_linear(bits, backend, device)
if isinstance(layer, nn.Linear):
in_features = layer.in_features
out_features = layer.out_features
@@ -363,24 +366,24 @@ def _replace_by_quant_layers(self, module: nn.Module, layer_configs, backend):
weight_dtype=layer.weight.dtype,
)

if new_layer.qweight.device.type == "cpu": # fallback to qbits linear when qweight on cpu device
QuantLinear = qlinear_qbits.QuantLinear
new_layer = QuantLinear( # pylint: disable=E1123
bits,
group_size,
in_features,
out_features,
bias,
weight_dtype=layer.weight.dtype,
)
# if new_layer.qweight.device.type == "cpu": # fallback to qbits linear when qweight on cpu device
# QuantLinear = qlinear_qbits.QuantLinear
# new_layer = QuantLinear( # pylint: disable=E1123
# bits,
# group_size,
# in_features,
# out_features,
# bias,
# weight_dtype=layer.weight.dtype,
# )

new_layer.device = device
set_module(module, layer_name, new_layer)

def qbits_post_init(self, model):
dep_check = True
for layer in model.modules():
if isinstance(layer,qlinear_qbits.QuantLinear):
if isinstance(layer, qlinear_qbits.QuantLinear):
if dep_check:
layer.req_check()
layer.post_init()
@@ -408,7 +411,7 @@ class StoreAttr(object):
model = autoround_post_init(model)
# there are no side effects from calling qbits_post_init when the model's quant type is not qbits.
model = self.qbits_post_init(model)

return model

def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
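For context on the change above: the commit resolves the layer's device first (via get_device) and passes it into _dynamic_import_inference_linear, so the QBits kernel is chosen before the quantized layer is constructed instead of being swapped in afterwards based on new_layer.qweight.device. Below is a minimal, self-contained sketch of that dispatch logic; the three placeholder classes are stand-ins for the project's real kernel classes (qlinear_qbits.QuantLinear, the exllamav2 kernel, a generic fallback) and are assumptions for illustration only.

# Sketch of the device-aware kernel dispatch introduced by this commit.
# The placeholder classes below are assumptions, not the project's real imports.
import torch

class QBitsQuantLinear: ...        # stand-in for qlinear_qbits.QuantLinear (CPU / QBits path)
class ExllamaV2QuantLinear: ...    # stand-in for the CUDA exllamav2 kernel
class DefaultQuantLinear: ...      # stand-in for the generic fallback kernel

def pick_quant_linear(bits, backend, device, exllama2_available=True):
    """Mirrors _dynamic_import_inference_linear after this commit:
    the kernel class is decided from (bits, backend, device) up front."""
    # CPU-only environments, or an explicit "qbits" backend, use the QBits kernel.
    if (str(device) == "cpu" and not torch.cuda.is_available()) or "qbits" in backend:
        return QBitsQuantLinear
    # 4-bit layers prefer the exllamav2 CUDA kernel when it is available.
    if bits == 4 and exllama2_available and "exllamav2" in backend:
        return ExllamaV2QuantLinear
    return DefaultQuantLinear

# Example: on a machine without CUDA this returns the QBits stand-in, which is
# exactly the case the old qweight.device fallback used to patch up after the fact.
print(pick_quant_linear(4, "auto", torch.device("cpu")).__name__)

Deciding the class before construction avoids building a layer with the wrong backend and then rebuilding it, which is why the old qweight.device fallback block is left commented out rather than kept as a second code path.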
4 changes: 2 additions & 2 deletions auto_round_extension/qbits/qlinear_qbits.py
@@ -24,8 +24,8 @@
from intel_extension_for_transformers import qbits # noqa: F401
except Exception as e:
QBITS_AVAILABLE = False
logger.warning(
"qlinear_qbits should be used with Intel Extension for Transformers.")
# logger.warning(
# "qlinear_qbits should be used with Intel Extension for Transformers.")

BITS_DTYPE_MAPPING = {
2: "int2_clip",
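The second file keeps the guarded import of the QBits runtime but drops the import-time warning. A short sketch of that optional-dependency pattern, using only the package name and flag shown in the diff (the debug-level log is an assumption, not what the repository does):

# Sketch of the optional-dependency guard in qlinear_qbits.py: the import is
# attempted once, QBITS_AVAILABLE records the outcome, and any hard failure is
# deferred to the point where a QBits layer is actually used (req_check / post_init).
import logging

logger = logging.getLogger(__name__)

QBITS_AVAILABLE = True
try:
    from intel_extension_for_transformers import qbits  # noqa: F401
except Exception:
    QBITS_AVAILABLE = False
    # The commit silences the warning that used to fire here; a debug-level
    # message (an assumption in this sketch) keeps the information without the noise.
    logger.debug("qlinear_qbits should be used with Intel Extension for Transformers.")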
