fix qbits issue
wenhuach21 committed Jun 8, 2024
1 parent 34274fb commit 65d73fc
Showing 2 changed files with 19 additions and 16 deletions.
31 changes: 17 additions & 14 deletions auto_round/auto_quantizer.py
@@ -316,7 +316,9 @@ def convert_model(self, model: nn.Module):
self._replace_by_quant_layers(model, layer_configs, backend)
return model

def _dynamic_import_inference_linear(self, bits, backend):
def _dynamic_import_inference_linear(self, bits, backend, device):
if (str(device) == "cpu" and not torch.cuda.is_available()) or "qbits" in backend:
return qlinear_qbits.QuantLinear
if bits == 4 and self.exllama2_available and "exllamav2" in backend:
from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear
else:
@@ -341,9 +343,10 @@ def _replace_by_quant_layers(self, module: nn.Module, layer_configs, backend):
data_type = config["data_type"]
if not (bits <= 8 and data_type == "int"):
continue
QuantLinear = self._dynamic_import_inference_linear(bits, backend)

layer = get_module(module, layer_name)
device = get_device(layer)
QuantLinear = self._dynamic_import_inference_linear(bits, backend, device)
if isinstance(layer, nn.Linear):
in_features = layer.in_features
out_features = layer.out_features
@@ -363,24 +366,24 @@ def _replace_by_quant_layers(self, module: nn.Module, layer_configs, backend):
weight_dtype=layer.weight.dtype,
)

if new_layer.qweight.device.type == "cpu": # fallback to qbits linear when qweight on cpu device
QuantLinear = qlinear_qbits.QuantLinear
new_layer = QuantLinear( # pylint: disable=E1123
bits,
group_size,
in_features,
out_features,
bias,
weight_dtype=layer.weight.dtype,
)
# if new_layer.qweight.device.type == "cpu": # fallback to qbits linear when qweight on cpu device
# QuantLinear = qlinear_qbits.QuantLinear
# new_layer = QuantLinear( # pylint: disable=E1123
# bits,
# group_size,
# in_features,
# out_features,
# bias,
# weight_dtype=layer.weight.dtype,
# )

new_layer.device = device
set_module(module, layer_name, new_layer)

def qbits_post_init(self, model):
dep_check = True
for layer in model.modules():
if isinstance(layer,qlinear_qbits.QuantLinear):
if isinstance(layer, qlinear_qbits.QuantLinear):
if dep_check:
layer.req_check()
layer.post_init()
@@ -408,7 +411,7 @@ class StoreAttr(object):
model = autoround_post_init(model)
# there are no side effects from calling qbits_post_init when the model's quant type is not qbits.
model = self.qbits_post_init(model)

return model

def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
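For context on the change above: the commit resolves the layer's device first (via get_device) and passes it into _dynamic_import_inference_linear, so the QBits kernel is chosen before the quantized layer is constructed instead of being swapped in afterwards based on new_layer.qweight.device. Below is a minimal, self-contained sketch of that dispatch logic; the three placeholder classes are stand-ins for the project's real kernel classes (qlinear_qbits.QuantLinear, the exllamav2 kernel, a generic fallback) and are assumptions for illustration only.

# Sketch of the device-aware kernel dispatch introduced by this commit.
# The placeholder classes below are assumptions, not the project's real imports.
import torch

class QBitsQuantLinear: ...        # stand-in for qlinear_qbits.QuantLinear (CPU / QBits path)
class ExllamaV2QuantLinear: ...    # stand-in for the CUDA exllamav2 kernel
class DefaultQuantLinear: ...      # stand-in for the generic fallback kernel

def pick_quant_linear(bits, backend, device, exllama2_available=True):
    """Mirrors _dynamic_import_inference_linear after this commit:
    the kernel class is decided from (bits, backend, device) up front."""
    # CPU-only environments, or an explicit "qbits" backend, use the QBits kernel.
    if (str(device) == "cpu" and not torch.cuda.is_available()) or "qbits" in backend:
        return QBitsQuantLinear
    # 4-bit layers prefer the exllamav2 CUDA kernel when it is available.
    if bits == 4 and exllama2_available and "exllamav2" in backend:
        return ExllamaV2QuantLinear
    return DefaultQuantLinear

# Example: on a machine without CUDA this returns the QBits stand-in, which is
# exactly the case the old qweight.device fallback used to patch up after the fact.
print(pick_quant_linear(4, "auto", torch.device("cpu")).__name__)

Deciding the class before construction avoids building a layer with the wrong backend and then rebuilding it, which is why the old qweight.device fallback block is left commented out rather than kept as a second code path.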
4 changes: 2 additions & 2 deletions auto_round_extension/qbits/qlinear_qbits.py
@@ -24,8 +24,8 @@
from intel_extension_for_transformers import qbits # noqa: F401
except Exception as e:
QBITS_AVAILABLE = False
logger.warning(
"qlinear_qbits should be used with Intel Extension for Transformers.")
# logger.warning(
# "qlinear_qbits should be used with Intel Extension for Transformers.")

BITS_DTYPE_MAPPING = {
2: "int2_clip",
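The second file keeps the guarded import of the QBits runtime but drops the import-time warning. A short sketch of that optional-dependency pattern, using only the package name and flag shown in the diff (the debug-level log is an assumption, not what the repository does):

# Sketch of the optional-dependency guard in qlinear_qbits.py: the import is
# attempted once, QBITS_AVAILABLE records the outcome, and any hard failure is
# deferred to the point where a QBits layer is actually used (req_check / post_init).
import logging

logger = logging.getLogger(__name__)

QBITS_AVAILABLE = True
try:
    from intel_extension_for_transformers import qbits  # noqa: F401
except Exception:
    QBITS_AVAILABLE = False
    # The commit silences the warning that used to fire here; a debug-level
    # message (an assumption in this sketch) keeps the information without the noise.
    logger.debug("qlinear_qbits should be used with Intel Extension for Transformers.")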
