[Quant tool] Handle input models with pre-quantized weights (#22633)
### Description
Allows the QDQ quantizer to handle input models that already contain some
pre-quantized weights. In this case, the QDQ quantizer skips the
pre-quantized weights and quantizes the remaining tensors around them.

Also handles an operator (e.g., Conv) with a pre-quantized weight and a
float bias. The tool will read the pre-quantized weight's quantization
scale to compute the bias's scale (`bias_scale = input_scale *
weight_scale`).
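
As a rough illustration of that formula (the values below are made up, not taken from the PR), the bias is quantized to int32 with a zero point of 0:

```python
import numpy as np

# Hypothetical per-tensor scales, e.g. read from the input's quantization params and the
# pre-quantized weight's DequantizeLinear scale initializer.
input_scale = np.float32(0.02)
weight_scale = np.float32(0.005)
float_bias = np.array([0.25, -0.1, 0.37], dtype=np.float32)

bias_scale = input_scale * weight_scale  # 0.02 * 0.005 ~= 1e-4
int32_bias = np.round(float_bias / bias_scale).astype(np.int32)
print(int32_bias)  # [ 2500 -1000  3700]
```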

Input model (pre-quantized Conv weight):

![image](https://github.com/user-attachments/assets/7d2626e4-49ad-47ae-bd0e-6339ac590435)

Output QDQ model (everything is quantized):

![image](https://github.com/user-attachments/assets/393804d3-f042-47bd-895f-3d667fb2ae94)


### Motivation and Context
Customers may use external tools to quantize some weights (e.g., int4
for Conv/MatMul). In this case, the QDQ quantizer should still be able to
quantize the rest of the model (float weights and activations).
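
For context, a minimal usage sketch, assuming the standard `quantize_static` API; the file names, input name, and shapes are placeholders. The input model already contains `DequantizeLinear` nodes for some weights, and the quantizer fills in Q/DQ for everything else:

```python
import numpy as np
from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantType, quantize_static


class RandomDataReader(CalibrationDataReader):
    """Feeds a few random calibration samples (placeholder input name and shape)."""

    def __init__(self, num_samples=4):
        self._data = iter(
            [{"input": np.random.rand(1, 3, 224, 224).astype(np.float32)} for _ in range(num_samples)]
        )

    def get_next(self):
        return next(self._data, None)


# "model.pre_quant_weights.onnx" already holds weight -> DequantizeLinear for some Conv/MatMul
# weights; those tensors are left alone while the rest of the model is quantized to QDQ form.
quantize_static(
    "model.pre_quant_weights.onnx",
    "model.qdq.onnx",
    RandomDataReader(),
    quant_format=QuantFormat.QDQ,
    activation_type=QuantType.QUInt8,
    weight_type=QuantType.QInt8,
)
```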
adrianlizarraga authored Nov 14, 2024
1 parent 562ddce commit 0733733
Showing 3 changed files with 342 additions and 11 deletions.
5 changes: 5 additions & 0 deletions onnxruntime/python/tools/quantization/base_quantizer.py
@@ -19,7 +19,9 @@
 from .calibrate import TensorData
 from .onnx_model import ONNXModel
 from .quant_utils import (
+    DEQUANT_OP_NAME,
     ONNX_TYPE_TO_NP_TYPE,
+    QUANT_OP_NAME,
     TENSOR_NAME_QUANT_SUFFIX,
     find_by_name,
     model_has_infer_metadata,
@@ -178,6 +180,9 @@ def should_quantize_node(self, node):
         if node.op_type not in self.op_types_to_quantize:
             return False
 
+        if node.op_type in (DEQUANT_OP_NAME, QUANT_OP_NAME):
+            return False
+
         if self.nodes_to_exclude is not None and node.name in self.nodes_to_exclude:
             return False
 
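In effect, explicit QuantizeLinear/DequantizeLinear nodes already present in the input model are never selected for quantization themselves. A tiny sketch of the intent, with the constant values assumed to match `quant_utils`:

```python
# Assumed values of the quant_utils constants used above.
DEQUANT_OP_NAME = "DequantizeLinear"
QUANT_OP_NAME = "QuantizeLinear"


def is_existing_qdq_node(node) -> bool:
    # Pre-existing Q/DQ nodes are skipped so the tool does not try to re-quantize them.
    return node.op_type in (QUANT_OP_NAME, DEQUANT_OP_NAME)
```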
71 changes: 60 additions & 11 deletions onnxruntime/python/tools/quantization/qdq_quantizer.py
@@ -195,7 +195,11 @@ def __init__(
         # The default behavior is that multiple nodes can share a QDQ pair as their inputs.
         # In TRT, QDQ pair can`t be shared between nodes, so it will create dedicated QDQ pairs for each node.
         self.dedicated_qdq_pair = extra_options.get("DedicatedQDQPair", False)
-        self.tensor_to_its_receiving_nodes = {}
+        self.tensor_to_its_receiving_nodes: dict[str, list[onnx.NodeProto]] = {}
+
+        # Maps a tensor to the DequantizeLinear node (in the original input model) that outputs the tensor.
+        # Populated for input models with some pre-quantized weights (typically via a different tool).
+        self.tensor_to_producing_dq: dict[str, onnx.NodeProto] = {}
 
         # Let user set channel axis for specific op type and it's effective only when per channel quantization is supported and per_channel is True.
         self.qdq_op_type_per_channel_support_to_axis = extra_options.get("QDQOpTypePerChannelSupportToAxis", {})
@@ -555,6 +559,9 @@ def quantize_model(self):
             if tensor_name not in self.tensor_to_its_receiving_nodes:
                 self.tensor_to_its_receiving_nodes[tensor_name] = []
             self.tensor_to_its_receiving_nodes[tensor_name].append(node)
+            if node.op_type == DEQUANT_OP_NAME:
+                for tensor_name in node.output:
+                    self.tensor_to_producing_dq[tensor_name] = node
 
         self.initializer_quant_params = self._calc_initializer_quant_params()
         self._adjust_weight_quant_params_for_bias_tensors()
@@ -958,6 +965,14 @@ def _quantize_normal_tensors(self):
                 if initializer:
                     self._add_qdq_nodes_for_initializer(initializer)
                 else:
+                    # Check if this tensor is already a dequantized value. If so, skip it.
+                    # This happens if the original input model already has some pre-quantized weights
+                    # generated by a different tool.
+                    # Ex: (quantized_weight -> DequantizeLinear -> this_tensor)
+                    if tensor_name in self.tensor_to_producing_dq:
+                        del self.tensors_to_quantize[tensor_name]
+                        continue
+
                     tensor_qparam_initializers = self._make_tensor_scale_zp_initializers(tensor_name)
                     if not tensor_qparam_initializers:
                         raise ValueError(
@@ -1009,6 +1024,12 @@ def _quantize_sharing_param_tensors(self):
                 if self.is_input_a_initializer(tensor_name):
                     raise ValueError("Quantization parameter shared mode is not supported for weight yet")
 
+                if tensor_name in self.tensor_to_producing_dq:
+                    raise ValueError(
+                        f"Quantization parameter sharing is invalid for tensor {tensor_name} "
+                        "because it has already been quantized"
+                    )
+
                 # Need to check if this tensor's quant_type is converted for some consumers.
                 # If so, create new scale/zp initializers for these consumers.
                 converted_qparam_inits = None
@@ -1147,6 +1168,30 @@ def is_tensor_per_channel(
 
         return True, axis
 
+    def _get_tensor_quantization_scale(self, tensor_name: str, consumer_node_name: str) -> np.ndarray | None:
+        """
+        Returns the quantization scale of a tensor that is consumed by the given node.
+        :parameter tensor_name: The name of the tensor.
+        :parameter consumer_node_name: The name of the node that consumes the tensor as input. Necessary in case
+                                       the quantization type of the tensor was converted.
+                                       Refer: QDQQuantizer::_add_qdq_ops_for_converted_activation.
+        :returns: The quantization scale or None.
+        """
+        initializers = self.model.initializer()
+        scale_initializer: onnx.TensorProto | None = None
+
+        if tensor_name in self.quantized_value_map:
+            # Tensor was quantized by this tool, so get scale from initializer created by this tool run.
+            scale_name = self.quantized_value_map[tensor_name].get_for_consumer(consumer_node_name).scale_name
+            scale_initializer = find_by_name(scale_name, initializers)
+        else:
+            # Tensor was already quantized in original model, so get scale from DQ node that outputs the tensor.
+            dq_node = self.tensor_to_producing_dq.get(tensor_name, None)
+            if dq_node:
+                scale_initializer = find_by_name(dq_node.input[1], initializers)
+
+        return tensor_proto_to_array(scale_initializer) if scale_initializer is not None else None
+
     def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> str:
         """
         Quantized the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale
@@ -1156,17 +1201,21 @@ def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> str:
         if bias_name in self.quantized_value_map:
             return self.quantized_value_map[bias_name].original.q_name
 
-        # get scale for weight
-        weight_scale_name = self.quantized_value_map[bias_info.weight_name].original.scale_name
-        weight_scale_initializer = find_by_name(weight_scale_name, self.model.initializer())
-        weight_scale = tensor_proto_to_array(weight_scale_initializer)
+        # get scale for weight.
+        weight_scale = self._get_tensor_quantization_scale(bias_info.weight_name, bias_info.node_name)
+        if weight_scale is None:
+            raise ValueError(
+                f"Unable to get valid quantization scale for weight input '{bias_info.weight_name}' "
+                f"when quantizing bias '{bias_name}' to int32."
+            )
 
-        # get scale for input
-        input_scale_name = (
-            self.quantized_value_map[bias_info.input_name].get_for_consumer(bias_info.node_name).scale_name
-        )
-        input_scale_initializer = find_by_name(input_scale_name, self.model.initializer())
-        input_scale = tensor_proto_to_array(input_scale_initializer)
+        # get scale for input.
+        input_scale = self._get_tensor_quantization_scale(bias_info.input_name, bias_info.node_name)
+        if input_scale is None:
+            raise ValueError(
+                f"Unable to get valid quantization scale for input '{bias_info.input_name}' "
+                f"when quantizing bias '{bias_name}' to int32."
+            )
 
         (
             quantized_bias_name,
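For reference, a minimal sketch (hypothetical names and shapes) of the kind of input graph this change targets: an int8 weight with an existing `DequantizeLinear` feeding a `Conv` that still has a float bias, which the updated tool can now finish quantizing:

```python
import numpy as np
import onnx
from onnx import TensorProto, helper, numpy_helper

# Pre-quantized Conv weight: int8 initializer + DequantizeLinear already in the input model.
w_q = numpy_helper.from_array(np.zeros((8, 3, 3, 3), dtype=np.int8), "w_q")
w_scale = numpy_helper.from_array(np.array(0.005, dtype=np.float32), "w_scale")
bias = numpy_helper.from_array(np.zeros(8, dtype=np.float32), "bias")  # still float

dq = helper.make_node("DequantizeLinear", ["w_q", "w_scale"], ["w_dq"], name="w_dq_node")
conv = helper.make_node("Conv", ["X", "w_dq", "bias"], ["Y"], name="conv_node")

graph = helper.make_graph(
    [dq, conv],
    "pre_quantized_conv",
    [helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 3, 32, 32])],
    [helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 8, 30, 30])],
    initializer=[w_q, w_scale, bias],
)
onnx.save(helper.make_model(graph), "model.pre_quant_weights.onnx")
```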

