From 4b8f6dcbb69ee9c74330d7785fe5b7ef656a94f5 Mon Sep 17 00:00:00 2001
From: Adrian Lizarraga
Date: Wed, 31 Jul 2024 21:05:11 -0700
Subject: [PATCH] [QNN EP] Improve INT4 accuracy (#21582)

### Description
Masks off the top 4 bits of INT4 weights, improving accuracy.

### Motivation and Context
This is a workaround, as the QNN docs state that masking should not be required.

---
 .../qnn/builder/qnn_model_wrapper.cc          |   6 +
 onnxruntime/test/providers/qnn/conv_test.cc   |   5 +-
 .../test/providers/qnn/matmul_test.cpp        | 154 ++++++++++++++++--
 3 files changed, 151 insertions(+), 14 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
index c8537307ef3ba..9d3f460572d84 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
@@ -617,6 +617,12 @@ Status QnnModelWrapper::UnpackInitializerData(const ONNX_NAMESPACE::TensorProto&
     auto dst = gsl::make_span(reinterpret_cast<int8_t*>(unpacked_tensor.data()), unpacked_tensor.size());
     auto src = gsl::make_span(reinterpret_cast<const Int4x2*>(packed_int4_bytes.data()), packed_int4_bytes.size());
     ORT_RETURN_IF_NOT(Int4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN");
+
+    // NOTE: Masking off the top 4 bits to work around a QNN INT4 accuracy bug.
+    // Docs explicitly state that masking off the top 4 bits should not be required.
+    for (size_t i = 0; i < dst.size(); i++) {
+      dst[i] &= 0x0F;  // -3 (0b1111_1101) becomes 13 (0b0000_1101)
+    }
   } else if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_UINT4) {
     TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer);
     const size_t num_elems = shape.Size();
diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc
index 99636976b9c05..35889c9fa2307 100644
--- a/onnxruntime/test/providers/qnn/conv_test.cc
+++ b/onnxruntime/test/providers/qnn/conv_test.cc
@@ -799,7 +799,7 @@ TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel_NegativeWeightQuantAxis) {
 // CPU EP (f32 model): 25.143 21.554 17.964 10.785 7.195 3.605 -3.574 -7.164 -10.753
 // CPU EP (qdq model): 24.670 21.103 17.536 10.254 6.689 2.972 -4.161 -7.728 -10.700
 // QNN EP (qdq model): 27.186 27.186 27.186 21.541 6.685 -8.022 -10.548 -10.548 -10.548
-TEST_F(QnnHTPBackendTests, DISABLED_ConvU16S4S32_PerChannel_AccuracyIssue) {
+TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel_AccuracyIssue) {
   std::vector<int64_t> input_shape = {1, 2, 4, 4};
   std::vector<int64_t> weight_shape = {3, 2, 2, 2};
   std::vector<int64_t> bias_shape = {3};
@@ -835,7 +835,8 @@ TEST_F(QnnHTPBackendTests, DISABLED_ConvU16S4S32_PerChannel_AccuracyIssue) {
                                  "NOTSET",
                                  ExpectedEPNodeAssignment::All,
                                  false,  // use_qdq_contrib_ops
-                                 21);    // opset
+                                 21,     // opset
+                                 QDQTolerance(0.005f));
 }
 
 // Test per-channel QDQ Conv is rejected with weight axis != 0
diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp
index dba60b1041696..d8c34d6a6c6ed 100644
--- a/onnxruntime/test/providers/qnn/matmul_test.cpp
+++ b/onnxruntime/test/providers/qnn/matmul_test.cpp
@@ -28,26 +28,25 @@ static GetTestModelFn BuildMatMulOpTestCase(const TestInputDef<float>& input1_de
 
 // Returns a function that creates a graph with a QDQ MatMul operator.
 template <typename Input0QType, typename Input1QType, typename OutputQType>
-static GetTestQDQModelFn<OutputQType> BuildMatMulOpQDQTestCase(const TestInputDef<float>& input1_def,
-                                                               const TestInputDef<float>& input2_def,
+static GetTestQDQModelFn<OutputQType> BuildMatMulOpQDQTestCase(const TestInputDef<float>& input0_def,
+                                                               const TestInputDef<float>& input1_def,
                                                                bool use_contrib_qdq) {
-  return [input1_def, input2_def, use_contrib_qdq](ModelTestBuilder& builder,
+  return [input0_def, input1_def, use_contrib_qdq](ModelTestBuilder& builder,
                                                    std::vector<QuantParams<OutputQType>>& output_qparams) {
     // input1 -> Q -> DQ ->
-    NodeArg* input1 = MakeTestInput(builder, input1_def);
-    QuantParams<Input0QType> input1_qparams = GetTestInputQuantParams<Input0QType>(input1_def);
-    auto* input1_qdq = AddQDQNodePair<Input0QType>(builder, input1, input1_qparams.scale, input1_qparams.zero_point,
+    NodeArg* input0 = MakeTestInput(builder, input0_def);
+    QuantParams<Input0QType> input0_qparams = GetTestInputQuantParams<Input0QType>(input0_def);
+    auto* input0_qdq = AddQDQNodePair<Input0QType>(builder, input0, input0_qparams.scale, input0_qparams.zero_point,
                                                    use_contrib_qdq);
-
-    // input2 -> Q -> DQ ->
-    NodeArg* input2 = MakeTestInput(builder, input2_def);
-    QuantParams<Input1QType> input2_qparams = GetTestInputQuantParams<Input1QType>(input2_def);
-    auto* input2_qdq = AddQDQNodePair<Input1QType>(builder, input2, input2_qparams.scale, input2_qparams.zero_point,
+    // input1 -> Q -> DQ ->
+    NodeArg* input1 = MakeTestInput(builder, input1_def);
+    QuantParams<Input1QType> input1_qparams = GetTestInputQuantParams<Input1QType>(input1_def);
+    auto* input1_qdq = AddQDQNodePair<Input1QType>(builder, input1, input1_qparams.scale, input1_qparams.zero_point,
                                                    use_contrib_qdq);
 
     // MatMul
     auto* op_output = builder.MakeIntermediate();
-    builder.AddNode("MatMul", {input1_qdq, input2_qdq}, {op_output});
+    builder.AddNode("MatMul", {input0_qdq, input1_qdq}, {op_output});
 
     // op_output -> Q -> DQ -> output
     AddQDQNodePairWithOutputAsGraphOutput(builder, op_output, output_qparams[0].scale,
@@ -55,6 +54,88 @@ static GetTestQDQModelFn<OutputQType> BuildMatMulOpQDQTestCase(const TestInputDe
   };
 }
 
+template <typename ActivationQType, typename WeightQType>
+static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelMatMulTestCase(const TestInputDef<float>& input_def,
+                                                                           const TestInputDef<float>& weights_def,
+                                                                           int64_t weight_quant_axis,
+                                                                           bool use_contrib_qdq = false) {
+  return [input_def, weights_def, weight_quant_axis,
+          use_contrib_qdq](ModelTestBuilder& builder,
+                           std::vector<QuantParams<ActivationQType>>& output_qparams) {
+    std::vector<NodeArg*> matmul_inputs;
+
+    // input -> Q/DQ ->
+    auto* input = MakeTestInput(builder, input_def);
+    QuantParams<ActivationQType> input_qparams = GetTestInputQuantParams<ActivationQType>(input_def);
+    auto* input_qdq = AddQDQNodePair<ActivationQType>(builder, input, input_qparams.scale, input_qparams.zero_point,
+                                                      use_contrib_qdq);
+    matmul_inputs.push_back(input_qdq);
+
+    // Quantized(weights) -> DQ ->
+    ORT_ENFORCE(weights_def.IsInitializer() && weights_def.IsRawData());
+    std::vector<float> weight_scales;
+    std::vector<WeightQType> weight_zero_points;
+    TensorShape weights_shape = weights_def.GetTensorShape();
+    int64_t pos_weight_quant_axis = weight_quant_axis;
+    if (pos_weight_quant_axis < 0) {
+      pos_weight_quant_axis += static_cast<int64_t>(weights_shape.NumDimensions());
+    }
+    GetTestInputQuantParamsPerChannel<WeightQType>(weights_def, weight_scales, weight_zero_points,
+                                                   static_cast<size_t>(pos_weight_quant_axis), true);
+
+    std::vector<WeightQType> quantized_weights;
+    size_t num_weight_storage_elems = weights_shape.Size();
+    if constexpr (std::is_same_v<WeightQType, Int4x2> || std::is_same_v<WeightQType, UInt4x2>) {
+      num_weight_storage_elems = Int4x2::CalcNumInt4Pairs(weights_shape.Size());
+    }
+    quantized_weights.resize(num_weight_storage_elems);
+    QuantizeValues<WeightQType>(weights_def.GetRawData(), quantized_weights, weights_shape,
+                                weight_scales, weight_zero_points, pos_weight_quant_axis);
+
+    NodeArg* weights_initializer = builder.MakeInitializer<WeightQType>(weights_def.GetShape(), quantized_weights);
+    NodeArg* weights_dq = builder.MakeIntermediate();
+    Node& weights_dq_node = builder.AddDequantizeLinearNode<WeightQType>(weights_initializer, weight_scales,
+                                                                         weight_zero_points, weights_dq,
+                                                                         nullptr, use_contrib_qdq);
+    weights_dq_node.AddAttribute("axis", weight_quant_axis);
+    matmul_inputs.push_back(weights_dq);
+
+    auto* matmul_output = builder.MakeIntermediate();
+    builder.AddNode("MatMul", matmul_inputs, {matmul_output});
+
+    AddQDQNodePairWithOutputAsGraphOutput(builder, matmul_output, output_qparams[0].scale,
+                                          output_qparams[0].zero_point, use_contrib_qdq);
+  };
+}
+
+// Runs a QDQ per-channel MatMul model on the QNN HTP backend. Checks the graph node assignment, and that the
+// QDQ model is accurate on QNN EP (compared to CPU EP).
+template <typename ActivationQType, typename WeightQType>
+static void RunQDQPerChannelMatMulOpOpTest(const TestInputDef<float>& input_def,
+                                           const TestInputDef<float>& weights_def,
+                                           int64_t weight_quant_axis,
+                                           ExpectedEPNodeAssignment expected_ep_assignment,
+                                           int opset = 21,
+                                           bool use_contrib_qdq = false,
+                                           QDQTolerance tolerance = QDQTolerance()) {
+  ProviderOptions provider_options;
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  TestQDQModelAccuracy(BuildMatMulOpTestCase(input_def, weights_def),
+                       BuildQDQPerChannelMatMulTestCase<ActivationQType, WeightQType>(input_def,
+                                                                                      weights_def,
+                                                                                      weight_quant_axis,
+                                                                                      use_contrib_qdq),
+                       provider_options,
+                       opset,
+                       expected_ep_assignment,
+                       tolerance);
+}
+
 // Runs an MatMul model on the QNN CPU backend. Checks the graph node assignment, and that inference
 // outputs for QNN and CPU match.
 static void RunMatMulOpOpTest(const TestInputDef<float>& input1_def,
@@ -160,6 +241,55 @@ TEST_F(QnnHTPBackendTests, MatMulOp_HTP_A16_W8Static) {
                             true);  // Use com.microsoft Q/DQ ops
 }
 
+// Test QDQ per-channel MatMul with 16-bit act, signed 4-bit weights (static)
+TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_A16_WeightInt4) {
+  std::vector<float> input0_data = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
+  std::vector<float> input1_data = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};
+  RunQDQPerChannelMatMulOpOpTest<uint16_t, Int4x2>(TestInputDef<float>({1, 1, 2, 3}, false, input0_data),
+                                                   TestInputDef<float>({1, 1, 3, 2}, true, input1_data),
+                                                   1,  // quantization axis
+                                                   ExpectedEPNodeAssignment::All,
+                                                   21,
+                                                   false);
+}
+
+// Test QDQ per-channel MatMul with 16-bit act, unsigned 4-bit weights (static)
+TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_A16_WeightUInt4) {
+  std::vector<float> input0_data = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
+  std::vector<float> input1_data = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};
+  RunQDQPerChannelMatMulOpOpTest<uint16_t, UInt4x2>(TestInputDef<float>({1, 1, 2, 3}, false, input0_data),
+                                                    TestInputDef<float>({1, 1, 3, 2}, true, input1_data),
+                                                    1,  // quantization axis
+                                                    ExpectedEPNodeAssignment::All,
+                                                    21,
+                                                    false);
+}
+
+// Test QDQ per-channel MatMul with int8 act, int4 weights (static)
+TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_AS8_WeightInt4) {
+  std::vector<float> input0_data = GetFloatDataInRange(-5.0f, 5.0f, 6);
+  std::vector<float> input1_data = {-2.0f, -1.0f, -0.5f, 0.0f, 1.0f, 2.0f};
+  RunQDQPerChannelMatMulOpOpTest<int8_t, Int4x2>(TestInputDef<float>({1, 1, 2, 3}, false, input0_data),
+                                                 TestInputDef<float>({1, 1, 3, 2}, true, input1_data),
+                                                 1,  // quantization axis
+                                                 ExpectedEPNodeAssignment::All,
+                                                 21,
+                                                 false,
+                                                 QDQTolerance(0.007f));
+}
+
+// Test QDQ per-channel MatMul with 16-bit act, int8 weights (static)
+TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_A16_WeightInt8) {
+  std::vector<float> input0_data = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
+  std::vector<float> input1_data = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};
+  RunQDQPerChannelMatMulOpOpTest<uint16_t, int8_t>(TestInputDef<float>({1, 1, 2, 3}, false, input0_data),
+                                                   TestInputDef<float>({1, 1, 3, 2}, true, input1_data),
+                                                   1,  // quantization axis
+                                                   ExpectedEPNodeAssignment::All,
+                                                   21,
+                                                   false);
+}
+
 // Test QDQ MatMul with uint16 activation uint16 weights, both dynamic
 // Inaccuracy detected for output 'output_0', element 1.
 // Output quant params: scale=0.0015259021893143654, zero_point=0.