From 4b8f6dcbb69ee9c74330d7785fe5b7ef656a94f5 Mon Sep 17 00:00:00 2001
From: Adrian Lizarraga
Date: Wed, 31 Jul 2024 21:05:11 -0700
Subject: [PATCH] [QNN EP] Improve INT4 accuracy (#21582)

### Description
Masks off the top 4 bits of INT4 weights, improving accuracy.

### Motivation and Context
This is a workaround, as the QNN docs state that masking should not be required.

---
 .../qnn/builder/qnn_model_wrapper.cc          |   6 +
 onnxruntime/test/providers/qnn/conv_test.cc   |   5 +-
 .../test/providers/qnn/matmul_test.cpp        | 154 ++++++++++++++++--
 3 files changed, 151 insertions(+), 14 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
index c8537307ef3ba..9d3f460572d84 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
@@ -617,6 +617,12 @@ Status QnnModelWrapper::UnpackInitializerData(const ONNX_NAMESPACE::TensorProto&
     auto dst = gsl::make_span(reinterpret_cast<int8_t*>(unpacked_tensor.data()), unpacked_tensor.size());
     auto src = gsl::make_span(reinterpret_cast<const Int4x2*>(packed_int4_bytes.data()), packed_int4_bytes.size());
     ORT_RETURN_IF_NOT(Int4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN");
+
+    // NOTE: Masking off the top 4 bits to work around a QNN INT4 accuracy bug.
+    // Docs explicitly state that masking off the top 4 bits should not be required.
+    for (size_t i = 0; i < dst.size(); i++) {
+      dst[i] &= 0x0F;  // -3 (0b1111_1101) becomes 13 (0b0000_1101)
+    }
   } else if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_UINT4) {
     TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer);
     const size_t num_elems = shape.Size();
diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc
index 99636976b9c05..35889c9fa2307 100644
--- a/onnxruntime/test/providers/qnn/conv_test.cc
+++ b/onnxruntime/test/providers/qnn/conv_test.cc
@@ -799,7 +799,7 @@ TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel_NegativeWeightQuantAxis) {
 // CPU EP (f32 model): 25.143 21.554 17.964 10.785 7.195 3.605 -3.574 -7.164 -10.753
 // CPU EP (qdq model): 24.670 21.103 17.536 10.254 6.689 2.972 -4.161 -7.728 -10.700
 // QNN EP (qdq model): 27.186 27.186 27.186 21.541 6.685 -8.022 -10.548 -10.548 -10.548
-TEST_F(QnnHTPBackendTests, DISABLED_ConvU16S4S32_PerChannel_AccuracyIssue) {
+TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel_AccuracyIssue) {
   std::vector<int64_t> input_shape = {1, 2, 4, 4};
   std::vector<int64_t> weight_shape = {3, 2, 2, 2};
   std::vector<int64_t> bias_shape = {3};
@@ -835,7 +835,8 @@ TEST_F(QnnHTPBackendTests, DISABLED_ConvU16S4S32_PerChannel_AccuracyIssue) {
                                  "NOTSET",
                                  ExpectedEPNodeAssignment::All,
                                  false,  // use_qdq_contrib_ops
-                                 21);    // opset
+                                 21,     // opset
+                                 QDQTolerance(0.005f));
 }
 
 // Test per-channel QDQ Conv is rejected with weight axis != 0
diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp
index dba60b1041696..d8c34d6a6c6ed 100644
--- a/onnxruntime/test/providers/qnn/matmul_test.cpp
+++ b/onnxruntime/test/providers/qnn/matmul_test.cpp
@@ -28,26 +28,25 @@ static GetTestModelFn BuildMatMulOpTestCase(const TestInputDef<float>& input1_de
 
 // Returns a function that creates a graph with a QDQ MatMul operator.
 template <typename Input0QType, typename Input1QType, typename OutputQType>
-static GetTestQDQModelFn<OutputQType> BuildMatMulOpQDQTestCase(const TestInputDef<float>& input1_def,
-                                                               const TestInputDef<float>& input2_def,
+static GetTestQDQModelFn<OutputQType> BuildMatMulOpQDQTestCase(const TestInputDef<float>& input0_def,
+                                                               const TestInputDef<float>& input1_def,
                                                                bool use_contrib_qdq) {
-  return [input1_def, input2_def, use_contrib_qdq](ModelTestBuilder& builder,
+  return [input0_def, input1_def, use_contrib_qdq](ModelTestBuilder& builder,
                                                    std::vector<QuantParams<OutputQType>>& output_qparams) {
     // input1 -> Q -> DQ ->
-    NodeArg* input1 = MakeTestInput(builder, input1_def);
-    QuantParams<Input0QType> input1_qparams = GetTestInputQuantParams<Input0QType>(input1_def);
-    auto* input1_qdq = AddQDQNodePair<Input0QType>(builder, input1, input1_qparams.scale, input1_qparams.zero_point,
+    NodeArg* input0 = MakeTestInput(builder, input0_def);
+    QuantParams<Input0QType> input0_qparams = GetTestInputQuantParams<Input0QType>(input0_def);
+    auto* input0_qdq = AddQDQNodePair<Input0QType>(builder, input0, input0_qparams.scale, input0_qparams.zero_point,
                                                    use_contrib_qdq);
-
-    // input2 -> Q -> DQ ->
-    NodeArg* input2 = MakeTestInput(builder, input2_def);
-    QuantParams<Input1QType> input2_qparams = GetTestInputQuantParams<Input1QType>(input2_def);
-    auto* input2_qdq = AddQDQNodePair<Input1QType>(builder, input2, input2_qparams.scale, input2_qparams.zero_point,
+    // input1 -> Q -> DQ ->
+    NodeArg* input1 = MakeTestInput(builder, input1_def);
+    QuantParams<Input1QType> input1_qparams = GetTestInputQuantParams<Input1QType>(input1_def);
+    auto* input1_qdq = AddQDQNodePair<Input1QType>(builder, input1, input1_qparams.scale, input1_qparams.zero_point,
                                                    use_contrib_qdq);
 
     // MatMul
     auto* op_output = builder.MakeIntermediate();
-    builder.AddNode("MatMul", {input1_qdq, input2_qdq}, {op_output});
+    builder.AddNode("MatMul", {input0_qdq, input1_qdq}, {op_output});
 
     // op_output -> Q -> DQ -> output
     AddQDQNodePairWithOutputAsGraphOutput(builder, op_output, output_qparams[0].scale,
@@ -55,6 +54,88 @@ static GetTestQDQModelFn<OutputQType> BuildMatMulOpQDQTestCase(const TestInputDe
   };
 }
 
+template <typename ActivationQType, typename WeightQType>
+static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelMatMulTestCase(const TestInputDef<float>& input_def,
+                                                                           const TestInputDef<float>& weights_def,
+                                                                           int64_t weight_quant_axis,
+                                                                           bool use_contrib_qdq = false) {
+  return [input_def, weights_def, weight_quant_axis,
+          use_contrib_qdq](ModelTestBuilder& builder,
+                           std::vector<QuantParams<ActivationQType>>& output_qparams) {
+    std::vector<NodeArg*> matmul_inputs;
+
+    // input -> Q/DQ ->
+    auto* input = MakeTestInput(builder, input_def);
+    QuantParams<ActivationQType> input_qparams = GetTestInputQuantParams<ActivationQType>(input_def);
+    auto* input_qdq = AddQDQNodePair<ActivationQType>(builder, input, input_qparams.scale, input_qparams.zero_point,
+                                                      use_contrib_qdq);
+    matmul_inputs.push_back(input_qdq);
+
+    // Quantized(weights) -> DQ ->
+    ORT_ENFORCE(weights_def.IsInitializer() && weights_def.IsRawData());
+    std::vector<float> weight_scales;
+    std::vector<WeightQType> weight_zero_points;
+    TensorShape weights_shape = weights_def.GetTensorShape();
+    int64_t pos_weight_quant_axis = weight_quant_axis;
+    if (pos_weight_quant_axis < 0) {
+      pos_weight_quant_axis += static_cast<int64_t>(weights_shape.NumDimensions());
+    }
+    GetTestInputQuantParamsPerChannel<WeightQType>(weights_def, weight_scales, weight_zero_points,
+                                                   static_cast<size_t>(pos_weight_quant_axis), true);
+
+    std::vector<WeightQType> quantized_weights;
+    size_t num_weight_storage_elems = weights_shape.Size();
+    if constexpr (std::is_same_v<WeightQType, Int4x2> || std::is_same_v<WeightQType, UInt4x2>) {
+      num_weight_storage_elems = Int4x2::CalcNumInt4Pairs(weights_shape.Size());
+    }
+    quantized_weights.resize(num_weight_storage_elems);
+    QuantizeValues<WeightQType>(weights_def.GetRawData(), quantized_weights, weights_shape,
+                                weight_scales, weight_zero_points, pos_weight_quant_axis);
+
+    NodeArg* weights_initializer = builder.MakeInitializer<WeightQType>(weights_def.GetShape(), quantized_weights);
+    NodeArg* weights_dq = builder.MakeIntermediate();
+    Node& weights_dq_node = builder.AddDequantizeLinearNode<WeightQType>(weights_initializer, weight_scales,
+                                                                         weight_zero_points, weights_dq,
+                                                                         nullptr, use_contrib_qdq);
+    weights_dq_node.AddAttribute("axis", weight_quant_axis);
+    matmul_inputs.push_back(weights_dq);
+
+    auto* matmul_output = builder.MakeIntermediate();
+    builder.AddNode("MatMul", matmul_inputs, {matmul_output});
+
+    AddQDQNodePairWithOutputAsGraphOutput(builder, matmul_output, output_qparams[0].scale,
+                                          output_qparams[0].zero_point, use_contrib_qdq);
+  };
+}
+
+// Runs a QDQ per-channel MatMul model on the QNN HTP backend. Checks the graph node assignment, and that the
+// QDQ model is accurate on QNN EP (compared to CPU EP).
+template <typename ActivationQType, typename WeightQType>
+static void RunQDQPerChannelMatMulOpOpTest(const TestInputDef<float>& input_def,
+                                           const TestInputDef<float>& weights_def,
+                                           int64_t weight_quant_axis,
+                                           ExpectedEPNodeAssignment expected_ep_assignment,
+                                           int opset = 21,
+                                           bool use_contrib_qdq = false,
+                                           QDQTolerance tolerance = QDQTolerance()) {
+  ProviderOptions provider_options;
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  TestQDQModelAccuracy(BuildMatMulOpTestCase(input_def, weights_def),
+                       BuildQDQPerChannelMatMulTestCase<ActivationQType, WeightQType>(input_def,
+                                                                                      weights_def,
+                                                                                      weight_quant_axis,
+                                                                                      use_contrib_qdq),
+                       provider_options,
+                       opset,
+                       expected_ep_assignment,
+                       tolerance);
+}
+
 // Runs an MatMul model on the QNN CPU backend. Checks the graph node assignment, and that inference
 // outputs for QNN and CPU match.
 static void RunMatMulOpOpTest(const TestInputDef<float>& input1_def,
@@ -160,6 +241,55 @@ TEST_F(QnnHTPBackendTests, MatMulOp_HTP_A16_W8Static) {
                             true);  // Use com.microsoft Q/DQ ops
 }
 
+// Test QDQ per-channel MatMul with 16-bit act, signed 4-bit weights (static)
+TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_A16_WeightInt4) {
+  std::vector<float> input0_data = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
+  std::vector<float> input1_data = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};
+  RunQDQPerChannelMatMulOpOpTest<uint16_t, Int4x2>(TestInputDef<float>({1, 1, 2, 3}, false, input0_data),
+                                                   TestInputDef<float>({1, 1, 3, 2}, true, input1_data),
+                                                   1,  // quantization axis
+                                                   ExpectedEPNodeAssignment::All,
+                                                   21,
+                                                   false);
+}
+
+// Test QDQ per-channel MatMul with 16-bit act, unsigned 4-bit weights (static)
+TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_A16_WeightUInt4) {
+  std::vector<float> input0_data = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
+  std::vector<float> input1_data = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};
+  RunQDQPerChannelMatMulOpOpTest<uint16_t, UInt4x2>(TestInputDef<float>({1, 1, 2, 3}, false, input0_data),
+                                                    TestInputDef<float>({1, 1, 3, 2}, true, input1_data),
+                                                    1,  // quantization axis
+                                                    ExpectedEPNodeAssignment::All,
+                                                    21,
+                                                    false);
+}
+
+// Test QDQ per-channel MatMul with int8 act, int4 weights (static)
+TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_AS8_WeightInt4) {
+  std::vector<float> input0_data = GetFloatDataInRange(-5.0f, 5.0f, 6);
+  std::vector<float> input1_data = {-2.0f, -1.0f, -0.5f, 0.0f, 1.0f, 2.0f};
+  RunQDQPerChannelMatMulOpOpTest<int8_t, Int4x2>(TestInputDef<float>({1, 1, 2, 3}, false, input0_data),
+                                                 TestInputDef<float>({1, 1, 3, 2}, true, input1_data),
+                                                 1,  // quantization axis
+                                                 ExpectedEPNodeAssignment::All,
+                                                 21,
+                                                 false,
+                                                 QDQTolerance(0.007f));
+}
+
+// Test QDQ per-channel MatMul with 16-bit act, int8 weights (static)
+TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_A16_WeightInt8) {
+  std::vector<float> input0_data = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
+  std::vector<float> input1_data = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};
+  RunQDQPerChannelMatMulOpOpTest<uint16_t, int8_t>(TestInputDef<float>({1, 1, 2, 3}, false, input0_data),
+                                                   TestInputDef<float>({1, 1, 3, 2}, true, input1_data),
+                                                   1,  // quantization axis
+                                                   ExpectedEPNodeAssignment::All,
+                                                   21,
+                                                   false);
+}
+
 // Test QDQ MatMul with uint16 activation uint16 weights, both dynamic
 // Inaccuracy detected for output 'output_0', element 1.
 // Output quant params: scale=0.0015259021893143654, zero_point=0.