diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc
index 5015e48fdb7b8..3880288bdba2e 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc
@@ -443,7 +443,6 @@ bool InstanceAndLayerNormalizationNodeGroupSelector::Check(const GraphViewer& gr
   }
 
   int32_t dt_input = dq_nodes[0]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
-  int32_t dt_scale = dq_nodes[1]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
   int32_t dt_bias = 0;
   bool has_bias = false;
   // bias is optional for LayerNorm
@@ -453,9 +452,9 @@ bool InstanceAndLayerNormalizationNodeGroupSelector::Check(const GraphViewer& gr
   }
 
   int32_t dt_output = q_nodes[0]->OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
-  // Input, output, and scale need to be the same type. The bias is int32.
+  // Input and output need to be the same type. The bias is int32.
+  // Scale can have a different type than the input for the a16w8 case.
   return (dt_input == dt_output) &&
-         (dt_input == dt_scale) &&
          (has_bias ? dt_bias == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32 : true);
 }
 
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
index 4ae59951c5e98..fdc5317419c5b 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
@@ -22,6 +22,11 @@ class SimpleOpBuilder : public BaseOpBuilder {
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SimpleOpBuilder);
 
  protected:
+  Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
+                       const NodeUnit& node_unit,
+                       const logging::Logger& logger,
+                       std::vector<std::string>& input_names,
+                       bool do_op_validation) const override ORT_MUST_USE_RESULT;
   Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
                                      const NodeUnit& node_unit,
                                      std::vector<std::string>&& input_names,
@@ -48,6 +53,90 @@ class SimpleOpBuilder : public BaseOpBuilder {
   static constexpr std::array<std::string_view, 3> gridsample_supported_padding_modes = {"zeros", "border", "reflection"};
 };
 
+// TODO: Move to qnn_utils if it's re-usable.
+Status InsertConvertOp(QnnModelWrapper& qnn_model_wrapper,
+                       const std::string& convert_input_name,
+                       const std::string& convert_output_name,
+                       Qnn_DataType_t input_qnn_data_type,
+                       Qnn_DataType_t output_qnn_data_type,
+                       int32_t input_offset,
+                       float input_scale,
+                       const std::vector<uint32_t>& output_shape,
+                       bool do_op_validation) {
+  // Assumes the input tensor has already been added to the QNN model.
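+  // Derive the real-value range covered by the input's quant params, then compute
+  // an equivalent scale/offset for the output data type so the Convert op's output
+  // represents the same range.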
+  float qmin = 0.0f;
+  float qmax = 255.0f;
+  ORT_RETURN_IF_ERROR(qnn::utils::GetQminQmax(input_qnn_data_type, qmin, qmax));
+  double value_min = qnn::utils::Dequantize(input_offset, input_scale, qmin);
+  double value_max = qnn::utils::Dequantize(input_offset, input_scale, qmax);
+
+  Qnn_QuantizeParams_t convert_output_quant_param = QNN_QUANTIZE_PARAMS_INIT;
+  convert_output_quant_param.encodingDefinition = QNN_DEFINITION_DEFINED;
+  convert_output_quant_param.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET;
+  ORT_RETURN_IF_ERROR(qnn::utils::GetQuantParams(static_cast<float>(value_min),
+                                                 static_cast<float>(value_max),
+                                                 output_qnn_data_type,
+                                                 convert_output_quant_param.scaleOffsetEncoding.scale,
+                                                 convert_output_quant_param.scaleOffsetEncoding.offset));
+
+  std::vector<uint32_t> output_shape_copy = output_shape;
+  QnnTensorWrapper convert_output_tensorwrapper(convert_output_name,
+                                                QNN_TENSOR_TYPE_NATIVE,
+                                                output_qnn_data_type,
+                                                convert_output_quant_param,
+                                                std::move(output_shape_copy));
+  ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(convert_output_tensorwrapper)), "Failed to add tensor.");
+
+  ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(convert_output_name,
+                                                    QNN_OP_PACKAGE_NAME_QTI_AISW,
+                                                    "Convert",
+                                                    {convert_input_name},
+                                                    {convert_output_name},
+                                                    {},
+                                                    do_op_validation),
+                    "Failed to add node.");
+  return Status::OK();
+}
+
+Status SimpleOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
+                                      const NodeUnit& node_unit,
+                                      const logging::Logger& logger,
+                                      std::vector<std::string>& input_names,
+                                      bool do_op_validation) const {
+  const std::string& op_type = node_unit.OpType();
+  ORT_RETURN_IF_ERROR(BaseOpBuilder::ProcessInputs(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation));
+
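+  // The HTP backend appears not to accept MatMul with two dynamic uint16 inputs
+  // directly, so requantize the second input to uint8 via a Convert op (see the
+  // check below).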
+  if (op_type == "MatMul") {
+    const auto& inputs = node_unit.Inputs();
+    TensorInfo input0_info = {};
+    TensorInfo input1_info = {};
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info));
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[1], input1_info));
+    // Need to insert a Convert op if both inputs are dynamic and are ufixed_16.
+    if (!input0_info.is_initializer && !input1_info.is_initializer &&
+        input0_info.qnn_data_type == input1_info.qnn_data_type &&
+        input0_info.qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) {
+      // Insert the Convert op after input1.
+      std::string convert_input_name = input_names.back();
+      input_names.pop_back();
+      const std::string& matmul_output_name = node_unit.Outputs()[0].node_arg.Name();
+      std::string convert_output_name = convert_input_name + "_convert_" + matmul_output_name;
+      ORT_RETURN_IF_ERROR(InsertConvertOp(qnn_model_wrapper,
+                                          convert_input_name,
+                                          convert_output_name,
+                                          input1_info.qnn_data_type,
+                                          QNN_DATATYPE_UFIXED_POINT_8,
+                                          input1_info.quant_param.scaleOffsetEncoding.offset,
+                                          input1_info.quant_param.scaleOffsetEncoding.scale,
+                                          input1_info.shape,
+                                          do_op_validation));
+      input_names.push_back(convert_output_name);
+    }
+  }
+
+  return Status::OK();
+}
+
 Status SimpleOpBuilder::ExplicitOpCheck(const NodeUnit& node_unit) const {
   const std::string& op_type = node_unit.OpType();
 
diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp
index 3073dde9d8e4c..3da3dc858175b 100644
--- a/onnxruntime/test/providers/qnn/matmul_test.cpp
+++ b/onnxruntime/test/providers/qnn/matmul_test.cpp
@@ -142,11 +142,6 @@ TEST_F(QnnHTPBackendTests, MatMulOp_HTP_u8) {
 }
 
 // Test QDQ MatMul with 16-bit act, 8-bit weights (static)
-// TODO: (SLIGHT) Inaccuracy detected for output 'output', element 0.
-// Output quant params: scale=0.0015259021893143654, zero_point=0.
-// Expected val: 98
-// QNN QDQ val: 97.720298767089844 (err 0.27970123291015625)
-// CPU QDQ val: 97.726402282714844 (err 0.27359771728515625)
 TEST_F(QnnHTPBackendTests, MatMulOp_HTP_A16_W8Static) {
   std::vector<float> input0_data = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
   std::vector<float> input1_data = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};
@@ -158,6 +153,40 @@
                                                     7e-3f);
 }
 
+// Test QDQ MatMul with uint16 activations and uint16 weights, both dynamic
+// Inaccuracy detected for output 'output_0', element 1.
+// Output quant params: scale=0.0015259021893143654, zero_point=0.
+// Expected val: 40
+// QNN QDQ val: 39.681087493896484 (err 0.31891250610351562)
+// CPU QDQ val: 39.99847412109375 (err 0.00152587890625)
+TEST_F(QnnHTPBackendTests, DISABLED_MatMulOp_HTP_A16_W16Dynamic) {
+  std::vector<float> input0_data = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
+  std::vector<float> input1_data = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};
+  RunQDQMatMulOpOpTest<uint16_t, uint16_t, uint16_t>(TestInputDef<float>({2, 3}, false, input0_data),
+                                                     TestInputDef<float>({3, 2}, false, input1_data),
+                                                     ExpectedEPNodeAssignment::All,
+                                                     18,
+                                                     true,  // Use com.microsoft Q/DQ ops
+                                                     7e-3f);
+}
+
+// Test QDQ MatMul with uint16 activations and uint16 weights, both dynamic, with large inputs
+// Inaccuracy detected for output 'output_0', element 1.
+// Output quant params: scale=0.71908456087112427, zero_point=1.
+// Expected val: 46848.41015625
+// QNN QDQ val: 46844.04296875 (err 4.3671875)
+// CPU QDQ val: 46848.359375 (err 0.05078125)
+TEST_F(QnnHTPBackendTests, DISABLED_MatMulOp_HTP_A16_W16DynamicLarge) {
+  std::vector<float> input0_data = GetFloatDataInRange(-10.0f, 10.0f, 12 * 96 * 512);
+  std::vector<float> input1_data = GetFloatDataInRange(-10.0f, 10.0f, 12 * 96 * 512);
+  RunQDQMatMulOpOpTest<uint16_t, uint16_t, uint16_t>(TestInputDef<float>({1, 12, 96, 512}, false, input0_data),
+                                                     TestInputDef<float>({1, 12, 512, 96}, false, input1_data),
+                                                     ExpectedEPNodeAssignment::All,
+                                                     18,
+                                                     true,  // Use com.microsoft Q/DQ ops
+                                                     7e-3f);
+}
+
 // Test 16-bit QDQ MatMul with static weights
 // TODO: Inaccuracy detected for output 'output', element 0.
 // Output quant params: scale=0.0015259021893143654, zero_point=0.