diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc index dd5c6a5a79cdb..6ef17b40d274b 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc +++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc @@ -83,6 +83,7 @@ OpBuilderRegistrations::OpBuilderRegistrations() { CreateReduceOpBuilder("ReduceMin", *this); CreateReduceOpBuilder("ReduceProd", *this); CreateReduceOpBuilder("ReduceSum", *this); + CreateReduceOpBuilder("ReduceL2", *this); } { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc index 2aefe5f6b8e71..77bc58bd6f833 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc @@ -6,15 +6,15 @@ #include #include +#include "core/common/safeint.h" +#include "onnx/defs/data_type_utils.h" #include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/framework/endian_utils.h" -#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" - -#include "base_op_builder.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { namespace qnn { @@ -25,6 +25,7 @@ enum ReduceOpType { REDUCE_OP_TYPE_MEAN, REDUCE_OP_TYPE_PROD, REDUCE_OP_TYPE_SUM, + REDUCE_OP_TYPE_L2, REDUCE_OP_TYPE_COUNT, REDUCE_OP_TYPE_UNKNOWN, @@ -41,6 +42,8 @@ ReduceOpType GetReduceOpType(const std::string& op_type) { return REDUCE_OP_TYPE_PROD; } else if (op_type == "ReduceSum") { return REDUCE_OP_TYPE_SUM; + } else if (op_type == "ReduceL2") { + return REDUCE_OP_TYPE_L2; } else { return REDUCE_OP_TYPE_UNKNOWN; } @@ -51,21 +54,16 @@ class ReduceOpBuilder : public BaseOpBuilder { ReduceOpBuilder() : BaseOpBuilder("ReduceOpBuilder") {} ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(ReduceOpBuilder); - Status IsOpSupported(QnnModelWrapper& qnn_model_wrapper, - const NodeUnit& node_unit, + Status IsOpSupported(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, const logging::Logger& logger) const override final ORT_MUST_USE_RESULT; protected: - Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper, - const NodeUnit& node_unit, - const logging::Logger& logger, + Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, const logging::Logger& logger, std::vector& input_names, bool do_op_validation = false) const override ORT_MUST_USE_RESULT; - Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, - const NodeUnit& node_unit, - std::vector&& input_names, - const logging::Logger& logger, + Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, + std::vector&& input_names, const logging::Logger& logger, bool do_op_validation) const override ORT_MUST_USE_RESULT; private: @@ -84,7 +82,8 @@ const std::array ReduceOpBuilder::opset_with_axes_as_ 18, // ReduceMin 18, // ReduceMean 18, // ReduceProd - 13 // ReduceSum + 13, // ReduceSum + 18, // ReduceL2 }; Status ReduceOpBuilder::GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, @@ -175,8 +174,7 @@ Status ReduceOpBuilder::GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const Nod return Status::OK(); } -Status ReduceOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, - const NodeUnit& node_unit, +Status ReduceOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, const logging::Logger& logger) const { ReduceOpType reduce_op_type = GetReduceOpType(node_unit.OpType()); if (reduce_op_type == ReduceOpType::REDUCE_OP_TYPE_UNKNOWN) { @@ -188,13 +186,17 @@ Status ReduceOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: ReduceProd operator not supported by HTP backend."); } + // ReduceL2 is composed by Mul->ReduceSum->Sqrt, it's not easy to set the quantization parameters for the activation + // tensors between, so we don't support ReduceL2 with quantized input for now. + if (reduce_op_type == ReduceOpType::REDUCE_OP_TYPE_L2 && node_unit.Inputs()[0].quant_param.has_value()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: ReduceL2 operator does not support quantized input for now."); + } + return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, true); } -Status ReduceOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, - const NodeUnit& node_unit, - const logging::Logger& logger, - std::vector& input_names, +Status ReduceOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, + const logging::Logger& logger, std::vector& input_names, bool do_op_validation) const { ORT_UNUSED_PARAMETER(do_op_validation); @@ -207,11 +209,9 @@ Status ReduceOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, return Status::OK(); } -Status ReduceOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, - const NodeUnit& node_unit, +Status ReduceOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector&& input_names, - const logging::Logger& logger, - bool do_op_validation) const { + const logging::Logger& logger, bool do_op_validation) const { NodeAttrHelper node_attr_helper(node_unit); std::vector param_tensor_names; @@ -229,8 +229,8 @@ Status ReduceOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w std::transform(axes_set.begin(), axes_set.end(), axes_data.begin(), [](AxesOnnxIntType item) { return SafeInt(item); }); - QnnParamWrapper axes_param(node_unit.Index(), node_unit.Name(), QNN_OP_REDUCE_MAX_PARAM_AXES, - std::move(axes_shape), std::move(axes_data)); + QnnParamWrapper axes_param(node_unit.Index(), node_unit.Name(), QNN_OP_REDUCE_MAX_PARAM_AXES, std::move(axes_shape), + std::move(axes_data)); param_tensor_names.push_back(axes_param.GetParamTensorName()); qnn_model_wrapper.AddParamWrapper(std::move(axes_param)); @@ -245,10 +245,57 @@ Status ReduceOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w param_tensor_names.push_back(keep_dims_param.GetParamTensorName()); qnn_model_wrapper.AddParamWrapper(std::move(keep_dims_param)); - ORT_RETURN_IF_ERROR(ProcessOutputs(qnn_model_wrapper, node_unit, - std::move(input_names), - std::move(param_tensor_names), - logger, do_op_validation, GetQnnOpType(node_unit.OpType()))); + if (node_unit.OpType() == "ReduceL2") { + // If ReduceL2, QNN doesn't have a single Op for it, we need to add a + // ElementWiseMultiply->ReduceSum->ElementWiseSquareRoot node sequence. + const auto& input = node_unit.Inputs()[0]; + const auto& output = node_unit.Outputs()[0]; + std::vector input_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(input.node_arg, input_shape), "Cannot get input shape."); + std::vector output_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(output.node_arg, output_shape), "Cannot get output shape."); + ORT_ENFORCE(!input.quant_param.has_value(), "Input tensor must not be quantized."); + const auto* type_proto = output.node_arg.TypeAsProto(); + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + ORT_RETURN_IF_ERROR(utils::GetQnnDataType(false, type_proto, qnn_data_type)); + const std::string input_name = input_names[0]; + + // Step 1: y_pow2 = x * x, using ElementWiseMultiply instead of ElementWisePower so we don't need to add a new + // initializer tensor for the power value. The performance difference is negligible. + const std::string pow2_name = input_name + "_ort_qnn_ep_pow2"; + QnnTensorWrapper pow2_tensorwrapper(pow2_name, QNN_TENSOR_TYPE_NATIVE, qnn_data_type, QnnQuantParamsWrapper(), + std::move(input_shape)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(pow2_tensorwrapper)), "AddTensorWrapper failed"); + ORT_RETURN_IF_NOT( + qnn_model_wrapper.CreateQnnNode(pow2_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_MULTIPLY, + {input_name, input_name}, {pow2_name}, {}, do_op_validation), + "CreateQnnNode failed"); + + // Step 2: y_pow2_sum = ReduceSum(y_pow2) + const std::string reduce_name = input_name + "_ort_qnn_ep_pow2_sum"; + QnnTensorWrapper reduce_tensorwrapper(reduce_name, QNN_TENSOR_TYPE_NATIVE, qnn_data_type, QnnQuantParamsWrapper(), + std::vector(output_shape)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(reduce_tensorwrapper)), "AddTensorWrapper failed"); + ORT_RETURN_IF_NOT( + qnn_model_wrapper.CreateQnnNode(utils::GetNodeName(node_unit), QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_REDUCE_SUM, + {pow2_name}, {reduce_name}, std::move(param_tensor_names), do_op_validation), + "CreateQnnNode failed"); + + // Step 3: y = Sqrt(y_pow2_sum) + Qnn_TensorType_t output_tensor_type = + qnn_model_wrapper.IsGraphOutput(output.node_arg.Name()) ? QNN_TENSOR_TYPE_APP_READ : QNN_TENSOR_TYPE_NATIVE; + QnnTensorWrapper sqrt_tensorwrapper(output.node_arg.Name(), output_tensor_type, qnn_data_type, + QnnQuantParamsWrapper(), std::move(output_shape)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(sqrt_tensorwrapper)), "AddTensorWrapper failed"); + ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(input_name + "_ort_qnn_ep_pow2_sum_sqrt", + QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_SQUARE_ROOT, + {reduce_name}, {output.node_arg.Name()}, {}, do_op_validation), + "CreateQnnNode failed"); + } else { + ORT_RETURN_IF_ERROR(ProcessOutputs(qnn_model_wrapper, node_unit, std::move(input_names), + std::move(param_tensor_names), logger, do_op_validation, + GetQnnOpType(node_unit.OpType()))); + } return Status::OK(); } diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.cc b/onnxruntime/test/providers/qnn/qnn_test_utils.cc index 79e7d39e85518..4feeb5f830508 100644 --- a/onnxruntime/test/providers/qnn/qnn_test_utils.cc +++ b/onnxruntime/test/providers/qnn/qnn_test_utils.cc @@ -388,6 +388,7 @@ bool ReduceOpHasAxesInput(const std::string& op_type, int opset_version) { {"ReduceMean", 18}, {"ReduceProd", 18}, {"ReduceSum", 13}, + {"ReduceL2", 18}, }; const auto it = opset_with_axes_as_input.find(op_type); diff --git a/onnxruntime/test/providers/qnn/reduce_op_test.cc b/onnxruntime/test/providers/qnn/reduce_op_test.cc index 13173d9a87f55..e4abe85908373 100644 --- a/onnxruntime/test/providers/qnn/reduce_op_test.cc +++ b/onnxruntime/test/providers/qnn/reduce_op_test.cc @@ -309,6 +309,27 @@ TEST_F(QnnCPUBackendTests, ReduceMeanOpset13) { ExpectedEPNodeAssignment::All); } +// +// ReduceL2 +// +TEST_F(QnnCPUBackendTests, ReduceL2Opset18) { + RunReduceTest("ReduceL2", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + std::vector{0, 1}, + true, // keepdims + 18, + ExpectedEPNodeAssignment::All); +} + +TEST_F(QnnCPUBackendTests, ReduceL2Opset13) { + RunReduceTest("ReduceL2", + TestInputDef({2, 2}, false, -10.0f, 10.0f), + std::vector{0, 1}, + true, // keepdims + 13, + ExpectedEPNodeAssignment::All); +} + #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) // Test creates a graph with a ReduceSum node, and checks that all nodes are supported by the QNN EP