diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc
index 9b57bc4fa1c0f..4dd2e089dd19f 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc
@@ -7,7 +7,6 @@
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/common/safeint.h"
-#include "onnx/defs/data_type_utils.h"
 
 #include "base_op_builder.h"
 
@@ -24,6 +23,12 @@ class SoftmaxOpBuilder : public BaseOpBuilder {
                        const logging::Logger& logger) const override final ORT_MUST_USE_RESULT;
 
  protected:
+  Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
+                       const NodeUnit& node_unit,
+                       const logging::Logger& logger,
+                       std::vector<std::string>& input_names,
+                       bool do_op_validation) const override ORT_MUST_USE_RESULT;
+
   Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
                                      const NodeUnit& node_unit,
                                      std::vector<std::string>&& input_names,
@@ -31,31 +36,125 @@ class SoftmaxOpBuilder : public BaseOpBuilder {
                                      bool do_op_validation) const override ORT_MUST_USE_RESULT;
 };
 
-static int32_t GetDefaultAxisAttribute(const std::string& op_type, int opset_version) {
-  if (op_type == "Softmax" || op_type == "LogSoftmax") {
-    // Default axis changed from 1 to -1 in opset 13.
-    return opset_version < 13 ? 1 : -1;
-  }
-
-  return 0;
+constexpr int32_t GetDefaultAxisAttribute(int opset_version) {
+  // Default axis changed from 1 to -1 in opset 13.
+  return opset_version < 13 ? 1 : -1;
 }
 
 Status SoftmaxOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
                                        const NodeUnit& node_unit,
                                        const logging::Logger& logger) const {
   ORT_UNUSED_PARAMETER(logger);
-  const std::string& op_type = node_unit.OpType();
+  const bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType());
+  const int opset_version = node_unit.SinceVersion();
+
+  // The QNN HTP backend only supports an `axis` attribute that refers to the last input dimension.
+  // QNN EP is able to support arbitrary axis attributes by wrapping the QNN operator with transposes.
+  // However, the exception is Softmax/LogSoftmax with opset < 13. For these older ONNX operators, only
+  // axis == input_rank - 1 is supported.
+  if (is_npu_backend && opset_version < 13) {
+    const std::string& op_type = node_unit.OpType();
+
+    int32_t axis = GetDefaultAxisAttribute(opset_version);
+    Qnn_Scalar_t axis_qnn_scalar = QNN_SCALAR_INIT;
+    ORT_RETURN_IF_ERROR(ProcessAxisAttribute(qnn_model_wrapper, node_unit, axis_qnn_scalar, axis));
+    std::vector<uint32_t> input_shape;
+    ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(node_unit.Inputs()[0].node_arg, input_shape),
+                      "QNN EP: Cannot get shape for Softmax input");
+    ORT_RETURN_IF(axis != static_cast<int32_t>(input_shape.size() - 1),
+                  "QNN ", op_type.c_str(),
+                  " only supports an `axis` attribute equal to input_rank-1 (or -1) for ONNX opset < 13");
+  }
+
+  return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, true);
+}
+
+static std::vector<uint32_t> GetTransposePermToUseLastAxis(uint32_t input_rank, uint32_t axis) {
+  assert(axis < input_rank);
+  std::vector<uint32_t> transpose_perm;
+  transpose_perm.reserve(input_rank);
+
+  for (uint32_t dim = 0; dim < input_rank; dim++) {
+    transpose_perm.push_back(dim);
+  }
 
-  int32_t axis = GetDefaultAxisAttribute(op_type, node_unit.SinceVersion());
+  // Swap axis dim with last dim.
+  transpose_perm[axis] = input_rank - 1;
+  transpose_perm[input_rank - 1] = axis;
+
+  return transpose_perm;
+}
+
+Status SoftmaxOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
+                                       const NodeUnit& node_unit,
+                                       const logging::Logger& logger,
+                                       std::vector<std::string>& input_names,
+                                       bool do_op_validation) const {
+  const bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType());
+  const auto& inputs = node_unit.Inputs();
+  assert(inputs.size() == 1);
+
+  int32_t axis = GetDefaultAxisAttribute(node_unit.SinceVersion());
   Qnn_Scalar_t axis_qnn_scalar = QNN_SCALAR_INIT;
   ORT_RETURN_IF_ERROR(ProcessAxisAttribute(qnn_model_wrapper, node_unit, axis_qnn_scalar, axis));
-  std::vector<uint32_t> input_shape;
-  ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(node_unit.Inputs()[0].node_arg, input_shape),
-                    "QNN EP: Cannot get shape for Softmax input");
-  ORT_RETURN_IF(axis != static_cast<int32_t>(input_shape.size() - 1),
-                "QNN ", op_type.c_str(), " only supports an `axis` attribute equal to input_rank-1 (or -1)");
 
-  return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, true);
+  OnnxInputInfo input_info = {};
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetOnnxInputInfo(inputs[0], input_info));
+  const size_t input_rank = input_info.shape.size();
+
+  // If the axis attribute refers to the last dimension (or this is not the NPU backend), then process the input as normal.
+  if (!is_npu_backend || axis == static_cast<int32_t>(input_rank) - 1) {
+    return ProcessInput(qnn_model_wrapper, inputs[0], logger, input_names);
+  }
+
+  //
+  // The axis does **not** refer to the last input dimension. Must wrap transposes around the operator to be able to use
+  // QNN's Softmax operator, which always uses an axis value that refers to the last dimension.
+  //
+
+  std::vector<uint32_t> transpose_perm = GetTransposePermToUseLastAxis(static_cast<uint32_t>(input_rank),
+                                                                       static_cast<uint32_t>(axis));
+
+  const std::string& input_name = inputs[0].node_arg.Name();
+  std::string op_input_name = input_info.is_initializer ? input_name : input_name + "_ort_qnn_ep_transpose";
+  input_names.push_back(op_input_name);
+
+  std::vector<uint32_t> op_input_shape = input_info.shape;
+  op_input_shape[input_rank - 1] = input_info.shape[axis];
+  op_input_shape[axis] = input_info.shape[input_rank - 1];
+
+  std::vector<uint8_t> initializer_bytes;
+  if (input_info.is_initializer) {  // Input is an initializer, so transpose initializer bytes.
+    std::vector<size_t> perm_size_t;
+    perm_size_t.reserve(transpose_perm.size());
+
+    for (auto p : transpose_perm) {
+      perm_size_t.push_back(static_cast<size_t>(p));
+    }
+
+    ORT_RETURN_IF_ERROR(TransposeInitializer(qnn_model_wrapper, *input_info.initializer_tensor, perm_size_t,
+                                             initializer_bytes));
+  } else {  // Input is dynamic, so add transpose node before input.
+    const bool is_graph_input = qnn_model_wrapper.IsGraphInput(input_name);
+
+    ORT_RETURN_IF_ERROR(qnn_model_wrapper.AddTransposeNode(node_unit.Index(),
+                                                           input_name,
+                                                           op_input_name,
+                                                           input_info.shape,
+                                                           transpose_perm,
+                                                           op_input_shape,
+                                                           input_info.qnn_data_type,
+                                                           input_info.quant_param,
+                                                           do_op_validation,
+                                                           is_graph_input));
+  }
+
+  Qnn_TensorType_t tensor_type = GetInputTensorType(qnn_model_wrapper, op_input_name);
+  QnnTensorWrapper input_tensorwrapper(op_input_name, tensor_type, input_info.qnn_data_type, input_info.quant_param,
+                                       std::move(op_input_shape), std::move(initializer_bytes));
+  ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensorwrapper)), "Failed to add tensor.");
+
+  return Status::OK();
 }
 
 Status SoftmaxOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
@@ -63,21 +162,80 @@ Status SoftmaxOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_
                                                      std::vector<std::string>&& input_names,
                                                      const logging::Logger& logger,
                                                      bool do_op_validation) const {
+  const bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType());
   const std::string& op_type = node_unit.OpType();
+  const auto& outputs = node_unit.Outputs();
+  assert(outputs.size() == 1);
 
-  int32_t default_axis = GetDefaultAxisAttribute(op_type, node_unit.SinceVersion());
+  int32_t axis = GetDefaultAxisAttribute(node_unit.SinceVersion());
   Qnn_Scalar_t axis_qnn_scalar = QNN_SCALAR_INIT;
-  ORT_RETURN_IF_ERROR(ProcessAxisAttribute(qnn_model_wrapper, node_unit, axis_qnn_scalar, default_axis));
+  ORT_RETURN_IF_ERROR(ProcessAxisAttribute(qnn_model_wrapper, node_unit, axis_qnn_scalar, axis));
+
+  OnnxInputInfo output_info = {};
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetOnnxInputInfo(outputs[0], output_info));
+  const size_t output_rank = output_info.shape.size();
+  const bool axis_is_last_dim = static_cast<size_t>(axis) == output_rank - 1;
+
+  // If axis refers to the last dimension (or this is not the NPU backend), process the outputs as usual.
+  if (!is_npu_backend || axis_is_last_dim) {
+    QnnParamWrapper axis_param(node_unit.Index(), node_unit.Name(), QNN_OP_SOFTMAX_PARAM_AXIS, axis_qnn_scalar);
+
+    std::vector<std::string> param_tensor_names;
+    param_tensor_names.push_back(axis_param.GetParamTensorName());
+    qnn_model_wrapper.AddParamWrapper(std::move(axis_param));
+
+    return ProcessOutputs(qnn_model_wrapper, node_unit,
+                          std::move(input_names),
+                          std::move(param_tensor_names),
+                          logger, do_op_validation, GetQnnOpType(op_type));
+  }
+
+  //
+  // The axis does **not** refer to the last dimension. Must wrap the operator with Transposes to be able to use
+  // QNN's Softmax operator, which only supports an axis that refers to the last dimension.
+  //
+
+  axis_qnn_scalar.uint32Value = static_cast<uint32_t>(output_rank - 1);  // NOTE: override axis.
   QnnParamWrapper axis_param(node_unit.Index(), node_unit.Name(), QNN_OP_SOFTMAX_PARAM_AXIS, axis_qnn_scalar);
 
   std::vector<std::string> param_tensor_names;
   param_tensor_names.push_back(axis_param.GetParamTensorName());
   qnn_model_wrapper.AddParamWrapper(std::move(axis_param));
 
-  return ProcessOutputs(qnn_model_wrapper, node_unit,
-                        std::move(input_names),
-                        std::move(param_tensor_names),
-                        logger, do_op_validation, GetQnnOpType(op_type));
+  const std::string& orig_output_name = outputs[0].node_arg.Name();
+  std::string op_output_name = orig_output_name + "_ort_qnn_ep_transpose";
+
+  std::vector<uint32_t> op_output_shape = output_info.shape;
+  op_output_shape[output_rank - 1] = output_info.shape[axis];
+  op_output_shape[axis] = output_info.shape[output_rank - 1];
+
+  QnnTensorWrapper output_tensorwrapper(op_output_name, QNN_TENSOR_TYPE_NATIVE, output_info.qnn_data_type, output_info.quant_param,
+                                        std::vector<uint32_t>(op_output_shape));
+  ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(output_tensorwrapper)), "Failed to add tensor.");
+  ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(GetNodeName(node_unit),
+                                                    QNN_OP_PACKAGE_NAME_QTI_AISW,
+                                                    GetQnnOpType(node_unit.OpType()),
+                                                    std::move(input_names),
+                                                    {op_output_name},
+                                                    std::move(param_tensor_names)),
+                    "Failed to add node.");
+
+  const bool is_graph_output = qnn_model_wrapper.IsGraphOutput(orig_output_name);
+  std::vector<uint32_t> transpose_perm = GetTransposePermToUseLastAxis(static_cast<uint32_t>(output_rank),
+                                                                       static_cast<uint32_t>(axis));
+
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.AddTransposeNode(node_unit.Index(),
+                                                         op_output_name,
+                                                         orig_output_name,
+                                                         op_output_shape,
+                                                         transpose_perm,
+                                                         output_info.shape,
+                                                         output_info.qnn_data_type,
+                                                         output_info.quant_param,
+                                                         do_op_validation,
+                                                         is_graph_output));
+
+  return Status::OK();
 }
 
 void CreateSoftmaxOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) {
diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
index be8afa7636b3d..e024eafcd6572 100644
--- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
+++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
@@ -447,8 +447,9 @@ TEST_F(QnnHTPBackendTests, UnaryOp_Log_U16) {
 // Check that QNN compiles DQ -> Softmax -> Q as a single unit.
 // Test that the default axis (-1) for SoftMax opset 13 works.
 TEST_F(QnnHTPBackendTests, UnaryOp_Softmax13_DefaultAxis) {
+  const std::vector<float> input_data = GetFloatDataInRange(-5.0f, 5.0f, 6);
   RunQDQOpTest<uint8_t>("Softmax",
-                        {TestInputDef<float>({1, 2, 3}, false, -5.0f, 5.0f)},
+                        {TestInputDef<float>({1, 2, 3}, false, input_data)},
                         {},  // Uses default axis of -1 for opset 13
                         13,
                         ExpectedEPNodeAssignment::All);
@@ -466,14 +467,43 @@ TEST_F(QnnHTPBackendTests, UnaryOp_Softmax13_U16_DefaultAxis) {
                          true);  // Use com.microsoft domain for Q/DQ ops
 }
 
-// Check that QNN compiles DQ -> Softmax -> Q as a single unit.
-// Test that an axis != -1 is not supported.
-TEST_F(QnnHTPBackendTests, UnaryOp_Softmax13_UnsupportedAxis) {
+// Test that 8-bit QDQ Softmax (opset 13) with axis != -1 is supported by QNN EP.
+// QNN EP will wrap the operator with transposes.
+TEST_F(QnnHTPBackendTests, UnaryOp_Softmax13_NonLastAxis) {
+  const std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 10.0f, 11.0f, 12.0f, 100.0f, 110.0f, 120.0f,
+                                         1.0856307f, 0.99734545f, 0.2829785f, 1.5062947f, 0.5786002f, 1.6514366f,
+                                         2.4266791f, 0.42891264f, 1.2659363f};
   RunQDQOpTest<uint8_t>("Softmax",
-                        {TestInputDef<float>({1, 2, 3}, false, -5.0f, 5.0f)},
+                        {TestInputDef<float>({1, 2, 3, 3}, false, input_data)},
                         {utils::MakeAttribute("axis", static_cast<int64_t>(1))},
                         13,
-                        ExpectedEPNodeAssignment::None);
+                        ExpectedEPNodeAssignment::All);
+}
+
+// Test that 8-bit QDQ Softmax (opset 13) with axis != -1 is supported by QNN EP.
+// QNN EP will wrap the operator with transposes.
+// This is a configuration used in one of our partner's models.
+TEST_F(QnnHTPBackendTests, UnaryOp_Softmax13_NonLastAxis_LargeInput) {
+  const std::vector<float> input_data = GetFloatDataInRange(-50.0f, 50.0f, 124);
+  RunQDQOpTest<uint8_t>("Softmax",
+                        {TestInputDef<float>({1, 124, 1}, false, input_data)},
+                        {utils::MakeAttribute("axis", static_cast<int64_t>(1))},
+                        13,
+                        ExpectedEPNodeAssignment::All);
+}
+
+// Test that 16-bit QDQ Softmax (opset 13) with axis != -1 is supported by QNN EP.
+// QNN EP will wrap the operator with transposes.
+// This is a configuration used in one of our partner's models.
+TEST_F(QnnHTPBackendTests, UnaryOp_Softmax13_U16_NonLastAxis_LargeInput) {
+  const std::vector<float> input_data = GetFloatDataInRange(-50.0f, 50.0f, 124);
+  RunQDQOpTest<uint16_t>("Softmax",
+                         {TestInputDef<float>({1, 124, 1}, false, input_data)},
+                         {utils::MakeAttribute("axis", static_cast<int64_t>(1))},
+                         13,
+                         ExpectedEPNodeAssignment::All,
+                         kOnnxDomain,
+                         true);
 }
 
 // Check that QNN compiles DQ -> Softmax -> Q as a single unit.
@@ -507,15 +537,15 @@ TEST_F(QnnHTPBackendTests, UnaryOp_LogSoftmax13_DefaultAxis) {
                         ExpectedEPNodeAssignment::All);
 }
 
-// Check that QNN compiles DQ -> LogSoftmax -> Q as a single unit.
-// Test that an axis != -1 is not supported.
-TEST_F(QnnHTPBackendTests, UnaryOp_LogSoftmax13_UnsupportedAxis) {
+// Test that 8-bit QDQ LogSoftmax (opset 13) with axis != -1 is supported by QNN EP.
+// QNN EP will wrap the operator with transposes.
+TEST_F(QnnHTPBackendTests, UnaryOp_LogSoftmax13_NonLastAxis) {
   std::vector<float> input_data = GetFloatDataInRange(-5.0f, 5.0f, 6);
   RunQDQOpTest<uint8_t>("LogSoftmax",
                         {TestInputDef<float>({1, 2, 3}, false, input_data)},
                         {utils::MakeAttribute("axis", static_cast<int64_t>(1))},
                         13,
-                        ExpectedEPNodeAssignment::None);
+                        ExpectedEPNodeAssignment::All);
 }
 
 // Check that QNN compiles DQ -> LogSoftmax -> Q as a single unit.