Skip to content

Commit

Permalink
[QNN EP] Update QNN SDK to 2.23.0 (#21008)
Browse files Browse the repository at this point in the history
### Description
- Updates CI pipelines to use QNN SDK 2.23.0 by default.
- QNN SDK 2.23.0 adds support for int64 Cast. This allows QNN EP to support
ONNX ArgMax/ArgMin/TopK operators that generate an int64 graph output.

Example translation of ArgMax:
- **ONNX**:    input --> ArgMax --> output (int64)
- **QNN**: input --> ArgMax --> Cast (int32 to int64) --> output (int64)

### Motivation and Context
Update onnxruntime to use the latest QNN SDK.
  • Loading branch information
adrianlizarraga authored Jun 19, 2024
1 parent 6a0d64e commit 3ae5df1
Show file tree
Hide file tree
Showing 18 changed files with 54 additions and 160 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,12 @@ namespace onnxruntime {
namespace qnn {

// ArgMax/ArgMin support limitations:
// - HTP only: cannot generate a graph output
// - HTP only: max input rank is 4.
// - All backends: ONNX select_last_index attribute must be 0.
class ArgMaxMinOpBuilder : public BaseOpBuilder {
public:
ArgMaxMinOpBuilder() : BaseOpBuilder("ArgMaxMinOpBuilder") {}

Status IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const logging::Logger& logger) const override ORT_MUST_USE_RESULT;

protected:
Qnn_DataType_t GetSupportedOutputDataType(size_t index,
Qnn_DataType_t qnn_data_type) const override ORT_MUST_USE_RESULT;
Expand All @@ -35,31 +30,18 @@ class ArgMaxMinOpBuilder : public BaseOpBuilder {
bool do_op_validation) const override ORT_MUST_USE_RESULT;
};

Status ArgMaxMinOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const logging::Logger& logger) const {
// ONNX ArgMax/ArgMin ops output int64 indices, but the equivalent QNN ops output uint32 indices.
// The QNN HTP backend does not generally support the int64 type, but QNN EP can just use the uint32 type
// for ArgMax/ArgMin ops within the graph. However, if the ArgMin/ArgMax op **generates** a graph output,
// then we cannot support it on the HTP backend.
bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType());
if (is_npu_backend) {
const std::string& output_name = node_unit.Outputs()[0].node_arg.Name();
ORT_RETURN_IF(qnn_model_wrapper.IsGraphOutput(output_name),
"QNN EP does not support ArgMin/ArgMax ops that generate a graph output.");
}

return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, true);
}

Qnn_DataType_t ArgMaxMinOpBuilder::GetSupportedOutputDataType(size_t index, Qnn_DataType_t qnn_data_type) const {
// ONNX ArgMxx ops have int64 output, but QNN requires uint32.
// ONNX ArgMxx ops have int64 output, but QNN requires uint32 or int32.
// If this node produces a graph output, BaseOpBuilder::ProcessOutputs() adds a Cast node after the ArgMxx op.
// Otherwise, it just sets the output type to uint32. This only works for the QNN CPU backend, since the HTP backend
// does not generally support int64.
// Otherwise, it just sets the output type to uint32 or int32.
ORT_UNUSED_PARAMETER(index);
ORT_UNUSED_PARAMETER(qnn_data_type);
return QNN_DATATYPE_UINT_32;
if (qnn_data_type == QNN_DATATYPE_INT_64) {
return QNN_DATATYPE_INT_32;
} else if (qnn_data_type == QNN_DATATYPE_UINT_64) {
return QNN_DATATYPE_UINT_32;
}

return qnn_data_type;
}

Status ArgMaxMinOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
Expand Down
11 changes: 0 additions & 11 deletions onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,6 @@ Status TopKOpBuilder::ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const N
ORT_RETURN_IF_NOT(axis == -1 || axis == static_cast<int32_t>(rank - 1),
"QNN TopK's axis is always the last dimension");

// ONNX TopK outputs int64 indices, but the equivalent QNN op outputs uint32 indices.
// The QNN HTP backend does not generally support the int64 type, but QNN EP can just use the uint32 type
// for TopK ops within the graph. However, if the TopK op **generates** a graph output,
// then we cannot support it on the HTP backend.
bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType());
if (is_npu_backend) {
const std::string& output_name = node_unit.Outputs()[0].node_arg.Name();
ORT_RETURN_IF(qnn_model_wrapper.IsGraphOutput(output_name),
"QNN EP does not support TopK ops that generate a graph output.");
}

return Status::OK();
}

Expand Down
77 changes: 4 additions & 73 deletions onnxruntime/test/providers/qnn/argmaxmin_op_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,28 +15,7 @@
namespace onnxruntime {
namespace test {

// Builds a float32 model with ArgMin/ArgMax.
static GetTestModelFn BuildArgMxxTestCase(const std::string& op_type, TestInputDef<float> input_def,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
return [op_type, input_def, attrs](ModelTestBuilder& builder) {
auto* input = MakeTestInput(builder, input_def);

auto* argm_output = builder.MakeIntermediate();
Node& argm_node = builder.AddNode(op_type, {input}, {argm_output});
for (const auto& attr : attrs) {
argm_node.AddAttributeProto(attr);
}

// Add cast to uint32
auto* output = builder.MakeOutput();
Node& cast_node = builder.AddNode("Cast", {argm_output}, {output});
const auto dst_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT32;
cast_node.AddAttribute("to", static_cast<int64_t>(dst_type));
};
}

// Builds a QDQ model with ArgMin/ArgMax and a Cast to uint32. The quantization parameters are computed from the provided
// input definition.
// Builds a QDQ model with ArgMin/ArgMax. The quantization parameters are computed from the provided input definition.
template <typename QType = uint8_t>
static GetTestQDQModelFn<QType> BuildQDQArgMxxTestCase(const std::string& op_type, TestInputDef<float> input_def,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
Expand All @@ -49,17 +28,11 @@ static GetTestQDQModelFn<QType> BuildQDQArgMxxTestCase(const std::string& op_typ

// input -> Q -> DQ ->
auto* input_qdq = AddQDQNodePair<QType>(builder, input, input_qparams.scale, input_qparams.zero_point);
auto* argm_output = builder.MakeIntermediate();
auto* argm_output = builder.MakeOutput();
Node& argm_node = builder.AddNode(op_type, {input_qdq}, {argm_output});
for (const auto& attr : attrs) {
argm_node.AddAttributeProto(attr);
}

// Cast to uint32 (HTP does not support int64 as graph output)
auto* output = builder.MakeOutput();
Node& cast_node = builder.AddNode("Cast", {argm_output}, {output});
const auto dst_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT32;
cast_node.AddAttribute("to", static_cast<int64_t>(dst_type));
};
}

Expand All @@ -77,7 +50,7 @@ static void RunCPUArgMxxOpTest(const std::string& op_type, TestInputDef<float> i
provider_options["backend_path"] = "libQnnCpu.so";
#endif

RunQnnModelTest(BuildArgMxxTestCase(op_type, input_def, attrs),
RunQnnModelTest(BuildOpTestCase<float>(op_type, {input_def}, {}, attrs),
provider_options,
opset,
expected_ep_assignment);
Expand All @@ -98,7 +71,7 @@ static void RunQDQArgMxxOpTest(const std::string& op_type, TestInputDef<float> i
provider_options["backend_path"] = "libQnnHtp.so";
#endif

TestQDQModelAccuracy(BuildArgMxxTestCase(op_type, input_def, attrs), // baseline float32 model
TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}, attrs), // baseline float32 model
BuildQDQArgMxxTestCase<QType>(op_type, input_def, attrs), // QDQ model
provider_options,
opset,
Expand Down Expand Up @@ -190,48 +163,6 @@ TEST_F(QnnHTPBackendTests, ArgMaxMinU8_RankGreaterThan4_Unsupported) {
ExpectedEPNodeAssignment::None, 13);
}

// Test that ArgMax/ArgMin are not supported if they generate a graph output.
TEST_F(QnnHTPBackendTests, ArgMaxMin_AsGraphOutputUnsupported) {
ProviderOptions provider_options;

#if defined(_WIN32)
provider_options["backend_path"] = "QnnHtp.dll";
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif

// Utility function that creates a QDQ model with ArgMax/ArgMin that produce a graph output.
auto model_builder_func = [](const std::string& op_type, const TestInputDef<float>& input_def,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) -> GetTestModelFn {
return [op_type, input_def, attrs](ModelTestBuilder& builder) {
QuantParams<uint8_t> input_qparams = GetTestInputQuantParams<uint8_t>(input_def);

auto* input = MakeTestInput(builder, input_def);
auto* output = builder.MakeOutput();

// input -> Q -> DQ ->
auto* input_qdq = AddQDQNodePair<uint8_t>(builder, input, input_qparams.scale, input_qparams.zero_point);

Node& argm_node = builder.AddNode(op_type, {input_qdq}, {output});
for (const auto& attr : attrs) {
argm_node.AddAttributeProto(attr);
}
};
};

const int expected_nodes_in_graph = -1; // Don't care exactly how many nodes in graph assigned to CPU EP.
RunQnnModelTest(model_builder_func("ArgMax", TestInputDef<float>({1, 3, 4}, false, -1.0f, 1.0f), {}),
provider_options,
13,
ExpectedEPNodeAssignment::None, // No nodes should be assigned to QNN EP!
expected_nodes_in_graph);
RunQnnModelTest(model_builder_func("ArgMin", TestInputDef<float>({1, 3, 4}, false, -1.0f, 1.0f), {}),
provider_options,
13,
ExpectedEPNodeAssignment::None, // No nodes should be assigned to QNN EP!
expected_nodes_in_graph);
}

#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
} // namespace test
} // namespace onnxruntime
Expand Down
14 changes: 14 additions & 0 deletions onnxruntime/test/providers/qnn/cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,20 @@ TEST_F(QnnHTPBackendTests, TestCastFloatToInt32HTP) {
RunCastOpTest<float>({3, 3}, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32, ExpectedEPNodeAssignment::All,
true);
}

// Cast int64_t to int32_t on HTP
// Supported in QNN SDK 2.23
TEST_F(QnnHTPBackendTests, TestCastInt64ToInt32HTP) {
RunCastOpTest<int64_t>({3, 3}, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32,
ExpectedEPNodeAssignment::All, true);
}

// Cast int32_t to int64_t on HTP
// Supported in QNN SDK 2.23
TEST_F(QnnHTPBackendTests, TestCastInt32ToInt64HTP) {
RunCastOpTest<int32_t>({3, 3}, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT64,
ExpectedEPNodeAssignment::All, true);
}
#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)

} // namespace test
Expand Down
50 changes: 14 additions & 36 deletions onnxruntime/test/providers/qnn/topk_op_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,18 @@ namespace test {
template <typename DataType>
inline GetTestModelFn BuildTopKTestCase(const TestInputDef<DataType>& input_def,
const TestInputDef<int64_t>& k_def,
const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
bool cast_output_indices = true) {
return [input_def, k_def, attrs, cast_output_indices](ModelTestBuilder& builder) {
const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
return [input_def, k_def, attrs](ModelTestBuilder& builder) {
NodeArg* input = MakeTestInput<DataType>(builder, input_def);
NodeArg* k_input = MakeTestInput<int64_t>(builder, k_def);

NodeArg* values_output = builder.MakeOutput();
NodeArg* indices_output = cast_output_indices ? builder.MakeIntermediate() : builder.MakeOutput();
NodeArg* indices_output = builder.MakeOutput();
Node& topk_node = builder.AddNode("TopK", {input, k_input}, {values_output, indices_output});

for (const auto& attr : attrs) {
topk_node.AddAttributeProto(attr);
}

// Cast indices to uint32
if (cast_output_indices) {
auto* uint32_indices_output = builder.MakeOutput();
Node& cast_node = builder.AddNode("Cast", {indices_output}, {uint32_indices_output});
const auto dst_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT32;
cast_node.AddAttribute("to", static_cast<int64_t>(dst_type));
}
};
}

Expand All @@ -58,7 +49,7 @@ static void RunTopKTestOnCPU(const TestInputDef<DataType>& input_def,
provider_options["backend_path"] = "libQnnCpu.so";
#endif

RunQnnModelTest(BuildTopKTestCase<DataType>(input_def, k_def, attrs, false /*cast_output_indices*/),
RunQnnModelTest(BuildTopKTestCase<DataType>(input_def, k_def, attrs),
provider_options,
opset,
expected_ep_assignment);
Expand Down Expand Up @@ -131,26 +122,19 @@ GetTestQDQModelFn<QuantType> BuildQDQTopKTestCase(const TestInputDef<float>& inp
// K input
NodeArg* k_input = MakeTestInput(builder, k_def);

// Reshape op
// TopK_values_output -> Q -> DQ -> output
// NOTE: Create output QDQ nodes before the TopK node so that TopK's 'values' output is the graph's first output.
NodeArg* values_output = builder.MakeIntermediate();
NodeArg* indices_output = builder.MakeIntermediate();
output_qparams[0] = input_qparams; // Input and output qparams must be equal.
AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, values_output, input_qparams.scale,
input_qparams.zero_point, use_contrib_qdq);
// TopK node
NodeArg* indices_output = builder.MakeOutput();
Node& topk_node = builder.AddNode("TopK", {input_qdq, k_input}, {values_output, indices_output});

for (const auto& attr : attrs) {
topk_node.AddAttributeProto(attr);
}

// op_output -> Q -> DQ -> output
// NOTE: Input and output quantization parameters must be equal for Reshape.
output_qparams[0] = input_qparams; // Overwrite!
AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, values_output, input_qparams.scale,
input_qparams.zero_point, use_contrib_qdq);

// Cast indices to uint32 (HTP backend does not support int64 graph outputs)
auto* uint32_indices_output = builder.MakeOutput();
Node& cast_node = builder.AddNode("Cast", {indices_output}, {uint32_indices_output});
const auto dst_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT32;
cast_node.AddAttribute("to", static_cast<int64_t>(dst_type));
};
}

Expand All @@ -171,7 +155,7 @@ static void RunQDQTopKTestOnHTP(const TestInputDef<float>& input_def,
provider_options["backend_path"] = "libQnnHtp.so";
#endif

auto f32_model_builder = BuildTopKTestCase<float>(input_def, k_def, attrs, true /*cast_output_indices*/);
auto f32_model_builder = BuildTopKTestCase<float>(input_def, k_def, attrs);
auto qdq_model_builder = BuildQDQTopKTestCase<QType>(input_def, k_def, attrs, use_contrib_qdq);
TestQDQModelAccuracy(f32_model_builder,
qdq_model_builder,
Expand All @@ -189,18 +173,12 @@ TEST_F(QnnHTPBackendTests, TopK_LargestFloats_U8_LastAxis) {
}

// Test 16-bit QDQ TopK on HTP backend: top 2 largest floats from last axis
// TODO: Inaccuracy detected for output 'output_0', element 6.
// Output quant params: scale=0.00061036087572574615, zero_point=32768.
// Expected val: -7.2340402603149414
// QNN QDQ val: -17.446556091308594 (err 10.212515830993652)
// CPU QDQ val: -7.2339968681335449 (err 4.3392181396484375e-05)
TEST_F(QnnHTPBackendTests, DISABLED_TopK_LargestFloats_U16_LastAxis) {
TEST_F(QnnHTPBackendTests, TopK_LargestFloats_U16_LastAxis) {
RunQDQTopKTestOnHTP<uint16_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-20.0f, 20.0f, 48)),
TestInputDef<int64_t>({1}, true /* is_initializer */, {2}),
{}, // Attributes
ExpectedEPNodeAssignment::All,
19, // opset
true); // Use com.microsoft Q/DQ ops
21); // opset
}

#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ parameters:
- name: QnnSdk
displayName: QNN SDK version
type: string
default: 2.22.0.240425
default: 2.23.0.240531

jobs:
- job: Build_QNN_EP
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ parameters:
- name: QnnSdk
displayName: QNN SDK Version
type: string
default: 2.22.0.240425
default: 2.23.0.240531

resources:
repositories:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ parameters:
- name: QnnSdk
displayName: QNN SDK version
type: string
default: 2.22.0.240425
default: 2.23.0.240531

jobs:
- job: Build_QNN_EP
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ parameters:
- name: qnn_sdk_version
type: string
displayName: 'QNN SDK version. Only for QNN packages.'
default: 2.22.0.240425
default: 2.23.0.240531

trigger: none

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ parameters:
- name: QnnSdk
displayName: QNN SDK Version
type: string
default: 2.22.0.240425
default: 2.23.0.240531

- name: build_config
displayName: Build Configuration
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
parameters:
- name: QnnSDKVersion
type: string
default: '2.22.0.240425'
default: '2.23.0.240531'

steps:
- script: |
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
parameters:
- name: QnnSDKVersion
type: string
default: '2.22.0.240425'
default: '2.23.0.240531'

steps:
- powershell: |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ parameters:
- name: qnn_sdk_version
type: string
displayName: 'QNN SDK version. Only for QNN packages.'
default: 2.22.0.240425
default: 2.23.0.240531

stages:
- ${{ if eq(parameters.enable_windows_cpu, true) }}:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ parameters:
- name: QNN_SDK
displayName: QNN SDK Version
type: string
default: 2.22.0.240425
default: 2.23.0.240531

- name: PYTHON_VERSION
type: string
Expand Down
Loading

0 comments on commit 3ae5df1

Please sign in to comment.