[QNN EP] Update QNN SDK to 2.23.0 (#21008)

### Description
- Updates CI pipelines to use QNN SDK 2.23.0 by default.
- QNN SDK 2.23.0 adds support for int64 Cast. This allows QNN EP to support ONNX ArgMax/ArgMin/TopK operators that generate an int64 graph output.

Example translation of ArgMax:
- **ONNX**: input --> ArgMax --> output (int64)
- **QNN**: input --> ArgMax --> Cast (int32 to int64) --> output (int64)

### Motivation and Context
Update onnxruntime to use the latest QNN SDK.
microsoft · Jun 19, 2024 · 3ae5df1 · 3ae5df1
1 parent 6a0d64e
commit 3ae5df1
Show file tree

Hide file tree

Showing 18 changed files with 54 additions and 160 deletions.
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc
@@ -13,17 +13,12 @@ namespace onnxruntime {
 namespace qnn {
 
 // ArgMax/ArgMin support limitations:
-//  - HTP only: cannot generate a graph output
 //  - HTP only: max input rank is 4.
 //  - All backends: ONNX select_last_index attribute must be 0.
 class ArgMaxMinOpBuilder : public BaseOpBuilder {
  public:
   ArgMaxMinOpBuilder() : BaseOpBuilder("ArgMaxMinOpBuilder") {}
 
-  Status IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
-                       const NodeUnit& node_unit,
-                       const logging::Logger& logger) const override ORT_MUST_USE_RESULT;
-
  protected:
   Qnn_DataType_t GetSupportedOutputDataType(size_t index,
                                             Qnn_DataType_t qnn_data_type) const override ORT_MUST_USE_RESULT;
@@ -35,31 +30,18 @@ class ArgMaxMinOpBuilder : public BaseOpBuilder {
                                      bool do_op_validation) const override ORT_MUST_USE_RESULT;
 };
 
-Status ArgMaxMinOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
-                                         const NodeUnit& node_unit,
-                                         const logging::Logger& logger) const {
-  // ONNX ArgMax/ArgMin ops output int64 indices, but the equivalent QNN ops output uint32 indices.
-  // The QNN HTP backend does not generally support the int64 type, but QNN EP can just use the uint32 type
-  // for ArgMax/ArgMin ops within the graph. However, if the ArgMin/ArgMax op **generates** a graph output,
-  // then we cannot support it on the HTP backend.
-  bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType());
-  if (is_npu_backend) {
-    const std::string& output_name = node_unit.Outputs()[0].node_arg.Name();
-    ORT_RETURN_IF(qnn_model_wrapper.IsGraphOutput(output_name),
-                  "QNN EP does not support ArgMin/ArgMax ops that generate a graph output.");
-  }
-
-  return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, true);
-}
-
 Qnn_DataType_t ArgMaxMinOpBuilder::GetSupportedOutputDataType(size_t index, Qnn_DataType_t qnn_data_type) const {
-  // ONNX ArgMxx ops have int64 output, but QNN requires uint32.
+  // ONNX ArgMxx ops have int64 output, but QNN requires uint32 or int32.
   // If this node produces a graph output, BaseOpBuilder::ProcessOutputs() adds a Cast node after the ArgMxx op.
-  // Otherwise, it just set the output type to unit32. This only works for the QNN CPU backend, since the HTP backend
-  // does not generally support int64.
+  // Otherwise, it just sets the output type to uint32 or int32.
   ORT_UNUSED_PARAMETER(index);
-  ORT_UNUSED_PARAMETER(qnn_data_type);
-  return QNN_DATATYPE_UINT_32;
+  if (qnn_data_type == QNN_DATATYPE_INT_64) {
+    return QNN_DATATYPE_INT_32;
+  } else if (qnn_data_type == QNN_DATATYPE_UINT_64) {
+    return QNN_DATATYPE_UINT_32;
+  }
+
+  return qnn_data_type;
 }
 
 Status ArgMaxMinOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc
@@ -66,17 +66,6 @@ Status TopKOpBuilder::ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const N
   ORT_RETURN_IF_NOT(axis == -1 || axis == static_cast<int32_t>(rank - 1),
                     "QNN TopK's axis is always the last dimension");
 
-  // ONNX TopK outputs int64 indices, but the equivalent QNN op outputs uint32 indices.
-  // The QNN HTP backend does not generally support the int64 type, but QNN EP can just use the uint32 type
-  // for TopK ops within the graph. However, if the TopK op **generates** a graph output,
-  // then we cannot support it on the HTP backend.
-  bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType());
-  if (is_npu_backend) {
-    const std::string& output_name = node_unit.Outputs()[0].node_arg.Name();
-    ORT_RETURN_IF(qnn_model_wrapper.IsGraphOutput(output_name),
-                  "QNN EP does not support TopK ops that generate a graph output.");
-  }
-
   return Status::OK();
 }
 

diff --git a/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc b/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc
@@ -15,28 +15,7 @@
 namespace onnxruntime {
 namespace test {
 
-// Builds a float32 model with ArgMin/ArgMax.
-static GetTestModelFn BuildArgMxxTestCase(const std::string& op_type, TestInputDef<float> input_def,
-                                          const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
-  return [op_type, input_def, attrs](ModelTestBuilder& builder) {
-    auto* input = MakeTestInput(builder, input_def);
-
-    auto* argm_output = builder.MakeIntermediate();
-    Node& argm_node = builder.AddNode(op_type, {input}, {argm_output});
-    for (const auto& attr : attrs) {
-      argm_node.AddAttributeProto(attr);
-    }
-
-    // Add cast to uint32
-    auto* output = builder.MakeOutput();
-    Node& cast_node = builder.AddNode("Cast", {argm_output}, {output});
-    const auto dst_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT32;
-    cast_node.AddAttribute("to", static_cast<int64_t>(dst_type));
-  };
-}
-
-// Builds a QDQ model with ArgMin/ArgMax and a Cast to uint32. The quantization parameters are computed from the provided
-// input definition.
+// Builds a QDQ model with ArgMin/ArgMax. The quantization parameters are computed from the provided input definition.
 template <typename QType = uint8_t>
 static GetTestQDQModelFn<QType> BuildQDQArgMxxTestCase(const std::string& op_type, TestInputDef<float> input_def,
                                                        const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
@@ -49,17 +28,11 @@ static GetTestQDQModelFn<QType> BuildQDQArgMxxTestCase(const std::string& op_typ
 
     // input -> Q -> DQ ->
     auto* input_qdq = AddQDQNodePair<QType>(builder, input, input_qparams.scale, input_qparams.zero_point);
-    auto* argm_output = builder.MakeIntermediate();
+    auto* argm_output = builder.MakeOutput();
     Node& argm_node = builder.AddNode(op_type, {input_qdq}, {argm_output});
     for (const auto& attr : attrs) {
       argm_node.AddAttributeProto(attr);
     }
-
-    // Cast to uint32 (HTP does not support int64 as graph output)
-    auto* output = builder.MakeOutput();
-    Node& cast_node = builder.AddNode("Cast", {argm_output}, {output});
-    const auto dst_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT32;
-    cast_node.AddAttribute("to", static_cast<int64_t>(dst_type));
   };
 }
 
@@ -77,7 +50,7 @@ static void RunCPUArgMxxOpTest(const std::string& op_type, TestInputDef<float> i
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
 
-  RunQnnModelTest(BuildArgMxxTestCase(op_type, input_def, attrs),
+  RunQnnModelTest(BuildOpTestCase<float>(op_type, {input_def}, {}, attrs),
                   provider_options,
                   opset,
                   expected_ep_assignment);
@@ -98,7 +71,7 @@ static void RunQDQArgMxxOpTest(const std::string& op_type, TestInputDef<float> i
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy(BuildArgMxxTestCase(op_type, input_def, attrs),            // baseline float32 model
+  TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}, attrs),   // baseline float32 model
                        BuildQDQArgMxxTestCase<QType>(op_type, input_def, attrs),  // QDQ model
                        provider_options,
                        opset,
@@ -190,48 +163,6 @@ TEST_F(QnnHTPBackendTests, ArgMaxMinU8_RankGreaterThan4_Unsupported) {
                               ExpectedEPNodeAssignment::None, 13);
 }
 
-// Test that ArgMax/ArgMin are not supported if they generate a graph output.
-TEST_F(QnnHTPBackendTests, ArgMaxMin_AsGraphOutputUnsupported) {
-  ProviderOptions provider_options;
-
-#if defined(_WIN32)
-  provider_options["backend_path"] = "QnnHtp.dll";
-#else
-  provider_options["backend_path"] = "libQnnHtp.so";
-#endif
-
-  // Utility function that creates a QDQ model with ArgMax/ArgMin that produce a graph output.
-  auto model_builder_func = [](const std::string& op_type, const TestInputDef<float>& input_def,
-                               const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) -> GetTestModelFn {
-    return [op_type, input_def, attrs](ModelTestBuilder& builder) {
-      QuantParams<uint8_t> input_qparams = GetTestInputQuantParams<uint8_t>(input_def);
-
-      auto* input = MakeTestInput(builder, input_def);
-      auto* output = builder.MakeOutput();
-
-      // input -> Q -> DQ ->
-      auto* input_qdq = AddQDQNodePair<uint8_t>(builder, input, input_qparams.scale, input_qparams.zero_point);
-
-      Node& argm_node = builder.AddNode(op_type, {input_qdq}, {output});
-      for (const auto& attr : attrs) {
-        argm_node.AddAttributeProto(attr);
-      }
-    };
-  };
-
-  const int expected_nodes_in_graph = -1;  // Don't care exactly how many nodes in graph assigned to CPU EP.
-  RunQnnModelTest(model_builder_func("ArgMax", TestInputDef<float>({1, 3, 4}, false, -1.0f, 1.0f), {}),
-                  provider_options,
-                  13,
-                  ExpectedEPNodeAssignment::None,  // No nodes should be assigned to QNN EP!
-                  expected_nodes_in_graph);
-  RunQnnModelTest(model_builder_func("ArgMin", TestInputDef<float>({1, 3, 4}, false, -1.0f, 1.0f), {}),
-                  provider_options,
-                  13,
-                  ExpectedEPNodeAssignment::None,  // No nodes should be assigned to QNN EP!
-                  expected_nodes_in_graph);
-}
-
 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 }  // namespace test
 }  // namespace onnxruntime

diff --git a/onnxruntime/test/providers/qnn/cast_test.cc b/onnxruntime/test/providers/qnn/cast_test.cc
@@ -107,6 +107,20 @@ TEST_F(QnnHTPBackendTests, TestCastFloatToInt32HTP) {
   RunCastOpTest<float>({3, 3}, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32, ExpectedEPNodeAssignment::All,
                        true);
 }
+
+// Cast int64_t to int32_t on HTP
+// Supported in QNN SDK 2.23
+TEST_F(QnnHTPBackendTests, TestCastInt64ToInt32HTP) {
+  RunCastOpTest<int64_t>({3, 3}, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32,
+                         ExpectedEPNodeAssignment::All, true);
+}
+
+// Cast int32_t to int64_t on HTP
+// Supported in QNN SDK 2.23
+TEST_F(QnnHTPBackendTests, TestCastInt32ToInt64HTP) {
+  RunCastOpTest<int32_t>({3, 3}, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT64,
+                         ExpectedEPNodeAssignment::All, true);
+}
 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 
 }  // namespace test

diff --git a/onnxruntime/test/providers/qnn/topk_op_test.cc b/onnxruntime/test/providers/qnn/topk_op_test.cc
@@ -18,27 +18,18 @@ namespace test {
 template <typename DataType>
 inline GetTestModelFn BuildTopKTestCase(const TestInputDef<DataType>& input_def,
                                         const TestInputDef<int64_t>& k_def,
-                                        const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
-                                        bool cast_output_indices = true) {
-  return [input_def, k_def, attrs, cast_output_indices](ModelTestBuilder& builder) {
+                                        const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
+  return [input_def, k_def, attrs](ModelTestBuilder& builder) {
     NodeArg* input = MakeTestInput<DataType>(builder, input_def);
     NodeArg* k_input = MakeTestInput<int64_t>(builder, k_def);
 
     NodeArg* values_output = builder.MakeOutput();
-    NodeArg* indices_output = cast_output_indices ? builder.MakeIntermediate() : builder.MakeOutput();
+    NodeArg* indices_output = builder.MakeOutput();
     Node& topk_node = builder.AddNode("TopK", {input, k_input}, {values_output, indices_output});
 
     for (const auto& attr : attrs) {
       topk_node.AddAttributeProto(attr);
     }
-
-    // Cast indices to uint32
-    if (cast_output_indices) {
-      auto* uint32_indices_output = builder.MakeOutput();
-      Node& cast_node = builder.AddNode("Cast", {indices_output}, {uint32_indices_output});
-      const auto dst_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT32;
-      cast_node.AddAttribute("to", static_cast<int64_t>(dst_type));
-    }
   };
 }
 
@@ -58,7 +49,7 @@ static void RunTopKTestOnCPU(const TestInputDef<DataType>& input_def,
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
 
-  RunQnnModelTest(BuildTopKTestCase<DataType>(input_def, k_def, attrs, false /*cast_output_indices*/),
+  RunQnnModelTest(BuildTopKTestCase<DataType>(input_def, k_def, attrs),
                   provider_options,
                   opset,
                   expected_ep_assignment);
@@ -131,26 +122,19 @@ GetTestQDQModelFn<QuantType> BuildQDQTopKTestCase(const TestInputDef<float>& inp
     // K input
     NodeArg* k_input = MakeTestInput(builder, k_def);
 
-    // Reshape op
+    // TopK_values_output -> Q -> DQ -> output
+    // NOTE: Create output QDQ nodes before the TopK node so that TopK's 'values' output is the graph's first output.
     NodeArg* values_output = builder.MakeIntermediate();
-    NodeArg* indices_output = builder.MakeIntermediate();
+    output_qparams[0] = input_qparams;  // Input and output qparams must be equal.
+    AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, values_output, input_qparams.scale,
+                                                     input_qparams.zero_point, use_contrib_qdq);
+    // TopK node
+    NodeArg* indices_output = builder.MakeOutput();
     Node& topk_node = builder.AddNode("TopK", {input_qdq, k_input}, {values_output, indices_output});
 
     for (const auto& attr : attrs) {
       topk_node.AddAttributeProto(attr);
     }
-
-    // op_output -> Q -> DQ -> output
-    // NOTE: Input and output quantization parameters must be equal for Reshape.
-    output_qparams[0] = input_qparams;  // Overwrite!
-    AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, values_output, input_qparams.scale,
-                                                     input_qparams.zero_point, use_contrib_qdq);
-
-    // Cast indices to uint32 (HTP backend does not support int64 graph outputs)
-    auto* uint32_indices_output = builder.MakeOutput();
-    Node& cast_node = builder.AddNode("Cast", {indices_output}, {uint32_indices_output});
-    const auto dst_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT32;
-    cast_node.AddAttribute("to", static_cast<int64_t>(dst_type));
   };
 }
 
@@ -171,7 +155,7 @@ static void RunQDQTopKTestOnHTP(const TestInputDef<float>& input_def,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  auto f32_model_builder = BuildTopKTestCase<float>(input_def, k_def, attrs, true /*cast_output_indices*/);
+  auto f32_model_builder = BuildTopKTestCase<float>(input_def, k_def, attrs);
   auto qdq_model_builder = BuildQDQTopKTestCase<QType>(input_def, k_def, attrs, use_contrib_qdq);
   TestQDQModelAccuracy(f32_model_builder,
                        qdq_model_builder,
@@ -189,18 +173,12 @@ TEST_F(QnnHTPBackendTests, TopK_LargestFloats_U8_LastAxis) {
 }
 
 // Test 16-bit QDQ TopK on HTP backend: top 2 largest floats from last axis
-// TODO: Inaccuracy detected for output 'output_0', element 6.
-// Output quant params: scale=0.00061036087572574615, zero_point=32768.
-// Expected val: -7.2340402603149414
-// QNN QDQ val: -17.446556091308594 (err 10.212515830993652)
-// CPU QDQ val: -7.2339968681335449 (err 4.3392181396484375e-05)
-TEST_F(QnnHTPBackendTests, DISABLED_TopK_LargestFloats_U16_LastAxis) {
+TEST_F(QnnHTPBackendTests, TopK_LargestFloats_U16_LastAxis) {
   RunQDQTopKTestOnHTP<uint16_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-20.0f, 20.0f, 48)),
                                 TestInputDef<int64_t>({1}, true /* is_initializer */, {2}),
                                 {},  // Attributes
                                 ExpectedEPNodeAssignment::All,
-                                19,     // opset
-                                true);  // Use com.microsoft Q/DQ ops
+                                21);  // opset
 }
 
 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)

diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml
@@ -31,7 +31,7 @@ parameters:
 - name: QnnSdk
   displayName: QNN SDK version
   type: string
-  default: 2.22.0.240425
+  default: 2.23.0.240531
 
 jobs:
 - job: Build_QNN_EP

diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
@@ -62,7 +62,7 @@ parameters:
 - name: QnnSdk
   displayName: QNN SDK Version
   type: string
-  default: 2.22.0.240425
+  default: 2.23.0.240531
 
 resources:
   repositories:

diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml
@@ -32,7 +32,7 @@ parameters:
 - name: QnnSdk
   displayName: QNN SDK version
   type: string
-  default: 2.22.0.240425
+  default: 2.23.0.240531
 
 jobs:
   - job: Build_QNN_EP

diff --git a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml
@@ -59,7 +59,7 @@ parameters:
 - name: qnn_sdk_version
   type: string
   displayName: 'QNN SDK version. Only for QNN packages.'
-  default: 2.22.0.240425
+  default: 2.23.0.240531
 
 trigger: none
 

diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml
@@ -2,7 +2,7 @@ parameters:
 - name: QnnSdk
   displayName: QNN SDK Version
   type: string
-  default: 2.22.0.240425
+  default: 2.23.0.240531
 
 - name: build_config
   displayName: Build Configuration

diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml
@@ -1,7 +1,7 @@
 parameters:
   - name: QnnSDKVersion
     type: string
-    default: '2.22.0.240425'
+    default: '2.23.0.240531'
 
 steps:
   - script: |

diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml
@@ -1,7 +1,7 @@
 parameters:
   - name: QnnSDKVersion
     type: string
-    default: '2.22.0.240425'
+    default: '2.23.0.240531'
 
 steps:
   - powershell: |

diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
@@ -63,7 +63,7 @@ parameters:
 - name: qnn_sdk_version
   type: string
   displayName: 'QNN SDK version. Only for QNN packages.'
-  default: 2.22.0.240425
+  default: 2.23.0.240531
 
 stages:
 - ${{ if eq(parameters.enable_windows_cpu, true) }}:

diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml
@@ -7,7 +7,7 @@ parameters:
 - name: QNN_SDK
   displayName: QNN SDK Version
   type: string
-  default: 2.22.0.240425
+  default: 2.23.0.240531
 
 - name: PYTHON_VERSION
   type: string