From b4212b82587179f37e2e75de870ced680946725c Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Thu, 7 Sep 2023 16:04:12 -0700
Subject: [PATCH 01/22] Add unit tests for QNN Reshape

---
 .../test/providers/qnn/reshape_op_test.cc     | 226 ++++++++++++++++++
 1 file changed, 226 insertions(+)
 create mode 100644 onnxruntime/test/providers/qnn/reshape_op_test.cc
diff --git a/onnxruntime/test/providers/qnn/reshape_op_test.cc b/onnxruntime/test/providers/qnn/reshape_op_test.cc
new file mode 100644
index 0000000000000..d295c3ef4d7aa
--- /dev/null
+++ b/onnxruntime/test/providers/qnn/reshape_op_test.cc
@@ -0,0 +1,226 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#if !defined(ORT_MINIMAL_BUILD)
+
+#include <string>
+
+#include "test/providers/qnn/qnn_test_utils.h"
+#include "core/graph/node_attr_utils.h"
+
+#include "onnx/onnx_pb.h"
+#include "gtest/gtest.h"
+
+namespace onnxruntime {
+namespace test {
+
+// Returns a function that creates a graph with a single MaxPool operator.
+template <typename DataType>
+static GetTestModelFn BuildReshapeTestCase(const TestInputDef<DataType>& input_def,
+                                           const TestInputDef<int64_t>& shape_def,
+                                           const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
+  return [input_def, shape_def, attrs](ModelTestBuilder& builder) {
+    NodeArg* input = MakeTestInput(builder, input_def);
+    NodeArg* shape_input = MakeTestInput(builder, shape_def);
+    NodeArg* output = builder.MakeOutput();
+    Node& reshape_node = builder.AddNode("Reshape", {input, shape_input}, {output});
+
+    for (const auto& attr : attrs) {
+      reshape_node.AddAttributeProto(attr);
+    }
+  };
+}
+
+// Returns a function that creates a graph with a QDQ Reshape operator.
+template <typename QuantType>
+GetTestQDQModelFn<QuantType> BuildQDQReshapeTestCase(const TestInputDef<float>& input_def,
+                                                     const TestInputDef<int64_t>& shape_def,
+                                                     const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
+  return [input_def, shape_def, attrs](ModelTestBuilder& builder,
+                                       std::vector<QuantParams<QuantType>>& output_qparams) {
+    // input -> Q -> DQ ->
+    NodeArg* input = MakeTestInput(builder, input_def);
+    QuantParams<QuantType> input_qparams = GetTestInputQuantParams<QuantType>(input_def);
+    NodeArg* input_qdq = AddQDQNodePair<QuantType>(builder, input, input_qparams.scale, input_qparams.zero_point);
+
+    // shape input
+    NodeArg* shape_input = MakeTestInput(builder, shape_def);
+
+    // Reshape op
+    NodeArg* reshape_output = builder.MakeIntermediate();
+    Node& reshape_node = builder.AddNode("Reshape", {input_qdq, shape_input}, {reshape_output});
+
+    for (const auto& attr : attrs) {
+      reshape_node.AddAttributeProto(attr);
+    }
+
+    // op_output -> Q -> DQ -> output
+    // NOTE: Input and output quantization parameters must be equal for Reshape.
+    output_qparams[0] = input_qparams;  // Overwrite!
+    AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, reshape_output, input_qparams.scale,
+                                                     input_qparams.zero_point);
+  };
+}
+
+// Runs a model with a Reshape operator on the QNN CPU backend. Checks the graph node assignment,
+// and that inference outputs for QNN EP and CPU EP match.
+template <typename DataType>
+static void RunReshapeTestOnCPU(const TestInputDef<DataType>& input_def,
+                                const TestInputDef<int64_t>& shape_def,
+                                const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                ExpectedEPNodeAssignment expected_ep_assignment,
+                                int opset = 19) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnCpu.dll";
+#else
+  provider_options["backend_path"] = "libQnnCpu.so";
+#endif
+
+  RunQnnModelTest(BuildReshapeTestCase(input_def, shape_def, attrs),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+// Runs a model with a non-QDQ Reshape operator on the QNN HTP backend. Checks the graph node assignment,
+// and that inference outputs for QNN EP and CPU EP match.
+template <typename DataType>
+static void RunReshapeTestOnHTP(const TestInputDef<DataType>& input_def,
+                                const TestInputDef<int64_t>& shape_def,
+                                const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                ExpectedEPNodeAssignment expected_ep_assignment,
+                                int opset = 19) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  RunQnnModelTest(BuildReshapeTestCase(input_def, shape_def, attrs),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+// Runs a QDQ Reshape model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment, and that inference
+// running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (when compared to the baseline float32 model).
+template <typename QType>
+static void RunQDQReshapeTestOnHTP(const TestInputDef<float>& input_def,
+                                   const TestInputDef<int64_t>& shape_def,
+                                   const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                   ExpectedEPNodeAssignment expected_ep_assignment,
+                                   int opset = 19) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  TestQDQModelAccuracy(BuildReshapeTestCase(input_def, shape_def, attrs),            // baseline float32 model
+                       BuildQDQReshapeTestCase<QType>(input_def, shape_def, attrs),  // QDQ model
+                       provider_options,
+                       opset,
+                       expected_ep_assignment);
+}
+
+//
+// CPU tests:
+//
+
+// Test that Reshape with a dynamic shape input is not supported by QNN EP.
+TEST_F(QnnCPUBackendTests, Reshape_DynamicShape_Unsupported) {
+  RunReshapeTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                      TestInputDef<int64_t>({2}, false /* is_initializer */, {1, 48}),
+                      {},                              // Attributes
+                      ExpectedEPNodeAssignment::None,  // Should not be assigned to QNN EP.
+                      19);                             // Opset
+}
+
+// Test that Reshape with an enabled 'allowzero' attribute is not supported by QNN EP.
+TEST_F(QnnCPUBackendTests, Reshape_AllowZeroAttr_Unsupported) {
+  RunReshapeTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                      TestInputDef<int64_t>({2}, true, {1, 48}),
+                      {utils::MakeAttribute("allowzero", static_cast<int64_t>(1))},
+                      ExpectedEPNodeAssignment::None,  // Should not be assigned to QNN EP.
+                      19);                             // Opset
+}
+
+// Test Reshape of rank 4 -> rank 2.
+TEST_F(QnnCPUBackendTests, Reshape_4D_f32) {
+  RunReshapeTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                      TestInputDef<int64_t>({2}, true, {1, 48}),
+                      {},  // Attributes
+                      ExpectedEPNodeAssignment::All,
+                      19);  // Opset
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+//
+// HTP tests:
+//
+
+// Test that QDQ Reshape with a dynamic shape input is not supported by QNN EP.
+TEST_F(QnnHTPBackendTests, Reshape_DynamicShape_Unsupported) {
+  RunQDQReshapeTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                                  TestInputDef<int64_t>({2}, false /* is_initializer */, {1, 48}),
+                                  {},                              // Attributes
+                                  ExpectedEPNodeAssignment::None,  // Should not be assigned to QNN EP.
+                                  19);                             // Opset
+}
+
+// Test that QDQ Reshape with an enabled 'allowzero' attribute is not supported by QNN EP.
+TEST_F(QnnHTPBackendTests, Reshape_AllowZeroAttr_Unsupported) {
+  RunQDQReshapeTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                                  TestInputDef<int64_t>({2}, true, {1, 48}),
+                                  {utils::MakeAttribute("allowzero", static_cast<int64_t>(1))},
+                                  ExpectedEPNodeAssignment::None,  // Should not be assigned to QNN EP.
+                                  19);                             // Opset
+}
+
+// Test QDQ Reshape of rank 4 -> rank 2.
+TEST_F(QnnHTPBackendTests, Reshape_4D_f32) {
+  RunQDQReshapeTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                                  TestInputDef<int64_t>({2}, true, {1, 48}),
+                                  {},  // Attributes
+                                  ExpectedEPNodeAssignment::All,
+                                  19);  // Opset
+}
+
+// Test that int32 Reshape runs on HTP backend.
+TEST_F(QnnHTPBackendTests, Reshape_4D_int32) {
+  std::vector<int32_t> input_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  RunReshapeTestOnHTP<int32_t>(TestInputDef<int32_t>({1, 3, 2, 2}, false, input_data),
+                               TestInputDef<int64_t>({3}, true, {1, 1, 12}),
+                               {},  // Attributes
+                               ExpectedEPNodeAssignment::All,
+                               19);  // Opset
+}
+
+// Test QDQ Reshape with a shape value of 0 (copy dimension from input)
+TEST_F(QnnHTPBackendTests, Reshape_4D_0MeansCopy) {
+  RunQDQReshapeTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                                  TestInputDef<int64_t>({3}, true, {1, 0, 16}),  // zero means copy => '(1, 3, 16)'
+                                  {},                                            // Attributes
+                                  ExpectedEPNodeAssignment::All,
+                                  19);  // Opset
+}
+
+// Test QDQ Reshape with a shape value of -1 (dimension is inferred from the expect number of elements)
+TEST_F(QnnHTPBackendTests, Reshape_4D_Neg1MeansInfer) {
+  RunQDQReshapeTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                                  TestInputDef<int64_t>({3}, true, {1, 3, -1}),  // -1 means infer => '(1, 3, 16)'
+                                  {},                                            // Attributes
+                                  ExpectedEPNodeAssignment::All,
+                                  19);  // Opset
+}
+
+#endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+}  // namespace test
+}  // namespace onnxruntime
+#endif  // !defined(ORT_MINIMAL_BUILD)

From 1b6135a5a96f5aad94d25a8e3cf5bf865f1f9d5c Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Thu, 7 Sep 2023 16:11:02 -0700
Subject: [PATCH 02/22] Clean up comments

---
 onnxruntime/test/providers/qnn/reshape_op_test.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/onnxruntime/test/providers/qnn/reshape_op_test.cc b/onnxruntime/test/providers/qnn/reshape_op_test.cc
index d295c3ef4d7aa..9615e54ed61d2 100644
--- a/onnxruntime/test/providers/qnn/reshape_op_test.cc
+++ b/onnxruntime/test/providers/qnn/reshape_op_test.cc
@@ -14,7 +14,7 @@
 namespace onnxruntime {
 namespace test {
 
-// Returns a function that creates a graph with a single MaxPool operator.
+// Returns a function that creates a graph with a single Reshape operator.
 template <typename DataType>
 static GetTestModelFn BuildReshapeTestCase(const TestInputDef<DataType>& input_def,
                                            const TestInputDef<int64_t>& shape_def,
@@ -62,7 +62,7 @@ GetTestQDQModelFn<QuantType> BuildQDQReshapeTestCase(const TestInputDef<float>&
   };
 }
 
-// Runs a model with a Reshape operator on the QNN CPU backend. Checks the graph node assignment,
+// Runs a model with a Reshape operator on the QNN CPU backend. Checks the graph node assignment
 // and that inference outputs for QNN EP and CPU EP match.
 template <typename DataType>
 static void RunReshapeTestOnCPU(const TestInputDef<DataType>& input_def,
@@ -84,7 +84,7 @@ static void RunReshapeTestOnCPU(const TestInputDef<DataType>& input_def,
                   expected_ep_assignment);
 }
 
-// Runs a model with a non-QDQ Reshape operator on the QNN HTP backend. Checks the graph node assignment,
+// Runs a model with a non-QDQ Reshape operator on the QNN HTP backend. Checks the graph node assignment
 // and that inference outputs for QNN EP and CPU EP match.
 template <typename DataType>
 static void RunReshapeTestOnHTP(const TestInputDef<DataType>& input_def,
@@ -106,7 +106,7 @@ static void RunReshapeTestOnHTP(const TestInputDef<DataType>& input_def,
                   expected_ep_assignment);
 }
 
-// Runs a QDQ Reshape model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment, and that inference
+// Runs a QDQ Reshape model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment and that inference
 // running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (when compared to the baseline float32 model).
 template <typename QType>
 static void RunQDQReshapeTestOnHTP(const TestInputDef<float>& input_def,
@@ -211,7 +211,7 @@ TEST_F(QnnHTPBackendTests, Reshape_4D_0MeansCopy) {
                                   19);  // Opset
 }
 
-// Test QDQ Reshape with a shape value of -1 (dimension is inferred from the expect number of elements)
+// Test QDQ Reshape with a shape value of -1 (dimension is inferred from the expected number of elements)
 TEST_F(QnnHTPBackendTests, Reshape_4D_Neg1MeansInfer) {
   RunQDQReshapeTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
                                   TestInputDef<int64_t>({3}, true, {1, 3, -1}),  // -1 means infer => '(1, 3, 16)'

From 06992bc3cfd5041356ccaf13bcbace2122dd77db Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Fri, 15 Sep 2023 14:07:50 -0700
Subject: [PATCH 03/22] Add QNN EP tests for the ONNX Flatten op

---
 .../test/providers/qnn/flatten_op_test.cc     | 180 ++++++++++++++++++
 1 file changed, 180 insertions(+)
 create mode 100644 onnxruntime/test/providers/qnn/flatten_op_test.cc

diff --git a/onnxruntime/test/providers/qnn/flatten_op_test.cc b/onnxruntime/test/providers/qnn/flatten_op_test.cc
new file mode 100644
index 0000000000000..ac00b8533f297
--- /dev/null
+++ b/onnxruntime/test/providers/qnn/flatten_op_test.cc
@@ -0,0 +1,180 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#if !defined(ORT_MINIMAL_BUILD)
+
+#include <string>
+
+#include "test/providers/qnn/qnn_test_utils.h"
+#include "core/graph/node_attr_utils.h"
+
+#include "onnx/onnx_pb.h"
+#include "gtest/gtest.h"
+
+namespace onnxruntime {
+namespace test {
+
+// Runs a model with a Flatten operator on the QNN CPU backend. Checks the graph node assignment
+// and that inference outputs for QNN EP and CPU EP match.
+template <typename DataType>
+static void RunFlattenTestOnCPU(const TestInputDef<DataType>& input_def,
+                                const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                ExpectedEPNodeAssignment expected_ep_assignment,
+                                int opset = 13) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnCpu.dll";
+#else
+  provider_options["backend_path"] = "libQnnCpu.so";
+#endif
+
+  RunQnnModelTest(BuildOpTestCase<DataType>("Flatten", {input_def}, attrs),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+// Runs a model with a non-QDQ Flatten operator on the QNN HTP backend. Checks the graph node assignment
+// and that inference outputs for QNN EP and CPU EP match.
+template <typename DataType>
+static void RunFlattenTestOnHTP(const TestInputDef<DataType>& input_def,
+                                const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                ExpectedEPNodeAssignment expected_ep_assignment,
+                                int opset = 12) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  RunQnnModelTest(BuildOpTestCase<DataType>("Flatten", {input_def}, attrs),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+// Runs a QDQ Flatten model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment and that inference
+// running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (compared to the baseline float32 model).
+template <typename QType>
+static void RunQDQFlattenTestOnHTP(const TestInputDef<float>& input_def,
+                                   const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                   ExpectedEPNodeAssignment expected_ep_assignment,
+                                   int opset = 13) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  TestQDQModelAccuracy(BuildOpTestCase<float>("Flatten", {input_def}, attrs),     // baseline float32 model
+                       BuildQDQOpTestCase<QType>("Flatten", {input_def}, attrs),  // QDQ model
+                       provider_options,
+                       opset,
+                       expected_ep_assignment);
+}
+
+//
+// CPU tests:
+//
+
+// Test that Flatten input (rank4) with axis == 0.
+TEST_F(QnnCPUBackendTests, Flatten_Rank4_Axis0) {
+  RunFlattenTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                      {utils::MakeAttribute("axis", static_cast<int64_t>(0))},  // Attributes
+                      ExpectedEPNodeAssignment::All);
+}
+
+// Test that Flatten input (rank4) with axis == -1.
+TEST_F(QnnCPUBackendTests, Flatten_Rank4_AxisNeg1) {
+  RunFlattenTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                      {utils::MakeAttribute("axis", static_cast<int64_t>(-1))},  // Attributes
+                      ExpectedEPNodeAssignment::All);
+}
+
+// Test that Flatten input (rank5) with axis == 2.
+TEST_F(QnnCPUBackendTests, Flatten_Rank5_Axis2) {
+  RunFlattenTestOnCPU(TestInputDef<float>({1, 2, 3, 4, 4}, false, -10.0f, 10.0f),
+                      {utils::MakeAttribute("axis", static_cast<int64_t>(2))},  // Attributes
+                      ExpectedEPNodeAssignment::All);
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+//
+// HTP tests:
+//
+// Test QDQ Flatten of a rank 4 input with axis == 0.
+TEST_F(QnnHTPBackendTests, Flatten_Rank4_Axis0) {
+  RunQDQFlattenTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                                  {utils::MakeAttribute("axis", static_cast<int64_t>(0))},
+                                  ExpectedEPNodeAssignment::All);
+}
+
+// Test QDQ Flatten of a rank 4 input with axis == -1.
+TEST_F(QnnHTPBackendTests, Flatten_Rank4_AxisNeg1) {
+  RunQDQFlattenTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                                  {utils::MakeAttribute("axis", static_cast<int64_t>(-1))},
+                                  ExpectedEPNodeAssignment::All);
+}
+
+// Test QDQ Flatten with an input of rank5.
+TEST_F(QnnHTPBackendTests, Flatten_QDQ8bit_Rank5) {
+  // We can't use the usual model-building functions because they add standalone Quantize and Dequantize nodes
+  // at the input and output. These Q/DQ ops get lowered to QNN's Quantize and Dequantize operators, which DO NOT
+  // support rank 5 tensors. Therefore, we have to create a test model that only instantiates the DQ -> Flatten -> Q
+  // QDQ node group, which gets lowered to a single QNN Reshape node.
+  GetTestModelFn model_fn = [](ModelTestBuilder& builder) {
+    // input (u8) -> DQ ->
+    NodeArg* quant_input = builder.MakeInput<uint8_t>({1, 2, 3, 4, 5}, 0, 255);
+    NodeArg* input_dq = builder.MakeIntermediate();
+    builder.AddDequantizeLinearNode<uint8_t>(quant_input, 1.0f, 0, input_dq);  // scale = 1.0, zp = 0
+
+    // Flatten ->
+    NodeArg* flatten_output = builder.MakeIntermediate();
+    Node& flatten_node = builder.AddNode("Flatten", {input_dq}, {flatten_output});
+    flatten_node.AddAttribute("axis", static_cast<int64_t>(2));
+
+    // Q -> output (u8)
+    NodeArg* output = builder.MakeOutput();
+    builder.AddQuantizeLinearNode<uint8_t>(flatten_output, 1.0f, 0, output);  // scale = 1.0, zp = 0
+  };
+
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  RunQnnModelTest(model_fn,
+                  provider_options,
+                  13,  // opset
+                  ExpectedEPNodeAssignment::All);
+}
+
+// Test that rank 4 int32 Flatten runs on HTP backend.
+TEST_F(QnnHTPBackendTests, Flatten_Int32_Rank4_Axis2) {
+  std::vector<int32_t> input_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  RunFlattenTestOnHTP<int32_t>(TestInputDef<int32_t>({1, 3, 2, 2}, false, input_data),
+                               {utils::MakeAttribute("axis", static_cast<int64_t>(2))},
+                               ExpectedEPNodeAssignment::All);
+}
+
+// Test that rank 5 int32 Flatten runs on HTP backend.
+TEST_F(QnnHTPBackendTests, Flatten_Int32_Rank5_Axis2) {
+  std::vector<int32_t> input_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+                                     12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
+  RunFlattenTestOnHTP<int32_t>(TestInputDef<int32_t>({1, 3, 2, 2, 2}, false, input_data),
+                               {utils::MakeAttribute("axis", static_cast<int64_t>(2))},
+                               ExpectedEPNodeAssignment::All);
+}
+
+#endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+}  // namespace test
+}  // namespace onnxruntime
+#endif  // !defined(ORT_MINIMAL_BUILD)

From ebea003a11b47f8762b84a436efdd67a8bb1c2f5 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Fri, 15 Sep 2023 14:32:20 -0700
Subject: [PATCH 04/22] Use correct opset version for Flatten

---
 onnxruntime/test/providers/qnn/flatten_op_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/test/providers/qnn/flatten_op_test.cc b/onnxruntime/test/providers/qnn/flatten_op_test.cc
index ac00b8533f297..42484a8ec2c77 100644
--- a/onnxruntime/test/providers/qnn/flatten_op_test.cc
+++ b/onnxruntime/test/providers/qnn/flatten_op_test.cc
@@ -41,7 +41,7 @@ template <typename DataType>
 static void RunFlattenTestOnHTP(const TestInputDef<DataType>& input_def,
                                 const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
                                 ExpectedEPNodeAssignment expected_ep_assignment,
-                                int opset = 12) {
+                                int opset = 13) {
   ProviderOptions provider_options;
 
 #if defined(_WIN32)

From d4b19fb5d2ef60238e5f4d9ba88b2cfc4925bca1 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Fri, 15 Sep 2023 15:06:37 -0700
Subject: [PATCH 05/22] Remove unnecessary comments

---
 onnxruntime/test/providers/qnn/flatten_op_test.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/test/providers/qnn/flatten_op_test.cc b/onnxruntime/test/providers/qnn/flatten_op_test.cc
index 42484a8ec2c77..7952cc7b6ea1d 100644
--- a/onnxruntime/test/providers/qnn/flatten_op_test.cc
+++ b/onnxruntime/test/providers/qnn/flatten_op_test.cc
@@ -85,21 +85,21 @@ static void RunQDQFlattenTestOnHTP(const TestInputDef<float>& input_def,
 // Test that Flatten input (rank4) with axis == 0.
 TEST_F(QnnCPUBackendTests, Flatten_Rank4_Axis0) {
   RunFlattenTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
-                      {utils::MakeAttribute("axis", static_cast<int64_t>(0))},  // Attributes
+                      {utils::MakeAttribute("axis", static_cast<int64_t>(0))},
                       ExpectedEPNodeAssignment::All);
 }
 
 // Test that Flatten input (rank4) with axis == -1.
 TEST_F(QnnCPUBackendTests, Flatten_Rank4_AxisNeg1) {
   RunFlattenTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
-                      {utils::MakeAttribute("axis", static_cast<int64_t>(-1))},  // Attributes
+                      {utils::MakeAttribute("axis", static_cast<int64_t>(-1))},
                       ExpectedEPNodeAssignment::All);
 }
 
 // Test that Flatten input (rank5) with axis == 2.
 TEST_F(QnnCPUBackendTests, Flatten_Rank5_Axis2) {
   RunFlattenTestOnCPU(TestInputDef<float>({1, 2, 3, 4, 4}, false, -10.0f, 10.0f),
-                      {utils::MakeAttribute("axis", static_cast<int64_t>(2))},  // Attributes
+                      {utils::MakeAttribute("axis", static_cast<int64_t>(2))},
                       ExpectedEPNodeAssignment::All);
 }
 

From 8b79d15739e4493939a8c3fa9e329614392bc160 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Sat, 16 Sep 2023 18:10:17 -0700
Subject: [PATCH 06/22] Add QNN tests for Squeeze and Unsqueeze operators

---
 .../qnn/squeeze_unsqueeze_op_test.cc          | 310 ++++++++++++++++++
 1 file changed, 310 insertions(+)
 create mode 100644 onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc

diff --git a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
new file mode 100644
index 0000000000000..05006f7eaf9b5
--- /dev/null
+++ b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
@@ -0,0 +1,310 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#if !defined(ORT_MINIMAL_BUILD)
+
+#include <string>
+
+#include "test/providers/qnn/qnn_test_utils.h"
+
+#include "onnx/onnx_pb.h"
+#include "gtest/gtest.h"
+
+namespace onnxruntime {
+namespace test {
+
+// Returns a function that creates a graph with a single (Un)Squeeze operator.
+template <typename DataType>
+static GetTestModelFn BuildSqueezeTestCase(const std::string& op_type,  // Squeeze or Unsqueeze
+                                           const TestInputDef<DataType>& input_def,
+                                           const TestInputDef<int64_t>& axes_def) {
+  return [op_type, input_def, axes_def](ModelTestBuilder& builder) {
+    NodeArg* input = MakeTestInput(builder, input_def);
+    NodeArg* axes_input = MakeTestInput(builder, axes_def);
+    NodeArg* output = builder.MakeOutput();
+    builder.AddNode(op_type, {input, axes_input}, {output});
+  };
+}
+
+// Returns a function that creates a graph with a QDQ (Un)Squeeze operator.
+template <typename QuantType>
+GetTestQDQModelFn<QuantType> BuildQDQSqueezeTestCase(const std::string& op_type,  // Squeeze or Unsqueeze
+                                                     const TestInputDef<float>& input_def,
+                                                     const TestInputDef<int64_t>& axes_def) {
+  return [op_type, input_def, axes_def](ModelTestBuilder& builder,
+                                        std::vector<QuantParams<QuantType>>& output_qparams) {
+    // input -> Q -> DQ ->
+    NodeArg* input = MakeTestInput(builder, input_def);
+    QuantParams<QuantType> input_qparams = GetTestInputQuantParams<QuantType>(input_def);
+    NodeArg* input_qdq = AddQDQNodePair<QuantType>(builder, input, input_qparams.scale, input_qparams.zero_point);
+
+    // axes input
+    NodeArg* axes_input = MakeTestInput(builder, axes_def);
+
+    // (Un)Squeeze op
+    NodeArg* op_output = builder.MakeIntermediate();
+    builder.AddNode(op_type, {input_qdq, axes_input}, {op_output});
+
+    // op_output -> Q -> DQ -> output
+    // NOTE: Input and output quantization parameters must be equal for (Un)Squeeze.
+    output_qparams[0] = input_qparams;  // Overwrite!
+    AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, op_output, input_qparams.scale,
+                                                     input_qparams.zero_point);
+  };
+}
+
+// Runs a model with a Squeeze (or Unsqueeze) operator on the QNN CPU backend. Checks the graph node assignment
+// and that inference outputs for QNN EP and CPU EP match.
+template <typename DataType>
+static void RunSqueezeTestOnCPU(const std::string& op_type,  // Squeeze or Unsqueeze
+                                const TestInputDef<DataType>& input_def,
+                                const TestInputDef<int64_t>& axes_def,
+                                ExpectedEPNodeAssignment expected_ep_assignment,
+                                int opset = 13) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnCpu.dll";
+#else
+  provider_options["backend_path"] = "libQnnCpu.so";
+#endif
+
+  RunQnnModelTest(BuildSqueezeTestCase<DataType>(op_type, input_def, axes_def),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+// Runs a model with a non-QDQ (Un)Squeeze operator on the QNN HTP backend. Checks the graph node assignment
+// and that inference outputs for QNN EP and CPU EP match.
+template <typename DataType>
+static void RunSqueezeTestOnHTP(const std::string& op_type,  // Squeeze or Unsqueeze
+                                const TestInputDef<DataType>& input_def,
+                                const TestInputDef<int64_t>& axes_def,
+                                ExpectedEPNodeAssignment expected_ep_assignment,
+                                int opset = 13) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  RunQnnModelTest(BuildSqueezeTestCase<DataType>(op_type, input_def, axes_def),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+// Runs a QDQ (Un)Squeeze model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment and
+// that inference running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP
+// (when compared to the baseline float32 model).
+template <typename QType>
+static void RunQDQSqueezeTestOnHTP(const std::string& op_type,
+                                   const TestInputDef<float>& input_def,
+                                   const TestInputDef<int64_t>& axes_def,
+                                   ExpectedEPNodeAssignment expected_ep_assignment,
+                                   int opset = 13) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  TestQDQModelAccuracy(BuildSqueezeTestCase<float>(op_type, input_def, axes_def),     // baseline float32 model
+                       BuildQDQSqueezeTestCase<QType>(op_type, input_def, axes_def),  // QDQ model
+                       provider_options,
+                       opset,
+                       expected_ep_assignment);
+}
+
+//
+// CPU tests:
+//
+
+// Test that Squeeze with a dynamic axes input is not supported by QNN EP.
+TEST_F(QnnCPUBackendTests, Squeeze_DynamicAxes_Unsupported) {
+  RunSqueezeTestOnCPU("Squeeze",
+                      TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                      TestInputDef<int64_t>({1}, false /* is_initializer */, {0}),
+                      ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
+}
+
+// Test that Unsqueeze with a dynamic axes input is not supported by QNN EP.
+TEST_F(QnnCPUBackendTests, Unsqueeze_DynamicAxes_Unsupported) {
+  RunSqueezeTestOnCPU("Unsqueeze",
+                      TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                      TestInputDef<int64_t>({1}, false /* is_initializer */, {0}),
+                      ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
+}
+
+// Test Squeeze of rank 5 -> rank 3 (axes 0 and 2 are removed).
+TEST_F(QnnCPUBackendTests, Squeeze_Rank5_Rank2_f32) {
+  RunSqueezeTestOnCPU("Squeeze",
+                      TestInputDef<float>({1, 3, 1, 2, 4}, false, -10.0f, 10.0f),
+                      TestInputDef<int64_t>({2}, true, {0, 2}),  // Squeeze axes 0 and 2 => (3, 2, 4)
+                      ExpectedEPNodeAssignment::All);
+}
+
+// Test Squeeze of rank 4 -> rank 3 with a negative axes value.
+TEST_F(QnnCPUBackendTests, Squeeze_Rank4_Rank3_NegAxes_f32) {
+  RunSqueezeTestOnCPU("Squeeze",
+                      TestInputDef<float>({1, 3, 2, 1}, false, -10.0f, 10.0f),
+                      TestInputDef<int64_t>({1}, true, {-1}),  // Squeeze last axis => (1, 3, 2)
+                      ExpectedEPNodeAssignment::All);
+}
+
+// Test Unsqueeze of rank 3 -> rank 5.
+TEST_F(QnnCPUBackendTests, Unsqueeze_Rank3_Rank5_f32) {
+  RunSqueezeTestOnCPU("Unsqueeze",
+                      TestInputDef<float>({3, 2, 4}, false, -10.0f, 10.0f),
+                      TestInputDef<int64_t>({2}, true, {0, 2}),  // Add 1's => (1, 3, 1, 2, 4)
+                      ExpectedEPNodeAssignment::All);
+}
+
+// Test Unsqueeze of rank 3 -> rank 4 with a negative axes value.
+TEST_F(QnnCPUBackendTests, Unsqueeze_Rank3_Rank4_NegAxes_f32) {
+  RunSqueezeTestOnCPU("Unsqueeze",
+                      TestInputDef<float>({1, 3, 2}, false, -10.0f, 10.0f),
+                      TestInputDef<int64_t>({1}, true, {-1}),  // Add 1 as last axis => (1, 3, 2, 1)
+                      ExpectedEPNodeAssignment::All);
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+//
+// HTP tests:
+//
+
+// Test that QDQ Squeeze with a dynamic axes input is not supported by QNN EP.
+TEST_F(QnnHTPBackendTests, Squeeze_DynamicAxes_Unsupported) {
+  RunQDQSqueezeTestOnHTP<uint8_t>("Squeeze",
+                                  TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                                  TestInputDef<int64_t>({1}, false /* is_initializer */, {0}),
+                                  ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
+}
+
+// Test that QDQ Unsqueeze with a dynamic axes input is not supported by QNN EP.
+TEST_F(QnnHTPBackendTests, Unsqueeze_DynamicAxes_Unsupported) {
+  RunQDQSqueezeTestOnHTP<uint8_t>("Unsqueeze",
+                                  TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                                  TestInputDef<int64_t>({1}, false /* is_initializer */, {0}),
+                                  ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
+}
+
+// Test Squeeze of rank 5 -> rank 3 (axes 0 and 2 are removed).
+TEST_F(QnnHTPBackendTests, Squeeze_Rank5_Rank2_f32) {
+  // We can't use the usual model-building functions because they add standalone Quantize and Dequantize nodes
+  // at the input and output. These Q/DQ ops get lowered to QNN's Quantize and Dequantize operators, which DO NOT
+  // support rank 5 tensors. Therefore, we have to create a test model that only instantiates the DQ -> Squeeze -> Q
+  // QDQ node group, which gets lowered to a single QNN Reshape node.
+  GetTestModelFn model_fn = [](ModelTestBuilder& builder) {
+    // input (u8) -> DQ ->
+    NodeArg* quant_input = builder.MakeInput<uint8_t>({1, 3, 1, 2, 4}, 0, 255);
+    NodeArg* input_dq = builder.MakeIntermediate();
+    builder.AddDequantizeLinearNode<uint8_t>(quant_input, 1.0f, 0, input_dq);  // scale = 1.0, zp = 0
+
+    // axes_input ->
+    NodeArg* axes_input = builder.Make1DInitializer<int64_t>({0, 2});  // Squeeze axes 0 and 2 => (3, 2, 4)
+
+    // Squeeze ->
+    NodeArg* squeeze_output = builder.MakeIntermediate();
+    builder.AddNode("Squeeze", {input_dq, axes_input}, {squeeze_output});
+
+    // Q -> output (u8)
+    NodeArg* output = builder.MakeOutput();
+    builder.AddQuantizeLinearNode<uint8_t>(squeeze_output, 1.0f, 0, output);  // scale = 1.0, zp = 0
+  };
+
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  RunQnnModelTest(model_fn,
+                  provider_options,
+                  13,  // opset
+                  ExpectedEPNodeAssignment::All);
+}
+
+// Test Squeeze of rank 4 -> rank 3 with a negative axes value.
+TEST_F(QnnHTPBackendTests, Squeeze_Rank4_Rank3_NegAxes_f32) {
+  RunQDQSqueezeTestOnHTP<uint8_t>("Squeeze",
+                                  TestInputDef<float>({1, 3, 2, 1}, false, -10.0f, 10.0f),
+                                  TestInputDef<int64_t>({1}, true, {-1}),  // Squeeze last axis => (1, 3, 2)
+                                  ExpectedEPNodeAssignment::All);
+}
+
+// Test Unsqueeze of rank 3 -> rank 5.
+TEST_F(QnnHTPBackendTests, Unsqueeze_Rank3_Rank5_f32) {
+  // We can't use the usual model-building functions because they add standalone Quantize and Dequantize nodes
+  // at the input and output. These Q/DQ ops get lowered to QNN's Quantize and Dequantize operators, which DO NOT
+  // support rank 5 tensors. Therefore, we have to create a test model that only instantiates the DQ -> Squeeze -> Q
+  // QDQ node group, which gets lowered to a single QNN Reshape node.
+  GetTestModelFn model_fn = [](ModelTestBuilder& builder) {
+    // input (u8) -> DQ ->
+    NodeArg* quant_input = builder.MakeInput<uint8_t>({3, 2, 4}, 0, 255);
+    NodeArg* input_dq = builder.MakeIntermediate();
+    builder.AddDequantizeLinearNode<uint8_t>(quant_input, 1.0f, 0, input_dq);  // scale = 1.0, zp = 0
+
+    // axes_input ->
+    NodeArg* axes_input = builder.Make1DInitializer<int64_t>({0, 2});  // Add 1's => (1, 3, 1, 2, 4)
+
+    // Unsqueeze ->
+    NodeArg* unsqueeze_output = builder.MakeIntermediate();
+    builder.AddNode("Unsqueeze", {input_dq, axes_input}, {unsqueeze_output});
+
+    // Q -> output (u8)
+    NodeArg* output = builder.MakeOutput();
+    builder.AddQuantizeLinearNode<uint8_t>(unsqueeze_output, 1.0f, 0, output);  // scale = 1.0, zp = 0
+  };
+
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  RunQnnModelTest(model_fn,
+                  provider_options,
+                  13,  // opset
+                  ExpectedEPNodeAssignment::All);
+}
+
+// Test Unsqueeze of rank 3 -> rank 4 with a negative axes value.
+TEST_F(QnnHTPBackendTests, Unsqueeze_Rank3_Rank4_NegAxes_f32) {
+  RunQDQSqueezeTestOnHTP<uint8_t>("Unsqueeze",
+                                  TestInputDef<float>({1, 3, 2}, false, -10.0f, 10.0f),
+                                  TestInputDef<int64_t>({1}, true, {-1}),  // Add 1 as last axis => (1, 3, 2, 1)
+                                  ExpectedEPNodeAssignment::All);
+}
+
+// Test that int32 Squeeze runs on HTP backend.
+TEST_F(QnnHTPBackendTests, Squeeze_Int32_Rank4_Rank3) {
+  std::vector<int32_t> input_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  RunSqueezeTestOnHTP<int32_t>("Squeeze",
+                               TestInputDef<int32_t>({1, 3, 2, 2}, false, input_data),
+                               TestInputDef<int64_t>({1}, true, {0}),  // Squeeze 0th axis => (3, 2, 2)
+                               ExpectedEPNodeAssignment::All);
+}
+
+// Test that int32 Unsqueeze runs on HTP backend.
+TEST_F(QnnHTPBackendTests, Unsqueeze_Int32_Rank3_Rank4) {
+  std::vector<int32_t> input_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  RunSqueezeTestOnHTP<int32_t>("Unsqueeze",
+                               TestInputDef<int32_t>({3, 2, 2}, false, input_data),
+                               TestInputDef<int64_t>({1}, true, {0}),  // Unsqueeze 0th axis => (1, 3, 2, 2)
+                               ExpectedEPNodeAssignment::All);
+}
+
+#endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+}  // namespace test
+}  // namespace onnxruntime
+#endif  // !defined(ORT_MINIMAL_BUILD)

From 6d0ab5557a9a3ccdcfadb18ce477697c09784629 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Sat, 16 Sep 2023 18:21:35 -0700
Subject: [PATCH 07/22] Fix comment

---
 onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
index 05006f7eaf9b5..0c92969acdcba 100644
--- a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
+++ b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
@@ -244,7 +244,7 @@ TEST_F(QnnHTPBackendTests, Squeeze_Rank4_Rank3_NegAxes_f32) {
 TEST_F(QnnHTPBackendTests, Unsqueeze_Rank3_Rank5_f32) {
   // We can't use the usual model-building functions because they add standalone Quantize and Dequantize nodes
   // at the input and output. These Q/DQ ops get lowered to QNN's Quantize and Dequantize operators, which DO NOT
-  // support rank 5 tensors. Therefore, we have to create a test model that only instantiates the DQ -> Squeeze -> Q
+  // support rank 5 tensors. Therefore, we have to create a test model that only instantiates the DQ -> Unsqueeze -> Q
   // QDQ node group, which gets lowered to a single QNN Reshape node.
   GetTestModelFn model_fn = [](ModelTestBuilder& builder) {
     // input (u8) -> DQ ->

From 5215e1eb57a74e473127fa1a4c1df26703c84a44 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Sun, 17 Sep 2023 13:19:40 -0700
Subject: [PATCH 08/22] Add QNN CPU tests for Gemm. Need HTP tests.

---
 .../test/providers/qnn/gemm_op_test.cc        | 161 ++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100644 onnxruntime/test/providers/qnn/gemm_op_test.cc

diff --git a/onnxruntime/test/providers/qnn/gemm_op_test.cc b/onnxruntime/test/providers/qnn/gemm_op_test.cc
new file mode 100644
index 0000000000000..709a886a5d60d
--- /dev/null
+++ b/onnxruntime/test/providers/qnn/gemm_op_test.cc
@@ -0,0 +1,161 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#if !defined(ORT_MINIMAL_BUILD)
+
+#include <string>
+
+#include "test/providers/qnn/qnn_test_utils.h"
+#include "core/graph/node_attr_utils.h"
+
+#include "onnx/onnx_pb.h"
+#include "gtest/gtest.h"
+
+namespace onnxruntime {
+namespace test {
+
+// Runs a model with a Gemm operator on the QNN CPU backend. Checks the graph node assignment
+// and that inference outputs for QNN EP and CPU EP match.
+template <typename DataType>
+static void RunGemmTestOnCPU(const std::vector<TestInputDef<DataType>>& input_defs,
+                             const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                             ExpectedEPNodeAssignment expected_ep_assignment,
+                             int opset = 13) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnCpu.dll";
+#else
+  provider_options["backend_path"] = "libQnnCpu.so";
+#endif
+
+  RunQnnModelTest(BuildOpTestCase("Gemm", input_defs, attrs),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+//
+// CPU tests:
+//
+
+// Test that Gemm with non-default 'alpha' or 'beta' attributes is not supported by QNN EP.
+TEST_F(QnnCPUBackendTests, Gemm_NonDefaultAlphaBeta_Unsupported) {
+  // Check that alpha != 1.0f is not supported.
+  RunGemmTestOnCPU<float>({TestInputDef<float>({1, 2}, false, -10.0f, 10.0f),
+                           TestInputDef<float>({2, 4}, false, -10.0f, 10.0f)},
+                          {utils::MakeAttribute("alpha", 1.5f)},
+                          ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
+
+  // Check that beta != 1.0f is not supported.
+  RunGemmTestOnCPU<float>({TestInputDef<float>({1, 2}, false, -10.0f, 10.0f),
+                           TestInputDef<float>({2, 4}, false, -10.0f, 10.0f),
+                           TestInputDef<float>({1, 4}, false, -1.0f, 1.0f)},
+                          {utils::MakeAttribute("beta", 1.2f)},
+                          ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
+}
+
+// Test that Gemm with general 2D bias (M, N) is NOT supported (unless M == 1).
+// QNN's FullyConnected operator only supports `outputVector = ( inputAsVector * weightsMatrix ) + biasesVector`
+TEST_F(QnnCPUBackendTests, Gemm_2D_Bias_Unsupported) {
+  std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
+  std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 12);
+
+  // 2D matrix mul with bias not supported.
+  RunGemmTestOnCPU<float>({TestInputDef<float>({2, 3}, false, input_a_data),
+                           TestInputDef<float>({3, 4}, false, input_b_data),
+                           TestInputDef<float>({2, 4}, false, -1.0f, 1.0f)},
+                          {},
+                          ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
+
+  // However, 2D matrix mul without a bias is supported. Input A's 0th dimension is interpreted as `batch_size`.
+  RunGemmTestOnCPU<float>({TestInputDef<float>({2, 3}, false, input_a_data),
+                           TestInputDef<float>({3, 4}, false, input_b_data)},
+                          {},
+                          ExpectedEPNodeAssignment::All);  // Assigned to QNN EP.
+}
+
+// Test Gemm with dynamic (i.e., not initializer) inputs (A, B, Bias).
+TEST_F(QnnCPUBackendTests, Gemm_Dynamic_A_B_Bias) {
+  std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
+  std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
+  std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
+  RunGemmTestOnCPU<float>({TestInputDef<float>({1, 6}, false, input_a_data),
+                           TestInputDef<float>({6, 4}, false, input_b_data),
+                           TestInputDef<float>({1, 4}, false, input_c_data)},
+                          {},
+                          ExpectedEPNodeAssignment::All);
+}
+
+// Test Gemm with static B and Bias inputs.
+TEST_F(QnnCPUBackendTests, Gemm_Static_B_And_Bias) {
+  std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
+  std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
+  std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
+  RunGemmTestOnCPU<float>({TestInputDef<float>({1, 6}, false, input_a_data),
+                           TestInputDef<float>({6, 4}, true, input_b_data),
+                           TestInputDef<float>({1, 4}, true, input_c_data)},
+                          {},
+                          ExpectedEPNodeAssignment::All);
+}
+
+// Test Gemm with transposed A/B and static B and Bias inputs.
+TEST_F(QnnCPUBackendTests, Gemm_TransAB_Static_B_And_Bias) {
+  std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
+  std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
+  std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
+  RunGemmTestOnCPU<float>({TestInputDef<float>({6, 1}, false, input_a_data),
+                           TestInputDef<float>({4, 6}, true, input_b_data),
+                           TestInputDef<float>({1, 4}, true, input_c_data)},
+                          {utils::MakeAttribute("transA", static_cast<int64_t>(1)),
+                           utils::MakeAttribute("transB", static_cast<int64_t>(1))},
+                          ExpectedEPNodeAssignment::All);
+}
+
+// Test Gemm with transposed A/B and dynamic (i.e., not initializer) B and Bias inputs.
+TEST_F(QnnCPUBackendTests, Gemm_TransAB_Dynamic_B_And_Bias) {
+  std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
+  std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
+  std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
+  RunGemmTestOnCPU<float>({TestInputDef<float>({6, 1}, false, input_a_data),
+                           TestInputDef<float>({4, 6}, false, input_b_data),
+                           TestInputDef<float>({1, 4}, false, input_c_data)},
+                          {utils::MakeAttribute("transA", static_cast<int64_t>(1)),
+                           utils::MakeAttribute("transB", static_cast<int64_t>(1))},
+                          ExpectedEPNodeAssignment::All);
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+//
+// HTP tests:
+//
+
+#if 0
+// Runs a QDQ Gemm model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment and that inference
+// running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (when compared to the baseline float32 model).
+template <typename QType>
+static void RunQDQGemmTestOnHTP(const TestInputDef<float>& input0_def,
+                                const TestInputDef<float>& input1_def,
+                                const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                ExpectedEPNodeAssignment expected_ep_assignment,
+                                int opset = 13) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  TestQDQModelAccuracy<QType>(BuildOpTestCase<float>("Gemm", {input0_def, input1_def}, attrs),     // baseline float32 model
+                              BuildQDQOpTestCase<QType>("Gemm", {input0_def, input1_def}, attrs),  // QDQ model
+                              provider_options,
+                              opset,
+                              expected_ep_assignment);
+}
+#endif
+
+#endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+}  // namespace test
+}  // namespace onnxruntime
+#endif  // !defined(ORT_MINIMAL_BUILD)

From 3abdf14a6ce2bb134ccd70ba352ff51c69aae4de Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Mon, 18 Sep 2023 00:53:22 -0700
Subject: [PATCH 09/22] Add QDQ Gemm HTP tests

---
 .../test/providers/qnn/gemm_op_test.cc        | 131 ++++++++++++++++--
 1 file changed, 121 insertions(+), 10 deletions(-)

diff --git a/onnxruntime/test/providers/qnn/gemm_op_test.cc b/onnxruntime/test/providers/qnn/gemm_op_test.cc
index 709a886a5d60d..4c49a6d13f564 100644
--- a/onnxruntime/test/providers/qnn/gemm_op_test.cc
+++ b/onnxruntime/test/providers/qnn/gemm_op_test.cc
@@ -3,6 +3,7 @@
 
 #if !defined(ORT_MINIMAL_BUILD)
 
+#include <cassert>
 #include <string>
 
 #include "test/providers/qnn/qnn_test_utils.h"
@@ -130,12 +131,56 @@ TEST_F(QnnCPUBackendTests, Gemm_TransAB_Dynamic_B_And_Bias) {
 // HTP tests:
 //
 
-#if 0
+// Returns a function that builds a model with a QDQ Gemm node.
+template <typename InputAQType, typename InputBQType>
+inline GetTestQDQModelFn<InputAQType> BuildQDQGemmTestCase(const std::vector<TestInputDef<float>>& input_defs,
+                                                           const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
+  return [input_defs, attrs](ModelTestBuilder& builder,
+                             std::vector<QuantParams<InputAQType>>& output_qparams) {
+    const size_t num_inputs = input_defs.size();
+    assert(num_inputs == 2 || num_inputs == 3);
+
+    std::vector<NodeArg*> op_inputs;
+    op_inputs.reserve(num_inputs);
+
+    // Process input 0
+    NodeArg* input0 = MakeTestInput<float>(builder, input_defs[0]);
+    QuantParams<InputAQType> input0_qparams = GetTestInputQuantParams<InputAQType>(input_defs[0]);
+    NodeArg* input0_after_qdq = AddQDQNodePair<InputAQType>(builder, input0, input0_qparams.scale,
+                                                            input0_qparams.zero_point);
+    op_inputs.push_back(input0_after_qdq);
+
+    // Process input 1
+    NodeArg* input1 = MakeTestInput<float>(builder, input_defs[1]);
+    QuantParams<InputBQType> input1_qparams = GetTestInputQuantParams<InputBQType>(input_defs[1]);
+    NodeArg* input1_after_qdq = AddQDQNodePair<InputBQType>(builder, input1, input1_qparams.scale,
+                                                            input1_qparams.zero_point);
+    op_inputs.push_back(input1_after_qdq);
+
+    // Process bias
+    if (num_inputs == 3) {
+      NodeArg* bias_input = MakeTestQDQBiasInput(builder, input_defs[2], input0_qparams.scale * input1_qparams.scale);
+      op_inputs.push_back(bias_input);
+    }
+
+    // Op -> op_output
+    auto* gemm_output = builder.MakeIntermediate();
+    Node& gemm_node = builder.AddNode("Gemm", op_inputs, {gemm_output});
+
+    for (const auto& attr : attrs) {
+      gemm_node.AddAttributeProto(attr);
+    }
+
+    // op_output -> Q -> DQ -> output
+    AddQDQNodePairWithOutputAsGraphOutput<InputAQType>(builder, gemm_output, output_qparams[0].scale,
+                                                       output_qparams[0].zero_point);
+  };
+}
+
 // Runs a QDQ Gemm model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment and that inference
 // running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (when compared to the baseline float32 model).
-template <typename QType>
-static void RunQDQGemmTestOnHTP(const TestInputDef<float>& input0_def,
-                                const TestInputDef<float>& input1_def,
+template <typename InputAQType, typename InputBQType>
+static void RunQDQGemmTestOnHTP(const std::vector<TestInputDef<float>>& input_defs,
                                 const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
                                 ExpectedEPNodeAssignment expected_ep_assignment,
                                 int opset = 13) {
@@ -147,13 +192,79 @@ static void RunQDQGemmTestOnHTP(const TestInputDef<float>& input0_def,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy<QType>(BuildOpTestCase<float>("Gemm", {input0_def, input1_def}, attrs),     // baseline float32 model
-                              BuildQDQOpTestCase<QType>("Gemm", {input0_def, input1_def}, attrs),  // QDQ model
-                              provider_options,
-                              opset,
-                              expected_ep_assignment);
+  TestQDQModelAccuracy<InputAQType>(BuildOpTestCase<float>("Gemm", input_defs, attrs),
+                                    BuildQDQGemmTestCase<InputAQType, InputBQType>(input_defs, attrs),
+                                    provider_options,
+                                    opset,
+                                    expected_ep_assignment);
+}
+
+// Test QDQ Gemm with dynamic inputs A and Bias. The B input is an initializer.
+TEST_F(QnnHTPBackendTests, Gemm_Dynamic_A_Static_B_Dynamic_Bias) {
+  std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
+  std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
+  std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
+  RunQDQGemmTestOnHTP<uint8_t, uint8_t>({TestInputDef<float>({1, 6}, false, input_a_data),
+                                         TestInputDef<float>({6, 4}, true, input_b_data),
+                                         TestInputDef<float>({1, 4}, false, input_c_data)},
+                                        {},
+                                        ExpectedEPNodeAssignment::All);
+}
+
+// Test QDQ Gemm with dynamic A and B inputs. The Bias is static.
+// TODO: Inaccuracy detected for output 'output', element 0.
+// Output quant params: scale=0.48132994771003723, zero_point=0.
+// Expected val: 120.73912048339844
+// QNN QDQ val: 77.012794494628906 (err 43.726325988769531)
+// CPU QDQ val: 119.85115814208984 (err 0.88796234130859375)
+TEST_F(QnnHTPBackendTests, DISABLED_Gemm_Dynamic_A_B_Static_Bias) {
+  std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
+  std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
+  std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
+  RunQDQGemmTestOnHTP<uint8_t, uint8_t>({TestInputDef<float>({1, 6}, false, input_a_data),
+                                         TestInputDef<float>({6, 4}, false, input_b_data),  // Dynamic => inaccuracy
+                                         TestInputDef<float>({1, 4}, true, input_c_data)},
+                                        {},
+                                        ExpectedEPNodeAssignment::All);
+}
+
+// Test QDQ Gemm with static B and Bias inputs.
+TEST_F(QnnHTPBackendTests, Gemm_Static_B_And_Bias) {
+  std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
+  std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
+  std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
+  RunQDQGemmTestOnHTP<uint8_t, uint8_t>({TestInputDef<float>({1, 6}, false, input_a_data),
+                                         TestInputDef<float>({6, 4}, true, input_b_data),
+                                         TestInputDef<float>({1, 4}, true, input_c_data)},
+                                        {},
+                                        ExpectedEPNodeAssignment::All);
+}
+
+// Test QDQ Gemm with transposed A/B and static B and Bias inputs.
+TEST_F(QnnHTPBackendTests, Gemm_TransAB_Static_B_And_Bias) {
+  std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
+  std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
+  std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
+  RunQDQGemmTestOnHTP<uint8_t, uint8_t>({TestInputDef<float>({6, 1}, false, input_a_data),
+                                         TestInputDef<float>({4, 6}, true, input_b_data),
+                                         TestInputDef<float>({1, 4}, true, input_c_data)},
+                                        {utils::MakeAttribute("transA", static_cast<int64_t>(1)),
+                                         utils::MakeAttribute("transB", static_cast<int64_t>(1))},
+                                        ExpectedEPNodeAssignment::All);
+}
+
+// Test QDQ Gemm with transposed A/B and dynamic (i.e., not initializer) B and Bias inputs.
+TEST_F(QnnHTPBackendTests, Gemm_TransAB_Dynamic_B_And_Bias) {
+  std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
+  std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
+  std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
+  RunQDQGemmTestOnHTP<uint8_t, uint8_t>({TestInputDef<float>({6, 1}, false, input_a_data),
+                                         TestInputDef<float>({4, 6}, false, input_b_data),
+                                         TestInputDef<float>({1, 4}, false, input_c_data)},
+                                        {utils::MakeAttribute("transA", static_cast<int64_t>(1)),
+                                         utils::MakeAttribute("transB", static_cast<int64_t>(1))},
+                                        ExpectedEPNodeAssignment::All);
 }
-#endif
 
 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 }  // namespace test

From 976cc8b918733b9e1401013f4dcfcb4daa867d41 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Mon, 18 Sep 2023 11:52:36 -0700
Subject: [PATCH 10/22] Start adding QNN Clip tests

---
 .../test/providers/qnn/clip_op_test.cc        | 174 ++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 onnxruntime/test/providers/qnn/clip_op_test.cc

diff --git a/onnxruntime/test/providers/qnn/clip_op_test.cc b/onnxruntime/test/providers/qnn/clip_op_test.cc
new file mode 100644
index 0000000000000..fc9322ae856eb
--- /dev/null
+++ b/onnxruntime/test/providers/qnn/clip_op_test.cc
@@ -0,0 +1,174 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#if !defined(ORT_MINIMAL_BUILD)
+
+#include <string>
+
+#include "test/providers/qnn/qnn_test_utils.h"
+#include "core/graph/node_attr_utils.h"
+
+#include "onnx/onnx_pb.h"
+#include "gtest/gtest.h"
+
+namespace onnxruntime {
+namespace test {
+
+// Runs a model with a Clip operator on the QNN CPU backend. Checks the graph node assignment
+// and that inference outputs for QNN EP and CPU EP match.
+template <typename DataType>
+static void RunClipTestOnCPU(const std::vector<TestInputDef<DataType>>& input_defs,
+                             ExpectedEPNodeAssignment expected_ep_assignment,
+                             int opset = 13) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnCpu.dll";
+#else
+  provider_options["backend_path"] = "libQnnCpu.so";
+#endif
+
+  RunQnnModelTest(BuildOpTestCase<DataType>("Clip", input_defs, {}),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+//
+// CPU tests:
+//
+
+// Test that Clip with a dynamic min or max input is not supported by QNN EP.
+TEST_F(QnnCPUBackendTests, Clip_Dynamic_MinMax_Unsupported) {
+  // Dynamic min input is not supported.
+  RunClipTestOnCPU<float>({TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                           TestInputDef<float>({}, false /* is_initializer */, {-5.0f})},
+                          ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
+  // Dynamic max input is not supported.
+  RunClipTestOnCPU<float>({TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                           TestInputDef<float>({}, true, {-5.0f}),
+                           TestInputDef<float>({}, false, {5.0f})},
+                          ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
+}
+
+// Test Clip with default min/max.
+TEST_F(QnnCPUBackendTests, Clip_4D_f32_DefaultMinMax) {
+  RunClipTestOnCPU<float>({TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48))},
+                          ExpectedEPNodeAssignment::All);
+}
+
+// Test Clip with 5D input.
+TEST_F(QnnCPUBackendTests, Clip_5D_f32) {
+  RunClipTestOnCPU<float>({TestInputDef<float>({1, 1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                           TestInputDef<float>({}, true, {-5.0f}),
+                           TestInputDef<float>({}, true, {5.0f})},
+                          ExpectedEPNodeAssignment::All);
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+//
+// HTP tests:
+//
+
+// Returns a function that builds a model with a QDQ Clip operator. Only the first input is quantized.
+template <typename InputQType>
+inline GetTestQDQModelFn<InputQType> BuildQDQClipTestCase(const std::vector<TestInputDef<float>>& input_defs) {
+  return [input_defs](ModelTestBuilder& builder, std::vector<QuantParams<InputQType>>& output_qparams) {
+    const size_t num_inputs = input_defs.size();
+    std::vector<NodeArg*> op_inputs;
+    op_inputs.reserve(num_inputs);
+
+    for (size_t i = 0; i < num_inputs; i++) {
+      const TestInputDef<float>& input_def = input_defs[i];
+      NodeArg* input = MakeTestInput<float>(builder, input_def);
+
+      if (i == 0) {  // Only input 0 is quantized.
+        QuantParams<InputQType> input_qparams = GetTestInputQuantParams<InputQType>(input_def);
+        NodeArg* input_after_qdq = AddQDQNodePair<InputQType>(builder, input, input_qparams.scale,
+                                                              input_qparams.zero_point);
+        op_inputs.push_back(input_after_qdq);
+      } else {
+        op_inputs.push_back(input);
+      }
+    }
+
+    // Op -> op_output
+    auto* clip_output = builder.MakeIntermediate();
+    builder.AddNode("Clip", op_inputs, {clip_output});
+
+    // op_output -> Q -> DQ -> output
+    AddQDQNodePairWithOutputAsGraphOutput<InputQType>(builder, clip_output, output_qparams[0].scale,
+                                                      output_qparams[0].zero_point);
+  };
+}
+
+// Runs a QDQ Clip model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment and that running
+// the QDQ model on QNN EP is at least as accurate as running it on ORT CPU EP (both compared to the float32 baseline).
+template <typename QType>
+static void RunQDQClipTestOnHTP(const std::vector<TestInputDef<float>>& input_defs,
+                                ExpectedEPNodeAssignment expected_ep_assignment,
+                                int opset = 13) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  TestQDQModelAccuracy(BuildOpTestCase("Clip", input_defs, {}),  // baseline float32 model
+                       BuildQDQClipTestCase<QType>(input_defs),  // QDQ model
+                       provider_options,
+                       opset,
+                       expected_ep_assignment,
+                       1e-4f, logging::Severity::kVERBOSE);
+}
+
+// Runs a model with a non-QDQ Clip operator on the QNN HTP backend. Checks the graph node assignment
+// and that inference outputs for QNN EP and CPU EP match.
+template <typename DataType>
+static void RunClipTestOnHTP(const std::vector<TestInputDef<DataType>>& input_defs,
+                             ExpectedEPNodeAssignment expected_ep_assignment,
+                             int opset = 13) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  RunQnnModelTest(BuildOpTestCase("Clip", input_defs, {}),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+// Test QDQ Clip with default min/max. (Fused with QuantizeLinear by optimizer).
+TEST_F(QnnHTPBackendTests, Clip_4D_DefaultMinMax) {
+  RunQDQClipTestOnHTP<uint8_t>({TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48))},
+                               ExpectedEPNodeAssignment::All);
+}
+
+// Test QDQ Clip with non-default min and max inputs.
+TEST_F(QnnHTPBackendTests, Clip_4D) {
+  RunQDQClipTestOnHTP<uint8_t>({TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                                TestInputDef<float>({}, true, {-5.0f}),
+                                TestInputDef<float>({}, true, {5.0f})},
+                               ExpectedEPNodeAssignment::All);
+}
+
+#if 0
+// Test non-QDQ Clip with 4D input on HTP
+TEST_F(QnnHTPBackendTests, Clip_NotQDQ_4D_f32) {
+  RunClipTestOnHTP<float>({TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                           TestInputDef<float>({}, true, {-5.0f}),
+                           TestInputDef<float>({}, true, {5.0f})},
+                          ExpectedEPNodeAssignment::All);
+}
+#endif
+
+#endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+}  // namespace test
+}  // namespace onnxruntime
+#endif  // !defined(ORT_MINIMAL_BUILD)

From 20915452d32b5fcecf377adee9831dcf6321f2e2 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Mon, 18 Sep 2023 18:07:17 -0700
Subject: [PATCH 11/22] Enable QDQ Clip on QNN HTP backend. Add unit tests.

---
 .../selectors_actions/shared/utils.cc         |   3 +-
 .../qnn/builder/opbuilder/clip_op_builder.cc  | 122 ++++++++----------
 .../test/providers/qnn/clip_op_test.cc        |  81 +++++++-----
 .../qnn/squeeze_unsqueeze_op_test.cc          |   4 +-
 4 files changed, 104 insertions(+), 106 deletions(-)

diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
index f1bdd7a99c329..f951e41552cf0 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
@@ -78,7 +78,8 @@ static const OpVersionsAndSelector::OpVersionsMap GetUnaryOpVersionsMap() {
           {"Abs", {}},
           {"Neg", {}},
           {"DepthToSpace", {}},
-          {"SpaceToDepth", {}}};
+          {"SpaceToDepth", {}},
+          {"Clip", {}}};
 }
 static const OpVersionsAndSelector::OpVersionsMap GetBinaryOpVersionsMap() {
   return {{"Add", {}},
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
index 92a7feea7fc54..5e9faca8f53c0 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
@@ -33,8 +33,6 @@ class ClipOpBuilder : public BaseOpBuilder {
 
  private:
   Status ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const;
-  mutable float min_value_ = std::numeric_limits<float>::lowest();
-  mutable float max_value_ = std::numeric_limits<float>::max();
 };
 
 Status ClipOpBuilder::ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const {
@@ -61,61 +59,8 @@ Status ClipOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
   if (do_op_validation) {
     ORT_RETURN_IF_ERROR(ExplictOpCheck(qnn_model_wrapper, node_unit));
   }
-  Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32;
-
-  auto inputs = node_unit.Inputs();
-  for (size_t input_i = 0; input_i < inputs.size(); ++input_i) {
-    Qnn_QuantizeParams_t quantize_param = QNN_QUANTIZE_PARAMS_INIT;
-    bool is_quantized_tensor = inputs[input_i].quant_param.has_value();
-    utils::InitializeQuantizeParam(quantize_param, is_quantized_tensor);
-
-    auto& input_name = inputs[input_i].node_arg.Name();
-    if (input_name.empty()) {
-      // Ignore unspecified/unused optional input
-      continue;
-    }
-    if (qnn_model_wrapper.IsQnnTensorWrapperExist(input_name)) {
-      LOGS(logger, VERBOSE) << "Tensor already added or the input is not named, skip it: " << input_name;
-      input_names.push_back(input_name);
-      continue;
-    }
-
-    const auto* type_proto = inputs[input_i].node_arg.TypeAsProto();
-    ORT_RETURN_IF_ERROR(utils::GetQnnDataType(is_quantized_tensor, type_proto, qnn_data_type));
-
-    std::vector<uint32_t> input_shape;
-    ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[input_i].node_arg, input_shape), "Cannot get shape");
-
-    ORT_RETURN_IF_NOT(qnn_model_wrapper.ProcessQuantizationParameter(inputs[input_i].quant_param,
-                                                                     quantize_param.scaleOffsetEncoding.scale,
-                                                                     quantize_param.scaleOffsetEncoding.offset),
-                      "Cannot get quantization parameter");
-
-    float* ini_data = nullptr;
-    std::vector<uint8_t> unpacked_tensor;
-    bool is_initializer_input = qnn_model_wrapper.IsInitializerInput(input_name);
-    if (is_initializer_input) {
-      const auto& input_tensor = qnn_model_wrapper.GetInitializerTensors().at(input_name);
-      ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_tensor, unpacked_tensor));
-      ini_data = reinterpret_cast<float*>(unpacked_tensor.data());
-      if (input_i == 1) {
-        min_value_ = *ini_data;
-        continue;
-      } else if (input_i == 2) {
-        max_value_ = *ini_data;
-        continue;
-      }
-    }
-    ORT_ENFORCE(input_i == 0, "QNN ReluMinMax operator expects only one input. Min and max are expected to be parameters, ie. initializer inputs in ONNX model");
 
-    Qnn_TensorType_t tensor_type = GetInputTensorType(qnn_model_wrapper, input_name);
-    QnnTensorWrapper input_tensorwrapper(input_name, tensor_type, qnn_data_type, quantize_param,
-                                         std::move(input_shape), std::move(unpacked_tensor));
-    ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensorwrapper)), "Failed to add tensor.");
-    input_names.push_back(input_name);
-  }
-
-  return Status::OK();
+  return ProcessInput(qnn_model_wrapper, node_unit.Inputs()[0], logger, input_names);
 }
 
 Status ClipOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
@@ -123,20 +68,59 @@ Status ClipOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wra
                                                   std::vector<std::string>&& input_names,
                                                   const logging::Logger& logger,
                                                   bool do_op_validation) const {
+  const auto& inputs = node_unit.Inputs();
+  const size_t num_inputs = inputs.size();
+
+  const Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32;
   std::vector<std::string> param_tensor_names;
-  Qnn_Scalar_t min_qnn_scalar = QNN_SCALAR_INIT;
-  min_qnn_scalar.dataType = QNN_DATATYPE_FLOAT_32;
-  min_qnn_scalar.floatValue = min_value_;
-  QnnParamWrapper min_value_param(node_unit.Index(), node_unit.Name(), QNN_OP_RELU_MIN_MAX_PARAM_MIN_VALUE, min_qnn_scalar);
-  param_tensor_names.push_back(min_value_param.GetParamTensorName());
-  qnn_model_wrapper.AddParamWrapper(std::move(min_value_param));
-
-  Qnn_Scalar_t max_qnn_scalar = QNN_SCALAR_INIT;
-  max_qnn_scalar.dataType = QNN_DATATYPE_FLOAT_32;
-  max_qnn_scalar.floatValue = max_value_;
-  QnnParamWrapper max_value_param(node_unit.Index(), node_unit.Name(), QNN_OP_RELU_MIN_MAX_PARAM_MAX_VALUE, max_qnn_scalar);
-  param_tensor_names.push_back(max_value_param.GetParamTensorName());
-  qnn_model_wrapper.AddParamWrapper(std::move(max_value_param));
+
+  auto get_f32_from_bytes = [](const std::vector<uint8_t>& bytes, float default_val) -> float {
+    return bytes.empty() ? default_val : *reinterpret_cast<const float*>(bytes.data());
+  };
+
+  // Set the 'min' parameter.
+  {
+    std::vector<uint8_t> min_val_bytes;
+
+    if (num_inputs > 1 && !inputs[1].node_arg.Name().empty()) {
+      OnnxInputInfo min_input_info = {};
+      ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetOnnxInputInfo(inputs[1], min_input_info));
+      ORT_RETURN_IF_NOT(min_input_info.qnn_data_type == qnn_data_type,
+                        "QNN EP: The 'min' input of the Clip operator must be of type float32.");
+      ORT_RETURN_IF_NOT(min_input_info.is_initializer, "QNN EP: The Clip operator's 'min' input must be an initializer.");
+      ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*min_input_info.initializer_tensor, min_val_bytes));
+    }
+
+    Qnn_Scalar_t min_qnn_scalar = QNN_SCALAR_INIT;
+    min_qnn_scalar.dataType = qnn_data_type;
+    min_qnn_scalar.floatValue = get_f32_from_bytes(min_val_bytes, std::numeric_limits<float>::lowest());
+    QnnParamWrapper min_value_param(node_unit.Index(), node_unit.Name(), QNN_OP_RELU_MIN_MAX_PARAM_MIN_VALUE,
+                                    min_qnn_scalar);
+    param_tensor_names.push_back(min_value_param.GetParamTensorName());
+    qnn_model_wrapper.AddParamWrapper(std::move(min_value_param));
+  }
+
+  // Set the 'max' parameter.
+  {
+    std::vector<uint8_t> max_val_bytes;
+
+    if (num_inputs > 2 && !inputs[2].node_arg.Name().empty()) {
+      OnnxInputInfo max_input_info = {};
+      ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetOnnxInputInfo(inputs[2], max_input_info));
+      ORT_RETURN_IF_NOT(max_input_info.qnn_data_type == qnn_data_type,
+                        "QNN EP: The 'max' input of the Clip operator must of type float32.");
+      ORT_RETURN_IF_NOT(max_input_info.is_initializer, "QNN EP: The Clip operator's 'max' input must be an initializer.");
+      ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*max_input_info.initializer_tensor, max_val_bytes));
+    }
+
+    Qnn_Scalar_t max_qnn_scalar = QNN_SCALAR_INIT;
+    max_qnn_scalar.dataType = qnn_data_type;
+    max_qnn_scalar.floatValue = get_f32_from_bytes(max_val_bytes, std::numeric_limits<float>::max());
+    QnnParamWrapper max_value_param(node_unit.Index(), node_unit.Name(), QNN_OP_RELU_MIN_MAX_PARAM_MAX_VALUE,
+                                    max_qnn_scalar);
+    param_tensor_names.push_back(max_value_param.GetParamTensorName());
+    qnn_model_wrapper.AddParamWrapper(std::move(max_value_param));
+  }
 
   ORT_RETURN_IF_ERROR(ProcessOutputs(qnn_model_wrapper, node_unit,
                                      std::move(input_names),
diff --git a/onnxruntime/test/providers/qnn/clip_op_test.cc b/onnxruntime/test/providers/qnn/clip_op_test.cc
index fc9322ae856eb..496e4661da4de 100644
--- a/onnxruntime/test/providers/qnn/clip_op_test.cc
+++ b/onnxruntime/test/providers/qnn/clip_op_test.cc
@@ -120,54 +120,67 @@ static void RunQDQClipTestOnHTP(const std::vector<TestInputDef<float>>& input_de
                        BuildQDQClipTestCase<QType>(input_defs),  // QDQ model
                        provider_options,
                        opset,
-                       expected_ep_assignment,
-                       1e-4f, logging::Severity::kVERBOSE);
+                       expected_ep_assignment);
 }
 
-// Runs a model with a non-QDQ Clip operator on the QNN HTP backend. Checks the graph node assignment
-// and that inference outputs for QNN EP and CPU EP match.
-template <typename DataType>
-static void RunClipTestOnHTP(const std::vector<TestInputDef<DataType>>& input_defs,
-                             ExpectedEPNodeAssignment expected_ep_assignment,
-                             int opset = 13) {
-  ProviderOptions provider_options;
-
-#if defined(_WIN32)
-  provider_options["backend_path"] = "QnnHtp.dll";
-#else
-  provider_options["backend_path"] = "libQnnHtp.so";
-#endif
-
-  RunQnnModelTest(BuildOpTestCase("Clip", input_defs, {}),
-                  provider_options,
-                  opset,
-                  expected_ep_assignment);
-}
-
-// Test QDQ Clip with default min/max. (Fused with QuantizeLinear by optimizer).
-TEST_F(QnnHTPBackendTests, Clip_4D_DefaultMinMax) {
+// Test QDQ Clip with default min/max.
+// NOTE: The Clip operator is *optimized* away during L1 optimizations, so QNN EP does not get a graph with a Clip op.
+// Instead, QNN EP will get a graph with only a Q -> DQ sequence.
+// - Original sequence: Q1 -> DQ1 -> Clip -> Q2 -> DQ2
+// - ClipQuantFusion: Fuses Clip -> QuantizeLinear resulting in Q1 -> DQ1 -> Q2' -> DQ2
+// - DoubleQDQPairsRemover: Simplifies remaining Q1 -> DQ1 -> Q2' -> DQ2 sequence to Q1 -> DQ2.
+TEST_F(QnnHTPBackendTests, Clip_U8_DefaultMinMax_Rank4) {
   RunQDQClipTestOnHTP<uint8_t>({TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48))},
                                ExpectedEPNodeAssignment::All);
 }
 
-// Test QDQ Clip with non-default min and max inputs.
-TEST_F(QnnHTPBackendTests, Clip_4D) {
+// Test QDQ Clip with non-default min and max inputs. QNN EP will get a graph with a Clip operator.
+TEST_F(QnnHTPBackendTests, Clip_U8_Rank4) {
   RunQDQClipTestOnHTP<uint8_t>({TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
                                 TestInputDef<float>({}, true, {-5.0f}),
                                 TestInputDef<float>({}, true, {5.0f})},
                                ExpectedEPNodeAssignment::All);
 }
 
-#if 0
-// Test non-QDQ Clip with 4D input on HTP
-TEST_F(QnnHTPBackendTests, Clip_NotQDQ_4D_f32) {
-  RunClipTestOnHTP<float>({TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
-                           TestInputDef<float>({}, true, {-5.0f}),
-                           TestInputDef<float>({}, true, {5.0f})},
-                          ExpectedEPNodeAssignment::All);
-}
+// Test QDQ Clip of rank 5.
+TEST_F(QnnHTPBackendTests, Clip_U8_Rank5) {
+  // We can't use the usual model-building functions because they add standalone Quantize and Dequantize nodes
+  // at the input and output. These Q/DQ ops get lowered to QNN's Quantize and Dequantize operators, which DO NOT
+  // support rank 5 tensors. Therefore, we have to create a test model that only instantiates the DQ -> Clip -> Q
+  // QDQ node group, which gets lowered to a single QNN Clip node.
+  GetTestModelFn model_fn = [](ModelTestBuilder& builder) {
+    // input (u8) -> DQ ->
+    NodeArg* quant_input = builder.MakeInput<uint8_t>({1, 1, 2, 2, 2}, {0, 1, 6, 10, 20, 100, 128, 255});
+    NodeArg* input_dq = builder.MakeIntermediate();
+    builder.AddDequantizeLinearNode<uint8_t>(quant_input, 1.0f, 0, input_dq);  // scale = 1.0, zp = 0
+
+    // Min/Max initializers
+    NodeArg* min_input = builder.MakeScalarInitializer(5.0f);
+    NodeArg* max_input = builder.MakeScalarInitializer(100.0f);
+
+    // Unsqueeze ->
+    NodeArg* clip_output = builder.MakeIntermediate();
+    builder.AddNode("Clip", {input_dq, min_input, max_input}, {clip_output});
+
+    // Q -> output (u8)
+    NodeArg* output = builder.MakeOutput();
+    builder.AddQuantizeLinearNode<uint8_t>(clip_output, 1.0f, 0, output);  // scale = 1.0, zp = 0
+  };
+
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
+  RunQnnModelTest(model_fn,
+                  provider_options,
+                  13,  // opset
+                  ExpectedEPNodeAssignment::All);
+}
+
 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
index 0c92969acdcba..02e61bfc2d25e 100644
--- a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
+++ b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
@@ -232,7 +232,7 @@ TEST_F(QnnHTPBackendTests, Squeeze_Rank5_Rank2_f32) {
                   ExpectedEPNodeAssignment::All);
 }
 
-// Test Squeeze of rank 4 -> rank 3 with a negative axes value.
+// Test QDQ Squeeze of rank 4 -> rank 3 with a negative axes value.
 TEST_F(QnnHTPBackendTests, Squeeze_Rank4_Rank3_NegAxes_f32) {
   RunQDQSqueezeTestOnHTP<uint8_t>("Squeeze",
                                   TestInputDef<float>({1, 3, 2, 1}, false, -10.0f, 10.0f),
@@ -240,7 +240,7 @@ TEST_F(QnnHTPBackendTests, Squeeze_Rank4_Rank3_NegAxes_f32) {
                                   ExpectedEPNodeAssignment::All);
 }
 
-// Test Unsqueeze of rank 3 -> rank 5.
+// Test QDQ Unsqueeze of rank 3 -> rank 5.
 TEST_F(QnnHTPBackendTests, Unsqueeze_Rank3_Rank5_f32) {
   // We can't use the usual model-building functions because they add standalone Quantize and Dequantize nodes
   // at the input and output. These Q/DQ ops get lowered to QNN's Quantize and Dequantize operators, which DO NOT

From a1b3b35cc16be975a7dc71d69cb5104a707800ea Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Tue, 19 Sep 2023 09:47:31 -0700
Subject: [PATCH 12/22] Fix comment

---
 onnxruntime/test/providers/qnn/clip_op_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/test/providers/qnn/clip_op_test.cc b/onnxruntime/test/providers/qnn/clip_op_test.cc
index 496e4661da4de..544cf35c68f47 100644
--- a/onnxruntime/test/providers/qnn/clip_op_test.cc
+++ b/onnxruntime/test/providers/qnn/clip_op_test.cc
@@ -158,7 +158,7 @@ TEST_F(QnnHTPBackendTests, Clip_U8_Rank5) {
     NodeArg* min_input = builder.MakeScalarInitializer(5.0f);
     NodeArg* max_input = builder.MakeScalarInitializer(100.0f);
 
-    // Unsqueeze ->
+    // Clip ->
     NodeArg* clip_output = builder.MakeIntermediate();
     builder.AddNode("Clip", {input_dq, min_input, max_input}, {clip_output});
 

From 1d205bbb855d7ce1d6df204a31f52e2327665d4b Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Tue, 19 Sep 2023 09:59:53 -0700
Subject: [PATCH 13/22] Convert Status check into an assert

---
 .../core/providers/qnn/builder/opbuilder/clip_op_builder.cc  | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
index 5e9faca8f53c0..1f1b8d6867acd 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
@@ -9,6 +9,7 @@
 
 #include "base_op_builder.h"
 
+#include <cassert>
 #include <limits>
 
 namespace onnxruntime {
@@ -87,7 +88,7 @@ Status ClipOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wra
       ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetOnnxInputInfo(inputs[1], min_input_info));
       ORT_RETURN_IF_NOT(min_input_info.qnn_data_type == qnn_data_type,
                         "QNN EP: The 'min' input of the Clip operator must be of type float32.");
-      ORT_RETURN_IF_NOT(min_input_info.is_initializer, "QNN EP: The Clip operator's 'min' input must be an initializer.");
+      assert(min_input_info.is_initializer);  // Checked by ExplicitOpCheck().
       ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*min_input_info.initializer_tensor, min_val_bytes));
     }
 
@@ -109,7 +110,7 @@ Status ClipOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wra
       ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetOnnxInputInfo(inputs[2], max_input_info));
       ORT_RETURN_IF_NOT(max_input_info.qnn_data_type == qnn_data_type,
                         "QNN EP: The 'max' input of the Clip operator must of type float32.");
-      ORT_RETURN_IF_NOT(max_input_info.is_initializer, "QNN EP: The Clip operator's 'max' input must be an initializer.");
+      assert(max_input_info.is_initializer);  // Checked by ExplicitOpCheck().
       ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*max_input_info.initializer_tensor, max_val_bytes));
     }
 

From 80c8fb4c727112c5257c45cd448ef5e9efd7721c Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Tue, 19 Sep 2023 13:04:53 -0700
Subject: [PATCH 14/22] Improve model-building helpers to accept inputs of a
 potentially different type (e.g., int64 shape/indices/etc)

---
 .../test/providers/qnn/average_pool_test.cc   |  6 +-
 .../test/providers/qnn/clip_op_test.cc        |  6 +-
 .../test/providers/qnn/flatten_op_test.cc     |  8 +--
 .../test/providers/qnn/gather_op_htp_test.cc  | 64 +++++-------------
 .../test/providers/qnn/gemm_op_test.cc        |  4 +-
 .../providers/qnn/instance_norm_htp_test.cc   | 21 +-----
 .../test/providers/qnn/layer_norm_test.cc     |  4 +-
 .../providers/qnn/leakyrelu_op_htp_test.cc    | 46 ++-----------
 .../test/providers/qnn/max_min_op_test.cc     |  9 ++-
 .../test/providers/qnn/pool_op_test.cpp       | 19 +-----
 .../test/providers/qnn/qnn_test_utils.h       | 65 ++++++++++++-------
 .../test/providers/qnn/reshape_op_test.cc     | 27 ++------
 .../test/providers/qnn/simple_op_htp_test.cc  | 16 ++---
 .../test/providers/qnn/slice_htp_test.cc      | 64 +++---------------
 .../qnn/squeeze_unsqueeze_op_test.cc          | 21 ++----
 15 files changed, 114 insertions(+), 266 deletions(-)

diff --git a/onnxruntime/test/providers/qnn/average_pool_test.cc b/onnxruntime/test/providers/qnn/average_pool_test.cc
index 79ec07796c0e8..0ee52f7fec21a 100644
--- a/onnxruntime/test/providers/qnn/average_pool_test.cc
+++ b/onnxruntime/test/providers/qnn/average_pool_test.cc
@@ -32,7 +32,7 @@ static void RunAveragePoolOpTest(const std::string& op_type,
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
 
-  RunQnnModelTest(BuildOpTestCase(op_type, input_defs, attrs),
+  RunQnnModelTest(BuildOpTestCase<float>(op_type, input_defs, {}, attrs),
                   provider_options,
                   opset,
                   expected_ep_assignment);
@@ -53,8 +53,8 @@ static void RunQDQAveragePoolOpTest(const std::string& op_type,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy(BuildOpTestCase(op_type, input_defs, attrs),
-                       BuildQDQOpTestCase<QuantType>(op_type, input_defs, attrs),
+  TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, input_defs, {}, attrs),
+                       BuildQDQOpTestCase<QuantType>(op_type, input_defs, {}, attrs),
                        provider_options,
                        opset,
                        expected_ep_assignment);
diff --git a/onnxruntime/test/providers/qnn/clip_op_test.cc b/onnxruntime/test/providers/qnn/clip_op_test.cc
index 544cf35c68f47..5e567644aa13b 100644
--- a/onnxruntime/test/providers/qnn/clip_op_test.cc
+++ b/onnxruntime/test/providers/qnn/clip_op_test.cc
@@ -28,7 +28,7 @@ static void RunClipTestOnCPU(const std::vector<TestInputDef<DataType>>& input_de
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
 
-  RunQnnModelTest(BuildOpTestCase<DataType>("Clip", input_defs, {}),
+  RunQnnModelTest(BuildOpTestCase<DataType>("Clip", input_defs, {}, {}),
                   provider_options,
                   opset,
                   expected_ep_assignment);
@@ -116,8 +116,8 @@ static void RunQDQClipTestOnHTP(const std::vector<TestInputDef<float>>& input_de
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy(BuildOpTestCase("Clip", input_defs, {}),  // baseline float32 model
-                       BuildQDQClipTestCase<QType>(input_defs),  // QDQ model
+  TestQDQModelAccuracy(BuildOpTestCase<float>("Clip", input_defs, {}, {}),  // baseline float32 model
+                       BuildQDQClipTestCase<QType>(input_defs),             // QDQ model
                        provider_options,
                        opset,
                        expected_ep_assignment);
diff --git a/onnxruntime/test/providers/qnn/flatten_op_test.cc b/onnxruntime/test/providers/qnn/flatten_op_test.cc
index 7952cc7b6ea1d..af536b731ad09 100644
--- a/onnxruntime/test/providers/qnn/flatten_op_test.cc
+++ b/onnxruntime/test/providers/qnn/flatten_op_test.cc
@@ -29,7 +29,7 @@ static void RunFlattenTestOnCPU(const TestInputDef<DataType>& input_def,
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
 
-  RunQnnModelTest(BuildOpTestCase<DataType>("Flatten", {input_def}, attrs),
+  RunQnnModelTest(BuildOpTestCase<DataType>("Flatten", {input_def}, {}, attrs),
                   provider_options,
                   opset,
                   expected_ep_assignment);
@@ -50,7 +50,7 @@ static void RunFlattenTestOnHTP(const TestInputDef<DataType>& input_def,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  RunQnnModelTest(BuildOpTestCase<DataType>("Flatten", {input_def}, attrs),
+  RunQnnModelTest(BuildOpTestCase<DataType>("Flatten", {input_def}, {}, attrs),
                   provider_options,
                   opset,
                   expected_ep_assignment);
@@ -71,8 +71,8 @@ static void RunQDQFlattenTestOnHTP(const TestInputDef<float>& input_def,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy(BuildOpTestCase<float>("Flatten", {input_def}, attrs),     // baseline float32 model
-                       BuildQDQOpTestCase<QType>("Flatten", {input_def}, attrs),  // QDQ model
+  TestQDQModelAccuracy(BuildOpTestCase<float>("Flatten", {input_def}, {}, attrs),     // baseline float32 model
+                       BuildQDQOpTestCase<QType>("Flatten", {input_def}, {}, attrs),  // QDQ model
                        provider_options,
                        opset,
                        expected_ep_assignment);
diff --git a/onnxruntime/test/providers/qnn/gather_op_htp_test.cc b/onnxruntime/test/providers/qnn/gather_op_htp_test.cc
index 5b05b39f34a27..37e0db906d054 100644
--- a/onnxruntime/test/providers/qnn/gather_op_htp_test.cc
+++ b/onnxruntime/test/providers/qnn/gather_op_htp_test.cc
@@ -5,6 +5,7 @@
 
 #include <string>
 #include "core/graph/graph.h"
+#include "core/graph/node_attr_utils.h"
 
 #include "test/providers/qnn/qnn_test_utils.h"
 
@@ -14,47 +15,14 @@ namespace onnxruntime {
 namespace test {
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 
-// Function that builds a float model with a Gather op.
-template <typename IndicesType = int32_t>
-static GetTestModelFn BuildGatherOpTestCase(const TestInputDef<float>& input_def,
-                                            const TestInputDef<IndicesType>& indices_def,
-                                            int64_t axis = 0) {
-  return [input_def, indices_def, axis](ModelTestBuilder& builder) {
-    NodeArg* input = MakeTestInput(builder, input_def);
-    NodeArg* indices = MakeTestInput(builder, indices_def);
-    NodeArg* output = builder.MakeOutput();
-
-    Node& gather_node = builder.AddNode("Gather", {input, indices}, {output});
-    gather_node.AddAttribute("axis", axis);
-  };
-}
-
-// Function that builds a QDQ model with a Gather op.
-template <typename QuantType = uint8_t, typename IndicesType = int32_t>
-static GetTestQDQModelFn<QuantType> BuildQDQGatherOpTestCase(const TestInputDef<float>& input_def,
-                                                             const TestInputDef<IndicesType>& indices_def,
-                                                             int64_t axis = 0) {
-  return [input_def, indices_def, axis](ModelTestBuilder& builder,
-                                        std::vector<QuantParams<QuantType>>& output_qparams) {
-    NodeArg* input = MakeTestInput(builder, input_def);
-    QuantParams<QuantType> input_qparams = GetTestInputQuantParams<QuantType>(input_def);
-    NodeArg* input_qdq = AddQDQNodePair<QuantType>(builder, input, input_qparams.scale, input_qparams.zero_point);
-
-    NodeArg* indices = MakeTestInput(builder, indices_def);
-
-    NodeArg* gather_output = builder.MakeIntermediate();
-    Node& gather_node = builder.AddNode("Gather", {input_qdq, indices}, {gather_output});
-    gather_node.AddAttribute("axis", axis);
-
-    AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, gather_output, output_qparams[0].scale, output_qparams[0].zero_point);
-  };
-}
-
 // Test the accuracy of a QDQ Gather model on QNN EP. Checks if the QDQ model on QNN EP as accurate as the QDQ model on CPU EP
 // (compared to float32 model).
 template <typename QuantType, typename IndicesType>
-static void RunQDQGatherOpTest(const TestInputDef<float>& input_def, const TestInputDef<IndicesType>& indices_def,
-                               int64_t axis, int opset, ExpectedEPNodeAssignment expected_ep_assignment) {
+static void RunQDQGatherOpTest(const TestInputDef<float>& input_def,
+                               const TestInputDef<IndicesType>& indices_def,
+                               const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                               int opset,
+                               ExpectedEPNodeAssignment expected_ep_assignment) {
   ProviderOptions provider_options;
 #if defined(_WIN32)
   provider_options["backend_path"] = "QnnHtp.dll";
@@ -62,12 +30,14 @@ static void RunQDQGatherOpTest(const TestInputDef<float>& input_def, const TestI
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy<QuantType>(BuildGatherOpTestCase<IndicesType>(input_def, indices_def, axis),
-                                  BuildQDQGatherOpTestCase<QuantType, IndicesType>(input_def, indices_def, axis),
+  auto f32_model_builder = BuildOpTestCase<float, IndicesType>("Gather", {input_def}, {indices_def}, attrs);
+  auto qdq_model_builder = BuildQDQOpTestCase<QuantType, IndicesType>("Gather", {input_def}, {indices_def}, attrs);
+
+  TestQDQModelAccuracy<QuantType>(f32_model_builder,
+                                  qdq_model_builder,
                                   provider_options,
                                   opset,
-                                  expected_ep_assignment,
-                                  1e-5f);
+                                  expected_ep_assignment);
 }
 
 // Test creates a DQ -> Gather -> Q -> DQ graph, and checks that all
@@ -77,7 +47,7 @@ static void RunQDQGatherOpTest(const TestInputDef<float>& input_def, const TestI
 TEST_F(QnnHTPBackendTests, GatherOp_IndicesStaticInt64_Axis0) {
   RunQDQGatherOpTest<uint8_t, int64_t>(TestInputDef<float>({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.7f}),
                                        TestInputDef<int64_t>({2, 2}, true, {0, 1, 1, 2}),
-                                       0,
+                                       {utils::MakeAttribute("axis", static_cast<int64_t>(0))},
                                        13,
                                        ExpectedEPNodeAssignment::All);
 }
@@ -86,7 +56,7 @@ TEST_F(QnnHTPBackendTests, GatherOp_IndicesStaticInt64_Axis0) {
 TEST_F(QnnHTPBackendTests, GatherOp_IndicesDynamicInt64_Axis0) {
   RunQDQGatherOpTest<uint8_t, int64_t>(TestInputDef<float>({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.7f}),
                                        TestInputDef<int64_t>({2, 2}, false, {0, 1, 1, 2}),
-                                       0,
+                                       {utils::MakeAttribute("axis", static_cast<int64_t>(0))},
                                        13,
                                        ExpectedEPNodeAssignment::None);
 }
@@ -98,7 +68,7 @@ TEST_F(QnnHTPBackendTests, GatherOp_IndicesDynamicInt64_Axis0) {
 TEST_F(QnnHTPBackendTests, GatherOp_IndicesStaticInt32_Axis0) {
   RunQDQGatherOpTest<uint8_t, int32_t>(TestInputDef<float>({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.7f}),
                                        TestInputDef<int32_t>({2, 2}, true, {0, 1, 1, 2}),
-                                       0,
+                                       {utils::MakeAttribute("axis", static_cast<int64_t>(0))},
                                        13,
                                        ExpectedEPNodeAssignment::All);
 }
@@ -110,7 +80,7 @@ TEST_F(QnnHTPBackendTests, GatherOp_IndicesStaticInt32_Axis0) {
 TEST_F(QnnHTPBackendTests, GatherOp_IndicesDynamicInt32_Axis0) {
   RunQDQGatherOpTest<uint8_t, int32_t>(TestInputDef<float>({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.7f}),
                                        TestInputDef<int32_t>({2, 2}, false, {0, 1, 1, 2}),
-                                       0,
+                                       {utils::MakeAttribute("axis", static_cast<int64_t>(0))},
                                        13,
                                        ExpectedEPNodeAssignment::All);
 }
@@ -122,7 +92,7 @@ TEST_F(QnnHTPBackendTests, GatherOp_IndicesDynamicInt32_Axis0) {
 TEST_F(QnnHTPBackendTests, GatherOp_IndicesStaticInt32_Axis1) {
   RunQDQGatherOpTest<uint8_t, int32_t>(TestInputDef<float>({3, 3}, false, {1.0f, 1.2f, 1.9f, 2.3f, 3.4f, 3.9f, 4.5f, 5.7f, 5.9f}),
                                        TestInputDef<int32_t>({1, 2}, true, {0, 2}),
-                                       1,
+                                       {utils::MakeAttribute("axis", static_cast<int64_t>(1))},
                                        13,
                                        ExpectedEPNodeAssignment::All);
 }
diff --git a/onnxruntime/test/providers/qnn/gemm_op_test.cc b/onnxruntime/test/providers/qnn/gemm_op_test.cc
index 4c49a6d13f564..86119afbbb3fa 100644
--- a/onnxruntime/test/providers/qnn/gemm_op_test.cc
+++ b/onnxruntime/test/providers/qnn/gemm_op_test.cc
@@ -30,7 +30,7 @@ static void RunGemmTestOnCPU(const std::vector<TestInputDef<DataType>>& input_de
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
 
-  RunQnnModelTest(BuildOpTestCase("Gemm", input_defs, attrs),
+  RunQnnModelTest(BuildOpTestCase<float>("Gemm", input_defs, {}, attrs),
                   provider_options,
                   opset,
                   expected_ep_assignment);
@@ -192,7 +192,7 @@ static void RunQDQGemmTestOnHTP(const std::vector<TestInputDef<float>>& input_de
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy<InputAQType>(BuildOpTestCase<float>("Gemm", input_defs, attrs),
+  TestQDQModelAccuracy<InputAQType>(BuildOpTestCase<float>("Gemm", input_defs, {}, attrs),
                                     BuildQDQGemmTestCase<InputAQType, InputBQType>(input_defs, attrs),
                                     provider_options,
                                     opset,
diff --git a/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc b/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc
index 594973e37ef0b..f662ac14336f8 100644
--- a/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc
+++ b/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc
@@ -16,25 +16,6 @@ namespace onnxruntime {
 namespace test {
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 
-// Function that builds a float32 model with an InstanceNormalization operator.
-GetTestModelFn BuildInstanceNormTestCase(const TestInputDef<float>& input_def,
-                                         const TestInputDef<float>& scale_def,
-                                         const TestInputDef<float>& bias_def,
-                                         const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
-  return [input_def, scale_def, bias_def, attrs](ModelTestBuilder& builder) {
-    NodeArg* input = MakeTestInput(builder, input_def);
-    NodeArg* scale = MakeTestInput(builder, scale_def);
-    NodeArg* bias = MakeTestInput(builder, bias_def);
-
-    NodeArg* output = builder.MakeOutput();
-    Node& op_node = builder.AddNode("InstanceNormalization", {input, scale, bias}, {output});
-
-    for (const auto& attr : attrs) {
-      op_node.AddAttributeProto(attr);
-    }
-  };
-}
-
 // Function that builds a QDQ model with an InstanceNormalization operator.
 template <typename QuantType>
 static GetTestQDQModelFn<QuantType> BuildQDQInstanceNormTestCase(const TestInputDef<float>& input_def,
@@ -93,7 +74,7 @@ static void RunInstanceNormQDQTest(const TestInputDef<float>& input_def,
 #endif
 
   // Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs.
-  TestQDQModelAccuracy(BuildInstanceNormTestCase(input_def, scale_def, bias_def, attrs),
+  TestQDQModelAccuracy(BuildOpTestCase<float>("InstanceNormalization", {input_def, scale_def, bias_def}, {}, attrs),
                        BuildQDQInstanceNormTestCase<QuantType>(input_def, scale_def, bias_def, attrs),
                        provider_options,
                        18,
diff --git a/onnxruntime/test/providers/qnn/layer_norm_test.cc b/onnxruntime/test/providers/qnn/layer_norm_test.cc
index aa6c6a142e6d1..085454004e5a5 100644
--- a/onnxruntime/test/providers/qnn/layer_norm_test.cc
+++ b/onnxruntime/test/providers/qnn/layer_norm_test.cc
@@ -29,7 +29,7 @@ static void RunLayerNormCpuTest(const TestInputDef<float>& input_def,
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
 
-  RunQnnModelTest(BuildOpTestCase<float>("LayerNormalization", {input_def, scale_def}, attrs),
+  RunQnnModelTest(BuildOpTestCase<float>("LayerNormalization", {input_def, scale_def}, {}, attrs),
                   provider_options,
                   17,
                   expected_ep_assignment);
@@ -114,7 +114,7 @@ static void RunLayerNormQDQTest(const TestInputDef<float>& input_def,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy(BuildOpTestCase<float>("LayerNormalization", {input_def, scale_def}, attrs),
+  TestQDQModelAccuracy(BuildOpTestCase<float>("LayerNormalization", {input_def, scale_def}, {}, attrs),
                        BuildQDQLayerNormTestCase<InputQType, ScaleQType>(input_def, scale_def, attrs),
                        provider_options,
                        17,  // opset
diff --git a/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc b/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc
index a8237817c71df..e3077ec569923 100644
--- a/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc
+++ b/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc
@@ -5,6 +5,7 @@
 
 #include <string>
 #include "core/graph/graph.h"
+#include "core/graph/node_attr_utils.h"
 
 #include "test/optimizer/qdq_test_utils.h"
 #include "test/providers/qnn/qnn_test_utils.h"
@@ -15,42 +16,10 @@ namespace onnxruntime {
 namespace test {
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 
-// Creates a function that builds a model with a LeakyRelu operator.
-static GetTestModelFn BuildLeakyReluOpTestCase(const TestInputDef<float>& input_def, float alpha) {
-  return [input_def, alpha](ModelTestBuilder& builder) {
-    NodeArg* input = MakeTestInput(builder, input_def);
-    NodeArg* output = builder.MakeOutput();
-    Node& leakyrelu_node = builder.AddNode("LeakyRelu", {input}, {output});
-    leakyrelu_node.AddAttribute("alpha", alpha);
-  };
-}
-
-// Creates a function that builds a QDQ model with a LeakyRelu operator.
-template <typename QuantType>
-static GetTestQDQModelFn<QuantType> BuildQDQLeakyReluOpTestCase(const TestInputDef<float>& input_def,
-                                                                float alpha) {
-  return [input_def, alpha](ModelTestBuilder& builder,
-                            std::vector<QuantParams<QuantType>>& output_qparams) {
-    // input => Q => DQ =>
-    NodeArg* input = MakeTestInput(builder, input_def);
-    QuantParams<QuantType> input_qparams = GetTestInputQuantParams<QuantType>(input_def);
-    NodeArg* input_qdq = AddQDQNodePair<QuantType>(builder, input, input_qparams.scale, input_qparams.zero_point);
-
-    // LeakryRelu
-    auto* leakyrelu_output = builder.MakeIntermediate();
-    Node& leakyrelu_node = builder.AddNode("LeakyRelu", {input_qdq}, {leakyrelu_output});
-    leakyrelu_node.AddAttribute("alpha", alpha);
-
-    // => Q => DQ -> final output
-    AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, leakyrelu_output, output_qparams[0].scale,
-                                                     output_qparams[0].zero_point);
-  };
-}
-
 // Checks the accuracy of a QDQ LeakyRelu model by comparing to ORT CPU EP.
 template <typename QuantType>
 static void RunLeakyReluOpQDQTest(const TestInputDef<float>& input_def,
-                                  float alpha,
+                                  const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
                                   int opset,
                                   ExpectedEPNodeAssignment expected_ep_assignment) {
   ProviderOptions provider_options;
@@ -60,12 +29,11 @@ static void RunLeakyReluOpQDQTest(const TestInputDef<float>& input_def,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy(BuildLeakyReluOpTestCase(input_def, alpha),
-                       BuildQDQLeakyReluOpTestCase<QuantType>(input_def, alpha),
+  TestQDQModelAccuracy(BuildOpTestCase<float>("LeakyRelu", {input_def}, {}, attrs),
+                       BuildQDQOpTestCase<QuantType>("LeakyRelu", {input_def}, {}, attrs),
                        provider_options,
                        opset,
-                       expected_ep_assignment,
-                       1e-5f);
+                       expected_ep_assignment);
 }
 
 // Test creates a DQ -> Gather -> Q -> DQ graph, and checks that all
@@ -74,7 +42,7 @@ static void RunLeakyReluOpQDQTest(const TestInputDef<float>& input_def,
 // - Uses uint8 as the quantization type.
 TEST_F(QnnHTPBackendTests, LeakyReluOpSet15) {
   RunLeakyReluOpQDQTest<uint8_t>(TestInputDef<float>({1, 2, 3}, false, {-40.0f, -20.0f, 0.0f, 10.0f, 30.0f, 40.0f}),
-                                 0.2f,
+                                 {utils::MakeAttribute("alpha", 0.2f)},
                                  15,
                                  ExpectedEPNodeAssignment::All);
 }
@@ -85,7 +53,7 @@ TEST_F(QnnHTPBackendTests, LeakyReluOpSet15) {
 // - Uses uint8 as the quantization type.
 TEST_F(QnnHTPBackendTests, LeakyReluOpSet16) {
   RunLeakyReluOpQDQTest<uint8_t>(TestInputDef<float>({1, 2, 3}, false, {-40.0f, -20.0f, 0.0f, 10.0f, 30.0f, 40.0f}),
-                                 0.2f,
+                                 {utils::MakeAttribute("alpha", 0.2f)},
                                  16,
                                  ExpectedEPNodeAssignment::All);
 }
diff --git a/onnxruntime/test/providers/qnn/max_min_op_test.cc b/onnxruntime/test/providers/qnn/max_min_op_test.cc
index 09ea71e5f03eb..3deff121f3c72 100644
--- a/onnxruntime/test/providers/qnn/max_min_op_test.cc
+++ b/onnxruntime/test/providers/qnn/max_min_op_test.cc
@@ -27,7 +27,7 @@ static void RunCPUMinOrMaxOpTest(const std::string& op_type,
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
 
-  RunQnnModelTest(BuildOpTestCase(op_type, input_defs, {}, kOnnxDomain),
+  RunQnnModelTest(BuildOpTestCase<float>(op_type, input_defs, {}, {}, kOnnxDomain),
                   provider_options,
                   opset,
                   expected_ep_assignment);
@@ -48,12 +48,11 @@ static void RunQDQMinOrMaxOpTest(const std::string& op_type,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy(BuildOpTestCase(op_type, input_defs, {}, kOnnxDomain),            // baseline float32 model
-                       BuildQDQOpTestCase<QType>(op_type, input_defs, {}, kOnnxDomain),  // QDQ model
+  TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, input_defs, {}, {}, kOnnxDomain),     // baseline float32 model
+                       BuildQDQOpTestCase<QType>(op_type, input_defs, {}, {}, kOnnxDomain),  // QDQ model
                        provider_options,
                        opset,
-                       expected_ep_assignment,
-                       1e-4f);
+                       expected_ep_assignment);
 }
 
 //
diff --git a/onnxruntime/test/providers/qnn/pool_op_test.cpp b/onnxruntime/test/providers/qnn/pool_op_test.cpp
index fee10a542fb82..7ed9072a95b32 100644
--- a/onnxruntime/test/providers/qnn/pool_op_test.cpp
+++ b/onnxruntime/test/providers/qnn/pool_op_test.cpp
@@ -17,21 +17,6 @@
 namespace onnxruntime {
 namespace test {
 
-// Returns a function that creates a graph with a single MaxPool operator.
-static GetTestModelFn BuildPoolTestCase(const std::string& op_type,
-                                        const TestInputDef<float>& input_def,
-                                        const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
-  return [op_type, input_def, attrs](ModelTestBuilder& builder) {
-    NodeArg* input = MakeTestInput(builder, input_def);
-    NodeArg* output = builder.MakeOutput();
-    Node& pool_node = builder.AddNode(op_type, {input}, {output});
-
-    for (const auto& attr : attrs) {
-      pool_node.AddAttributeProto(attr);
-    }
-  };
-}
-
 // Returns a function that creates a graph with a QDQ MaxPool operator.
 template <typename QuantType>
 GetTestQDQModelFn<QuantType> BuildPoolQDQTestCase(const std::string& op_type,
@@ -74,7 +59,7 @@ static void RunPoolOpTest(const std::string& op_type,
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
 
-  RunQnnModelTest(BuildPoolTestCase(op_type, input_def, attrs),
+  RunQnnModelTest(BuildOpTestCase<float>(op_type, {input_def}, {}, attrs),
                   provider_options,
                   opset,
                   expected_ep_assignment);
@@ -95,7 +80,7 @@ static void RunQDQPoolOpTest(const std::string& op_type,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy(BuildPoolTestCase(op_type, input_def, attrs),
+  TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}, attrs),
                        BuildPoolQDQTestCase<QuantType>(op_type, input_def, attrs),
                        provider_options,
                        opset,
diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.h b/onnxruntime/test/providers/qnn/qnn_test_utils.h
index fd572fa17f2b1..e8f512619a8a6 100644
--- a/onnxruntime/test/providers/qnn/qnn_test_utils.h
+++ b/onnxruntime/test/providers/qnn/qnn_test_utils.h
@@ -438,25 +438,33 @@ NodeArg* MakeTestQDQBiasInput(ModelTestBuilder& builder, const TestInputDef<floa
                               bool use_contrib_qdq = false);
 
 /**
- * Returns a function that builds a model with a single operator with N inputs of the same element type.
+ * Returns a function that builds a model with a single operator with N inputs of type InputType1 and M inputs
+ * of type InputType2.
  *
  * \param op_type The operator to instantiate.
- * \param input_defs List of input definitions.
+ * \param input_defs_1 List of input definitions of type InputType1.
+ * \param input_defs_2 List of input definitions of type InputType2.
  * \param attrs List of operator attributes.
  * \param op_domain The operator's domain. Defaults to the ONNX domain (i.e., "").
  * \returns A model building function.
  */
-template <typename InputType>
+template <typename InputType1, typename InputType2 = int64_t>
 inline GetTestModelFn BuildOpTestCase(const std::string& op_type,
-                                      const std::vector<TestInputDef<InputType>>& input_defs,
+                                      const std::vector<TestInputDef<InputType1>>& input_defs_1,
+                                      const std::vector<TestInputDef<InputType2>>& input_defs_2,
                                       const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
                                       const std::string& op_domain = kOnnxDomain) {
-  return [op_type, input_defs, attrs, op_domain](ModelTestBuilder& builder) {
+  return [op_type, input_defs_1, input_defs_2, attrs, op_domain](ModelTestBuilder& builder) {
     std::vector<NodeArg*> op_inputs;
-    op_inputs.reserve(input_defs.size());
+    op_inputs.reserve(input_defs_1.size() + input_defs_2.size());
+
+    for (const auto& input_def : input_defs_1) {
+      NodeArg* input = MakeTestInput<InputType1>(builder, input_def);
+      op_inputs.push_back(input);
+    }
 
-    for (const auto& input_def : input_defs) {
-      NodeArg* input = MakeTestInput<InputType>(builder, input_def);
+    for (const auto& input_def : input_defs_2) {
+      NodeArg* input = MakeTestInput<InputType2>(builder, input_def);
       op_inputs.push_back(input);
     }
 
@@ -470,7 +478,8 @@ inline GetTestModelFn BuildOpTestCase(const std::string& op_type,
 }
 
 /**
- * Returns a function that builds a model with a single QDQ operator with N inputs of the same element type.
+ * Returns a function that builds a model with a single QDQ operator with N float (quantizable) inputs
+ * and M inputs of a potentially different type.
  *
  * \param op_type The operator to instantiate.
  * \param input_defs List of input definitions.
@@ -478,25 +487,33 @@ inline GetTestModelFn BuildOpTestCase(const std::string& op_type,
  * \param op_domain The operator's domain. Defaults to the ONNX domain (i.e., "").
  * \returns A model building function.
  */
-template <typename InputQType>
-inline GetTestQDQModelFn<InputQType> BuildQDQOpTestCase(const std::string& op_type,
-                                                        const std::vector<TestInputDef<float>>& input_defs,
-                                                        const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
-                                                        const std::string& op_domain = kOnnxDomain,
-                                                        bool use_contrib_qdq = false) {
-  return [op_type, input_defs, attrs, op_domain,
-          use_contrib_qdq](ModelTestBuilder& builder, std::vector<QuantParams<InputQType>>& output_qparams) {
+template <typename QuantType, typename OtherInputType = int64_t>
+inline GetTestQDQModelFn<QuantType> BuildQDQOpTestCase(const std::string& op_type,
+                                                       const std::vector<TestInputDef<float>>& quant_input_defs,
+                                                       const std::vector<TestInputDef<OtherInputType>>& non_quant_input_defs,
+                                                       const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                                       const std::string& op_domain = kOnnxDomain,
+                                                       bool use_contrib_qdq = false) {
+  return [op_type, quant_input_defs, non_quant_input_defs, attrs, op_domain,
+          use_contrib_qdq](ModelTestBuilder& builder, std::vector<QuantParams<QuantType>>& output_qparams) {
     std::vector<NodeArg*> op_inputs;
-    op_inputs.reserve(input_defs.size());
+    op_inputs.reserve(quant_input_defs.size() + non_quant_input_defs.size());
 
-    for (const auto& input_def : input_defs) {
+    // Create QDQ inputs
+    for (const auto& input_def : quant_input_defs) {
       NodeArg* input = MakeTestInput<float>(builder, input_def);
-      QuantParams<InputQType> input_qparams = GetTestInputQuantParams<InputQType>(input_def);
-      NodeArg* input_after_qdq = AddQDQNodePair<InputQType>(builder, input, input_qparams.scale,
-                                                            input_qparams.zero_point, use_contrib_qdq);
+      QuantParams<QuantType> input_qparams = GetTestInputQuantParams<QuantType>(input_def);
+      NodeArg* input_after_qdq = AddQDQNodePair<QuantType>(builder, input, input_qparams.scale,
+                                                           input_qparams.zero_point, use_contrib_qdq);
       op_inputs.push_back(input_after_qdq);
     }
 
+    // Create non-QDQ inputs
+    for (const auto& input_def : non_quant_input_defs) {
+      NodeArg* input = MakeTestInput<OtherInputType>(builder, input_def);
+      op_inputs.push_back(input);
+    }
+
     // Op -> op_output
     auto* op_output = builder.MakeIntermediate();
     Node& onnx_node = builder.AddNode(op_type, op_inputs, {op_output}, op_domain);
@@ -506,8 +523,8 @@ inline GetTestQDQModelFn<InputQType> BuildQDQOpTestCase(const std::string& op_ty
     }
 
     // op_output -> Q -> DQ -> output
-    AddQDQNodePairWithOutputAsGraphOutput<InputQType>(builder, op_output, output_qparams[0].scale,
-                                                      output_qparams[0].zero_point, use_contrib_qdq);
+    AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, op_output, output_qparams[0].scale,
+                                                     output_qparams[0].zero_point, use_contrib_qdq);
   };
 }
 
diff --git a/onnxruntime/test/providers/qnn/reshape_op_test.cc b/onnxruntime/test/providers/qnn/reshape_op_test.cc
index 9615e54ed61d2..fcba46a04bcb3 100644
--- a/onnxruntime/test/providers/qnn/reshape_op_test.cc
+++ b/onnxruntime/test/providers/qnn/reshape_op_test.cc
@@ -14,23 +14,6 @@
 namespace onnxruntime {
 namespace test {
 
-// Returns a function that creates a graph with a single Reshape operator.
-template <typename DataType>
-static GetTestModelFn BuildReshapeTestCase(const TestInputDef<DataType>& input_def,
-                                           const TestInputDef<int64_t>& shape_def,
-                                           const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
-  return [input_def, shape_def, attrs](ModelTestBuilder& builder) {
-    NodeArg* input = MakeTestInput(builder, input_def);
-    NodeArg* shape_input = MakeTestInput(builder, shape_def);
-    NodeArg* output = builder.MakeOutput();
-    Node& reshape_node = builder.AddNode("Reshape", {input, shape_input}, {output});
-
-    for (const auto& attr : attrs) {
-      reshape_node.AddAttributeProto(attr);
-    }
-  };
-}
-
 // Returns a function that creates a graph with a QDQ Reshape operator.
 template <typename QuantType>
 GetTestQDQModelFn<QuantType> BuildQDQReshapeTestCase(const TestInputDef<float>& input_def,
@@ -78,7 +61,7 @@ static void RunReshapeTestOnCPU(const TestInputDef<DataType>& input_def,
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
 
-  RunQnnModelTest(BuildReshapeTestCase(input_def, shape_def, attrs),
+  RunQnnModelTest(BuildOpTestCase<DataType, int64_t>("Reshape", {input_def}, {shape_def}, attrs),
                   provider_options,
                   opset,
                   expected_ep_assignment);
@@ -100,7 +83,7 @@ static void RunReshapeTestOnHTP(const TestInputDef<DataType>& input_def,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  RunQnnModelTest(BuildReshapeTestCase(input_def, shape_def, attrs),
+  RunQnnModelTest(BuildOpTestCase<DataType, int64_t>("Reshape", {input_def}, {shape_def}, attrs),
                   provider_options,
                   opset,
                   expected_ep_assignment);
@@ -122,8 +105,10 @@ static void RunQDQReshapeTestOnHTP(const TestInputDef<float>& input_def,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy(BuildReshapeTestCase(input_def, shape_def, attrs),            // baseline float32 model
-                       BuildQDQReshapeTestCase<QType>(input_def, shape_def, attrs),  // QDQ model
+  auto f32_model_builder = BuildOpTestCase<float, int64_t>("Reshape", {input_def}, {shape_def}, attrs);
+  auto qdq_model_builder = BuildQDQReshapeTestCase<QType>(input_def, shape_def, attrs);
+  TestQDQModelAccuracy(f32_model_builder,
+                       qdq_model_builder,
                        provider_options,
                        opset,
                        expected_ep_assignment);
diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
index 63498982930f5..f77c098f72116 100644
--- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
+++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
@@ -32,7 +32,7 @@ static void RunOpTestOnCPU(const std::string& op_type,
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
 
-  RunQnnModelTest(BuildOpTestCase<InputType>(op_type, input_defs, attrs, op_domain),
+  RunQnnModelTest(BuildOpTestCase<InputType>(op_type, input_defs, {}, attrs, op_domain),
                   provider_options,
                   opset_version,
                   expected_ep_assignment);
@@ -113,8 +113,8 @@ static void RunQDQOpTest(const std::string& op_type,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, input_defs, attrs, op_domain),
-                       BuildQDQOpTestCase<InputQType>(op_type, input_defs, attrs, op_domain, use_contrib_qdq),
+  TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, input_defs, {}, attrs, op_domain),
+                       BuildQDQOpTestCase<InputQType>(op_type, input_defs, {}, attrs, op_domain, use_contrib_qdq),
                        provider_options,
                        opset_version,
                        expected_ep_assignment,
@@ -137,7 +137,7 @@ static void RunOpTest(const std::string& op_type,
 #endif
 
   // Runs model with a Q/DQ binary op and compares the outputs of the CPU and QNN EPs.
-  RunQnnModelTest(BuildOpTestCase<InputType>(op_type, input_defs, attrs, op_domain),
+  RunQnnModelTest(BuildOpTestCase<InputType>(op_type, input_defs, {}, attrs, op_domain),
                   provider_options,
                   opset_version,
                   expected_ep_assignment);
@@ -698,8 +698,8 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheTest) {
 
   // Runs model with DQ-> Atan-> Q and compares the outputs of the CPU and QNN EPs.
   // 1st run will generate the Qnn context cache binary file
-  TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}),
-                       BuildQDQOpTestCase<uint8_t>(op_type, {input_def}, {}),
+  TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}, {}),
+                       BuildQDQOpTestCase<uint8_t>(op_type, {input_def}, {}, {}),
                        provider_options,
                        14,
                        ExpectedEPNodeAssignment::All);
@@ -708,8 +708,8 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheTest) {
   EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str()));
 
   // 2nd run will load and run from Qnn context cache binary file
-  TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}),
-                       BuildQDQOpTestCase<uint8_t>(op_type, {input_def}, {}),
+  TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}, {}),
+                       BuildQDQOpTestCase<uint8_t>(op_type, {input_def}, {}, {}),
                        provider_options,
                        14,
                        ExpectedEPNodeAssignment::All);
diff --git a/onnxruntime/test/providers/qnn/slice_htp_test.cc b/onnxruntime/test/providers/qnn/slice_htp_test.cc
index f7163f04736a5..edc079dc65276 100644
--- a/onnxruntime/test/providers/qnn/slice_htp_test.cc
+++ b/onnxruntime/test/providers/qnn/slice_htp_test.cc
@@ -16,51 +16,6 @@ namespace onnxruntime {
 namespace test {
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 
-// Function that builds a model with a Slice operator.
-template <typename DataType>
-GetTestModelFn BuildSliceTestCase(const TestInputDef<DataType>& data_def,
-                                  const TestInputDef<int64_t>& starts_def,
-                                  const TestInputDef<int64_t>& ends_def,
-                                  const TestInputDef<int64_t>& axes_def,
-                                  const TestInputDef<int64_t>& steps_def) {
-  return [data_def, starts_def, ends_def, axes_def, steps_def](ModelTestBuilder& builder) {
-    NodeArg* data = MakeTestInput(builder, data_def);
-    NodeArg* starts = MakeTestInput(builder, starts_def);
-    NodeArg* ends = MakeTestInput(builder, ends_def);
-    NodeArg* axes = MakeTestInput(builder, axes_def);
-    NodeArg* steps = MakeTestInput(builder, steps_def);
-
-    NodeArg* output = builder.MakeOutput();
-    builder.AddNode("Slice", {data, starts, ends, axes, steps}, {output});
-  };
-}
-
-// Function that builds a QDQ model with a Slice operator.
-template <typename QuantType>
-static GetTestQDQModelFn<QuantType> BuildQDQSliceTestCase(const TestInputDef<float>& data_def,
-                                                          const TestInputDef<int64_t>& starts_def,
-                                                          const TestInputDef<int64_t>& ends_def,
-                                                          const TestInputDef<int64_t>& axes_def,
-                                                          const TestInputDef<int64_t>& steps_def) {
-  return [data_def, starts_def, ends_def, axes_def, steps_def](ModelTestBuilder& builder,
-                                                               std::vector<QuantParams<QuantType>>& output_qparams) {
-    NodeArg* data = MakeTestInput(builder, data_def);
-    QuantParams<QuantType> data_qparams = GetTestInputQuantParams<QuantType>(data_def);
-    NodeArg* data_qdq = AddQDQNodePair(builder, data, data_qparams.scale, data_qparams.zero_point);
-
-    NodeArg* starts = MakeTestInput(builder, starts_def);
-    NodeArg* ends = MakeTestInput(builder, ends_def);
-    NodeArg* axes = MakeTestInput(builder, axes_def);
-    NodeArg* steps = MakeTestInput(builder, steps_def);
-
-    auto* slice_output = builder.MakeIntermediate();
-    builder.AddNode("Slice", {data_qdq, starts, ends, axes, steps}, {slice_output});
-
-    // Add output -> Q -> output_u8
-    AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, slice_output, output_qparams[0].scale, output_qparams[0].zero_point);
-  };
-}
-
 /**
  * Runs an Slice model on the QNN HTP backend. Checks the graph node assignment, and that inference
  * outputs for QNN and CPU match.
@@ -86,13 +41,14 @@ static void RunSliceQDQTest(const TestInputDef<float>& data_def,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  // Runs model with DQ-> Slice -> Q and compares the outputs of the CPU and QNN EPs.
-  TestQDQModelAccuracy(BuildSliceTestCase<float>(data_def, starts_def, ends_def, axes_def, steps_def),
-                       BuildQDQSliceTestCase<QuantType>(data_def, starts_def, ends_def, axes_def, steps_def),
+  const std::vector<TestInputDef<float>> f32_inputs = {data_def};
+  const std::vector<TestInputDef<int64_t>> int64_inputs = {starts_def, ends_def, axes_def, steps_def};
+
+  TestQDQModelAccuracy(BuildOpTestCase<float, int64_t>("Slice", f32_inputs, int64_inputs, {}),
+                       BuildQDQOpTestCase<QuantType, int64_t>("Slice", f32_inputs, int64_inputs, {}),
                        provider_options,
                        18,
-                       expected_ep_assignment,
-                       1e-5f);
+                       expected_ep_assignment);
 }
 
 /**
@@ -119,12 +75,12 @@ static void RunSliceNonQDQOnHTP(const TestInputDef<DataType>& data_def,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
-
-  RunQnnModelTest(BuildSliceTestCase<DataType>(data_def, starts_def, ends_def, axes_def, steps_def),
+  auto f32_model_builder = BuildOpTestCase<DataType, int64_t>("Slice", {data_def},
+                                                              {starts_def, ends_def, axes_def, steps_def}, {});
+  RunQnnModelTest(f32_model_builder,
                   provider_options,
                   13,
-                  expected_ep_assignment,
-                  1e-5f);
+                  expected_ep_assignment);
 }
 
 // Check that QNN compiles DQ -> Slice -> Q as a single unit.
diff --git a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
index 02e61bfc2d25e..454bcb280b3e8 100644
--- a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
+++ b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
@@ -13,19 +13,6 @@
 namespace onnxruntime {
 namespace test {
 
-// Returns a function that creates a graph with a single (Un)Squeeze operator.
-template <typename DataType>
-static GetTestModelFn BuildSqueezeTestCase(const std::string& op_type,  // Squeeze or Unsqueeze
-                                           const TestInputDef<DataType>& input_def,
-                                           const TestInputDef<int64_t>& axes_def) {
-  return [op_type, input_def, axes_def](ModelTestBuilder& builder) {
-    NodeArg* input = MakeTestInput(builder, input_def);
-    NodeArg* axes_input = MakeTestInput(builder, axes_def);
-    NodeArg* output = builder.MakeOutput();
-    builder.AddNode(op_type, {input, axes_input}, {output});
-  };
-}
-
 // Returns a function that creates a graph with a QDQ (Un)Squeeze operator.
 template <typename QuantType>
 GetTestQDQModelFn<QuantType> BuildQDQSqueezeTestCase(const std::string& op_type,  // Squeeze or Unsqueeze
@@ -69,7 +56,7 @@ static void RunSqueezeTestOnCPU(const std::string& op_type,  // Squeeze or Unsqu
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
 
-  RunQnnModelTest(BuildSqueezeTestCase<DataType>(op_type, input_def, axes_def),
+  RunQnnModelTest(BuildOpTestCase<DataType, int64_t>(op_type, {input_def}, {axes_def}, {}),
                   provider_options,
                   opset,
                   expected_ep_assignment);
@@ -91,7 +78,7 @@ static void RunSqueezeTestOnHTP(const std::string& op_type,  // Squeeze or Unsqu
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  RunQnnModelTest(BuildSqueezeTestCase<DataType>(op_type, input_def, axes_def),
+  RunQnnModelTest(BuildOpTestCase<DataType, int64_t>(op_type, {input_def}, {axes_def}, {}),
                   provider_options,
                   opset,
                   expected_ep_assignment);
@@ -114,8 +101,8 @@ static void RunQDQSqueezeTestOnHTP(const std::string& op_type,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy(BuildSqueezeTestCase<float>(op_type, input_def, axes_def),     // baseline float32 model
-                       BuildQDQSqueezeTestCase<QType>(op_type, input_def, axes_def),  // QDQ model
+  TestQDQModelAccuracy(BuildOpTestCase<float, int64_t>(op_type, {input_def}, {axes_def}, {}),  // baseline float32 model
+                       BuildQDQSqueezeTestCase<QType>(op_type, input_def, axes_def),           // QDQ model
                        provider_options,
                        opset,
                        expected_ep_assignment);

From e5de9830eb42f614ab72744b89cade1689d3b479 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Tue, 19 Sep 2023 13:18:52 -0700
Subject: [PATCH 15/22] Clean up Clip tests

---
 .../test/providers/qnn/clip_op_test.cc        | 69 ++++++-------------
 1 file changed, 22 insertions(+), 47 deletions(-)

diff --git a/onnxruntime/test/providers/qnn/clip_op_test.cc b/onnxruntime/test/providers/qnn/clip_op_test.cc
index 5e567644aa13b..1d448cbe57030 100644
--- a/onnxruntime/test/providers/qnn/clip_op_test.cc
+++ b/onnxruntime/test/providers/qnn/clip_op_test.cc
@@ -17,7 +17,8 @@ namespace test {
 // Runs a model with a Clip operator on the QNN CPU backend. Checks the graph node assignment
 // and that inference outputs for QNN EP and CPU EP match.
 template <typename DataType>
-static void RunClipTestOnCPU(const std::vector<TestInputDef<DataType>>& input_defs,
+static void RunClipTestOnCPU(const TestInputDef<DataType>& input_def,
+                             const std::vector<TestInputDef<DataType>>& min_max_defs,
                              ExpectedEPNodeAssignment expected_ep_assignment,
                              int opset = 13) {
   ProviderOptions provider_options;
@@ -28,7 +29,7 @@ static void RunClipTestOnCPU(const std::vector<TestInputDef<DataType>>& input_de
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
 
-  RunQnnModelTest(BuildOpTestCase<DataType>("Clip", input_defs, {}, {}),
+  RunQnnModelTest(BuildOpTestCase<DataType, DataType>("Clip", {input_def}, min_max_defs, {}),
                   provider_options,
                   opset,
                   expected_ep_assignment);
@@ -41,26 +42,27 @@ static void RunClipTestOnCPU(const std::vector<TestInputDef<DataType>>& input_de
 // Test that Clip with a dynamic min or max input is not supported by QNN EP.
 TEST_F(QnnCPUBackendTests, Clip_Dynamic_MinMax_Unsupported) {
   // Dynamic min input is not supported.
-  RunClipTestOnCPU<float>({TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
-                           TestInputDef<float>({}, false /* is_initializer */, {-5.0f})},
+  RunClipTestOnCPU<float>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                          {TestInputDef<float>({}, false /* is_initializer */, {-5.0f})},
                           ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
   // Dynamic max input is not supported.
-  RunClipTestOnCPU<float>({TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
-                           TestInputDef<float>({}, true, {-5.0f}),
+  RunClipTestOnCPU<float>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                          {TestInputDef<float>({}, true, {-5.0f}),
                            TestInputDef<float>({}, false, {5.0f})},
                           ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
 }
 
 // Test Clip with default min/max.
 TEST_F(QnnCPUBackendTests, Clip_4D_f32_DefaultMinMax) {
-  RunClipTestOnCPU<float>({TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48))},
+  RunClipTestOnCPU<float>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                          {},  // Don't specify min/max inputs.
                           ExpectedEPNodeAssignment::All);
 }
 
 // Test Clip with 5D input.
 TEST_F(QnnCPUBackendTests, Clip_5D_f32) {
-  RunClipTestOnCPU<float>({TestInputDef<float>({1, 1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
-                           TestInputDef<float>({}, true, {-5.0f}),
+  RunClipTestOnCPU<float>(TestInputDef<float>({1, 1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                          {TestInputDef<float>({}, true, {-5.0f}),
                            TestInputDef<float>({}, true, {5.0f})},
                           ExpectedEPNodeAssignment::All);
 }
@@ -70,42 +72,11 @@ TEST_F(QnnCPUBackendTests, Clip_5D_f32) {
 // HTP tests:
 //
 
-// Returns a function that builds a model with a QDQ Clip operator. Only the first input is quantized.
-template <typename InputQType>
-inline GetTestQDQModelFn<InputQType> BuildQDQClipTestCase(const std::vector<TestInputDef<float>>& input_defs) {
-  return [input_defs](ModelTestBuilder& builder, std::vector<QuantParams<InputQType>>& output_qparams) {
-    const size_t num_inputs = input_defs.size();
-    std::vector<NodeArg*> op_inputs;
-    op_inputs.reserve(num_inputs);
-
-    for (size_t i = 0; i < num_inputs; i++) {
-      const TestInputDef<float>& input_def = input_defs[i];
-      NodeArg* input = MakeTestInput<float>(builder, input_def);
-
-      if (i == 0) {  // Only input 0 is quantized.
-        QuantParams<InputQType> input_qparams = GetTestInputQuantParams<InputQType>(input_def);
-        NodeArg* input_after_qdq = AddQDQNodePair<InputQType>(builder, input, input_qparams.scale,
-                                                              input_qparams.zero_point);
-        op_inputs.push_back(input_after_qdq);
-      } else {
-        op_inputs.push_back(input);
-      }
-    }
-
-    // Op -> op_output
-    auto* clip_output = builder.MakeIntermediate();
-    builder.AddNode("Clip", op_inputs, {clip_output});
-
-    // op_output -> Q -> DQ -> output
-    AddQDQNodePairWithOutputAsGraphOutput<InputQType>(builder, clip_output, output_qparams[0].scale,
-                                                      output_qparams[0].zero_point);
-  };
-}
-
 // Runs a QDQ Clip model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment and that inference
 // running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (when compared to the baseline float32 model).
 template <typename QType>
-static void RunQDQClipTestOnHTP(const std::vector<TestInputDef<float>>& input_defs,
+static void RunQDQClipTestOnHTP(const TestInputDef<float>& input_def,
+                                const std::vector<TestInputDef<float>>& min_max_defs,
                                 ExpectedEPNodeAssignment expected_ep_assignment,
                                 int opset = 13) {
   ProviderOptions provider_options;
@@ -116,8 +87,11 @@ static void RunQDQClipTestOnHTP(const std::vector<TestInputDef<float>>& input_de
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy(BuildOpTestCase<float>("Clip", input_defs, {}, {}),  // baseline float32 model
-                       BuildQDQClipTestCase<QType>(input_defs),             // QDQ model
+  auto f32_model_builder = BuildOpTestCase<float, float>("Clip", {input_def}, {min_max_defs}, {});
+  auto qdq_model_builder = BuildQDQOpTestCase<QType, float>("Clip", {input_def}, {min_max_defs}, {});
+
+  TestQDQModelAccuracy(f32_model_builder,
+                       qdq_model_builder,
                        provider_options,
                        opset,
                        expected_ep_assignment);
@@ -130,14 +104,15 @@ static void RunQDQClipTestOnHTP(const std::vector<TestInputDef<float>>& input_de
 // - ClipQuantFusion: Fuses Clip -> QuantizeLinear resulting in Q1 -> DQ1 -> Q2' -> DQ2
 // - DoubleQDQPairsRemover: Simplifies remaining Q1 -> DQ1 -> Q2' -> DQ2 sequence to Q1 -> DQ2.
 TEST_F(QnnHTPBackendTests, Clip_U8_DefaultMinMax_Rank4) {
-  RunQDQClipTestOnHTP<uint8_t>({TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48))},
+  RunQDQClipTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                               {},  // Don't specify min/max inputs.
                                ExpectedEPNodeAssignment::All);
 }
 
 // Test QDQ Clip with non-default min and max inputs. QNN EP will get a graph with a Clip operator.
 TEST_F(QnnHTPBackendTests, Clip_U8_Rank4) {
-  RunQDQClipTestOnHTP<uint8_t>({TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
-                                TestInputDef<float>({}, true, {-5.0f}),
+  RunQDQClipTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                               {TestInputDef<float>({}, true, {-5.0f}),
                                 TestInputDef<float>({}, true, {5.0f})},
                                ExpectedEPNodeAssignment::All);
 }

From 51ff43de5da46ef6bed7fb6db5bbc4e380ea3775 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Tue, 19 Sep 2023 17:43:15 -0700
Subject: [PATCH 16/22] Add QNN EP tests for the Split operator

---
 .../qnn/builder/opbuilder/split_op_builder.cc |   1 +
 .../test/providers/qnn/qnn_test_utils.cc      |  14 +-
 .../test/providers/qnn/qnn_test_utils.h       |  17 +-
 .../test/providers/qnn/split_op_test.cc       | 311 ++++++++++++++++++
 4 files changed, 326 insertions(+), 17 deletions(-)
 create mode 100644 onnxruntime/test/providers/qnn/split_op_test.cc

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc
index 6812c223f7c90..a11712dd06ad0 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc
@@ -76,6 +76,7 @@ Status SplitOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wr
                      [](int64_t item) { return SafeInt<uint32_t>(item); });
       split_index.pop_back();
     } else {
+      // TODO: Support Split opset 18, which may specify the 'num_outputs' attribute.
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN doesn't support dynamic split");
     }
   } else {
diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.cc b/onnxruntime/test/providers/qnn/qnn_test_utils.cc
index 724e9a11cd781..51df93f8853ec 100644
--- a/onnxruntime/test/providers/qnn/qnn_test_utils.cc
+++ b/onnxruntime/test/providers/qnn/qnn_test_utils.cc
@@ -73,7 +73,7 @@ void RunQnnModelTest(const GetTestModelFn& build_test_case, const ProviderOption
 void InferenceModel(const std::string& model_data, const char* log_id,
                     std::unique_ptr<IExecutionProvider> execution_provider,
                     ExpectedEPNodeAssignment expected_ep_assignment, const NameMLValMap& feeds,
-                    std::vector<std::string>& output_names, std::vector<OrtValue>& output_vals) {
+                    std::vector<OrtValue>& output_vals) {
   SessionOptions so;
   so.session_logid = log_id;
   RunOptions run_options;
@@ -102,14 +102,12 @@ void InferenceModel(const std::string& model_data, const char* log_id,
   }
 
   const auto& outputs = graph.GetOutputs();
+  std::vector<std::string> output_names;
 
-  // fetch all outputs if necessary.
-  if (output_names.empty()) {
-    output_names.reserve(outputs.size());
-    for (const auto* node_arg : outputs) {
-      if (node_arg->Exists()) {
-        output_names.push_back(node_arg->Name());
-      }
+  output_names.reserve(outputs.size());
+  for (const auto* node_arg : outputs) {
+    if (node_arg->Exists()) {
+      output_names.push_back(node_arg->Name());
     }
   }
 
diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.h b/onnxruntime/test/providers/qnn/qnn_test_utils.h
index e8f512619a8a6..14c62f98f6a3e 100644
--- a/onnxruntime/test/providers/qnn/qnn_test_utils.h
+++ b/onnxruntime/test/providers/qnn/qnn_test_utils.h
@@ -213,13 +213,12 @@ inline QuantParams<QType> GetTestInputQuantParams(const TestInputDef<float>& inp
  * \param execution_provider The EP on which to run the model. Set to nullptr for CPU EP.
  * \param expected_ep_assignment Describes "which nodes" should be assigned to the EP.
  * \param feeds The input feeds.
- * \param output_names If empty, the function will write the output names.
  * \param output_vals Initialized to the inference results.
  */
 void InferenceModel(const std::string& model_data, const char* log_id,
                     std::unique_ptr<IExecutionProvider> execution_provider,
                     ExpectedEPNodeAssignment expected_ep_assignment, const NameMLValMap& feeds,
-                    std::vector<std::string>& output_names, std::vector<OrtValue>& output_vals);
+                    std::vector<OrtValue>& output_vals);
 
 /**
  * Tests the accuracy of a QDQ model on QNN EP by runnning 3 inferences:
@@ -263,9 +262,8 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe
 
   // Run f32 model on CPU EP and collect outputs.
   std::vector<OrtValue> cpu_f32_outputs;
-  std::vector<std::string> output_names;
   InferenceModel(f32_model_data, "f32_model_logger", nullptr, ExpectedEPNodeAssignment::All,
-                 f32_helper.feeds_, output_names, cpu_f32_outputs);
+                 f32_helper.feeds_, cpu_f32_outputs);
   ASSERT_FALSE(cpu_f32_outputs.empty());
 
   const size_t num_outputs = cpu_f32_outputs.size();
@@ -304,13 +302,13 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe
   // Run QDQ model on QNN EP and collect outputs.
   std::vector<OrtValue> qnn_qdq_outputs;
   InferenceModel(qdq_model_data, "qdq_model_logger", QnnExecutionProviderWithOptions(qnn_options),
-                 expected_ep_assignment, qdq_helper.feeds_, output_names, qnn_qdq_outputs);
+                 expected_ep_assignment, qdq_helper.feeds_, qnn_qdq_outputs);
 
   if (expected_ep_assignment != ExpectedEPNodeAssignment::None) {
     // Run QDQ model on CPU EP and collect outputs.
     std::vector<OrtValue> cpu_qdq_outputs;
     InferenceModel(qdq_model_data, "qdq_model_logger", nullptr, ExpectedEPNodeAssignment::All,
-                   qdq_helper.feeds_, output_names, cpu_qdq_outputs);
+                   qdq_helper.feeds_, cpu_qdq_outputs);
     ASSERT_EQ(cpu_qdq_outputs.size(), num_outputs);
     ASSERT_EQ(qnn_qdq_outputs.size(), num_outputs);
 
@@ -320,7 +318,9 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe
 
     // Compare accuracy of QDQ results with float model.
     // QNN EP must be at least as accurate as CPU EP when running the QDQ model.
+    const std::string base_output_name = "output_";
     for (size_t i = 0; i < num_outputs; i++) {
+      std::string debug_output_name = base_output_name + std::to_string(i);
       auto& cpu_qdq_tensor = cpu_qdq_outputs[i].Get<Tensor>();
       auto& qnn_qdq_tensor = qnn_qdq_outputs[i].Get<Tensor>();
 
@@ -353,8 +353,7 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe
           }
 
           EXPECT_TRUE(is_as_accurate_as_cpu_qdq)
-              << "Inaccuracy detected for output '"
-              << output_names[i]
+              << "Inaccuracy detected for output '" << debug_output_name
               << "', element " << j
               << ".\nOutput quant params: scale=" << output_qparams[i].scale
               << ", zero_point=" << static_cast<int32_t>(output_qparams[i].zero_point)
@@ -363,7 +362,7 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe
               << "CPU QDQ val: " << cpu_qdq_val << " (err " << cpu_err << ")";
         }
       } else {
-        VerifyOutput(output_names[i], cpu_f32_outputs[i].Get<Tensor>(), qnn_qdq_tensor, fp32_abs_err);
+        VerifyOutput(debug_output_name, cpu_f32_outputs[i].Get<Tensor>(), qnn_qdq_tensor, fp32_abs_err);
       }
     }
   }
diff --git a/onnxruntime/test/providers/qnn/split_op_test.cc b/onnxruntime/test/providers/qnn/split_op_test.cc
new file mode 100644
index 0000000000000..8ad84f69430da
--- /dev/null
+++ b/onnxruntime/test/providers/qnn/split_op_test.cc
@@ -0,0 +1,311 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#if !defined(ORT_MINIMAL_BUILD)
+
+#include <string>
+
+#include "test/providers/qnn/qnn_test_utils.h"
+
+#include "onnx/onnx_pb.h"
+#include "gtest/gtest.h"
+
+namespace onnxruntime {
+namespace test {
+
+template <typename DataType>
+GetTestModelFn BuildSplitTestCase(const TestInputDef<DataType>& input_def,
+                                  const std::vector<int64_t>& split, bool split_is_input,
+                                  int64_t axis, int64_t num_outputs) {
+  return [input_def, split, split_is_input, axis, num_outputs](ModelTestBuilder& builder) {
+    std::vector<NodeArg*> op_inputs;
+
+    op_inputs.push_back(MakeTestInput<DataType>(builder, input_def));
+
+    if (split_is_input) {
+      op_inputs.push_back(builder.Make1DInitializer(split));
+    }
+
+    // Determine the actual number of outputs from the 'split' or 'num_outputs' arguments.
+    // A negative 'num_outputs' means the attribute is not set, so the size of 'split' is used instead.
+    // In opset 18, either the num_outputs attribute or the split input determines the number of outputs.
+    // In opset 13 the split input determines it; in opsets < 13 the split attribute does.
+    size_t actual_num_outputs = (num_outputs > -1) ? static_cast<size_t>(num_outputs) : split.size();
+
+    std::vector<NodeArg*> split_outputs;
+    for (size_t i = 0; i < actual_num_outputs; i++) {
+      split_outputs.push_back(builder.MakeOutput());
+    }
+
+    Node& split_node = builder.AddNode("Split", op_inputs, split_outputs);
+
+    if (!split_is_input) {
+      split_node.AddAttribute("split", split);
+    }
+
+    if (num_outputs > -1) {
+      split_node.AddAttribute("num_outputs", num_outputs);
+    }
+
+    split_node.AddAttribute("axis", axis);
+  };
+}
+
+template <typename DataType>
+static void RunSplitOpTestOnCPU(const TestInputDef<DataType>& input_def,
+                                const std::vector<int64_t>& split,
+                                int64_t axis,
+                                int64_t num_outputs,
+                                int opset,
+                                ExpectedEPNodeAssignment expected_ep_assignment) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnCpu.dll";
+#else
+  provider_options["backend_path"] = "libQnnCpu.so";
+#endif
+
+  const bool split_is_input = opset >= 13;
+  RunQnnModelTest(BuildSplitTestCase<DataType>(input_def, split, split_is_input, axis, num_outputs),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+//
+// CPU tests:
+//
+
+// Test Split opset 13 on CPU backend: equal split of axis 0
+TEST_F(QnnCPUBackendTests, Split_Equal_Axis0_Opset13) {
+  RunSplitOpTestOnCPU<float>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
+                             {2, 2},  // split
+                             0,       // axis
+                             -1,      // num_outputs (not in opset 13)
+                             13,      // opset
+                             ExpectedEPNodeAssignment::All);
+  RunSplitOpTestOnCPU<int32_t>(TestInputDef<int32_t>({4, 2}, false, {1, 2, 3, 4, 5, 6, 7, 8}),
+                               {2, 2},  // split
+                               0,       // axis
+                               -1,      // num_outputs (not in opset 13)
+                               13,      // opset
+                               ExpectedEPNodeAssignment::All);
+}
+
+// Test Split opset 11 on CPU backend: equal split of axis 0
+TEST_F(QnnCPUBackendTests, Split_Equal_Axis0_Opset11) {
+  RunSplitOpTestOnCPU<float>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
+                             {2, 2},  // split
+                             0,       // axis
+                             -1,      // num_outputs (not in opset 11)
+                             11,      // opset
+                             ExpectedEPNodeAssignment::All);
+  RunSplitOpTestOnCPU<int32_t>(TestInputDef<int32_t>({4, 2}, false, {1, 2, 3, 4, 5, 6, 7, 8}),
+                               {2, 2},  // split
+                               0,       // axis
+                               -1,      // num_outputs (not in opset 11)
+                               11,      // opset
+                               ExpectedEPNodeAssignment::All);
+}
+
+// Test Split opset 13 on CPU backend: unequal split of axis 1
+TEST_F(QnnCPUBackendTests, Split_Unequal_Axis1_Opset13) {
+  RunSplitOpTestOnCPU<float>(TestInputDef<float>({2, 4}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
+                             {1, 3},  // split
+                             1,       // axis
+                             -1,      // num_outputs (not in opset 13)
+                             13,      // opset
+                             ExpectedEPNodeAssignment::All);
+  RunSplitOpTestOnCPU<int32_t>(TestInputDef<int32_t>({2, 4}, false, {1, 2, 3, 4, 5, 6, 7, 8}),
+                               {1, 3},  // split
+                               1,       // axis
+                               -1,      // num_outputs (not in opset 13)
+                               13,      // opset
+                               ExpectedEPNodeAssignment::All);
+}
+
+// Test Split opset 11 on CPU backend: unequal split of axis 1
+TEST_F(QnnCPUBackendTests, Split_Unequal_Axis1_Opset11) {
+  RunSplitOpTestOnCPU<float>(TestInputDef<float>({2, 4}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
+                             {1, 3},  // split
+                             1,       // axis
+                             -1,      // num_outputs (not in opset 11)
+                             11,      // opset
+                             ExpectedEPNodeAssignment::All);
+  RunSplitOpTestOnCPU<int32_t>(TestInputDef<int32_t>({2, 4}, false, {1, 2, 3, 4, 5, 6, 7, 8}),
+                               {1, 3},  // split
+                               1,       // axis
+                               -1,      // num_outputs (not in opset 11)
+                               11,      // opset
+                               ExpectedEPNodeAssignment::All);
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+//
+// HTP tests:
+//
+
+// Returns a function that builds a model with a QDQ Split operator.
+template <typename QuantType>
+GetTestQDQModelFn<QuantType> BuildQDQSplitTestCase(const TestInputDef<float>& input_def,
+                                                   const std::vector<int64_t>& split,
+                                                   bool split_is_input,
+                                                   int64_t axis,
+                                                   int64_t num_outputs,
+                                                   bool use_contrib_qdq = false) {
+  return [input_def, split, split_is_input, axis, num_outputs,
+          use_contrib_qdq](ModelTestBuilder& builder,
+                           std::vector<QuantParams<QuantType>>& output_qparams) {
+    std::vector<NodeArg*> op_inputs;
+
+    // Add QDQ input
+    NodeArg* input = MakeTestInput<float>(builder, input_def);
+    QuantParams<QuantType> input_qparams = GetTestInputQuantParams<QuantType>(input_def);
+    NodeArg* input_after_qdq = AddQDQNodePair<QuantType>(builder, input, input_qparams.scale,
+                                                         input_qparams.zero_point, use_contrib_qdq);
+    op_inputs.push_back(input_after_qdq);
+
+    // Add split input
+    if (split_is_input) {
+      op_inputs.push_back(builder.Make1DInitializer(split));
+    }
+
+    // Determine the actual number of outputs from the 'split' or 'num_outputs' arguments.
+    // A negative 'num_outputs' means the attribute is not set, so the size of 'split' is used instead.
+    // In opset 18, either the num_outputs attribute or the split input determines the number of outputs.
+    // In opset 13 the split input determines it; in opsets < 13 the split attribute does.
+    size_t actual_num_outputs = (num_outputs > -1) ? static_cast<size_t>(num_outputs) : split.size();
+
+    std::vector<NodeArg*> split_outputs;
+    for (size_t i = 0; i < actual_num_outputs; i++) {
+      split_outputs.push_back(builder.MakeIntermediate());
+    }
+
+    Node& split_node = builder.AddNode("Split", op_inputs, split_outputs);
+
+    if (!split_is_input) {
+      split_node.AddAttribute("split", split);
+    }
+
+    if (num_outputs > -1) {
+      split_node.AddAttribute("num_outputs", num_outputs);
+    }
+
+    split_node.AddAttribute("axis", axis);
+
+    // op_output -> Q -> DQ -> output
+    assert(output_qparams.size() == actual_num_outputs);
+    for (size_t i = 0; i < actual_num_outputs; i++) {
+      // NOTE: Input and output quantization parameters must be equal for Split.
+      output_qparams[i] = input_qparams;
+      AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, split_outputs[i], output_qparams[i].scale,
+                                                       output_qparams[i].zero_point, use_contrib_qdq);
+    }
+  };
+}
+
+// Runs a non-QDQ Split operator on the HTP backend.
+template <typename DataType>
+static void RunSplitOpTestOnHTP(const TestInputDef<DataType>& input_def,
+                                const std::vector<int64_t>& split,
+                                int64_t axis,
+                                int64_t num_outputs,
+                                int opset,
+                                ExpectedEPNodeAssignment expected_ep_assignment) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  const bool split_is_input = opset >= 13;
+  RunQnnModelTest(BuildSplitTestCase<DataType>(input_def, split, split_is_input, axis, num_outputs),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+// Runs a QDQ Split operator on the HTP backend.
+template <typename QuantType>
+static void RunQDQSplitOpTestOnHTP(const TestInputDef<float>& input_def,
+                                   const std::vector<int64_t>& split,
+                                   int64_t axis,
+                                   int64_t num_outputs,
+                                   int opset,
+                                   ExpectedEPNodeAssignment expected_ep_assignment) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  const bool split_is_input = opset >= 13;
+  auto f32_model_builder = BuildSplitTestCase<float>(input_def, split, split_is_input, axis, num_outputs);
+  auto qdq_model_builder = BuildQDQSplitTestCase<QuantType>(input_def, split, split_is_input, axis, num_outputs);
+  TestQDQModelAccuracy<QuantType>(f32_model_builder,
+                                  qdq_model_builder,
+                                  provider_options,
+                                  opset,
+                                  expected_ep_assignment);
+}
+
+// Test that HTP can run non-QDQ Split (int32 input).
+TEST_F(QnnHTPBackendTests, Split_Int32_Opset13) {
+  // Equal split.
+  RunSplitOpTestOnHTP<int32_t>(TestInputDef<int32_t>({4, 2}, false, {1, 2, 3, 4, 5, 6, 7, 8}),
+                               {2, 2},  // split
+                               0,       // axis
+                               -1,      // num_outputs (not in opset 13)
+                               13,      // opset
+                               ExpectedEPNodeAssignment::All);
+}
+
+// Test QDQ Split op on HTP backend: equal split on axis 0 with opset 13.
+TEST_F(QnnHTPBackendTests, Split_Equal_Axis0_Opset13) {
+  RunQDQSplitOpTestOnHTP<uint8_t>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
+                                  {2, 2},  // split
+                                  0,       // axis
+                                  -1,      // num_outputs (not in opset 13)
+                                  13,      // opset
+                                  ExpectedEPNodeAssignment::All);
+}
+
+// Test QDQ Split op on HTP backend: equal split on axis 0 with opset 11.
+TEST_F(QnnHTPBackendTests, Split_Equal_Axis0_Opset11) {
+  RunQDQSplitOpTestOnHTP<uint8_t>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
+                                  {2, 2},  // split
+                                  0,       // axis
+                                  -1,      // num_outputs (not in opset 11)
+                                  11,      // opset
+                                  ExpectedEPNodeAssignment::All);
+}
+
+// Test Split opset 13 on HTP backend: unequal split of axis 1
+TEST_F(QnnHTPBackendTests, Split_Unequal_Axis1_Opset13) {
+  RunQDQSplitOpTestOnHTP<uint8_t>(TestInputDef<float>({2, 4}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
+                                  {1, 3},  // split
+                                  1,       // axis
+                                  -1,      // num_outputs (not in opset 13)
+                                  13,      // opset
+                                  ExpectedEPNodeAssignment::All);
+}
+
+// Test Split opset 11 on HTP backend: unequal split of axis 1
+TEST_F(QnnHTPBackendTests, Split_Unequal_Axis1_Opset11) {
+  RunQDQSplitOpTestOnHTP<uint8_t>(TestInputDef<float>({2, 4}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
+                                  {1, 3},  // split
+                                  1,       // axis
+                                  -1,      // num_outputs (not in opset 11)
+                                  11,      // opset
+                                  ExpectedEPNodeAssignment::All);
+}
+
+#endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+}  // namespace test
+}  // namespace onnxruntime
+#endif  // !defined(ORT_MINIMAL_BUILD)

From 482f3caadb033f57f140983d33801f01e628c79f Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Tue, 19 Sep 2023 21:57:39 -0700
Subject: [PATCH 17/22] Add Split opset 18 unit tests

---
 .../qnn/builder/opbuilder/split_op_builder.cc |  1 -
 .../test/providers/qnn/split_op_test.cc       | 60 +++++++++++++++++--
 2 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc
index a11712dd06ad0..6812c223f7c90 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc
@@ -76,7 +76,6 @@ Status SplitOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wr
                      [](int64_t item) { return SafeInt<uint32_t>(item); });
       split_index.pop_back();
     } else {
-      // TODO: Support Split opset 18, which may specify the 'num_outputs' attribute.
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN doesn't support dynamic split");
     }
   } else {
diff --git a/onnxruntime/test/providers/qnn/split_op_test.cc b/onnxruntime/test/providers/qnn/split_op_test.cc
index 8ad84f69430da..6a93bbacabda2 100644
--- a/onnxruntime/test/providers/qnn/split_op_test.cc
+++ b/onnxruntime/test/providers/qnn/split_op_test.cc
@@ -22,7 +22,7 @@ GetTestModelFn BuildSplitTestCase(const TestInputDef<DataType>& input_def,
 
     op_inputs.push_back(MakeTestInput<DataType>(builder, input_def));
 
-    if (split_is_input) {
+    if (split_is_input && !split.empty()) {
       op_inputs.push_back(builder.Make1DInitializer(split));
     }
 
@@ -39,7 +39,7 @@ GetTestModelFn BuildSplitTestCase(const TestInputDef<DataType>& input_def,
 
     Node& split_node = builder.AddNode("Split", op_inputs, split_outputs);
 
-    if (!split_is_input) {
+    if (!split_is_input && !split.empty()) {
       split_node.AddAttribute("split", split);
     }
 
@@ -77,6 +77,38 @@ static void RunSplitOpTestOnCPU(const TestInputDef<DataType>& input_def,
 // CPU tests:
 //
 
+// Test Split opset 18 on CPU backend: equal split of axis 0 via 'num_outputs' attribute
+// and 'split' input.
+TEST_F(QnnCPUBackendTests, Split_Equal_Axis0_Opset18) {
+  // Use 'split' input (initializer).
+  RunSplitOpTestOnCPU<float>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
+                             {2, 2},  // split
+                             0,       // axis
+                             -1,      // num_outputs
+                             18,      // opset
+                             ExpectedEPNodeAssignment::All);
+  RunSplitOpTestOnCPU<int32_t>(TestInputDef<int32_t>({4, 2}, false, {1, 2, 3, 4, 5, 6, 7, 8}),
+                               {2, 2},  // split
+                               0,       // axis
+                               -1,      // num_outputs
+                               18,      // opset
+                               ExpectedEPNodeAssignment::All);
+
+  // Use 'num_outputs' attribute.
+  RunSplitOpTestOnCPU<float>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
+                             {},  // split (use num_outputs instead)
+                             0,   // axis
+                             2,   // num_outputs
+                             18,  // opset
+                             ExpectedEPNodeAssignment::All);
+  RunSplitOpTestOnCPU<int32_t>(TestInputDef<int32_t>({4, 2}, false, {1, 2, 3, 4, 5, 6, 7, 8}),
+                               {},  // split (use num_outputs instead)
+                               0,   // axis
+                               2,   // num_outputs
+                               18,  // opset
+                               ExpectedEPNodeAssignment::All);
+}
+
 // Test Split opset 13 on CPU backend: equal split of axis 0
 TEST_F(QnnCPUBackendTests, Split_Equal_Axis0_Opset13) {
   RunSplitOpTestOnCPU<float>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
@@ -167,7 +199,7 @@ GetTestQDQModelFn<QuantType> BuildQDQSplitTestCase(const TestInputDef<float>& in
     op_inputs.push_back(input_after_qdq);
 
     // Add split input
-    if (split_is_input) {
+    if (split_is_input && !split.empty()) {
       op_inputs.push_back(builder.Make1DInitializer(split));
     }
 
@@ -184,7 +216,7 @@ GetTestQDQModelFn<QuantType> BuildQDQSplitTestCase(const TestInputDef<float>& in
 
     Node& split_node = builder.AddNode("Split", op_inputs, split_outputs);
 
-    if (!split_is_input) {
+    if (!split_is_input && !split.empty()) {
       split_node.AddAttribute("split", split);
     }
 
@@ -265,6 +297,26 @@ TEST_F(QnnHTPBackendTests, Split_Int32_Opset13) {
                                ExpectedEPNodeAssignment::All);
 }
 
+// Test QDQ Split opset 18 on HTP backend: equal split of axis 0 via 'num_outputs' attribute
+// and 'split' input.
+TEST_F(QnnHTPBackendTests, Split_Equal_Axis0_Opset18) {
+  // Use 'split' input (initializer).
+  RunQDQSplitOpTestOnHTP<uint8_t>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
+                                  {2, 2},  // split
+                                  0,       // axis
+                                  -1,      // num_outputs
+                                  18,      // opset
+                                  ExpectedEPNodeAssignment::All);
+
+  // Use 'num_outputs' attribute.
+  RunQDQSplitOpTestOnHTP<uint8_t>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
+                                  {},  // split (use num_outputs instead)
+                                  0,   // axis
+                                  2,   // num_outputs
+                                  18,  // opset
+                                  ExpectedEPNodeAssignment::All);
+}
+
 // Test QDQ Split op on HTP backend: equal split on axis 0 with opset 13.
 TEST_F(QnnHTPBackendTests, Split_Equal_Axis0_Opset13) {
   RunQDQSplitOpTestOnHTP<uint8_t>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),

From d491447e6b147bdd331d64fbe2489d3c99152589 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Tue, 19 Sep 2023 22:03:12 -0700
Subject: [PATCH 18/22] Fix linter warnings

---
 .../core/providers/qnn/builder/opbuilder/clip_op_builder.cc | 6 +++---
 onnxruntime/test/providers/qnn/clip_op_test.cc              | 2 +-
 onnxruntime/test/providers/qnn/gemm_op_test.cc              | 2 +-
 onnxruntime/test/providers/qnn/reshape_op_test.cc           | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
index 1f1b8d6867acd..df4c718949269 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
@@ -1,6 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include <cassert>
+#include <limits>
+
 #include "core/providers/common.h"
 #include "core/providers/shared/utils/utils.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
@@ -9,9 +12,6 @@
 
 #include "base_op_builder.h"
 
-#include <cassert>
-#include <limits>
-
 namespace onnxruntime {
 namespace qnn {
 class ClipOpBuilder : public BaseOpBuilder {
diff --git a/onnxruntime/test/providers/qnn/clip_op_test.cc b/onnxruntime/test/providers/qnn/clip_op_test.cc
index 1d448cbe57030..dc76ccc2b2a3d 100644
--- a/onnxruntime/test/providers/qnn/clip_op_test.cc
+++ b/onnxruntime/test/providers/qnn/clip_op_test.cc
@@ -73,7 +73,7 @@ TEST_F(QnnCPUBackendTests, Clip_5D_f32) {
 //
 
 // Runs a QDQ Clip model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment and that inference
-// running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (when compared to the baseline float32 model).
+// running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (compared to the baseline float32 model).
 template <typename QType>
 static void RunQDQClipTestOnHTP(const TestInputDef<float>& input_def,
                                 const std::vector<TestInputDef<float>>& min_max_defs,
diff --git a/onnxruntime/test/providers/qnn/gemm_op_test.cc b/onnxruntime/test/providers/qnn/gemm_op_test.cc
index 86119afbbb3fa..1e843df7d34c9 100644
--- a/onnxruntime/test/providers/qnn/gemm_op_test.cc
+++ b/onnxruntime/test/providers/qnn/gemm_op_test.cc
@@ -178,7 +178,7 @@ inline GetTestQDQModelFn<InputAQType> BuildQDQGemmTestCase(const std::vector<Tes
 }
 
 // Runs a QDQ Gemm model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment and that inference
-// running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (when compared to the baseline float32 model).
+// running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (compared to the baseline float32 model).
 template <typename InputAQType, typename InputBQType>
 static void RunQDQGemmTestOnHTP(const std::vector<TestInputDef<float>>& input_defs,
                                 const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
diff --git a/onnxruntime/test/providers/qnn/reshape_op_test.cc b/onnxruntime/test/providers/qnn/reshape_op_test.cc
index fcba46a04bcb3..66c01cb950799 100644
--- a/onnxruntime/test/providers/qnn/reshape_op_test.cc
+++ b/onnxruntime/test/providers/qnn/reshape_op_test.cc
@@ -90,7 +90,7 @@ static void RunReshapeTestOnHTP(const TestInputDef<DataType>& input_def,
 }
 
 // Runs a QDQ Reshape model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment and that inference
-// running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (when compared to the baseline float32 model).
+// running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (compared to the baseline float32 model).
 template <typename QType>
 static void RunQDQReshapeTestOnHTP(const TestInputDef<float>& input_def,
                                    const TestInputDef<int64_t>& shape_def,

From 1b66cfaf7f26e1bbf575a5b26b4b074c0f422482 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Tue, 19 Sep 2023 23:04:26 -0700
Subject: [PATCH 19/22] Add QNN EP unit tests for Tile operator

---
 .../selectors_actions/shared/utils.cc         |   5 +-
 .../test/providers/qnn/reshape_op_test.cc     | 116 +++++++--------
 .../test/providers/qnn/tile_op_test.cc        | 132 ++++++++++++++++++
 3 files changed, 193 insertions(+), 60 deletions(-)
 create mode 100644 onnxruntime/test/providers/qnn/tile_op_test.cc

diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
index f951e41552cf0..293c885858179 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
@@ -36,8 +36,9 @@ static const OpVersionsAndSelector::OpVersionsMap GetMiscOpVersionsMap() {
           {"Resize", {}},
           {"Split", {}},
           {"Squeeze", {}},
-          {"Unsqueeze", {}}};
-}
+          {"Unsqueeze", {}},
+          {"Tile", {}}};
+  }
 
 static const OpVersionsAndSelector::OpVersionsMap GetDropDQOpVersionsMap() {
   return {{"ArgMax", {}},
diff --git a/onnxruntime/test/providers/qnn/reshape_op_test.cc b/onnxruntime/test/providers/qnn/reshape_op_test.cc
index 66c01cb950799..e117f134015c0 100644
--- a/onnxruntime/test/providers/qnn/reshape_op_test.cc
+++ b/onnxruntime/test/providers/qnn/reshape_op_test.cc
@@ -14,6 +14,64 @@
 namespace onnxruntime {
 namespace test {
 
+// Runs a model with a Reshape operator on the QNN CPU backend. Checks the graph node assignment
+// and that inference outputs for QNN EP and CPU EP match.
+template <typename DataType>
+static void RunReshapeTestOnCPU(const TestInputDef<DataType>& input_def,
+                                const TestInputDef<int64_t>& shape_def,
+                                const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                ExpectedEPNodeAssignment expected_ep_assignment,
+                                int opset = 19) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnCpu.dll";
+#else
+  provider_options["backend_path"] = "libQnnCpu.so";
+#endif
+
+  RunQnnModelTest(BuildOpTestCase<DataType, int64_t>("Reshape", {input_def}, {shape_def}, attrs),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+//
+// CPU tests:
+//
+
+// Test that Reshape with a dynamic shape input is not supported by QNN EP.
+TEST_F(QnnCPUBackendTests, Reshape_DynamicShape_Unsupported) {
+  RunReshapeTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                      TestInputDef<int64_t>({2}, false /* is_initializer */, {1, 48}),
+                      {},                              // Attributes
+                      ExpectedEPNodeAssignment::None,  // Should not be assigned to QNN EP.
+                      19);                             // Opset
+}
+
+// Test that Reshape with an enabled 'allowzero' attribute is not supported by QNN EP.
+TEST_F(QnnCPUBackendTests, Reshape_AllowZeroAttr_Unsupported) {
+  RunReshapeTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                      TestInputDef<int64_t>({2}, true, {1, 48}),
+                      {utils::MakeAttribute("allowzero", static_cast<int64_t>(1))},
+                      ExpectedEPNodeAssignment::None,  // Should not be assigned to QNN EP.
+                      19);                             // Opset
+}
+
+// Test Reshape of rank 4 -> rank 2.
+TEST_F(QnnCPUBackendTests, Reshape_4D_f32) {
+  RunReshapeTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                      TestInputDef<int64_t>({2}, true, {1, 48}),
+                      {},  // Attributes
+                      ExpectedEPNodeAssignment::All,
+                      19);  // Opset
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+//
+// HTP tests:
+//
+
 // Returns a function that creates a graph with a QDQ Reshape operator.
 template <typename QuantType>
 GetTestQDQModelFn<QuantType> BuildQDQReshapeTestCase(const TestInputDef<float>& input_def,
@@ -45,28 +103,6 @@ GetTestQDQModelFn<QuantType> BuildQDQReshapeTestCase(const TestInputDef<float>&
   };
 }
 
-// Runs a model with a Reshape operator on the QNN CPU backend. Checks the graph node assignment
-// and that inference outputs for QNN EP and CPU EP match.
-template <typename DataType>
-static void RunReshapeTestOnCPU(const TestInputDef<DataType>& input_def,
-                                const TestInputDef<int64_t>& shape_def,
-                                const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
-                                ExpectedEPNodeAssignment expected_ep_assignment,
-                                int opset = 19) {
-  ProviderOptions provider_options;
-
-#if defined(_WIN32)
-  provider_options["backend_path"] = "QnnCpu.dll";
-#else
-  provider_options["backend_path"] = "libQnnCpu.so";
-#endif
-
-  RunQnnModelTest(BuildOpTestCase<DataType, int64_t>("Reshape", {input_def}, {shape_def}, attrs),
-                  provider_options,
-                  opset,
-                  expected_ep_assignment);
-}
-
 // Runs a model with a non-QDQ Reshape operator on the QNN HTP backend. Checks the graph node assignment
 // and that inference outputs for QNN EP and CPU EP match.
 template <typename DataType>
@@ -114,42 +150,6 @@ static void RunQDQReshapeTestOnHTP(const TestInputDef<float>& input_def,
                        expected_ep_assignment);
 }
 
-//
-// CPU tests:
-//
-
-// Test that Reshape with a dynamic shape input is not supported by QNN EP.
-TEST_F(QnnCPUBackendTests, Reshape_DynamicShape_Unsupported) {
-  RunReshapeTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
-                      TestInputDef<int64_t>({2}, false /* is_initializer */, {1, 48}),
-                      {},                              // Attributes
-                      ExpectedEPNodeAssignment::None,  // Should not be assigned to QNN EP.
-                      19);                             // Opset
-}
-
-// Test that Reshape with an enabled 'allowzero' attribute is not supported by QNN EP.
-TEST_F(QnnCPUBackendTests, Reshape_AllowZeroAttr_Unsupported) {
-  RunReshapeTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
-                      TestInputDef<int64_t>({2}, true, {1, 48}),
-                      {utils::MakeAttribute("allowzero", static_cast<int64_t>(1))},
-                      ExpectedEPNodeAssignment::None,  // Should not be assigned to QNN EP.
-                      19);                             // Opset
-}
-
-// Test Reshape of rank 4 -> rank 2.
-TEST_F(QnnCPUBackendTests, Reshape_4D_f32) {
-  RunReshapeTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
-                      TestInputDef<int64_t>({2}, true, {1, 48}),
-                      {},  // Attributes
-                      ExpectedEPNodeAssignment::All,
-                      19);  // Opset
-}
-
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
-//
-// HTP tests:
-//
-
 // Test that QDQ Reshape with a dynamic shape input is not supported by QNN EP.
 TEST_F(QnnHTPBackendTests, Reshape_DynamicShape_Unsupported) {
   RunQDQReshapeTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
diff --git a/onnxruntime/test/providers/qnn/tile_op_test.cc b/onnxruntime/test/providers/qnn/tile_op_test.cc
new file mode 100644
index 0000000000000..2b35c730ee5fe
--- /dev/null
+++ b/onnxruntime/test/providers/qnn/tile_op_test.cc
@@ -0,0 +1,132 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#if !defined(ORT_MINIMAL_BUILD)
+
+#include <string>
+
+#include "test/providers/qnn/qnn_test_utils.h"
+#include "core/graph/node_attr_utils.h"
+
+#include "onnx/onnx_pb.h"
+#include "gtest/gtest.h"
+
+namespace onnxruntime {
+namespace test {
+
+// Runs a model with a Tile operator on the QNN CPU backend. Checks the graph node assignment
+// and that inference outputs for QNN EP and CPU EP match.
+template <typename DataType>
+static void RunTileTestOnCPU(const TestInputDef<DataType>& input_def,
+                             const TestInputDef<int64_t>& repeats_def,
+                             ExpectedEPNodeAssignment expected_ep_assignment,
+                             int opset = 13) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnCpu.dll";
+#else
+  provider_options["backend_path"] = "libQnnCpu.so";
+#endif
+
+  RunQnnModelTest(BuildOpTestCase<DataType, int64_t>("Tile", {input_def}, {repeats_def}, {}),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+// Test that Tile with a dynamic repeats input is not supported by QNN EP.
+TEST_F(QnnCPUBackendTests, Tile_DynamicRepeats_Unsupported) {
+  RunTileTestOnCPU(TestInputDef<float>({2, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f}),
+                   TestInputDef<int64_t>({2}, false /* is_initializer */, {1, 2}),
+                   ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
+}
+
+// Test Tile with a rank 4 float input.
+TEST_F(QnnCPUBackendTests, Tile_F32_Rank4) {
+  std::vector<float> input_data = {-4.0f, -3.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f};
+  RunTileTestOnCPU(TestInputDef<float>({1, 2, 2, 2}, false, input_data),
+                   TestInputDef<int64_t>({4}, true /* is_initializer */, {1, 2, 1, 1}),
+                   ExpectedEPNodeAssignment::All);
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+//
+// HTP tests:
+//
+
+// Returns a function that creates a graph with a QDQ Tile operator.
+template <typename QuantType>
+GetTestQDQModelFn<QuantType> BuildQDQTileTestCase(const TestInputDef<float>& input_def,
+                                                  const TestInputDef<int64_t>& repeats_def,
+                                                  bool use_contrib_qdq = false) {
+  return [input_def, repeats_def, use_contrib_qdq](ModelTestBuilder& builder,
+                                                   std::vector<QuantParams<QuantType>>& output_qparams) {
+    // input -> Q -> DQ ->
+    NodeArg* input = MakeTestInput(builder, input_def);
+    QuantParams<QuantType> input_qparams = GetTestInputQuantParams<QuantType>(input_def);
+    NodeArg* input_qdq = AddQDQNodePair<QuantType>(builder, input, input_qparams.scale, input_qparams.zero_point,
+                                                   use_contrib_qdq);
+
+    // repeats input
+    NodeArg* repeats_input = MakeTestInput(builder, repeats_def);
+
+    // Tile op
+    NodeArg* tile_output = builder.MakeIntermediate();
+    builder.AddNode("Tile", {input_qdq, repeats_input}, {tile_output});
+
+    // op_output -> Q -> DQ -> output
+    // NOTE: Input and output quantization parameters must be equal for Tile.
+    output_qparams[0] = input_qparams;  // Overwrite!
+    AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, tile_output, input_qparams.scale,
+                                                     input_qparams.zero_point, use_contrib_qdq);
+  };
+}
+
+// Runs a QDQ Tile model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment and that inference
+// running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (compared to the baseline float32 model).
+template <typename QType>
+static void RunQDQTileTestOnHTP(const TestInputDef<float>& input_def,
+                                const TestInputDef<int64_t>& repeats_def,
+                                ExpectedEPNodeAssignment expected_ep_assignment,
+                                int opset = 13,
+                                bool use_contrib_qdq = false) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  auto f32_model_builder = BuildOpTestCase<float, int64_t>("Tile", {input_def}, {repeats_def}, {});
+  auto qdq_model_builder = BuildQDQTileTestCase<QType>(input_def, repeats_def, use_contrib_qdq);
+  TestQDQModelAccuracy(f32_model_builder,
+                       qdq_model_builder,
+                       provider_options,
+                       opset,
+                       expected_ep_assignment);
+}
+
+// Test 8-bit QDQ Tile with rank 4 input.
+TEST_F(QnnHTPBackendTests, Tile_U8_Rank4) {
+  std::vector<float> input_data = {-4.0f, -3.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f};
+  RunQDQTileTestOnHTP<uint8_t>(TestInputDef<float>({1, 2, 2, 2}, false, input_data),
+                               TestInputDef<int64_t>({4}, true /* is_initializer */, {1, 2, 1, 1}),
+                               ExpectedEPNodeAssignment::All);
+}
+
+// Test 16-bit QDQ Tile with rank 4 input.
+TEST_F(QnnHTPBackendTests, Tile_U16_Rank4) {
+  std::vector<float> input_data = {-4.0f, -3.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f};
+  RunQDQTileTestOnHTP<uint16_t>(TestInputDef<float>({1, 2, 2, 2}, false, input_data),
+                                TestInputDef<int64_t>({4}, true /* is_initializer */, {1, 2, 1, 1}),
+                                ExpectedEPNodeAssignment::All,
+                                13,     // opset
+                                true);  // Use com.microsoft Q/DQ ops
+}
+
+#endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+}  // namespace test
+}  // namespace onnxruntime
+#endif  // !defined(ORT_MINIMAL_BUILD)

From 7e8f7cea637347b775b803da05e75f3c669f4bf1 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Wed, 20 Sep 2023 00:26:33 -0700
Subject: [PATCH 20/22] Add support for 16-bit QDQ Clip. Add more 16-bit QDQ
 tests.

---
 .../qdq_transformer/clip_quantizelinear.cc    |  25 ++-
 .../test/optimizer/qdq_transformer_test.cc    |  50 ++---
 .../test/providers/qnn/clip_op_test.cc        |  34 +++-
 .../test/providers/qnn/flatten_op_test.cc     |  94 +++++----
 .../test/providers/qnn/gemm_op_test.cc        | 101 ++++++++--
 .../test/providers/qnn/reshape_op_test.cc     |  32 +++-
 .../test/providers/qnn/split_op_test.cc       |  30 ++-
 .../qnn/squeeze_unsqueeze_op_test.cc          | 181 ++++++++++--------
 8 files changed, 375 insertions(+), 172 deletions(-)

diff --git a/onnxruntime/core/optimizer/qdq_transformer/clip_quantizelinear.cc b/onnxruntime/core/optimizer/qdq_transformer/clip_quantizelinear.cc
index a0942c31b0161..50653b368857d 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/clip_quantizelinear.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/clip_quantizelinear.cc
@@ -1,8 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/optimizer/initializer.h"
 #include "core/optimizer/qdq_transformer/clip_quantizelinear.h"
+
+#include <limits>
+
+#include "core/optimizer/initializer.h"
 #include "core/optimizer/qdq_transformer/qdq_util.h"
 #include "core/optimizer/utils.h"
 #include "core/graph/graph_utils.h"
@@ -50,14 +53,26 @@ static bool GetQConstantLowerUpper(const Graph& graph, const Node& node, float&
   switch (zp_initializer.data_type()) {
     case ONNX_NAMESPACE::TensorProto_DataType_INT8: {
       const int8_t zero_point = zp_initializer.data<int8_t>()[0];
-      lower = scale * (-128 - zero_point);
-      upper = scale * (127 - zero_point);
+      lower = scale * (std::numeric_limits<int8_t>::lowest() - zero_point);
+      upper = scale * (std::numeric_limits<int8_t>::max() - zero_point);
       break;
     }
     case ONNX_NAMESPACE::TensorProto_DataType_UINT8: {
       const uint8_t zero_point = zp_initializer.data<uint8_t>()[0];
-      lower = scale * (0 - zero_point);
-      upper = scale * (255 - zero_point);
+      lower = scale * (std::numeric_limits<uint8_t>::lowest() - zero_point);
+      upper = scale * (std::numeric_limits<uint8_t>::max() - zero_point);
+      break;
+    }
+    case ONNX_NAMESPACE::TensorProto_DataType_INT16: {
+      const int16_t zero_point = zp_initializer.data<int16_t>()[0];
+      lower = scale * (std::numeric_limits<int16_t>::lowest() - zero_point);
+      upper = scale * (std::numeric_limits<int16_t>::max() - zero_point);
+      break;
+    }
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT16: {
+      const uint16_t zero_point = zp_initializer.data<uint16_t>()[0];
+      lower = scale * (std::numeric_limits<uint16_t>::lowest() - zero_point);
+      upper = scale * (std::numeric_limits<uint16_t>::max() - zero_point);
       break;
     }
     default:
diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc
index a438a61cb9b36..2cca44e4d834b 100644
--- a/onnxruntime/test/optimizer/qdq_transformer_test.cc
+++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc
@@ -2501,28 +2501,34 @@ TEST(QDQTransformerTests, Clip) {
   for (auto opset : opsets) {
     test_case(.0235294122248888f, static_cast<int8_t>(-128), 0, opset);        // [0, 6]
     test_case(.0235294122248888f, static_cast<int8_t>(-128), 0, opset, true);  // [0, 6] contrib qdq
-    test_case(.02f, static_cast<int8_t>(-128), 0, opset);                      // [0, 5.1]
-    test_case(.02f, static_cast<int8_t>(-128), 0, opset, true);                // [0, 5.1] contrib qdq
-    test_case(.03f, static_cast<int8_t>(-128), 1, opset);                      // [0, 7.65]
-    test_case(.03f, static_cast<int8_t>(-128), 1, opset, true);                // [0, 7.65] contrib qdq
-    test_case(.02f, static_cast<int8_t>(127), 1, opset);                       // [-5.1 , 0]
-    test_case(.02f, static_cast<int8_t>(127), 1, opset, true);                 // [-5.1 , 0] contrib qdq
-    test_case(.02f, static_cast<int8_t>(0), 1, opset);                         // [-2.56, 2.54]
-    test_case(.02f, static_cast<int8_t>(0), 1, opset, true);                   // [-2.56, 2.54] contrib qdq
-    test_case(.04f, static_cast<int8_t>(-97), 1, opset);                       // [-1.24, 8.96]
-    test_case(.04f, static_cast<int8_t>(-97), 1, opset, true);                 // [-1.24, 8.96] contrib qdq
-    test_case(.02352941176f, static_cast<uint8_t>(0), 0, opset);               // [0, 6]
-    test_case(.02352941176f, static_cast<uint8_t>(0), 0, opset, true);         // [0, 6] contrib qdq
-    test_case(.02f, static_cast<uint8_t>(0), 0, opset);                        // [0, 5.1]
-    test_case(.02f, static_cast<uint8_t>(0), 0, opset, true);                  // [0, 5.1] contrib qdq
-    test_case(.03f, static_cast<uint8_t>(0), 1, opset);                        // [0, 7.65]
-    test_case(.03f, static_cast<uint8_t>(0), 1, opset, true);                  // [0, 7.65] contrib qdq
-    test_case(.02f, static_cast<uint8_t>(255), 1, opset);                      // [-5.1, 0]
-    test_case(.02f, static_cast<uint8_t>(255), 1, opset, true);                // [-5.1, 0] contrib qdq
-    test_case(.02f, static_cast<uint8_t>(128), 1, opset);                      // [-2.56, 2.54]
-    test_case(.02f, static_cast<uint8_t>(128), 1, opset, true);                // [-2.56, 2.54] contrib qdq
-    test_case(.04f, static_cast<uint8_t>(31), 1, opset);                       // [-1.24, 8.96]
-    test_case(.04f, static_cast<uint8_t>(31), 1, opset, true);                 // [-1.24, 8.96] contrib qdq
+    test_case(9.15541313801785e-5f, static_cast<int16_t>(std::numeric_limits<int16_t>::min()), 0,
+              opset, true);  // [0, 6] contrib 16-bit qdq
+    test_case(0.0009f, static_cast<int16_t>(std::numeric_limits<int16_t>::min()), 1,
+              opset, true);                                             // [0, 58.98] contrib 16-bit qdq
+    test_case(.02f, static_cast<int8_t>(-128), 0, opset);               // [0, 5.1]
+    test_case(.02f, static_cast<int8_t>(-128), 0, opset, true);         // [0, 5.1] contrib qdq
+    test_case(.03f, static_cast<int8_t>(-128), 1, opset);               // [0, 7.65]
+    test_case(.03f, static_cast<int8_t>(-128), 1, opset, true);         // [0, 7.65] contrib qdq
+    test_case(.02f, static_cast<int8_t>(127), 1, opset);                // [-5.1 , 0]
+    test_case(.02f, static_cast<int8_t>(127), 1, opset, true);          // [-5.1 , 0] contrib qdq
+    test_case(.02f, static_cast<int8_t>(0), 1, opset);                  // [-2.56, 2.54]
+    test_case(.02f, static_cast<int8_t>(0), 1, opset, true);            // [-2.56, 2.54] contrib qdq
+    test_case(.04f, static_cast<int8_t>(-97), 1, opset);                // [-1.24, 8.96]
+    test_case(.04f, static_cast<int8_t>(-97), 1, opset, true);          // [-1.24, 8.96] contrib qdq
+    test_case(.02352941176f, static_cast<uint8_t>(0), 0, opset);        // [0, 6]
+    test_case(.02352941176f, static_cast<uint8_t>(0), 0, opset, true);  // [0, 6] contrib qdq
+    test_case(9.15541313801785e-5f, static_cast<uint16_t>(0), 0, opset, true);  // [0, 6] contrib 16-bit qdq
+    test_case(0.0009f, static_cast<uint16_t>(0), 1, opset, true);  // [0, 58.98] contrib 16-bit qdq
+    test_case(.02f, static_cast<uint8_t>(0), 0, opset);                 // [0, 5.1]
+    test_case(.02f, static_cast<uint8_t>(0), 0, opset, true);           // [0, 5.1] contrib qdq
+    test_case(.03f, static_cast<uint8_t>(0), 1, opset);                 // [0, 7.65]
+    test_case(.03f, static_cast<uint8_t>(0), 1, opset, true);           // [0, 7.65] contrib qdq
+    test_case(.02f, static_cast<uint8_t>(255), 1, opset);               // [-5.1, 0]
+    test_case(.02f, static_cast<uint8_t>(255), 1, opset, true);         // [-5.1, 0] contrib qdq
+    test_case(.02f, static_cast<uint8_t>(128), 1, opset);               // [-2.56, 2.54]
+    test_case(.02f, static_cast<uint8_t>(128), 1, opset, true);         // [-2.56, 2.54] contrib qdq
+    test_case(.04f, static_cast<uint8_t>(31), 1, opset);                // [-1.24, 8.96]
+    test_case(.04f, static_cast<uint8_t>(31), 1, opset, true);          // [-1.24, 8.96] contrib qdq
   }
 
   // opset_version = 10
diff --git a/onnxruntime/test/providers/qnn/clip_op_test.cc b/onnxruntime/test/providers/qnn/clip_op_test.cc
index dc76ccc2b2a3d..15ba3b5de2fa1 100644
--- a/onnxruntime/test/providers/qnn/clip_op_test.cc
+++ b/onnxruntime/test/providers/qnn/clip_op_test.cc
@@ -78,7 +78,8 @@ template <typename QType>
 static void RunQDQClipTestOnHTP(const TestInputDef<float>& input_def,
                                 const std::vector<TestInputDef<float>>& min_max_defs,
                                 ExpectedEPNodeAssignment expected_ep_assignment,
-                                int opset = 13) {
+                                int opset = 13,
+                                bool use_contrib_qdq = false) {
   ProviderOptions provider_options;
 
 #if defined(_WIN32)
@@ -88,7 +89,8 @@ static void RunQDQClipTestOnHTP(const TestInputDef<float>& input_def,
 #endif
 
   auto f32_model_builder = BuildOpTestCase<float, float>("Clip", {input_def}, {min_max_defs}, {});
-  auto qdq_model_builder = BuildQDQOpTestCase<QType, float>("Clip", {input_def}, {min_max_defs}, {});
+  auto qdq_model_builder = BuildQDQOpTestCase<QType, float>("Clip", {input_def}, {min_max_defs}, {},
+                                                            kOnnxDomain, use_contrib_qdq);
 
   TestQDQModelAccuracy(f32_model_builder,
                        qdq_model_builder,
@@ -97,7 +99,7 @@ static void RunQDQClipTestOnHTP(const TestInputDef<float>& input_def,
                        expected_ep_assignment);
 }
 
-// Test QDQ Clip with default min/max.
+// Test 8-bit QDQ Clip with default min/max.
 // NOTE: The Clip operator is *optimized* away during L1 optimizations, so QNN EP does not get a graph with a Clip op.
 // Instead, QNN EP will get a graph with a Q -> DQ.
 // - Original sequence: Q1 -> DQ1 -> Clip -> Q2 -> DQ2
@@ -109,7 +111,21 @@ TEST_F(QnnHTPBackendTests, Clip_U8_DefaultMinMax_Rank4) {
                                ExpectedEPNodeAssignment::All);
 }
 
-// Test QDQ Clip with non-default min and max inputs. QNN EP will get a graph with a Clip operator.
+// Test 16-bit (uint16) QDQ Clip with default min/max, using com.microsoft Q/DQ ops.
+// NOTE: The Clip operator is *optimized* away during L1 optimizations, so QNN EP does not get a graph with a Clip op.
+// Instead, QNN EP will get a graph with only a Q -> DQ sequence.
+// - Original sequence: Q1 -> DQ1 -> Clip -> Q2 -> DQ2
+// - ClipQuantFusion: Fuses Clip -> QuantizeLinear resulting in Q1 -> DQ1 -> Q2' -> DQ2
+// - DoubleQDQPairsRemover: Simplifies remaining Q1 -> DQ1 -> Q2' -> DQ2 sequence to Q1 -> DQ2.
+TEST_F(QnnHTPBackendTests, Clip_U16_DefaultMinMax_Rank4) {
+  RunQDQClipTestOnHTP<uint16_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                                {},  // Don't specify min/max inputs.
+                                ExpectedEPNodeAssignment::All,
+                                13,     // opset
+                                true);  // Use com.microsoft Q/DQ ops
+}
+
+// Test 8-bit QDQ Clip with non-default min and max inputs. QNN EP will get a graph with a Clip operator.
 TEST_F(QnnHTPBackendTests, Clip_U8_Rank4) {
   RunQDQClipTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
                                {TestInputDef<float>({}, true, {-5.0f}),
@@ -117,6 +133,16 @@ TEST_F(QnnHTPBackendTests, Clip_U8_Rank4) {
                                ExpectedEPNodeAssignment::All);
 }
 
+// Test 16-bit QDQ Clip with non-default min and max inputs. QNN EP will get a graph with a Clip operator.
+TEST_F(QnnHTPBackendTests, Clip_U16_Rank4) {
+  RunQDQClipTestOnHTP<uint16_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                                {TestInputDef<float>({}, true, {-5.0f}),  // min: scalar initializer
+                                 TestInputDef<float>({}, true, {5.0f})},  // max: scalar initializer
+                                ExpectedEPNodeAssignment::All,
+                                13,     // opset
+                                true);  // Use com.microsoft Q/DQ ops
+}
+
 // Test QDQ Clip of rank 5.
 TEST_F(QnnHTPBackendTests, Clip_U8_Rank5) {
   // We can't use the usual model-building functions because they add standalone Quantize and Dequantize nodes
diff --git a/onnxruntime/test/providers/qnn/flatten_op_test.cc b/onnxruntime/test/providers/qnn/flatten_op_test.cc
index af536b731ad09..637d3257ddea7 100644
--- a/onnxruntime/test/providers/qnn/flatten_op_test.cc
+++ b/onnxruntime/test/providers/qnn/flatten_op_test.cc
@@ -35,6 +35,36 @@ static void RunFlattenTestOnCPU(const TestInputDef<DataType>& input_def,
                   expected_ep_assignment);
 }
 
+//
+// CPU tests:
+//
+
+// Test float32 Flatten of a rank 4 input with axis == 0 on the QNN CPU backend.
+TEST_F(QnnCPUBackendTests, Flatten_Rank4_Axis0) {
+  RunFlattenTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                      {utils::MakeAttribute("axis", static_cast<int64_t>(0))},
+                      ExpectedEPNodeAssignment::All);
+}
+
+// Test float32 Flatten of a rank 4 input with axis == -1 on the QNN CPU backend.
+TEST_F(QnnCPUBackendTests, Flatten_Rank4_AxisNeg1) {
+  RunFlattenTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                      {utils::MakeAttribute("axis", static_cast<int64_t>(-1))},
+                      ExpectedEPNodeAssignment::All);
+}
+
+// Test float32 Flatten of a rank 5 input with axis == 2 on the QNN CPU backend.
+TEST_F(QnnCPUBackendTests, Flatten_Rank5_Axis2) {
+  RunFlattenTestOnCPU(TestInputDef<float>({1, 2, 3, 4, 4}, false, -10.0f, 10.0f),
+                      {utils::MakeAttribute("axis", static_cast<int64_t>(2))},
+                      ExpectedEPNodeAssignment::All);
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+//
+// HTP tests:
+//
+
 // Runs a model with a non-QDQ Flatten operator on the QNN HTP backend. Checks the graph node assignment
 // and that inference outputs for QNN EP and CPU EP match.
 template <typename DataType>
@@ -62,7 +92,8 @@ template <typename QType>
 static void RunQDQFlattenTestOnHTP(const TestInputDef<float>& input_def,
                                    const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
                                    ExpectedEPNodeAssignment expected_ep_assignment,
-                                   int opset = 13) {
+                                   int opset = 13,
+                                   bool use_contrib_qdq = false) {
   ProviderOptions provider_options;
 
 #if defined(_WIN32)
@@ -71,57 +102,48 @@ static void RunQDQFlattenTestOnHTP(const TestInputDef<float>& input_def,
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
 
-  TestQDQModelAccuracy(BuildOpTestCase<float>("Flatten", {input_def}, {}, attrs),     // baseline float32 model
-                       BuildQDQOpTestCase<QType>("Flatten", {input_def}, {}, attrs),  // QDQ model
+  auto f32_model_builder = BuildOpTestCase<float>("Flatten", {input_def}, {}, attrs);
+  auto qdq_model_builder = BuildQDQOpTestCase<QType>("Flatten", {input_def}, {}, attrs, kOnnxDomain, use_contrib_qdq);
+  TestQDQModelAccuracy(f32_model_builder,
+                       qdq_model_builder,
                        provider_options,
                        opset,
                        expected_ep_assignment);
 }
 
-//
-// CPU tests:
-//
-
-// Test that Flatten input (rank4) with axis == 0.
-TEST_F(QnnCPUBackendTests, Flatten_Rank4_Axis0) {
-  RunFlattenTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
-                      {utils::MakeAttribute("axis", static_cast<int64_t>(0))},
-                      ExpectedEPNodeAssignment::All);
-}
-
-// Test that Flatten input (rank4) with axis == -1.
-TEST_F(QnnCPUBackendTests, Flatten_Rank4_AxisNeg1) {
-  RunFlattenTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
-                      {utils::MakeAttribute("axis", static_cast<int64_t>(-1))},
-                      ExpectedEPNodeAssignment::All);
-}
-
-// Test that Flatten input (rank5) with axis == 2.
-TEST_F(QnnCPUBackendTests, Flatten_Rank5_Axis2) {
-  RunFlattenTestOnCPU(TestInputDef<float>({1, 2, 3, 4, 4}, false, -10.0f, 10.0f),
-                      {utils::MakeAttribute("axis", static_cast<int64_t>(2))},
-                      ExpectedEPNodeAssignment::All);
-}
-
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
-//
-// HTP tests:
-//
-// Test that Flatten input (rank4) with axis == 0.
+// Test 8-bit QDQ Flatten input (rank4) with axis == 0.
 TEST_F(QnnHTPBackendTests, Flatten_Rank4_Axis0) {
   RunQDQFlattenTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
                                   {utils::MakeAttribute("axis", static_cast<int64_t>(0))},
                                   ExpectedEPNodeAssignment::All);
 }
 
-// Test that Flatten input (rank4) with axis == -1.
+// Test 16-bit QDQ Flatten of a rank 4 input with axis == 0.
+TEST_F(QnnHTPBackendTests, Flatten_Rank4_Axis0_U16) {
+  RunQDQFlattenTestOnHTP<uint16_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                                   {utils::MakeAttribute("axis", static_cast<int64_t>(0))},
+                                   ExpectedEPNodeAssignment::All,
+                                   13,     // opset
+                                   true);  // Use com.microsoft Q/DQ ops
+}
+
+// Test 8-bit QDQ Flatten input (rank4) with axis == -1.
 TEST_F(QnnHTPBackendTests, Flatten_Rank4_AxisNeg1) {
   RunQDQFlattenTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
                                   {utils::MakeAttribute("axis", static_cast<int64_t>(-1))},
                                   ExpectedEPNodeAssignment::All);
 }
 
-// Test QDQ Flatten with an input of rank5.
+// Test 16-bit QDQ Flatten of a rank 4 input with axis == -1.
+TEST_F(QnnHTPBackendTests, Flatten_Rank4_AxisNeg1_U16) {
+  RunQDQFlattenTestOnHTP<uint16_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+                                   {utils::MakeAttribute("axis", static_cast<int64_t>(-1))},
+                                   ExpectedEPNodeAssignment::All,
+                                   13,     // opset
+                                   true);  // Use com.microsoft Q/DQ ops
+}
+
+// Test 8-bit QDQ Flatten with an input of rank5.
 TEST_F(QnnHTPBackendTests, Flatten_QDQ8bit_Rank5) {
   // We can't use the usual model-building functions because they add standalone Quantize and Dequantize nodes
   // at the input and output. These Q/DQ ops get lowered to QNN's Quantize and Dequantize operators, which DO NOT
@@ -157,7 +179,7 @@ TEST_F(QnnHTPBackendTests, Flatten_QDQ8bit_Rank5) {
                   ExpectedEPNodeAssignment::All);
 }
 
-// Test that rank4 int32 Flatten runs on HTP backend.
+// Test that int32 non-QDQ Flatten runs on HTP backend.
 TEST_F(QnnHTPBackendTests, Flatten_Int32_Rank4_Axis2) {
   std::vector<int32_t> input_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
   RunFlattenTestOnHTP<int32_t>(TestInputDef<int32_t>({1, 3, 2, 2}, false, input_data),
diff --git a/onnxruntime/test/providers/qnn/gemm_op_test.cc b/onnxruntime/test/providers/qnn/gemm_op_test.cc
index 1e843df7d34c9..15f26717b06fd 100644
--- a/onnxruntime/test/providers/qnn/gemm_op_test.cc
+++ b/onnxruntime/test/providers/qnn/gemm_op_test.cc
@@ -134,9 +134,10 @@ TEST_F(QnnCPUBackendTests, Gemm_TransAB_Dynamic_B_And_Bias) {
 // Returns a function that builds a model with a QDQ Gemm node.
 template <typename InputAQType, typename InputBQType>
 inline GetTestQDQModelFn<InputAQType> BuildQDQGemmTestCase(const std::vector<TestInputDef<float>>& input_defs,
-                                                           const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
-  return [input_defs, attrs](ModelTestBuilder& builder,
-                             std::vector<QuantParams<InputAQType>>& output_qparams) {
+                                                           const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                                           bool use_contrib_qdq = false) {
+  return [input_defs, attrs, use_contrib_qdq](ModelTestBuilder& builder,
+                                              std::vector<QuantParams<InputAQType>>& output_qparams) {
     const size_t num_inputs = input_defs.size();
     assert(num_inputs == 2 || num_inputs == 3);
 
@@ -147,19 +148,20 @@ inline GetTestQDQModelFn<InputAQType> BuildQDQGemmTestCase(const std::vector<Tes
     NodeArg* input0 = MakeTestInput<float>(builder, input_defs[0]);
     QuantParams<InputAQType> input0_qparams = GetTestInputQuantParams<InputAQType>(input_defs[0]);
     NodeArg* input0_after_qdq = AddQDQNodePair<InputAQType>(builder, input0, input0_qparams.scale,
-                                                            input0_qparams.zero_point);
+                                                            input0_qparams.zero_point, use_contrib_qdq);
     op_inputs.push_back(input0_after_qdq);
 
     // Process input 1
     NodeArg* input1 = MakeTestInput<float>(builder, input_defs[1]);
     QuantParams<InputBQType> input1_qparams = GetTestInputQuantParams<InputBQType>(input_defs[1]);
     NodeArg* input1_after_qdq = AddQDQNodePair<InputBQType>(builder, input1, input1_qparams.scale,
-                                                            input1_qparams.zero_point);
+                                                            input1_qparams.zero_point, use_contrib_qdq);
     op_inputs.push_back(input1_after_qdq);
 
     // Process bias
     if (num_inputs == 3) {
-      NodeArg* bias_input = MakeTestQDQBiasInput(builder, input_defs[2], input0_qparams.scale * input1_qparams.scale);
+      NodeArg* bias_input = MakeTestQDQBiasInput(builder, input_defs[2], input0_qparams.scale * input1_qparams.scale,
+                                                 use_contrib_qdq);
       op_inputs.push_back(bias_input);
     }
 
@@ -173,7 +175,7 @@ inline GetTestQDQModelFn<InputAQType> BuildQDQGemmTestCase(const std::vector<Tes
 
     // op_output -> Q -> DQ -> output
     AddQDQNodePairWithOutputAsGraphOutput<InputAQType>(builder, gemm_output, output_qparams[0].scale,
-                                                       output_qparams[0].zero_point);
+                                                       output_qparams[0].zero_point, use_contrib_qdq);
   };
 }
 
@@ -183,7 +185,9 @@ template <typename InputAQType, typename InputBQType>
 static void RunQDQGemmTestOnHTP(const std::vector<TestInputDef<float>>& input_defs,
                                 const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
                                 ExpectedEPNodeAssignment expected_ep_assignment,
-                                int opset = 13) {
+                                int opset = 13,
+                                float f32_abs_err = 1e-4f,
+                                bool use_contrib_qdq = false) {
   ProviderOptions provider_options;
 
 #if defined(_WIN32)
@@ -191,16 +195,18 @@ static void RunQDQGemmTestOnHTP(const std::vector<TestInputDef<float>>& input_de
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
-
-  TestQDQModelAccuracy<InputAQType>(BuildOpTestCase<float>("Gemm", input_defs, {}, attrs),
-                                    BuildQDQGemmTestCase<InputAQType, InputBQType>(input_defs, attrs),
+  auto f32_model_builder = BuildOpTestCase<float>("Gemm", input_defs, {}, attrs);
+  auto qdq_model_builder = BuildQDQGemmTestCase<InputAQType, InputBQType>(input_defs, attrs, use_contrib_qdq);
+  TestQDQModelAccuracy<InputAQType>(f32_model_builder,
+                                    qdq_model_builder,
                                     provider_options,
                                     opset,
-                                    expected_ep_assignment);
+                                    expected_ep_assignment,
+                                    f32_abs_err);
 }
 
-// Test QDQ Gemm with dynamic inputs A and Bias. The B input is an initializer.
-TEST_F(QnnHTPBackendTests, Gemm_Dynamic_A_Static_B_Dynamic_Bias) {
+// Test 8-bit QDQ Gemm with dynamic inputs A and Bias. The B input is an initializer.
+TEST_F(QnnHTPBackendTests, Gemm_Dynamic_A_Static_B_Dynamic_Bias_U8) {
   std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
   std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
   std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
@@ -211,6 +217,47 @@ TEST_F(QnnHTPBackendTests, Gemm_Dynamic_A_Static_B_Dynamic_Bias) {
                                         ExpectedEPNodeAssignment::All);
 }
 
+// Test 16-bit QDQ Gemm with dynamic inputs A and Bias. The B input is an initializer.
+// TODO: Enable this test. Disabled due to an inaccuracy detected for output 'output_0', element 0.
+// Output quant params: scale=0.001872879103757441, zero_point=0.
+// Expected val: 120.73912048339844
+// QNN QDQ val: 0 (err 120.73912048339844)
+// CPU QDQ val: 120.73889923095703 (err 0.00022125244140625)
+TEST_F(QnnHTPBackendTests, DISABLED_Gemm_Dynamic_A_Static_B_Dynamic_Bias_U16) {
+  std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
+  std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
+  std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
+  RunQDQGemmTestOnHTP<uint16_t, uint16_t>({TestInputDef<float>({1, 6}, false, input_a_data),   // A (dynamic)
+                                           TestInputDef<float>({6, 4}, true, input_b_data),    // B (initializer)
+                                           TestInputDef<float>({1, 4}, false, input_c_data)},  // Bias (dynamic)
+                                          {},  // Attributes
+                                          ExpectedEPNodeAssignment::All,
+                                          13,     // opset
+                                          1e-4f,  // f32_abs_err
+                                          true);  // Use com.microsoft Q/DQ ops
+}
+
+// Test QDQ Gemm (16bit act, 8bit weight) with dynamic inputs A and Bias. The B input is an initializer.
+// TODO: Allow small inaccuracies based on % of expected value. For now, this test passes f32_abs_err = 0.15f.
+// Inaccuracy detected for output 'output_0', element 0.
+// Output quant params: scale=0.001872879103757441, zero_point=0.
+// Expected val: 120.73912048339844
+// QNN QDQ val: 120.48043823242188 (err 0.2586822509765625)
+// CPU QDQ val: 120.48980712890625 (err 0.2493133544921875)
+TEST_F(QnnHTPBackendTests, Gemm_Dynamic_A_Static_B_Dynamic_Bias_U16Act_U8Weight) {
+  std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
+  std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
+  std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
+  RunQDQGemmTestOnHTP<uint16_t, uint8_t>({TestInputDef<float>({1, 6}, false, input_a_data),   // A (dynamic)
+                                          TestInputDef<float>({6, 4}, true, input_b_data),    // B (initializer)
+                                          TestInputDef<float>({1, 4}, false, input_c_data)},  // Bias (dynamic)
+                                         {},  // Attributes
+                                         ExpectedEPNodeAssignment::All,
+                                         13,     // opset
+                                         0.15f,  // f32_abs_err
+                                         true);  // Use com.microsoft Q/DQ ops
+}
+
 // Test QDQ Gemm with dynamic A and B inputs. The Bias is static.
 // TODO: Inaccuracy detected for output 'output', element 0.
 // Output quant params: scale=0.48132994771003723, zero_point=0.
@@ -240,8 +287,8 @@ TEST_F(QnnHTPBackendTests, Gemm_Static_B_And_Bias) {
                                         ExpectedEPNodeAssignment::All);
 }
 
-// Test QDQ Gemm with transposed A/B and static B and Bias inputs.
-TEST_F(QnnHTPBackendTests, Gemm_TransAB_Static_B_And_Bias) {
+// Test 8-bit QDQ Gemm with transposed A/B and static B and Bias inputs.
+TEST_F(QnnHTPBackendTests, Gemm_TransAB_Static_B_And_Bias_U8) {
   std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
   std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
   std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
@@ -253,6 +300,28 @@ TEST_F(QnnHTPBackendTests, Gemm_TransAB_Static_B_And_Bias) {
                                         ExpectedEPNodeAssignment::All);
 }
 
+// Test QDQ Gemm (16bit activation, 8bit weight) with transposed A/B and static B and Bias inputs.
+// TODO: Allow small inaccuracies based on % of expected value. For now, this test passes f32_abs_err = 0.15f.
+// Inaccuracy detected for output 'output_0', element 0.
+// Output quant params: scale=0.00047966410056687891, zero_point=0.
+// Expected val: 29.434776306152344
+// QNN QDQ val: 29.191877365112305 (err 0.24289894104003906)
+// CPU QDQ val: 29.197153091430664 (err 0.23762321472167969)
+TEST_F(QnnHTPBackendTests, Gemm_TransAB_Static_B_And_Bias_U16Act_U8Weight) {
+  std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
+  std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
+  std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
+  RunQDQGemmTestOnHTP<uint16_t, uint8_t>({TestInputDef<float>({6, 1}, false, input_a_data),  // A (dynamic)
+                                          TestInputDef<float>({4, 6}, true, input_b_data),   // B (initializer)
+                                          TestInputDef<float>({1, 4}, true, input_c_data)},  // Bias (initializer)
+                                         {utils::MakeAttribute("transA", static_cast<int64_t>(1)),
+                                          utils::MakeAttribute("transB", static_cast<int64_t>(1))},
+                                         ExpectedEPNodeAssignment::All,
+                                         13,     // opset
+                                         0.15f,  // f32_abs_err
+                                         true);  // Use com.microsoft Q/DQ ops
+}
+
 // Test QDQ Gemm with transposed A/B and dynamic (i.e., not initializer) B and Bias inputs.
 TEST_F(QnnHTPBackendTests, Gemm_TransAB_Dynamic_B_And_Bias) {
   std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
diff --git a/onnxruntime/test/providers/qnn/reshape_op_test.cc b/onnxruntime/test/providers/qnn/reshape_op_test.cc
index e117f134015c0..eb495e44ec770 100644
--- a/onnxruntime/test/providers/qnn/reshape_op_test.cc
+++ b/onnxruntime/test/providers/qnn/reshape_op_test.cc
@@ -76,13 +76,16 @@ TEST_F(QnnCPUBackendTests, Reshape_4D_f32) {
 template <typename QuantType>
 GetTestQDQModelFn<QuantType> BuildQDQReshapeTestCase(const TestInputDef<float>& input_def,
                                                      const TestInputDef<int64_t>& shape_def,
-                                                     const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs) {
-  return [input_def, shape_def, attrs](ModelTestBuilder& builder,
-                                       std::vector<QuantParams<QuantType>>& output_qparams) {
+                                                     const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                                     bool use_contrib_qdq = false) {
+  return [input_def, shape_def, attrs,
+          use_contrib_qdq](ModelTestBuilder& builder,
+                           std::vector<QuantParams<QuantType>>& output_qparams) {
     // input -> Q -> DQ ->
     NodeArg* input = MakeTestInput(builder, input_def);
     QuantParams<QuantType> input_qparams = GetTestInputQuantParams<QuantType>(input_def);
-    NodeArg* input_qdq = AddQDQNodePair<QuantType>(builder, input, input_qparams.scale, input_qparams.zero_point);
+    NodeArg* input_qdq = AddQDQNodePair<QuantType>(builder, input, input_qparams.scale, input_qparams.zero_point,
+                                                   use_contrib_qdq);
 
     // shape input
     NodeArg* shape_input = MakeTestInput(builder, shape_def);
@@ -99,7 +102,7 @@ GetTestQDQModelFn<QuantType> BuildQDQReshapeTestCase(const TestInputDef<float>&
     // NOTE: Input and output quantization parameters must be equal for Reshape.
     output_qparams[0] = input_qparams;  // Overwrite!
     AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, reshape_output, input_qparams.scale,
-                                                     input_qparams.zero_point);
+                                                     input_qparams.zero_point, use_contrib_qdq);
   };
 }
 
@@ -132,7 +135,8 @@ static void RunQDQReshapeTestOnHTP(const TestInputDef<float>& input_def,
                                    const TestInputDef<int64_t>& shape_def,
                                    const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
                                    ExpectedEPNodeAssignment expected_ep_assignment,
-                                   int opset = 19) {
+                                   int opset = 19,
+                                   bool use_contrib_qdq = false) {
   ProviderOptions provider_options;
 
 #if defined(_WIN32)
@@ -142,7 +146,7 @@ static void RunQDQReshapeTestOnHTP(const TestInputDef<float>& input_def,
 #endif
 
   auto f32_model_builder = BuildOpTestCase<float, int64_t>("Reshape", {input_def}, {shape_def}, attrs);
-  auto qdq_model_builder = BuildQDQReshapeTestCase<QType>(input_def, shape_def, attrs);
+  auto qdq_model_builder = BuildQDQReshapeTestCase<QType>(input_def, shape_def, attrs, use_contrib_qdq);
   TestQDQModelAccuracy(f32_model_builder,
                        qdq_model_builder,
                        provider_options,
@@ -168,8 +172,8 @@ TEST_F(QnnHTPBackendTests, Reshape_AllowZeroAttr_Unsupported) {
                                   19);                             // Opset
 }
 
-// Test QDQ Reshape of rank 4 -> rank 2.
-TEST_F(QnnHTPBackendTests, Reshape_4D_f32) {
+// Test 8-bit QDQ Reshape of rank 4 -> rank 2.
+TEST_F(QnnHTPBackendTests, Reshape_4D_u8) {
   RunQDQReshapeTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
                                   TestInputDef<int64_t>({2}, true, {1, 48}),
                                   {},  // Attributes
@@ -177,6 +181,16 @@ TEST_F(QnnHTPBackendTests, Reshape_4D_f32) {
                                   19);  // Opset
 }
 
+// Test 16-bit QDQ Reshape of rank 4 -> rank 2.
+TEST_F(QnnHTPBackendTests, Reshape_4D_u16) {
+  RunQDQReshapeTestOnHTP<uint16_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                                   TestInputDef<int64_t>({2}, true, {1, 48}),
+                                   {},  // Attributes
+                                   ExpectedEPNodeAssignment::All,
+                                   19,     // Opset
+                                   true);  // Use com.microsoft Q/DQ ops
+}
+
 // Test that int32 Reshape runs on HTP backend.
 TEST_F(QnnHTPBackendTests, Reshape_4D_int32) {
   std::vector<int32_t> input_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
diff --git a/onnxruntime/test/providers/qnn/split_op_test.cc b/onnxruntime/test/providers/qnn/split_op_test.cc
index 6a93bbacabda2..57e4b211777bb 100644
--- a/onnxruntime/test/providers/qnn/split_op_test.cc
+++ b/onnxruntime/test/providers/qnn/split_op_test.cc
@@ -267,7 +267,8 @@ static void RunQDQSplitOpTestOnHTP(const TestInputDef<float>& input_def,
                                    int64_t axis,
                                    int64_t num_outputs,
                                    int opset,
-                                   ExpectedEPNodeAssignment expected_ep_assignment) {
+                                   ExpectedEPNodeAssignment expected_ep_assignment,
+                                   bool use_contrib_qdq = false) {
   ProviderOptions provider_options;
 
 #if defined(_WIN32)
@@ -278,7 +279,8 @@ static void RunQDQSplitOpTestOnHTP(const TestInputDef<float>& input_def,
 
   const bool split_is_input = opset >= 13;
   auto f32_model_builder = BuildSplitTestCase<float>(input_def, split, split_is_input, axis, num_outputs);
-  auto qdq_model_builder = BuildQDQSplitTestCase<QuantType>(input_def, split, split_is_input, axis, num_outputs);
+  auto qdq_model_builder = BuildQDQSplitTestCase<QuantType>(input_def, split, split_is_input, axis, num_outputs,
+                                                            use_contrib_qdq);
   TestQDQModelAccuracy<QuantType>(f32_model_builder,
                                   qdq_model_builder,
                                   provider_options,
@@ -297,7 +299,7 @@ TEST_F(QnnHTPBackendTests, Split_Int32_Opset13) {
                                ExpectedEPNodeAssignment::All);
 }
 
-// Test QDQ Split opset 18 on HTP backend: equal split of axis 0 via 'num_outputs' attribute
+// Test 8-bit QDQ Split opset 18 on HTP backend: equal split of axis 0 via 'num_outputs' attribute
 // and 'split' input.
 TEST_F(QnnHTPBackendTests, Split_Equal_Axis0_Opset18) {
   // Use 'split' input (initializer).
@@ -317,6 +319,28 @@ TEST_F(QnnHTPBackendTests, Split_Equal_Axis0_Opset18) {
                                   ExpectedEPNodeAssignment::All);
 }
 
+// Test 16-bit QDQ Split opset 18 on HTP backend: equal split of axis 0 via 'num_outputs' attribute
+// and 'split' input.
+TEST_F(QnnHTPBackendTests, Split_Equal_Axis0_Opset18_U16) {
+  // Use 'split' input (initializer).
+  RunQDQSplitOpTestOnHTP<uint16_t>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
+                                   {2, 2},  // split
+                                   0,       // axis
+                                   -1,      // num_outputs
+                                   18,      // opset
+                                   ExpectedEPNodeAssignment::All,
+                                   true);  // Use com.microsoft Q/DQ ops
+
+  // Use 'num_outputs' attribute.
+  RunQDQSplitOpTestOnHTP<uint16_t>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
+                                   {},  // split (use num_outputs instead)
+                                   0,   // axis
+                                   2,   // num_outputs
+                                   18,  // opset
+                                   ExpectedEPNodeAssignment::All,
+                                   true);  // Use com.microsoft Q/DQ ops
+}
+
 // Test QDQ Split op on HTP backend: equal split on axis 0 with opset 13.
 TEST_F(QnnHTPBackendTests, Split_Equal_Axis0_Opset13) {
   RunQDQSplitOpTestOnHTP<uint8_t>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
diff --git a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
index 454bcb280b3e8..33d2f64c0315e 100644
--- a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
+++ b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
@@ -13,33 +13,6 @@
 namespace onnxruntime {
 namespace test {
 
-// Returns a function that creates a graph with a QDQ (Un)Squeeze operator.
-template <typename QuantType>
-GetTestQDQModelFn<QuantType> BuildQDQSqueezeTestCase(const std::string& op_type,  // Squeeze or Unsqueeze
-                                                     const TestInputDef<float>& input_def,
-                                                     const TestInputDef<int64_t>& axes_def) {
-  return [op_type, input_def, axes_def](ModelTestBuilder& builder,
-                                        std::vector<QuantParams<QuantType>>& output_qparams) {
-    // input -> Q -> DQ ->
-    NodeArg* input = MakeTestInput(builder, input_def);
-    QuantParams<QuantType> input_qparams = GetTestInputQuantParams<QuantType>(input_def);
-    NodeArg* input_qdq = AddQDQNodePair<QuantType>(builder, input, input_qparams.scale, input_qparams.zero_point);
-
-    // axes input
-    NodeArg* axes_input = MakeTestInput(builder, axes_def);
-
-    // (Un)Squeeze op
-    NodeArg* op_output = builder.MakeIntermediate();
-    builder.AddNode(op_type, {input_qdq, axes_input}, {op_output});
-
-    // op_output -> Q -> DQ -> output
-    // NOTE: Input and output quantization parameters must be equal for (Un)Squeeze.
-    output_qparams[0] = input_qparams;  // Overwrite!
-    AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, op_output, input_qparams.scale,
-                                                     input_qparams.zero_point);
-  };
-}
-
 // Runs a model with a Squeeze (or Unsqueeze) operator on the QNN CPU backend. Checks the graph node assignment
 // and that inference outputs for QNN EP and CPU EP match.
 template <typename DataType>
@@ -62,52 +35,6 @@ static void RunSqueezeTestOnCPU(const std::string& op_type,  // Squeeze or Unsqu
                   expected_ep_assignment);
 }
 
-// Runs a model with a non-QDQ (Un)Squeeze operator on the QNN HTP backend. Checks the graph node assignment
-// and that inference outputs for QNN EP and CPU EP match.
-template <typename DataType>
-static void RunSqueezeTestOnHTP(const std::string& op_type,  // Squeeze or Unsqueeze
-                                const TestInputDef<DataType>& input_def,
-                                const TestInputDef<int64_t>& axes_def,
-                                ExpectedEPNodeAssignment expected_ep_assignment,
-                                int opset = 13) {
-  ProviderOptions provider_options;
-
-#if defined(_WIN32)
-  provider_options["backend_path"] = "QnnHtp.dll";
-#else
-  provider_options["backend_path"] = "libQnnHtp.so";
-#endif
-
-  RunQnnModelTest(BuildOpTestCase<DataType, int64_t>(op_type, {input_def}, {axes_def}, {}),
-                  provider_options,
-                  opset,
-                  expected_ep_assignment);
-}
-
-// Runs a QDQ (Un)Squeeze model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment and
-// that inference running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP
-// (when compared to the baseline float32 model).
-template <typename QType>
-static void RunQDQSqueezeTestOnHTP(const std::string& op_type,
-                                   const TestInputDef<float>& input_def,
-                                   const TestInputDef<int64_t>& axes_def,
-                                   ExpectedEPNodeAssignment expected_ep_assignment,
-                                   int opset = 13) {
-  ProviderOptions provider_options;
-
-#if defined(_WIN32)
-  provider_options["backend_path"] = "QnnHtp.dll";
-#else
-  provider_options["backend_path"] = "libQnnHtp.so";
-#endif
-
-  TestQDQModelAccuracy(BuildOpTestCase<float, int64_t>(op_type, {input_def}, {axes_def}, {}),  // baseline float32 model
-                       BuildQDQSqueezeTestCase<QType>(op_type, input_def, axes_def),           // QDQ model
-                       provider_options,
-                       opset,
-                       expected_ep_assignment);
-}
-
 //
 // CPU tests:
 //
@@ -165,6 +92,86 @@ TEST_F(QnnCPUBackendTests, Unsqueeze_Rank3_Rank4_NegAxes_f32) {
 // HTP tests:
 //
 
+// Returns a function that creates a graph with a QDQ (Un)Squeeze operator.
+template <typename QuantType>
+GetTestQDQModelFn<QuantType> BuildQDQSqueezeTestCase(const std::string& op_type,  // Squeeze or Unsqueeze
+                                                     const TestInputDef<float>& input_def,
+                                                     const TestInputDef<int64_t>& axes_def,
+                                                     bool use_contrib_qdq = false) {  // true: com.microsoft Q/DQ ops
+  return [op_type, input_def, axes_def,
+          use_contrib_qdq](ModelTestBuilder& builder,
+                           std::vector<QuantParams<QuantType>>& output_qparams) {
+    // input -> Q -> DQ ->
+    NodeArg* input = MakeTestInput(builder, input_def);
+    QuantParams<QuantType> input_qparams = GetTestInputQuantParams<QuantType>(input_def);
+    NodeArg* input_qdq = AddQDQNodePair<QuantType>(builder, input, input_qparams.scale, input_qparams.zero_point,
+                                                   use_contrib_qdq);
+
+    // axes input
+    NodeArg* axes_input = MakeTestInput(builder, axes_def);
+
+    // (Un)Squeeze op
+    NodeArg* op_output = builder.MakeIntermediate();
+    builder.AddNode(op_type, {input_qdq, axes_input}, {op_output});
+
+    // op_output -> Q -> DQ -> output
+    // NOTE: Input and output quantization parameters must be equal for (Un)Squeeze.
+    output_qparams[0] = input_qparams;  // Overwrite!
+    AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, op_output, input_qparams.scale,
+                                                     input_qparams.zero_point, use_contrib_qdq);
+  };
+}
+
+// Runs a model with a non-QDQ (Un)Squeeze operator on the QNN HTP backend. Checks the graph node assignment
+// and that inference outputs for QNN EP and CPU EP match.
+template <typename DataType>
+static void RunSqueezeTestOnHTP(const std::string& op_type,  // Squeeze or Unsqueeze
+                                const TestInputDef<DataType>& input_def,
+                                const TestInputDef<int64_t>& axes_def,
+                                ExpectedEPNodeAssignment expected_ep_assignment,
+                                int opset = 13) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  RunQnnModelTest(BuildOpTestCase<DataType, int64_t>(op_type, {input_def}, {axes_def}, {}),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+// Runs a QDQ (Un)Squeeze model on the QNN (HTP) EP and the ORT CPU EP. Checks the graph node assignment and
+// that inference running the QDQ model on QNN EP is at least as accurate as on ORT CPU EP
+// (when compared to the baseline float32 model).
+template <typename QType>
+static void RunQDQSqueezeTestOnHTP(const std::string& op_type,
+                                   const TestInputDef<float>& input_def,
+                                   const TestInputDef<int64_t>& axes_def,
+                                   ExpectedEPNodeAssignment expected_ep_assignment,
+                                   int opset = 13,
+                                   bool use_contrib_qdq = false) {  // true: use com.microsoft Q/DQ ops
+  ProviderOptions provider_options;
+  // Select the HTP backend library name for the current platform.
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+  // Build the float32 baseline model and the QDQ model that is compared against it.
+  auto f32_model_builder = BuildOpTestCase<float, int64_t>(op_type, {input_def}, {axes_def}, {});
+  auto qdq_model_builder = BuildQDQSqueezeTestCase<QType>(op_type, input_def, axes_def, use_contrib_qdq);
+
+  TestQDQModelAccuracy(f32_model_builder,
+                       qdq_model_builder,
+                       provider_options,
+                       opset,
+                       expected_ep_assignment);
+}
+
 // Test that QDQ Squeeze with a dynamic axes input is not supported by QNN EP.
 TEST_F(QnnHTPBackendTests, Squeeze_DynamicAxes_Unsupported) {
   RunQDQSqueezeTestOnHTP<uint8_t>("Squeeze",
@@ -219,14 +226,24 @@ TEST_F(QnnHTPBackendTests, Squeeze_Rank5_Rank2_f32) {
                   ExpectedEPNodeAssignment::All);
 }
 
-// Test QDQ Squeeze of rank 4 -> rank 3 with a negative axes value.
-TEST_F(QnnHTPBackendTests, Squeeze_Rank4_Rank3_NegAxes_f32) {
+// Test 8-bit QDQ Squeeze of rank 4 -> rank 3 with a negative axes value.
+TEST_F(QnnHTPBackendTests, Squeeze_Rank4_Rank3_NegAxes_u8) {
   RunQDQSqueezeTestOnHTP<uint8_t>("Squeeze",
                                   TestInputDef<float>({1, 3, 2, 1}, false, -10.0f, 10.0f),
                                   TestInputDef<int64_t>({1}, true, {-1}),  // Squeeze last axis => (1, 3, 2)
                                   ExpectedEPNodeAssignment::All);
 }
 
+// Test 16-bit QDQ Squeeze of rank 4 -> rank 3 with a negative axes value.
+TEST_F(QnnHTPBackendTests, Squeeze_Rank4_Rank3_NegAxes_u16) {
+  RunQDQSqueezeTestOnHTP<uint16_t>("Squeeze",
+                                   TestInputDef<float>({1, 3, 2, 1}, false, -10.0f, 10.0f),
+                                   TestInputDef<int64_t>({1}, true, {-1}),  // Squeeze last axis => (1, 3, 2)
+                                   ExpectedEPNodeAssignment::All,
+                                   13,     // opset
+                                   true);  // Use com.microsoft Q/DQ ops
+}
+
 // Test QDQ Unsqueeze of rank 3 -> rank 5.
 TEST_F(QnnHTPBackendTests, Unsqueeze_Rank3_Rank5_f32) {
   // We can't use the usual model-building functions because they add standalone Quantize and Dequantize nodes
@@ -265,14 +282,24 @@ TEST_F(QnnHTPBackendTests, Unsqueeze_Rank3_Rank5_f32) {
                   ExpectedEPNodeAssignment::All);
 }
 
-// Test Unsqueeze of rank 3 -> rank 4 with a negative axes value.
-TEST_F(QnnHTPBackendTests, Unsqueeze_Rank3_Rank4_NegAxes_f32) {
+// Test 8-bit QDQ Unsqueeze of rank 3 -> rank 4 with a negative axes value.
+TEST_F(QnnHTPBackendTests, Unsqueeze_Rank3_Rank4_NegAxes_u8) {
   RunQDQSqueezeTestOnHTP<uint8_t>("Unsqueeze",
                                   TestInputDef<float>({1, 3, 2}, false, -10.0f, 10.0f),
                                   TestInputDef<int64_t>({1}, true, {-1}),  // Add 1 as last axis => (1, 3, 2, 1)
                                   ExpectedEPNodeAssignment::All);
 }
 
+// Test 16-bit QDQ Unsqueeze of rank 3 -> rank 4 with a negative axes value.
+TEST_F(QnnHTPBackendTests, Unsqueeze_Rank3_Rank4_NegAxes_u16) {
+  RunQDQSqueezeTestOnHTP<uint16_t>("Unsqueeze",
+                                   TestInputDef<float>({1, 3, 2}, false, -10.0f, 10.0f),
+                                   TestInputDef<int64_t>({1}, true, {-1}),  // Add 1 as last axis => (1, 3, 2, 1)
+                                   ExpectedEPNodeAssignment::All,
+                                   13,     // opset
+                                   true);  // Use com.microsoft Q/DQ ops
+}
+
 // Test that int32 Squeeze runs on HTP backend.
 TEST_F(QnnHTPBackendTests, Squeeze_Int32_Rank4_Rank3) {
   std::vector<int32_t> input_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};

From b1e832aa0537d4c1662135f00a77a47b9c3798af Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Wed, 20 Sep 2023 02:49:21 -0700
Subject: [PATCH 21/22] Enable QDQ TopK on QNN EP's HTP backend. Add TopK unit
 tests.

---
 .../selectors_actions/qdq_selectors.cc        |  36 +++
 .../selectors_actions/qdq_selectors.h         |   8 +
 .../selectors_actions/shared/utils.cc         |  14 +-
 .../providers/qnn/builder/opbuilder/topk.cc   |  15 +-
 .../test/optimizer/qdq_transformer_test.cc    |  23 +-
 .../test/providers/qnn/topk_op_test.cc        | 209 ++++++++++++++++++
 6 files changed, 291 insertions(+), 14 deletions(-)
 create mode 100644 onnxruntime/test/providers/qnn/topk_op_test.cc

diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc
index 16c7bd5fce960..5015e48fdb7b8 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc
@@ -496,6 +496,42 @@ bool LogicalComparisonNodeGroupSelector::Check(const GraphViewer& graph_viewer,
   return dt_input_1 == dt_input_2;
 }
 
+bool TopKNodeGroupSelector::Check(const GraphViewer& graph_viewer,
+                                  const Node& node,
+                                  const std::vector<const Node*>& dq_nodes,
+                                  const std::vector<const Node*>& q_nodes) const {
+  constexpr int num_dq_inputs = 1;  // The node group must contain exactly one DQ node.
+  constexpr int num_q_outputs = 1;  // The node group must contain exactly one Q node.
+  if (num_dq_inputs != gsl::narrow_cast<int>(dq_nodes.size())) {
+    return false;
+  }
+  // Reject the group if QDQ::ValidateNodeGroupDQNodes reports an error for these DQ nodes.
+  if (const auto dq_validation_status = QDQ::ValidateNodeGroupDQNodes(graph_viewer, node, dq_nodes);
+      !dq_validation_status.IsOK()) {
+    return false;
+  }
+
+  if (num_q_outputs != gsl::narrow_cast<int>(q_nodes.size())) {
+    return false;
+  }
+
+  const Node& dq_node = *dq_nodes.front();
+  const Node& q_node = *q_nodes.front();
+  // The element type entering the DQ node must equal the element type leaving the Q node.
+  int32_t dt_input = dq_node.InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
+  int32_t dt_output = q_node.OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
+
+  if (dt_input != dt_output) {
+    return false;
+  }
+  // Callback that looks up a constant initializer by name; handed to IsQDQPairSupported below.
+  auto get_const_initializer = [&graph_viewer](const std::string& initializer_name) {
+    return graph_viewer.GetConstantInitializer(initializer_name, true);
+  };
+  // The final decision is delegated to IsQDQPairSupported for this Q/DQ pair.
+  return IsQDQPairSupported(q_node, dq_node, get_const_initializer, graph_viewer.ModelPath());
+}
+
 }  // namespace QDQ
 }  // namespace onnxruntime
 
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h
index d8fefdd8dc3d9..be7f7e0288eda 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h
@@ -220,6 +220,14 @@ class LogicalComparisonNodeGroupSelector : public NodeGroupSelector {
              const std::vector<const Node*>& q_nodes) const override;
 };
 
+// TopK has 1 DQ input node and 1 Q output node, which must use the same quantized element type.
+// The zero point and scale of the Q/DQ pair are constant scalars and must match.
+class TopKNodeGroupSelector : public NodeGroupSelector {
+  bool Check(const GraphViewer& graph_viewer, const Node& node,
+             const std::vector<const Node*>& dq_nodes,
+             const std::vector<const Node*>& q_nodes) const override;
+};
+
 /*
  * NodeSelector instances for use in the QDQ::SelectorActionTransformer.
  */
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
index 293c885858179..3f1b2f0458bc0 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
@@ -38,7 +38,7 @@ static const OpVersionsAndSelector::OpVersionsMap GetMiscOpVersionsMap() {
           {"Squeeze", {}},
           {"Unsqueeze", {}},
           {"Tile", {}}};
-  }
+}
 
 static const OpVersionsAndSelector::OpVersionsMap GetDropDQOpVersionsMap() {
   return {{"ArgMax", {}},
@@ -129,6 +129,10 @@ static const OpVersionsAndSelector::OpVersionsMap GetPadOpVersionsMap() {
   return {{"Pad", {}}};
 }
 
+static const OpVersionsAndSelector::OpVersionsMap GetTopKOpVersionsMap() {
+  return {{"TopK", {}}};
+}
+
 /* Selector rules registration related */
 void RegisterMiscSelectors(Selectors& qdq_selectors) {
   /* register selectors for miscellaneous ops */
@@ -229,6 +233,13 @@ void RegisterPadSelectors(Selectors& qdq_selectors) {
                                  std::move(selector));
 }
 
+void RegisterTopKSelector(Selectors& qdq_selectors) {
+  /* register selector for TopK op */
+  std::unique_ptr<NodeGroupSelector> selector = std::make_unique<TopKNodeGroupSelector>();
+  qdq_selectors.RegisterSelector(GetTopKOpVersionsMap(),
+                                 std::move(selector));
+}
+
 void SelectorManager::CreateSelectors() {
   RegisterMiscSelectors(qdq_selectors_);
   RegisterDropDQSelectors(qdq_selectors_);
@@ -244,6 +255,7 @@ void SelectorManager::CreateSelectors() {
   RegisterLogicalComparisonSelectors(qdq_selectors_);
   RegisterWhereSelectors(qdq_selectors_);
   RegisterPadSelectors(qdq_selectors_);
+  RegisterTopKSelector(qdq_selectors_);
 }
 
 void SelectorManager::InitializeSelectorsMap() {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc
index 6ca36736f2f7f..047972294f78c 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc
@@ -63,9 +63,20 @@ Status TopKOpBuilder::ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const N
   auto rank = input_shape.size();
   auto axis = node_helper.Get("axis", -1);
 
-  if (-1 == axis && axis != static_cast<int32_t>(rank - 1)) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN TopK axis is always the last dimension");
+  ORT_RETURN_IF_NOT(axis == -1 || axis == static_cast<int32_t>(rank - 1),
+                    "QNN TopK's axis is always the last dimension");
+
+  // ONNX TopK outputs int64 indices, but the equivalent QNN op outputs uint32 indices.
+  // The QNN HTP backend does not generally support the int64 type, but QNN EP can just use the uint32 type
+  // for TopK ops within the graph. However, if the TopK op **generates** a graph output,
+  // then we cannot support it on the HTP backend.
+  bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType());
+  if (is_npu_backend) {
+    const std::string& output_name = node_unit.Outputs()[0].node_arg.Name();
+    ORT_RETURN_IF(qnn_model_wrapper.IsGraphOutput(output_name),
+                  "QNN EP does not support TopK ops that generate a graph output.");
   }
+
   return Status::OK();
 }
 
diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc
index 2cca44e4d834b..63129ef2fff1e 100644
--- a/onnxruntime/test/optimizer/qdq_transformer_test.cc
+++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc
@@ -2517,18 +2517,19 @@ TEST(QDQTransformerTests, Clip) {
     test_case(.04f, static_cast<int8_t>(-97), 1, opset, true);          // [-1.24, 8.96] contrib qdq
     test_case(.02352941176f, static_cast<uint8_t>(0), 0, opset);        // [0, 6]
     test_case(.02352941176f, static_cast<uint8_t>(0), 0, opset, true);  // [0, 6] contrib qdq
-    test_case(9.15541313801785e-5f, static_cast<uint16_t>(0), 0, opset, true);  // [0, 6] contrib 16-bit qdq
+    test_case(9.15541313801785e-5f, static_cast<uint16_t>(0),
+              0, opset, true);                                     // [0, 6] contrib 16-bit qdq
     test_case(0.0009f, static_cast<uint16_t>(0), 1, opset, true);  // [0, 58.98] contrib 16-bit qdq
-    test_case(.02f, static_cast<uint8_t>(0), 0, opset);                 // [0, 5.1]
-    test_case(.02f, static_cast<uint8_t>(0), 0, opset, true);           // [0, 5.1] contrib qdq
-    test_case(.03f, static_cast<uint8_t>(0), 1, opset);                 // [0, 7.65]
-    test_case(.03f, static_cast<uint8_t>(0), 1, opset, true);           // [0, 7.65] contrib qdq
-    test_case(.02f, static_cast<uint8_t>(255), 1, opset);               // [-5.1, 0]
-    test_case(.02f, static_cast<uint8_t>(255), 1, opset, true);         // [-5.1, 0] contrib qdq
-    test_case(.02f, static_cast<uint8_t>(128), 1, opset);               // [-2.56, 2.54]
-    test_case(.02f, static_cast<uint8_t>(128), 1, opset, true);         // [-2.56, 2.54] contrib qdq
-    test_case(.04f, static_cast<uint8_t>(31), 1, opset);                // [-1.24, 8.96]
-    test_case(.04f, static_cast<uint8_t>(31), 1, opset, true);          // [-1.24, 8.96] contrib qdq
+    test_case(.02f, static_cast<uint8_t>(0), 0, opset);            // [0, 5.1]
+    test_case(.02f, static_cast<uint8_t>(0), 0, opset, true);      // [0, 5.1] contrib qdq
+    test_case(.03f, static_cast<uint8_t>(0), 1, opset);            // [0, 7.65]
+    test_case(.03f, static_cast<uint8_t>(0), 1, opset, true);      // [0, 7.65] contrib qdq
+    test_case(.02f, static_cast<uint8_t>(255), 1, opset);          // [-5.1, 0]
+    test_case(.02f, static_cast<uint8_t>(255), 1, opset, true);    // [-5.1, 0] contrib qdq
+    test_case(.02f, static_cast<uint8_t>(128), 1, opset);          // [-2.56, 2.54]
+    test_case(.02f, static_cast<uint8_t>(128), 1, opset, true);    // [-2.56, 2.54] contrib qdq
+    test_case(.04f, static_cast<uint8_t>(31), 1, opset);           // [-1.24, 8.96]
+    test_case(.04f, static_cast<uint8_t>(31), 1, opset, true);     // [-1.24, 8.96] contrib qdq
   }
 
   // opset_version = 10
diff --git a/onnxruntime/test/providers/qnn/topk_op_test.cc b/onnxruntime/test/providers/qnn/topk_op_test.cc
new file mode 100644
index 0000000000000..93e725af5f20e
--- /dev/null
+++ b/onnxruntime/test/providers/qnn/topk_op_test.cc
@@ -0,0 +1,209 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#if !defined(ORT_MINIMAL_BUILD)
+
+#include <string>
+
+#include "test/providers/qnn/qnn_test_utils.h"
+#include "core/graph/node_attr_utils.h"
+
+#include "onnx/onnx_pb.h"
+#include "gtest/gtest.h"
+
+namespace onnxruntime {
+namespace test {
+
+// Returns a function that builds a model with one TopK node (inputs: data and K; outputs: values and indices).
+template <typename DataType>
+inline GetTestModelFn BuildTopKTestCase(const TestInputDef<DataType>& input_def,
+                                        const TestInputDef<int64_t>& k_def,
+                                        const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                        bool cast_output_indices = true) {
+  return [input_def, k_def, attrs, cast_output_indices](ModelTestBuilder& builder) {
+    NodeArg* input = MakeTestInput<DataType>(builder, input_def);
+    NodeArg* k_input = MakeTestInput<int64_t>(builder, k_def);
+
+    NodeArg* values_output = builder.MakeOutput();
+    NodeArg* indices_output = cast_output_indices ? builder.MakeIntermediate() : builder.MakeOutput();
+    Node& topk_node = builder.AddNode("TopK", {input, k_input}, {values_output, indices_output});
+
+    for (const auto& attr : attrs) {  // Attach the caller-provided attributes to the TopK node.
+      topk_node.AddAttributeProto(attr);
+    }
+
+    // Cast indices to uint32: TopK's indices are then an intermediate and the Cast output is a graph output.
+    if (cast_output_indices) {
+      auto* uint32_indices_output = builder.MakeOutput();
+      Node& cast_node = builder.AddNode("Cast", {indices_output}, {uint32_indices_output});
+      const auto dst_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT32;
+      cast_node.AddAttribute("to", static_cast<int64_t>(dst_type));
+    }
+  };
+}
+
+// Runs a model with a TopK operator on the QNN CPU backend. Checks the graph node assignment
+// and that inference outputs for QNN EP and CPU EP match. The TopK indices output is not cast to uint32.
+template <typename DataType>
+static void RunTopKTestOnCPU(const TestInputDef<DataType>& input_def,
+                             const TestInputDef<int64_t>& k_def,
+                             const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                             ExpectedEPNodeAssignment expected_ep_assignment,
+                             int opset = 19) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnCpu.dll";
+#else
+  provider_options["backend_path"] = "libQnnCpu.so";
+#endif
+
+  RunQnnModelTest(BuildTopKTestCase<DataType>(input_def, k_def, attrs, false /*cast_output_indices*/),
+                  provider_options,
+                  opset,
+                  expected_ep_assignment);
+}
+
+//
+// CPU tests:
+//
+
+// Test that TopK with a dynamic (non-initializer) K input is not supported by QNN EP.
+TEST_F(QnnCPUBackendTests, TopK_DynamicK_Unsupported) {
+  RunTopKTestOnCPU<float>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                          TestInputDef<int64_t>({1}, false /* is_initializer */, {2}),
+                          {},                               // Attributes
+                          ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
+}
+
+// Test that TopK with axis=1 on a rank-4 input (i.e., not the last dimension) is not supported by QNN EP.
+TEST_F(QnnCPUBackendTests, TopK_NonLastAxis_Unsupported) {
+  RunTopKTestOnCPU<float>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                          TestInputDef<int64_t>({1}, true /* is_initializer */, {2}),
+                          {utils::MakeAttribute("axis", static_cast<int64_t>(1))},
+                          ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
+}
+
+// Test that TopK with largest=0 (returns the top k minimum values) is not supported by QNN EP.
+TEST_F(QnnCPUBackendTests, TopK_MinValues_Unsupported) {
+  RunTopKTestOnCPU<float>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                          TestInputDef<int64_t>({1}, true /* is_initializer */, {2}),
+                          {utils::MakeAttribute("largest", static_cast<int64_t>(0))},
+                          ExpectedEPNodeAssignment::None);  // Should not be assigned to QNN EP.
+}
+
+// Test TopK on CPU backend: top 2 largest floats from the last axis (K is an initializer, no attributes set).
+TEST_F(QnnCPUBackendTests, TopK_LargestFloats_LastAxis) {
+  RunTopKTestOnCPU<float>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                          TestInputDef<int64_t>({1}, true /* is_initializer */, {2}),
+                          {},  // Attributes
+                          ExpectedEPNodeAssignment::All);
+}
+
+// Test TopK on CPU backend: top 2 largest int32s from the last axis (K is an initializer, no attributes set).
+TEST_F(QnnCPUBackendTests, TopK_LargestInt32s_LastAxis) {
+  std::vector<int32_t> input_data = {-6, -5, -4, -3, -2, 0, 1, 2, 3, 4, 5, 6};  // 12 values for shape {1, 2, 2, 3}
+  RunTopKTestOnCPU<int32_t>(TestInputDef<int32_t>({1, 2, 2, 3}, false, input_data),
+                            TestInputDef<int64_t>({1}, true /* is_initializer */, {2}),
+                            {},  // Attributes
+                            ExpectedEPNodeAssignment::All);
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+//
+// HTP tests:
+//
+
+// Returns a function that creates a graph with a QDQ TopK operator. The indices output is cast to uint32.
+template <typename QuantType>
+GetTestQDQModelFn<QuantType> BuildQDQTopKTestCase(const TestInputDef<float>& input_def,
+                                                  const TestInputDef<int64_t>& k_def,
+                                                  const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                                  bool use_contrib_qdq = false) {
+  return [input_def, k_def, attrs, use_contrib_qdq](ModelTestBuilder& builder,
+                                                    std::vector<QuantParams<QuantType>>& output_qparams) {
+    // input -> Q -> DQ ->
+    NodeArg* input = MakeTestInput(builder, input_def);
+    QuantParams<QuantType> input_qparams = GetTestInputQuantParams<QuantType>(input_def);
+    NodeArg* input_qdq = AddQDQNodePair<QuantType>(builder, input, input_qparams.scale, input_qparams.zero_point,
+                                                   use_contrib_qdq);
+
+    // K input
+    NodeArg* k_input = MakeTestInput(builder, k_def);
+
+    // TopK op (both outputs are intermediates: values feed a Q/DQ pair, indices feed a Cast)
+    NodeArg* values_output = builder.MakeIntermediate();
+    NodeArg* indices_output = builder.MakeIntermediate();
+    Node& topk_node = builder.AddNode("TopK", {input_qdq, k_input}, {values_output, indices_output});
+
+    for (const auto& attr : attrs) {
+      topk_node.AddAttributeProto(attr);
+    }
+
+    // op_output -> Q -> DQ -> output
+    // NOTE: TopK's values output reuses the input quantization parameters (values are selected from the input).
+    output_qparams[0] = input_qparams;  // Overwrite!
+    AddQDQNodePairWithOutputAsGraphOutput<QuantType>(builder, values_output, input_qparams.scale,
+                                                     input_qparams.zero_point, use_contrib_qdq);
+
+    // Cast indices to uint32 (HTP backend does not support int64 graph outputs)
+    auto* uint32_indices_output = builder.MakeOutput();
+    Node& cast_node = builder.AddNode("Cast", {indices_output}, {uint32_indices_output});
+    const auto dst_type = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT32;
+    cast_node.AddAttribute("to", static_cast<int64_t>(dst_type));
+  };
+}
+
+// Runs a QDQ TopK model on the QNN (HTP) EP and the ORT CPU EP via TestQDQModelAccuracy. Checks the graph node
+// assignment and that the QDQ model on QNN EP is at least as accurate as on ORT CPU EP (vs. the float32 baseline).
+template <typename QType>
+static void RunQDQTopKTestOnHTP(const TestInputDef<float>& input_def,
+                                const TestInputDef<int64_t>& k_def,
+                                const std::vector<ONNX_NAMESPACE::AttributeProto>& attrs,
+                                ExpectedEPNodeAssignment expected_ep_assignment,
+                                int opset = 19,
+                                bool use_contrib_qdq = false) {
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  auto f32_model_builder = BuildTopKTestCase<float>(input_def, k_def, attrs, true /*cast_output_indices*/);
+  auto qdq_model_builder = BuildQDQTopKTestCase<QType>(input_def, k_def, attrs, use_contrib_qdq);
+  TestQDQModelAccuracy(f32_model_builder,  // float32 baseline (indices also cast to uint32)
+                       qdq_model_builder,
+                       provider_options,
+                       opset,
+                       expected_ep_assignment);
+}
+
+// Test 8-bit QDQ TopK on HTP backend: top 2 largest floats from last axis (helper defaults: opset 19, non-contrib Q/DQ)
+TEST_F(QnnHTPBackendTests, TopK_LargestFloats_U8_LastAxis) {
+  RunQDQTopKTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                               TestInputDef<int64_t>({1}, true /* is_initializer */, {2}),
+                               {},  // Attributes
+                               ExpectedEPNodeAssignment::All);
+}
+
+// Test 16-bit QDQ TopK on HTP backend: top 2 largest floats from last axis
+// TODO: Re-enable once fixed. Currently disabled: inaccuracy detected for output 'output_0', element 6.
+// Output quant params: scale=0.00061036087572574615, zero_point=32768.
+// Expected val: -7.2340402603149414
+// QNN QDQ val: -17.446556091308594 (err 10.212515830993652)
+// CPU QDQ val: -7.2339968681335449 (err 4.3392181396484375e-05)
+TEST_F(QnnHTPBackendTests, DISABLED_TopK_LargestFloats_U16_LastAxis) {
+  RunQDQTopKTestOnHTP<uint16_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-20.0f, 20.0f, 48)),
+                                TestInputDef<int64_t>({1}, true /* is_initializer */, {2}),
+                                {},  // Attributes
+                                ExpectedEPNodeAssignment::All,
+                                19,     // opset
+                                true);  // Use com.microsoft Q/DQ ops
+}
+
+#endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+}  // namespace test
+}  // namespace onnxruntime
+#endif  // !defined(ORT_MINIMAL_BUILD)

From 6694aaed5739c6fb7a1711311fa95b708f803edf Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Wed, 20 Sep 2023 10:34:10 -0700
Subject: [PATCH 22/22] Try to limit the white-space changes made by
 clang-format

---
 .../test/optimizer/qdq_transformer_test.cc    | 58 +++++++++----------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc
index 63129ef2fff1e..d3616a14d8a5d 100644
--- a/onnxruntime/test/optimizer/qdq_transformer_test.cc
+++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc
@@ -2497,39 +2497,39 @@ TEST(QDQTransformerTests, Clip) {
                       epsilon);
   };
 
+  constexpr int16_t int16_min = std::numeric_limits<int16_t>::min();
+  constexpr uint16_t uint16_min = std::numeric_limits<uint16_t>::min();
+
   std::vector<int> opsets{12, 18, 19};
   for (auto opset : opsets) {
     test_case(.0235294122248888f, static_cast<int8_t>(-128), 0, opset);        // [0, 6]
     test_case(.0235294122248888f, static_cast<int8_t>(-128), 0, opset, true);  // [0, 6] contrib qdq
-    test_case(9.15541313801785e-5f, static_cast<int16_t>(std::numeric_limits<int16_t>::min()), 0,
-              opset, true);  // [0, 6] contrib 16-bit qdq
-    test_case(0.0009f, static_cast<int16_t>(std::numeric_limits<int16_t>::min()), 1,
-              opset, true);                                             // [0, 58.98] contrib 16-bit qdq
-    test_case(.02f, static_cast<int8_t>(-128), 0, opset);               // [0, 5.1]
-    test_case(.02f, static_cast<int8_t>(-128), 0, opset, true);         // [0, 5.1] contrib qdq
-    test_case(.03f, static_cast<int8_t>(-128), 1, opset);               // [0, 7.65]
-    test_case(.03f, static_cast<int8_t>(-128), 1, opset, true);         // [0, 7.65] contrib qdq
-    test_case(.02f, static_cast<int8_t>(127), 1, opset);                // [-5.1 , 0]
-    test_case(.02f, static_cast<int8_t>(127), 1, opset, true);          // [-5.1 , 0] contrib qdq
-    test_case(.02f, static_cast<int8_t>(0), 1, opset);                  // [-2.56, 2.54]
-    test_case(.02f, static_cast<int8_t>(0), 1, opset, true);            // [-2.56, 2.54] contrib qdq
-    test_case(.04f, static_cast<int8_t>(-97), 1, opset);                // [-1.24, 8.96]
-    test_case(.04f, static_cast<int8_t>(-97), 1, opset, true);          // [-1.24, 8.96] contrib qdq
-    test_case(.02352941176f, static_cast<uint8_t>(0), 0, opset);        // [0, 6]
-    test_case(.02352941176f, static_cast<uint8_t>(0), 0, opset, true);  // [0, 6] contrib qdq
-    test_case(9.15541313801785e-5f, static_cast<uint16_t>(0),
-              0, opset, true);                                     // [0, 6] contrib 16-bit qdq
-    test_case(0.0009f, static_cast<uint16_t>(0), 1, opset, true);  // [0, 58.98] contrib 16-bit qdq
-    test_case(.02f, static_cast<uint8_t>(0), 0, opset);            // [0, 5.1]
-    test_case(.02f, static_cast<uint8_t>(0), 0, opset, true);      // [0, 5.1] contrib qdq
-    test_case(.03f, static_cast<uint8_t>(0), 1, opset);            // [0, 7.65]
-    test_case(.03f, static_cast<uint8_t>(0), 1, opset, true);      // [0, 7.65] contrib qdq
-    test_case(.02f, static_cast<uint8_t>(255), 1, opset);          // [-5.1, 0]
-    test_case(.02f, static_cast<uint8_t>(255), 1, opset, true);    // [-5.1, 0] contrib qdq
-    test_case(.02f, static_cast<uint8_t>(128), 1, opset);          // [-2.56, 2.54]
-    test_case(.02f, static_cast<uint8_t>(128), 1, opset, true);    // [-2.56, 2.54] contrib qdq
-    test_case(.04f, static_cast<uint8_t>(31), 1, opset);           // [-1.24, 8.96]
-    test_case(.04f, static_cast<uint8_t>(31), 1, opset, true);     // [-1.24, 8.96] contrib qdq
+    test_case(9.15541313801785e-5f, int16_min, 0, opset, true);                // [0, 6] contrib 16-bit qdq
+    test_case(0.0009f, int16_min, 1, opset, true);                             // [0, 58.98] contrib 16-bit qdq
+    test_case(.02f, static_cast<int8_t>(-128), 0, opset);                      // [0, 5.1]
+    test_case(.02f, static_cast<int8_t>(-128), 0, opset, true);                // [0, 5.1] contrib qdq
+    test_case(.03f, static_cast<int8_t>(-128), 1, opset);                      // [0, 7.65]
+    test_case(.03f, static_cast<int8_t>(-128), 1, opset, true);                // [0, 7.65] contrib qdq
+    test_case(.02f, static_cast<int8_t>(127), 1, opset);                       // [-5.1 , 0]
+    test_case(.02f, static_cast<int8_t>(127), 1, opset, true);                 // [-5.1 , 0] contrib qdq
+    test_case(.02f, static_cast<int8_t>(0), 1, opset);                         // [-2.56, 2.54]
+    test_case(.02f, static_cast<int8_t>(0), 1, opset, true);                   // [-2.56, 2.54] contrib qdq
+    test_case(.04f, static_cast<int8_t>(-97), 1, opset);                       // [-1.24, 8.96]
+    test_case(.04f, static_cast<int8_t>(-97), 1, opset, true);                 // [-1.24, 8.96] contrib qdq
+    test_case(.02352941176f, static_cast<uint8_t>(0), 0, opset);               // [0, 6]
+    test_case(.02352941176f, static_cast<uint8_t>(0), 0, opset, true);         // [0, 6] contrib qdq
+    test_case(9.15541313801785e-5f, uint16_min, 0, opset, true);               // [0, 6] contrib 16-bit qdq
+    test_case(0.0009f, uint16_min, 1, opset, true);                            // [0, 58.98] contrib 16-bit qdq
+    test_case(.02f, static_cast<uint8_t>(0), 0, opset);                        // [0, 5.1]
+    test_case(.02f, static_cast<uint8_t>(0), 0, opset, true);                  // [0, 5.1] contrib qdq
+    test_case(.03f, static_cast<uint8_t>(0), 1, opset);                        // [0, 7.65]
+    test_case(.03f, static_cast<uint8_t>(0), 1, opset, true);                  // [0, 7.65] contrib qdq
+    test_case(.02f, static_cast<uint8_t>(255), 1, opset);                      // [-5.1, 0]
+    test_case(.02f, static_cast<uint8_t>(255), 1, opset, true);                // [-5.1, 0] contrib qdq
+    test_case(.02f, static_cast<uint8_t>(128), 1, opset);                      // [-2.56, 2.54]
+    test_case(.02f, static_cast<uint8_t>(128), 1, opset, true);                // [-2.56, 2.54] contrib qdq
+    test_case(.04f, static_cast<uint8_t>(31), 1, opset);                       // [-1.24, 8.96]
+    test_case(.04f, static_cast<uint8_t>(31), 1, opset, true);                 // [-1.24, 8.96] contrib qdq
   }
 
   // opset_version = 10