From d035fb42b46cbcc26eea6f802c56e76e33f71093 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sat, 7 Dec 2024 00:09:44 -0800 Subject: [PATCH 01/64] Copy shared utils into qnn ep --- cmake/onnxruntime_providers_qnn.cmake | 8 - cmake/onnxruntime_unittests.cmake | 3 +- .../qnn/builder/onnx_ctx_model_helper.cc | 7 +- .../qnn/builder/onnx_ctx_model_helper.h | 1 - .../core/providers/qnn/builder/op_builder.h | 1 - .../opbuilder/argmax_min_op_builder.cc | 4 +- .../qnn/builder/opbuilder/base_op_builder.cc | 3 +- .../qnn/builder/opbuilder/base_op_builder.h | 4 +- .../opbuilder/batch_norm_op_builder.cc | 3 +- .../qnn/builder/opbuilder/clip_op_builder.cc | 1 - .../qnn/builder/opbuilder/conv_op_builder.cc | 5 +- .../builder/opbuilder/expand_op_builder.cc | 1 - .../builder/opbuilder/gather_op_builder.cc | 3 +- .../qnn/builder/opbuilder/gemm_op_builder.cc | 5 +- .../opbuilder/instance_norm_op_builder.cc | 5 +- .../opbuilder/layer_norm_op_builder.cc | 3 +- .../qnn/builder/opbuilder/lrn_op_builder.cc | 6 +- .../qnn/builder/opbuilder/pad_op_builder.cc | 4 +- .../qnn/builder/opbuilder/pool_op_builder.cc | 9 +- .../builder/opbuilder/reduce_op_builder.cc | 5 +- .../builder/opbuilder/reshape_op_builder.cc | 3 +- .../builder/opbuilder/resize_op_builder.cc | 6 +- .../builder/opbuilder/simple_op_builder.cc | 15 +- .../qnn/builder/opbuilder/slice_op_builder.cc | 3 +- .../builder/opbuilder/softmax_op_builder.cc | 1 - .../qnn/builder/opbuilder/split_op_builder.cc | 4 +- .../qnn/builder/opbuilder/tile_op_builder.cc | 2 - .../providers/qnn/builder/opbuilder/topk.cc | 3 +- .../builder/opbuilder/transpose_op_builder.cc | 2 +- .../core/providers/qnn/builder/qnn_model.cc | 1 - .../qnn/builder/qnn_model_wrapper.cc | 1 - .../providers/qnn/builder/qnn_model_wrapper.h | 1 - .../qnn_node_group/conv_activation_fusion.cc | 5 +- .../qnn/builder/qnn_node_group/dq_q_fusion.cc | 1 - .../qnn_node_group/hardsigmoid_mul_fusion.cc | 3 +- .../core/providers/qnn/builder/qnn_utils.cc | 243 ++++++++++++++++++ 
.../core/providers/qnn/builder/qnn_utils.h | 52 ++++ .../providers/qnn/qnn_execution_provider.cc | 8 +- .../test/providers/qnn/qnn_ep_context_test.cc | 29 ++- onnxruntime/test/qnn_ctx_gen/main.cc | 29 ++- 40 files changed, 391 insertions(+), 102 deletions(-) diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index b68d84c23bb32..52ccdbf7c9ecc 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -3,13 +3,6 @@ add_compile_definitions(USE_QNN=1) - # These are shared utils, - # TODO, move to a separate lib when used by EPs other than QNN, NNAPI and CoreML - file(GLOB onnxruntime_providers_shared_utils_cc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.h" - "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.cc" - ) - file(GLOB_RECURSE onnxruntime_providers_qnn_ep_cc_srcs CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.h" @@ -23,7 +16,6 @@ ) set(onnxruntime_providers_qnn_cc_srcs - ${onnxruntime_providers_shared_utils_cc_srcs} ${onnxruntime_providers_qnn_ep_cc_srcs} ${onnxruntime_providers_qnn_builder_cc_srcs} ) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index e822f0a3655fc..306096db128a7 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1283,7 +1283,8 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) file(GLOB onnxruntime_qnn_ctx_gen_src CONFIGURE_DEPENDS ${onnxruntime_qnn_ctx_gen_src_patterns} - ) + ) + onnxruntime_add_executable(onnxruntime_qnn_ctx_gen ${onnxruntime_qnn_ctx_gen_src}) target_include_directories(onnxruntime_qnn_ctx_gen PRIVATE ${onnx_test_runner_src_dir} ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir} diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc index 57ae8c354abb7..d017d9503b8cc 100644 --- 
a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc @@ -3,6 +3,7 @@ #include "core/providers/qnn/builder/onnx_ctx_model_helper.h" #include "core/graph/constants.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model.h" #include @@ -17,7 +18,7 @@ bool GraphHasEpContextNode(const onnxruntime::GraphViewer& graph_viewer) { // and the source is QNN or QNNExecutionProvider. for (const auto& node : graph_viewer.Nodes()) { if (EPCONTEXT_OP == node.OpType()) { - NodeAttrHelper node_helper(node); + utils::NodeAttrHelper node_helper(node); std::string cache_source = node_helper.Get(SOURCE, ""); std::transform(cache_source.begin(), @@ -53,7 +54,7 @@ Status GetMainContextNode(const std::vectorOpType(), "Should only filter in the EPContext node."); - NodeAttrHelper node_helper(*ep_context_node); + utils::NodeAttrHelper node_helper(*ep_context_node); int64_t is_main_context = node_helper.Get(MAIN_CONTEXT, static_cast(0)); if (1 == is_main_context) { main_context_pos.push_back(static_cast(i)); @@ -89,7 +90,7 @@ Status GetEpContextFromMainNode(const onnxruntime::Node& main_context_node, QnnBackendManager* qnn_backend_manager, QnnModelLookupTable& qnn_models) { ORT_RETURN_IF_NOT(EPCONTEXT_OP == main_context_node.OpType(), "Should only filter in the EPContext node."); - NodeAttrHelper node_helper(main_context_node); + utils::NodeAttrHelper node_helper(main_context_node); bool is_embed_mode = node_helper.Get(EMBED_MODE, true); if (is_embed_mode) { const std::string& context_binary = node_helper.Get(EP_CACHE_CONTEXT, ""); diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h index f308a7456d46c..d6c65c2725211 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h @@ -9,7 +9,6 @@ 
#include "qnn_def.h" #include "core/common/logging/logging.h" #include "core/graph/graph_viewer.h" -#include "core/providers/shared/utils/utils.h" #include "core/graph/model.h" #include "core/framework/execution_provider.h" diff --git a/onnxruntime/core/providers/qnn/builder/op_builder.h b/onnxruntime/core/providers/qnn/builder/op_builder.h index 05398c3f22ea2..b729503320f05 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/op_builder.h @@ -5,7 +5,6 @@ #include "core/graph/graph_viewer.h" #include "core/framework/node_unit.h" -#include "core/providers/shared/utils/utils.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc index c685fa065e2ba..192c9496f0999 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc @@ -2,8 +2,8 @@ // Licensed under the MIT License. 
#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" @@ -57,7 +57,7 @@ Status ArgMaxMinOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_mode param_tensor_names.push_back(axis_param.GetParamTensorName()); qnn_model_wrapper.AddParamWrapper(std::move(axis_param)); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); auto select_last_index = node_helper.Get("select_last_index", static_cast(0)); if (select_last_index != 0) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN ArgMax/ArgMin only support select_last_index=0."); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc index ed70111087e19..af070fc01a279 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc @@ -6,7 +6,6 @@ #include -#include "core/providers/shared/utils/utils.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/cpu/tensor/transpose.h" #include "core/common/safeint.h" @@ -311,7 +310,7 @@ Status BaseOpBuilder::ProcessAxisAttribute(const QnnModelWrapper& qnn_model_wrap ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[0].node_arg, input_shape), "Cannot get shape"); auto rank = static_cast(input_shape.size()); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); int32_t onnx_axis = node_helper.Get("axis", default_axis_value); if (onnx_axis < 0) { onnx_axis += rank; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h index 055c0f6ccf2fa..20d3bac5964b7 100644 --- 
a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h @@ -3,7 +3,7 @@ #pragma once -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder.h" #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h" @@ -352,7 +352,7 @@ struct OnnxAttrInfo { }; template -inline ValType GetOnnxAttr(const NodeAttrHelper& node_helper, const OnnxAttrInfo& attr_info) { +inline ValType GetOnnxAttr(const qnn::utils::NodeAttrHelper& node_helper, const OnnxAttrInfo& attr_info) { return node_helper.Get(attr_info.name, attr_info.default_val); } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc index 07abcf1c7bf84..9c7f1d374e5b7 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc @@ -6,7 +6,6 @@ #include #include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/framework/float16.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" @@ -546,7 +545,7 @@ Status BatchNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, std::vector scale_double_tensor; std::vector bias_double_tensor; - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); const float epsilon = node_helper.Get("epsilon", 1e-05f); // Default is 1e-05 according to ONNX spec. 
double scale_rmax = std::numeric_limits::min(); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc index e5dc4d04afefd..aa6080eb1195d 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc @@ -5,7 +5,6 @@ #include #include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc index 12887f0fb72d6..2aeb8a47000c2 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. 
#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" @@ -97,7 +96,7 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, "QNN EP: Data type ", input_data_type->c_str(), " is not supported for Conv operator in CPU backend."); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); auto auto_pad = node_helper.Get("auto_pad", std::string("NOTSET")); ORT_RETURN_IF(auto_pad != "NOTSET" && auto_pad != "SAME_LOWER" && auto_pad != "SAME_UPPER", "QNN Conv operators do not support 'auto_pad' value: ", auto_pad.c_str()); @@ -539,7 +538,7 @@ Status ConvOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wra OnnxConvType conv_type = {}; ORT_RETURN_IF_ERROR(GetOnnxConvType(node_unit.OpType(), conv_type)); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); std::vector param_tensor_names; const auto& input_0 = node_unit.Inputs()[0]; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc index 64f676aaa9875..20978f41b529b 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. 
#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc index 3737fcb54f4cf..df02d12bd59c9 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc @@ -3,7 +3,6 @@ #include #include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" @@ -100,7 +99,7 @@ static Status GetInpu0AxisDimValue(const QnnModelWrapper& qnn_model_wrapper, "Cannot get shape for ", node_unit.OpType(), " input[0] ", input0.node_arg.Name()); int64_t rank = static_cast(input0_shape.size()); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); int64_t onnx_axis = node_helper.Get("axis", default_axis_value); if (onnx_axis < 0) { onnx_axis += rank; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc index eeee26c177281..20f2f4383044c 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. 
#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" @@ -36,7 +35,7 @@ class GemmOpBuilder : public BaseOpBuilder { }; Status GemmOpBuilder::ExplictOpCheck(const NodeUnit& node_unit) const { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); auto alpha = node_helper.Get("alpha", (float)1.0); if (alpha != 1.0) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN FullyConnected Op only support alpha=1.0."); @@ -79,7 +78,7 @@ Status GemmOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, // for Input A, B, C: 1 -- need transpose, 0 -- not needed std::vector input_trans_flag(3, 0); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); input_trans_flag.at(0) = node_helper.Get("transA", (int64_t)0); auto transB = node_helper.Get("transB", (int64_t)0); // QNN input_1 [m, n] vs Onnx [n, m] diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc index 4b8d079c0062a..53bc93e2fa832 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. 
#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" @@ -74,7 +73,7 @@ Status InstanceNormOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN InstanceNorm input 2 (bias) must have 1D shape [channel]."); } - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); const float epsilon = node_helper.Get("epsilon", 1e-05f); // Default is 1e-05 according to ONNX spec. if (epsilon <= 0.0f) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN InstanceNorm epsilon must be greater than 0.0"); @@ -160,7 +159,7 @@ Status InstanceNormOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_m std::vector&& input_names, const logging::Logger& logger, bool do_op_validation) const { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); std::vector param_tensor_names; const float epsilon = node_helper.Get("epsilon", 1e-05f); // Default is 1e-05 according to ONNX spec. 
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc index d1a0e88686f39..b0394be15aba2 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc @@ -3,7 +3,6 @@ #include #include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" @@ -114,7 +113,7 @@ Status LayerNormOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_mode std::vector&& input_names, const logging::Logger& logger, bool do_op_validation) const { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); std::vector param_tensor_names; const float epsilon = node_helper.Get("epsilon", 1e-05f); // Default is 1e-05 according to ONNX spec. diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc index 2f66069b6609e..dbb29557cccc4 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/common/safeint.h" @@ -75,7 +75,7 @@ Status LRNOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, ORT_RETURN_IF(output_shape != input_shape, "QNN EP: LRN operator's output must have the same shape as the input."); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); // 'size' attribute must be odd and > 0. const int64_t onnx_size = GetOnnxAttr(node_helper, onnx_size_attr); @@ -98,7 +98,7 @@ Status LRNOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrap const logging::Logger& logger, bool do_op_validation) const { std::vector param_tensor_names; - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); const int64_t onnx_size = GetOnnxAttr(node_helper, onnx_size_attr); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc index 5fc6d42a8a179..3035da2723907 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc @@ -2,11 +2,9 @@ // Licensed under the MIT License. 
#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" #include "core/common/safeint.h" #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" @@ -204,7 +202,7 @@ Status PadOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrap std::vector input_shape; ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[0].node_arg, input_shape), "Cannot get shape of input 0."); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); std::string mode = node_helper.Get("mode", "constant"); Qnn_Scalar_t mode_qnn_scalar = QNN_SCALAR_INIT; mode_qnn_scalar.dataType = QNN_DATATYPE_UINT_32; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc index ef1990ad8e69a..0ed11bed30929 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. 
#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" @@ -39,7 +38,7 @@ class PoolOpBuilder : public BaseOpBuilder { QnnQuantParamsWrapper& quant_param) const override ORT_MUST_USE_RESULT; private: - Status SetCommonPoolParams(const NodeAttrHelper& node_helper, std::vector& filter_size, + Status SetCommonPoolParams(const utils::NodeAttrHelper& node_helper, std::vector& filter_size, std::vector& pad_amount, std::vector& stride, int32_t& ceil_mode, std::vector&& input_shape, @@ -79,7 +78,7 @@ Status PoolOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, return Status::OK(); } - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); auto dilation_values = node_helper.Get("dilations", std::vector{1, 1}); if (dilation_values != std::vector{1, 1}) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN does not support Dilation attribute"); @@ -94,7 +93,7 @@ Status PoolOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, return Status::OK(); } -Status PoolOpBuilder::SetCommonPoolParams(const NodeAttrHelper& node_helper, +Status PoolOpBuilder::SetCommonPoolParams(const utils::NodeAttrHelper& node_helper, std::vector& filter_size, std::vector& pad_amount, std::vector& strides, int32_t& ceil_mode, @@ -155,7 +154,7 @@ Status PoolOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wra std::vector&& input_names, const logging::Logger& logger, bool do_op_validation) const { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); // Get the NCHW from input data, use HW for the pool filter size and pool stride const auto& inputs = node_unit.Inputs(); std::vector input_shape; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc 
b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc index 77bc58bd6f833..ce6654b3906d7 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc @@ -10,7 +10,6 @@ #include "onnx/defs/data_type_utils.h" #include "core/providers/common.h" #include "core/framework/endian_utils.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" @@ -102,7 +101,7 @@ Status ReduceOpBuilder::GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const Nod const int opset_axes_as_input = ReduceOpBuilder::opset_with_axes_as_input[reduce_op_type]; const int opset = node_unit.SinceVersion(); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); // Extract the axes values from either the attribute or initializer input (depending on opset). if (opset < opset_axes_as_input) { // Axes is in ONNX node attribute. @@ -212,7 +211,7 @@ Status ReduceOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, const Status ReduceOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector&& input_names, const logging::Logger& logger, bool do_op_validation) const { - NodeAttrHelper node_attr_helper(node_unit); + utils::NodeAttrHelper node_attr_helper(node_unit); std::vector param_tensor_names; // diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc index b6f414da950d8..c374a3c64b350 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. 
#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" @@ -39,7 +38,7 @@ Status ReshapeOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, std::vector& input_names, bool do_op_validation) const { if (do_op_validation) { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); auto allowzero = node_helper.Get("allowzero", static_cast(0)); if (0 != allowzero) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN Reshape doesn't support dynamic shape!"); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc index c62fca88b6ec2..6b1088e488c31 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc @@ -6,12 +6,10 @@ #include #include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" #include "core/common/safeint.h" #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" @@ -124,7 +122,7 @@ Status ResizeOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, } const bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType()); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); // QNN doesn't support anti-aliasing (added in opset 18) if (node_unit.SinceVersion() >= 18) { @@ -260,7 +258,7 @@ Status ResizeOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w const logging::Logger& logger, 
bool do_op_validation) const { std::vector param_tensor_names; - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); const auto& input_0 = node_unit.Inputs()[0]; std::vector input_shape; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc index a6c4203ad92e4..f23b6b240389d 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. #include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" @@ -143,7 +142,7 @@ Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper, const std::string& op_type = node_unit.OpType(); if (op_type == "GridSample") { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); std::string mode = node_helper.Get("mode", "linear"); ORT_RETURN_IF_NOT(utils::ArrayHasString(gridsample_supported_modes, mode), "GridSample does not support mode ", mode.c_str()); @@ -193,7 +192,7 @@ Status ProcessNodeAttribute(QnnModelWrapper& qnn_model_wrapper, const std::string& qnn_param_key, std::vector& param_tensor_names, const float default_value = 1.0f) { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); float attr_value = node_helper.Get(onnx_attr_key, default_value); Qnn_Scalar_t attr_qnn_scalar = QNN_SCALAR_INIT; attr_qnn_scalar.dataType = QNN_DATATYPE_FLOAT_32; @@ -209,7 +208,7 @@ Status ProcessNodeAttribute(QnnModelWrapper& qnn_model_wrapper, Status ProcessBlockSizeAttribute(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector& param_tensor_names) { - NodeAttrHelper 
node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); uint32_t block_size = node_helper.Get("blocksize", static_cast(0)); std::vector block_size_shape{2}; std::vector block_size_data(2, block_size); @@ -224,7 +223,7 @@ Status ProcessBlockSizeAttribute(QnnModelWrapper& qnn_model_wrapper, Status ProcessModeAttribute(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector& param_tensor_names) { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); std::string mode = node_helper.Get("mode", "DCR"); Qnn_Scalar_t mode_qnn_scalar = QNN_SCALAR_INIT; mode_qnn_scalar.dataType = QNN_DATATYPE_UINT_32; @@ -247,7 +246,7 @@ Status ProcessModeAttribute(QnnModelWrapper& qnn_model_wrapper, Status ProcessAlphaAttributeAsInput(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, const std::string input_name) { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); QnnQuantParamsWrapper quantize_param; Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; union { @@ -293,7 +292,7 @@ Status ProcessAlphaAttributeAsInput(QnnModelWrapper& qnn_model_wrapper, Status ProcessGridSampleAttributes(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector& param_tensor_names) { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); int64_t align_corners = node_helper.Get("align_corners", static_cast(0)); Qnn_Scalar_t align_corners_qnn_scalar = QNN_SCALAR_INIT; align_corners_qnn_scalar.dataType = QNN_DATATYPE_BOOL_8; @@ -373,7 +372,7 @@ Status SimpleOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w param_tensor_names.push_back(axis_param.GetParamTensorName()); qnn_model_wrapper.AddParamWrapper(std::move(axis_param)); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); int64_t norm_p_order = node_helper.Get("p", static_cast(2)); ORT_RETURN_IF(norm_p_order != 2, "QNN EP only 
supports LpNormalization with 'p' attribute equal to 2."); } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc index b033c8723ea86..13b106d3c1bde 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. #include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" @@ -63,7 +62,7 @@ void SliceOpBuilder::GetDataFromAttribute(const NodeUnit& node_unit, TensorShapeVector& raw_starts, TensorShapeVector& raw_ends, TensorShapeVector& raw_axes) const { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); auto starts = node_helper.Get("starts", std::vector{0}); raw_starts.assign(starts.begin(), starts.end()); auto ends = node_helper.Get("ends", std::vector{0}); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc index b62534bacf426..bc5339d90660e 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. 
#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc index ba5ad2cf03cef..f435b1d6d802f 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc @@ -2,12 +2,10 @@ // Licensed under the MIT License. #include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" #include "core/common/safeint.h" #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" @@ -98,7 +96,7 @@ Status SplitOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wr return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN doesn't support dynamic split"); } } else { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); if (node_helper.HasAttr("split")) { auto split_lengths = node_helper.Get("split", std::vector{0}); ConvertSplitLengthsToSplitIndices(split_lengths, split_index); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc index 851ca84dce075..e66c4cd350235 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc @@ -2,12 +2,10 @@ // Licensed under the MIT License. 
#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" #include "core/common/safeint.h" #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc index d22c0811682d0..9cb8f91a9db0b 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc @@ -3,6 +3,7 @@ #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/framework/utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { namespace qnn { const int TOPK_MIN_INPUT = 2; @@ -48,7 +49,7 @@ Status TopKOpBuilder::ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const N if (!qnn_model_wrapper.IsInitializerInput(input_1)) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "The number of top elements to retrieve must be specified as constant input."); } - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); auto largest = node_helper.Get("largest", 1); auto sorted = node_helper.Get("sorted", 1); if (0 == sorted) { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc index a42d7312f0203..1290a012d5902 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc @@ -45,7 +45,7 @@ Status TransposeOpBuilder::ProcessPermAttribute(QnnModelWrapper& qnn_model_wrapp 
transpose_perm[i] = rank - 1 - i; } - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); transpose_perm = node_helper.Get("perm", transpose_perm); auto perm_size = static_cast(transpose_perm.size()); std::vector perm_shape{perm_size}; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc index 88fa6429fc01e..75a02e3834567 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc @@ -8,7 +8,6 @@ #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group.h" -#include "core/providers/shared/utils/utils.h" #include "core/framework/utils.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" #include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc index 2c7f3c8b22ddd..20ec422774845 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc @@ -11,7 +11,6 @@ #include "qnn_model_wrapper.h" #include "core/common/safeint.h" #include "core/framework/tensorprotoutils.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h index f3e52050e79e0..9e308aa33a560 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h @@ -13,7 +13,6 @@ #include "core/common/logging/logging.h" #include "core/framework/node_unit.h" #include "core/graph/graph_viewer.h" -#include "core/providers/shared/utils/utils.h" #include 
"core/providers/qnn/builder/qnn_quant_params_wrapper.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc index 813bba8a5952b..76316250a88ad 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc @@ -8,7 +8,6 @@ #include #include "core/graph/graph_utils.h" #include "core/framework/node_unit.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group/utils.h" @@ -110,8 +109,8 @@ static bool CanClipBeRemoved(const QnnModelWrapper& qnn_model_wrapper, float clip_min = std::numeric_limits::lowest(); float clip_max = std::numeric_limits::max(); - if (!onnxruntime::GetClipMinMax(qnn_model_wrapper.GetGraphViewer(), clip_node_unit.GetNode(), - clip_min, clip_max, logger)) { + if (!qnn::utils::GetClipMinMax(qnn_model_wrapper.GetGraphViewer(), clip_node_unit.GetNode(), + clip_min, clip_max, logger)) { return false; } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc index caf4725626338..17af5725a01ee 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc @@ -8,7 +8,6 @@ #include #include "core/graph/graph_utils.h" #include "core/framework/node_unit.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group/utils.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc 
b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc index 76b1726646486..aceaf0399a6cb 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc @@ -8,7 +8,6 @@ #include #include "core/graph/graph_utils.h" #include "core/framework/node_unit.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" @@ -39,7 +38,7 @@ std::unique_ptr HardSigmoidMulFusion::TryFusion( return nullptr; } - NodeAttrHelper hs_attr_helper(hardsigmoid_node_unit); + utils::NodeAttrHelper hs_attr_helper(hardsigmoid_node_unit); float alpha = hs_attr_helper.Get("alpha", 0.2f); float beta = hs_attr_helper.Get("beta", 0.5f); constexpr float req_alpha = 1.0f / 6.0f; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index 8d2cb5bdb6da0..9457877ddfc93 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -9,8 +9,10 @@ #include "core/common/common.h" #include "core/framework/data_types.h" +#include "core/framework/tensorprotoutils.h" #include "qnn_utils.h" #include "core/providers/qnn/builder/qnn_def.h" +#include "core/graph/graph_viewer.h" namespace onnxruntime { namespace qnn { @@ -570,6 +572,247 @@ Status Quantize(const double double_value, return Status::OK(); } +static bool GetType(const NodeArg& node_arg, int32_t& type, const logging::Logger& logger) { + type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED; + const auto* type_proto = node_arg.TypeAsProto(); + if (!type_proto || !type_proto->has_tensor_type() || !type_proto->tensor_type().has_elem_type()) { + LOGS(logger, WARNING) << "NodeArg [" << node_arg.Name() << "] has no input type"; + return 
false; + } + + type = type_proto->tensor_type().elem_type(); + return true; +} + +NodeAttrHelper::NodeAttrHelper(const onnxruntime::Node& node) + : node_attributes_(node.GetAttributes()) {} + +NodeAttrHelper::NodeAttrHelper(const NodeUnit& node_unit) + : node_attributes_(node_unit.GetNode().GetAttributes()) {} + +float NodeAttrHelper::Get(const std::string& key, float def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return entry->second.f(); + } + + return def_val; +} + +int32_t NodeAttrHelper::Get(const std::string& key, int32_t def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return narrow(entry->second.i()); + } + + return def_val; +} + +uint32_t NodeAttrHelper::Get(const std::string& key, uint32_t def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return narrow(entry->second.i()); + } + + return def_val; +} + +int64_t NodeAttrHelper::Get(const std::string& key, int64_t def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return entry->second.i(); + } + + return def_val; +} + +const std::string& NodeAttrHelper::Get(const std::string& key, const std::string& def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + return entry->second.s(); + } + + return def_val; +} + +std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + const auto& attr = entry->second; + std::vector v; + v.reserve(static_cast(attr.ints_size())); + std::transform(attr.ints().cbegin(), attr.ints().cend(), std::back_inserter(v), + [](int64_t val) -> int32_t { return narrow(val); }); + return v; + } + + return def_val; +} + +std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { + if (auto entry = 
node_attributes_.find(key); entry != node_attributes_.end()) { + const auto& attr = entry->second; + std::vector v; + v.reserve(static_cast(attr.ints_size())); + std::transform(attr.ints().cbegin(), attr.ints().cend(), std::back_inserter(v), + [](int64_t val) -> uint32_t { return narrow(val); }); + return v; + } + + return def_val; +} + +std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + const auto& values = entry->second.ints(); + return std::vector{values.cbegin(), values.cend()}; + } + + return def_val; +} + +std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + const auto& values = entry->second.strings(); + return std::vector{values.cbegin(), values.cend()}; + } + + return def_val; +} + +std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + const auto& values = entry->second.floats(); + return std::vector{values.cbegin(), values.cend()}; + } + + return def_val; +} + +std::optional NodeAttrHelper::GetFloat(const std::string& key) const { + std::optional result; + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + result = entry->second.f(); + } + + return result; +} + +std::optional NodeAttrHelper::GetInt64(const std::string& key) const { + std::optional result; + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + result = entry->second.i(); + } + + return result; +} + +std::optional> NodeAttrHelper::GetFloats(const std::string& key) const { + std::optional> result; + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + const auto& values = entry->second.floats(); + result = std::vector(values.begin(), 
values.end()); + } + + return result; +} + +std::optional> NodeAttrHelper::GetInt64s(const std::string& key) const { + std::optional> result; + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + const auto& values = entry->second.ints(); + result = std::vector(values.begin(), values.end()); + } + + return result; +} + +std::optional NodeAttrHelper::GetString(const std::string& key) const { + std::optional result; + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + result = entry->second.s(); + } + + return result; +} + +bool NodeAttrHelper::HasAttr(const std::string& key) const { + return Contains(node_attributes_, key); +} +static bool GetClipMinMaxImpl(const GraphViewer& graph_viewer, const Node& node, float& min, float& max, + const logging::Logger& logger) { + const auto& node_name = node.Name(); + int32_t input_type; + if (!GetType(*node.InputDefs()[0], input_type, logger)) { + return false; + } + + min = std::numeric_limits::lowest(); + max = std::numeric_limits::max(); + + if (node.SinceVersion() < 11) { // Clip opset 1, 6 is using attributes for min/max + NodeAttrHelper helper(node); + // attributes will be always float + min = helper.Get("min", std::numeric_limits::lowest()); + max = helper.Get("max", std::numeric_limits::max()); + } else { + auto get_value = + [&](const ONNX_NAMESPACE::TensorProto* initializer, std::string_view type, float& value) -> bool { + if (!initializer) { + LOGS(logger, VERBOSE) << type << " input of Clip must be a constant initializer"; + return false; + } + + switch (input_type) { + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: { + auto status = onnxruntime::utils::UnpackTensor(*initializer, graph_viewer.ModelPath(), &value, 1); + if (!status.IsOK()) { + LOGS(logger, ERROR) << "GetClipMinMax() failed to unpack float initializer: " << status.ErrorMessage(); + return false; + } + break; + } + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: { + MLFloat16 f16_val{}; 
+ auto status = onnxruntime::utils::UnpackTensor(*initializer, graph_viewer.ModelPath(), &f16_val, 1); + if (!status.IsOK()) { + LOGS(logger, ERROR) << "GetClipMinMax() failed to unpack float16 initializer: " << status.ErrorMessage(); + return false; + } + value = f16_val.ToFloat(); + break; + } + default: + LOGS(logger, VERBOSE) << "GetClipMinMax() only supports float and float16 as min and max inputs for now." + << " The node [" << node_name << "] has input type: " << input_type; + return false; + } + + return true; + }; + + // min and max are both optional. could have neither, one or both. + if (node.InputDefs().size() > 1 && node.InputDefs()[1]->Exists()) { + // we have input min + const auto& min_name = node.InputDefs()[1]->Name(); + const auto* min_value = graph_viewer.GetConstantInitializer(min_name); + if (!get_value(min_value, "Min", min)) { + return false; + } + } + + if (node.InputDefs().size() > 2 && node.InputDefs()[2]->Exists()) { + // we have input max + const auto& max_name = node.InputDefs()[2]->Name(); + const auto* max_value = graph_viewer.GetConstantInitializer(max_name); + if (!get_value(max_value, "Max", max)) { + return false; + } + } + } + + return true; +} + +bool GetClipMinMax(const GraphViewer& graph_viewer, const Node& node, float& min, float& max, + const logging::Logger& logger) { + return GetClipMinMaxImpl(graph_viewer, node, min, max, logger); +} + } // namespace utils } // namespace qnn } // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index aa4a27460563f..0d69242958666 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -104,6 +104,58 @@ Status Quantize(const double double_value, const Qnn_DataType_t qnn_data_type, int& quant_value); +/** + * Wrapping onnxruntime::Node for retrieving attribute values + */ +class NodeAttrHelper { + public: + explicit 
NodeAttrHelper(const Node& node); + + // Get the attributes from the target node of the node_unit + explicit NodeAttrHelper(const NodeUnit& node_unit); + + /* + * Get with default + */ + float Get(const std::string& key, float def_val) const; + std::vector Get(const std::string& key, const std::vector& def_val) const; + + int64_t Get(const std::string& key, int64_t def_val) const; + std::vector Get(const std::string& key, const std::vector& def_val) const; + + const std::string& Get(const std::string& key, const std::string& def_val) const; + std::vector Get(const std::string& key, const std::vector& def_val) const; + + // Convert the i() or ints() of the attribute from int64_t to int32_t + int32_t Get(const std::string& key, int32_t def_val) const; + std::vector Get(const std::string& key, const std::vector& def_val) const; + + // Convert the i() or ints() of the attribute from int64_t to uint32_t + uint32_t Get(const std::string& key, uint32_t def_val) const; + std::vector Get(const std::string& key, const std::vector& def_val) const; + + /* + * Get without default. + */ + std::optional GetFloat(const std::string& key) const; + std::optional> GetFloats(const std::string& key) const; + + std::optional GetInt64(const std::string& key) const; + std::optional> GetInt64s(const std::string& key) const; + + std::optional GetString(const std::string& key) const; + + bool HasAttr(const std::string& key) const; + + private: + const NodeAttributes& node_attributes_; +}; + +// Get the min/max of a Clip operator. Reads values from attributes for opset < 11 and inputs after that. +// For opset 11+, if min/max are not constant initializers, will return false. +// For now we only support getting float min/max. 
+bool GetClipMinMax(const GraphViewer& graph_viewer, const Node& node, + float& min, float& max, const logging::Logger& logger); } // namespace utils } // namespace qnn } // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 6735528bebbf9..960fafd1fa2c4 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -16,7 +16,7 @@ #include "core/platform/env.h" #include "core/providers/common.h" #include "core/providers/partitioning_utils.h" -#include "core/providers/partitioning_utils.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group.h" @@ -565,7 +565,7 @@ QNNExecutionProvider::GetSupportedNodes(const GraphViewer& graph_viewer, static bool EpSharedContextsHasAllGraphs(const onnxruntime::GraphViewer& graph_viewer, const logging::Logger& logger) { for (const auto& node : graph_viewer.Nodes()) { - NodeAttrHelper node_helper(node); + qnn::utils::NodeAttrHelper node_helper(node); std::string cache_source = node_helper.Get(qnn::SOURCE, ""); std::transform(cache_source.begin(), @@ -591,7 +591,7 @@ static bool EpSharedContextsHasAllGraphs(const std::vectorName(); @@ -615,7 +615,7 @@ static void PartitionCtxModel(const onnxruntime::GraphViewer& graph_viewer, std::vector> supported_groups{}; for (const auto& node : graph_viewer.Nodes()) { - NodeAttrHelper node_helper(node); + qnn::utils::NodeAttrHelper node_helper(node); std::string cache_source = node_helper.Get(qnn::SOURCE, ""); std::transform(cache_source.begin(), diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index a3f0ed55b83f2..38fde332ca992 100644 --- 
a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -7,7 +7,6 @@ #include "core/session/onnxruntime_cxx_api.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "core/session/inference_session.h" -#include "core/providers/shared/utils/utils.h" #include "test/providers/qnn/qnn_test_utils.h" @@ -25,6 +24,24 @@ namespace test { #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) +static int64_t GetNodeAttr(const Node& node, const std::string& attr_name, int64_t default_val) { + const auto& attributes = node.GetAttributes(); + if (auto entry = attributes.find(attr_name); entry != attributes.end()) { + return entry->second.i(); + } + + return default_val; +} + +static const std::string& GetNodeAttr(const Node& node, const std::string& attr_name, const std::string& default_val) { + const auto& attributes = node.GetAttributes(); + if (auto entry = attributes.find(attr_name); entry != attributes.end()) { + return entry->second.s(); + } + + return default_val; +} + // Create a model with FusedMatMul + Add (quantized) // input1 -> Add -> Q -> DQ \ // FusedMatMul -> Q -> DQ -> output @@ -873,10 +890,9 @@ static void GetLastContextBinaryFileName(const std::string last_onnx_ctx_file, auto& ctx_graph = ctx_model->MainGraph(); for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { - NodeAttrHelper node_helper(node); - int64_t is_main_context = node_helper.Get("main_context", static_cast(0)); + int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast(0)); if (1 == is_main_context) { - last_ctx_bin_file = node_helper.Get("ep_cache_context", ""); + last_ctx_bin_file = GetNodeAttr(node, "ep_cache_context", ""); return; } } @@ -899,10 +915,9 @@ static void UpdateEpContextModel(const std::vector& ep_ctx_files, for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { - NodeAttrHelper node_helper(node); - int64_t is_main_context 
= node_helper.Get("main_context", static_cast(0)); + int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast(0)); if (1 == is_main_context) { - std::string old_qnn_ctx_binary_file_name = node_helper.Get("ep_cache_context", ""); + std::string old_qnn_ctx_binary_file_name = GetNodeAttr(node, "ep_cache_context", ""); auto file_path = path.replace_filename(old_qnn_ctx_binary_file_name); std::remove(file_path.string().c_str()); node.ClearAttribute("ep_cache_context"); diff --git a/onnxruntime/test/qnn_ctx_gen/main.cc b/onnxruntime/test/qnn_ctx_gen/main.cc index d568d5e78688a..b7b01cd6fbc20 100644 --- a/onnxruntime/test/qnn_ctx_gen/main.cc +++ b/onnxruntime/test/qnn_ctx_gen/main.cc @@ -16,7 +16,6 @@ #include "core/common/logging/sinks/clog_sink.h" #include "core/graph/model.h" -#include "core/providers/shared/utils/utils.h" #include "core/session/environment.h" #include "core/common/logging/logging.h" @@ -31,6 +30,24 @@ static void CheckStatus(const Status& status) { } } +static int64_t GetNodeAttr(const Node& node, const std::string& attr_name, int64_t default_val) { + const auto& attributes = node.GetAttributes(); + if (auto entry = attributes.find(attr_name); entry != attributes.end()) { + return entry->second.i(); + } + + return default_val; +} + +static const std::string& GetNodeAttr(const Node& node, const std::string& attr_name, const std::string& default_val) { + const auto& attributes = node.GetAttributes(); + if (auto entry = attributes.find(attr_name); entry != attributes.end()) { + return entry->second.s(); + } + + return default_val; +} + // from the last context cache Onnx model, find the EPContext node with main_context=1, // and get the QNN context binary file name, this context binary contains all graphs from all Onnx models static void GetLastContextBinaryFileName(const std::basic_string last_onnx_ctx_file, @@ -41,10 +58,9 @@ static void GetLastContextBinaryFileName(const std::basic_string last auto& ctx_graph = ctx_model->MainGraph(); 
for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { - NodeAttrHelper node_helper(node); - int64_t is_main_context = node_helper.Get("main_context", static_cast(0)); + int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast(0)); if (1 == is_main_context) { - last_ctx_bin_file = node_helper.Get("ep_cache_context", ""); + last_ctx_bin_file = GetNodeAttr(node, "ep_cache_context", ""); return; } } @@ -67,10 +83,9 @@ static void UpdateEpContextModel(const std::vector> for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { - NodeAttrHelper node_helper(node); - int64_t is_main_context = node_helper.Get("main_context", static_cast(0)); + int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast(0)); if (1 == is_main_context) { - std::string old_qnn_ctx_binary_file_name = node_helper.Get("ep_cache_context", ""); + std::string old_qnn_ctx_binary_file_name = GetNodeAttr(node, "ep_cache_context", ""); auto file_path = path.replace_filename(old_qnn_ctx_binary_file_name); std::remove(file_path.string().c_str()); node.ClearAttribute("ep_cache_context"); From 7e46a7de63686141cb580bf4918d15624ec85880 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Mon, 9 Dec 2024 19:17:15 -0800 Subject: [PATCH 02/64] Update QNN EP's initializer transpose logic to use only functions exposed by the provider bridge. 
--- .../qnn/builder/opbuilder/base_op_builder.cc | 59 ++++++++++++------- .../qnn/builder/qnn_model_wrapper.cc | 24 ++------ .../core/providers/qnn/builder/qnn_utils.h | 29 +++++++++ 3 files changed, 71 insertions(+), 41 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc index af070fc01a279..3c0ca0dab2da3 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc @@ -274,29 +274,46 @@ Status BaseOpBuilder::TransposeInitializer(const QnnModelWrapper& qnn_model_wrap const onnx::TensorProto& initializer, const std::vector& perm, std::vector& transposed_data) const { - const DataTypeImpl* tensor_dtype = DataTypeImpl::TensorTypeFromONNXEnum(initializer.data_type())->GetElementType(); - const auto tensor_shape_dims = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); - TensorShape tensor_shape{tensor_shape_dims}; - AllocatorPtr cpu_allocator = std::make_shared(); - Tensor in_tensor = Tensor(tensor_dtype, tensor_shape, cpu_allocator); - + int32_t onnx_type = initializer.data_type(); + const DataTypeImpl* tensor_dtype = DataTypeImpl::TensorTypeFromONNXEnum(onnx_type)->GetElementType(); + const TensorShape in_tensor_shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); + + // Unpack initializer data into an input Tensor. + size_t tensor_data_size = Tensor::CalculateTensorStorageSize(tensor_dtype, in_tensor_shape); + std::vector input_tensor_data(tensor_data_size); + ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(initializer, + qnn_model_wrapper.GetGraphViewer().ModelPath(), + input_tensor_data)); + Tensor in_tensor(tensor_dtype, in_tensor_shape, input_tensor_data.data(), OrtMemoryInfo{}); + + // Determine the new transposed shape. 
auto rank = perm.size(); - std::vector new_tensor_shape_dims; - std::vector permutations; - new_tensor_shape_dims.reserve(rank); - permutations.reserve(rank); - for (int64_t p : perm) { - permutations.push_back(p); - new_tensor_shape_dims.push_back(tensor_shape_dims[p]); + std::vector out_tensor_shape_dims; + out_tensor_shape_dims.reserve(rank); + for (size_t p : perm) { + out_tensor_shape_dims.push_back(in_tensor_shape[p]); + } + const TensorShape out_tensor_shape = TensorShape::FromExistingBuffer(out_tensor_shape_dims); + + // Create an output tensor that does not own the pre-allocated `transposed_data` buffer. + // DoTranspose() will write the new transposed elements directly into the `transposed_data` buffer. + // We do this to eliminate unnecessary weight copies. + transposed_data.resize(tensor_data_size); + Tensor out_tensor(tensor_dtype, out_tensor_shape, transposed_data.data(), OrtMemoryInfo{}); + ORT_RETURN_IF_ERROR(Transpose::DoTranspose(perm, in_tensor, out_tensor)); + + // If this is an int4, we need to unpack it because QNN treats int4 as a full int8. + // TODO: Improve memory usage! Transpose::DoTranspose() internally copies Tensor to Tensor, + // does the transpose, and then copies the result to a new Tensor. Afterwards, QNN EP will unpack + // the new Tensor back to 8-bits. This is wasteful. A better approach would be for QNN EP to do the following: + // - Explicitly unpack Tensor to Tensor + // - Call Transpose::DoTranspose() with the Tensor. This generates a new transposed Tensor. + // - Clear the top 4-bits to zero for every int8 element in the transposed Tensor. 
+ if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_INT4) { + ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(out_tensor_shape.Size(), transposed_data)); + } else if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_UINT4) { + ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(out_tensor_shape.Size(), transposed_data)); } - - TensorShape new_tensor_shape(new_tensor_shape_dims); - Tensor out_tensor = Tensor(tensor_dtype, new_tensor_shape, cpu_allocator); - ORT_RETURN_IF_ERROR(onnxruntime::utils::TensorProtoToTensor( - Env::Default(), qnn_model_wrapper.GetGraphViewer().ModelPath(), initializer, in_tensor)); - ORT_RETURN_IF_ERROR(Transpose::DoTranspose(permutations, in_tensor, out_tensor)); - onnx::TensorProto new_tensor_proto = onnxruntime::utils::TensorToTensorProto(out_tensor, "test"); - ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(new_tensor_proto, transposed_data)); return Status::OK(); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc index 20ec422774845..a6bd17e75b6c0 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc @@ -624,28 +624,12 @@ Status QnnModelWrapper::UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& // If this is an int4, we need to unpack it because QNN treats int4 as a full int8. 
if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT4) { TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); - const size_t num_elems = shape.Size(); - std::vector packed_int4_bytes = std::move(unpacked_tensor); - unpacked_tensor = std::vector(num_elems); - - auto dst = gsl::make_span(reinterpret_cast(unpacked_tensor.data()), unpacked_tensor.size()); - auto src = gsl::make_span(reinterpret_cast(packed_int4_bytes.data()), packed_int4_bytes.size()); - ORT_RETURN_IF_NOT(Int4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN"); - - // NOTE: Masking off top 4 bits to workaround a QNN INT4 accuracy bug. - // Docs explicitly state that masking off top 4 bits should not be required. - for (size_t i = 0; i < dst.size(); i++) { - dst[i] &= 0x0F; // -3 (0b1111_1101) becomes 13 (0b0000_1101) - } + const size_t num_int4_elems = shape.Size(); + ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(num_int4_elems, unpacked_tensor)); } else if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_UINT4) { TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); - const size_t num_elems = shape.Size(); - std::vector packed_int4_bytes = std::move(unpacked_tensor); - unpacked_tensor = std::vector(num_elems); - - auto dst = gsl::make_span(reinterpret_cast(unpacked_tensor.data()), unpacked_tensor.size()); - auto src = gsl::make_span(reinterpret_cast(packed_int4_bytes.data()), packed_int4_bytes.size()); - ORT_RETURN_IF_NOT(UInt4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN"); + const size_t num_uint4_elems = shape.Size(); + ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(num_uint4_elems, unpacked_tensor)); } return Status::OK(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index 0d69242958666..11ecf57ada357 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -104,6 
+104,35 @@ Status Quantize(const double double_value, const Qnn_DataType_t qnn_data_type, int& quant_value); +// Re-writes a buffer of packed 4-bit elements to a buffer of unpacked 8-bit elements. +// QNN requires that 4-bit weights are unpacked to 8-bit. +template +Status UnpackInt4ToInt8(size_t num_int4_elems, std::vector& data_bytes) { + if constexpr (Signed) { // INT4 + std::vector packed_int4_bytes = std::move(data_bytes); + data_bytes = std::vector(num_int4_elems); + + auto dst = gsl::make_span(reinterpret_cast(data_bytes.data()), data_bytes.size()); + auto src = gsl::make_span(reinterpret_cast(packed_int4_bytes.data()), packed_int4_bytes.size()); + ORT_RETURN_IF_NOT(Int4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN"); + + // NOTE: Masking off top 4 bits to workaround a QNN INT4 accuracy bug. + // Docs explicitly state that masking off top 4 bits should not be required, but we have to do it. + for (size_t i = 0; i < dst.size(); i++) { + dst[i] &= 0x0F; // -3 (0b1111_1101) becomes 13 (0b0000_1101) + } + } else { // UINT4 + std::vector packed_uint4_bytes = std::move(data_bytes); + data_bytes = std::vector(num_int4_elems); + + auto dst = gsl::make_span(reinterpret_cast(data_bytes.data()), data_bytes.size()); + auto src = gsl::make_span(reinterpret_cast(packed_uint4_bytes.data()), packed_uint4_bytes.size()); + ORT_RETURN_IF_NOT(UInt4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN"); + } + + return Status::OK(); +} + /** * Wrapping onnxruntime::Node for retrieving attribute values */ From a155b33b8e2ff40854e8146857aecfd7a18501d5 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Mon, 9 Dec 2024 19:19:18 -0800 Subject: [PATCH 03/64] Update comment --- .../core/providers/qnn/builder/opbuilder/base_op_builder.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc index 
3c0ca0dab2da3..f8a6c1c602fe9 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc @@ -304,11 +304,11 @@ Status BaseOpBuilder::TransposeInitializer(const QnnModelWrapper& qnn_model_wrap // If this is an int4, we need to unpack it because QNN treats int4 as a full int8. // TODO: Improve memory usage! Transpose::DoTranspose() internally copies Tensor to Tensor, - // does the transpose, and then copies the result to a new Tensor. Afterwards, QNN EP will unpack + // does the transpose in 8-bits, and then copies the result back to a new Tensor. Afterwards, QNN EP unpacks // the new Tensor back to 8-bits. This is wasteful. A better approach would be for QNN EP to do the following: - // - Explicitly unpack Tensor to Tensor + // - Explicitly unpack Tensor to Tensor in QNN EP. // - Call Transpose::DoTranspose() with the Tensor. This generates a new transposed Tensor. - // - Clear the top 4-bits to zero for every int8 element in the transposed Tensor. + // - Clear the top 4-bits to zero for every int8 element in the transposed Tensor. [ONLY if signed int4] if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_INT4) { ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(out_tensor_shape.Size(), transposed_data)); } else if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_UINT4) { From e9c5f1420078cfbc5d48a080c0cd7556031e718e Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Tue, 10 Dec 2024 10:53:10 -0800 Subject: [PATCH 04/64] Added TransposeBase::DoTranspose() to provider bridge. 
May elect to revert this in favor of doing the transpose manually in QNN EP --- onnxruntime/core/providers/cpu/cpu_provider_shared.cc | 8 ++++++++ onnxruntime/core/providers/cpu/cpu_provider_shared.h | 5 +++++ .../providers/qnn/builder/opbuilder/base_op_builder.cc | 4 ++-- onnxruntime/core/providers/shared_library/provider_api.h | 1 + .../providers/shared_library/provider_bridge_provider.cc | 7 +++++++ 5 files changed, 23 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/cpu/cpu_provider_shared.cc b/onnxruntime/core/providers/cpu/cpu_provider_shared.cc index ce9780031a250..3778c89a00e19 100644 --- a/onnxruntime/core/providers/cpu/cpu_provider_shared.cc +++ b/onnxruntime/core/providers/cpu/cpu_provider_shared.cc @@ -23,6 +23,7 @@ #include "core/providers/cpu/tensor/slice.h" #include "core/providers/cpu/tensor/onehot.h" #include "core/providers/cpu/tensor/tile.h" +#include "core/providers/cpu/tensor/transpose.h" #include "core/providers/cpu/tensor/gather_elements.h" #include "core/providers/cpu/tensor/unsqueeze.h" #include "core/providers/cpu/tensor/upsamplebase.h" @@ -81,6 +82,13 @@ struct ProviderHostCPUImpl : ProviderHostCPU { Status NonMaxSuppressionBase__PrepareCompute(OpKernelContext* ctx, PrepareContext& pc) override { return NonMaxSuppressionBase::PrepareCompute(ctx, pc); } Status NonMaxSuppressionBase__GetThresholdsFromInputs(const PrepareContext& pc, int64_t& max_output_boxes_per_class, float& iou_threshold, float& score_threshold) override { return NonMaxSuppressionBase::GetThresholdsFromInputs(pc, max_output_boxes_per_class, iou_threshold, score_threshold); } + // TransposeBase (direct) + Status TransposeBase__DoTranspose(const gsl::span& permutations, const Tensor& input, Tensor& output, + const TensorShape* input_shape_override, + concurrency::ThreadPool* tp) override { + return TransposeBase::DoTranspose(permutations, input, output, input_shape_override, tp); + } + #if defined(USE_CUDA) || defined(USE_ROCM) // From cpu/tensor/size.h 
(direct) Status Size__Compute(const Size* p, OpKernelContext* context) override { return p->Size::Compute(context); } diff --git a/onnxruntime/core/providers/cpu/cpu_provider_shared.h b/onnxruntime/core/providers/cpu/cpu_provider_shared.h index eb1569c3e499e..ce7cd6155c38c 100644 --- a/onnxruntime/core/providers/cpu/cpu_provider_shared.h +++ b/onnxruntime/core/providers/cpu/cpu_provider_shared.h @@ -38,6 +38,11 @@ struct ProviderHostCPU { virtual Status NonMaxSuppressionBase__PrepareCompute(OpKernelContext* ctx, PrepareContext& pc) = 0; virtual Status NonMaxSuppressionBase__GetThresholdsFromInputs(const PrepareContext& pc, int64_t& max_output_boxes_per_class, float& iou_threshold, float& score_threshold) = 0; + // TransposeBase + virtual Status TransposeBase__DoTranspose(const gsl::span& permutations, const Tensor& input, Tensor& output, + const TensorShape* input_shape_override, + concurrency::ThreadPool* tp) = 0; + #if defined(USE_CUDA) || defined(USE_ROCM) // From cpu/tensor/size.h diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc index f8a6c1c602fe9..d9d83ba085859 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc @@ -300,10 +300,10 @@ Status BaseOpBuilder::TransposeInitializer(const QnnModelWrapper& qnn_model_wrap // We do this to eliminate unnecessary weight copies. transposed_data.resize(tensor_data_size); Tensor out_tensor(tensor_dtype, out_tensor_shape, transposed_data.data(), OrtMemoryInfo{}); - ORT_RETURN_IF_ERROR(Transpose::DoTranspose(perm, in_tensor, out_tensor)); + ORT_RETURN_IF_ERROR(TransposeBase::DoTranspose(perm, in_tensor, out_tensor)); // If this is an int4, we need to unpack it because QNN treats int4 as a full int8. - // TODO: Improve memory usage! 
Transpose::DoTranspose() internally copies Tensor to Tensor, + // TODO: Reduce copies for INT4! Transpose::DoTranspose() internally copies Tensor to Tensor, // does the transpose in 8-bits, and then copies the result back to a new Tensor. Afterwards, QNN EP unpacks // the new Tensor back to 8-bits. This is wasteful. A better approach would be for QNN EP to do the following: // - Explicitly unpack Tensor to Tensor in QNN EP. diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index b84825236a453..35014b39335f2 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -184,6 +184,7 @@ class GatherBase; class Size; class SliceBase; class SplitBase; +class TransposeBase; class TensorShape; struct Prepare; struct PrepareContext; diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index d3b12f9728135..92e5eb1ed5eb0 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -23,6 +23,7 @@ #include "core/providers/cpu/tensor/split.h" #include "core/providers/cpu/tensor/size.h" #include "core/providers/cpu/tensor/scatter_nd.h" +#include "core/providers/cpu/tensor/transpose.h" #include "core/providers/cpu/tensor/unsqueeze.h" #include "core/providers/cpu/tensor/upsamplebase.h" #include "core/providers/cpu/tensor/tile.h" @@ -513,6 +514,12 @@ Status NonMaxSuppressionBase::GetThresholdsFromInputs(const PrepareContext& pc, Status GatherBase::PrepareForCompute(OpKernelContext* context, GatherBase::Prepare& p) const { return g_host_cpu.GatherBase__PrepareForCompute(this, context, reinterpret_cast(p)); } Status UnsqueezeBase::PrepareCompute(OpKernelContext* ctx, UnsqueezeBase::Prepare& p) const { return 
g_host_cpu.UnsqueezeBase__PrepareCompute(this, ctx, reinterpret_cast(p)); } +Status TransposeBase::DoTranspose(const gsl::span& permutations, const Tensor& input, Tensor& output, + const TensorShape* input_shape_override, + concurrency::ThreadPool* tp) { + return g_host_cpu.TransposeBase__DoTranspose(permutations, input, output, input_shape_override, tp); +} + #if defined(USE_CUDA) || defined(USE_ROCM) bool TileOp::IsTileMemcpy(const TensorShape& input_shape, const int64_t* repeats, size_t rank, bool& is_batched_memcpy, size_t& num_of_elements_per_batch, size_t& num_of_copies_per_batch, size_t& num_of_batch_copies) { return g_host_cpu.TileOp__IsTileMemcpy(input_shape, repeats, rank, is_batched_memcpy, num_of_elements_per_batch, num_of_copies_per_batch, num_of_batch_copies); From d0f64dc1d69395e5408037c012d775c8e528c11e Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Tue, 10 Dec 2024 11:05:36 -0800 Subject: [PATCH 05/64] Add TypeProto_Tensor_has_elem_type() to provider bridge --- onnxruntime/core/providers/shared_library/provider_interfaces.h | 1 + .../core/providers/shared_library/provider_wrappedtypes.h | 1 + onnxruntime/core/session/provider_bridge_ort.cc | 1 + 3 files changed, 3 insertions(+) diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index f9f2bb69a9d1a..dc3f5e60f2745 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -325,6 +325,7 @@ struct ProviderHost { virtual bool TypeProto_Tensor__has_shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual const ONNX_NAMESPACE::TensorShapeProto& TypeProto_Tensor__shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual ONNX_NAMESPACE::TensorShapeProto* TypeProto_Tensor__mutable_shape(ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; + virtual bool TypeProto_Tensor__has_elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* 
p) = 0; virtual int32_t TypeProto_Tensor__elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual void TypeProto_Tensor__set_elem_type(ONNX_NAMESPACE::TypeProto_Tensor* p, int32_t value) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index a82ddfe64c64b..0efa3833a978b 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -304,6 +304,7 @@ struct TypeProto_Tensor final { bool has_shape() const { return g_host->TypeProto_Tensor__has_shape(this); } const TensorShapeProto& shape() const { return g_host->TypeProto_Tensor__shape(this); } TensorShapeProto* mutable_shape() { return g_host->TypeProto_Tensor__mutable_shape(this); } + bool has_elem_type() const { return g_host->TypeProto_Tensor__has_elem_type(this); } int32_t elem_type() const { return g_host->TypeProto_Tensor__elem_type(this); } void set_elem_type(int32_t value) { g_host->TypeProto_Tensor__set_elem_type(this, value); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index d55fd34d5a8f2..29ffd9487925b 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -423,6 +423,7 @@ struct ProviderHostImpl : ProviderHost { bool TypeProto_Tensor__has_shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->has_shape(); } const ONNX_NAMESPACE::TensorShapeProto& TypeProto_Tensor__shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->shape(); } ONNX_NAMESPACE::TensorShapeProto* TypeProto_Tensor__mutable_shape(ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->mutable_shape(); } + bool TypeProto_Tensor__has_elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->has_elem_type(); } int32_t TypeProto_Tensor__elem_type(const 
ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->elem_type(); } void TypeProto_Tensor__set_elem_type(ONNX_NAMESPACE::TypeProto_Tensor* p, int32_t value) override { p->set_elem_type(value); }; From f8bd2f6338fdee7c739922aef79557f416c712b0 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Tue, 10 Dec 2024 12:51:26 -0800 Subject: [PATCH 06/64] Add to provider bridge: TensorTypeBase class, TensorTypeBase::GetElementType(), DataTypeImpl::TensorTypeFromONNXEnum() --- onnxruntime/core/providers/shared_library/provider_api.h | 1 + .../core/providers/shared_library/provider_interfaces.h | 5 +++++ .../core/providers/shared_library/provider_wrappedtypes.h | 7 +++++++ onnxruntime/core/session/provider_bridge_ort.cc | 7 +++++++ 4 files changed, 20 insertions(+) diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 35014b39335f2..6e17947af3389 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -169,6 +169,7 @@ class OpKernel; struct OpKernelContext; struct OpKernelInfo; struct PrimitiveDataTypeBase; +struct TensorTypeBase; struct OrtRunOptions; struct Tensor; struct SparseTensor; diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index dc3f5e60f2745..dfe46e0ee32b5 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -677,6 +677,9 @@ struct ProviderHost { virtual int32_t PrimitiveDataTypeBase__GetNumSubElems(const PrimitiveDataTypeBase* p) = 0; virtual bool PrimitiveDataTypeBase__HasSubElems(const PrimitiveDataTypeBase* p) = 0; + // TensorTypeBase + virtual MLDataType TensorTypeBase__GetElementType(const TensorTypeBase* p) = 0; + // DataTypeImpl virtual MLDataType DataTypeImpl__GetType_Tensor() = 0; #if 
!defined(DISABLE_SPARSE_TENSORS) @@ -795,6 +798,8 @@ struct ProviderHost { virtual size_t DataTypeImpl__Size(const DataTypeImpl* p) = 0; virtual const PrimitiveDataTypeBase* DataTypeImpl__AsPrimitiveDataType(const DataTypeImpl* p) = 0; + virtual const TensorTypeBase* DataTypeImpl__TensorTypeFromONNXEnum(int type) = 0; + // Function virtual const Graph& Function__Body(const Function* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 0efa3833a978b..04818245d146f 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -703,6 +703,12 @@ struct PrimitiveDataTypeBase final { PROVIDER_DISALLOW_ALL(PrimitiveDataTypeBase) }; +struct TensorTypeBase final { + MLDataType GetElementType() const { return g_host->TensorTypeBase__GetElementType(this); } + + PROVIDER_DISALLOW_ALL(TensorTypeBase) +}; + class DataTypeImpl final { public: size_t Size() const { return g_host->DataTypeImpl__Size(this); } @@ -759,6 +765,7 @@ class DataTypeImpl final { const PrimitiveDataTypeBase* AsPrimitiveDataType() const { return g_host->DataTypeImpl__AsPrimitiveDataType(this); } + static const TensorTypeBase* TensorTypeFromONNXEnum(int type) { return g_host->DataTypeImpl__TensorTypeFromONNXEnum(type); } static const char* ToString(MLDataType type) { return g_host->DataTypeImpl__ToString(type); } PROVIDER_DISALLOW_ALL(DataTypeImpl) diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 29ffd9487925b..a3a12e1ba32a2 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -812,6 +812,9 @@ struct ProviderHostImpl : ProviderHost { int32_t PrimitiveDataTypeBase__GetNumSubElems(const PrimitiveDataTypeBase* p) override { return p->GetNumSubElems(); } bool 
PrimitiveDataTypeBase__HasSubElems(const PrimitiveDataTypeBase* p) override { return p->HasSubElems(); } + // TensorTypeBase (wrapped) + MLDataType TensorTypeBase__GetElementType(const TensorTypeBase* p) override { return p->GetElementType(); } + // DataTypeImpl (wrapped) MLDataType DataTypeImpl__GetType_Tensor() override { return DataTypeImpl::GetType(); } #if !defined(DISABLE_SPARSE_TENSORS) @@ -932,6 +935,10 @@ struct ProviderHostImpl : ProviderHost { size_t DataTypeImpl__Size(const DataTypeImpl* p) override { return p->Size(); } const PrimitiveDataTypeBase* DataTypeImpl__AsPrimitiveDataType(const DataTypeImpl* p) override { return p->AsPrimitiveDataType(); } + const TensorTypeBase* DataTypeImpl__TensorTypeFromONNXEnum(int type) override { + return DataTypeImpl::TensorTypeFromONNXEnum(type); + } + // Function (wrapped) const Graph& Function__Body(const Function* p) override { return p->Body(); } From 1f533a99a5eb9e6af955971b2f2c72c7d32e964f Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 11 Dec 2024 03:12:15 -0800 Subject: [PATCH 07/64] Transpose initializers within QNN EP without using CPU EP utils --- .../core/providers/cpu/cpu_provider_shared.cc | 8 - .../core/providers/cpu/cpu_provider_shared.h | 5 - .../qnn/builder/opbuilder/base_op_builder.cc | 229 ++++++++++++++---- .../qnn/builder/opbuilder/base_op_builder.h | 42 ++-- .../qnn/builder/opbuilder/conv_op_builder.cc | 26 +- .../qnn/builder/qnn_model_wrapper.cc | 4 +- .../core/providers/qnn/builder/qnn_utils.cc | 36 +++ .../core/providers/qnn/builder/qnn_utils.h | 28 +++ .../provider_bridge_provider.cc | 7 - 9 files changed, 284 insertions(+), 101 deletions(-) diff --git a/onnxruntime/core/providers/cpu/cpu_provider_shared.cc b/onnxruntime/core/providers/cpu/cpu_provider_shared.cc index 3778c89a00e19..ce9780031a250 100644 --- a/onnxruntime/core/providers/cpu/cpu_provider_shared.cc +++ b/onnxruntime/core/providers/cpu/cpu_provider_shared.cc @@ -23,7 +23,6 @@ #include 
"core/providers/cpu/tensor/slice.h" #include "core/providers/cpu/tensor/onehot.h" #include "core/providers/cpu/tensor/tile.h" -#include "core/providers/cpu/tensor/transpose.h" #include "core/providers/cpu/tensor/gather_elements.h" #include "core/providers/cpu/tensor/unsqueeze.h" #include "core/providers/cpu/tensor/upsamplebase.h" @@ -82,13 +81,6 @@ struct ProviderHostCPUImpl : ProviderHostCPU { Status NonMaxSuppressionBase__PrepareCompute(OpKernelContext* ctx, PrepareContext& pc) override { return NonMaxSuppressionBase::PrepareCompute(ctx, pc); } Status NonMaxSuppressionBase__GetThresholdsFromInputs(const PrepareContext& pc, int64_t& max_output_boxes_per_class, float& iou_threshold, float& score_threshold) override { return NonMaxSuppressionBase::GetThresholdsFromInputs(pc, max_output_boxes_per_class, iou_threshold, score_threshold); } - // TransposeBase (direct) - Status TransposeBase__DoTranspose(const gsl::span& permutations, const Tensor& input, Tensor& output, - const TensorShape* input_shape_override, - concurrency::ThreadPool* tp) override { - return TransposeBase::DoTranspose(permutations, input, output, input_shape_override, tp); - } - #if defined(USE_CUDA) || defined(USE_ROCM) // From cpu/tensor/size.h (direct) Status Size__Compute(const Size* p, OpKernelContext* context) override { return p->Size::Compute(context); } diff --git a/onnxruntime/core/providers/cpu/cpu_provider_shared.h b/onnxruntime/core/providers/cpu/cpu_provider_shared.h index ce7cd6155c38c..eb1569c3e499e 100644 --- a/onnxruntime/core/providers/cpu/cpu_provider_shared.h +++ b/onnxruntime/core/providers/cpu/cpu_provider_shared.h @@ -38,11 +38,6 @@ struct ProviderHostCPU { virtual Status NonMaxSuppressionBase__PrepareCompute(OpKernelContext* ctx, PrepareContext& pc) = 0; virtual Status NonMaxSuppressionBase__GetThresholdsFromInputs(const PrepareContext& pc, int64_t& max_output_boxes_per_class, float& iou_threshold, float& score_threshold) = 0; - // TransposeBase - virtual Status 
TransposeBase__DoTranspose(const gsl::span& permutations, const Tensor& input, Tensor& output, - const TensorShape* input_shape_override, - concurrency::ThreadPool* tp) = 0; - #if defined(USE_CUDA) || defined(USE_ROCM) // From cpu/tensor/size.h diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc index d9d83ba085859..06b02a5e5e31b 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc @@ -270,54 +270,199 @@ Status BaseOpBuilder::SetOutputQParamEqualToInputIfNearlyEqual(QnnModelWrapper& return Status::OK(); } -Status BaseOpBuilder::TransposeInitializer(const QnnModelWrapper& qnn_model_wrapper, - const onnx::TensorProto& initializer, - const std::vector& perm, - std::vector& transposed_data) const { - int32_t onnx_type = initializer.data_type(); - const DataTypeImpl* tensor_dtype = DataTypeImpl::TensorTypeFromONNXEnum(onnx_type)->GetElementType(); - const TensorShape in_tensor_shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); - - // Unpack initializer data into an input Tensor. - size_t tensor_data_size = Tensor::CalculateTensorStorageSize(tensor_dtype, in_tensor_shape); - std::vector input_tensor_data(tensor_data_size); - ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(initializer, - qnn_model_wrapper.GetGraphViewer().ModelPath(), - input_tensor_data)); - Tensor in_tensor(tensor_dtype, in_tensor_shape, input_tensor_data.data(), OrtMemoryInfo{}); - - // Determine the new transposed shape. - auto rank = perm.size(); - std::vector out_tensor_shape_dims; - out_tensor_shape_dims.reserve(rank); - for (size_t p : perm) { - out_tensor_shape_dims.push_back(in_tensor_shape[p]); +// Internal function to transpose input from either (N,C,H,W,D) or (C,N,H,W,D) to (H,W,D,C,N). 
+static Status TransposeToHwdcn(const TensorShape& input_shape, + gsl::span perm, + size_t elem_byte_size, + gsl::span input_buffer, + gsl::span output_buffer) { + const size_t rank = input_shape.NumDimensions(); + ORT_RETURN_IF_NOT(rank == 5 && perm.size() == 5, "Invalid input tensor rank"); + std::vector perm_inverse(perm.size()); + ORT_RETURN_IF_ERROR(qnn::utils::InvertPerm(perm, perm_inverse)); + + std::vector output_shape_dims(rank); + ORT_RETURN_IF_ERROR((qnn::utils::PermuteShape(input_shape.GetDims(), perm, output_shape_dims))); + const TensorShape output_shape = TensorShape::FromExistingBuffer(output_shape_dims); + + std::array src_strides = {}; + for (size_t i = 0; i < rank; ++i) { + int64_t stride = (i < rank - 1) ? input_shape.SizeFromDimension(i + 1) : 1; + ORT_RETURN_IF_NOT(stride > 0, "Expected positive shape dims when computing strides."); + src_strides[i] = static_cast(stride); } - const TensorShape out_tensor_shape = TensorShape::FromExistingBuffer(out_tensor_shape_dims); - - // Create an output tensor that does not own the pre-allocated `transposed_data` buffer. - // DoTranspose() will write the new transposed elements directly into the `transposed_data` buffer. - // We do this to eliminate unnecessary weight copies. - transposed_data.resize(tensor_data_size); - Tensor out_tensor(tensor_dtype, out_tensor_shape, transposed_data.data(), OrtMemoryInfo{}); - ORT_RETURN_IF_ERROR(TransposeBase::DoTranspose(perm, in_tensor, out_tensor)); - - // If this is an int4, we need to unpack it because QNN treats int4 as a full int8. - // TODO: Reduce copies for INT4! Transpose::DoTranspose() internally copies Tensor to Tensor, - // does the transpose in 8-bits, and then copies the result back to a new Tensor. Afterwards, QNN EP unpacks - // the new Tensor back to 8-bits. This is wasteful. A better approach would be for QNN EP to do the following: - // - Explicitly unpack Tensor to Tensor in QNN EP. - // - Call Transpose::DoTranspose() with the Tensor. 
This generates a new transposed Tensor. - // - Clear the top 4-bits to zero for every int8 element in the transposed Tensor. [ONLY if signed int4] - if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_INT4) { - ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(out_tensor_shape.Size(), transposed_data)); - } else if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_UINT4) { - ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(out_tensor_shape.Size(), transposed_data)); + + std::array dst_strides = {}; + for (size_t i = 0; i < rank; ++i) { + int64_t stride = (i < rank - 1) ? output_shape.SizeFromDimension(i + 1) : 1; + ORT_RETURN_IF_NOT(stride > 0, "Expected positive shape dims when computing strides."); + dst_strides[i] = static_cast(stride); + } + + for (int64_t d0 = 0; d0 < input_shape[0]; ++d0) { + for (int64_t d1 = 0; d1 < input_shape[1]; ++d1) { + for (int64_t d2 = 0; d2 < input_shape[2]; ++d2) { + for (int64_t d3 = 0; d3 < input_shape[3]; ++d3) { + for (int64_t d4 = 0; d4 < input_shape[4]; ++d4) { + const size_t src_elem_index = ((d0 * src_strides[0]) + + (d1 * src_strides[1]) + + (d2 * src_strides[2]) + + (d3 * src_strides[3]) + + (d4 * src_strides[4])); + const size_t dst_elem_index = ((d0 * dst_strides[perm_inverse[0]]) + + (d1 * dst_strides[perm_inverse[1]]) + + (d2 * dst_strides[perm_inverse[2]]) + + (d3 * dst_strides[perm_inverse[3]]) + + (d4 * dst_strides[perm_inverse[4]])); + + const size_t src_byte_index = src_elem_index * elem_byte_size; + const size_t dst_byte_index = dst_elem_index * elem_byte_size; + assert(src_byte_index < input_buffer.size()); + assert(dst_byte_index < output_buffer.size()); + + std::memcpy(&output_buffer[dst_byte_index], &input_buffer[src_byte_index], elem_byte_size); + } + } + } + } + } + + return Status::OK(); +} + +Status BaseOpBuilder::TwoDimensionTranspose(const QnnModelWrapper& qnn_model_wrapper, + std::vector& data_shape, + const onnx::TensorProto& initializer, + std::vector& transposed_data) const { + 
ORT_RETURN_IF_NOT(data_shape.size() == 2, "Expected shape of rank 2"); + + std::array perm = {1, 0}; + std::vector output_shape(data_shape.size()); + ORT_RETURN_IF_ERROR((qnn::utils::PermuteShape(data_shape, perm, output_shape))); + + auto onnx_type = static_cast(initializer.data_type()); + const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type); + ORT_RETURN_IF_NOT(elem_byte_size != 0, "Can't get element byte size from given ONNX type"); + + std::vector input_buffer; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer)); + transposed_data.resize(input_buffer.size()); + + for (size_t row = 0; row < data_shape[0]; row++) { + for (size_t col = 0; col < data_shape[1]; col++) { + const size_t src_elem_index = (row * data_shape[1] + col); + const size_t dst_elem_index = (col * output_shape[1] + row); + const size_t src_byte_index = src_elem_index * elem_byte_size; + const size_t dst_byte_index = dst_elem_index * elem_byte_size; + assert(src_byte_index < input_buffer.size()); + assert(dst_byte_index < transposed_data.size()); + + std::memcpy(&transposed_data[dst_byte_index], &input_buffer[src_byte_index], elem_byte_size); + } } + data_shape = std::move(output_shape); // Update parameter with final transposed shape return Status::OK(); } +Status BaseOpBuilder::TransposeFromNchwToHwcn(const QnnModelWrapper& qnn_model_wrapper, + const onnx::TensorProto& initializer, + std::vector& transposed_data, + bool is_3d) const { + auto onnx_type = static_cast(initializer.data_type()); + const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type); + ORT_RETURN_IF_NOT(elem_byte_size != 0, "Can't get element byte size from given ONNX type"); + + std::vector input_shape = qnn::utils::GetInitializerShape(initializer); + ORT_RETURN_IF_NOT((is_3d && input_shape.size() == 5) || (!is_3d && input_shape.size() == 4), + "Unexpected rank: only support rank 4 or rank 5 input shapes"); + + if (!is_3d) { + input_shape.push_back(1); 
// Make it 3D by making shape (N,C,H,W,1) + } + + std::vector input_buffer; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer)); + transposed_data.resize(input_buffer.size()); + + return TransposeToHwdcn(TensorShape::FromExistingBuffer(input_shape), + nchw2hwcn_perm_3d, + elem_byte_size, + input_buffer, + transposed_data); +} + +Status BaseOpBuilder::TransposeFromNchwToHwcn(std::vector input_shape_dims, + size_t elem_byte_size, + gsl::span input_buffer, + gsl::span output_buffer, + bool is_3d) const { + const size_t rank = input_shape_dims.size(); + ORT_RETURN_IF_NOT((is_3d && rank == 5) || (!is_3d && rank == 4), "Invalid input tensor rank"); + ORT_RETURN_IF_NOT(input_buffer.size() == output_buffer.size(), + "Expected input_buffer.size() == output_buffer.size()"); + ORT_RETURN_IF_NOT(elem_byte_size != 0, "Can't get element byte size from given ONNX type"); + + if (!is_3d) { + input_shape_dims.push_back(1); // Make it 3D by making shape (N,C,H,W,1) + } + + return TransposeToHwdcn(TensorShape::FromExistingBuffer(input_shape_dims), + nchw2hwcn_perm_3d, + elem_byte_size, + input_buffer, + output_buffer); +} + +Status BaseOpBuilder::TransposeFromCnhwToHwcn(const QnnModelWrapper& qnn_model_wrapper, + const onnx::TensorProto& initializer, + std::vector& transposed_data, + bool is_3d) const { + auto onnx_type = static_cast(initializer.data_type()); + const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type); + ORT_RETURN_IF_NOT(elem_byte_size != 0, "Can't get element byte size from given ONNX type"); + + std::vector input_shape = qnn::utils::GetInitializerShape(initializer); + ORT_RETURN_IF_NOT((is_3d && input_shape.size() == 5) || (!is_3d && input_shape.size() == 4), + "Unexpected rank: only support rank 4 or rank 5 input shapes"); + + if (!is_3d) { + input_shape.push_back(1); // Make it 3D by making shape (C,N,H,W,1) + } + + std::vector input_buffer; + 
ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer)); + transposed_data.resize(input_buffer.size()); + + return TransposeToHwdcn(TensorShape::FromExistingBuffer(input_shape), + cnhw2hwcn_perm_3d, + elem_byte_size, + input_buffer, + transposed_data); +} + +Status BaseOpBuilder::TransposeFromCnhwToHwcn(std::vector input_shape_dims, + size_t elem_byte_size, + gsl::span input_buffer, + gsl::span output_buffer, + bool is_3d) const { + const size_t rank = input_shape_dims.size(); + ORT_RETURN_IF_NOT((is_3d && rank == 5) || (!is_3d && rank == 4), "Invalid input tensor rank"); + ORT_RETURN_IF_NOT(input_buffer.size() == output_buffer.size(), + "Expected input_buffer.size() == output_buffer.size()"); + ORT_RETURN_IF_NOT(elem_byte_size != 0, "Can't get element byte size from given ONNX type"); + + if (!is_3d) { + input_shape_dims.push_back(1); // Make it 3D by making shape (C,N,H,W,1) + } + + return TransposeToHwdcn(TensorShape::FromExistingBuffer(input_shape_dims), + cnhw2hwcn_perm_3d, + elem_byte_size, + input_buffer, + output_buffer); +} + Status BaseOpBuilder::ProcessAxisAttribute(const QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, Qnn_Scalar_t& axis_qnn_scalar, diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h index 20d3bac5964b7..b2bb3f043eecd 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h @@ -215,7 +215,8 @@ class BaseOpBuilder : public IOpBuilder { } // NCHW shape to channel last - Status NchwShapeToNhwc(const std::vector& nchw_shape, std::vector& nhwc_shape) const { + template + Status NchwShapeToNhwc(gsl::span nchw_shape, gsl::span nhwc_shape) const { ORT_RETURN_IF_NOT(nchw_shape.size() == 4, "shape should have 4 dimension NCHW."); nhwc_shape[0] = nchw_shape[0]; nhwc_shape[1] = nchw_shape[2]; @@ -226,7 +227,8 @@ 
class BaseOpBuilder : public IOpBuilder { } // NCHW shape to HWCN shape, required for Conv weight - Status NchwShapeToHwcn(const std::vector& nchw_shape, std::vector& hwcn_shape) const { + template + Status NchwShapeToHwcn(gsl::span nchw_shape, gsl::span hwcn_shape) const { if (nchw_shape.size() == 4) { hwcn_shape[0] = nchw_shape[2]; hwcn_shape[1] = nchw_shape[3]; @@ -246,7 +248,8 @@ class BaseOpBuilder : public IOpBuilder { } // CNHW shape to HWCN shape, required for Conv weight - Status CnhwShapeToHwcn(const std::vector& cnhw_shape, std::vector& hwcn_shape) const { + template + Status CnhwShapeToHwcn(gsl::span cnhw_shape, gsl::span hwcn_shape) const { if (cnhw_shape.size() == 4) { hwcn_shape[0] = cnhw_shape[2]; hwcn_shape[1] = cnhw_shape[3]; @@ -264,37 +267,32 @@ class BaseOpBuilder : public IOpBuilder { return Status::OK(); } - Status TransposeInitializer(const QnnModelWrapper& qnn_model_wrapper, - const onnx::TensorProto& initializer, - const std::vector& perm, - std::vector& transposed_data) const; Status TransposeFromNchwToHwcn(const QnnModelWrapper& qnn_model_wrapper, const onnx::TensorProto& initializer, std::vector& transposed_data, - bool is_3d = false) const { - auto& perm = is_3d ? nchw2hwcn_perm_3d : nchw2hwcn_perm; - return TransposeInitializer(qnn_model_wrapper, initializer, perm, transposed_data); - } + bool is_3d = false) const; + + Status TransposeFromNchwToHwcn(std::vector input_shape_dims, + size_t elem_byte_size, + gsl::span input_buffer, + gsl::span output_buffer, + bool is_3d = false) const; Status TransposeFromCnhwToHwcn(const QnnModelWrapper& qnn_model_wrapper, const onnx::TensorProto& initializer, std::vector& transposed_data, - bool is_3d = false) const { - auto& perm = is_3d ? 
cnhw2hwcn_perm_3d : cnhw2hwcn_perm; - return TransposeInitializer(qnn_model_wrapper, initializer, perm, transposed_data); - } + bool is_3d = false) const; + Status TransposeFromCnhwToHwcn(std::vector input_shape_dims, + size_t elem_byte_size, + gsl::span input_buffer, + gsl::span output_buffer, + bool is_3d = false) const; Status TwoDimensionTranspose(const QnnModelWrapper& qnn_model_wrapper, std::vector& data_shape, const onnx::TensorProto& initializer, - std::vector& transposed_data) const { - auto tmp = data_shape[0]; - data_shape[0] = data_shape[1]; - data_shape[1] = tmp; - std::vector two_dim_trans_perm{1, 0}; - return TransposeInitializer(qnn_model_wrapper, initializer, two_dim_trans_perm, transposed_data); - } + std::vector& transposed_data) const; // Onnx Pads is [x1_begin, x2_begin, x1_end, x2_end], QNN requires [x1_begin, x1_end, x2_begin, x2_end] void ReArranagePads(std::vector& pads) const { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc index 2aeb8a47000c2..f0c6f53affecd 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc @@ -210,9 +210,9 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper, // Change shape to HWCN, it could be initializer or normal input if (conv_type == OnnxConvType::kConv) { - ORT_RETURN_IF_ERROR(NchwShapeToHwcn(input_info.shape, actual_shape)); + ORT_RETURN_IF_ERROR(NchwShapeToHwcn(input_info.shape, actual_shape)); } else if (conv_type == OnnxConvType::kConvTranspose) { - ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(input_info.shape, actual_shape)); + ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(input_info.shape, actual_shape)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str()); } @@ -412,9 +412,9 @@ Status 
ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, // Create the final shape after the weights are transposed to HWCN. if (conv_type == OnnxConvType::kConv) { - ORT_RETURN_IF_ERROR(NchwShapeToHwcn(shape_2d, final_shape)); + ORT_RETURN_IF_ERROR(NchwShapeToHwcn(shape_2d, final_shape)); } else if (conv_type == OnnxConvType::kConvTranspose) { - ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(shape_2d, final_shape)); + ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(shape_2d, final_shape)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str()); } @@ -433,15 +433,11 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, return static_cast(dim); }); - const TensorShape tensor_shape = TensorShape::FromExistingBuffer(shape_2d_int64); // Does not own shape data. - const DataTypeImpl* tensor_dtype = DataTypeImpl::TensorTypeFromONNXEnum( - input_info.initializer_tensor->data_type()) - ->GetElementType(); - ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, unpacked_tensor)); - - Tensor tensor_2d(tensor_dtype, tensor_shape, unpacked_tensor.data(), OrtMemoryInfo{}); // Does not own data. - ONNX_NAMESPACE::TensorProto reshaped_initializer = onnxruntime::utils::TensorToTensorProto(tensor_2d, - reshape_output); + std::vector original_tensor_bytes; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, original_tensor_bytes)); + unpacked_tensor.resize(original_tensor_bytes.size()); + size_t elem_byte_size = qnn::utils::GetElementSizeByType( + static_cast(input_info.initializer_tensor->data_type())); // The reshape (unsqueeze) may require us to shift the quant parameter's axis. if (input_info.quant_param.IsPerChannel()) { @@ -452,9 +448,9 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, // Get transposed initializer bytes. 
// if (conv_type == OnnxConvType::kConv) { - ORT_RETURN_IF_ERROR(TransposeFromNchwToHwcn(qnn_model_wrapper, reshaped_initializer, unpacked_tensor)); + ORT_RETURN_IF_ERROR(TransposeFromNchwToHwcn(shape_2d_int64, elem_byte_size, original_tensor_bytes, unpacked_tensor)); } else if (conv_type == OnnxConvType::kConvTranspose) { - ORT_RETURN_IF_ERROR(TransposeFromCnhwToHwcn(qnn_model_wrapper, reshaped_initializer, unpacked_tensor)); + ORT_RETURN_IF_ERROR(TransposeFromCnhwToHwcn(shape_2d_int64, elem_byte_size, original_tensor_bytes, unpacked_tensor)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str()); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc index a6bd17e75b6c0..e7f4c37eaef66 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc @@ -623,11 +623,11 @@ Status QnnModelWrapper::UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& // If this is an int4, we need to unpack it because QNN treats int4 as a full int8. 
if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT4) { - TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); + TensorShape shape(qnn::utils::GetInitializerShape(initializer)); const size_t num_int4_elems = shape.Size(); ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(num_int4_elems, unpacked_tensor)); } else if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_UINT4) { - TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); + TensorShape shape(qnn::utils::GetInitializerShape(initializer)); const size_t num_uint4_elems = shape.Size(); ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(num_uint4_elems, unpacked_tensor)); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index 9457877ddfc93..f6c30e85b53f9 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -65,6 +65,42 @@ size_t GetElementSizeByType(ONNXTensorElementDataType elem_type) { return pos->second; } +size_t GetElementSizeByType(ONNX_NAMESPACE::TensorProto_DataType onnx_type) { + switch (onnx_type) { + case ONNX_NAMESPACE::TensorProto_DataType_INT4: + return sizeof(Int4x2); + case ONNX_NAMESPACE::TensorProto_DataType_UINT4: + return sizeof(UInt4x2); + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + return sizeof(int8_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT8: + return sizeof(uint8_t); + case ONNX_NAMESPACE::TensorProto_DataType_INT16: + return sizeof(int16_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT16: + return sizeof(uint16_t); + case ONNX_NAMESPACE::TensorProto_DataType_INT32: + return sizeof(int32_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT32: + return sizeof(uint32_t); + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + return sizeof(int64_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT64: + return sizeof(uint64_t); + case 
ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: + return 2; + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: + return sizeof(float); + case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE: + return sizeof(double); + case ONNX_NAMESPACE::TensorProto_DataType_BOOL: + return sizeof(bool); + default: + return 0; + } + // Unreachable +} + std::ostream& operator<<(std::ostream& out, const Qnn_Scalar_t& scalar) { switch (scalar.dataType) { case QNN_DATATYPE_INT_8: diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index 11ecf57ada357..c76c99b8454ef 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -11,6 +11,7 @@ #include "QnnTypes.h" #include "core/session/onnxruntime_cxx_api.h" #include "core/framework/node_unit.h" +#include "core/framework/tensor_shape.h" #include "core/util/qmath.h" namespace onnxruntime { @@ -22,6 +23,8 @@ size_t GetElementSizeByType(const Qnn_DataType_t& data_type); size_t GetElementSizeByType(ONNXTensorElementDataType elem_type); +size_t GetElementSizeByType(ONNX_NAMESPACE::TensorProto_DataType onnx_type); + // TODO: make these work with Wrappers? 
std::ostream& operator<<(std::ostream& out, const Qnn_Param_t& qnn_param); std::ostream& operator<<(std::ostream& out, const Qnn_Tensor_t& tensor); @@ -133,6 +136,31 @@ Status UnpackInt4ToInt8(size_t num_int4_elems, std::vector& data_bytes) return Status::OK(); } +template +std::vector GetInitializerShape(const ONNX_NAMESPACE::TensorProto& tensor_proto) { + const auto& dims = tensor_proto.dims(); + std::vector tensor_shape_vec(static_cast(dims.size())); + for (int i = 0; i < dims.size(); ++i) { + tensor_shape_vec[i] = static_cast(dims[i]); + } + + return tensor_shape_vec; +} + +template +Status PermuteShape(gsl::span input_shape, gsl::span perm, gsl::span output_shape) { + const size_t rank = input_shape.size(); + ORT_RETURN_IF_NOT(rank == perm.size() && rank == output_shape.size(), + "PermuteShape(): expect all arguments to have the same rank."); + + for (size_t i = 0; i < rank; ++i) { + size_t p = static_cast(perm[i]); + output_shape[i] = input_shape[p]; + } + + return Status::OK(); +} + /** * Wrapping onnxruntime::Node for retrieving attribute values */ diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index 92e5eb1ed5eb0..d3b12f9728135 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -23,7 +23,6 @@ #include "core/providers/cpu/tensor/split.h" #include "core/providers/cpu/tensor/size.h" #include "core/providers/cpu/tensor/scatter_nd.h" -#include "core/providers/cpu/tensor/transpose.h" #include "core/providers/cpu/tensor/unsqueeze.h" #include "core/providers/cpu/tensor/upsamplebase.h" #include "core/providers/cpu/tensor/tile.h" @@ -514,12 +513,6 @@ Status NonMaxSuppressionBase::GetThresholdsFromInputs(const PrepareContext& pc, Status GatherBase::PrepareForCompute(OpKernelContext* context, GatherBase::Prepare& p) const { return 
g_host_cpu.GatherBase__PrepareForCompute(this, context, reinterpret_cast(p)); } Status UnsqueezeBase::PrepareCompute(OpKernelContext* ctx, UnsqueezeBase::Prepare& p) const { return g_host_cpu.UnsqueezeBase__PrepareCompute(this, ctx, reinterpret_cast(p)); } -Status TransposeBase::DoTranspose(const gsl::span& permutations, const Tensor& input, Tensor& output, - const TensorShape* input_shape_override, - concurrency::ThreadPool* tp) { - return g_host_cpu.TransposeBase__DoTranspose(permutations, input, output, input_shape_override, tp); -} - #if defined(USE_CUDA) || defined(USE_ROCM) bool TileOp::IsTileMemcpy(const TensorShape& input_shape, const int64_t* repeats, size_t rank, bool& is_batched_memcpy, size_t& num_of_elements_per_batch, size_t& num_of_copies_per_batch, size_t& num_of_batch_copies) { return g_host_cpu.TileOp__IsTileMemcpy(input_shape, repeats, rank, is_batched_memcpy, num_of_elements_per_batch, num_of_copies_per_batch, num_of_batch_copies); From ccaefb3ed899a2e044bd288faa0e28cf34cdf1a2 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 11 Dec 2024 03:17:57 -0800 Subject: [PATCH 08/64] Rename transpose func --- .../qnn/builder/opbuilder/base_op_builder.cc | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc index 06b02a5e5e31b..7690427416770 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc @@ -271,11 +271,11 @@ Status BaseOpBuilder::SetOutputQParamEqualToInputIfNearlyEqual(QnnModelWrapper& } // Internal function to transpose input from either (N,C,H,W,D) or (C,N,H,W,D) to (H,W,D,C,N). 
-static Status TransposeToHwdcn(const TensorShape& input_shape, - gsl::span perm, - size_t elem_byte_size, - gsl::span input_buffer, - gsl::span output_buffer) { +static Status TransposeDataRank5(const TensorShape& input_shape, + gsl::span perm, + size_t elem_byte_size, + gsl::span input_buffer, + gsl::span output_buffer) { const size_t rank = input_shape.NumDimensions(); ORT_RETURN_IF_NOT(rank == 5 && perm.size() == 5, "Invalid input tensor rank"); std::vector perm_inverse(perm.size()); @@ -385,11 +385,11 @@ Status BaseOpBuilder::TransposeFromNchwToHwcn(const QnnModelWrapper& qnn_model_w ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer)); transposed_data.resize(input_buffer.size()); - return TransposeToHwdcn(TensorShape::FromExistingBuffer(input_shape), - nchw2hwcn_perm_3d, - elem_byte_size, - input_buffer, - transposed_data); + return TransposeDataRank5(TensorShape::FromExistingBuffer(input_shape), + nchw2hwcn_perm_3d, + elem_byte_size, + input_buffer, + transposed_data); } Status BaseOpBuilder::TransposeFromNchwToHwcn(std::vector input_shape_dims, @@ -407,11 +407,11 @@ Status BaseOpBuilder::TransposeFromNchwToHwcn(std::vector input_shape_d input_shape_dims.push_back(1); // Make it 3D by making shape (N,C,H,W,1) } - return TransposeToHwdcn(TensorShape::FromExistingBuffer(input_shape_dims), - nchw2hwcn_perm_3d, - elem_byte_size, - input_buffer, - output_buffer); + return TransposeDataRank5(TensorShape::FromExistingBuffer(input_shape_dims), + nchw2hwcn_perm_3d, + elem_byte_size, + input_buffer, + output_buffer); } Status BaseOpBuilder::TransposeFromCnhwToHwcn(const QnnModelWrapper& qnn_model_wrapper, @@ -434,11 +434,11 @@ Status BaseOpBuilder::TransposeFromCnhwToHwcn(const QnnModelWrapper& qnn_model_w ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer)); transposed_data.resize(input_buffer.size()); - return TransposeToHwdcn(TensorShape::FromExistingBuffer(input_shape), - cnhw2hwcn_perm_3d, - 
elem_byte_size, - input_buffer, - transposed_data); + return TransposeDataRank5(TensorShape::FromExistingBuffer(input_shape), + cnhw2hwcn_perm_3d, + elem_byte_size, + input_buffer, + transposed_data); } Status BaseOpBuilder::TransposeFromCnhwToHwcn(std::vector input_shape_dims, @@ -456,11 +456,11 @@ Status BaseOpBuilder::TransposeFromCnhwToHwcn(std::vector input_shape_d input_shape_dims.push_back(1); // Make it 3D by making shape (C,N,H,W,1) } - return TransposeToHwdcn(TensorShape::FromExistingBuffer(input_shape_dims), - cnhw2hwcn_perm_3d, - elem_byte_size, - input_buffer, - output_buffer); + return TransposeDataRank5(TensorShape::FromExistingBuffer(input_shape_dims), + cnhw2hwcn_perm_3d, + elem_byte_size, + input_buffer, + output_buffer); } Status BaseOpBuilder::ProcessAxisAttribute(const QnnModelWrapper& qnn_model_wrapper, From fb765c7f237524b44c5fe68a6de6b43c7af500f3 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 11 Dec 2024 03:22:05 -0800 Subject: [PATCH 09/64] Remove TransposeBase forward declaration from provider bridge --- onnxruntime/core/providers/shared_library/provider_api.h | 1 - 1 file changed, 1 deletion(-) diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 6e17947af3389..d68dbf8ce3ec5 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -185,7 +185,6 @@ class GatherBase; class Size; class SliceBase; class SplitBase; -class TransposeBase; class TensorShape; struct Prepare; struct PrepareContext; From 0237bca5a5f61e6be8ad674e3aac4996cc45c07e Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 11 Dec 2024 03:41:59 -0800 Subject: [PATCH 10/64] Rewrite SliceOpBuilder util GetInitializerInputData() to not use functions not available in the provider bridge. 
--- .../qnn/builder/opbuilder/slice_op_builder.cc | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc index 13b106d3c1bde..e383e71d2a497 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc @@ -85,26 +85,22 @@ static Status GetInitializerInputData(const NodeUnitIODef& input, const QnnModel ORT_RETURN_IF_NOT(initializer_proto->has_data_type(), "Expected initializer ", input_name.c_str(), " to have a proto data type."); - // Create empty Tensor. - const auto* dtype = DataTypeImpl::TensorTypeFromONNXEnum(initializer_proto->data_type())->GetElementType(); - TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(*initializer_proto); - Tensor tensor(dtype, shape, std::make_shared()); - - // Deserialize initializer into Tensor. - ORT_RETURN_IF_ERROR(onnxruntime::utils::TensorProtoToTensor( - onnxruntime::Env::Default(), qnn_model_wrapper.GetGraphViewer().ModelPath(), *initializer_proto, tensor)); + // Deserialize initializer into byte buffer + std::vector initializer_bytes; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*initializer_proto, initializer_bytes)); Status status; // Copy Tensor of int32_t or int64_t elems into output (int64_ts). 
- if (tensor.IsDataType()) { - gsl::span tensor_elems = tensor.DataAsSpan(); + auto onnx_type = static_cast(initializer_proto->data_type()); + if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + gsl::span tensor_elems = ReinterpretAsSpan(initializer_bytes); output.insert(output.end(), tensor_elems.begin(), tensor_elems.end()); - } else if (tensor.IsDataType()) { - gsl::span tensor_elems = tensor.DataAsSpan(); + } else if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_INT32) { + gsl::span tensor_elems = ReinterpretAsSpan(initializer_bytes); output.insert(output.end(), tensor_elems.begin(), tensor_elems.end()); } else { - status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Data type ", DataTypeImpl::ToString(dtype), + status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Data type ", onnx_type, " is not supported for Slice initializer input ", input.node_arg.Name().c_str()); } From e3705b260a67a66c4c71bfa3020b8b6af537e842 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 11 Dec 2024 03:50:40 -0800 Subject: [PATCH 11/64] Revert addition of TensorTypeBase to provider bridge --- onnxruntime/core/providers/shared_library/provider_api.h | 1 - .../core/providers/shared_library/provider_interfaces.h | 5 ----- .../core/providers/shared_library/provider_wrappedtypes.h | 7 ------- onnxruntime/core/session/provider_bridge_ort.cc | 7 ------- 4 files changed, 20 deletions(-) diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index d68dbf8ce3ec5..b84825236a453 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -169,7 +169,6 @@ class OpKernel; struct OpKernelContext; struct OpKernelInfo; struct PrimitiveDataTypeBase; -struct TensorTypeBase; struct OrtRunOptions; struct Tensor; struct SparseTensor; diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h 
b/onnxruntime/core/providers/shared_library/provider_interfaces.h index dfe46e0ee32b5..dc3f5e60f2745 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -677,9 +677,6 @@ struct ProviderHost { virtual int32_t PrimitiveDataTypeBase__GetNumSubElems(const PrimitiveDataTypeBase* p) = 0; virtual bool PrimitiveDataTypeBase__HasSubElems(const PrimitiveDataTypeBase* p) = 0; - // TensorTypeBase - virtual MLDataType TensorTypeBase__GetElementType(const TensorTypeBase* p) = 0; - // DataTypeImpl virtual MLDataType DataTypeImpl__GetType_Tensor() = 0; #if !defined(DISABLE_SPARSE_TENSORS) @@ -798,8 +795,6 @@ struct ProviderHost { virtual size_t DataTypeImpl__Size(const DataTypeImpl* p) = 0; virtual const PrimitiveDataTypeBase* DataTypeImpl__AsPrimitiveDataType(const DataTypeImpl* p) = 0; - virtual const TensorTypeBase* DataTypeImpl__TensorTypeFromONNXEnum(int type) = 0; - // Function virtual const Graph& Function__Body(const Function* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 04818245d146f..0efa3833a978b 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -703,12 +703,6 @@ struct PrimitiveDataTypeBase final { PROVIDER_DISALLOW_ALL(PrimitiveDataTypeBase) }; -struct TensorTypeBase final { - MLDataType GetElementType() const { return g_host->TensorTypeBase__GetElementType(this); } - - PROVIDER_DISALLOW_ALL(TensorTypeBase) -}; - class DataTypeImpl final { public: size_t Size() const { return g_host->DataTypeImpl__Size(this); } @@ -765,7 +759,6 @@ class DataTypeImpl final { const PrimitiveDataTypeBase* AsPrimitiveDataType() const { return g_host->DataTypeImpl__AsPrimitiveDataType(this); } - static const TensorTypeBase* TensorTypeFromONNXEnum(int type) { return 
g_host->DataTypeImpl__TensorTypeFromONNXEnum(type); } static const char* ToString(MLDataType type) { return g_host->DataTypeImpl__ToString(type); } PROVIDER_DISALLOW_ALL(DataTypeImpl) diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index a3a12e1ba32a2..29ffd9487925b 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -812,9 +812,6 @@ struct ProviderHostImpl : ProviderHost { int32_t PrimitiveDataTypeBase__GetNumSubElems(const PrimitiveDataTypeBase* p) override { return p->GetNumSubElems(); } bool PrimitiveDataTypeBase__HasSubElems(const PrimitiveDataTypeBase* p) override { return p->HasSubElems(); } - // TensorTypeBase (wrapped) - MLDataType TensorTypeBase__GetElementType(const TensorTypeBase* p) override { return p->GetElementType(); } - // DataTypeImpl (wrapped) MLDataType DataTypeImpl__GetType_Tensor() override { return DataTypeImpl::GetType(); } #if !defined(DISABLE_SPARSE_TENSORS) @@ -935,10 +932,6 @@ struct ProviderHostImpl : ProviderHost { size_t DataTypeImpl__Size(const DataTypeImpl* p) override { return p->Size(); } const PrimitiveDataTypeBase* DataTypeImpl__AsPrimitiveDataType(const DataTypeImpl* p) override { return p->AsPrimitiveDataType(); } - const TensorTypeBase* DataTypeImpl__TensorTypeFromONNXEnum(int type) override { - return DataTypeImpl::TensorTypeFromONNXEnum(type); - } - // Function (wrapped) const Graph& Function__Body(const Function* p) override { return p->Body(); } From 6f0b3c61330adfc5b41261e4f08cda92f1924978 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 11 Dec 2024 03:59:54 -0800 Subject: [PATCH 12/64] Remove last use of GetTensorShapeFromTensorProto --- onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc 
index e7f4c37eaef66..147c740313d7d 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc @@ -444,7 +444,7 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& ORT_RETURN_IF(iter == graph_initializers.end(), "Unable to find initializer for scale(s): ", scale_name.c_str()); gsl::not_null scale_tensor_proto = iter->second; - TensorShape scale_shape = onnxruntime::utils::GetTensorShapeFromTensorProto(*scale_tensor_proto); + TensorShape scale_shape(qnn::utils::GetInitializerShape(*scale_tensor_proto)); // Check the number of scale values to determine if the tensor is per-channel. // This is consistent with CPU EP's Quant/Dequant logic. We can't use the presence of an axis because even a From 5939bf64f6d78633e214c6f20f0a36690684b50c Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 11 Dec 2024 04:18:06 -0800 Subject: [PATCH 13/64] Add DataTypeUtils::ToType(std::string&) to provider bridge --- .../core/providers/shared_library/provider_interfaces.h | 1 + .../core/providers/shared_library/provider_wrappedtypes.h | 1 + onnxruntime/core/session/provider_bridge_ort.cc | 3 +++ 3 files changed, 5 insertions(+) diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index dc3f5e60f2745..77ee3bc01d212 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -284,6 +284,7 @@ struct ProviderHost { // Utils::DataTypeUtils virtual const std::string* Utils__DataTypeUtils__ToType(const ONNX_NAMESPACE::TypeProto& type_proto) = 0; + virtual const std::string* Utils__DataTypeUtils__ToType(const std::string& type_str) = 0; // int64s virtual int int64s__size(const ONNX_NAMESPACE::int64s* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h 
b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 0efa3833a978b..a9e4bafe9f3c6 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -475,6 +475,7 @@ namespace Utils { struct DataTypeUtils final { static const std::string* ToType(const ONNX_NAMESPACE::TypeProto& type_proto) { return g_host->Utils__DataTypeUtils__ToType(type_proto); } + static const std::string* ToType(const std::string& type_str) { return g_host->Utils__DataTypeUtils__ToType(type_str); } PROVIDER_DISALLOW_ALL(DataTypeUtils) }; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 29ffd9487925b..65917f035020b 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -382,6 +382,9 @@ struct ProviderHostImpl : ProviderHost { // Utils::DataTypeUtils (wrapped) const std::string* Utils__DataTypeUtils__ToType(const ONNX_NAMESPACE::TypeProto& type_proto) override { return ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(type_proto); } + const std::string* Utils__DataTypeUtils__ToType(const std::string& type_str) override { + return ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(type_str); + } // int64s (wrapped) int int64s__size(const ONNX_NAMESPACE::int64s* p) override { return p->size(); } From 58dbf494618f8339c06c07a1e7e967e9b4a436e1 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 11 Dec 2024 04:21:37 -0800 Subject: [PATCH 14/64] Add Logger::GetSeverity() to provider bridge --- .../core/providers/shared_library/provider_interfaces.h | 1 + .../core/providers/shared_library/provider_wrappedtypes.h | 3 +++ onnxruntime/core/session/provider_bridge_ort.cc | 3 +++ 3 files changed, 7 insertions(+) diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 
77ee3bc01d212..81b8b00d7777a 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -270,6 +270,7 @@ struct ProviderHost { // logging::Logger virtual bool logging__Logger__OutputIsEnabled(const logging::Logger* p, logging::Severity severity, logging::DataType data_type) = 0; + virtual logging::Severity logging__Logger__GetSeverity(const logging::Logger* p) = 0; // logging::LoggingManager virtual const logging::Logger& logging__LoggingManager__DefaultLogger() = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index a9e4bafe9f3c6..365ee987f4930 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -23,6 +23,9 @@ namespace logging { struct Logger final { bool OutputIsEnabled(Severity severity, DataType data_type) const noexcept { return g_host->logging__Logger__OutputIsEnabled(this, severity, data_type); } + Severity GetSeverity() const noexcept { + return g_host->logging__Logger__GetSeverity(this); + } PROVIDER_DISALLOW_ALL(Logger) }; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 65917f035020b..457176136fe49 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -366,6 +366,9 @@ struct ProviderHostImpl : ProviderHost { // logging::Logger (wrapped) bool logging__Logger__OutputIsEnabled(const logging::Logger* p, logging::Severity severity, logging::DataType data_type) override { return p->OutputIsEnabled(severity, data_type); } + logging::Severity logging__Logger__GetSeverity(const logging::Logger* p) override { + return p->GetSeverity(); + } // logging::LoggingManager (wrapped) const logging::Logger& logging__LoggingManager__DefaultLogger() 
override { return logging::LoggingManager::DefaultLogger(); } From f76b09a2b665318c76fa24666c830b0837a522c3 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 11 Dec 2024 04:28:19 -0800 Subject: [PATCH 15/64] Add TensorShapeProto_Dimensions__size to provider bridge --- .../core/providers/shared_library/provider_interfaces.h | 1 + .../core/providers/shared_library/provider_wrappedtypes.h | 1 + onnxruntime/core/session/provider_bridge_ort.cc | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 81b8b00d7777a..e4c96b57a9f99 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -494,6 +494,7 @@ struct ProviderHost { // TensorShapeProto_Dimensions virtual std::unique_ptr TensorShapeProto_Dimensions__begin(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0; virtual std::unique_ptr TensorShapeProto_Dimensions__end(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0; + virtual size_t TensorShapeProto_Dimensions__size(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0; // TensorShapeProto virtual int TensorShapeProto__dim_size(const ONNX_NAMESPACE::TensorShapeProto* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 365ee987f4930..aeae15ee4b4d2 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -288,6 +288,7 @@ struct TensorShapeProto_Dimension final { struct TensorShapeProto_Dimensions final { IteratorHolder begin() const { return g_host->TensorShapeProto_Dimensions__begin(this); } IteratorHolder end() const { return g_host->TensorShapeProto_Dimensions__end(this); } + size_t size() const { return 
g_host->TensorShapeProto_Dimensions__size(this); } PROVIDER_DISALLOW_ALL(TensorShapeProto_Dimensions) }; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 457176136fe49..83935293a221d 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -615,6 +615,10 @@ struct ProviderHostImpl : ProviderHost { return std::make_unique(p->end()); } + size_t TensorShapeProto_Dimensions__size(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) override { + return p->size(); + } + // TensorShapeProto (wrapped) int TensorShapeProto__dim_size(const ONNX_NAMESPACE::TensorShapeProto* p) override { return p->dim_size(); } const ONNX_NAMESPACE::TensorShapeProto_Dimensions& TensorShapeProto__dim(const ONNX_NAMESPACE::TensorShapeProto* p) override { return p->dim(); } From d189fe66ceedfc8feb86f7f66ff191ce44a91492 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 11 Dec 2024 17:35:26 -0800 Subject: [PATCH 16/64] Add utils::CreateSupportedPartitions() to provider bridge --- .../providers/shared_library/provider_api.h | 15 +++++++++++++ .../shared_library/provider_interfaces.h | 11 ++++++++++ .../core/session/provider_bridge_ort.cc | 21 +++++++++++++++++++ 3 files changed, 47 insertions(+) diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index b84825236a453..31e7e0d47dfe1 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -215,6 +215,7 @@ using DeleteFunc = void (*)(void*); using NodeArgInfo = ONNX_NAMESPACE::ValueInfoProto; using NameMLValMap = std::unordered_map; + } // namespace onnxruntime #include "core/platform/threadpool.h" @@ -367,6 +368,20 @@ template <> constexpr ONNXTensorElementDataType GetONNXTensorElementDataType() { return ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT4; } + +inline 
std::vector> +CreateSupportedPartitions(const GraphViewer& graph_viewer, + const std::unordered_set& supported_nodes, + const std::unordered_set& stop_ops, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + const std::string& execution_provider_type, + const std::unordered_map* node_unit_map, + bool drop_constant_initializers = false) { + return g_host->Utils__CreateSupportedPartitions(graph_viewer, supported_nodes, stop_ops, generate_metadef_name, + execution_provider_name, execution_provider_type, node_unit_map, + drop_constant_initializers); +} } // namespace utils namespace QDQ { diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index e4c96b57a9f99..71b46d27906bc 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -896,6 +896,17 @@ struct ProviderHost { virtual std::pair>, std::unordered_map> QDQ__GetAllNodeUnits(const GraphViewer* graph_viewer) = 0; + // Partitioning utils + virtual std::vector> + Utils__CreateSupportedPartitions(const GraphViewer& graph_viewer, + const std::unordered_set& supported_nodes, + const std::unordered_set& stop_ops, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + const std::string& execution_provider_type, + const std::unordered_map* node_unit_map, + bool drop_constant_initializers) = 0; + // Model virtual std::unique_ptr Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const IOnnxRuntimeOpSchemaRegistryList* local_registries, diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 83935293a221d..94e9d09301dff 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -81,6 +81,7 @@ using 
IndexedSubGraph_SourceOfSchema = IndexedSubGraph::SourceOfSchema; #include "core/common/cpuid_info.h" #include "core/common/logging/logging.h" #include "core/providers/shared_library/provider_interfaces.h" +#include "core/providers/partitioning_utils.h" #include "core/providers/cuda/cuda_provider_factory_creator.h" #include "core/providers/cann/cann_provider_factory_creator.h" @@ -1072,6 +1073,26 @@ struct ProviderHostImpl : ProviderHost { return QDQ::GetAllNodeUnits(*graph_viewer); } + // Partitioning utils + std::vector> + Utils__CreateSupportedPartitions(const GraphViewer& graph_viewer, + const std::unordered_set& supported_nodes, + const std::unordered_set& stop_ops, + const utils::GenerateMetadefNameFn& generate_metadef_name, + const std::string& execution_provider_name, + const std::string& execution_provider_type, + const std::unordered_map* node_unit_map, + bool drop_constant_initializers) override { + return onnxruntime::utils::CreateSupportedPartitions(graph_viewer, + supported_nodes, + stop_ops, + generate_metadef_name, + execution_provider_name, + execution_provider_type, + node_unit_map, + drop_constant_initializers); + } + // Model (wrapped) std::unique_ptr Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const IOnnxRuntimeOpSchemaRegistryList* local_registries, From 48191eaac27fb9ad90a851b0da7e20592a30ddce Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Fri, 13 Dec 2024 10:01:59 -0800 Subject: [PATCH 17/64] Use new namespace for NodeAttrHelper --- onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc index f4feab303d374..1bb764913b6ea 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc @@ -163,7 +163,7 @@ 
Status TryGetMaxSpillFillSize(const std::vector(0)); if (max_size > max_spill_fill_size) { max_spill_fill_size = max_size; From e6afd7298376c5cd7545aacad09e9bdc60bcab55 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Fri, 13 Dec 2024 10:02:38 -0800 Subject: [PATCH 18/64] Add to provider bridge: GraphViewer::Nodes(), ConstGraphNodes struct + iterators --- .../providers/shared_library/provider_api.h | 1 + .../shared_library/provider_interfaces.h | 16 +++++++++++ .../shared_library/provider_wrappedtypes.h | 20 +++++++++++++ .../core/session/provider_bridge_ort.cc | 28 +++++++++++++++++++ 4 files changed, 65 insertions(+) diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 88d4e0d4cd18b..561744d30844b 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -157,6 +157,7 @@ struct KernelRegistry; struct Function; struct Graph; class GraphViewer; +struct ConstGraphNodes; enum class DataLayout; struct Model; struct Path; diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index d045fd27c0d25..7b586f6a71642 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -123,6 +123,14 @@ struct Node__EdgeIterator { virtual int GetDstArgIndex() const = 0; }; +struct ConstGraphNodes_Iterator { + virtual ~ConstGraphNodes_Iterator() {} + + virtual bool operator!=(const ConstGraphNodes_Iterator& other) const = 0; + virtual void operator++() = 0; + virtual const Node& operator*() = 0; +}; + // There are two ways to route a function, one is a virtual method and the other is a function pointer (or pointer to // member function). 
// The function pointers are nicer in that they directly call the target function, but they cannot be used in cases @@ -982,6 +990,7 @@ struct ProviderHost { virtual const std::string& GraphViewer__Name(const GraphViewer* p) noexcept = 0; virtual const std::filesystem::path& GraphViewer__ModelPath(const GraphViewer* p) noexcept = 0; + virtual const ConstGraphNodes& GraphViewer__Nodes(const GraphViewer* p) noexcept = 0; virtual const Node* GraphViewer__GetNode(const GraphViewer* p, NodeIndex node_index) = 0; virtual const NodeArg* GraphViewer__GetNodeArg(const GraphViewer* p, const std::string& name) = 0; @@ -1015,6 +1024,13 @@ struct ProviderHost { virtual const Node* GraphViewer__GetProducerNode(const GraphViewer* p, const std::string& node_arg_name) const = 0; virtual IOnnxRuntimeOpSchemaCollectionPtr GraphViewer__GetSchemaRegistry(const GraphViewer* p) const = 0; + // ConstGraphNodes + virtual std::unique_ptr ConstGraphNodes__begin(const ConstGraphNodes* p) = 0; + virtual std::unique_ptr ConstGraphNodes__end(const ConstGraphNodes* p) = 0; + virtual std::unique_ptr ConstGraphNodes__cbegin(const ConstGraphNodes* p) = 0; + virtual std::unique_ptr ConstGraphNodes__cend(const ConstGraphNodes* p) = 0; + virtual bool ConstGraphNodes__empty(const ConstGraphNodes* p) noexcept = 0; + // OpKernel virtual const Node& OpKernel__Node(const OpKernel* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index ae9eed6f66f0b..c8a8fefdac71e 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -1040,6 +1040,7 @@ class GraphViewer final { const std::string& Name() const noexcept { return g_host->GraphViewer__Name(this); } const std::filesystem::path& ModelPath() const noexcept { return g_host->GraphViewer__ModelPath(this); } + const ConstGraphNodes& Nodes() const noexcept { return 
g_host->GraphViewer__Nodes(this); } const Node* GetNode(NodeIndex node_index) const { return g_host->GraphViewer__GetNode(this, node_index); } const NodeArg* GetNodeArg(const std::string& name) const { return g_host->GraphViewer__GetNodeArg(this, name); } @@ -1084,6 +1085,25 @@ class GraphViewer final { void operator=(const GraphViewer&) = delete; }; +struct ConstGraphNodes final { + IteratorHolder begin() const { + return g_host->ConstGraphNodes__begin(this); + } + IteratorHolder end() const { + return g_host->ConstGraphNodes__end(this); + } + IteratorHolder cbegin() const { + return g_host->ConstGraphNodes__cbegin(this); + } + IteratorHolder cend() const { + return g_host->ConstGraphNodes__cend(this); + } + + bool empty() const noexcept { return g_host->ConstGraphNodes__empty(this); } + + PROVIDER_DISALLOW_ALL(ConstGraphNodes) +}; + struct OpKernelContext final { template const T& RequiredInput(int index) const; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 87be39b6b70f0..873a0d10cd094 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -189,6 +189,18 @@ struct Node__EdgeIterator_Impl : Node__EdgeIterator { Node::EdgeConstIterator v_; }; +struct ConstGraphNodes_Iterator_Impl : ConstGraphNodes_Iterator { + ConstGraphNodes_Iterator_Impl(ConstGraphNodes::ConstNodeIterator&& v) : v_{std::move(v)} {} + + bool operator!=(const ConstGraphNodes_Iterator& other) const override { + return v_ != static_cast(&other)->v_; + } + void operator++() override { v_.operator++(); } + const Node& operator*() override { return *v_; } + + ConstGraphNodes::ConstNodeIterator v_; +}; + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) common::Status LoadDynamicLibraryFromProvider(onnxruntime::PathString library_name) { const auto& platform_env = onnxruntime::Env::Default(); @@ -1203,6 +1215,7 @@ struct ProviderHostImpl : 
ProviderHost { const std::string& GraphViewer__Name(const GraphViewer* p) noexcept override { return p->Name(); } const std::filesystem::path& GraphViewer__ModelPath(const GraphViewer* p) noexcept override { return p->ModelPath(); } + const ConstGraphNodes& GraphViewer__Nodes(const GraphViewer* p) noexcept override { return p->Nodes(); } const Node* GraphViewer__GetNode(const GraphViewer* p, NodeIndex node_index) override { return p->GetNode(node_index); } const NodeArg* GraphViewer__GetNodeArg(const GraphViewer* p, const std::string& name) override { return p->GetNodeArg(name); } @@ -1248,6 +1261,21 @@ struct ProviderHostImpl : ProviderHost { const Node* GraphViewer__GetProducerNode(const GraphViewer* p, const std::string& node_arg_name) const override { return p->GetProducerNode(node_arg_name); } IOnnxRuntimeOpSchemaCollectionPtr GraphViewer__GetSchemaRegistry(const GraphViewer* p) const override { return p->GetSchemaRegistry(); } + // ConstGraphNodes + std::unique_ptr ConstGraphNodes__begin(const ConstGraphNodes* p) override { + return std::make_unique(p->begin()); + } + std::unique_ptr ConstGraphNodes__end(const ConstGraphNodes* p) override { + return std::make_unique(p->end()); + } + std::unique_ptr ConstGraphNodes__cbegin(const ConstGraphNodes* p) override { + return std::make_unique(p->cbegin()); + } + std::unique_ptr ConstGraphNodes__cend(const ConstGraphNodes* p) override { + return std::make_unique(p->cend()); + } + bool ConstGraphNodes__empty(const ConstGraphNodes* p) noexcept override { return p->empty(); } + // OpKernel (direct) const Node& OpKernel__Node(const OpKernel* p) override { return p->OpKernel::Node(); } From 0b1e538642fd5901120c8b58784d76efa9112732 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Fri, 13 Dec 2024 12:01:39 -0800 Subject: [PATCH 19/64] Replace usage of cbegin() and cend() in NodeAttrHelper with version that does not need to add new functionality to the provider bridge --- .../core/providers/qnn/builder/qnn_utils.cc | 41 
++++++++++--------- .../core/providers/qnn/builder/qnn_utils.h | 1 - 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index f6c30e85b53f9..889b6e21647b5 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -668,10 +668,12 @@ const std::string& NodeAttrHelper::Get(const std::string& key, const std::string std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - const auto& attr = entry->second; + const auto& values = entry->second.ints(); + const int64_t* cbegin = values.data(); + const int64_t* cend = values.data() + values.size(); std::vector v; - v.reserve(static_cast(attr.ints_size())); - std::transform(attr.ints().cbegin(), attr.ints().cend(), std::back_inserter(v), + v.reserve(static_cast(values.size())); + std::transform(cbegin, cend, std::back_inserter(v), [](int64_t val) -> int32_t { return narrow(val); }); return v; } @@ -681,10 +683,12 @@ std::vector NodeAttrHelper::Get(const std::string& key, const std::vect std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - const auto& attr = entry->second; + const auto& values = entry->second.ints(); + const int64_t* cbegin = values.data(); + const int64_t* cend = values.data() + values.size(); std::vector v; - v.reserve(static_cast(attr.ints_size())); - std::transform(attr.ints().cbegin(), attr.ints().cend(), std::back_inserter(v), + v.reserve(static_cast(values.size())); + std::transform(cbegin, cend, std::back_inserter(v), [](int64_t val) -> uint32_t { return narrow(val); }); return v; } @@ -695,16 +699,9 @@ std::vector NodeAttrHelper::Get(const std::string& key, const std::vec std::vector 
NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { const auto& values = entry->second.ints(); - return std::vector{values.cbegin(), values.cend()}; - } - - return def_val; -} - -std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { - if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - const auto& values = entry->second.strings(); - return std::vector{values.cbegin(), values.cend()}; + const int64_t* cbegin = values.data(); + const int64_t* cend = values.data() + values.size(); + return std::vector{cbegin, cend}; } return def_val; @@ -713,7 +710,9 @@ std::vector NodeAttrHelper::Get(const std::string& key, const std:: std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { const auto& values = entry->second.floats(); - return std::vector{values.cbegin(), values.cend()}; + const float* cbegin = values.data(); + const float* cend = values.data() + values.size(); + return std::vector{cbegin, cend}; } return def_val; @@ -741,7 +740,9 @@ std::optional> NodeAttrHelper::GetFloats(const std::string& k std::optional> result; if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { const auto& values = entry->second.floats(); - result = std::vector(values.begin(), values.end()); + const float* cbegin = values.data(); + const float* cend = values.data() + values.size(); + result = std::vector(cbegin, cend); } return result; @@ -751,7 +752,9 @@ std::optional> NodeAttrHelper::GetInt64s(const std::string& std::optional> result; if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { const auto& values = entry->second.ints(); - result = std::vector(values.begin(), values.end()); + const int64_t* cbegin = values.data(); + const int64_t* cend = 
values.data() + values.size(); + result = std::vector(cbegin, cend); } return result; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index c76c99b8454ef..1e01a9d76a5b0 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -181,7 +181,6 @@ class NodeAttrHelper { std::vector Get(const std::string& key, const std::vector& def_val) const; const std::string& Get(const std::string& key, const std::string& def_val) const; - std::vector Get(const std::string& key, const std::vector& def_val) const; // Convert the i() or ints() of the attribute from int64_t to int32_t int32_t Get(const std::string& key, int32_t def_val) const; From fb3618dc33e4cbcb4855f849955b1903f4bc28ba Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Fri, 13 Dec 2024 12:03:15 -0800 Subject: [PATCH 20/64] Add convenience function to get the default Env to provider bridge --- .../core/providers/qnn/builder/qnn_backend_manager.cc | 5 ++++- onnxruntime/core/providers/qnn/qnn_execution_provider.cc | 1 + onnxruntime/core/providers/shared_library/provider_api.h | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index 3af646c3ce13a..40730a535bd43 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -1099,6 +1099,7 @@ Status QnnBackendManager::ExtractBackendProfilingInfo() { bool tracelogging_provider_ep_enabled = false; const Env& env = Env::Default(); + // const Env& env = GetDefaultEnv(); auto& provider = env.GetTelemetryProvider(); auto level = provider.Level(); if (provider.IsEnabled()) { @@ -1492,7 +1493,9 @@ void* QnnBackendManager::LoadLib(const char* file_name, int flags, std::string& auto file_path = 
std::filesystem::path(file_name); if (!file_path.is_absolute()) { // construct an absolute path from ORT runtime path + file_name and check whether it exists. - auto pathstring = Env::Default().GetRuntimePath() + ToPathString(file_name); + const Env& env = Env::Default(); + // const Env& env = GetDefaultEnv(); + auto pathstring = env.GetRuntimePath() + ToPathString(file_name); auto absolute_path = pathstring.c_str(); if (std::filesystem::exists(std::filesystem::path(absolute_path))) { // load library from absolute path and search for dependencies there. diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index ddc9d15e4ea71..ed246b66c2784 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -247,6 +247,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio // set to invalid to indicate that ETW is no enabled when we setup QNN qnn::ProfilingLevel profiling_level_etw = qnn::ProfilingLevel::INVALID; const Env& env = Env::Default(); + // const Env& env = GetDefaultEnv(); auto& provider = env.GetTelemetryProvider(); if (provider.IsEnabled()) { auto level = provider.Level(); diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 561744d30844b..50d94249d4916 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -397,6 +397,10 @@ GetAllNodeUnits(const GraphViewer* graph_viewer, const logging::Logger& logger) // So the C API (and C++) becomes available when ORT_API_MANUAL_INIT is used. void InitProviderOrtApi(); +// This is a replacement for Env::Default(). Returns a reference to the default ORT Environment. 
+inline Env& GetDefaultEnv() { + return g_host->Env__Default(); +} } // namespace onnxruntime #define CREATE_MESSAGE(logger, severity, category, datatype) \ From 6b581fd698d3888835854693e7ad2a4ff963f996 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Fri, 13 Dec 2024 17:16:52 -0800 Subject: [PATCH 21/64] Moving ORT includes to a separate header --- .../builder/opbuilder/reduce_op_builder.cc | 6 +-- .../qnn/builder/qnn_backend_manager.cc | 4 +- .../qnn/builder/qnn_backend_manager.h | 5 +- .../core/providers/qnn/builder/qnn_utils.cc | 18 ++++--- .../core/providers/qnn/builder/qnn_utils.h | 5 +- onnxruntime/core/providers/qnn/ort_api.h | 35 +++++++++++++ .../providers/qnn/qnn_execution_provider.cc | 40 +++++++------- .../providers/qnn/qnn_execution_provider.h | 21 +++++--- .../providers/qnn/qnn_provider_factory.cc | 52 ++++++++++++++++--- .../qnn/qnn_provider_factory_creator.h | 1 + onnxruntime/core/providers/qnn/symbols.def | 2 + .../core/providers/qnn/version_script.lds | 9 ++++ .../core/session/provider_bridge_ort.cc | 7 +++ 13 files changed, 149 insertions(+), 56 deletions(-) create mode 100644 onnxruntime/core/providers/qnn/ort_api.h create mode 100644 onnxruntime/core/providers/qnn/symbols.def create mode 100644 onnxruntime/core/providers/qnn/version_script.lds diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc index ce6654b3906d7..a2eeeee4453e4 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc @@ -9,7 +9,6 @@ #include "core/common/safeint.h" #include "onnx/defs/data_type_utils.h" #include "core/providers/common.h" -#include "core/framework/endian_utils.h" #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" @@ -145,10 
+144,7 @@ Status ReduceOpBuilder::GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const Nod auto src_span = gsl::make_span(axes_bytes.data(), axes_bytes.size()); auto dst_span = gsl::make_span(reduce_axes.data(), reduce_axes.size()); - // Copy initializer bytes (stored in little-endian order) to vector of int64_t. - // ReadLittleEndian returns a status error if the source and destination spans do not have - // matching byte sizes. - ORT_RETURN_IF_ERROR(onnxruntime::utils::ReadLittleEndian(src_span, dst_span)); + std::memcpy(dst_span.data(), src_span.data(), src_span.size_bytes()); } } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index 40730a535bd43..7f575257a77f7 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -17,8 +17,8 @@ #include "HTP/QnnHtpContext.h" #include "Saver/QnnSaver.h" #include -#include "core/framework/endian_utils.h" -#include "core/common/logging/capture.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/onnx_ctx_model_helper.h" #include "core/providers/qnn/builder/qnn_configs_helper.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index b145f2a2cd724..661a830bfb733 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -21,9 +21,8 @@ #include "QnnLog.h" #include "QnnTypes.h" #include "System/QnnSystemInterface.h" -#include "core/common/status.h" -#include "core/common/logging/logging.h" -#include "core/common/path_string.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_def.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc 
b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index 889b6e21647b5..8e7017c063bc0 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -1,18 +1,16 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "qnn_utils.h" + #include #include #include #include #include -#include "core/common/common.h" -#include "core/framework/data_types.h" -#include "core/framework/tensorprotoutils.h" -#include "qnn_utils.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_def.h" -#include "core/graph/graph_viewer.h" namespace onnxruntime { namespace qnn { @@ -560,6 +558,14 @@ Status GetQminQmax(const Qnn_DataType_t qnn_data_type, return Status::OK(); } +inline float RoundHalfToEven(float input) { + if (!std::isfinite(input)) { + return input; + } + // std::remainder returns x - n, where n is the integral value nearest to x. When |x - n| = 0.5, n is chosen to be even + return input - std::remainderf(input, 1.f); +} + Status GetQuantParams(float rmin, float rmax, const Qnn_DataType_t qnn_data_type, @@ -584,7 +590,7 @@ Status GetQuantParams(float rmin, } else { initial_zero_point = qmin - (rmin / scale); } - zero_point = static_cast(RoundHalfToEven(Saturate(qmax, qmin, initial_zero_point))); + zero_point = static_cast(qnn::utils::RoundHalfToEven(Saturate(qmax, qmin, initial_zero_point))); // To match QNN quantization definition zero_point = 0 - zero_point; return Status::OK(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index 1e01a9d76a5b0..578f50ba895cf 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -9,10 +9,7 @@ #include #include "QnnTypes.h" -#include "core/session/onnxruntime_cxx_api.h" -#include "core/framework/node_unit.h" -#include "core/framework/tensor_shape.h" 
-#include "core/util/qmath.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/ort_api.h b/onnxruntime/core/providers/qnn/ort_api.h new file mode 100644 index 0000000000000..0c26d9c99c200 --- /dev/null +++ b/onnxruntime/core/providers/qnn/ort_api.h @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License + +#pragma once + +#define BUILD_QNN_EP_STATIC 1 + +#if BUILD_QNN_EP_STATIC +#include "core/common/common.h" +#include "core/common/status.h" +#include "core/common/logging/logging.h" +#include "core/common/logging/capture.h" +#include "core/common/path_string.h" +#include "core/platform/env.h" +#include "core/framework/data_types.h" +#include "core/framework/run_options.h" +#include "core/framework/execution_provider.h" +#include "core/framework/model_metadef_id_generator.h" +#include "core/framework/compute_capability.h" +#include "core/framework/tensor_shape.h" +#include "core/framework/node_unit.h" +#include "core/framework/tensorprotoutils.h" +#include "core/graph/model.h" +#include "core/graph/graph_viewer.h" +#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" +#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" +#include "core/providers/common.h" +#include "core/providers/partitioning_utils.h" +#else +#include "core/providers/shared_library/provider_api.h" +#endif + +#include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/session/onnxruntime_run_options_config_keys.h" +#include "core/session/onnxruntime_cxx_api.h" diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index ed246b66c2784..413db0489e37a 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -5,29 +5,21 @@ #include #include -#include 
"core/framework/compute_capability.h" -#include "core/graph/graph_viewer.h" -#include "core/session/onnxruntime_session_options_config_keys.h" -#include "core/session/onnxruntime_run_options_config_keys.h" -#include "core/session/onnxruntime_cxx_api.h" -#include "core/framework/kernel_registry.h" -#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" -#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" -#include "core/platform/env.h" -#include "core/providers/common.h" -#include "core/providers/partitioning_utils.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group.h" #include "core/providers/qnn/builder/qnn_def.h" #include "core/providers/qnn/builder/onnx_ctx_model_helper.h" -#include "core/framework/run_options.h" #ifdef _WIN32 #include +// TODO: Enable once QNN is built as a DLL +#if 0 #include "core/platform/windows/logging/etw_sink.h" #endif +#endif // _WIN32 namespace onnxruntime { @@ -35,6 +27,7 @@ constexpr const char* QNN = "QNN"; static std::unique_ptr>> s_run_on_unload_; +// TODO: Remove and use versions in EP provider bridge. void RunOnUnload(std::function function) { static std::mutex mutex; std::lock_guard guard(mutex); @@ -44,6 +37,7 @@ void RunOnUnload(std::function function) { s_run_on_unload_->push_back(std::move(function)); } +// TODO: Remove and use versions in EP provider bridge. 
struct OnUnload { ~OnUnload() { if (!s_run_on_unload_) @@ -193,17 +187,17 @@ qnn::ProfilingLevel QNNExecutionProvider::GetProfilingLevelFromETWLevel(unsigned } QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_options_map, - const SessionOptions* session_options) + const ConfigOptions* config_options) : IExecutionProvider{onnxruntime::kQnnExecutionProvider} { - if (session_options) { - disable_cpu_ep_fallback_ = session_options->config_options.GetConfigOrDefault( + if (config_options) { + disable_cpu_ep_fallback_ = config_options->GetConfigOrDefault( kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; - context_cache_enabled_ = session_options->config_options.GetConfigOrDefault( + context_cache_enabled_ = config_options->GetConfigOrDefault( kOrtSessionOptionEpContextEnable, "0") == "1"; LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_; - std::string embed_mode = session_options->config_options.GetConfigOrDefault( + std::string embed_mode = config_options->GetConfigOrDefault( kOrtSessionOptionEpContextEmbedMode, "0"); if ("1" == embed_mode) { qnn_context_embed_mode_ = true; @@ -214,18 +208,18 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << qnn_context_embed_mode_; - context_cache_path_cfg_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); + context_cache_path_cfg_ = config_options->GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_; // For the case that workaround QNN context PD memory limit, user need split the model into pieces and // generate the QNN context model separately. // It could happen that the generated EPContext node in separate graph has same node name. // User can set this context_node_name_prefix for each split pieces to avoid that happens. 
- context_node_name_prefix_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextNodeNamePrefix, ""); + context_node_name_prefix_ = config_options->GetConfigOrDefault(kOrtSessionOptionEpContextNodeNamePrefix, ""); LOGS_DEFAULT(VERBOSE) << "User specified QNN context node name prefix: " << context_node_name_prefix_; share_ep_contexts_ = - session_options->config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; + config_options->GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; LOGS_DEFAULT(VERBOSE) << "User specified option - share EP contexts across sessions: " << share_ep_contexts_; } @@ -403,6 +397,8 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio soc_model, enable_htp_weight_sharing); +// TODO: Renable once QNN is a dll +#if 0 #ifdef _WIN32 auto& etwRegistrationManager = logging::EtwRegistrationManager::Instance(); // Register callback for ETW capture state (rundown) @@ -445,6 +441,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio }); etwRegistrationManager.RegisterInternalCallback(callback_ETWSink_provider_); #endif +#endif } QNNExecutionProvider::~QNNExecutionProvider() { @@ -458,10 +455,13 @@ QNNExecutionProvider::~QNNExecutionProvider() { // Unregister the ETW callback #ifdef _WIN32 + // TODO: Re-enable when QNN EP is a DLL +#if 0 if (callback_ETWSink_provider_ != nullptr) { logging::EtwRegistrationManager::Instance().UnregisterInternalCallback(callback_ETWSink_provider_); } #endif +#endif } // Logs information about the supported/unsupported nodes. 
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index a0577e8fd87f2..b390988f39da4 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -3,24 +3,26 @@ #pragma once -#include "core/framework/execution_provider.h" -#include "core/framework/session_options.h" -#include "core/framework/model_metadef_id_generator.h" -#include "core/graph/model.h" +#include #include +#include +#include + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_backend_manager.h" #include "core/providers/qnn/builder/qnn_model.h" #include "core/providers/qnn/builder/qnn_configs_helper.h" #include "HTP/QnnHtpGraph.h" -#include -#include -#include #ifdef _WIN32 +// TODO: Reenable when QNN ep is a dll +#if 0 #include "core/platform/windows/logging/etw_sink.h" #endif +#endif namespace onnxruntime { +// TODO: Remove. It's in provider bridge. void RunOnUnload(std::function function); class SharedContext { @@ -87,7 +89,7 @@ class SharedContext { // Logical device representation. 
class QNNExecutionProvider : public IExecutionProvider { public: - explicit QNNExecutionProvider(const ProviderOptions& provider_options_map, const SessionOptions* session_options); + explicit QNNExecutionProvider(const ProviderOptions& provider_options_map, const ConfigOptions* config_options); virtual ~QNNExecutionProvider(); ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QNNExecutionProvider); @@ -151,7 +153,10 @@ class QNNExecutionProvider : public IExecutionProvider { bool share_ep_contexts_ = false; bool enable_spill_fill_buffer_ = false; #ifdef _WIN32 + // TODO: Re-enable when QNN is a DLL +#if 0 onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback callback_ETWSink_provider_ = nullptr; +#endif #endif qnn::ModelSettings model_settings_ = {}; diff --git a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc index 4095d7ff02a33..fdeb9dc106386 100644 --- a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc +++ b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc @@ -2,32 +2,68 @@ // Licensed under the MIT License #include "core/providers/qnn/qnn_provider_factory_creator.h" - -#include "core/session/abi_session_options_impl.h" #include "core/providers/qnn/qnn_execution_provider.h" -#include "core/session/ort_apis.h" namespace onnxruntime { struct QNNProviderFactory : IExecutionProviderFactory { - QNNProviderFactory(const ProviderOptions& provider_options_map, const SessionOptions* session_options) - : provider_options_map_(provider_options_map), session_options_(session_options) { + QNNProviderFactory(const ProviderOptions& provider_options_map, const ConfigOptions* config_options) + : provider_options_map_(provider_options_map), config_options_(config_options) { } ~QNNProviderFactory() override { } std::unique_ptr CreateProvider() override { - return std::make_unique(provider_options_map_, session_options_); + return std::make_unique(provider_options_map_, config_options_); } private: 
ProviderOptions provider_options_map_; - const SessionOptions* session_options_; + const ConfigOptions* config_options_; }; +// TODO: Move to core/session/provider_bridge_ort.cc std::shared_ptr QNNProviderFactoryCreator::Create(const ProviderOptions& provider_options_map, const SessionOptions* session_options) { - return std::make_shared(provider_options_map, session_options); + const ConfigOptions* config_options = nullptr; + if (session_options != nullptr) { + config_options = &session_options->config_options; + } + + return std::make_shared(provider_options_map, config_options); } +struct QNN_Provider /*: Provider*/ { + std::shared_ptr CreateExecutionProviderFactory(const void* param) /*override*/ { + if (param == nullptr) { + LOGS_DEFAULT(ERROR) << "[QNN EP] Passed NULL options to CreateExecutionProviderFactory()"; + return nullptr; + } + + std::array pointers_array = *reinterpret_cast*>(param); + const ProviderOptions* provider_options = reinterpret_cast(pointers_array[0]); + const ConfigOptions* config_options = reinterpret_cast(pointers_array[1]); + + if (provider_options == nullptr) { + LOGS_DEFAULT(ERROR) << "[QNN EP] Passed NULL ProviderOptions to CreateExecutionProviderFactory()"; + return nullptr; + } + + return std::make_shared(*provider_options, config_options); + } + + void Initialize() /*override*/ {} + void Shutdown() /*override*/ {} +} g_provider; + } // namespace onnxruntime + +// TODO: Uncomment when it is an EP dll +#if 0 +extern "C" { + +ORT_API(onnxruntime::Provider*, GetProvider) { + return &onnxruntime::g_provider; +} +} +#endif diff --git a/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h b/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h index 80f9d99b804e7..859152752893e 100644 --- a/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h +++ b/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h @@ -11,6 +11,7 @@ namespace onnxruntime { struct SessionOptions; +// defined in 
core/session/provider_bridge_ort.cc struct QNNProviderFactoryCreator { static std::shared_ptr Create(const ProviderOptions& provider_options_map, const SessionOptions* session_options); diff --git a/onnxruntime/core/providers/qnn/symbols.def b/onnxruntime/core/providers/qnn/symbols.def new file mode 100644 index 0000000000000..4ec2f7914c208 --- /dev/null +++ b/onnxruntime/core/providers/qnn/symbols.def @@ -0,0 +1,2 @@ +EXPORTS + GetProvider diff --git a/onnxruntime/core/providers/qnn/version_script.lds b/onnxruntime/core/providers/qnn/version_script.lds new file mode 100644 index 0000000000000..094abb3329781 --- /dev/null +++ b/onnxruntime/core/providers/qnn/version_script.lds @@ -0,0 +1,9 @@ +#_init and _fini should be local +VERS_1.0 { + global: + GetProvider; + + # Hide everything else. + local: + *; +}; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 873a0d10cd094..af23274825aa6 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1702,6 +1702,12 @@ static ProviderLibrary s_library_tensorrt(LIBRARY_PREFIX ORT_TSTR("onnxruntime_p #endif ); static ProviderLibrary s_library_migraphx(LIBRARY_PREFIX ORT_TSTR("onnxruntime_providers_migraphx") LIBRARY_EXTENSION); +static ProviderLibrary s_library_qnn(LIBRARY_PREFIX ORT_TSTR("onnxruntime_providers_qnn") LIBRARY_EXTENSION +#ifndef _WIN32 + , + false /* unload - On Linux if we unload the vitisai shared provider we crash */ +#endif +); void UnloadSharedProviders() { s_library_dnnl.Unload(); @@ -1714,6 +1720,7 @@ void UnloadSharedProviders() { s_library_rocm.Unload(); s_library_shared.Unload(); s_library_migraphx.Unload(); + s_library_qnn.Unload(); } // Used by test code From a1129e5dae1cce802da0ad6d48e64d235dc5fccf Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sun, 15 Dec 2024 00:40:49 -0800 Subject: [PATCH 22/64] Add Node::EdgeEnd wrapper class to provider bridge. 
Add NodeUnit constructor --- .../providers/shared_library/provider_api.h | 1 + .../shared_library/provider_interfaces.h | 15 ++++++++++ .../shared_library/provider_wrappedtypes.h | 29 ++++++++++++++++++ .../core/session/provider_bridge_ort.cc | 30 +++++++++++++++++++ 4 files changed, 75 insertions(+) diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 50d94249d4916..ceb654931ae61 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -162,6 +162,7 @@ enum class DataLayout; struct Model; struct Path; struct Node; +struct Node_EdgeEnd; struct NodeArg; struct NodeAttributes; struct NodeUnitIODef; diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 7b586f6a71642..474eddf5af310 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -118,6 +118,7 @@ struct Node__EdgeIterator { virtual bool operator!=(const Node__EdgeIterator& p) const = 0; virtual void operator++() = 0; + virtual const Node_EdgeEnd& operator*() const = 0; virtual const Node& GetNode() const = 0; virtual int GetSrcArgIndex() const = 0; virtual int GetDstArgIndex() const = 0; @@ -851,6 +852,14 @@ struct ProviderHost { virtual const std::unordered_map>& Node__GetAttributeNameToMutableSubgraphMap(Node* p) = 0; virtual std::unordered_map> Node__GetAttributeNameToSubgraphMap(const Node* p) const = 0; + // Node_EdgeEnd + virtual std::unique_ptr Node_EdgeEnd__construct(const Node& node, int src_arg_index, int dst_arg_index) = 0; + virtual void Node_EdgeEnd__operator_delete(Node_EdgeEnd* p) noexcept = 0; + + virtual const Node& Node_EdgeEnd__GetNode(const Node_EdgeEnd* p) = 0; + virtual int Node_EdgeEnd__GetSrcArgIndex(const Node_EdgeEnd* p) = 0; + virtual int 
Node_EdgeEnd__GetDstArgIndex(const Node_EdgeEnd* p) = 0; + // NodeArg virtual const std::string& NodeArg__Name(const NodeArg* p) noexcept = 0; virtual const ONNX_NAMESPACE::TensorShapeProto* NodeArg__Shape(const NodeArg* p) = 0; @@ -881,6 +890,12 @@ struct ProviderHost { virtual void NodeAttributes__reserve(NodeAttributes* p, size_t size) = 0; // NodeUnit + virtual std::unique_ptr NodeUnit__construct(gsl::span dq_nodes, const Node& target_node, + gsl::span q_nodes, uint8_t unit_type, + gsl::span inputs, gsl::span outputs, + size_t input_edge_count, gsl::span output_edges) = 0; + virtual void NodeUnit__operator_delete(NodeUnit* p) noexcept = 0; + virtual int NodeUnit__UnitType(const NodeUnit* p) noexcept = 0; virtual const std::vector& NodeUnit__Inputs(const NodeUnit* p) noexcept = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index c8a8fefdac71e..e4ecab5740af9 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -776,6 +776,21 @@ struct Function final { PROVIDER_DISALLOW_ALL(Function) }; +struct Node_EdgeEnd final { + static std::unique_ptr Create(const Node& node, int src_arg_index, int dst_arg_index) { + return g_host->Node_EdgeEnd__construct(node, src_arg_index, dst_arg_index); + } + static void operator delete(void* p) { g_host->Node_EdgeEnd__operator_delete(reinterpret_cast(p)); } + + const Node& GetNode() const { return g_host->Node_EdgeEnd__GetNode(this); } + int GetSrcArgIndex() const { return g_host->Node_EdgeEnd__GetSrcArgIndex(this); } + int GetDstArgIndex() const { return g_host->Node_EdgeEnd__GetDstArgIndex(this); } + + Node_EdgeEnd() = delete; + Node_EdgeEnd(const Node_EdgeEnd&) = delete; + void operator=(const Node_EdgeEnd&) = delete; +}; + struct Node final { enum class Type { Primitive = 0, @@ -838,6 +853,7 @@ struct Node final { } void 
operator++() { impl_->operator++(); } + const Node_EdgeEnd& operator*() { return impl_->operator*(); } const Node__EdgeIterator* operator->() const { return impl_.get(); } std::unique_ptr impl_; @@ -912,6 +928,15 @@ struct NodeUnit final { QDQGroup, // The NodeUnit contain a QDQ group of nodes, such as "DQ->Sigmoid->Q" }; + static std::unique_ptr Create(gsl::span dq_nodes, const Node& target_node, + gsl::span q_nodes, Type unit_type, + gsl::span inputs, gsl::span outputs, + size_t input_edge_count, gsl::span output_edges) { + return g_host->NodeUnit__construct(dq_nodes, target_node, q_nodes, static_cast(unit_type), + inputs, outputs, input_edge_count, output_edges); + } + static void operator delete(void* p) { g_host->NodeUnit__operator_delete(reinterpret_cast(p)); } + Type UnitType() const noexcept { return static_cast(g_host->NodeUnit__UnitType(this)); } const std::vector& Inputs() const noexcept { return g_host->NodeUnit__Inputs(this); } @@ -938,6 +963,10 @@ struct NodeUnit final { // output. any Q nodes are hidden. 
Node::EdgeConstIterator OutputEdgesBegin() const { return g_host->NodeUnit__OutputEdgesBegin(this); } Node::EdgeConstIterator OutputEdgesEnd() const { return g_host->NodeUnit__OutputEdgesEnd(this); } + + NodeUnit() = delete; + NodeUnit(const NodeUnit&) = delete; + void operator=(const NodeUnit& v) = delete; }; struct Model final { diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index af23274825aa6..b6a9aa287e924 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -76,6 +76,7 @@ using FunctionProtos = google::protobuf::RepeatedPtrField; namespace onnxruntime { using IndexedSubGraph_MetaDef = IndexedSubGraph::MetaDef; using IndexedSubGraph_SourceOfSchema = IndexedSubGraph::SourceOfSchema; +using Node_EdgeEnd = Node::EdgeEnd; } // namespace onnxruntime #include "core/common/cpuid_info.h" @@ -182,6 +183,7 @@ struct Node__EdgeIterator_Impl : Node__EdgeIterator { bool operator!=(const Node__EdgeIterator& p) const override { return v_ != static_cast(&p)->v_; } void operator++() override { v_.operator++(); } + const Node_EdgeEnd& operator*() const override { return v_.operator*(); } const Node& GetNode() const override { return v_->GetNode(); } int GetSrcArgIndex() const override { return v_->GetSrcArgIndex(); } int GetDstArgIndex() const override { return v_->GetDstArgIndex(); } @@ -1005,6 +1007,16 @@ struct ProviderHostImpl : ProviderHost { std::unordered_map> Node__GetAttributeNameToSubgraphMap(const Node* p) const override { return p->GetAttributeNameToSubgraphMap(); } int Node__NodeType(const Node* p) const noexcept override { return int(p->NodeType()); } + // Node_EdgeEnd (wrapped). Maps to Node::EdgeEnd struct. 
+ std::unique_ptr Node_EdgeEnd__construct(const Node& node, int src_arg_index, int dst_arg_index) override { + return std::make_unique(node, src_arg_index, dst_arg_index); + } + void Node_EdgeEnd__operator_delete(Node_EdgeEnd* p) noexcept override { delete p; } + + const Node& Node_EdgeEnd__GetNode(const Node_EdgeEnd* p) override { return p->GetNode(); } + int Node_EdgeEnd__GetSrcArgIndex(const Node_EdgeEnd* p) override { return p->GetSrcArgIndex(); } + int Node_EdgeEnd__GetDstArgIndex(const Node_EdgeEnd* p) override { return p->GetDstArgIndex(); } + // NodeArg (wrapped) const std::string& NodeArg__Name(const NodeArg* p) noexcept override { return p->Name(); } const ONNX_NAMESPACE::TensorShapeProto* NodeArg__Shape(const NodeArg* p) override { return p->Shape(); } @@ -1040,6 +1052,24 @@ struct ProviderHostImpl : ProviderHost { void NodeAttributes__reserve(NodeAttributes* p, size_t size) override { p->reserve(size); } // NodeUnit (wrapped) + std::unique_ptr NodeUnit__construct(gsl::span dq_nodes, + const Node& target_node, + gsl::span q_nodes, + uint8_t unit_type, + gsl::span inputs, + gsl::span outputs, + size_t input_edge_count, + gsl::span output_edges) override { + Node::EdgeSet output_edge_set; + for (const Node_EdgeEnd* edge_end : output_edges) { + output_edge_set.insert(*edge_end); + } + + return std::make_unique(dq_nodes, target_node, q_nodes, static_cast(unit_type), + inputs, outputs, input_edge_count, output_edge_set); + } + void NodeUnit__operator_delete(NodeUnit* p) noexcept override { delete p; } + int NodeUnit__UnitType(const NodeUnit* p) noexcept override { return static_cast(p->UnitType()); } const std::vector& NodeUnit__Inputs(const NodeUnit* p) noexcept override { From ba86c418df30114d3cc4cba803b33b249129f570 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sun, 15 Dec 2024 01:18:18 -0800 Subject: [PATCH 23/64] Move more header includes to ort_api.h --- .../core/providers/qnn/builder/onnx_ctx_model_helper.cc | 7 ++++--- 
.../core/providers/qnn/builder/onnx_ctx_model_helper.h | 5 +---- onnxruntime/core/providers/qnn/builder/op_builder.h | 3 +-- .../core/providers/qnn/builder/op_builder_factory.cc | 2 -- .../core/providers/qnn/builder/qnn_configs_helper.h | 8 ++++---- onnxruntime/core/providers/qnn/builder/qnn_def.h | 3 +-- onnxruntime/core/providers/qnn/builder/qnn_model.cc | 4 +--- onnxruntime/core/providers/qnn/builder/qnn_model.h | 7 ++----- .../core/providers/qnn/builder/qnn_model_wrapper.cc | 6 +++--- .../core/providers/qnn/builder/qnn_model_wrapper.h | 6 ++---- onnxruntime/core/providers/qnn/builder/qnn_node_group.h | 3 +-- .../core/providers/qnn/builder/qnn_node_group/utils.cc | 3 +-- .../core/providers/qnn/builder/qnn_node_group/utils.h | 3 +-- .../core/providers/qnn/builder/qnn_quant_params_wrapper.h | 6 +++--- onnxruntime/core/providers/qnn/ort_api.h | 4 ++++ onnxruntime/core/providers/qnn/qnn_execution_provider.cc | 3 +++ 16 files changed, 32 insertions(+), 41 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc index 1bb764913b6ea..9fe1ff3da381a 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc @@ -2,14 +2,15 @@ // Licensed under the MIT License. 
#include "core/providers/qnn/builder/onnx_ctx_model_helper.h" -#include "core/graph/constants.h" -#include "core/providers/qnn/builder/qnn_utils.h" -#include "core/providers/qnn/builder/qnn_model.h" #include #include #include +#include "core/providers/qnn/ort_api.h" +#include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/builder/qnn_model.h" + namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h index f1ca1374be0b7..48d68aec55c38 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h @@ -7,10 +7,7 @@ #include #include "qnn_def.h" -#include "core/common/logging/logging.h" -#include "core/graph/graph_viewer.h" -#include "core/graph/model.h" -#include "core/framework/execution_provider.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/op_builder.h b/onnxruntime/core/providers/qnn/builder/op_builder.h index b729503320f05..0846275496ebf 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/op_builder.h @@ -3,8 +3,7 @@ #pragma once -#include "core/graph/graph_viewer.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc index 6ef17b40d274b..3e337f679056f 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc +++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc @@ -5,8 +5,6 @@ #include #include -#include - #include "op_builder_factory.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h 
b/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h index 9dd9bbaa08d64..1f0680782d76b 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h @@ -3,7 +3,7 @@ #pragma once -#include +#include namespace onnxruntime { namespace qnn { @@ -81,9 +81,9 @@ class QnnConfigsBuilder { BaseConfigType base_config_init_; CustomConfigType custom_config_init_; - InlinedVector custom_configs_; - InlinedVector configs_; - InlinedVector config_ptrs_; + std::vector custom_configs_; + std::vector configs_; + std::vector config_ptrs_; }; } // namespace qnn diff --git a/onnxruntime/core/providers/qnn/builder/qnn_def.h b/onnxruntime/core/providers/qnn/builder/qnn_def.h index ffd2dc9b11010..705212ae52c77 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_def.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_def.h @@ -9,8 +9,7 @@ #include #include #include -#include "core/graph/basic_types.h" -#include "core/common/common.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc index 79d13ba77ec86..8bafd17b2648e 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc @@ -6,11 +6,9 @@ #include #include "QnnOpDef.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group.h" -#include "core/framework/utils.h" -#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" -#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" #include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.h 
b/onnxruntime/core/providers/qnn/builder/qnn_model.h index 2e0935391ca78..489acaacde4fe 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.h @@ -3,16 +3,13 @@ #pragma once +#include #include -#include "core/common/status.h" -#include "core/framework/node_unit.h" -#include "core/graph/graph_viewer.h" -#include +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_def.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/qnn_backend_manager.h" -#include "core/session/onnxruntime_cxx_api.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc index 147c740313d7d..79e9e1408a9ca 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#include "qnn_model_wrapper.h" + #include #include #include @@ -8,9 +10,7 @@ #include #include -#include "qnn_model_wrapper.h" -#include "core/common/safeint.h" -#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h index 9e308aa33a560..8cd7360606d71 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h @@ -7,12 +7,10 @@ #include #include -#include "core/common/status.h" #include "QnnInterface.h" #include "qnn_def.h" -#include "core/common/logging/logging.h" -#include "core/framework/node_unit.h" -#include "core/graph/graph_viewer.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group.h index f9ef01411310f..276fbaae3b3c9 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group.h @@ -8,8 +8,7 @@ #include #include -#include "core/common/logging/logging.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc index 5548d7d37c378..93b2fca296389 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc @@ -4,8 +4,7 @@ #include #include -#include "core/graph/graph_viewer.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include 
"core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h index 0d11d21906ccb..c4cf4e8a20a92 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h @@ -7,8 +7,7 @@ #include #include -#include "core/graph/graph_viewer.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h index 23330f5616d73..01c15cf4bebe6 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h @@ -4,10 +4,10 @@ #pragma once #include #include -#include "QnnTypes.h" -#include "core/common/common.h" #include -#include "core/framework/node_unit.h" + +#include "core/providers/qnn/ort_api.h" +#include "QnnTypes.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/ort_api.h b/onnxruntime/core/providers/qnn/ort_api.h index 0c26d9c99c200..1e6f96b565385 100644 --- a/onnxruntime/core/providers/qnn/ort_api.h +++ b/onnxruntime/core/providers/qnn/ort_api.h @@ -8,6 +8,7 @@ #if BUILD_QNN_EP_STATIC #include "core/common/common.h" #include "core/common/status.h" +#include "core/common/safeint.h" #include "core/common/logging/logging.h" #include "core/common/logging/capture.h" #include "core/common/path_string.h" @@ -20,6 +21,9 @@ #include "core/framework/tensor_shape.h" #include "core/framework/node_unit.h" #include "core/framework/tensorprotoutils.h" +#include "core/framework/utils.h" +#include "core/graph/constants.h" +#include "core/graph/basic_types.h" #include "core/graph/model.h" #include 
"core/graph/graph_viewer.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 413db0489e37a..2cc954db5ad7f 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -189,6 +189,9 @@ qnn::ProfilingLevel QNNExecutionProvider::GetProfilingLevelFromETWLevel(unsigned QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_options_map, const ConfigOptions* config_options) : IExecutionProvider{onnxruntime::kQnnExecutionProvider} { + // TODO: Uncomment when QNN EP is built as a DLL + // InitProviderOrtApi(); + if (config_options) { disable_cpu_ep_fallback_ = config_options->GetConfigOrDefault( kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; From 2b1ea09dfa11cf482c72fc8f57328c64276f8358 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sun, 15 Dec 2024 01:26:46 -0800 Subject: [PATCH 24/64] Add GraphViewer::NodeProducesGraphOutput() to provider bridge --- .../core/providers/shared_library/provider_interfaces.h | 1 + .../core/providers/shared_library/provider_wrappedtypes.h | 3 +++ onnxruntime/core/session/provider_bridge_ort.cc | 3 +++ 3 files changed, 7 insertions(+) diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 474eddf5af310..3e4b81fb75773 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -1021,6 +1021,7 @@ struct ProviderHost { virtual const std::vector& GraphViewer__GetInputs(const GraphViewer* p) noexcept = 0; virtual const std::vector& GraphViewer__GetOutputs(const GraphViewer* p) noexcept = 0; + virtual bool GraphViewer__NodeProducesGraphOutput(const GraphViewer* p, const Node& node) = 0; virtual const 
std::unordered_set& GraphViewer__GetValueInfo(const GraphViewer* p) noexcept = 0; virtual const InitializedTensorSet& GraphViewer__GetAllInitializedTensors(const GraphViewer* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index e4ecab5740af9..c2d99e1d5786f 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -1087,6 +1087,9 @@ class GraphViewer final { const std::vector& GetInputs() const noexcept { return g_host->GraphViewer__GetInputs(this); } const std::vector& GetOutputs() const noexcept { return g_host->GraphViewer__GetOutputs(this); } + bool NodeProducesGraphOutput(const Node& node) const { + return g_host->GraphViewer__NodeProducesGraphOutput(this, node); + } const std::unordered_set& GetValueInfo() const noexcept { return g_host->GraphViewer__GetValueInfo(this); } const InitializedTensorSet& GetAllInitializedTensors() const noexcept { return g_host->GraphViewer__GetAllInitializedTensors(this); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index b6a9aa287e924..08943e5a4a2a6 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1263,6 +1263,9 @@ struct ProviderHostImpl : ProviderHost { const std::vector& GraphViewer__GetInputs(const GraphViewer* p) noexcept override { return p->GetInputs(); } const std::vector& GraphViewer__GetOutputs(const GraphViewer* p) noexcept override { return p->GetOutputs(); } + bool GraphViewer__NodeProducesGraphOutput(const GraphViewer* p, const Node& node) override { + return p->NodeProducesGraphOutput(node); + } const std::unordered_set& GraphViewer__GetValueInfo(const GraphViewer* p) noexcept override { return p->GetValueInfo(); } const InitializedTensorSet& 
GraphViewer__GetAllInitializedTensors(const GraphViewer* p) override { return p->GetAllInitializedTensors(); } From 421cd7814ae1ea5da93becfcee03691a37e95893 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sun, 15 Dec 2024 04:25:04 -0800 Subject: [PATCH 25/64] Replace use of InlinedVector with std::vector and fix newly discovered bug in qnn_configs_helper --- .../qnn/builder/qnn_backend_manager.cc | 28 ++++++------- .../qnn/builder/qnn_configs_helper.h | 26 +++++++----- .../providers/qnn/qnn_execution_provider.cc | 40 +++++++++---------- 3 files changed, 50 insertions(+), 44 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index 7f575257a77f7..6a1dd37d01b65 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -394,25 +394,25 @@ Status QnnBackendManager::CreateDevice() { // Set SoC Model. The *enum* Qnn_SocModel_t is deprecated and will not be updated in the future. Therefore, // must use the latest SDK documentation to get the SoC model of the latest HW. if (soc_model_ != QNN_SOC_MODEL_UNKNOWN) { - QnnHtpDevice_CustomConfig_t& custom_config = device_configs_builder.PushCustomConfig(); - custom_config.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; - custom_config.socModel = soc_model_; + gsl::not_null custom_config = device_configs_builder.PushCustomConfig(); + custom_config->option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + custom_config->socModel = soc_model_; - QnnDevice_Config_t& device_config = device_configs_builder.PushConfig(); - device_config.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - device_config.customConfig = &custom_config; + gsl::not_null device_config = device_configs_builder.PushConfig(); + device_config->option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + device_config->customConfig = custom_config; } // Set the minimum HTP architecture. 
The driver will use ops that are compatible with this minimum architecture. if (htp_arch_ != QNN_HTP_DEVICE_ARCH_NONE) { - QnnHtpDevice_CustomConfig_t& custom_config = device_configs_builder.PushCustomConfig(); - custom_config.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; - custom_config.arch.arch = htp_arch_; - custom_config.arch.deviceId = device_id_; - - QnnDevice_Config_t& device_config = device_configs_builder.PushConfig(); - device_config.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - device_config.customConfig = &custom_config; + gsl::not_null custom_config = device_configs_builder.PushCustomConfig(); + custom_config->option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + custom_config->arch.arch = htp_arch_; + custom_config->arch.deviceId = device_id_; + + gsl::not_null device_config = device_configs_builder.PushConfig(); + device_config->option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + device_config->customConfig = custom_config; } } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h b/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h index 1f0680782d76b..b581cd90537d9 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h @@ -3,6 +3,7 @@ #pragma once +#include #include namespace onnxruntime { @@ -49,9 +50,9 @@ class QnnConfigsBuilder { * * \return A reference to a default CustomConfigType object. */ - CustomConfigType& PushCustomConfig() { - custom_configs_.push_back(custom_config_init_); - return custom_configs_.back(); + gsl::not_null PushCustomConfig() { + custom_configs_.push_back(std::make_unique(custom_config_init_)); + return custom_configs_.back().get(); } /** @@ -60,15 +61,15 @@ class QnnConfigsBuilder { * * \return A reference to a default BaseConfigType object. 
*/ - BaseConfigType& PushConfig() { - configs_.push_back(base_config_init_); - BaseConfigType& config = configs_.back(); + gsl::not_null PushConfig() { + configs_.push_back(std::make_unique(base_config_init_)); + BaseConfigType* config = configs_.back().get(); // Add pointer to this new config to the list of config pointers. if (IsNullTerminated()) { - config_ptrs_.back() = &config; // Replace last nullptr entry. + config_ptrs_.back() = config; // Replace last nullptr entry. } else { - config_ptrs_.push_back(&config); + config_ptrs_.push_back(config); } return config; @@ -81,8 +82,13 @@ class QnnConfigsBuilder { BaseConfigType base_config_init_; CustomConfigType custom_config_init_; - std::vector custom_configs_; - std::vector configs_; + + // Store elements of unique_ptrs instead of by value because std::vector reallocation would change the + // location of elements in memory. BaseConfigType objects may contain pointers to CustomConfigType objects, + // so we need to make sure that pointers to these objects are stable in memory. 
+ std::vector> custom_configs_; + std::vector> configs_; + std::vector config_ptrs_; }; diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 2cc954db5ad7f..f4ce648251415 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -838,34 +838,34 @@ Status QNNExecutionProvider::CreateComputeFunc(std::vector& nod void QNNExecutionProvider::InitQnnGraphConfigs(qnn::QnnConfigsBuilder& configs_builder) const { if (qnn_backend_manager_->GetQnnBackendType() == qnn::QnnBackendType::HTP) { if (htp_graph_finalization_opt_mode_ != qnn::HtpGraphFinalizationOptimizationMode::kDefault) { - QnnHtpGraph_CustomConfig_t& htp_graph_opt_config = configs_builder.PushCustomConfig(); - htp_graph_opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - htp_graph_opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - htp_graph_opt_config.optimizationOption.floatValue = static_cast(htp_graph_finalization_opt_mode_); - - QnnGraph_Config_t& graph_opt_config = configs_builder.PushConfig(); - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &htp_graph_opt_config; + gsl::not_null htp_graph_opt_config = configs_builder.PushCustomConfig(); + htp_graph_opt_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + htp_graph_opt_config->optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + htp_graph_opt_config->optimizationOption.floatValue = static_cast(htp_graph_finalization_opt_mode_); + + gsl::not_null graph_opt_config = configs_builder.PushConfig(); + graph_opt_config->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config->customConfig = htp_graph_opt_config; } if (vtcm_size_in_mb_ > 0) { - QnnHtpGraph_CustomConfig_t& htp_graph_opt_config_vtcm = configs_builder.PushCustomConfig(); - 
htp_graph_opt_config_vtcm.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - htp_graph_opt_config_vtcm.vtcmSizeInMB = static_cast(vtcm_size_in_mb_); + gsl::not_null htp_graph_opt_config_vtcm = configs_builder.PushCustomConfig(); + htp_graph_opt_config_vtcm->option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + htp_graph_opt_config_vtcm->vtcmSizeInMB = static_cast(vtcm_size_in_mb_); - QnnGraph_Config_t& graph_opt_config_vtcm = configs_builder.PushConfig(); - graph_opt_config_vtcm.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config_vtcm.customConfig = &htp_graph_opt_config_vtcm; + gsl::not_null graph_opt_config_vtcm = configs_builder.PushConfig(); + graph_opt_config_vtcm->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config_vtcm->customConfig = htp_graph_opt_config_vtcm; } if (enable_HTP_FP16_precision_) { - QnnHtpGraph_CustomConfig_t& htp_graph_precision_config = configs_builder.PushCustomConfig(); - htp_graph_precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; - htp_graph_precision_config.precision = QNN_PRECISION_FLOAT16; + gsl::not_null htp_graph_precision_config = configs_builder.PushCustomConfig(); + htp_graph_precision_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; + htp_graph_precision_config->precision = QNN_PRECISION_FLOAT16; - QnnGraph_Config_t& graph_precision_config = configs_builder.PushConfig(); - graph_precision_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_precision_config.customConfig = &htp_graph_precision_config; + gsl::not_null graph_precision_config = configs_builder.PushConfig(); + graph_precision_config->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_precision_config->customConfig = htp_graph_precision_config; } } } From d94e6f7bf839738057ee6e43ddd2a658fc1083e7 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sun, 15 Dec 2024 04:26:50 -0800 Subject: [PATCH 26/64] Eliminate use of qmath.h by introducing new quantization utils for QNN --- .../builder/opbuilder/simple_op_builder.cc | 24 +-- 
.../core/providers/qnn/builder/qnn_utils.cc | 173 ++++++++++++++---- .../core/providers/qnn/builder/qnn_utils.h | 59 +++++- 3 files changed, 203 insertions(+), 53 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc index f23b6b240389d..307ab31a09651 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc @@ -1,15 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" -#include "core/util/qmath.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { @@ -259,15 +254,16 @@ Status ProcessAlphaAttributeAsInput(QnnModelWrapper& qnn_model_wrapper, // Check LeakyRelu input 0 to see if it's quantized tensor bool is_quantized_tensor = node_unit.Outputs()[0].quant_param.has_value(); if (is_quantized_tensor) { - float scale; - uint8_t zero_point; - int64_t num_of_elements = 1; - concurrency::ThreadPool* thread_pool = nullptr; - GetQuantizationParameter(&tensor_data.alpha, num_of_elements, scale, zero_point, thread_pool); - unpacked_data.resize(1); - ParQuantizeLinearStd(&tensor_data.alpha, unpacked_data.data(), num_of_elements, scale, zero_point, thread_pool); - quantize_param = QnnQuantParamsWrapper(scale, static_cast(zero_point)); qnn_data_type = QNN_DATATYPE_UFIXED_POINT_8; + std::array scales = {1.0f}; + std::array offsets = {0}; + std::array shape = {1}; + auto float_data = gsl::make_span(&tensor_data.alpha, 1); + ORT_RETURN_IF_ERROR(qnn::utils::GetDataQuantParams(float_data, shape, 
scales, offsets, qnn_data_type)); + + unpacked_data.resize(1); + ORT_RETURN_IF_ERROR(qnn::utils::QuantizeData(float_data, shape, scales, offsets, unpacked_data, qnn_data_type)); + quantize_param = QnnQuantParamsWrapper(scales[0], static_cast(offsets[0])); } else { const auto& inputs = node_unit.Inputs(); TensorInfo input_info = {}; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index 8e7017c063bc0..64b62779263ad 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -523,39 +523,14 @@ bool OnnxDataTypeToQnnDataType(const int32_t onnx_data_type, Qnn_DataType_t& qnn } std::pair CheckMinMax(float rmin, float rmax) { - // Ensure a minimum range of 0.0001 (required by QNN) - rmax = std::max(rmax, rmin + 0.0001f); - // Both QNN and ORT require the range to include 0.0f rmin = std::min(rmin, 0.0f); rmax = std::max(rmax, 0.0f); - return std::make_pair(rmin, rmax); -} + // Ensure a minimum range of 0.0001 (required by QNN) + rmax = std::max(rmax, rmin + 0.0001f); -template -Status GetQminQmax(const Qnn_DataType_t qnn_data_type, - T& qmin, - T& qmax) { - if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_8) { - qmin = static_cast(std::numeric_limits::min()); - qmax = static_cast(std::numeric_limits::max()); - } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_8) { - qmin = static_cast(std::numeric_limits::min()); - qmax = static_cast(std::numeric_limits::max()); - } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16) { - qmin = static_cast(std::numeric_limits::min()); - qmax = static_cast(std::numeric_limits::max()); - } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) { - qmin = static_cast(std::numeric_limits::min()); - qmax = static_cast(std::numeric_limits::max()); - } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { - qmin = static_cast(std::numeric_limits::min()); - qmax = 
static_cast(std::numeric_limits::max()); - } else { - ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", qnn_data_type); - } - return Status::OK(); + return std::make_pair(rmin, rmax); } inline float RoundHalfToEven(float input) { @@ -579,20 +554,22 @@ Status GetQuantParams(float rmin, rmin = -abs_max; } - float qmin = 0.0f; - float qmax = 255.0f; - ORT_RETURN_IF_ERROR(GetQminQmax(qnn_data_type, qmin, qmax)); + double rmin_dbl = static_cast(rmin); + double rmax_dbl = static_cast(rmax); + double qmin = 0.0; + double qmax = 0.0; + ORT_RETURN_IF_ERROR(GetQminQmax(qnn_data_type, qmin, qmax, symmetric)); - scale = (rmax - rmin) / (qmax - qmin); - float initial_zero_point = 0.0f; + double scale_dbl = (rmax_dbl - rmin_dbl) / (qmax - qmin); + double initial_zero_point = 0.0; if (symmetric) { - initial_zero_point = std::round(rmin + rmax) / 2; + initial_zero_point = std::round(rmin_dbl + rmax_dbl) / 2; } else { - initial_zero_point = qmin - (rmin / scale); + initial_zero_point = qmin - (rmin_dbl / scale_dbl); } - zero_point = static_cast(qnn::utils::RoundHalfToEven(Saturate(qmax, qmin, initial_zero_point))); - // To match QNN quantization definition - zero_point = 0 - zero_point; + zero_point = static_cast(RoundHalfToEven(static_cast(Saturate(qmax, qmin, initial_zero_point)))); + zero_point = -zero_point; // Negate to match QNN quantization definition. 
+ scale = static_cast(scale_dbl); return Status::OK(); } @@ -614,6 +591,126 @@ Status Quantize(const double double_value, return Status::OK(); } +size_t ShapeSizeCalc(gsl::span shape, size_t start, size_t end) { + size_t size = 1; + for (size_t i = start; i < end; i++) { + size *= shape[i]; + } + return size; +} + +Status GetDataQuantParams(gsl::span data, gsl::span shape, + /*out*/ gsl::span scales, /*out*/ gsl::span offsets, + Qnn_DataType_t data_type, bool symmetric, std::optional axis) { + const size_t num_dims = shape.size(); + const size_t num_elems = ShapeSizeCalc(shape, 0, num_dims); + ORT_RETURN_IF_NOT(num_elems == data.size(), "Shape mismatch with data to quantize"); + + size_t block_count = 1; + size_t broadcast_dim = 1; + size_t block_size = num_elems; + + if (axis.has_value()) { + size_t axis_no_neg = *axis < 0 ? static_cast(*axis) + num_dims : static_cast(*axis); + block_count = ShapeSizeCalc(shape, 0, axis_no_neg); + broadcast_dim = shape[axis_no_neg]; + block_size = ShapeSizeCalc(shape, axis_no_neg + 1, num_dims); + } + + ORT_RETURN_IF_NOT(scales.size() == broadcast_dim, "Unexpected size of scales output buffer"); + ORT_RETURN_IF_NOT(offsets.size() == broadcast_dim, "Unexpected size of offsets output buffer"); + + size_t i = 0; + for (size_t n = 0; n < block_count; n++) { + for (size_t bd = 0; bd < broadcast_dim; bd++) { + float rmin = std::numeric_limits::max(); + float rmax = std::numeric_limits::lowest(); + for (size_t j = 0; j < block_size; j++) { + rmin = std::min(rmin, data[i]); + rmax = std::max(rmax, data[i]); + i++; + } + + scales[bd] = 1.0f; + offsets[bd] = 0; + ORT_RETURN_IF_ERROR(GetQuantParams(rmin, rmax, data_type, scales[bd], offsets[bd], symmetric)); + } + } + + assert(i == data.size()); + return Status::OK(); +} + +Status QuantizeData(gsl::span data, gsl::span shape, + gsl::span scales, gsl::span offsets, + /*out*/ gsl::span quant_bytes, Qnn_DataType_t data_type, + std::optional axis) { + const size_t num_dims = shape.size(); + 
const size_t num_elems = ShapeSizeCalc(shape, 0, num_dims); + ORT_RETURN_IF_NOT(num_elems == data.size(), "Shape mismatch with data to quantize"); + size_t expected_num_quant_bytes = GetElementSizeByType(data_type) * data.size(); + ORT_RETURN_IF_NOT(quant_bytes.size() == expected_num_quant_bytes, + "Cannot quantize data because output buffer is not the correct size"); + + size_t block_count = 1; + size_t broadcast_dim = 1; + size_t block_size = num_elems; + + if (axis.has_value()) { + size_t axis_no_neg = *axis < 0 ? static_cast(*axis) + num_dims : static_cast(*axis); + block_count = ShapeSizeCalc(shape, 0, axis_no_neg); + broadcast_dim = shape[axis_no_neg]; + block_size = ShapeSizeCalc(shape, axis_no_neg + 1, num_dims); + } + + ORT_RETURN_IF_NOT(scales.size() == broadcast_dim, "Unexpected size of scales output buffer"); + ORT_RETURN_IF_NOT(offsets.size() == broadcast_dim, "Unexpected size of offsets output buffer"); + + size_t i = 0; + for (size_t n = 0; n < block_count; n++) { + for (size_t bd = 0; bd < broadcast_dim; bd++) { + switch (data_type) { + case QNN_DATATYPE_SFIXED_POINT_8: { + auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(int8_t)], sizeof(int8_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData(input_span, scales[bd], offsets[bd], output_span)); + break; + } + case QNN_DATATYPE_UFIXED_POINT_8: { + auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(uint8_t)], sizeof(uint8_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData(input_span, scales[bd], offsets[bd], output_span)); + break; + } + case QNN_DATATYPE_SFIXED_POINT_16: { + auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(int16_t)], sizeof(int16_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData(input_span, scales[bd], offsets[bd], output_span)); + break; + } + case QNN_DATATYPE_UFIXED_POINT_16: { 
+ auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(uint16_t)], sizeof(uint16_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData(input_span, scales[bd], offsets[bd], output_span)); + break; + } + case QNN_DATATYPE_SFIXED_POINT_32: { + auto input_span = gsl::make_span(&data[i], block_size); + auto output_span = gsl::make_span(&quant_bytes[i * sizeof(int32_t)], sizeof(int32_t) * block_size); + ORT_RETURN_IF_ERROR(QuantizeData(input_span, scales[bd], offsets[bd], output_span)); + break; + } + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported quantization data type for QuantizeData"); + } + i += block_size; + } + } + assert(i == data.size()); + + return Status::OK(); +} + static bool GetType(const NodeArg& node_arg, int32_t& type, const logging::Logger& logger) { type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED; const auto* type_proto = node_arg.TypeAsProto(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index 578f50ba895cf..1a30b10105fbf 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -74,7 +74,30 @@ static bool ArrayHasString(const std::array& strings, std:: std::pair CheckMinMax(float rmin, float rmax); template -Status GetQminQmax(const Qnn_DataType_t qnn_data_type, T& qmin, T& qmax); +Status GetQminQmax(const Qnn_DataType_t qnn_data_type, + T& qmin, + T& qmax, + bool symmetric = false) { + if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_8) { + qmin = static_cast(std::numeric_limits::min() + static_cast(symmetric)); + qmax = static_cast(std::numeric_limits::max()); + } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_8) { + qmin = static_cast(std::numeric_limits::min()); + qmax = static_cast(std::numeric_limits::max()); + } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16) { + qmin = 
static_cast(std::numeric_limits::min() + static_cast(symmetric)); + qmax = static_cast(std::numeric_limits::max()); + } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) { + qmin = static_cast(std::numeric_limits::min()); + qmax = static_cast(std::numeric_limits::max()); + } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { + qmin = static_cast(std::numeric_limits::min() + static_cast(symmetric)); + qmax = static_cast(std::numeric_limits::max()); + } else { + ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", qnn_data_type); + } + return Status::OK(); +} template inline T Saturate(const T qmax, @@ -104,6 +127,40 @@ Status Quantize(const double double_value, const Qnn_DataType_t qnn_data_type, int& quant_value); +size_t ShapeSizeCalc(gsl::span shape, size_t start, size_t end); + +Status GetDataQuantParams(gsl::span data, gsl::span shape, + /*out*/ gsl::span scales, /*out*/ gsl::span offsets, + Qnn_DataType_t data_type, bool symmetric = false, + std::optional axis = std::nullopt); + +Status QuantizeData(gsl::span data, gsl::span shape, + gsl::span scales, gsl::span offsets, + /*out*/ gsl::span quant_bytes, Qnn_DataType_t data_type, + std::optional axis = std::nullopt); + +template +inline Status QuantizeData(gsl::span data, float scale, int32_t offset, + /*out*/ gsl::span quant_bytes) { + const size_t num_elems = data.size(); + const size_t expected_output_bytes = sizeof(QuantType) * num_elems; + ORT_RETURN_IF_NOT(expected_output_bytes == quant_bytes.size(), + "Output buffer is not large enough to hold quantized bytes."); + const double clip_min = static_cast(std::numeric_limits::lowest()); + const double clip_max = static_cast(std::numeric_limits::max()); + + QuantType* output = reinterpret_cast(quant_bytes.data()); + for (size_t i = 0; i < num_elems; ++i) { + const double scale_dbl = static_cast(scale); + const double offset_dbl = static_cast(offset); + double float_val = std::nearbyint(static_cast(data[i]) / scale_dbl) - offset_dbl; + 
float_val = std::max(float_val, clip_min); + float_val = std::min(float_val, clip_max); + output[i] = static_cast(float_val); + } + return Status::OK(); +} + // Re-writes a buffer of packed 4-bit elements to a buffer of unpacked 8-bit elements. // QNN requires that 4-bit weights are unpacked to 8-bit. template From 4eb1e8088f106bbb2f8721c021329a349b3d738f Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sun, 15 Dec 2024 04:43:01 -0800 Subject: [PATCH 27/64] Move includes into qnn/ort_api.h --- .../qnn/builder/opbuilder/argmax_min_op_builder.cc | 5 +---- .../providers/qnn/builder/opbuilder/base_op_builder.cc | 6 ------ .../providers/qnn/builder/opbuilder/base_op_builder.h | 2 +- .../qnn/builder/opbuilder/batch_norm_op_builder.cc | 6 +----- .../providers/qnn/builder/opbuilder/cast_op_builder.cc | 3 +-- .../providers/qnn/builder/opbuilder/clip_op_builder.cc | 4 +--- .../providers/qnn/builder/opbuilder/conv_op_builder.cc | 6 +----- .../providers/qnn/builder/opbuilder/expand_op_builder.cc | 5 +---- .../providers/qnn/builder/opbuilder/gather_op_builder.cc | 5 +---- .../providers/qnn/builder/opbuilder/gemm_op_builder.cc | 5 +---- .../qnn/builder/opbuilder/instance_norm_op_builder.cc | 7 +------ .../qnn/builder/opbuilder/layer_norm_op_builder.cc | 7 +------ .../providers/qnn/builder/opbuilder/lrn_op_builder.cc | 2 -- .../providers/qnn/builder/opbuilder/pad_op_builder.cc | 6 +----- .../providers/qnn/builder/opbuilder/pool_op_builder.cc | 7 +------ .../providers/qnn/builder/opbuilder/reduce_op_builder.cc | 4 +--- .../qnn/builder/opbuilder/reshape_op_builder.cc | 5 +---- .../providers/qnn/builder/opbuilder/resize_op_builder.cc | 9 ++------- .../providers/qnn/builder/opbuilder/slice_op_builder.cc | 6 +----- .../qnn/builder/opbuilder/softmax_op_builder.cc | 6 +----- .../providers/qnn/builder/opbuilder/split_op_builder.cc | 5 +---- .../providers/qnn/builder/opbuilder/tile_op_builder.cc | 5 +---- onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc | 1 - 
.../qnn/builder/opbuilder/transpose_op_builder.cc | 2 +- .../qnn/builder/qnn_node_group/conv_activation_fusion.cc | 4 ++-- .../qnn/builder/qnn_node_group/conv_activation_fusion.h | 2 +- .../providers/qnn/builder/qnn_node_group/dq_q_fusion.cc | 4 ++-- .../providers/qnn/builder/qnn_node_group/dq_q_fusion.h | 3 +-- .../qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc | 4 ++-- .../qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h | 3 +-- .../qnn/builder/qnn_node_group/qnn_node_group.cc | 3 +-- onnxruntime/core/providers/qnn/ort_api.h | 2 ++ 32 files changed, 34 insertions(+), 110 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc index 192c9496f0999..76762c82ece2e 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc @@ -1,14 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/common.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc index 7690427416770..7db4f8c0c609d 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc @@ -4,12 +4,6 @@ #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include - -#include "core/framework/tensorprotoutils.h" -#include "core/providers/cpu/tensor/transpose.h" -#include "core/common/safeint.h" - namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h index b2bb3f043eecd..0c400c4a5abc9 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h @@ -3,11 +3,11 @@ #pragma once +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder.h" #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h" -#include "core/framework/allocator.h" #include "QnnOpDef.h" diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc index 9c7f1d374e5b7..5be54729a6a86 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc +++ 
b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc @@ -5,15 +5,11 @@ #include #include -#include "core/providers/common.h" -#include "core/framework/float16.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { class BatchNormOpBuilder : public BaseOpBuilder { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc index d3bdee02437e4..5b3dd223a9f3f 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc @@ -4,12 +4,11 @@ #include #include +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc index aa6080eb1195d..a86354edf7e8e 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc @@ -4,13 +4,11 @@ #include #include -#include "core/providers/common.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { class ClipOpBuilder : public BaseOpBuilder { diff --git 
a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc index f0c6f53affecd..5f723e2e262be 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc @@ -1,15 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc index 20978f41b529b..1beab0ed8b735 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc @@ -1,13 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/common.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc index df02d12bd59c9..a3ada04b7b017 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc @@ -2,13 +2,10 @@ // Licensed under the MIT License. #include -#include "core/providers/common.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc index 20f2f4383044c..8a36aa192313f 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc @@ -1,13 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/common.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc index 53bc93e2fa832..80d96f513df63 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc @@ -1,15 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc index b0394be15aba2..7f830d68999a1 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc @@ -2,15 +2,10 @@ // Licensed under the MIT License. 
#include -#include "core/providers/common.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc index dbb29557cccc4..bc2b7a01c0779 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc @@ -5,8 +5,6 @@ #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" #include "QnnOpDef.h" // From QNN SDK: contains QNN constants (e.g., op names, param values). diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc index 3035da2723907..7d40a3489e550 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc @@ -1,13 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/common.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/providers/cpu/tensor/slice_helper.h" -#include "core/common/safeint.h" - -#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc index 0ed11bed30929..4923371ce9ee2 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc @@ -1,15 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc index a2eeeee4453e4..a74fcfd843551 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc @@ -6,10 +6,8 @@ #include #include -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" -#include "core/providers/common.h" #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include 
"core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/qnn_utils.h" diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc index c374a3c64b350..7d12a6843d4a6 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc @@ -1,14 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc index 6b1088e488c31..5298b20033115 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc @@ -5,15 +5,10 @@ #include #include -#include "core/providers/common.h" -#include "core/framework/tensorprotoutils.h" -#include "core/providers/qnn/builder/qnn_model_wrapper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/providers/cpu/tensor/slice_helper.h" -#include "core/common/safeint.h" - #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/op_builder_factory.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc 
b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc index e383e71d2a497..3096967a5f166 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc @@ -1,16 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/framework/tensorprotoutils.h" - -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc index bc5339d90660e..e7d37937d527f 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc @@ -1,14 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/common.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc index f435b1d6d802f..de7277390f9bb 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc @@ -1,14 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/common/safeint.h" - -#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc index e66c4cd350235..1d518c3ed5359 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc @@ -1,14 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/common.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/common/safeint.h" - -#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc index 9cb8f91a9db0b..b2891022e73a5 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" -#include "core/framework/utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc index 1290a012d5902..7bc43f04d5361 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc @@ -4,10 +4,10 @@ #include #include +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" #include "base_op_builder.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc index 76316250a88ad..789811a423884 100644 --- 
a/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc @@ -6,8 +6,8 @@ #include #include #include -#include "core/graph/graph_utils.h" -#include "core/framework/node_unit.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group/utils.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.h index b604b25e943e6..a211c86c2301e 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.h @@ -9,7 +9,7 @@ #include #include -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc index 17af5725a01ee..3af2fdd1f0276 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc @@ -6,8 +6,8 @@ #include #include #include -#include "core/graph/graph_utils.h" -#include "core/framework/node_unit.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group/utils.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h index 90fe44c3af059..d3d552bc172ec 100644 --- 
a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h @@ -7,8 +7,7 @@ #include #include -#include "core/common/common.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc index aceaf0399a6cb..0b2d7451553e7 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc @@ -6,8 +6,8 @@ #include #include #include -#include "core/graph/graph_utils.h" -#include "core/framework/node_unit.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h index 3b67f13492a46..0a1b16d24ffcd 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h @@ -7,8 +7,7 @@ #include #include -#include "core/common/common.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc index 9fb9e815321c0..56413b781b246 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc +++ 
b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc @@ -10,8 +10,7 @@ #include #include #include -#include "core/graph/graph_utils.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" diff --git a/onnxruntime/core/providers/qnn/ort_api.h b/onnxruntime/core/providers/qnn/ort_api.h index 1e6f96b565385..494d1fbeedb3d 100644 --- a/onnxruntime/core/providers/qnn/ort_api.h +++ b/onnxruntime/core/providers/qnn/ort_api.h @@ -6,6 +6,7 @@ #define BUILD_QNN_EP_STATIC 1 #if BUILD_QNN_EP_STATIC +#include "onnx/defs/data_type_utils.h" #include "core/common/common.h" #include "core/common/status.h" #include "core/common/safeint.h" @@ -14,6 +15,7 @@ #include "core/common/path_string.h" #include "core/platform/env.h" #include "core/framework/data_types.h" +#include "core/framework/float16.h" #include "core/framework/run_options.h" #include "core/framework/execution_provider.h" #include "core/framework/model_metadef_id_generator.h" From d86fb6c005856d8a4f2c9a7dadafc4ec7b929ef8 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sun, 15 Dec 2024 04:56:46 -0800 Subject: [PATCH 28/64] Add TensorProto::has_data_type() to provider bridge --- onnxruntime/core/providers/shared_library/provider_interfaces.h | 1 + .../core/providers/shared_library/provider_wrappedtypes.h | 1 + onnxruntime/core/session/provider_bridge_ort.cc | 1 + 3 files changed, 3 insertions(+) diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 3e4b81fb75773..96cd72c91b0e6 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -472,6 +472,7 @@ struct ProviderHost { virtual bool TensorProto__has_raw_data(const 
ONNX_NAMESPACE::TensorProto* p) = 0; virtual const std::string& TensorProto__raw_data(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual std::string* TensorProto__mutable_raw_data(ONNX_NAMESPACE::TensorProto* p) = 0; + virtual bool TensorProto__has_data_type(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual int32_t TensorProto__data_type(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual void TensorProto__set_data_type(ONNX_NAMESPACE::TensorProto* p, int32_t type) = 0; virtual void TensorProto__CopyFrom(ONNX_NAMESPACE::TensorProto* p, const ONNX_NAMESPACE::TensorProto* other) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index c2d99e1d5786f..03a0b4ea99524 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -237,6 +237,7 @@ struct TensorProto final { const std::string& raw_data() const { return g_host->TensorProto__raw_data(this); } std::string* mutable_raw_data() { return g_host->TensorProto__mutable_raw_data(this); } + bool has_data_type() const { return g_host->TensorProto__has_data_type(this); } int32_t data_type() const { return g_host->TensorProto__data_type(this); } void set_data_type(int32_t type) { return g_host->TensorProto__set_data_type(this, type); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 08943e5a4a2a6..b18d4c6ed23ef 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -594,6 +594,7 @@ struct ProviderHostImpl : ProviderHost { const std::string& TensorProto__raw_data(const ONNX_NAMESPACE::TensorProto* p) override { return p->raw_data(); } std::string* TensorProto__mutable_raw_data(ONNX_NAMESPACE::TensorProto* p) override { return p->mutable_raw_data(); } + bool TensorProto__has_data_type(const 
ONNX_NAMESPACE::TensorProto* p) override { return p->has_data_type(); } int32_t TensorProto__data_type(const ONNX_NAMESPACE::TensorProto* p) override { return p->data_type(); } void TensorProto__set_data_type(ONNX_NAMESPACE::TensorProto* p, int32_t type) override { p->set_data_type(type); } From 187e3b9e0a38c40fe9ef8db73ab020e02774c2a2 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sun, 15 Dec 2024 06:02:35 -0800 Subject: [PATCH 29/64] Checkpoint: updating usage of provider bridge in ep --- .../qnn/builder/qnn_backend_manager.cc | 12 ++-- .../core/providers/qnn/builder/qnn_model.cc | 2 +- .../core/providers/qnn/builder/qnn_utils.cc | 31 +++++----- onnxruntime/core/providers/qnn/ort_api.h | 2 +- .../providers/qnn/qnn_execution_provider.cc | 62 ++++++++++++------- .../providers/qnn/qnn_provider_factory.cc | 22 ++----- .../providers/shared_library/provider_api.h | 8 +++ .../provider_bridge_provider.cc | 3 + .../shared_library/provider_interfaces.h | 7 +++ .../shared_library/provider_wrappedtypes.h | 1 + .../core/session/provider_bridge_ort.cc | 24 +++++++ 11 files changed, 111 insertions(+), 63 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index 6a1dd37d01b65..5a9abe43fec72 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -246,12 +246,12 @@ void QnnLogging(const char* format, const auto data_type = ::onnxruntime::logging::DataType::SYSTEM; if (logger.OutputIsEnabled(severity, data_type)) { - ::onnxruntime::logging::Capture(logger, - severity, - ::onnxruntime::logging::Category::onnxruntime, - data_type, - ORT_WHERE) - .ProcessPrintf(format, argument_parameter); + auto log_capture = ::onnxruntime::logging::Capture::Create(logger, + severity, + ::onnxruntime::logging::Category::onnxruntime, + data_type, + ORT_WHERE); + log_capture.ProcessPrintf(format, 
argument_parameter); } } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc index 8bafd17b2648e..de8fa816efdb1 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc @@ -101,7 +101,7 @@ Status QnnModel::ComposeGraph(const GraphViewer& graph_viewer, // valid throughout the lifetime of the ModelBuilder std::vector> node_unit_holder; std::unordered_map node_unit_map; - std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(graph_viewer, logger); + std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(&graph_viewer, logger); // This name must be same with the EPContext node name const auto& graph_name = fused_node.Name(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index 64b62779263ad..50a151292d9c7 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -731,7 +731,7 @@ NodeAttrHelper::NodeAttrHelper(const NodeUnit& node_unit) float NodeAttrHelper::Get(const std::string& key, float def_val) const { if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - return entry->second.f(); + return entry->second().f(); } return def_val; @@ -739,7 +739,7 @@ float NodeAttrHelper::Get(const std::string& key, float def_val) const { int32_t NodeAttrHelper::Get(const std::string& key, int32_t def_val) const { if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - return narrow(entry->second.i()); + return narrow(entry->second().i()); } return def_val; @@ -747,7 +747,7 @@ int32_t NodeAttrHelper::Get(const std::string& key, int32_t def_val) const { uint32_t NodeAttrHelper::Get(const std::string& key, uint32_t def_val) const { if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - return 
narrow(entry->second.i()); + return narrow(entry->second().i()); } return def_val; @@ -755,7 +755,7 @@ uint32_t NodeAttrHelper::Get(const std::string& key, uint32_t def_val) const { int64_t NodeAttrHelper::Get(const std::string& key, int64_t def_val) const { if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - return entry->second.i(); + return entry->second().i(); } return def_val; @@ -763,7 +763,7 @@ int64_t NodeAttrHelper::Get(const std::string& key, int64_t def_val) const { const std::string& NodeAttrHelper::Get(const std::string& key, const std::string& def_val) const { if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - return entry->second.s(); + return entry->second().s(); } return def_val; @@ -771,7 +771,7 @@ const std::string& NodeAttrHelper::Get(const std::string& key, const std::string std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - const auto& values = entry->second.ints(); + const auto& values = entry->second().ints(); const int64_t* cbegin = values.data(); const int64_t* cend = values.data() + values.size(); std::vector v; @@ -786,7 +786,7 @@ std::vector NodeAttrHelper::Get(const std::string& key, const std::vect std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - const auto& values = entry->second.ints(); + const auto& values = entry->second().ints(); const int64_t* cbegin = values.data(); const int64_t* cend = values.data() + values.size(); std::vector v; @@ -801,7 +801,7 @@ std::vector NodeAttrHelper::Get(const std::string& key, const std::vec std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - const auto& values = 
entry->second.ints(); + const auto& values = entry->second().ints(); const int64_t* cbegin = values.data(); const int64_t* cend = values.data() + values.size(); return std::vector{cbegin, cend}; @@ -812,7 +812,7 @@ std::vector NodeAttrHelper::Get(const std::string& key, const std::vect std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - const auto& values = entry->second.floats(); + const auto& values = entry->second().floats(); const float* cbegin = values.data(); const float* cend = values.data() + values.size(); return std::vector{cbegin, cend}; @@ -824,7 +824,7 @@ std::vector NodeAttrHelper::Get(const std::string& key, const std::vector std::optional NodeAttrHelper::GetFloat(const std::string& key) const { std::optional result; if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - result = entry->second.f(); + result = entry->second().f(); } return result; @@ -833,7 +833,7 @@ std::optional NodeAttrHelper::GetFloat(const std::string& key) const { std::optional NodeAttrHelper::GetInt64(const std::string& key) const { std::optional result; if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - result = entry->second.i(); + result = entry->second().i(); } return result; @@ -842,7 +842,7 @@ std::optional NodeAttrHelper::GetInt64(const std::string& key) const { std::optional> NodeAttrHelper::GetFloats(const std::string& key) const { std::optional> result; if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - const auto& values = entry->second.floats(); + const auto& values = entry->second().floats(); const float* cbegin = values.data(); const float* cend = values.data() + values.size(); result = std::vector(cbegin, cend); @@ -854,7 +854,7 @@ std::optional> NodeAttrHelper::GetFloats(const std::string& k std::optional> NodeAttrHelper::GetInt64s(const std::string& key) 
const { std::optional> result; if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - const auto& values = entry->second.ints(); + const auto& values = entry->second().ints(); const int64_t* cbegin = values.data(); const int64_t* cend = values.data() + values.size(); result = std::vector(cbegin, cend); @@ -866,15 +866,16 @@ std::optional> NodeAttrHelper::GetInt64s(const std::string& std::optional NodeAttrHelper::GetString(const std::string& key) const { std::optional result; if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { - result = entry->second.s(); + result = entry->second().s(); } return result; } bool NodeAttrHelper::HasAttr(const std::string& key) const { - return Contains(node_attributes_, key); + return node_attributes_.find(key) != node_attributes_.end(); } + static bool GetClipMinMaxImpl(const GraphViewer& graph_viewer, const Node& node, float& min, float& max, const logging::Logger& logger) { const auto& node_name = node.Name(); diff --git a/onnxruntime/core/providers/qnn/ort_api.h b/onnxruntime/core/providers/qnn/ort_api.h index 494d1fbeedb3d..6fb346d2349a0 100644 --- a/onnxruntime/core/providers/qnn/ort_api.h +++ b/onnxruntime/core/providers/qnn/ort_api.h @@ -3,7 +3,7 @@ #pragma once -#define BUILD_QNN_EP_STATIC 1 +#define BUILD_QNN_EP_STATIC 0 #if BUILD_QNN_EP_STATIC #include "onnx/defs/data_type_utils.h" diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index f4ce648251415..c8ec3098b7ce0 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -243,8 +243,10 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio // separate out the profiling level for ETW in case it gets disabled later when we extract the events // set to invalid to indicate that ETW is no enabled when we setup QNN qnn::ProfilingLevel 
profiling_level_etw = qnn::ProfilingLevel::INVALID; - const Env& env = Env::Default(); - // const Env& env = GetDefaultEnv(); + +// TODO: Re-enable ETW after QNN is a DLL +#if 0 + const Env& env = GetDefaultEnv(); auto& provider = env.GetTelemetryProvider(); if (provider.IsEnabled()) { auto level = provider.Level(); @@ -255,6 +257,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } } } +#endif // In case ETW gets disabled later auto profiling_level_pos = provider_options_map.find(PROFILING_LEVEL); @@ -492,9 +495,10 @@ static void LogNodeSupport(const logging::Logger& logger, oss << "\tREASON : " << support_status.ErrorMessage() << std::endl; } - logging::Capture(logger, log_severity, logging::Category::onnxruntime, - log_data_type, call_site) - .Stream() + std::unique_ptr log_capture = logging::Capture::Create(logger, log_severity, + logging::Category::onnxruntime, + log_data_type, call_site); + log_capture->Stream() << (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for " << num_nodes << " nodes in " << qnn_node_group.Type() << " (" << qnn_node_group.GetTargetNodeUnit()->OpType() << ") :" << std::endl @@ -598,11 +602,11 @@ static bool EpSharedContextsHasAllGraphs(const std::vectorName(); + const std::string& graph_name = ep_context_node.Name(); bool has_shared_qnn_model = SharedContext::GetInstance().HasQnnModel(graph_name); if (!has_shared_qnn_model) { LOGS(logger, VERBOSE) << "Graph: " << graph_name << " from EpContext node not found from shared EP contexts."; @@ -617,7 +621,7 @@ static bool EpSharedContextsHasAllGraphs(const std::vector>& result, - const utils::GenerateMetadefNameFn& gen_metadef_name, + const std::function& gen_metadef_name, const logging::Logger& logger) { std::unordered_set supported_nodes{}; std::vector> supported_groups{}; @@ -722,7 +726,7 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer std::vector> node_unit_holder; std::unordered_map 
node_unit_map; - std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(graph_viewer, logger); + std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(&graph_viewer, logger); // remove is_qnn_ctx_model related code const auto supported_nodes = GetSupportedNodes(graph_viewer, node_unit_map, @@ -765,11 +769,11 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer bool is_valid_partition = true; size_t nodes_in_partition = 0; - if (partition && partition->sub_graph) { - nodes_in_partition = partition->sub_graph->nodes.size(); + if (partition && partition->SubGraph()) { + nodes_in_partition = partition->SubGraph()->Nodes().size(); if (nodes_in_partition == 1 && !is_qnn_ctx_model) { - const Node* node = graph_viewer.GetNode(partition->sub_graph->nodes[0]); + const Node* node = graph_viewer.GetNode(partition->SubGraph()->Nodes()[0]); if (!node) { LOGS(logger, ERROR) << "QNN EP: Invalid node in partition of one node."; @@ -922,10 +926,10 @@ Status QNNExecutionProvider::Compile(const std::vector& fused if (EpSharedContextsHasAllGraphs(fused_nodes_and_graphs, logger)) { for (auto fused_node_and_graph : fused_nodes_and_graphs) { const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph); - const auto& ep_context_node = graph_viewer.Nodes().begin(); + const Node& ep_context_node = *graph_viewer.Nodes().begin(); const Node& fused_node = fused_node_and_graph.fused_node; const std::string& graph_meta_id = fused_node.Name(); - std::string key = ep_context_node->Name(); + std::string key = ep_context_node.Name(); auto qnn_model_shared = SharedContext::GetInstance().GetSharedQnnModel(key); ORT_RETURN_IF(nullptr == qnn_model_shared, "Graph: " + key + " not found from shared EP contexts."); ORT_RETURN_IF_ERROR(qnn_model_shared->SetGraphInputOutputInfo(graph_viewer, fused_node, logger)); @@ -967,10 +971,10 @@ Status QNNExecutionProvider::Compile(const std::vector& fused for (auto fused_node_and_graph : 
fused_nodes_and_graphs) { const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph); - const auto& ep_context_node = graph_viewer.Nodes().begin(); + const Node& ep_context_node = *graph_viewer.Nodes().begin(); const Node& fused_node = fused_node_and_graph.fused_node; const std::string& graph_meta_id = fused_node.Name(); - std::string key = ep_context_node->Name(); + std::string key = ep_context_node.Name(); ORT_RETURN_IF(qnn_models.find(key) == qnn_models.end(), key + " key name not exist in table qnn_models."); auto qnn_model = std::move(qnn_models[key]); ORT_RETURN_IF_ERROR(qnn_model->SetGraphInputOutputInfo(graph_viewer, fused_node, logger)); @@ -1030,8 +1034,8 @@ const InlinedVector QNNExecutionProvider::GetEpContextNodes() const InlinedVector ep_context_nodes; if (qnn_ep_context_model_) { const auto& graph = qnn_ep_context_model_->MainGraph(); - for (const auto& node : graph.Nodes()) { - ep_context_nodes.push_back(graph.GetNode(node.Index())); + for (const Node* node : graph.Nodes()) { + ep_context_nodes.push_back(graph.GetNode(node->Index())); } } @@ -1122,22 +1126,34 @@ void QNNExecutionProvider::ReleasePerThreadContext() const { per_thread_context_cache->erase(cached_context_it); } +static bool TryGetConfigEntry(const ConfigOptions& config_options, const std::string& key, std::string& value) { + std::optional new_value = config_options.GetConfigEntry(key); + if (!new_value.has_value()) { + return false; + } + + value = *new_value; + return true; +} + Status QNNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) { auto backend_type = qnn_backend_manager_->GetQnnBackendType(); if (qnn::QnnBackendType::HTP != backend_type && qnn::QnnBackendType::DSP != backend_type) { return Status::OK(); } + const ConfigOptions& config_options = run_options.GetConfigOptions(); + std::string htp_perf_mode = ""; qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault; - if 
(run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnPerfMode, htp_perf_mode)) { + if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnPerfMode, htp_perf_mode)) { // set power mode ParseHtpPerformanceMode(htp_perf_mode, htp_performance_mode); } std::string rpc_latency = ""; uint32_t rpc_control_latency = 0; - if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnRpcControlLatency, rpc_latency)) { + if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnRpcControlLatency, rpc_latency)) { rpc_control_latency = static_cast(std::stoul(rpc_latency)); LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << rpc_control_latency; } @@ -1163,9 +1179,11 @@ Status QNNExecutionProvider::OnRunEnd(bool /*sync_stream*/, const onnxruntime::R return Status::OK(); } + const ConfigOptions& config_options = run_options.GetConfigOptions(); + std::string htp_perf_mode = ""; qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault; - if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnPerfModePostRun, htp_perf_mode)) { + if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnPerfModePostRun, htp_perf_mode)) { // set power mode ParseHtpPerformanceMode(htp_perf_mode, htp_performance_mode); } diff --git a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc index fdeb9dc106386..2407a7c83bfeb 100644 --- a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc +++ b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc @@ -22,19 +22,8 @@ struct QNNProviderFactory : IExecutionProviderFactory { const ConfigOptions* config_options_; }; -// TODO: Move to core/session/provider_bridge_ort.cc -std::shared_ptr QNNProviderFactoryCreator::Create(const ProviderOptions& provider_options_map, - const SessionOptions* session_options) { - const ConfigOptions* config_options = nullptr; - if (session_options != nullptr) { - config_options = 
&session_options->config_options; - } - - return std::make_shared(provider_options_map, config_options); -} - -struct QNN_Provider /*: Provider*/ { - std::shared_ptr CreateExecutionProviderFactory(const void* param) /*override*/ { +struct QNN_Provider : Provider { + std::shared_ptr CreateExecutionProviderFactory(const void* param) override { if (param == nullptr) { LOGS_DEFAULT(ERROR) << "[QNN EP] Passed NULL options to CreateExecutionProviderFactory()"; return nullptr; @@ -52,18 +41,15 @@ struct QNN_Provider /*: Provider*/ { return std::make_shared(*provider_options, config_options); } - void Initialize() /*override*/ {} - void Shutdown() /*override*/ {} + void Initialize() override {} + void Shutdown() override {} } g_provider; } // namespace onnxruntime -// TODO: Uncomment when it is an EP dll -#if 0 extern "C" { ORT_API(onnxruntime::Provider*, GetProvider) { return &onnxruntime::g_provider; } } -#endif diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index ceb654931ae61..dc28848b2bab0 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -385,6 +385,14 @@ CreateSupportedPartitions(const GraphViewer& graph_viewer, execution_provider_name, execution_provider_type, node_unit_map, drop_constant_initializers); } +inline std::unique_ptr MakeComputeCapability(const GraphViewer& graph_viewer, + const std::vector& group, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + bool drop_constant_initializers) { + return g_host->Utils__MakeComputeCapability(graph_viewer, group, generate_metadef_name, + execution_provider_name, drop_constant_initializers); +} } // namespace utils namespace QDQ { diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index 
aa8c367d25d51..456e164917587 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -505,6 +505,9 @@ Status UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& tensor, const st /*out*/ std::vector& unpacked_tensor) { return g_host->UnpackInitializerData(tensor, model_path, unpacked_tensor); } +Status UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& tensor, /*out*/ std::vector& unpacked_tensor) { + return g_host->UnpackInitializerData(tensor, std::filesystem::path(), unpacked_tensor); +} } // namespace utils diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 96cd72c91b0e6..03dc05a539d61 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -352,6 +352,7 @@ struct ProviderHost { // TypeProto virtual std::unique_ptr TypeProto__construct() = 0; virtual void TypeProto__CopyFrom(ONNX_NAMESPACE::TypeProto* p, const ONNX_NAMESPACE::TypeProto* other) = 0; + virtual bool TypeProto__has_tensor_type(const ONNX_NAMESPACE::TypeProto* p) = 0; virtual const ONNX_NAMESPACE::TypeProto_Tensor& TypeProto__tensor_type(const ONNX_NAMESPACE::TypeProto* p) = 0; virtual ONNX_NAMESPACE::TypeProto_Tensor* TypeProto__mutable_tensor_type(ONNX_NAMESPACE::TypeProto* p) = 0; @@ -933,6 +934,12 @@ struct ProviderHost { const std::unordered_map* node_unit_map, bool drop_constant_initializers) = 0; + virtual std::unique_ptr + Utils__MakeComputeCapability(const GraphViewer& graph_viewer, + const std::vector& group, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + bool drop_constant_initializers) = 0; // Model virtual std::unique_ptr Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const 
IOnnxRuntimeOpSchemaRegistryList* local_registries, diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 03a0b4ea99524..c92bae856b514 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -345,6 +345,7 @@ struct TypeProto_Sequence final { struct TypeProto final { static std::unique_ptr Create() { return g_host->TypeProto__construct(); } + bool has_tensor_type() const { return g_host->TypeProto__has_tensor_type(this); } const TypeProto_Tensor& tensor_type() const { return g_host->TypeProto__tensor_type(this); } TypeProto_Tensor* mutable_tensor_type() { return g_host->TypeProto__mutable_tensor_type(this); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index b18d4c6ed23ef..c63a06fb34f9d 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -92,6 +92,7 @@ using Node_EdgeEnd = Node::EdgeEnd; #include "core/providers/openvino/openvino_provider_factory_creator.h" #include "core/providers/tensorrt/tensorrt_provider_factory_creator.h" #include "core/providers/vitisai/vitisai_provider_factory_creator.h" +#include "core/providers/qnn/qnn_provider_factory_creator.h" #include "core/providers/cuda/cuda_provider_factory.h" #include "core/providers/cann/cann_provider_factory.h" @@ -466,6 +467,7 @@ struct ProviderHostImpl : ProviderHost { // TypeProto (wrapped) std::unique_ptr TypeProto__construct() override { return std::make_unique(); } void TypeProto__CopyFrom(ONNX_NAMESPACE::TypeProto* p, const ONNX_NAMESPACE::TypeProto* other) override { p->CopyFrom(*other); } + bool TypeProto__has_tensor_type(const ONNX_NAMESPACE::TypeProto* p) override { return p->has_tensor_type(); } const ONNX_NAMESPACE::TypeProto_Tensor& TypeProto__tensor_type(const 
ONNX_NAMESPACE::TypeProto* p) override { return p->tensor_type(); } ONNX_NAMESPACE::TypeProto_Tensor* TypeProto__mutable_tensor_type(ONNX_NAMESPACE::TypeProto* p) override { return p->mutable_tensor_type(); } int TypeProto__value_case(const ONNX_NAMESPACE::TypeProto* p) override { return p->value_case(); } @@ -1138,6 +1140,16 @@ struct ProviderHostImpl : ProviderHost { drop_constant_initializers); } + std::unique_ptr + Utils__MakeComputeCapability(const GraphViewer& graph_viewer, + const std::vector& group, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + bool drop_constant_initializers) override { + return onnxruntime::utils::MakeComputeCapability(graph_viewer, group, generate_metadef_name, + execution_provider_name, drop_constant_initializers); + } + // Model (wrapped) std::unique_ptr Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const IOnnxRuntimeOpSchemaRegistryList* local_registries, @@ -1925,6 +1937,18 @@ ProviderOptions OrtOpenVINOProviderOptionsToOrtOpenVINOProviderOptionsV2(const O return ov_options_converted_map; } +std::shared_ptr QNNProviderFactoryCreator::Create(const ProviderOptions& provider_options_map, + const SessionOptions* session_options) { + const ConfigOptions* config_options = nullptr; + if (session_options != nullptr) { + config_options = &session_options->config_options; + } + + std::array configs_array = {&provider_options_map, config_options}; + const void* arg = reinterpret_cast(&configs_array); + return s_library_qnn.Get().CreateExecutionProviderFactory(arg); +} + std::shared_ptr OpenVINOProviderFactoryCreator::Create( const ProviderOptions* provider_options_map, const SessionOptions* session_options) { // Append session options applicable for EP to EP Provider options. 
From 693dd335f363e15f4605b7a527436f9e36e3e72e Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sun, 15 Dec 2024 22:12:03 -0800 Subject: [PATCH 30/64] Compiles but does not link (until update cmake to build as shared lib) --- .../qnn/builder/onnx_ctx_model_helper.cc | 18 +++++------ .../builder/opbuilder/reduce_op_builder.cc | 6 ++-- .../qnn/builder/qnn_backend_manager.cc | 16 +++++++++- .../qnn_node_group/conv_activation_fusion.cc | 24 ++++++++------ .../providers/qnn/qnn_execution_provider.cc | 31 ++----------------- .../providers/qnn/qnn_execution_provider.h | 3 -- .../shared_library/provider_interfaces.h | 5 +++ .../shared_library/provider_wrappedtypes.h | 10 ++++++ .../core/session/provider_bridge_ort.cc | 13 ++++++++ 9 files changed, 71 insertions(+), 55 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc index 9fe1ff3da381a..36bccfe7effce 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc @@ -53,9 +53,9 @@ Status GetMainContextNode(const std::vectorOpType(), "Should only filter in the EPContext node."); - utils::NodeAttrHelper node_helper(*ep_context_node); + const Node& ep_context_node = *graph_viewer.Nodes().begin(); + ORT_RETURN_IF_NOT(EPCONTEXT_OP == ep_context_node.OpType(), "Should only filter in the EPContext node."); + utils::NodeAttrHelper node_helper(ep_context_node); int64_t is_main_context = node_helper.Get(MAIN_CONTEXT, static_cast(0)); if (1 == is_main_context) { main_context_pos.push_back(static_cast(i)); @@ -75,12 +75,12 @@ Status CreateNodeArgs(const std::vector& names, std::string name = names[i]; ORT_RETURN_IF(tensor_info_table.find(name) == tensor_info_table.end(), "Tensor name: ", name, " not found in tensor_info_table"); const OnnxTensorInfo& tensor_info = tensor_info_table.at(name); - TypeProto tensor_type; - 
tensor_type.mutable_tensor_type()->set_elem_type(tensor_info.data_type_); + std::unique_ptr tensor_type = TypeProto::Create(); + tensor_type->mutable_tensor_type()->set_elem_type(tensor_info.data_type_); for (size_t j = 0; j < tensor_info.shape_.size(); ++j) { - tensor_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(tensor_info.shape_[j]); + tensor_type->mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(tensor_info.shape_[j]); } - auto& input_arg = graph.GetOrCreateNodeArg(name, &tensor_type); + auto& input_arg = graph.GetOrCreateNodeArg(name, tensor_type.get()); node_args.push_back(&input_arg); } return Status::OK(); @@ -163,8 +163,8 @@ Status TryGetMaxSpillFillSize(const std::vector(0)); if (max_size > max_spill_fill_size) { max_spill_fill_size = max_size; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc index a74fcfd843551..2ad6d3741d0ba 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc @@ -67,7 +67,7 @@ class ReduceOpBuilder : public BaseOpBuilder { using AxesQnnIntType = uint32_t; Status GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, - InlinedHashSet& axes_set) const; + std::set& axes_set) const; // Maps an operator type to the opset in which "axes" became an input instead of an attribute. 
static const std::array opset_with_axes_as_input; @@ -83,7 +83,7 @@ const std::array ReduceOpBuilder::opset_with_axes_as_ }; Status ReduceOpBuilder::GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, - InlinedHashSet& axes_set) const { + std::set& axes_set) const { ReduceOpType reduce_op_type = GetReduceOpType(node_unit.OpType()); if (reduce_op_type == ReduceOpType::REDUCE_OP_TYPE_UNKNOWN) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unknown reduce operator ", node_unit.OpType()); @@ -211,7 +211,7 @@ Status ReduceOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w // // Handle axes param. // - InlinedHashSet axes_set; + std::set axes_set; ORT_RETURN_IF_ERROR(GetAxesSet(qnn_model_wrapper, node_unit, axes_set)); const size_t num_axes = axes_set.size(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index 5a9abe43fec72..dde5738731986 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -251,7 +251,7 @@ void QnnLogging(const char* format, ::onnxruntime::logging::Category::onnxruntime, data_type, ORT_WHERE); - log_capture.ProcessPrintf(format, argument_parameter); + log_capture->ProcessPrintf(format, argument_parameter); } } @@ -1098,6 +1098,8 @@ Status QnnBackendManager::ExtractBackendProfilingInfo() { } bool tracelogging_provider_ep_enabled = false; + // TODO: Re-enable when QNN EP is a dll +#if 0 const Env& env = Env::Default(); // const Env& env = GetDefaultEnv(); auto& provider = env.GetTelemetryProvider(); @@ -1108,6 +1110,7 @@ Status QnnBackendManager::ExtractBackendProfilingInfo() { tracelogging_provider_ep_enabled = true; } } +#endif // ETW disabled previously, but enabled now if (ProfilingLevel::INVALID == profiling_level_etw_ && tracelogging_provider_ep_enabled) { @@ -1325,6 +1328,8 @@ void 
QnnBackendManager::LogQnnProfileEventAsTraceLogging( const std::string& timingSource, const std::string& eventLevel, const char* eventIdentifier) { + // TODO: Re-enable when QNN EP is a dll +#if 0 TraceLoggingWrite( telemetry_provider_handle, "QNNProfilingEvent", @@ -1337,6 +1342,15 @@ void QnnBackendManager::LogQnnProfileEventAsTraceLogging( TraceLoggingString(timingSource.c_str(), "Timing Source"), TraceLoggingString(eventLevel.c_str(), "Event Level"), TraceLoggingString(eventIdentifier, "Event Identifier")); +#else + ORT_UNUSED_PARAMETER(timestamp); + ORT_UNUSED_PARAMETER(message); + ORT_UNUSED_PARAMETER(qnnScalarValue); + ORT_UNUSED_PARAMETER(unit); + ORT_UNUSED_PARAMETER(timingSource); + ORT_UNUSED_PARAMETER(eventLevel); + ORT_UNUSED_PARAMETER(eventIdentifier); +#endif } #endif diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc index 789811a423884..567dd5c1d6567 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc @@ -173,7 +173,7 @@ static bool CanActivationBeRemoved(const QnnModelWrapper& qnn_model_wrapper, static std::vector FindParentDQNodes(const GraphViewer& graph_viewer, const Node& node) { // Get all parent DQ nodes sorted by destination argument index. 
std::vector parents(node.InputDefs().size(), nullptr); - for (auto it = node.InputEdgesBegin(); it != node.InputEdgesEnd(); it++) { + for (auto it = node.InputEdgesBegin(); it != node.InputEdgesEnd(); ++it) { if (it->GetNode().OpType().compare(DEQUANTIZE_LINEAR) == 0) { parents[it->GetDstArgIndex()] = &(it->GetNode()); } @@ -317,7 +317,7 @@ static Status CreateOrValidateOnQnn(QnnModelWrapper& qnn_model_wrapper, std::optional axis; if (auto entry = dq_attrs.find("axis"); entry != dq_attrs.end()) { - axis = entry->second.i(); + axis = entry->second().i(); } // quantization scale and zp are always the input[1, 2] @@ -327,7 +327,8 @@ static Status CreateOrValidateOnQnn(QnnModelWrapper& qnn_model_wrapper, // Populate NodeUnit outputs and output edges std::vector outputs; - Node::EdgeSet output_edges; + std::vector> output_edges_holder; + std::vector output_edges; for (const Node* q_node : q_nodes) { const auto q_inputs = q_node->InputDefs(); const auto& q_attrs = q_node->GetAttributes(); @@ -335,7 +336,7 @@ static Status CreateOrValidateOnQnn(QnnModelWrapper& qnn_model_wrapper, std::optional axis; if (auto entry = q_attrs.find("axis"); entry != q_attrs.end()) { - axis = entry->second.i(); + axis = entry->second().i(); } // quantization scale and zp are always the input[1, 2] @@ -346,22 +347,25 @@ static Status CreateOrValidateOnQnn(QnnModelWrapper& qnn_model_wrapper, auto q_cur_edge = q_node->OutputEdgesBegin(); auto q_end_edge = q_node->OutputEdgesEnd(); for (; q_cur_edge != q_end_edge; ++q_cur_edge) { - output_edges.insert(Node::EdgeEnd{q_cur_edge->GetNode(), 0, q_cur_edge->GetDstArgIndex()}); + auto output_edge = Node_EdgeEnd::Create(q_cur_edge->GetNode(), 0, q_cur_edge->GetDstArgIndex()); + output_edges.push_back(output_edge.get()); + output_edges_holder.push_back(std::move(output_edge)); } } - NodeUnit custom_node_unit(dq_nodes, target_node, q_nodes, NodeUnit::Type::QDQGroup, - inputs, outputs, num_dqs, output_edges); - const auto* conv_op_builder = 
qnn::GetOpBuilder(custom_node_unit.OpType()); + std::unique_ptr custom_node_unit = NodeUnit::Create(dq_nodes, target_node, + q_nodes, NodeUnit::Type::QDQGroup, + inputs, outputs, num_dqs, output_edges); + const auto* conv_op_builder = qnn::GetOpBuilder(custom_node_unit->OpType()); if (conv_op_builder == nullptr) { return Status::OK(); } if (validate) { - return conv_op_builder->IsOpSupported(qnn_model_wrapper, custom_node_unit, logger); + return conv_op_builder->IsOpSupported(qnn_model_wrapper, *custom_node_unit, logger); } - return conv_op_builder->AddToModelBuilder(qnn_model_wrapper, custom_node_unit, logger, validate); + return conv_op_builder->AddToModelBuilder(qnn_model_wrapper, *custom_node_unit, logger, validate); } // Traverses graph to check if the given NodeUnit is part of a valid DQ* -> Conv -> Relu/Clip -> Q sequence. diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index c8ec3098b7ce0..cfafc468e8be6 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -25,32 +25,6 @@ namespace onnxruntime { constexpr const char* QNN = "QNN"; -static std::unique_ptr>> s_run_on_unload_; - -// TODO: Remove and use versions in EP provider bridge. -void RunOnUnload(std::function function) { - static std::mutex mutex; - std::lock_guard guard(mutex); - if (!s_run_on_unload_) { - s_run_on_unload_ = std::make_unique>>(); - } - s_run_on_unload_->push_back(std::move(function)); -} - -// TODO: Remove and use versions in EP provider bridge. 
-struct OnUnload { - ~OnUnload() { - if (!s_run_on_unload_) - return; - - for (auto& function : *s_run_on_unload_) - function(); - - s_run_on_unload_.reset(); - } - -} g_on_unload; - static void ParseProfilingLevel(std::string profiling_level_string, qnn::ProfilingLevel& profiling_level) { std::transform(profiling_level_string.begin(), @@ -189,8 +163,7 @@ qnn::ProfilingLevel QNNExecutionProvider::GetProfilingLevelFromETWLevel(unsigned QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_options_map, const ConfigOptions* config_options) : IExecutionProvider{onnxruntime::kQnnExecutionProvider} { - // TODO: Uncomment when QNN EP is built as a DLL - // InitProviderOrtApi(); + InitProviderOrtApi(); if (config_options) { disable_cpu_ep_fallback_ = config_options->GetConfigOrDefault( @@ -1015,7 +988,7 @@ Status QNNExecutionProvider::Compile(const std::vector& fused buffer_size, max_spill_fill_buffer_size)); } - qnn_ep_context_model_ = std::make_unique("qnn_ep_context_model", false, logger); + qnn_ep_context_model_ = Model::Create("qnn_ep_context_model", false, logger); ORT_RETURN_IF_ERROR(qnn::CreateEPContextNodes(qnn_ep_context_model_.get(), context_buffer.get(), buffer_size, diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index b390988f39da4..4324b3ddfef78 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -22,9 +22,6 @@ namespace onnxruntime { -// TODO: Remove. It's in provider bridge. 
-void RunOnUnload(std::function function); - class SharedContext { public: static SharedContext& GetInstance() { diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 03dc05a539d61..41ad441db616e 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -289,6 +289,7 @@ struct ProviderHost { virtual std::unique_ptr logging__Capture__construct(const logging::Logger& logger, logging::Severity severity, const char* category, logging::DataType dataType, const CodeLocation& location) = 0; virtual void logging__Capture__operator_delete(logging::Capture* p) noexcept = 0; virtual std::ostream& logging__Capture__Stream(logging::Capture* p) noexcept = 0; + virtual void logging__Capture__ProcessPrintf(logging::Capture* p, const char* format, va_list args) = 0; // Env virtual Env& Env__Default() = 0; @@ -835,6 +836,8 @@ struct ProviderHost { virtual const NodeAttributes& Node__GetAttributes(const Node* p) noexcept = 0; virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) = 0; + virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, const std::string& value) = 0; + virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, int64_t value) = 0; virtual size_t Node__GetInputEdgesCount(const Node* p) noexcept = 0; virtual size_t Node__GetOutputEdgesCount(const Node* p) noexcept = 0; @@ -944,6 +947,8 @@ struct ProviderHost { virtual std::unique_ptr Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) = 0; + virtual std::unique_ptr Model__construct(const std::string& graph_name, bool is_onnx_domain_only, + const logging::Logger& logger) = 0; virtual void Model__operator_delete(Model* p) 
= 0; virtual Graph& Model__MainGraph(Model* p) = 0; virtual std::unique_ptr Model__ToProto(Model* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index c92bae856b514..3b5e5039796f7 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -42,6 +42,7 @@ struct Capture final { static void operator delete(void* p) { g_host->logging__Capture__operator_delete(reinterpret_cast(p)); } std::ostream& Stream() noexcept { return g_host->logging__Capture__Stream(this); } + void ProcessPrintf(const char* format, va_list args) { g_host->logging__Capture__ProcessPrintf(this, format, args); } Capture() = delete; Capture(const Capture&) = delete; @@ -824,6 +825,12 @@ struct Node final { void AddAttribute(const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) { g_host->Node__AddAttribute(this, attr_name, value); } + void AddAttribute(const std::string& attr_name, const std::string& value) { + g_host->Node__AddAttribute(this, attr_name, value); + } + void AddAttribute(const std::string& attr_name, int64_t value) { + g_host->Node__AddAttribute(this, attr_name, value); + } size_t GetInputEdgesCount() const noexcept { return g_host->Node__GetInputEdgesCount(this); } size_t GetOutputEdgesCount() const noexcept { return g_host->Node__GetOutputEdgesCount(this); } @@ -976,6 +983,9 @@ struct Model final { const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) { return g_host->Model__construct(std::move(model_proto), model_path, local_registries, logger); } + static std::unique_ptr Create(const std::string& graph_name, bool is_onnx_domain_only, const logging::Logger& logger) { + return g_host->Model__construct(graph_name, is_onnx_domain_only, logger); + } static void operator delete(void* p) { 
g_host->Model__operator_delete(reinterpret_cast(p)); } static Status Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) { return g_host->Model__Load(file_path, model_proto); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index c63a06fb34f9d..2ad85552aa813 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -396,6 +396,9 @@ struct ProviderHostImpl : ProviderHost { } void logging__Capture__operator_delete(logging::Capture* p) noexcept override { delete p; } std::ostream& logging__Capture__Stream(logging::Capture* p) noexcept override { return p->Stream(); } + void logging__Capture__ProcessPrintf(logging::Capture* p, const char* format, va_list args) override { + p->ProcessPrintf(format, args); + } // Env Env& Env__Default() override { return Env::Default(); } @@ -988,6 +991,12 @@ struct ProviderHostImpl : ProviderHost { void Node__AddAttribute(Node* p, const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) override { p->AddAttribute(attr_name, value); } + void Node__AddAttribute(Node* p, const ::std::string& attr_name, const std::string& value) override { + p->AddAttribute(attr_name, value); + } + void Node__AddAttribute(Node* p, const ::std::string& attr_name, int64_t value) override { + p->AddAttribute(attr_name, value); + } size_t Node__GetInputEdgesCount(const Node* p) noexcept override { return p->GetInputEdgesCount(); } size_t Node__GetOutputEdgesCount(const Node* p) noexcept override { return p->GetOutputEdgesCount(); } @@ -1156,6 +1165,10 @@ struct ProviderHostImpl : ProviderHost { const logging::Logger& logger) override { return std::make_unique(model_proto, model_path, local_registries, logger); } + std::unique_ptr Model__construct(const std::string& graph_name, bool is_onnx_domain_only, + const logging::Logger& logger) override { + return std::make_unique(graph_name, 
is_onnx_domain_only, logger); + } void Model__operator_delete(Model* p) override { delete p; } Graph& Model__MainGraph(Model* p) override { return p->MainGraph(); } std::unique_ptr Model__ToProto(Model* p) override { return std::make_unique(p->ToProto()); } From ae2dbd28dc46d30741ab00e5533e7c4bc0f46737 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sun, 15 Dec 2024 22:17:30 -0800 Subject: [PATCH 31/64] Use provider bridge function to get default Env --- .../core/providers/qnn/builder/qnn_backend_manager.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index dde5738731986..c816858018411 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -1100,8 +1100,7 @@ Status QnnBackendManager::ExtractBackendProfilingInfo() { bool tracelogging_provider_ep_enabled = false; // TODO: Re-enable when QNN EP is a dll #if 0 - const Env& env = Env::Default(); - // const Env& env = GetDefaultEnv(); + const Env& env = GetDefaultEnv(); auto& provider = env.GetTelemetryProvider(); auto level = provider.Level(); if (provider.IsEnabled()) { @@ -1507,8 +1506,7 @@ void* QnnBackendManager::LoadLib(const char* file_name, int flags, std::string& auto file_path = std::filesystem::path(file_name); if (!file_path.is_absolute()) { // construct an absolute path from ORT runtime path + file_name and check whether it exists. 
- const Env& env = Env::Default(); - // const Env& env = GetDefaultEnv(); + const Env& env = GetDefaultEnv(); auto pathstring = env.GetRuntimePath() + ToPathString(file_name); auto absolute_path = pathstring.c_str(); if (std::filesystem::exists(std::filesystem::path(absolute_path))) { From db5f0ec1f664c9494d0fce8391484f8d54bec0e8 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Mon, 16 Dec 2024 00:45:12 -0800 Subject: [PATCH 32/64] It works! QNN EP is a shared library and all QNN unit tests pass on Windows ARM64. [disabled ETW code for now] --- cmake/onnxruntime.cmake | 1 - cmake/onnxruntime_providers.cmake | 3 - cmake/onnxruntime_providers_qnn.cmake | 59 ++++++++++++------- cmake/onnxruntime_python.cmake | 8 ++- cmake/onnxruntime_unittests.cmake | 6 +- .../core/providers/qnn/builder/qnn_utils.cc | 10 ++-- onnxruntime/core/providers/qnn/ort_api.h | 6 +- .../providers/qnn/qnn_execution_provider.cc | 3 +- .../providers/qnn/qnn_execution_provider.h | 2 +- 9 files changed, 62 insertions(+), 36 deletions(-) diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 732c0511d400f..3b76aff829be2 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -208,7 +208,6 @@ set(onnxruntime_INTERNAL_LIBRARIES ${PROVIDERS_COREML} ${PROVIDERS_DML} ${PROVIDERS_NNAPI} - ${PROVIDERS_QNN} ${PROVIDERS_SNPE} ${PROVIDERS_RKNPU} ${PROVIDERS_VSINPU} diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index 582491de9503d..67fa48b28278d 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -74,9 +74,6 @@ endif() if(onnxruntime_USE_JSEP) set(PROVIDERS_JS onnxruntime_providers_js) endif() -if(onnxruntime_USE_QNN) - set(PROVIDERS_QNN onnxruntime_providers_qnn) -endif() if(onnxruntime_USE_RKNPU) set(PROVIDERS_RKNPU onnxruntime_providers_rknpu) endif() diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index 52ccdbf7c9ecc..2a6c63ee01149 100644 --- 
a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -4,32 +4,51 @@ add_compile_definitions(USE_QNN=1) file(GLOB_RECURSE - onnxruntime_providers_qnn_ep_cc_srcs CONFIGURE_DEPENDS + onnxruntime_providers_qnn_cc_srcs CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.h" "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.cc" - ) - - file(GLOB_RECURSE - onnxruntime_providers_qnn_builder_cc_srcs CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/*.h" "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/*.cc" - ) - - set(onnxruntime_providers_qnn_cc_srcs - ${onnxruntime_providers_qnn_ep_cc_srcs} - ${onnxruntime_providers_qnn_builder_cc_srcs} + "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/qnn_node_group/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/qnn_node_group/*.cc" + "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/opbuilder/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/opbuilder/*.cc" + "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" ) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_qnn_cc_srcs}) - onnxruntime_add_static_library(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_cc_srcs}) - onnxruntime_add_include_to_target(onnxruntime_providers_qnn onnxruntime_common onnxruntime_framework onnx onnx_proto protobuf::libprotobuf-lite flatbuffers::flatbuffers Boost::mp11) - target_link_libraries(onnxruntime_providers_qnn) - add_dependencies(onnxruntime_providers_qnn onnx ${onnxruntime_EXTERNAL_DEPENDENCIES}) - set_target_properties(onnxruntime_providers_qnn PROPERTIES CXX_STANDARD_REQUIRED ON) - set_target_properties(onnxruntime_providers_qnn PROPERTIES FOLDER "ONNXRuntime") - target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_QNN_HOME}/include/QNN ${onnxruntime_QNN_HOME}/include) - set_target_properties(onnxruntime_providers_qnn PROPERTIES LINKER_LANGUAGE CXX) - # ignore 
the warning unknown-pragmas on "pragma region" - if(NOT MSVC) + onnxruntime_add_shared_library_module(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_cc_srcs}) + onnxruntime_add_include_to_target(onnxruntime_providers_qnn ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} onnx onnxruntime_common safeint_interface) + target_link_libraries(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED} ${ABSEIL_LIBS}) + add_dependencies(onnxruntime_providers_qnn ${onnxruntime_EXTERNAL_DEPENDENCIES}) + target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT} + ${CMAKE_CURRENT_BINARY_DIR} + ${onnxruntime_QNN_HOME}/include/QNN + ${onnxruntime_QNN_HOME}/include) + + # Set linker flags for function(s) exported by EP DLL + if(UNIX) + set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds -Xlinker --gc-sections") + elseif(WIN32) + set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/qnn/symbols.def") + else() + message(FATAL_ERROR "onnxruntime_providers_qnn unknown platform, need to specify shared library exports for it") + endif() + + # Set compile options + if(MSVC) + target_compile_options(onnxruntime_providers_qnn PUBLIC /wd4099 /wd4005) + else() + # ignore the warning unknown-pragmas on "pragma region" target_compile_options(onnxruntime_providers_qnn PRIVATE "-Wno-unknown-pragmas") endif() + + set_target_properties(onnxruntime_providers_qnn PROPERTIES LINKER_LANGUAGE CXX) + set_target_properties(onnxruntime_providers_qnn PROPERTIES CXX_STANDARD_REQUIRED ON) + set_target_properties(onnxruntime_providers_qnn PROPERTIES FOLDER "ONNXRuntime") + + install(TARGETS onnxruntime_providers_qnn + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/cmake/onnxruntime_python.cmake 
b/cmake/onnxruntime_python.cmake index 5a87252b08573..776c866efbb12 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -180,7 +180,6 @@ target_link_libraries(onnxruntime_pybind11_state PRIVATE ${PROVIDERS_XNNPACK} ${PROVIDERS_WEBGPU} ${PROVIDERS_AZURE} - ${PROVIDERS_QNN} onnxruntime_optimizer onnxruntime_providers onnxruntime_util @@ -997,6 +996,13 @@ if (onnxruntime_USE_COREML) endif() if (onnxruntime_USE_QNN) + add_custom_command( + TARGET onnxruntime_pybind11_state POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $ + $ + $/onnxruntime/capi/ + ) add_custom_command( TARGET onnxruntime_pybind11_state POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 306096db128a7..c19a18ef15089 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -628,12 +628,11 @@ set(ONNXRUNTIME_TEST_LIBS onnxruntime_session ${ONNXRUNTIME_INTEROP_TEST_LIBS} ${onnxruntime_libs} - # CUDA, ROCM, TENSORRT, MIGRAPHX, DNNL, and OpenVINO are dynamically loaded at runtime + # CUDA, ROCM, TENSORRT, MIGRAPHX, DNNL, OpenVINO, and QNN are dynamically loaded at runtime ${PROVIDERS_NNAPI} ${PROVIDERS_VSINPU} ${PROVIDERS_JS} ${PROVIDERS_WEBGPU} - ${PROVIDERS_QNN} ${PROVIDERS_SNPE} ${PROVIDERS_RKNPU} ${PROVIDERS_DML} @@ -704,8 +703,7 @@ endif() if(onnxruntime_USE_QNN AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_REDUCED_OPS_BUILD) list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/qnn/*) list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_qnn) - list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_qnn) - list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_qnn) + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_qnn onnxruntime_providers_shared) endif() if(onnxruntime_USE_SNPE) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc 
b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index 50a151292d9c7..74cdf2ad17f42 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -902,21 +902,23 @@ static bool GetClipMinMaxImpl(const GraphViewer& graph_viewer, const Node& node, switch (input_type) { case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: { - auto status = onnxruntime::utils::UnpackTensor(*initializer, graph_viewer.ModelPath(), &value, 1); + std::vector bytes(sizeof(float)); + auto status = onnxruntime::utils::UnpackInitializerData(*initializer, graph_viewer.ModelPath(), bytes); if (!status.IsOK()) { LOGS(logger, ERROR) << "GetClipMinMax() failed to unpack float initializer: " << status.ErrorMessage(); return false; } + value = *reinterpret_cast(bytes.data()); break; } case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: { - MLFloat16 f16_val{}; - auto status = onnxruntime::utils::UnpackTensor(*initializer, graph_viewer.ModelPath(), &f16_val, 1); + std::vector bytes(sizeof(MLFloat16)); + auto status = onnxruntime::utils::UnpackInitializerData(*initializer, graph_viewer.ModelPath(), bytes); if (!status.IsOK()) { LOGS(logger, ERROR) << "GetClipMinMax() failed to unpack float16 initializer: " << status.ErrorMessage(); return false; } - value = f16_val.ToFloat(); + value = reinterpret_cast(bytes.data())->ToFloat(); break; } default: diff --git a/onnxruntime/core/providers/qnn/ort_api.h b/onnxruntime/core/providers/qnn/ort_api.h index 6fb346d2349a0..44face145fb04 100644 --- a/onnxruntime/core/providers/qnn/ort_api.h +++ b/onnxruntime/core/providers/qnn/ort_api.h @@ -6,6 +6,7 @@ #define BUILD_QNN_EP_STATIC 0 #if BUILD_QNN_EP_STATIC +// Includes when building QNN EP statically #include "onnx/defs/data_type_utils.h" #include "core/common/common.h" #include "core/common/status.h" @@ -32,10 +33,13 @@ #include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" #include "core/providers/common.h" #include 
"core/providers/partitioning_utils.h" +#include "core/session/onnxruntime_cxx_api.h" #else +// Includes when building QNN EP as a shared library #include "core/providers/shared_library/provider_api.h" +#define ORT_API_MANUAL_INIT +#include "core/session/onnxruntime_cxx_api.h" #endif #include "core/session/onnxruntime_session_options_config_keys.h" #include "core/session/onnxruntime_run_options_config_keys.h" -#include "core/session/onnxruntime_cxx_api.h" diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index cfafc468e8be6..99de2089cea13 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -164,6 +164,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio const ConfigOptions* config_options) : IExecutionProvider{onnxruntime::kQnnExecutionProvider} { InitProviderOrtApi(); + metadef_id_generator_ = ModelMetadefIdGenerator::Create(); if (config_options) { disable_cpu_ep_fallback_ = config_options->GetConfigOrDefault( @@ -654,7 +655,7 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer const auto gen_metadef_name = [&]() { uint64_t model_hash; - int metadef_id = metadef_id_generator_.GenerateId(graph_viewer, model_hash); + int metadef_id = metadef_id_generator_->GenerateId(graph_viewer, model_hash); return MakeString(QNN, context_node_name_prefix_, "_", model_hash, "_", metadef_id); }; diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index 4324b3ddfef78..168fadfa98e25 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -142,7 +142,7 @@ class QNNExecutionProvider : public IExecutionProvider { bool qnn_context_embed_mode_ = true; int32_t vtcm_size_in_mb_ = 0; std::unique_ptr 
qnn_ep_context_model_; - ModelMetadefIdGenerator metadef_id_generator_; + std::unique_ptr metadef_id_generator_; uint32_t device_id_ = 0; qnn::HtpPerformanceMode default_htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault; uint32_t default_rpc_control_latency_ = 0; From ea2a141a63372b7d497c36c8f64b131cae846cac Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Mon, 16 Dec 2024 00:55:34 -0800 Subject: [PATCH 33/64] Add onnxruntime_providers_qnn.dll to nuget --- .../nuget/generate_nuspec_for_native_nuget.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index 11842f34ce45b..e19e0219e7d5f 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -382,6 +382,7 @@ def generate_files(line_list, args): "tensorrt_ep_shared_lib": "onnxruntime_providers_tensorrt.dll", "openvino_ep_shared_lib": "onnxruntime_providers_openvino.dll", "cuda_ep_shared_lib": "onnxruntime_providers_cuda.dll", + "qnn_ep_shared_lib": "onnxruntime_providers_qnn.dll", "onnxruntime_perf_test": "onnxruntime_perf_test.exe", "onnx_test_runner": "onnx_test_runner.exe", } @@ -777,6 +778,24 @@ def generate_files(line_list, args): + '\\native" />' ) + if args.execution_provider == "qnn": + files_list.append( + "' + ) + files_list.append( + "' + ) + # process all other library dependencies if is_cpu_package or is_cuda_gpu_package or is_dml_package or is_mklml_package: # Process dnnl dependency From d820c9b587967312a5b85e9eaf0ba685844eb4c5 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Mon, 16 Dec 2024 01:13:57 -0800 Subject: [PATCH 34/64] Pass --build_shared_lib to some QNN pipelines. Include Boost::mp11. 
--- cmake/onnxruntime_providers_qnn.cmake | 2 +- .../android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml | 1 + tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml | 2 ++ .../github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml | 2 ++ tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml | 2 ++ 5 files changed, 8 insertions(+), 1 deletion(-) diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index 2a6c63ee01149..53aba28939a1a 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -19,7 +19,7 @@ source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_qnn_cc_srcs}) onnxruntime_add_shared_library_module(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_cc_srcs}) - onnxruntime_add_include_to_target(onnxruntime_providers_qnn ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} onnx onnxruntime_common safeint_interface) + onnxruntime_add_include_to_target(onnxruntime_providers_qnn ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} onnx onnxruntime_common Boost::mp11 safeint_interface) target_link_libraries(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED} ${ABSEIL_LIBS}) add_dependencies(onnxruntime_providers_qnn ${onnxruntime_EXTERNAL_DEPENDENCIES}) target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT} diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index c3dbee336b69d..d080f68ca292f 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -72,6 +72,7 @@ jobs: --android_abi=x86_64 \ --android_api=31 \ --parallel \ + --build_shared_lib \ --use_qnn \ --qnn_home $(QnnSDKRootDir) \ --cmake_generator=Ninja \ diff --git 
a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml index d3826d90f9073..78bd2e20a4763 100644 --- a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml @@ -65,6 +65,7 @@ jobs: --config Release \ --use_binskim_compliant_compile_flags \ --build_java \ + --build_shared_lib \ --use_qnn \ --qnn_home $(QnnSDKRootDir) \ --cmake_generator=Ninja \ @@ -77,6 +78,7 @@ jobs: --config Release \ --use_binskim_compliant_compile_flags \ --build_java \ + --build_shared_lib \ --use_qnn \ --qnn_home $(QnnSDKRootDir) \ --cmake_generator=Ninja \ diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index 5c013fae6be0b..826c43ebd9a15 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -79,6 +79,7 @@ jobs: --config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --cmake_generator "Visual Studio 17 2022" + --build_shared_lib --use_qnn --qnn_home $(QnnSDKRootDir) --update --build --parallel @@ -88,6 +89,7 @@ jobs: --config $(BuildConfig) ^ --build_dir $(Build.BinariesDirectory) ^ --cmake_generator "Visual Studio 17 2022" ^ + --build_shared_lib ^ --use_qnn ^ --qnn_home $(QnnSDKRootDir) ^ --test --enable_onnx_tests diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index 53700c58c7e7d..485c06fdbed04 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -72,6 +72,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --cmake_generator "Visual Studio 17 2022" --build_java + --build_shared_lib --use_qnn --qnn_home $(QnnSDKRootDir) 
--use_binskim_compliant_compile_flags @@ -87,6 +88,7 @@ jobs: --build_dir $(Build.BinariesDirectory) ^ --cmake_generator "Visual Studio 17 2022" ^ --build_java ^ + --build_shared_lib ^ --use_qnn ^ --qnn_home $(QnnSDKRootDir) ^ --use_binskim_compliant_compile_flags ^ From 9861ec8b70a9768a0fb87c8a6c5504621b58cfbe Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Mon, 16 Dec 2024 10:40:23 -0800 Subject: [PATCH 35/64] Copy qnn dll for java build --- cmake/onnxruntime_java.cmake | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cmake/onnxruntime_java.cmake b/cmake/onnxruntime_java.cmake index b15b9632e9e24..662f7cb949ece 100644 --- a/cmake/onnxruntime_java.cmake +++ b/cmake/onnxruntime_java.cmake @@ -148,7 +148,7 @@ if (WIN32) if(NOT onnxruntime_ENABLE_STATIC_ANALYSIS) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_JNI_DIR}/$) - if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT) + if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT OR onnxruntime_USE_QNN) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() if (onnxruntime_USE_CUDA) @@ -163,11 +163,14 @@ if (WIN32) if (onnxruntime_USE_TENSORRT) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() + if (onnxruntime_USE_QNN) + add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) + endif() endif() else() add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) 
add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_JNI_DIR}/$) - if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT) + if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT OR onnxruntime_USE_QNN) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() if (onnxruntime_USE_CUDA) @@ -182,6 +185,9 @@ else() if (onnxruntime_USE_TENSORRT) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() + if (onnxruntime_USE_QNN) + add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) + endif() endif() # run the build process (this copies the results back into CMAKE_CURRENT_BINARY_DIR) From b92043dafb5f049a17e7b938ee5d701b14f2aac1 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Tue, 17 Dec 2024 09:51:21 -0800 Subject: [PATCH 36/64] Add linker -rpath=" --- cmake/onnxruntime_providers_qnn.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index 53aba28939a1a..84a776dac0674 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -20,8 +20,8 @@ source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_qnn_cc_srcs}) onnxruntime_add_shared_library_module(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_cc_srcs}) onnxruntime_add_include_to_target(onnxruntime_providers_qnn ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} onnx onnxruntime_common Boost::mp11 safeint_interface) - target_link_libraries(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED} ${ABSEIL_LIBS}) - 
add_dependencies(onnxruntime_providers_qnn ${onnxruntime_EXTERNAL_DEPENDENCIES}) + target_link_libraries(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED} ${ABSEIL_LIBS} ${CMAKE_DL_LIBS}) + add_dependencies(onnxruntime_providers_qnn onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${onnxruntime_QNN_HOME}/include/QNN @@ -29,7 +29,7 @@ # Set linker flags for function(s) exported by EP DLL if(UNIX) - set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds -Xlinker --gc-sections") + set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\$ORIGIN") elseif(WIN32) set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/qnn/symbols.def") else() From f29ce59bcdf239e2577b35121e67d6896c1c7660 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Tue, 17 Dec 2024 10:31:22 -0800 Subject: [PATCH 37/64] two backslashes in rpath --- cmake/onnxruntime_providers_qnn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index 84a776dac0674..215367f2f040c 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -29,7 +29,7 @@ # Set linker flags for function(s) exported by EP DLL if(UNIX) - set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\$ORIGIN") + set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker 
--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\\$ORIGIN") elseif(WIN32) set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/qnn/symbols.def") else() From 5d954d69ccdcdacf1a18756953b1f3f3575e2030 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Tue, 17 Dec 2024 11:24:58 -0800 Subject: [PATCH 38/64] Copy qnn ep dlls when running java unit tests --- cmake/onnxruntime_unittests.cmake | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index c19a18ef15089..b19e32e87443a 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1608,8 +1608,14 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") # also copy other library dependencies that may be required by tests to native-test if(onnxruntime_USE_QNN) - add_custom_command(TARGET onnxruntime_providers_qnn POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${QNN_LIB_FILES} ${JAVA_NATIVE_TEST_DIR}) + add_custom_command( + TARGET onnxruntime_providers_qnn POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $ + $ + ${QNN_LIB_FILES} + ${JAVA_NATIVE_TEST_DIR} + ) endif() # delegate to gradle's test runner From 03614bcfaaa23bd486699d4f5cc5fa1373a3fe34 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Tue, 17 Dec 2024 13:16:42 -0800 Subject: [PATCH 39/64] Modify Java bindings to use QNN shared lib --- cmake/onnxruntime_unittests.cmake | 10 ++-------- java/src/main/java/ai/onnxruntime/OnnxRuntime.java | 14 ++++++++++++++ java/src/main/java/ai/onnxruntime/OrtSession.java | 9 +++++++-- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index b19e32e87443a..bc7abc885b717 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1608,14 +1608,8 @@ if (NOT 
CMAKE_SYSTEM_NAME STREQUAL "Emscripten") # also copy other library dependencies that may be required by tests to native-test if(onnxruntime_USE_QNN) - add_custom_command( - TARGET onnxruntime_providers_qnn POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - $ - $ - ${QNN_LIB_FILES} - ${JAVA_NATIVE_TEST_DIR} - ) + add_custom_command(TARGET onnxruntime_providers_qnn POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${QNN_LIB_FILES} ${JAVA_NATIVE_TEST_DIR}) endif() # delegate to gradle's test runner diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java index b80debdde47c4..ae498bee69353 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java +++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java @@ -76,6 +76,9 @@ final class OnnxRuntime { /** The short name of the ONNX runtime TensorRT provider library */ static final String ONNXRUNTIME_LIBRARY_TENSORRT_NAME = "onnxruntime_providers_tensorrt"; + /** The short name of the ONNX runtime QNN provider library */ + static final String ONNXRUNTIME_LIBRARY_QNN_NAME = "onnxruntime_providers_qnn"; + /** The OS & CPU architecture string */ private static final String OS_ARCH_STR = initOsArch(); @@ -252,6 +255,17 @@ static boolean extractTensorRT() { return extractProviderLibrary(ONNXRUNTIME_LIBRARY_TENSORRT_NAME); } + /** + * Extracts the QNN provider library from the classpath resources if present, or checks to + * see if the QNN provider library is in the directory specified by {@link + * #ONNXRUNTIME_NATIVE_PATH}. + * + * @return True if the QNN provider library is ready for loading, false otherwise. + */ + static boolean extractQNN() { + return extractProviderLibrary(ONNXRUNTIME_LIBRARY_QNN_NAME); + } + /** * Extracts a shared provider library from the classpath resources if present, or checks to see if * that library is in the directory specified by {@link #ONNXRUNTIME_NATIVE_PATH}. 
diff --git a/java/src/main/java/ai/onnxruntime/OrtSession.java b/java/src/main/java/ai/onnxruntime/OrtSession.java index 32dc9d9f84aaa..700008e66bb36 100644 --- a/java/src/main/java/ai/onnxruntime/OrtSession.java +++ b/java/src/main/java/ai/onnxruntime/OrtSession.java @@ -1319,8 +1319,13 @@ public void addXnnpack(Map providerOptions) throws OrtException * @throws OrtException If there was an error in native code. */ public void addQnn(Map providerOptions) throws OrtException { - String qnnProviderName = "QNN"; - addExecutionProvider(qnnProviderName, providerOptions); + if (OnnxRuntime.extractQNN()) { + String qnnProviderName = "QNN"; + addExecutionProvider(qnnProviderName, providerOptions); + } else { + throw new OrtException( + OrtException.OrtErrorCode.ORT_EP_FAIL, "Failed to find QNN shared provider"); + } } /** From 421f39f37676de257bc5eca42584ea3bad2f1a0c Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Tue, 17 Dec 2024 13:54:15 -0800 Subject: [PATCH 40/64] Try to build onnxruntime_providers_shared on Android --- cmake/onnxruntime_providers_cpu.cmake | 1 - cmake/onnxruntime_unittests.cmake | 2 +- java/src/main/java/ai/onnxruntime/OnnxRuntime.java | 5 ++--- tools/ci_build/github/android/build_aar_package.py | 10 ++++++++-- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake index 91a2b13002ec9..79e430763da93 100644 --- a/cmake/onnxruntime_providers_cpu.cmake +++ b/cmake/onnxruntime_providers_cpu.cmake @@ -215,7 +215,6 @@ set_target_properties(onnxruntime_providers PROPERTIES FOLDER "ONNXRuntime") if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin|iOS|visionOS" - AND NOT CMAKE_SYSTEM_NAME STREQUAL "Android" AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") file(GLOB onnxruntime_providers_shared_cc_srcs CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/core/providers/shared/*.h" diff --git 
a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index bc7abc885b717..c19a18ef15089 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1608,7 +1608,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") # also copy other library dependencies that may be required by tests to native-test if(onnxruntime_USE_QNN) - add_custom_command(TARGET onnxruntime_providers_qnn POST_BUILD + add_custom_command(TARGET onnxruntime_providers_qnn POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${QNN_LIB_FILES} ${JAVA_NATIVE_TEST_DIR}) endif() diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java index ae498bee69353..01bf33f8d36e5 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java +++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java @@ -256,9 +256,8 @@ static boolean extractTensorRT() { } /** - * Extracts the QNN provider library from the classpath resources if present, or checks to - * see if the QNN provider library is in the directory specified by {@link - * #ONNXRUNTIME_NATIVE_PATH}. + * Extracts the QNN provider library from the classpath resources if present, or checks to see if + * the QNN provider library is in the directory specified by {@link #ONNXRUNTIME_NATIVE_PATH}. * * @return True if the QNN provider library is ready for loading, false otherwise. 
*/ diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index 1b34b3d302e57..7294e0548e1b2 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -129,7 +129,10 @@ def _build_aar(args): # to jnilibs/[abi] for later compiling the aar package abi_jnilibs_dir = os.path.join(jnilibs_dir, abi) os.makedirs(abi_jnilibs_dir, exist_ok=True) - for lib_name in ["libonnxruntime.so", "libonnxruntime4j_jni.so"]: + sym_link_libs = ["libonnxruntime.so", "libonnxruntime4j_jni.so"] + if qnn_android_build: + sym_link_libs.extend(["libonnxruntime_providers_shared.so", "libonnxruntime_providers_qnn.so"]) + for lib_name in sym_link_libs: target_lib_name = os.path.join(abi_jnilibs_dir, lib_name) # If the symbolic already exists, delete it first # For some reason, os.path.exists will return false for a symbolic link in Linux, @@ -141,7 +144,10 @@ def _build_aar(args): # copy executables for each abi, in case we want to publish those as well # some of them might not exist, e.g., if we skip building the tests abi_exe_dir = os.path.join(exe_dir, abi) - for exe_name in ["libonnxruntime.so", "onnxruntime_perf_test", "onnx_test_runner"]: + execs_to_copy = ["libonnxruntime.so", "onnxruntime_perf_test", "onnx_test_runner"] + if qnn_android_build: + execs_to_copy.extend(["libonnxruntime_providers_shared.so", "libonnxruntime_providers_qnn.so"]) + for exe_name in execs_to_copy: src_exe_path = os.path.join(abi_build_dir, build_config, exe_name) if not os.path.exists(src_exe_path): continue From 9354f18dad37234043b3abe4d9be1c2fb010ca5c Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 18 Dec 2024 17:19:42 -0800 Subject: [PATCH 41/64] Pass linker flag to Android build of qnn dll --- cmake/onnxruntime_providers_qnn.cmake | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cmake/onnxruntime_providers_qnn.cmake 
b/cmake/onnxruntime_providers_qnn.cmake index 215367f2f040c..847a677a4d316 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -29,7 +29,15 @@ # Set linker flags for function(s) exported by EP DLL if(UNIX) - set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\\$ORIGIN") + string(CONCAT ONNXRUNTIME_PROVIDERS_QNN_LINK_FLAGS + "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds " + "-Xlinker --gc-sections -Xlinker -rpath=\\$ORIGIN") + if(CMAKE_SYSTEM_NAME STREQUAL "Android") + string(CONCAT ONNXRUNTIME_PROVIDERS_QNN_LINK_FLAGS + "${ONNXRUNTIME_PROVIDERS_QNN_LINK_FLAGS} " + "-Xlinker -undefined=Provider_GetHost") + endif() + set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "${ONNXRUNTIME_PROVIDERS_QNN_LINK_FLAGS}") elseif(WIN32) set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/qnn/symbols.def") else() From e8df64fc3ae197c299a0797f58448d8315313752 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 18 Dec 2024 20:03:22 -0800 Subject: [PATCH 42/64] Try different linker flag for android --- cmake/onnxruntime_providers_qnn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index 847a677a4d316..6eb547927073b 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -35,7 +35,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android") string(CONCAT ONNXRUNTIME_PROVIDERS_QNN_LINK_FLAGS "${ONNXRUNTIME_PROVIDERS_QNN_LINK_FLAGS} " - "-Xlinker -undefined=Provider_GetHost") + "-Xlinker --allow-shlib-undefined") # Allow undefined global symbols (e.g., Provider_GetHost) in shared library endif() set_property(TARGET 
onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "${ONNXRUNTIME_PROVIDERS_QNN_LINK_FLAGS}") elseif(WIN32) From 863ff04c65713f8043e5e4979970206be7db60c3 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 18 Dec 2024 20:56:17 -0800 Subject: [PATCH 43/64] Use -z undefs on android --- cmake/onnxruntime_providers_qnn.cmake | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index 6eb547927073b..505c357d516d0 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -29,15 +29,20 @@ # Set linker flags for function(s) exported by EP DLL if(UNIX) - string(CONCAT ONNXRUNTIME_PROVIDERS_QNN_LINK_FLAGS - "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds " - "-Xlinker --gc-sections -Xlinker -rpath=\\$ORIGIN") if(CMAKE_SYSTEM_NAME STREQUAL "Android") - string(CONCAT ONNXRUNTIME_PROVIDERS_QNN_LINK_FLAGS - "${ONNXRUNTIME_PROVIDERS_QNN_LINK_FLAGS} " - "-Xlinker --allow-shlib-undefined") # Allow undefined global symbols (e.g., Provider_GetHost) in shared library + target_link_options(onnxruntime_providers_qnn PRIVATE + "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds" + "LINKER:--gc-sections" + "LINKER:-rpath=\$ORIGIN" + "LINKER:-z,undefs" + ) + else() + target_link_options(onnxruntime_providers_qnn PRIVATE + "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds" + "LINKER:--gc-sections" + "LINKER:-rpath=\$ORIGIN" + ) endif() - set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "${ONNXRUNTIME_PROVIDERS_QNN_LINK_FLAGS}") elseif(WIN32) set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/qnn/symbols.def") else() From ef1b91da26e6366b474c460f96ee53ca483d8e57 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 18 Dec 2024 
22:47:33 -0800 Subject: [PATCH 44/64] Expose ETW logger functionality via provider bridge. Fix multithreading bug in callback from Etw (QNN EP needs to properly lock a logging mutex). --- .../qnn/builder/qnn_backend_manager.cc | 37 ++++++------------- .../qnn/builder/qnn_backend_manager.h | 20 ++++------ onnxruntime/core/providers/qnn/ort_api.h | 6 +++ .../providers/qnn/qnn_execution_provider.cc | 23 ++---------- .../providers/qnn/qnn_execution_provider.h | 11 +----- .../providers/shared_library/provider_api.h | 37 +++++++++++++++++++ .../provider_bridge_provider.cc | 2 +- .../shared_library/provider_interfaces.h | 12 ++++++ .../shared_library/provider_wrappedtypes.h | 15 ++++++++ .../core/session/provider_bridge_ort.cc | 30 ++++++++++++++- 10 files changed, 124 insertions(+), 69 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index c816858018411..494c0a53eaab4 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -22,11 +22,6 @@ #include "core/providers/qnn/builder/onnx_ctx_model_helper.h" #include "core/providers/qnn/builder/qnn_configs_helper.h" -#ifdef _WIN32 -#include -#include "core/platform/tracing.h" -#endif - // Flag to determine if Backend should do node validation for each opNode added #define DO_GRAPH_NODE_VALIDATIONS 1 @@ -255,7 +250,9 @@ void QnnLogging(const char* format, } } -Status QnnBackendManager::InitializeQnnLog() { +Status QnnBackendManager::InitializeQnnLog(const logging::Logger& logger) { + logger_ = &logger; + // Set Qnn log level align with Ort log level auto ort_log_level = logger_->GetSeverity(); QnnLog_Level_t qnn_log_level = MapOrtSeverityToQNNLogLevel(ort_log_level); @@ -303,23 +300,15 @@ QnnLog_Level_t QnnBackendManager::MapOrtSeverityToQNNLogLevel(logging::Severity } } -Status QnnBackendManager::ResetQnnLogLevel() { +Status 
QnnBackendManager::ResetQnnLogLevel(std::optional ort_log_level) { std::lock_guard lock(logger_mutex_); - - if (backend_setup_completed_ && logger_ != nullptr) { - auto ort_log_level = logger_->GetSeverity(); - LOGS(*logger_, INFO) << "Reset Qnn log level to ORT Logger level: " << (unsigned int)ort_log_level; - return UpdateQnnLogLevel(ort_log_level); + if (!backend_setup_completed_ || logger_ == nullptr) { + return Status::OK(); } - return Status::OK(); -} - -Status QnnBackendManager::UpdateQnnLogLevel(logging::Severity ort_log_level) { ORT_RETURN_IF(nullptr == log_handle_, "Unable to update QNN Log Level. Invalid QNN log handle."); - ORT_RETURN_IF(false == backend_setup_completed_, "Unable to update QNN Log Level. Backend setup not completed."); - ORT_RETURN_IF(nullptr == logger_, "Unable to update QNN Log Level. Invalid logger."); - QnnLog_Level_t qnn_log_level = MapOrtSeverityToQNNLogLevel(ort_log_level); + logging::Severity actual_log_level = ort_log_level.has_value() ? *ort_log_level : logger_->GetSeverity(); + QnnLog_Level_t qnn_log_level = MapOrtSeverityToQNNLogLevel(actual_log_level); LOGS(*logger_, INFO) << "Updating Qnn log level to: " << qnn_log_level; @@ -332,7 +321,8 @@ Status QnnBackendManager::UpdateQnnLogLevel(logging::Severity ort_log_level) { LOGS(*logger_, ERROR) << "Invalid log handle provided to QnnLog_setLogLevel."; } } - ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to set log level in Qnn backend. Error: ", QnnErrorHandleToString(result)); + ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, + "Failed to set log level in Qnn backend. 
Error: ", QnnErrorHandleToString(result)); return Status::OK(); } @@ -823,7 +813,7 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger, LOGS(logger, VERBOSE) << "Backend build version: " << sdk_build_version_; - SetLogger(&logger); + ORT_RETURN_IF_ERROR(InitializeQnnLog(logger)); LOGS(logger, VERBOSE) << "SetLogger succeed."; ORT_RETURN_IF_ERROR(InitializeBackend()); @@ -1098,8 +1088,6 @@ Status QnnBackendManager::ExtractBackendProfilingInfo() { } bool tracelogging_provider_ep_enabled = false; - // TODO: Re-enable when QNN EP is a dll -#if 0 const Env& env = GetDefaultEnv(); auto& provider = env.GetTelemetryProvider(); auto level = provider.Level(); @@ -1109,7 +1097,6 @@ Status QnnBackendManager::ExtractBackendProfilingInfo() { tracelogging_provider_ep_enabled = true; } } -#endif // ETW disabled previously, but enabled now if (ProfilingLevel::INVALID == profiling_level_etw_ && tracelogging_provider_ep_enabled) { @@ -1327,7 +1314,7 @@ void QnnBackendManager::LogQnnProfileEventAsTraceLogging( const std::string& timingSource, const std::string& eventLevel, const char* eventIdentifier) { - // TODO: Re-enable when QNN EP is a dll + // TODO: Re-enable when add a method to ORT Telemetry provider to log EP profiling data. 
#if 0 TraceLoggingWrite( telemetry_provider_handle, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index 661a830bfb733..a52738e8263bf 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -120,18 +120,10 @@ class QnnBackendManager { const Qnn_ProfileHandle_t& GetQnnProfileHandle() { return profile_backend_handle_; } - void SetLogger(const logging::Logger* logger) { - if (logger_ == nullptr) { - logger_ = logger; - (void)InitializeQnnLog(); - } - } - - Status InitializeQnnLog(); - - Status UpdateQnnLogLevel(logging::Severity ort_log_level); - - Status ResetQnnLogLevel(); + // Resets the QNN log level to the given ORT log level or to the default log level if the argument is + // std::nullopt. + // IMPORTANT: This function locks the internal `logging_mutex_`. + Status ResetQnnLogLevel(std::optional ort_log_level = std::nullopt); // Terminate logging in the backend Status TerminateQnnLog() { @@ -170,6 +162,10 @@ class QnnBackendManager { uint64_t& max_spill_fill_buffer_size); private: + // Sets the ORT logger and creates a corresponding QNN logger with the same log level. + // IMPORTANT: caller must lock the `logger_mutex_` before calling this function. 
+ Status InitializeQnnLog(const logging::Logger& logger); + void* LoadLib(const char* file_name, int flags, std::string& error_msg); Status LoadQnnSystemLib(); diff --git a/onnxruntime/core/providers/qnn/ort_api.h b/onnxruntime/core/providers/qnn/ort_api.h index 44face145fb04..96f6ab76f113f 100644 --- a/onnxruntime/core/providers/qnn/ort_api.h +++ b/onnxruntime/core/providers/qnn/ort_api.h @@ -6,6 +6,12 @@ #define BUILD_QNN_EP_STATIC 0 #if BUILD_QNN_EP_STATIC +#ifdef _WIN32 +#include +#include "core/platform/tracing.h" +#include "core/platform/windows/logging/etw_sink.h" +#endif + // Includes when building QNN EP statically #include "onnx/defs/data_type_utils.h" #include "core/common/common.h" diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 99de2089cea13..afddcec7a182b 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -13,14 +13,6 @@ #include "core/providers/qnn/builder/qnn_def.h" #include "core/providers/qnn/builder/onnx_ctx_model_helper.h" -#ifdef _WIN32 -#include -// TODO: Enable once QNN is built as a DLL -#if 0 -#include "core/platform/windows/logging/etw_sink.h" -#endif -#endif // _WIN32 - namespace onnxruntime { constexpr const char* QNN = "QNN"; @@ -218,8 +210,6 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio // set to invalid to indicate that ETW is no enabled when we setup QNN qnn::ProfilingLevel profiling_level_etw = qnn::ProfilingLevel::INVALID; -// TODO: Re-enable ETW after QNN is a DLL -#if 0 const Env& env = GetDefaultEnv(); auto& provider = env.GetTelemetryProvider(); if (provider.IsEnabled()) { @@ -231,7 +221,6 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } } } -#endif // In case ETW gets disabled later auto profiling_level_pos = provider_options_map.find(PROFILING_LEVEL); @@ -377,9 +366,7 @@ 
QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio soc_model, enable_htp_weight_sharing); -// TODO: Renable once QNN is a dll -#if 0 -#ifdef _WIN32 +#if defined(_WIN32) && defined(ETW_TRACE_LOGGING_SUPPORTED) auto& etwRegistrationManager = logging::EtwRegistrationManager::Instance(); // Register callback for ETW capture state (rundown) callback_ETWSink_provider_ = onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback( @@ -400,7 +387,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio if (IsEnabled == EVENT_CONTROL_CODE_ENABLE_PROVIDER) { if ((MatchAnyKeyword & static_cast(onnxruntime::logging::ORTTraceLoggingKeyword::Logs)) != 0) { auto ortETWSeverity = etwRegistrationManager.MapLevelToSeverity(); - (void)qnn_backend_manager_->UpdateQnnLogLevel(ortETWSeverity); + (void)qnn_backend_manager_->ResetQnnLogLevel(ortETWSeverity); } if ((MatchAnyKeyword & static_cast(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)) != 0) { if (Level != 0) { @@ -421,7 +408,6 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio }); etwRegistrationManager.RegisterInternalCallback(callback_ETWSink_provider_); #endif -#endif } QNNExecutionProvider::~QNNExecutionProvider() { @@ -434,14 +420,11 @@ QNNExecutionProvider::~QNNExecutionProvider() { } // Unregister the ETW callback -#ifdef _WIN32 - // TODO: Re-enable when QNN EP is a DLL -#if 0 +#if defined(_WIN32) && defined(ETW_TRACE_LOGGING_SUPPORTED) if (callback_ETWSink_provider_ != nullptr) { logging::EtwRegistrationManager::Instance().UnregisterInternalCallback(callback_ETWSink_provider_); } #endif -#endif } // Logs information about the supported/unsupported nodes. 
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index 168fadfa98e25..cd3ccd96e31ab 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -13,12 +13,6 @@ #include "core/providers/qnn/builder/qnn_model.h" #include "core/providers/qnn/builder/qnn_configs_helper.h" #include "HTP/QnnHtpGraph.h" -#ifdef _WIN32 -// TODO: Reenable when QNN ep is a dll -#if 0 -#include "core/platform/windows/logging/etw_sink.h" -#endif -#endif namespace onnxruntime { @@ -149,11 +143,8 @@ class QNNExecutionProvider : public IExecutionProvider { bool enable_HTP_FP16_precision_ = true; bool share_ep_contexts_ = false; bool enable_spill_fill_buffer_ = false; -#ifdef _WIN32 - // TODO: Re-enable when QNN is a DLL -#if 0 +#if defined(_WIN32) && defined(ETW_TRACE_LOGGING_SUPPORTED) onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback callback_ETWSink_provider_ = nullptr; -#endif #endif qnn::ModelSettings model_settings_ = {}; diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index dc28848b2bab0..d4295b88faa79 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -9,6 +9,25 @@ #pragma once #define SHARED_PROVIDER 1 +#ifdef _WIN32 +#include + +// ETW requires Windows 10 SDK or later +// https://stackoverflow.com/questions/2665755/how-can-i-determine-the-version-of-the-windows-sdk-installed-on-my-computer +#if VER_PRODUCTBUILD > 9600 +// ETW trace logging uses Windows 10 SDK's TraceLoggingProvider.h +#define ETW_TRACE_LOGGING_SUPPORTED 1 +#endif // VER_PRODUCTBUILD > 9600 + +#ifdef ETW_TRACE_LOGGING_SUPPORTED +#include +// TraceLoggingProvider.h must follow Windows.h +#include +#include +#include +#endif // defined(ETW_TRACE_LOGGING_SUPPORTED) +#endif // defined(_WIN32) + 
#include #include #include @@ -136,6 +155,17 @@ enum class DataType { USER = 1 ///< Contains potentially sensitive user data. }; +enum class ORTTraceLoggingKeyword : uint64_t { + Session = 0x1, // ORT Session TraceLoggingWrite + Logs = 0x2, // LOGS() Macro ORT logs. Pair with an appropriate level depending on detail required + Reserved1 = 0x4, // Reserved if we want to add some specific sub-categories instead of just LOGS() or other uses + Reserved2 = 0x8, + Reserved3 = 0x10, + Reserved4 = 0x20, + Reserved5 = 0x40, + Reserved6 = 0x80, + Profiling = 0x100 // Enables profiling. At higher levels >5 can impact inference performance +}; } // namespace logging // OnnxRuntime Types (these are the internal types) @@ -143,6 +173,13 @@ struct CPUIDInfo; namespace logging { struct Logger; struct Capture; +#ifdef ETW_TRACE_LOGGING_SUPPORTED +struct EtwRegistrationManager; +using EtwRegistrationManager_EtwInternalCallback = std::function; +#endif } // namespace logging struct ComputeCapability; struct ConfigOptions; diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index 456e164917587..4c050534456da 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -791,5 +791,5 @@ std::string ToUTF8String(const std::wstring& s) { std::wstring ToWideString(const std::string& s) { return g_host->ToWideString(s); } -#endif +#endif // _WIN32 } // namespace onnxruntime diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 41ad441db616e..76ccd361761a7 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -291,6 +291,18 @@ struct ProviderHost { virtual std::ostream& 
logging__Capture__Stream(logging::Capture* p) noexcept = 0; virtual void logging__Capture__ProcessPrintf(logging::Capture* p, const char* format, va_list args) = 0; +#if defined(ETW_TRACE_LOGGING_SUPPORTED) + // logging::EtwRegistrationManager + virtual logging::EtwRegistrationManager& logging__EtwRegistrationManager__Instance() = 0; + virtual logging::Severity logging__EtwRegistrationManager__MapLevelToSeverity(logging::EtwRegistrationManager* p) = 0; + virtual void logging__EtwRegistrationManager__RegisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) = 0; + virtual void logging__EtwRegistrationManager__UnregisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) = 0; +#endif // defined(ETW_TRACE_LOGGING_SUPPORTED) + // Env virtual Env& Env__Default() = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 3b5e5039796f7..b7817e98377eb 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -48,6 +48,21 @@ struct Capture final { Capture(const Capture&) = delete; void operator=(const Capture&) = delete; }; + +#if defined(ETW_TRACE_LOGGING_SUPPORTED) +struct EtwRegistrationManager final { + using EtwInternalCallback = EtwRegistrationManager_EtwInternalCallback; + static EtwRegistrationManager& Instance() { return g_host->logging__EtwRegistrationManager__Instance(); } + Severity MapLevelToSeverity() { return g_host->logging__EtwRegistrationManager__MapLevelToSeverity(this); } + void RegisterInternalCallback(const EtwInternalCallback& callback) { + g_host->logging__EtwRegistrationManager__RegisterInternalCallback(this, callback); + } + void UnregisterInternalCallback(const EtwInternalCallback& callback) { + 
g_host->logging__EtwRegistrationManager__UnregisterInternalCallback(this, callback); + } +}; +#endif // defined(ETW_TRACE_LOGGING_SUPPORTED) + } // namespace logging } // namespace onnxruntime diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 2ad85552aa813..78c441efea856 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -37,7 +37,6 @@ #include "core/framework/model_metadef_id_generator.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" #include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" -#include "core/session/onnxruntime_session_options_config_keys.h" #include "core/session/onnxruntime_c_api.h" #include "core/common/string_helper.h" @@ -62,6 +61,10 @@ #include "orttraining/core/framework/distributed_run_context.h" #endif +#ifdef _WIN32 +#include "core/platform/windows/logging/etw_sink.h" +#endif + namespace ONNX_NAMESPACE { // We use these names in the provider API because we don't have the protobuf definitions of the RepeatedField* types using int64s = google::protobuf::RepeatedField; @@ -77,6 +80,11 @@ namespace onnxruntime { using IndexedSubGraph_MetaDef = IndexedSubGraph::MetaDef; using IndexedSubGraph_SourceOfSchema = IndexedSubGraph::SourceOfSchema; using Node_EdgeEnd = Node::EdgeEnd; +#ifdef ETW_TRACE_LOGGING_SUPPORTED +namespace logging { +using EtwRegistrationManager_EtwInternalCallback = EtwRegistrationManager::EtwInternalCallback; +} +#endif } // namespace onnxruntime #include "core/common/cpuid_info.h" @@ -400,6 +408,26 @@ struct ProviderHostImpl : ProviderHost { p->ProcessPrintf(format, args); } +#if defined(ETW_TRACE_LOGGING_SUPPORTED) + // logging::EtwRegistrationManager + logging::EtwRegistrationManager& logging__EtwRegistrationManager__Instance() override { + return logging::EtwRegistrationManager::Instance(); + } + logging::Severity 
logging__EtwRegistrationManager__MapLevelToSeverity(logging::EtwRegistrationManager* p) override { + return p->MapLevelToSeverity(); + } + void logging__EtwRegistrationManager__RegisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) override { + p->RegisterInternalCallback(callback); + } + void logging__EtwRegistrationManager__UnregisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) override { + p->UnregisterInternalCallback(callback); + } +#endif // defined(ETW_TRACE_LOGGING_SUPPORTED) + // Env Env& Env__Default() override { return Env::Default(); } From 57e00722fdb27e48de3407a5d205e12e1c13af98 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 18 Dec 2024 22:53:43 -0800 Subject: [PATCH 45/64] Cmake java android: copy libonnxruntime_providers_shared.so and libonnxruntime_providers_qnn.so to directory for building Android AAR package --- cmake/onnxruntime_java.cmake | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cmake/onnxruntime_java.cmake b/cmake/onnxruntime_java.cmake index 662f7cb949ece..f1c6379afab72 100644 --- a/cmake/onnxruntime_java.cmake +++ b/cmake/onnxruntime_java.cmake @@ -214,6 +214,16 @@ if (ANDROID) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${ANDROID_PACKAGE_ABI_DIR}/$) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${ANDROID_PACKAGE_ABI_DIR}/$) + # If using QNN, also copy libonnxruntime_providers_shared.so and libonnxruntime_providers_qnn.so + if (onnxruntime_USE_QNN) + add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different + $ + ${ANDROID_PACKAGE_ABI_DIR}/$) + add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different + $ + ${ANDROID_PACKAGE_ABI_DIR}/$) + endif() + # 
Generate the Android AAR package add_custom_command(TARGET onnxruntime4j_jni POST_BUILD From bd32daaf217b30fba222dcea5bb2ed2b19590ea6 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Wed, 18 Dec 2024 23:05:05 -0800 Subject: [PATCH 46/64] QNN Nuget Pipeline: print contents of binaries directory to see if shared libs were copied --- .../github/azure-pipelines/templates/qnn-ep-win.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index aa0b6bf6d391e..4204a1ba7b90f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -93,12 +93,18 @@ stages: workingFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}' createLogFile: true + - task: CmdLine@2 + displayName: 'Print contents of binaries directory' + inputs: + script: | + dir '$(Build.BinariesDirectory)\${{ parameters.build_config }}' + - template: win-esrp-dll.yml parameters: FolderPath: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }}' DisplayName: 'ESRP - Sign dlls' DoEsrp: ${{ parameters.DoEsrp }} - Pattern: 'onnxruntime.dll' + Pattern: 'onnxruntime*.dll' - task: MSBuild@1 displayName: 'Restore NuGet Packages and create project.assets.json' From 8a65a1d34120ef329c2444efe8044f161d1417fd Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Thu, 19 Dec 2024 00:33:50 -0800 Subject: [PATCH 47/64] Fix print in yaml --- tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index 4204a1ba7b90f..23147804e856b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ 
b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -97,7 +97,7 @@ stages: displayName: 'Print contents of binaries directory' inputs: script: | - dir '$(Build.BinariesDirectory)\${{ parameters.build_config }}' + dir $(Build.BinariesDirectory)\${{ parameters.build_config }} - template: win-esrp-dll.yml parameters: From 7ff94a0b85692ff0f0a9ebc7bee1b429f6852b34 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Thu, 19 Dec 2024 00:59:48 -0800 Subject: [PATCH 48/64] Fix cmake copy command for Java build with qnn --- cmake/onnxruntime_java.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/onnxruntime_java.cmake b/cmake/onnxruntime_java.cmake index f1c6379afab72..d96f90d02b1de 100644 --- a/cmake/onnxruntime_java.cmake +++ b/cmake/onnxruntime_java.cmake @@ -218,10 +218,10 @@ if (ANDROID) if (onnxruntime_USE_QNN) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ - ${ANDROID_PACKAGE_ABI_DIR}/$) + ${ANDROID_PACKAGE_ABI_DIR}/$) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ - ${ANDROID_PACKAGE_ABI_DIR}/$) + ${ANDROID_PACKAGE_ABI_DIR}/$) endif() # Generate the Android AAR package From 0fbff4ad3f112d4444dae4d5a8e3e79f556104c1 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Thu, 19 Dec 2024 02:01:19 -0800 Subject: [PATCH 49/64] Try to fix nuget shared lib files for qnn --- tools/nuget/generate_nuspec_for_native_nuget.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index e19e0219e7d5f..980455ccddb0e 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -778,7 +778,7 @@ def generate_files(line_list, args): + '\\native" />' ) - if args.execution_provider == "qnn": + if args.execution_provider == "qnn" or is_qnn_package and not 
is_ado_packaging_build: files_list.append( " Date: Thu, 19 Dec 2024 10:14:21 -0800 Subject: [PATCH 50/64] Pass --build_shared_lib when building python wheels with qnn [pipelines] --- .../github/azure-pipelines/templates/py-win-arm64-qnn.yml | 1 + .../github/azure-pipelines/templates/py-win-arm64ec-qnn.yml | 1 + .../github/azure-pipelines/templates/py-win-x64-qnn.yml | 1 + tools/ci_build/github/linux/build_linux_python_package.sh | 2 +- 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml index e07f0afa6109c..da58b70be7f83 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml @@ -94,6 +94,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "$(VSGenerator)" + --build_shared_lib --use_qnn --qnn_home $(QnnSDKRootDir) --enable_pybind diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml index 8cc647c2464f3..e64a184d8ebeb 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml @@ -92,6 +92,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "$(VSGenerator)" + --build_shared_lib --use_qnn --qnn_home $(QnnSDKRootDir) --enable_pybind diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml index 466fee92d0d5e..a61bfc7706818 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml @@ -92,6 +92,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync
--cmake_generator "$(VSGenerator)" + --build_shared_lib --use_qnn --qnn_home $(QnnSDKRootDir) --enable_pybind diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index e2e0cea69efb5..11997382d119c 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -94,7 +94,7 @@ fi if [ "$BUILD_DEVICE" == "NPU" ]; then #Enable QNN EP - BUILD_ARGS+=("--use_qnn" "--qnn_home=/qnn_sdk") + BUILD_ARGS+=("--build_shared_lib" "--use_qnn" "--qnn_home=/qnn_sdk") fi export ONNX_ML=1 From f8747db958e7d16b95db7cda2b9975b0e9f7a05d Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Thu, 19 Dec 2024 13:11:44 -0800 Subject: [PATCH 51/64] Print correct binary directory in QNN Nuget pipeline --- tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index 23147804e856b..4a437be325e7a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -97,7 +97,7 @@ stages: displayName: 'Print contents of binaries directory' inputs: script: | - dir $(Build.BinariesDirectory)\${{ parameters.build_config }} + dir $(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }} - template: win-esrp-dll.yml parameters: From 058e7bbbf23a4cf2c6384880e49e1440393b9fe3 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Thu, 19 Dec 2024 13:32:18 -0800 Subject: [PATCH 52/64] Add onnxruntime_providers_qnn.dll/.so to setup.py so that it gets copied when building python wheel --- setup.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index c1580eeb9e8f9..5c464eec537ec 100644 --- a/setup.py +++ 
b/setup.py @@ -311,17 +311,20 @@ def finalize_options(self): providers_tensorrt_or_migraphx = "onnxruntime_providers_" + ("migraphx" if is_migraphx else "tensorrt") providers_openvino = "onnxruntime_providers_openvino" providers_cann = "onnxruntime_providers_cann" +providers_qnn = "onnxruntime_providers_qnn" if platform.system() == "Linux": providers_cuda_or_rocm = "lib" + providers_cuda_or_rocm + ".so" providers_tensorrt_or_migraphx = "lib" + providers_tensorrt_or_migraphx + ".so" providers_openvino = "lib" + providers_openvino + ".so" providers_cann = "lib" + providers_cann + ".so" + providers_qnn = "lib" + providers_qnn + ".so" elif platform.system() == "Windows": providers_cuda_or_rocm = providers_cuda_or_rocm + ".dll" providers_tensorrt_or_migraphx = providers_tensorrt_or_migraphx + ".dll" providers_openvino = providers_openvino + ".dll" providers_cann = providers_cann + ".dll" + providers_qnn = providers_qnn + ".dll" # Additional binaries dl_libs = [] @@ -341,8 +344,9 @@ def finalize_options(self): dl_libs.append(providers_cuda_or_rocm) dl_libs.append(providers_tensorrt_or_migraphx) dl_libs.append(providers_cann) + dl_libs.append(providers_qnn) dl_libs.append("libonnxruntime.so*") - # DNNL, TensorRT & OpenVINO EPs are built as shared libs + # DNNL, TensorRT, OpenVINO, and QNN EPs are built as shared libs libs.extend(["libonnxruntime_providers_shared.so"]) libs.extend(["libonnxruntime_providers_dnnl.so"]) libs.extend(["libonnxruntime_providers_openvino.so"]) @@ -350,6 +354,7 @@ def finalize_options(self): libs.append(providers_cuda_or_rocm) libs.append(providers_tensorrt_or_migraphx) libs.append(providers_cann) + libs.append(providers_qnn) # QNN qnn_deps = [ "libQnnCpu.so", @@ -388,13 +393,14 @@ def finalize_options(self): providers_cann, "onnxruntime.dll", ] - # DNNL, TensorRT & OpenVINO EPs are built as shared libs + # DNNL, TensorRT, OpenVINO, and QNN EPs are built as shared libs libs.extend(["onnxruntime_providers_shared.dll"]) 
libs.extend(["onnxruntime_providers_dnnl.dll"]) libs.extend(["onnxruntime_providers_tensorrt.dll"]) libs.extend(["onnxruntime_providers_openvino.dll"]) libs.extend(["onnxruntime_providers_cuda.dll"]) libs.extend(["onnxruntime_providers_vitisai.dll"]) + libs.extend(["onnxruntime_providers_qnn.dll"]) # DirectML Libs libs.extend(["DirectML.dll"]) # QNN V68/V73 dependencies From 4172575575cab93ba09cbd9d4a4c745fcebe38ea Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Thu, 19 Dec 2024 14:54:11 -0800 Subject: [PATCH 53/64] Edit Java bindins to allow loading/extracting shared provider libs on Android --- .../main/java/ai/onnxruntime/OnnxRuntime.java | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java index 01bf33f8d36e5..0830ed5b7db39 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java +++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java @@ -162,8 +162,14 @@ static synchronized void init() throws IOException { // the ONNX Runtime native library will load it extractProviderLibrary(ONNXRUNTIME_LIBRARY_SHARED_NAME); - load(ONNXRUNTIME_LIBRARY_NAME); - load(ONNXRUNTIME_JNI_LIBRARY_NAME); + if (isAndroid()) { + // On Android, we only need to load onnxruntime4j_jni with System.loadLibrary + System.loadLibrary(ONNXRUNTIME_JNI_LIBRARY_NAME); + } else { + load(ONNXRUNTIME_LIBRARY_NAME); + load(ONNXRUNTIME_JNI_LIBRARY_NAME); + } + ortApiHandle = initialiseAPIBase(ORT_API_VERSION_14); if (ortApiHandle == 0L) { throw new IllegalStateException( @@ -273,10 +279,6 @@ static boolean extractQNN() { * @return True if the library is ready for loading by ORT's native code, false otherwise. 
*/ static synchronized boolean extractProviderLibrary(String libraryName) { - // Android does not need to extract library and it has no shared provider library - if (isAndroid()) { - return false; - } // Check if we've already extracted or check this provider, and it's ready if (extractedSharedProviders.contains(libraryName)) { return true; @@ -323,12 +325,6 @@ static boolean isAndroid() { * @throws IOException If the file failed to read or write. */ private static void load(String library) throws IOException { - // On Android, we simply use System.loadLibrary - if (isAndroid()) { - System.loadLibrary("onnxruntime4j_jni"); - return; - } - // 1) The user may skip loading of this library: String skip = System.getProperty("onnxruntime.native." + library + ".skip"); if (Boolean.TRUE.toString().equalsIgnoreCase(skip)) { From 3e39b88ff4d5d39ad2d98c3cbffa8d7bb9ecb5f1 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Thu, 19 Dec 2024 15:54:02 -0800 Subject: [PATCH 54/64] Create temp directory for java android --- java/src/main/java/ai/onnxruntime/OnnxRuntime.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java index 0830ed5b7db39..742805bebc0fd 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java +++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java @@ -155,7 +155,7 @@ static synchronized void init() throws IOException { if (loaded) { return; } - tempDirectory = isAndroid() ? 
null : Files.createTempDirectory("onnxruntime-java"); + tempDirectory = Files.createTempDirectory("onnxruntime-java"); try { libraryDirPathProperty = System.getProperty(ONNXRUNTIME_NATIVE_PATH); // Extract and prepare the shared provider library but don't try to load it, @@ -181,9 +181,7 @@ static synchronized void init() throws IOException { version = initialiseVersion(); loaded = true; } finally { - if (tempDirectory != null) { - cleanUp(tempDirectory.toFile()); - } + cleanUp(tempDirectory.toFile()); } } From 5ed035fc2e22843803e0ef06c78d4978e18cb350 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Fri, 20 Dec 2024 15:29:21 -0800 Subject: [PATCH 55/64] consistent library loading logic for java android --- java/src/main/java/ai/onnxruntime/OnnxRuntime.java | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java index 742805bebc0fd..c4500a6a2178d 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java +++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java @@ -162,13 +162,8 @@ static synchronized void init() throws IOException { // the ONNX Runtime native library will load it extractProviderLibrary(ONNXRUNTIME_LIBRARY_SHARED_NAME); - if (isAndroid()) { - // On Android, we only need to load onnxruntime4j_jni with System.loadLibrary - System.loadLibrary(ONNXRUNTIME_JNI_LIBRARY_NAME); - } else { - load(ONNXRUNTIME_LIBRARY_NAME); - load(ONNXRUNTIME_JNI_LIBRARY_NAME); - } + load(ONNXRUNTIME_LIBRARY_NAME); + load(ONNXRUNTIME_JNI_LIBRARY_NAME); ortApiHandle = initialiseAPIBase(ORT_API_VERSION_14); if (ortApiHandle == 0L) { From 65624339b5aa6d9b9888b38ae77e0822fb996c3a Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sat, 21 Dec 2024 19:49:22 -0800 Subject: [PATCH 56/64] Add temporary logging --- .../main/java/ai/onnxruntime/OnnxRuntime.java | 3 +++ .../github/android/build_aar_package.py | 19 +++++++++++++++++++ 2 files changed, 22 
insertions(+) diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java index c4500a6a2178d..10ea82e02e0ab 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java +++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java @@ -155,6 +155,9 @@ static synchronized void init() throws IOException { if (loaded) { return; } + // TODO: Remove + logger.setLevel(Level.FINE); + tempDirectory = Files.createTempDirectory("onnxruntime-java"); try { libraryDirPathProperty = System.getProperty(ONNXRUNTIME_NATIVE_PATH); diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index 7294e0548e1b2..dc4f76eb25fb6 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -125,6 +125,11 @@ def _build_aar(args): subprocess.run(abi_build_command, env=temp_env, shell=False, check=True, cwd=REPO_DIR) + # TODO: Remove + abi_build_dir_files = os.listdir(abi_build_dir) + print("[REMOVE]: {abi_build_dir=} contents:") + print(abi_jnilibs_dir_files) + # create symbolic links for libonnxruntime.so and libonnxruntime4j_jni.so # to jnilibs/[abi] for later compiling the aar package abi_jnilibs_dir = os.path.join(jnilibs_dir, abi) @@ -139,8 +144,15 @@ def _build_aar(args): # add double check with os.path.islink if os.path.exists(target_lib_name) or os.path.islink(target_lib_name): os.remove(target_lib_name) + print(f"[REMOVE]: Making sym link from {os.path.join(abi_build_dir, build_config, lib_name)} to " + f"{target_lib_name}") os.symlink(os.path.join(abi_build_dir, build_config, lib_name), target_lib_name) + # TODO: Remove + abi_jnilibs_dir_files = os.listdir(abi_jnilibs_dir) + print("[REMOVE]: {abi_jnilibs_dir=} contents:") + print(abi_jnilibs_dir_files) + # copy executables for each abi, in case we want to publish those as well # some of them might not exist, e.g., if we skip building the tests abi_exe_dir = 
os.path.join(exe_dir, abi) @@ -150,12 +162,19 @@ def _build_aar(args): for exe_name in execs_to_copy: src_exe_path = os.path.join(abi_build_dir, build_config, exe_name) if not os.path.exists(src_exe_path): + print(f"[REMOVE]: Source exe path does not exist: {src_exe_path}") continue os.makedirs(abi_exe_dir, exist_ok=True) dest_exe_path = os.path.join(abi_exe_dir, exe_name) + print(f"[REMOVE]: Copying {src_exe_path} to {dst_exe_path}") shutil.copyfile(src_exe_path, dest_exe_path) + # TODO: Remove + abi_exe_dir_files = os.listdir(abi_exe_dir) + print("[REMOVE]: {abi_exe_dir=} contents:") + print(abi_exe_dir_files) + # we only need to define the header files path once if not header_files_path: header_files_path = os.path.join(abi_build_dir, build_config, "android", "headers") From 0bbf3af4bd38961d91561e9dbd2c005c11929d80 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sat, 21 Dec 2024 20:43:37 -0800 Subject: [PATCH 57/64] fix typo --- tools/ci_build/github/android/build_aar_package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index dc4f76eb25fb6..0fa62a93c21ea 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -128,7 +128,7 @@ def _build_aar(args): # TODO: Remove abi_build_dir_files = os.listdir(abi_build_dir) print("[REMOVE]: {abi_build_dir=} contents:") - print(abi_jnilibs_dir_files) + print(abi_build_dir_files) # create symbolic links for libonnxruntime.so and libonnxruntime4j_jni.so # to jnilibs/[abi] for later compiling the aar package From c04176a5c5b3e5dee7a937f503a4083ba7daa48c Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sat, 21 Dec 2024 21:17:39 -0800 Subject: [PATCH 58/64] fix another type in temporary logging code for debugging android qnn java --- tools/ci_build/github/android/build_aar_package.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index 0fa62a93c21ea..df2ed73445217 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -167,7 +167,7 @@ def _build_aar(args): os.makedirs(abi_exe_dir, exist_ok=True) dest_exe_path = os.path.join(abi_exe_dir, exe_name) - print(f"[REMOVE]: Copying {src_exe_path} to {dst_exe_path}") + print(f"[REMOVE]: Copying {src_exe_path} to {dest_exe_path}") shutil.copyfile(src_exe_path, dest_exe_path) # TODO: Remove From cc14971813c957c77bcfbfb4916b45a49c893653 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sat, 21 Dec 2024 23:53:22 -0800 Subject: [PATCH 59/64] Android: Go back to not extracting shared libs from classpath resources. --- .../main/java/ai/onnxruntime/OnnxRuntime.java | 23 ++++++++++++++----- .../main/java/ai/onnxruntime/OrtSession.java | 6 +++-- .../github/android/build_aar_package.py | 19 --------------- 3 files changed, 21 insertions(+), 27 deletions(-) diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java index 10ea82e02e0ab..c28c79f1e723e 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java +++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java @@ -155,17 +155,16 @@ static synchronized void init() throws IOException { if (loaded) { return; } - // TODO: Remove - logger.setLevel(Level.FINE); - - tempDirectory = Files.createTempDirectory("onnxruntime-java"); + tempDirectory = isAndroid() ? 
null : Files.createTempDirectory("onnxruntime-java"); try { libraryDirPathProperty = System.getProperty(ONNXRUNTIME_NATIVE_PATH); // Extract and prepare the shared provider library but don't try to load it, // the ONNX Runtime native library will load it extractProviderLibrary(ONNXRUNTIME_LIBRARY_SHARED_NAME); - load(ONNXRUNTIME_LIBRARY_NAME); + if (!isAndroid()) { + load(ONNXRUNTIME_LIBRARY_NAME); + } load(ONNXRUNTIME_JNI_LIBRARY_NAME); ortApiHandle = initialiseAPIBase(ORT_API_VERSION_14); @@ -179,7 +178,9 @@ static synchronized void init() throws IOException { version = initialiseVersion(); loaded = true; } finally { - cleanUp(tempDirectory.toFile()); + if (tempDirectory != null) { + cleanUp(tempDirectory.toFile()); + } } } @@ -275,6 +276,10 @@ static boolean extractQNN() { * @return True if the library is ready for loading by ORT's native code, false otherwise. */ static synchronized boolean extractProviderLibrary(String libraryName) { + // Android does not need to extract provider libraries. + if (isAndroid()) { + return false; + } // Check if we've already extracted or check this provider, and it's ready if (extractedSharedProviders.contains(libraryName)) { return true; @@ -321,6 +326,12 @@ static boolean isAndroid() { * @throws IOException If the file failed to read or write. */ private static void load(String library) throws IOException { + // On Android, we simply use System.loadLibrary + if (isAndroid()) { + System.loadLibrary(library); + return; + } + // 1) The user may skip loading of this library: String skip = System.getProperty("onnxruntime.native." 
+ library + ".skip"); if (Boolean.TRUE.toString().equalsIgnoreCase(skip)) { diff --git a/java/src/main/java/ai/onnxruntime/OrtSession.java b/java/src/main/java/ai/onnxruntime/OrtSession.java index 700008e66bb36..c41d06bb1b2bf 100644 --- a/java/src/main/java/ai/onnxruntime/OrtSession.java +++ b/java/src/main/java/ai/onnxruntime/OrtSession.java @@ -1319,8 +1319,10 @@ public void addXnnpack(Map providerOptions) throws OrtException * @throws OrtException If there was an error in native code. */ public void addQnn(Map providerOptions) throws OrtException { - if (OnnxRuntime.extractQNN()) { - String qnnProviderName = "QNN"; + String qnnProviderName = "QNN"; + if (OnnxRuntime.isAndroid()) { + addExecutionProvider(qnnProviderName, providerOptions); + } else if (OnnxRuntime.extractQNN()) { addExecutionProvider(qnnProviderName, providerOptions); } else { throw new OrtException( diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index df2ed73445217..7294e0548e1b2 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -125,11 +125,6 @@ def _build_aar(args): subprocess.run(abi_build_command, env=temp_env, shell=False, check=True, cwd=REPO_DIR) - # TODO: Remove - abi_build_dir_files = os.listdir(abi_build_dir) - print("[REMOVE]: {abi_build_dir=} contents:") - print(abi_build_dir_files) - # create symbolic links for libonnxruntime.so and libonnxruntime4j_jni.so # to jnilibs/[abi] for later compiling the aar package abi_jnilibs_dir = os.path.join(jnilibs_dir, abi) @@ -144,15 +139,8 @@ def _build_aar(args): # add double check with os.path.islink if os.path.exists(target_lib_name) or os.path.islink(target_lib_name): os.remove(target_lib_name) - print(f"[REMOVE]: Making sym link from {os.path.join(abi_build_dir, build_config, lib_name)} to " - f"{target_lib_name}") os.symlink(os.path.join(abi_build_dir, build_config, lib_name), target_lib_name) - # 
TODO: Remove - abi_jnilibs_dir_files = os.listdir(abi_jnilibs_dir) - print("[REMOVE]: {abi_jnilibs_dir=} contents:") - print(abi_jnilibs_dir_files) - # copy executables for each abi, in case we want to publish those as well # some of them might not exist, e.g., if we skip building the tests abi_exe_dir = os.path.join(exe_dir, abi) @@ -162,19 +150,12 @@ def _build_aar(args): for exe_name in execs_to_copy: src_exe_path = os.path.join(abi_build_dir, build_config, exe_name) if not os.path.exists(src_exe_path): - print(f"[REMOVE]: Source exe path does not exist: {src_exe_path}") continue os.makedirs(abi_exe_dir, exist_ok=True) dest_exe_path = os.path.join(abi_exe_dir, exe_name) - print(f"[REMOVE]: Copying {src_exe_path} to {dest_exe_path}") shutil.copyfile(src_exe_path, dest_exe_path) - # TODO: Remove - abi_exe_dir_files = os.listdir(abi_exe_dir) - print("[REMOVE]: {abi_exe_dir=} contents:") - print(abi_exe_dir_files) - # we only need to define the header files path once if not header_files_path: header_files_path = os.path.join(abi_build_dir, build_config, "android", "headers") From 4c6a985432779fcb209bb610eab3520e3266601a Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sun, 22 Dec 2024 01:59:51 -0800 Subject: [PATCH 60/64] Try linking shared.so with qnn.so for android --- cmake/onnxruntime_providers_cpu.cmake | 10 +++++++++- cmake/onnxruntime_providers_qnn.cmake | 18 ++++-------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake index 79e430763da93..b4b4e7659d05a 100644 --- a/cmake/onnxruntime_providers_cpu.cmake +++ b/cmake/onnxruntime_providers_cpu.cmake @@ -238,7 +238,15 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst") elseif(UNIX) if(NOT 
${CMAKE_SYSTEM_NAME} MATCHES "AIX") - set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections") + target_link_options(onnxruntime_providers_shared PRIVATE + "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds" + "LINKER:--gc-sections") + if(CMAKE_SYSTEM_NAME STREQUAL "Android") + # Need to link libonnxruntime_providers_.so with libonnxruntime_providers_shared.so on Android + # because dlopen with RTLD_GLOBAL does not bring all symbols to global scope. + # See: https://github.com/android/ndk/issues/201 + set(ONNXRUNTIME_PROVIDERS_SHARED onnxruntime_providers_shared) + endif() endif() elseif(WIN32) set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def") diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index 505c357d516d0..4f97c968ebf56 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -29,20 +29,10 @@ # Set linker flags for function(s) exported by EP DLL if(UNIX) - if(CMAKE_SYSTEM_NAME STREQUAL "Android") - target_link_options(onnxruntime_providers_qnn PRIVATE - "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds" - "LINKER:--gc-sections" - "LINKER:-rpath=\$ORIGIN" - "LINKER:-z,undefs" - ) - else() - target_link_options(onnxruntime_providers_qnn PRIVATE - "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds" - "LINKER:--gc-sections" - "LINKER:-rpath=\$ORIGIN" - ) - endif() + target_link_options(onnxruntime_providers_qnn PRIVATE + "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds" + "LINKER:--gc-sections" + "LINKER:-rpath=\$ORIGIN") elseif(WIN32) set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS 
"-DEF:${ONNXRUNTIME_ROOT}/core/providers/qnn/symbols.def") else() From fc003463828dd631b22d2efa4a7fe4b7b56da0c3 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sun, 22 Dec 2024 16:30:42 -0800 Subject: [PATCH 61/64] Use --undefined=Provider_GetHost --- cmake/onnxruntime_providers_cpu.cmake | 6 ------ cmake/onnxruntime_providers_qnn.cmake | 18 ++++++++++++++---- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake index b4b4e7659d05a..f84b26d7d7a47 100644 --- a/cmake/onnxruntime_providers_cpu.cmake +++ b/cmake/onnxruntime_providers_cpu.cmake @@ -241,12 +241,6 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD target_link_options(onnxruntime_providers_shared PRIVATE "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds" "LINKER:--gc-sections") - if(CMAKE_SYSTEM_NAME STREQUAL "Android") - # Need to link libonnxruntime_providers_.so with libonnxruntime_providers_shared.so on Android - # because dlopen with RTLD_GLOBAL does not bring all symbols to global scope. 
- # See: https://github.com/android/ndk/issues/201 - set(ONNXRUNTIME_PROVIDERS_SHARED onnxruntime_providers_shared) - endif() endif() elseif(WIN32) set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def") diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index 4f97c968ebf56..063b704524106 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -29,10 +29,20 @@ # Set linker flags for function(s) exported by EP DLL if(UNIX) - target_link_options(onnxruntime_providers_qnn PRIVATE - "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds" - "LINKER:--gc-sections" - "LINKER:-rpath=\$ORIGIN") + if(CMAKE_SYSTEM_NAME STREQUAL "Android") + target_link_options(onnxruntime_providers_qnn PRIVATE + "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds" + "LINKER:--gc-sections" + "LINKER:-rpath=\$ORIGIN" + "LINKER:--undefined=Provider_GetHost" + ) + else() + target_link_options(onnxruntime_providers_qnn PRIVATE + "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds" + "LINKER:--gc-sections" + "LINKER:-rpath=\$ORIGIN" + ) + endif() elseif(WIN32) set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/qnn/symbols.def") else() From b707c46acfa93c40ffd85e062888f31add5fbc8a Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Sun, 22 Dec 2024 17:13:02 -0800 Subject: [PATCH 62/64] prepend _ to linker arg --- cmake/onnxruntime_providers_qnn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index 063b704524106..aa81e70a4bb00 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -34,7 +34,7 @@ 
"LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds" "LINKER:--gc-sections" "LINKER:-rpath=\$ORIGIN" - "LINKER:--undefined=Provider_GetHost" + "LINKER:-u,_Provider_GetHost" ) else() target_link_options(onnxruntime_providers_qnn PRIVATE From 7f505586bb7110206e836f6da1140a92c1624fdc Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Mon, 23 Dec 2024 11:02:32 -0800 Subject: [PATCH 63/64] Add linker option -z global to libonnxruntime_providers_shared.so on Android --- cmake/onnxruntime_providers_cpu.cmake | 13 ++++++++++--- cmake/onnxruntime_providers_qnn.cmake | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake index f84b26d7d7a47..177969b1d0c6d 100644 --- a/cmake/onnxruntime_providers_cpu.cmake +++ b/cmake/onnxruntime_providers_cpu.cmake @@ -238,9 +238,16 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst") elseif(UNIX) if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") - target_link_options(onnxruntime_providers_shared PRIVATE - "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds" - "LINKER:--gc-sections") + if(CMAKE_SYSTEM_NAME STREQUAL "Android") + target_link_options(onnxruntime_providers_shared PRIVATE + "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds" + "LINKER:--gc-sections" + "LINKER:-z,global") + else() + target_link_options(onnxruntime_providers_shared PRIVATE + "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds" + "LINKER:--gc-sections") + endif() endif() elseif(WIN32) set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def") diff --git 
a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index aa81e70a4bb00..505c357d516d0 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -34,7 +34,7 @@ "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds" "LINKER:--gc-sections" "LINKER:-rpath=\$ORIGIN" - "LINKER:-u,_Provider_GetHost" + "LINKER:-z,undefs" ) else() target_link_options(onnxruntime_providers_qnn PRIVATE From 17c3bdee734a4dad1a6e30a3abb3d67df3d43554 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Mon, 23 Dec 2024 14:56:05 -0800 Subject: [PATCH 64/64] Try to use libc++_shared.so for android qnn build --- .../ci_build/github/android/default_qnn_aar_build_settings.json | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/ci_build/github/android/default_qnn_aar_build_settings.json b/tools/ci_build/github/android/default_qnn_aar_build_settings.json index 599c108f830e7..66ae7d25153f0 100644 --- a/tools/ci_build/github/android/default_qnn_aar_build_settings.json +++ b/tools/ci_build/github/android/default_qnn_aar_build_settings.json @@ -10,6 +10,7 @@ "--parallel", "--cmake_generator=Ninja", "--build_java", + "--android_cpp_shared", "--build_shared_lib", "--use_qnn", "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF",