diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index d72b61a0859b2..5adacdc393da8 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -209,7 +209,6 @@ set(onnxruntime_INTERNAL_LIBRARIES
   ${PROVIDERS_COREML}
   ${PROVIDERS_DML}
   ${PROVIDERS_NNAPI}
-  ${PROVIDERS_QNN}
   ${PROVIDERS_SNPE}
   ${PROVIDERS_RKNPU}
   ${PROVIDERS_VSINPU}
diff --git a/cmake/onnxruntime_java.cmake b/cmake/onnxruntime_java.cmake
index b15b9632e9e24..d96f90d02b1de 100644
--- a/cmake/onnxruntime_java.cmake
+++ b/cmake/onnxruntime_java.cmake
@@ -148,7 +148,7 @@ if (WIN32)
   if(NOT onnxruntime_ENABLE_STATIC_ANALYSIS)
     add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_FILE_NAME:onnxruntime>)
     add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime4j_jni> ${JAVA_PACKAGE_JNI_DIR}/$<TARGET_FILE_NAME:onnxruntime4j_jni>)
-    if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT)
+    if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT OR onnxruntime_USE_QNN)
       add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime_providers_shared> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_FILE_NAME:onnxruntime_providers_shared>)
     endif()
     if (onnxruntime_USE_CUDA)
@@ -163,11 +163,14 @@ if (WIN32)
     if (onnxruntime_USE_TENSORRT)
       add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime_providers_tensorrt> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_FILE_NAME:onnxruntime_providers_tensorrt>)
     endif()
+    if (onnxruntime_USE_QNN)
+      add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime_providers_qnn> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_FILE_NAME:onnxruntime_providers_qnn>)
+    endif()
   endif()
 else()
   add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_FILE_NAME:onnxruntime>)
   add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime4j_jni> ${JAVA_PACKAGE_JNI_DIR}/$<TARGET_FILE_NAME:onnxruntime4j_jni>)
-  if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT)
+  if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT OR onnxruntime_USE_QNN)
     add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime_providers_shared> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_FILE_NAME:onnxruntime_providers_shared>)
   endif()
   if (onnxruntime_USE_CUDA)
@@ -182,6 +185,9 @@ else()
   if (onnxruntime_USE_TENSORRT)
     add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime_providers_tensorrt> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_FILE_NAME:onnxruntime_providers_tensorrt>)
   endif()
+  if (onnxruntime_USE_QNN)
+    add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime_providers_qnn> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_FILE_NAME:onnxruntime_providers_qnn>)
+  endif()
 endif()
 
 # run the build process (this copies the results back into CMAKE_CURRENT_BINARY_DIR)
@@ -208,6 +214,16 @@ if (ANDROID)
   add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime> ${ANDROID_PACKAGE_ABI_DIR}/$<TARGET_FILE_NAME:onnxruntime>)
   add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime4j_jni> ${ANDROID_PACKAGE_ABI_DIR}/$<TARGET_FILE_NAME:onnxruntime4j_jni>)
 
+  # If using QNN, also copy libonnxruntime_providers_shared.so and libonnxruntime_providers_qnn.so
+  if (onnxruntime_USE_QNN)
+    add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                       $<TARGET_FILE:onnxruntime_providers_shared>
+                       ${ANDROID_PACKAGE_ABI_DIR}/$<TARGET_FILE_NAME:onnxruntime_providers_shared>)
+    add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                       $<TARGET_FILE:onnxruntime_providers_qnn>
+                       ${ANDROID_PACKAGE_ABI_DIR}/$<TARGET_FILE_NAME:onnxruntime_providers_qnn>)
+  endif()
+
   # Generate the Android AAR package
   add_custom_command(TARGET onnxruntime4j_jni POST_BUILD
diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index 582491de9503d..67fa48b28278d 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -74,9 +74,6 @@ endif()
 if(onnxruntime_USE_JSEP)
   set(PROVIDERS_JS onnxruntime_providers_js)
 endif()
-if(onnxruntime_USE_QNN)
-  set(PROVIDERS_QNN onnxruntime_providers_qnn)
-endif()
 if(onnxruntime_USE_RKNPU)
   set(PROVIDERS_RKNPU onnxruntime_providers_rknpu)
 endif()
diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake
index 91a2b13002ec9..177969b1d0c6d 100644
--- a/cmake/onnxruntime_providers_cpu.cmake
+++ b/cmake/onnxruntime_providers_cpu.cmake
@@ -215,7 +215,6 @@ set_target_properties(onnxruntime_providers PROPERTIES FOLDER "ONNXRuntime")
 
 if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD
     AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin|iOS|visionOS"
-    AND NOT CMAKE_SYSTEM_NAME STREQUAL "Android"
     AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
   file(GLOB onnxruntime_providers_shared_cc_srcs CONFIGURE_DEPENDS
     "${ONNXRUNTIME_ROOT}/core/providers/shared/*.h"
@@ -239,7 +238,16 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD
     set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst")
   elseif(UNIX)
     if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
-      set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections")
+      if(CMAKE_SYSTEM_NAME STREQUAL "Android")
+        target_link_options(onnxruntime_providers_shared PRIVATE
+                            "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds"
+                            "LINKER:--gc-sections"
+                            "LINKER:-z,global")
+      else()
+        target_link_options(onnxruntime_providers_shared PRIVATE
+                            "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds"
+                            "LINKER:--gc-sections")
+      endif()
     endif()
   elseif(WIN32)
     set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def")
diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake
index b68d84c23bb32..505c357d516d0 100644
--- a/cmake/onnxruntime_providers_qnn.cmake
+++ b/cmake/onnxruntime_providers_qnn.cmake
@@ -3,41 +3,65 @@
 
   add_compile_definitions(USE_QNN=1)
 
-  # These are shared utils,
-  # TODO, move to a separate lib when used by EPs other than QNN, NNAPI and CoreML
-  file(GLOB onnxruntime_providers_shared_utils_cc_srcs CONFIGURE_DEPENDS
-    "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.h"
-    "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.cc"
-  )
-
   file(GLOB_RECURSE
-    onnxruntime_providers_qnn_ep_cc_srcs CONFIGURE_DEPENDS
+    onnxruntime_providers_qnn_cc_srcs CONFIGURE_DEPENDS
     "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.h"
     "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.cc"
-  )
-
-  file(GLOB_RECURSE
-    onnxruntime_providers_qnn_builder_cc_srcs CONFIGURE_DEPENDS
     "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/*.h"
     "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/*.cc"
-  )
-
-  set(onnxruntime_providers_qnn_cc_srcs
-    ${onnxruntime_providers_shared_utils_cc_srcs}
-    ${onnxruntime_providers_qnn_ep_cc_srcs}
-    ${onnxruntime_providers_qnn_builder_cc_srcs}
"${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/qnn_node_group/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/qnn_node_group/*.cc" + "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/opbuilder/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/opbuilder/*.cc" + "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" ) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_qnn_cc_srcs}) - onnxruntime_add_static_library(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_cc_srcs}) - onnxruntime_add_include_to_target(onnxruntime_providers_qnn onnxruntime_common onnxruntime_framework onnx onnx_proto protobuf::libprotobuf-lite flatbuffers::flatbuffers Boost::mp11) - target_link_libraries(onnxruntime_providers_qnn) - add_dependencies(onnxruntime_providers_qnn onnx ${onnxruntime_EXTERNAL_DEPENDENCIES}) - set_target_properties(onnxruntime_providers_qnn PROPERTIES CXX_STANDARD_REQUIRED ON) - set_target_properties(onnxruntime_providers_qnn PROPERTIES FOLDER "ONNXRuntime") - target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_QNN_HOME}/include/QNN ${onnxruntime_QNN_HOME}/include) - set_target_properties(onnxruntime_providers_qnn PROPERTIES LINKER_LANGUAGE CXX) - # ignore the warning unknown-pragmas on "pragma region" - if(NOT MSVC) + onnxruntime_add_shared_library_module(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_cc_srcs}) + onnxruntime_add_include_to_target(onnxruntime_providers_qnn ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} onnx onnxruntime_common Boost::mp11 safeint_interface) + target_link_libraries(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED} ${ABSEIL_LIBS} ${CMAKE_DL_LIBS}) + add_dependencies(onnxruntime_providers_qnn onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) + target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT} + ${CMAKE_CURRENT_BINARY_DIR} + ${onnxruntime_QNN_HOME}/include/QNN + ${onnxruntime_QNN_HOME}/include) + + # Set linker flags for function(s) exported by EP DLL + if(UNIX) + if(CMAKE_SYSTEM_NAME STREQUAL "Android") + target_link_options(onnxruntime_providers_qnn PRIVATE + "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds" + "LINKER:--gc-sections" + "LINKER:-rpath=\$ORIGIN" + "LINKER:-z,undefs" + ) + else() + target_link_options(onnxruntime_providers_qnn PRIVATE + "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds" + "LINKER:--gc-sections" + "LINKER:-rpath=\$ORIGIN" + ) + endif() + elseif(WIN32) + set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/qnn/symbols.def") + else() + message(FATAL_ERROR "onnxruntime_providers_qnn unknown platform, need to specify shared library exports for it") + endif() + + # Set compile options + if(MSVC) + target_compile_options(onnxruntime_providers_qnn PUBLIC /wd4099 /wd4005) + else() + # ignore the warning unknown-pragmas on "pragma region" target_compile_options(onnxruntime_providers_qnn PRIVATE "-Wno-unknown-pragmas") endif() + + set_target_properties(onnxruntime_providers_qnn PROPERTIES LINKER_LANGUAGE CXX) + set_target_properties(onnxruntime_providers_qnn PROPERTIES CXX_STANDARD_REQUIRED ON) + set_target_properties(onnxruntime_providers_qnn PROPERTIES FOLDER "ONNXRuntime") + + install(TARGETS onnxruntime_providers_qnn + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION 
+          LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+          RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
index 5a87252b08573..776c866efbb12 100644
--- a/cmake/onnxruntime_python.cmake
+++ b/cmake/onnxruntime_python.cmake
@@ -180,7 +180,6 @@ target_link_libraries(onnxruntime_pybind11_state PRIVATE
   ${PROVIDERS_XNNPACK}
   ${PROVIDERS_WEBGPU}
   ${PROVIDERS_AZURE}
-  ${PROVIDERS_QNN}
   onnxruntime_optimizer
   onnxruntime_providers
   onnxruntime_util
@@ -997,6 +996,13 @@ if (onnxruntime_USE_COREML)
 endif()
 
 if (onnxruntime_USE_QNN)
+  add_custom_command(
+    TARGET onnxruntime_pybind11_state POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy
+    $<TARGET_FILE:onnxruntime_providers_shared>
+    $<TARGET_FILE:onnxruntime_providers_qnn>
+    $<TARGET_FILE_DIR:onnxruntime_pybind11_state>/onnxruntime/capi/
+  )
   add_custom_command(
     TARGET onnxruntime_pybind11_state POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E copy
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 9e3ab4d41f416..17ee0e9c4e15a 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -631,12 +631,11 @@ set(ONNXRUNTIME_TEST_LIBS
     onnxruntime_session
     ${ONNXRUNTIME_INTEROP_TEST_LIBS}
     ${onnxruntime_libs}
-    # CUDA, ROCM, TENSORRT, MIGRAPHX, DNNL, and OpenVINO are dynamically loaded at runtime
+    # CUDA, ROCM, TENSORRT, MIGRAPHX, DNNL, OpenVINO, and QNN are dynamically loaded at runtime
    ${PROVIDERS_NNAPI}
    ${PROVIDERS_VSINPU}
    ${PROVIDERS_JS}
    ${PROVIDERS_WEBGPU}
-   ${PROVIDERS_QNN}
    ${PROVIDERS_SNPE}
    ${PROVIDERS_RKNPU}
    ${PROVIDERS_DML}
@@ -707,8 +706,7 @@ endif()
 if(onnxruntime_USE_QNN AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_REDUCED_OPS_BUILD)
   list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/qnn/*)
   list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_qnn)
-  list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_qnn)
-  list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_qnn)
+  list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_qnn onnxruntime_providers_shared)
 endif()
 
 if(onnxruntime_USE_SNPE)
@@ -1286,7 +1284,8 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
   file(GLOB onnxruntime_qnn_ctx_gen_src CONFIGURE_DEPENDS
     ${onnxruntime_qnn_ctx_gen_src_patterns}
-  )
+    )
+
   onnxruntime_add_executable(onnxruntime_qnn_ctx_gen ${onnxruntime_qnn_ctx_gen_src})
   target_include_directories(onnxruntime_qnn_ctx_gen PRIVATE ${onnx_test_runner_src_dir} ${ONNXRUNTIME_ROOT}
       ${eigen_INCLUDE_DIRS} ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir}
diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java
index b80debdde47c4..c28c79f1e723e 100644
--- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java
+++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java
@@ -76,6 +76,9 @@ final class OnnxRuntime {
   /** The short name of the ONNX runtime TensorRT provider library */
   static final String ONNXRUNTIME_LIBRARY_TENSORRT_NAME = "onnxruntime_providers_tensorrt";
 
+  /** The short name of the ONNX runtime QNN provider library */
+  static final String ONNXRUNTIME_LIBRARY_QNN_NAME = "onnxruntime_providers_qnn";
+
   /** The OS & CPU architecture string */
   private static final String OS_ARCH_STR = initOsArch();
 
@@ -159,8 +162,11 @@ static synchronized void init() throws IOException {
     // the ONNX Runtime native library will load it
     extractProviderLibrary(ONNXRUNTIME_LIBRARY_SHARED_NAME);
 
-    load(ONNXRUNTIME_LIBRARY_NAME);
+    if (!isAndroid()) {
+      load(ONNXRUNTIME_LIBRARY_NAME);
+    }
     load(ONNXRUNTIME_JNI_LIBRARY_NAME);
+
     ortApiHandle = initialiseAPIBase(ORT_API_VERSION_14);
     if (ortApiHandle == 0L) {
       throw new IllegalStateException(
@@ -252,6 +258,16 @@ static boolean extractTensorRT() {
     return extractProviderLibrary(ONNXRUNTIME_LIBRARY_TENSORRT_NAME);
   }
 
+  /**
+   * Extracts the QNN provider library from the classpath resources if present, or checks to see if
+   * the QNN provider library is in the directory specified by {@link #ONNXRUNTIME_NATIVE_PATH}.
+   *
+   * @return True if the QNN provider library is ready for loading, false otherwise.
+   */
+  static boolean extractQNN() {
+    return extractProviderLibrary(ONNXRUNTIME_LIBRARY_QNN_NAME);
+  }
+
   /**
    * Extracts a shared provider library from the classpath resources if present, or checks to see if
    * that library is in the directory specified by {@link #ONNXRUNTIME_NATIVE_PATH}.
@@ -260,7 +276,7 @@ static boolean extractTensorRT() {
    * @return True if the library is ready for loading by ORT's native code, false otherwise.
    */
   static synchronized boolean extractProviderLibrary(String libraryName) {
-    // Android does not need to extract library and it has no shared provider library
+    // Android does not need to extract provider libraries.
     if (isAndroid()) {
       return false;
     }
@@ -312,7 +328,7 @@ static boolean isAndroid() {
   private static void load(String library) throws IOException {
     // On Android, we simply use System.loadLibrary
     if (isAndroid()) {
-      System.loadLibrary("onnxruntime4j_jni");
+      System.loadLibrary(library);
       return;
     }
diff --git a/java/src/main/java/ai/onnxruntime/OrtSession.java b/java/src/main/java/ai/onnxruntime/OrtSession.java
index 32dc9d9f84aaa..c41d06bb1b2bf 100644
--- a/java/src/main/java/ai/onnxruntime/OrtSession.java
+++ b/java/src/main/java/ai/onnxruntime/OrtSession.java
@@ -1320,7 +1320,14 @@ public void addXnnpack(Map<String, String> providerOptions) throws OrtException
      */
     public void addQnn(Map<String, String> providerOptions) throws OrtException {
       String qnnProviderName = "QNN";
-      addExecutionProvider(qnnProviderName, providerOptions);
+      if (OnnxRuntime.isAndroid()) {
+        addExecutionProvider(qnnProviderName, providerOptions);
+      } else if (OnnxRuntime.extractQNN()) {
+        addExecutionProvider(qnnProviderName, providerOptions);
+      } else {
+        throw new OrtException(
+            OrtException.OrtErrorCode.ORT_EP_FAIL, "Failed to find QNN shared provider");
+      }
     }
 
     /**
diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
index 79674fd706151..36bccfe7effce 100644
--- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
+++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
@@ -2,13 +2,15 @@
 // Licensed under the MIT License.
 
 #include "core/providers/qnn/builder/onnx_ctx_model_helper.h"
-#include "core/graph/constants.h"
-#include "core/providers/qnn/builder/qnn_model.h"
 
 #include
 #include
 #include
 
+#include "core/providers/qnn/ort_api.h"
+#include "core/providers/qnn/builder/qnn_utils.h"
+#include "core/providers/qnn/builder/qnn_model.h"
+
 namespace onnxruntime {
 namespace qnn {
 
@@ -17,7 +19,7 @@ bool GraphHasEpContextNode(const onnxruntime::GraphViewer& graph_viewer) {
   // and the source is QNN or QNNExecutionProvider.
for (const auto& node : graph_viewer.Nodes()) { if (EPCONTEXT_OP == node.OpType()) { - NodeAttrHelper node_helper(node); + utils::NodeAttrHelper node_helper(node); std::string cache_source = node_helper.Get(SOURCE, ""); std::transform(cache_source.begin(), @@ -51,9 +53,9 @@ Status GetMainContextNode(const std::vectorOpType(), "Should only filter in the EPContext node."); - NodeAttrHelper node_helper(*ep_context_node); + const Node& ep_context_node = *graph_viewer.Nodes().begin(); + ORT_RETURN_IF_NOT(EPCONTEXT_OP == ep_context_node.OpType(), "Should only filter in the EPContext node."); + utils::NodeAttrHelper node_helper(ep_context_node); int64_t is_main_context = node_helper.Get(MAIN_CONTEXT, static_cast(0)); if (1 == is_main_context) { main_context_pos.push_back(static_cast(i)); @@ -73,12 +75,12 @@ Status CreateNodeArgs(const std::vector& names, std::string name = names[i]; ORT_RETURN_IF(tensor_info_table.find(name) == tensor_info_table.end(), "Tensor name: ", name, " not found in tensor_info_table"); const OnnxTensorInfo& tensor_info = tensor_info_table.at(name); - TypeProto tensor_type; - tensor_type.mutable_tensor_type()->set_elem_type(tensor_info.data_type_); + std::unique_ptr tensor_type = TypeProto::Create(); + tensor_type->mutable_tensor_type()->set_elem_type(tensor_info.data_type_); for (size_t j = 0; j < tensor_info.shape_.size(); ++j) { - tensor_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(tensor_info.shape_[j]); + tensor_type->mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(tensor_info.shape_[j]); } - auto& input_arg = graph.GetOrCreateNodeArg(name, &tensor_type); + auto& input_arg = graph.GetOrCreateNodeArg(name, tensor_type.get()); node_args.push_back(&input_arg); } return Status::OK(); @@ -90,7 +92,7 @@ Status GetEpContextFromMainNode(const onnxruntime::Node& main_context_node, QnnModelLookupTable& qnn_models, int64_t max_spill_fill_size) { ORT_RETURN_IF_NOT(EPCONTEXT_OP == main_context_node.OpType(), "Should only filter in the EPContext node."); - NodeAttrHelper node_helper(main_context_node); + utils::NodeAttrHelper node_helper(main_context_node); bool is_embed_mode = node_helper.Get(EMBED_MODE, true); if (is_embed_mode) { const std::string& context_binary = node_helper.Get(EP_CACHE_CONTEXT, ""); @@ -161,8 +163,8 @@ Status TryGetMaxSpillFillSize(const std::vector(0)); if (max_size > max_spill_fill_size) { max_spill_fill_size = max_size; diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h index 92c5391b40f09..48d68aec55c38 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h @@ -7,11 +7,7 @@ #include #include "qnn_def.h" -#include "core/common/logging/logging.h" -#include "core/graph/graph_viewer.h" -#include "core/providers/shared/utils/utils.h" -#include "core/graph/model.h" -#include "core/framework/execution_provider.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/op_builder.h b/onnxruntime/core/providers/qnn/builder/op_builder.h index 05398c3f22ea2..0846275496ebf 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/op_builder.h @@ -3,9 +3,7 @@ #pragma once -#include "core/graph/graph_viewer.h" -#include "core/framework/node_unit.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/ort_api.h" 
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
index 6ef17b40d274b..3e337f679056f 100644
--- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
+++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
@@ -5,8 +5,6 @@
 #include
 #include
 
-#include
-
 #include "op_builder_factory.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc
index c685fa065e2ba..76762c82ece2e 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc
@@ -1,14 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/tensorprotoutils.h"
+#include "base_op_builder.h"
+#include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 
-#include "base_op_builder.h"
-
 namespace onnxruntime {
 namespace qnn {
 
@@ -57,7 +54,7 @@ Status ArgMaxMinOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_mode
   param_tensor_names.push_back(axis_param.GetParamTensorName());
   qnn_model_wrapper.AddParamWrapper(std::move(axis_param));
 
-  NodeAttrHelper node_helper(node_unit);
+  utils::NodeAttrHelper node_helper(node_unit);
   auto select_last_index = node_helper.Get("select_last_index", static_cast<int64_t>(0));
   if (select_last_index != 0) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN ArgMax/ArgMin only support select_last_index=0.");
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc
index ed70111087e19..7db4f8c0c609d 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc
@@ -4,13 +4,6 @@
 #include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 
-#include
-
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/tensorprotoutils.h"
-#include "core/providers/cpu/tensor/transpose.h"
-#include "core/common/safeint.h"
-
 namespace onnxruntime {
 namespace qnn {
 
@@ -271,37 +264,199 @@ Status BaseOpBuilder::SetOutputQParamEqualToInputIfNearlyEqual(QnnModelWrapper&
   return Status::OK();
 }
 
-Status BaseOpBuilder::TransposeInitializer(const QnnModelWrapper& qnn_model_wrapper,
-                                           const onnx::TensorProto& initializer,
-                                           const std::vector<size_t>& perm,
-                                           std::vector<uint8_t>& transposed_data) const {
-  const DataTypeImpl* tensor_dtype = DataTypeImpl::TensorTypeFromONNXEnum(initializer.data_type())->GetElementType();
-  const auto tensor_shape_dims = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer);
-  TensorShape tensor_shape{tensor_shape_dims};
-  AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();
-  Tensor in_tensor = Tensor(tensor_dtype, tensor_shape, cpu_allocator);
-
-  auto rank = perm.size();
-  std::vector<int64_t> new_tensor_shape_dims;
-  std::vector<size_t> permutations;
-  new_tensor_shape_dims.reserve(rank);
-  permutations.reserve(rank);
-  for (int64_t p : perm) {
-    permutations.push_back(p);
-    new_tensor_shape_dims.push_back(tensor_shape_dims[p]);
+// Internal
 function to transpose input from either (N,C,H,W,D) or (C,N,H,W,D) to (H,W,D,C,N).
+static Status TransposeDataRank5(const TensorShape& input_shape,
+                                 gsl::span<const size_t> perm,
+                                 size_t elem_byte_size,
+                                 gsl::span<const uint8_t> input_buffer,
+                                 gsl::span<uint8_t> output_buffer) {
+  const size_t rank = input_shape.NumDimensions();
+  ORT_RETURN_IF_NOT(rank == 5 && perm.size() == 5, "Invalid input tensor rank");
+  std::vector<size_t> perm_inverse(perm.size());
+  ORT_RETURN_IF_ERROR(qnn::utils::InvertPerm(perm, perm_inverse));
+
+  std::vector<int64_t> output_shape_dims(rank);
+  ORT_RETURN_IF_ERROR((qnn::utils::PermuteShape(input_shape.GetDims(), perm, output_shape_dims)));
+  const TensorShape output_shape = TensorShape::FromExistingBuffer(output_shape_dims);
+
+  std::array<size_t, 5> src_strides = {};
+  for (size_t i = 0; i < rank; ++i) {
+    int64_t stride = (i < rank - 1) ? input_shape.SizeFromDimension(i + 1) : 1;
+    ORT_RETURN_IF_NOT(stride > 0, "Expected positive shape dims when computing strides.");
+    src_strides[i] = static_cast<size_t>(stride);
   }
-  TensorShape new_tensor_shape(new_tensor_shape_dims);
-  Tensor out_tensor = Tensor(tensor_dtype, new_tensor_shape, cpu_allocator);
-  ORT_RETURN_IF_ERROR(onnxruntime::utils::TensorProtoToTensor(
-      Env::Default(), qnn_model_wrapper.GetGraphViewer().ModelPath(), initializer, in_tensor));
-  ORT_RETURN_IF_ERROR(Transpose::DoTranspose(permutations, in_tensor, out_tensor));
-  onnx::TensorProto new_tensor_proto = onnxruntime::utils::TensorToTensorProto(out_tensor, "test");
-  ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(new_tensor_proto, transposed_data));
+
+  std::array<size_t, 5> dst_strides = {};
+  for (size_t i = 0; i < rank; ++i) {
+    int64_t stride = (i < rank - 1) ? output_shape.SizeFromDimension(i + 1) : 1;
+    ORT_RETURN_IF_NOT(stride > 0, "Expected positive shape dims when computing strides.");
+    dst_strides[i] = static_cast<size_t>(stride);
+  }
+
+  for (int64_t d0 = 0; d0 < input_shape[0]; ++d0) {
+    for (int64_t d1 = 0; d1 < input_shape[1]; ++d1) {
+      for (int64_t d2 = 0; d2 < input_shape[2]; ++d2) {
+        for (int64_t d3 = 0; d3 < input_shape[3]; ++d3) {
+          for (int64_t d4 = 0; d4 < input_shape[4]; ++d4) {
+            const size_t src_elem_index = ((d0 * src_strides[0]) +
+                                           (d1 * src_strides[1]) +
+                                           (d2 * src_strides[2]) +
+                                           (d3 * src_strides[3]) +
+                                           (d4 * src_strides[4]));
+            const size_t dst_elem_index = ((d0 * dst_strides[perm_inverse[0]]) +
+                                           (d1 * dst_strides[perm_inverse[1]]) +
+                                           (d2 * dst_strides[perm_inverse[2]]) +
+                                           (d3 * dst_strides[perm_inverse[3]]) +
+                                           (d4 * dst_strides[perm_inverse[4]]));
+
+            const size_t src_byte_index = src_elem_index * elem_byte_size;
+            const size_t dst_byte_index = dst_elem_index * elem_byte_size;
+            assert(src_byte_index < input_buffer.size());
+            assert(dst_byte_index < output_buffer.size());
+
+            std::memcpy(&output_buffer[dst_byte_index], &input_buffer[src_byte_index], elem_byte_size);
+          }
+        }
+      }
+    }
+  }
 
   return Status::OK();
 }
 
+Status BaseOpBuilder::TwoDimensionTranspose(const QnnModelWrapper& qnn_model_wrapper,
+                                            std::vector<uint32_t>& data_shape,
+                                            const onnx::TensorProto& initializer,
+                                            std::vector<uint8_t>& transposed_data) const {
+  ORT_RETURN_IF_NOT(data_shape.size() == 2, "Expected shape of rank 2");
+
+  std::array<size_t, 2> perm = {1, 0};
+  std::vector<uint32_t> output_shape(data_shape.size());
+  ORT_RETURN_IF_ERROR((qnn::utils::PermuteShape(data_shape, perm, output_shape)));
+
+  auto onnx_type = static_cast<ONNX_NAMESPACE::TensorProto_DataType>(initializer.data_type());
+  const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type);
+  ORT_RETURN_IF_NOT(elem_byte_size != 0, "Can't get element byte size from given ONNX type");
+
+  std::vector<uint8_t> input_buffer;
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer));
+  transposed_data.resize(input_buffer.size());
+
+  for (size_t row = 0; row < data_shape[0]; row++) {
+    for (size_t col = 0; col < data_shape[1]; col++) {
+      const size_t src_elem_index = (row * data_shape[1] + col);
+      const size_t dst_elem_index = (col * output_shape[1] + row);
+      const size_t src_byte_index = src_elem_index * elem_byte_size;
+      const size_t dst_byte_index = dst_elem_index * elem_byte_size;
+      assert(src_byte_index < input_buffer.size());
+      assert(dst_byte_index < transposed_data.size());
+
+      std::memcpy(&transposed_data[dst_byte_index], &input_buffer[src_byte_index], elem_byte_size);
+    }
+  }
+
+  data_shape = std::move(output_shape);  // Update parameter with final transposed shape
+  return Status::OK();
+}
+
+Status BaseOpBuilder::TransposeFromNchwToHwcn(const QnnModelWrapper& qnn_model_wrapper,
+                                              const onnx::TensorProto& initializer,
+                                              std::vector<uint8_t>& transposed_data,
+                                              bool is_3d) const {
+  auto onnx_type = static_cast<ONNX_NAMESPACE::TensorProto_DataType>(initializer.data_type());
+  const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type);
+  ORT_RETURN_IF_NOT(elem_byte_size != 0, "Can't get element byte size from given ONNX type");
+
+  std::vector<int64_t> input_shape = qnn::utils::GetInitializerShape(initializer);
+  ORT_RETURN_IF_NOT((is_3d && input_shape.size() == 5) || (!is_3d && input_shape.size() == 4),
+                    "Unexpected rank: only support rank 4 or rank 5 input shapes");
+
+  if (!is_3d) {
+    input_shape.push_back(1);  // Make it 3D by making shape (N,C,H,W,1)
+  }
+
+  std::vector<uint8_t> input_buffer;
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer));
+  transposed_data.resize(input_buffer.size());
+
+  return TransposeDataRank5(TensorShape::FromExistingBuffer(input_shape),
+                            nchw2hwcn_perm_3d,
+                            elem_byte_size,
+                            input_buffer,
+                            transposed_data);
+}
+
+Status BaseOpBuilder::TransposeFromNchwToHwcn(std::vector<int64_t> input_shape_dims,
+                                              size_t elem_byte_size,
+                                              gsl::span<const uint8_t> input_buffer,
+                                              gsl::span<uint8_t> output_buffer,
+                                              bool is_3d) const {
+  const size_t rank = input_shape_dims.size();
+  ORT_RETURN_IF_NOT((is_3d && rank == 5) || (!is_3d && rank == 4), "Invalid input tensor rank");
+  ORT_RETURN_IF_NOT(input_buffer.size() == output_buffer.size(),
+                    "Expected input_buffer.size() == output_buffer.size()");
+  ORT_RETURN_IF_NOT(elem_byte_size != 0, "Can't get element byte size from given ONNX type");
+
+  if (!is_3d) {
+    input_shape_dims.push_back(1);  // Make it 3D by making shape (N,C,H,W,1)
+  }
+
+  return TransposeDataRank5(TensorShape::FromExistingBuffer(input_shape_dims),
+                            nchw2hwcn_perm_3d,
+                            elem_byte_size,
+                            input_buffer,
+                            output_buffer);
+}
+
+Status BaseOpBuilder::TransposeFromCnhwToHwcn(const QnnModelWrapper& qnn_model_wrapper,
+                                              const onnx::TensorProto& initializer,
+                                              std::vector<uint8_t>& transposed_data,
+                                              bool is_3d) const {
+  auto onnx_type = static_cast<ONNX_NAMESPACE::TensorProto_DataType>(initializer.data_type());
+  const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type);
+  ORT_RETURN_IF_NOT(elem_byte_size != 0, "Can't get element byte size from given ONNX type");
+
+  std::vector<int64_t> input_shape = qnn::utils::GetInitializerShape(initializer);
+  ORT_RETURN_IF_NOT((is_3d && input_shape.size() == 5) || (!is_3d && input_shape.size() == 4),
+                    "Unexpected rank: only support rank 4 or rank 5 input shapes");
+
+  if (!is_3d) {
+    input_shape.push_back(1);  // Make it 3D by making shape (C,N,H,W,1)
+  }
+
+  std::vector<uint8_t> input_buffer;
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer));
+  transposed_data.resize(input_buffer.size());
+
+  return TransposeDataRank5(TensorShape::FromExistingBuffer(input_shape),
+                            cnhw2hwcn_perm_3d,
+                            elem_byte_size,
+                            input_buffer,
+                            transposed_data);
+}
+
+Status BaseOpBuilder::TransposeFromCnhwToHwcn(std::vector<int64_t> input_shape_dims,
+                                              size_t elem_byte_size,
+                                              gsl::span<const uint8_t> input_buffer,
+                                              gsl::span<uint8_t> output_buffer,
+                                              bool is_3d) const {
+  const size_t rank = input_shape_dims.size();
+  ORT_RETURN_IF_NOT((is_3d && rank == 5) || (!is_3d && rank == 4), "Invalid input tensor rank");
+  ORT_RETURN_IF_NOT(input_buffer.size() == output_buffer.size(),
+                    "Expected input_buffer.size() == output_buffer.size()");
+  ORT_RETURN_IF_NOT(elem_byte_size != 0, "Can't get element byte size from given ONNX type");
+
+  if (!is_3d) {
+    input_shape_dims.push_back(1);  // Make it 3D by making shape (C,N,H,W,1)
+  }
+
+  return TransposeDataRank5(TensorShape::FromExistingBuffer(input_shape_dims),
+                            cnhw2hwcn_perm_3d,
+                            elem_byte_size,
+                            input_buffer,
+                            output_buffer);
+}
+
 Status BaseOpBuilder::ProcessAxisAttribute(const QnnModelWrapper& qnn_model_wrapper,
                                            const NodeUnit& node_unit,
                                            Qnn_Scalar_t& axis_qnn_scalar,
@@ -311,7 +466,7 @@ Status BaseOpBuilder::ProcessAxisAttribute(const QnnModelWrapper& qnn_model_wrap
   ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[0].node_arg, input_shape), "Cannot get shape");
   auto rank = static_cast<int32_t>(input_shape.size());
 
-  NodeAttrHelper node_helper(node_unit);
+  utils::NodeAttrHelper node_helper(node_unit);
   int32_t onnx_axis = node_helper.Get("axis", default_axis_value);
   if (onnx_axis < 0) {
     onnx_axis += rank;
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
index 055c0f6ccf2fa..0c400c4a5abc9 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
@@ -3,11 +3,11 @@
 
 #pragma once
 
-#include "core/providers/shared/utils/utils.h"
+#include "core/providers/qnn/ort_api.h"
+#include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder.h"
 #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h"
-#include "core/framework/allocator.h"
 
 #include "QnnOpDef.h"
 
@@ -215,7 +215,8 @@ class BaseOpBuilder : public IOpBuilder {
   }
 
   // NCHW shape to channel last
-  Status NchwShapeToNhwc(const std::vector<uint32_t>& nchw_shape, std::vector<uint32_t>& nhwc_shape) const {
+  template <typename T>
+  Status NchwShapeToNhwc(gsl::span<const T> nchw_shape, gsl::span<T> nhwc_shape) const {
     ORT_RETURN_IF_NOT(nchw_shape.size() == 4, "shape should have 4 dimension NCHW.");
     nhwc_shape[0] = nchw_shape[0];
     nhwc_shape[1] = nchw_shape[2];
@@ -226,7 +227,8 @@ class BaseOpBuilder : public IOpBuilder {
   }
 
   // NCHW shape to HWCN shape, required for Conv weight
-  Status NchwShapeToHwcn(const std::vector<uint32_t>& nchw_shape, std::vector<uint32_t>& hwcn_shape) const {
+  template <typename T>
+  Status NchwShapeToHwcn(gsl::span<const T> nchw_shape, gsl::span<T> hwcn_shape) const {
     if (nchw_shape.size() == 4) {
       hwcn_shape[0] = nchw_shape[2];
       hwcn_shape[1] = nchw_shape[3];
@@ -246,7 +248,8 @@ class BaseOpBuilder : public IOpBuilder {
   }
 
   // CNHW shape to HWCN shape, required for Conv weight
-  Status CnhwShapeToHwcn(const std::vector<uint32_t>& cnhw_shape, std::vector<uint32_t>& hwcn_shape) const {
+  template <typename T>
+  Status CnhwShapeToHwcn(gsl::span<const T> cnhw_shape, gsl::span<T> hwcn_shape) const {
     if (cnhw_shape.size() == 4) {
       hwcn_shape[0] = cnhw_shape[2];
       hwcn_shape[1] = cnhw_shape[3];
@@ -264,37 +267,32 @@ class
 BaseOpBuilder : public IOpBuilder {
     return Status::OK();
   }
 
-  Status TransposeInitializer(const QnnModelWrapper& qnn_model_wrapper,
-                              const onnx::TensorProto& initializer,
-                              const std::vector<size_t>& perm,
-                              std::vector<uint8_t>& transposed_data) const;
 
   Status TransposeFromNchwToHwcn(const QnnModelWrapper& qnn_model_wrapper,
                                  const onnx::TensorProto& initializer,
                                  std::vector<uint8_t>& transposed_data,
-                                 bool is_3d = false) const {
-    auto& perm = is_3d ? nchw2hwcn_perm_3d : nchw2hwcn_perm;
-    return TransposeInitializer(qnn_model_wrapper, initializer, perm, transposed_data);
-  }
+                                 bool is_3d = false) const;
+
+  Status TransposeFromNchwToHwcn(std::vector<int64_t> input_shape_dims,
+                                 size_t elem_byte_size,
+                                 gsl::span<const uint8_t> input_buffer,
+                                 gsl::span<uint8_t> output_buffer,
+                                 bool is_3d = false) const;
 
   Status TransposeFromCnhwToHwcn(const QnnModelWrapper& qnn_model_wrapper,
                                  const onnx::TensorProto& initializer,
                                  std::vector<uint8_t>& transposed_data,
-                                 bool is_3d = false) const {
-    auto& perm = is_3d ? cnhw2hwcn_perm_3d : cnhw2hwcn_perm;
-    return TransposeInitializer(qnn_model_wrapper, initializer, perm, transposed_data);
-  }
+                                 bool is_3d = false) const;
+
+  Status TransposeFromCnhwToHwcn(std::vector<int64_t> input_shape_dims,
+                                 size_t elem_byte_size,
+                                 gsl::span<const uint8_t> input_buffer,
+                                 gsl::span<uint8_t> output_buffer,
+                                 bool is_3d = false) const;
 
   Status TwoDimensionTranspose(const QnnModelWrapper& qnn_model_wrapper,
                                std::vector<uint32_t>& data_shape,
                                const onnx::TensorProto& initializer,
-                               std::vector<uint8_t>& transposed_data) const {
-    auto tmp = data_shape[0];
-    data_shape[0] = data_shape[1];
-    data_shape[1] = tmp;
-    std::vector<size_t> two_dim_trans_perm{1, 0};
-    return TransposeInitializer(qnn_model_wrapper, initializer, two_dim_trans_perm, transposed_data);
-  }
+                               std::vector<uint8_t>& transposed_data) const;
 
   // Onnx Pads is [x1_begin, x2_begin, x1_end, x2_end], QNN requires [x1_begin, x1_end, x2_begin, x2_end]
   void ReArranagePads(std::vector<uint32_t>& pads) const {
@@ -352,7 +350,7 @@ struct OnnxAttrInfo {
 };
 
 template <typename ValType>
-inline ValType GetOnnxAttr(const NodeAttrHelper& node_helper, const OnnxAttrInfo<ValType>& attr_info) {
+inline ValType GetOnnxAttr(const qnn::utils::NodeAttrHelper& node_helper, const OnnxAttrInfo<ValType>& attr_info) {
   return node_helper.Get(attr_info.name, attr_info.default_val);
 }
 
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc
index 07abcf1c7bf84..5be54729a6a86 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc
@@ -5,16 +5,11 @@
 #include
 #include
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/float16.h"
-#include "core/framework/tensorprotoutils.h"
+#include "base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 
-#include "base_op_builder.h"
-
 namespace onnxruntime {
 namespace qnn {
 class BatchNormOpBuilder : public BaseOpBuilder {
@@ -546,7 +541,7 @@ Status BatchNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
 
   std::vector<double> scale_double_tensor;
   std::vector<double> bias_double_tensor;
-  NodeAttrHelper node_helper(node_unit);
+  utils::NodeAttrHelper node_helper(node_unit);
   const float epsilon = node_helper.Get("epsilon", 1e-05f);  // Default is 1e-05 according to ONNX spec.
   double scale_rmax = std::numeric_limits<double>::min();
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc
index d3bdee02437e4..5b3dd223a9f3f 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc
@@ -4,12 +4,11 @@
 #include
 #include
 
+#include "base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 
-#include "base_op_builder.h"
-
 namespace onnxruntime {
 namespace qnn {
 
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
index e5dc4d04afefd..a86354edf7e8e 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
@@ -4,14 +4,11 @@
 #include
 #include
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
+#include "base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 
-#include "base_op_builder.h"
-
 namespace onnxruntime {
 namespace qnn {
 class ClipOpBuilder : public BaseOpBuilder {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
index 12887f0fb72d6..5f723e2e262be 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
@@ -1,16 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { @@ -97,7 +92,7 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, "QNN EP: Data type ", input_data_type->c_str(), " is not supported for Conv operator in CPU backend."); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); auto auto_pad = node_helper.Get("auto_pad", std::string("NOTSET")); ORT_RETURN_IF(auto_pad != "NOTSET" && auto_pad != "SAME_LOWER" && auto_pad != "SAME_UPPER", "QNN Conv operators do not support 'auto_pad' value: ", auto_pad.c_str()); @@ -211,9 +206,9 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper, // Change shape to HWCN, it could be initializer or normal input if (conv_type == OnnxConvType::kConv) { - ORT_RETURN_IF_ERROR(NchwShapeToHwcn(input_info.shape, actual_shape)); + ORT_RETURN_IF_ERROR(NchwShapeToHwcn(input_info.shape, actual_shape)); } else if (conv_type == OnnxConvType::kConvTranspose) { - ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(input_info.shape, actual_shape)); + ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(input_info.shape, actual_shape)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str()); } @@ -413,9 +408,9 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, // Create the final shape after the weights are transposed to HWCN. if (conv_type == OnnxConvType::kConv) { - ORT_RETURN_IF_ERROR(NchwShapeToHwcn(shape_2d, final_shape)); + ORT_RETURN_IF_ERROR(NchwShapeToHwcn(shape_2d, final_shape)); } else if (conv_type == OnnxConvType::kConvTranspose) { - ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(shape_2d, final_shape)); + ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(shape_2d, final_shape)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str()); } @@ -434,15 +429,11 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, return static_cast(dim); }); - const TensorShape tensor_shape = TensorShape::FromExistingBuffer(shape_2d_int64); // Does not own shape data. - const DataTypeImpl* tensor_dtype = DataTypeImpl::TensorTypeFromONNXEnum( - input_info.initializer_tensor->data_type()) - ->GetElementType(); - ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, unpacked_tensor)); - - Tensor tensor_2d(tensor_dtype, tensor_shape, unpacked_tensor.data(), OrtMemoryInfo{}); // Does not own data. - ONNX_NAMESPACE::TensorProto reshaped_initializer = onnxruntime::utils::TensorToTensorProto(tensor_2d, - reshape_output); + std::vector original_tensor_bytes; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, original_tensor_bytes)); + unpacked_tensor.resize(original_tensor_bytes.size()); + size_t elem_byte_size = qnn::utils::GetElementSizeByType( + static_cast(input_info.initializer_tensor->data_type())); // The reshape (unsqueeze) may require us to shift the quant parameter's axis. 
     if (input_info.quant_param.IsPerChannel()) {
@@ -453,9 +444,9 @@
     // Get transposed initializer bytes.
     //
     if (conv_type == OnnxConvType::kConv) {
-      ORT_RETURN_IF_ERROR(TransposeFromNchwToHwcn(qnn_model_wrapper, reshaped_initializer, unpacked_tensor));
+      ORT_RETURN_IF_ERROR(TransposeFromNchwToHwcn(shape_2d_int64, elem_byte_size, original_tensor_bytes, unpacked_tensor));
     } else if (conv_type == OnnxConvType::kConvTranspose) {
-      ORT_RETURN_IF_ERROR(TransposeFromCnhwToHwcn(qnn_model_wrapper, reshaped_initializer, unpacked_tensor));
+      ORT_RETURN_IF_ERROR(TransposeFromCnhwToHwcn(shape_2d_int64, elem_byte_size, original_tensor_bytes, unpacked_tensor));
     } else {
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str());
     }
@@ -539,7 +530,7 @@ Status ConvOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wra
   OnnxConvType conv_type = {};
   ORT_RETURN_IF_ERROR(GetOnnxConvType(node_unit.OpType(), conv_type));
 
-  NodeAttrHelper node_helper(node_unit);
+  utils::NodeAttrHelper node_helper(node_unit);
   std::vector<std::string> param_tensor_names;
 
   const auto& input_0 = node_unit.Inputs()[0];
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc
index 64f676aaa9875..1beab0ed8b735 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc
@@ -1,14 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
+#include "base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
-#include "core/common/safeint.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc
index 3737fcb54f4cf..a3ada04b7b017 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc
@@ -2,14 +2,10 @@
 // Licensed under the MIT License.
 
 #include
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
+#include "base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
-#include "core/common/safeint.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace qnn {
 
@@ -100,7 +96,7 @@ static Status GetInpu0AxisDimValue(const QnnModelWrapper& qnn_model_wrapper,
                     "Cannot get shape for ", node_unit.OpType(), " input[0] ", input0.node_arg.Name());
   int64_t rank = static_cast<int64_t>(input0_shape.size());
 
-  NodeAttrHelper node_helper(node_unit);
+  utils::NodeAttrHelper node_helper(node_unit);
   int64_t onnx_axis = node_helper.Get("axis", default_axis_value);
   if (onnx_axis < 0) {
     onnx_axis += rank;
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc
index eeee26c177281..8a36aa192313f 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc
@@ -1,14 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
+#include "base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
-#include "core/common/safeint.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace qnn {
 
@@ -36,7 +32,7 @@ class GemmOpBuilder : public BaseOpBuilder {
 };
 
 Status GemmOpBuilder::ExplictOpCheck(const NodeUnit& node_unit) const {
-  NodeAttrHelper node_helper(node_unit);
+  utils::NodeAttrHelper node_helper(node_unit);
   auto alpha = node_helper.Get("alpha", (float)1.0);
   if (alpha != 1.0) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN FullyConnected Op only support alpha=1.0.");
@@ -79,7 +75,7 @@ Status GemmOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
   // for Input A, B, C: 1 -- need transpose, 0 -- not needed
   std::vector<int64_t> input_trans_flag(3, 0);
 
-  NodeAttrHelper node_helper(node_unit);
+  utils::NodeAttrHelper node_helper(node_unit);
   input_trans_flag.at(0) = node_helper.Get("transA", (int64_t)0);
   auto transB = node_helper.Get("transB", (int64_t)0);
   // QNN input_1 [m, n] vs Onnx [n, m]
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc
index 4b8d079c0062a..80d96f513df63 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc
@@ -1,16 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { @@ -74,7 +68,7 @@ Status InstanceNormOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN InstanceNorm input 2 (bias) must have 1D shape [channel]."); } - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); const float epsilon = node_helper.Get("epsilon", 1e-05f); // Default is 1e-05 according to ONNX spec. if (epsilon <= 0.0f) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN InstanceNorm epsilon must be greater than 0.0"); @@ -160,7 +154,7 @@ Status InstanceNormOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_m std::vector&& input_names, const logging::Logger& logger, bool do_op_validation) const { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); std::vector param_tensor_names; const float epsilon = node_helper.Get("epsilon", 1e-05f); // Default is 1e-05 according to ONNX spec. diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc index d1a0e88686f39..7f830d68999a1 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc @@ -2,16 +2,10 @@ // Licensed under the MIT License. #include -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { @@ -114,7 +108,7 @@ Status LayerNormOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_mode std::vector&& input_names, const logging::Logger& logger, bool do_op_validation) const { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); std::vector param_tensor_names; const float epsilon = node_helper.Get("epsilon", 1e-05f); // Default is 1e-05 according to ONNX spec. diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc index 2f66069b6609e..bc2b7a01c0779 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc @@ -2,11 +2,9 @@ // Licensed under the MIT License. 
#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" #include "QnnOpDef.h" // From QNN SDK: contains QNN constants (e.g., op names, param values). @@ -75,7 +73,7 @@ Status LRNOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, ORT_RETURN_IF(output_shape != input_shape, "QNN EP: LRN operator's output must have the same shape as the input."); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); // 'size' attribute must be odd and > 0. const int64_t onnx_size = GetOnnxAttr(node_helper, onnx_size_attr); @@ -98,7 +96,7 @@ Status LRNOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrap const logging::Logger& logger, bool do_op_validation) const { std::vector param_tensor_names; - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); const int64_t onnx_size = GetOnnxAttr(node_helper, onnx_size_attr); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc index 5fc6d42a8a179..7d40a3489e550 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc @@ -1,15 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - -#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { @@ -204,7 +198,7 @@ Status PadOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrap std::vector input_shape; ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[0].node_arg, input_shape), "Cannot get shape of input 0."); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); std::string mode = node_helper.Get("mode", "constant"); Qnn_Scalar_t mode_qnn_scalar = QNN_SCALAR_INIT; mode_qnn_scalar.dataType = QNN_DATATYPE_UINT_32; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc index ef1990ad8e69a..4923371ce9ee2 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc @@ -1,16 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { @@ -39,7 +33,7 @@ class PoolOpBuilder : public BaseOpBuilder { QnnQuantParamsWrapper& quant_param) const override ORT_MUST_USE_RESULT; private: - Status SetCommonPoolParams(const NodeAttrHelper& node_helper, std::vector& filter_size, + Status SetCommonPoolParams(const utils::NodeAttrHelper& node_helper, std::vector& filter_size, std::vector& pad_amount, std::vector& stride, int32_t& ceil_mode, std::vector&& input_shape, @@ -79,7 +73,7 @@ Status PoolOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, return Status::OK(); } - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); auto dilation_values = node_helper.Get("dilations", std::vector{1, 1}); if (dilation_values != std::vector{1, 1}) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN does not support Dilation attribute"); @@ -94,7 +88,7 @@ Status PoolOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, return Status::OK(); } -Status PoolOpBuilder::SetCommonPoolParams(const NodeAttrHelper& node_helper, +Status PoolOpBuilder::SetCommonPoolParams(const utils::NodeAttrHelper& node_helper, std::vector& filter_size, std::vector& pad_amount, std::vector& strides, int32_t& ceil_mode, @@ -155,7 +149,7 @@ Status PoolOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wra std::vector&& input_names, const logging::Logger& logger, bool do_op_validation) const { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); // Get the NCHW from input data, use HW for the pool filter size and pool stride const auto& inputs = node_unit.Inputs(); std::vector input_shape; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc index 77bc58bd6f833..2ad6d3741d0ba 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc @@ -6,12 +6,8 @@ #include #include -#include "core/common/safeint.h" -#include "onnx/defs/data_type_utils.h" -#include "core/providers/common.h" -#include "core/framework/endian_utils.h" -#include "core/providers/shared/utils/utils.h" #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/qnn_utils.h" @@ -71,7 +67,7 @@ class ReduceOpBuilder : public BaseOpBuilder { using AxesQnnIntType = uint32_t; Status GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, - InlinedHashSet& axes_set) const; + std::set& axes_set) const; // Maps an operator type to the opset in which "axes" became an input instead of an attribute. 
static const std::array opset_with_axes_as_input; @@ -87,7 +83,7 @@ const std::array ReduceOpBuilder::opset_with_axes_as_ }; Status ReduceOpBuilder::GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, - InlinedHashSet& axes_set) const { + std::set& axes_set) const { ReduceOpType reduce_op_type = GetReduceOpType(node_unit.OpType()); if (reduce_op_type == ReduceOpType::REDUCE_OP_TYPE_UNKNOWN) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unknown reduce operator ", node_unit.OpType()); @@ -102,7 +98,7 @@ Status ReduceOpBuilder::GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const Nod const int opset_axes_as_input = ReduceOpBuilder::opset_with_axes_as_input[reduce_op_type]; const int opset = node_unit.SinceVersion(); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); // Extract the axes values from either the attribute or initializer input (depending on opset). if (opset < opset_axes_as_input) { // Axes is in ONNX node attribute. @@ -146,10 +142,7 @@ Status ReduceOpBuilder::GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const Nod auto src_span = gsl::make_span(axes_bytes.data(), axes_bytes.size()); auto dst_span = gsl::make_span(reduce_axes.data(), reduce_axes.size()); - // Copy initializer bytes (stored in little-endian order) to vector of int64_t. - // ReadLittleEndian returns a status error if the source and destination spans do not have - // matching byte sizes. - ORT_RETURN_IF_ERROR(onnxruntime::utils::ReadLittleEndian(src_span, dst_span)); + std::memcpy(dst_span.data(), src_span.data(), src_span.size_bytes()); } } @@ -212,13 +205,13 @@ Status ReduceOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, const Status ReduceOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector&& input_names, const logging::Logger& logger, bool do_op_validation) const { - NodeAttrHelper node_attr_helper(node_unit); + utils::NodeAttrHelper node_attr_helper(node_unit); std::vector param_tensor_names; // // Handle axes param. // - InlinedHashSet axes_set; + std::set axes_set; ORT_RETURN_IF_ERROR(GetAxesSet(qnn_model_wrapper, node_unit, axes_set)); const size_t num_axes = axes_set.size(); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc index b6f414da950d8..7d12a6843d4a6 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc @@ -1,15 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { @@ -39,7 +35,7 @@ Status ReshapeOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, std::vector& input_names, bool do_op_validation) const { if (do_op_validation) { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); auto allowzero = node_helper.Get("allowzero", static_cast(0)); if (0 != allowzero) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN Reshape doesn't support dynamic shape!"); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc index c62fca88b6ec2..5298b20033115 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc @@ -5,17 +5,10 @@ #include #include -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" -#include "core/providers/qnn/builder/qnn_model_wrapper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - #include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/op_builder_factory.h" namespace onnxruntime { namespace qnn { @@ -124,7 +117,7 @@ Status ResizeOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, } const bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType()); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); // QNN doesn't support anti-aliasing (added in opset 18) if (node_unit.SinceVersion() >= 18) { @@ -260,7 +253,7 @@ Status ResizeOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w const logging::Logger& logger, bool do_op_validation) const { std::vector param_tensor_names; - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); const auto& input_0 = node_unit.Inputs()[0]; std::vector input_shape; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc index a6c4203ad92e4..307ab31a09651 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc @@ -1,16 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" -#include "core/util/qmath.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { @@ -143,7 +137,7 @@ Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper, const std::string& op_type = node_unit.OpType(); if (op_type == "GridSample") { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); std::string mode = node_helper.Get("mode", "linear"); ORT_RETURN_IF_NOT(utils::ArrayHasString(gridsample_supported_modes, mode), "GridSample does not support mode ", mode.c_str()); @@ -193,7 +187,7 @@ Status ProcessNodeAttribute(QnnModelWrapper& qnn_model_wrapper, const std::string& qnn_param_key, std::vector& param_tensor_names, const float default_value = 1.0f) { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); float attr_value = node_helper.Get(onnx_attr_key, default_value); Qnn_Scalar_t attr_qnn_scalar = QNN_SCALAR_INIT; attr_qnn_scalar.dataType = QNN_DATATYPE_FLOAT_32; @@ -209,7 +203,7 @@ Status ProcessNodeAttribute(QnnModelWrapper& qnn_model_wrapper, Status ProcessBlockSizeAttribute(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector& param_tensor_names) { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); uint32_t block_size = node_helper.Get("blocksize", static_cast(0)); std::vector block_size_shape{2}; std::vector block_size_data(2, block_size); @@ -224,7 +218,7 @@ Status ProcessBlockSizeAttribute(QnnModelWrapper& qnn_model_wrapper, Status ProcessModeAttribute(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector& param_tensor_names) { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); std::string mode = node_helper.Get("mode", "DCR"); Qnn_Scalar_t mode_qnn_scalar = QNN_SCALAR_INIT; mode_qnn_scalar.dataType = QNN_DATATYPE_UINT_32; @@ -247,7 +241,7 @@ Status ProcessModeAttribute(QnnModelWrapper& qnn_model_wrapper, Status ProcessAlphaAttributeAsInput(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, const std::string input_name) { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); QnnQuantParamsWrapper quantize_param; Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; union { @@ -260,15 +254,16 @@ Status ProcessAlphaAttributeAsInput(QnnModelWrapper& qnn_model_wrapper, // Check LeakyRelu input 0 to see if it's quantized tensor bool is_quantized_tensor = node_unit.Outputs()[0].quant_param.has_value(); if (is_quantized_tensor) { - float scale; - uint8_t zero_point; - int64_t num_of_elements = 1; - concurrency::ThreadPool* thread_pool = nullptr; - GetQuantizationParameter(&tensor_data.alpha, num_of_elements, scale, zero_point, thread_pool); - unpacked_data.resize(1); - ParQuantizeLinearStd(&tensor_data.alpha, unpacked_data.data(), num_of_elements, scale, zero_point, thread_pool); - quantize_param = QnnQuantParamsWrapper(scale, static_cast(zero_point)); qnn_data_type = QNN_DATATYPE_UFIXED_POINT_8; + std::array scales = {1.0f}; + std::array offsets = {0}; + std::array shape = {1}; + auto float_data = gsl::make_span(&tensor_data.alpha, 1); + 
ORT_RETURN_IF_ERROR(qnn::utils::GetDataQuantParams(float_data, shape, scales, offsets, qnn_data_type)); + + unpacked_data.resize(1); + ORT_RETURN_IF_ERROR(qnn::utils::QuantizeData(float_data, shape, scales, offsets, unpacked_data, qnn_data_type)); + quantize_param = QnnQuantParamsWrapper(scales[0], static_cast(offsets[0])); } else { const auto& inputs = node_unit.Inputs(); TensorInfo input_info = {}; @@ -293,7 +288,7 @@ Status ProcessAlphaAttributeAsInput(QnnModelWrapper& qnn_model_wrapper, Status ProcessGridSampleAttributes(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector& param_tensor_names) { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); int64_t align_corners = node_helper.Get("align_corners", static_cast(0)); Qnn_Scalar_t align_corners_qnn_scalar = QNN_SCALAR_INIT; align_corners_qnn_scalar.dataType = QNN_DATATYPE_BOOL_8; @@ -373,7 +368,7 @@ Status SimpleOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w param_tensor_names.push_back(axis_param.GetParamTensorName()); qnn_model_wrapper.AddParamWrapper(std::move(axis_param)); - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); int64_t norm_p_order = node_helper.Get("p", static_cast(2)); ORT_RETURN_IF(norm_p_order != 2, "QNN EP only supports LpNormalization with 'p' attribute equal to 2."); } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc index b033c8723ea86..3096967a5f166 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc @@ -1,17 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/framework/tensorprotoutils.h" - -#include "base_op_builder.h" - namespace onnxruntime { namespace qnn { @@ -63,7 +58,7 @@ void SliceOpBuilder::GetDataFromAttribute(const NodeUnit& node_unit, TensorShapeVector& raw_starts, TensorShapeVector& raw_ends, TensorShapeVector& raw_axes) const { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); auto starts = node_helper.Get("starts", std::vector{0}); raw_starts.assign(starts.begin(), starts.end()); auto ends = node_helper.Get("ends", std::vector{0}); @@ -86,26 +81,22 @@ static Status GetInitializerInputData(const NodeUnitIODef& input, const QnnModel ORT_RETURN_IF_NOT(initializer_proto->has_data_type(), "Expected initializer ", input_name.c_str(), " to have a proto data type."); - // Create empty Tensor. - const auto* dtype = DataTypeImpl::TensorTypeFromONNXEnum(initializer_proto->data_type())->GetElementType(); - TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(*initializer_proto); - Tensor tensor(dtype, shape, std::make_shared()); - - // Deserialize initializer into Tensor. 
- ORT_RETURN_IF_ERROR(onnxruntime::utils::TensorProtoToTensor( - onnxruntime::Env::Default(), qnn_model_wrapper.GetGraphViewer().ModelPath(), *initializer_proto, tensor)); + // Deserialize initializer into byte buffer + std::vector initializer_bytes; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*initializer_proto, initializer_bytes)); Status status; // Copy Tensor of int32_t or int64_t elems into output (int64_ts). - if (tensor.IsDataType()) { - gsl::span tensor_elems = tensor.DataAsSpan(); + auto onnx_type = static_cast(initializer_proto->data_type()); + if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + gsl::span tensor_elems = ReinterpretAsSpan(initializer_bytes); output.insert(output.end(), tensor_elems.begin(), tensor_elems.end()); - } else if (tensor.IsDataType()) { - gsl::span tensor_elems = tensor.DataAsSpan(); + } else if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_INT32) { + gsl::span tensor_elems = ReinterpretAsSpan(initializer_bytes); output.insert(output.end(), tensor_elems.begin(), tensor_elems.end()); } else { - status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Data type ", DataTypeImpl::ToString(dtype), + status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Data type ", onnx_type, " is not supported for Slice initializer input ", input.node_arg.Name().c_str()); } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc index b62534bacf426..e7d37937d527f 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc @@ -1,15 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" -#include "core/framework/tensorprotoutils.h" +#include "base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - -#include "base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc index ba5ad2cf03cef..de7277390f9bb 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc @@ -1,16 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - -#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" namespace onnxruntime { namespace qnn { @@ -98,7 +93,7 @@ Status SplitOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wr return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN doesn't support dynamic split"); } } else { - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); if (node_helper.HasAttr("split")) { auto split_lengths = node_helper.Get("split", std::vector{0}); ConvertSplitLengthsToSplitIndices(split_lengths, split_index); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc index 851ca84dce075..1d518c3ed5359 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc @@ -1,16 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/common.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/cpu/tensor/slice_helper.h" -#include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/common/safeint.h" - -#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc index d22c0811682d0..b2891022e73a5 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" -#include "core/framework/utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { namespace qnn { const int TOPK_MIN_INPUT = 2; @@ -48,7 +48,7 @@ Status TopKOpBuilder::ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const N if (!qnn_model_wrapper.IsInitializerInput(input_1)) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "The number of top elements to retrieve must be specified as constant input."); } - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); auto largest = node_helper.Get("largest", 1); auto sorted = node_helper.Get("sorted", 1); if (0 == sorted) { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc index a42d7312f0203..7bc43f04d5361 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc @@ -4,10 +4,10 @@ #include #include +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_utils.h" -#include "core/common/safeint.h" #include "base_op_builder.h" @@ -45,7 +45,7 @@ Status TransposeOpBuilder::ProcessPermAttribute(QnnModelWrapper& qnn_model_wrapp transpose_perm[i] = rank - 1 - i; } - NodeAttrHelper node_helper(node_unit); + utils::NodeAttrHelper node_helper(node_unit); transpose_perm = node_helper.Get("perm", transpose_perm); auto perm_size = static_cast(transpose_perm.size()); std::vector perm_shape{perm_size}; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index fa38fad8eeb59..f6614a00a30f5 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -17,16 +17,11 @@ #include "HTP/QnnHtpContext.h" #include "Saver/QnnSaver.h" #include -#include "core/framework/endian_utils.h" -#include "core/common/logging/capture.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/onnx_ctx_model_helper.h" #include "core/providers/qnn/builder/qnn_configs_helper.h" -#ifdef _WIN32 -#include -#include "core/platform/tracing.h" -#endif - // Flag to determine if Backend should do node validation for each opNode added #define DO_GRAPH_NODE_VALIDATIONS 1 @@ -246,12 +241,12 @@ void QnnLogging(const char* format, const auto data_type = ::onnxruntime::logging::DataType::SYSTEM; if (logger.OutputIsEnabled(severity, data_type)) { - ::onnxruntime::logging::Capture(logger, - severity, - ::onnxruntime::logging::Category::onnxruntime, - data_type, - ORT_WHERE) - .ProcessPrintf(format, argument_parameter); + auto log_capture = ::onnxruntime::logging::Capture::Create(logger, + severity, + ::onnxruntime::logging::Category::onnxruntime, + data_type, + ORT_WHERE); + log_capture->ProcessPrintf(format, argument_parameter); } } @@ -389,25 +384,25 @@ Status QnnBackendManager::CreateDevice() { // Set SoC Model. The *enum* Qnn_SocModel_t is deprecated and will not be updated in the future. Therefore, // must use the latest SDK documentation to get the SoC model of the latest HW. 
if (soc_model_ != QNN_SOC_MODEL_UNKNOWN) { - QnnHtpDevice_CustomConfig_t& custom_config = device_configs_builder.PushCustomConfig(); - custom_config.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; - custom_config.socModel = soc_model_; + gsl::not_null custom_config = device_configs_builder.PushCustomConfig(); + custom_config->option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + custom_config->socModel = soc_model_; - QnnDevice_Config_t& device_config = device_configs_builder.PushConfig(); - device_config.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - device_config.customConfig = &custom_config; + gsl::not_null device_config = device_configs_builder.PushConfig(); + device_config->option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + device_config->customConfig = custom_config; } // Set the minimum HTP architecture. The driver will use ops that are compatible with this minimum architecture. if (htp_arch_ != QNN_HTP_DEVICE_ARCH_NONE) { - QnnHtpDevice_CustomConfig_t& custom_config = device_configs_builder.PushCustomConfig(); - custom_config.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; - custom_config.arch.arch = htp_arch_; - custom_config.arch.deviceId = device_id_; - - QnnDevice_Config_t& device_config = device_configs_builder.PushConfig(); - device_config.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - device_config.customConfig = &custom_config; + gsl::not_null custom_config = device_configs_builder.PushCustomConfig(); + custom_config->option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + custom_config->arch.arch = htp_arch_; + custom_config->arch.deviceId = device_id_; + + gsl::not_null device_config = device_configs_builder.PushConfig(); + device_config->option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + device_config->customConfig = custom_config; } } @@ -1110,7 +1105,7 @@ Status QnnBackendManager::ExtractBackendProfilingInfo() { } bool tracelogging_provider_ep_enabled = false; - const Env& env = Env::Default(); + const Env& env = GetDefaultEnv(); auto& provider = env.GetTelemetryProvider(); auto level = provider.Level(); if (provider.IsEnabled()) { @@ -1336,6 +1331,8 @@ void QnnBackendManager::LogQnnProfileEventAsTraceLogging( const std::string& timingSource, const std::string& eventLevel, const char* eventIdentifier) { + // TODO: Re-enable when add a method to ORT Telemetry provider to log EP profiling data. +#if 0 TraceLoggingWrite( telemetry_provider_handle, "QNNProfilingEvent", @@ -1348,6 +1345,15 @@ void QnnBackendManager::LogQnnProfileEventAsTraceLogging( TraceLoggingString(timingSource.c_str(), "Timing Source"), TraceLoggingString(eventLevel.c_str(), "Event Level"), TraceLoggingString(eventIdentifier, "Event Identifier")); +#else + ORT_UNUSED_PARAMETER(timestamp); + ORT_UNUSED_PARAMETER(message); + ORT_UNUSED_PARAMETER(qnnScalarValue); + ORT_UNUSED_PARAMETER(unit); + ORT_UNUSED_PARAMETER(timingSource); + ORT_UNUSED_PARAMETER(eventLevel); + ORT_UNUSED_PARAMETER(eventIdentifier); +#endif } #endif @@ -1504,7 +1510,8 @@ void* QnnBackendManager::LoadLib(const char* file_name, int flags, std::string& auto file_path = std::filesystem::path(file_name); if (!file_path.is_absolute()) { // construct an absolute path from ORT runtime path + file_name and check whether it exists. - auto pathstring = Env::Default().GetRuntimePath() + ToPathString(file_name); + const Env& env = GetDefaultEnv(); + auto pathstring = env.GetRuntimePath() + ToPathString(file_name); auto absolute_path = pathstring.c_str(); if (std::filesystem::exists(std::filesystem::path(absolute_path))) { // load library from absolute path and search for dependencies there. 
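A note on the LoadLib() change above: the direct Env::Default() call is replaced with the GetDefaultEnv() bridge (presumably because Env::Default() is not directly linkable once the QNN EP is built as a standalone shared library), but the lookup order is unchanged. A minimal sketch of that lookup, with a hypothetical helper name standing in for the inline code:

#include <filesystem>

// Sketch only: mirrors the fallback in QnnBackendManager::LoadLib. A relative
// backend name is first resolved against the directory of the running
// onnxruntime binary so that QNN libraries shipped next to it are preferred
// over the system library search path.
std::filesystem::path ResolveAgainstRuntimeDir(const std::filesystem::path& runtime_dir,
                                               const std::filesystem::path& file_name) {
  if (file_name.is_absolute()) {
    return file_name;  // Absolute paths are loaded as-is.
  }
  const std::filesystem::path candidate = runtime_dir / file_name;
  // Fall back to the bare name (normal dlopen/LoadLibraryEx search) if absent.
  return std::filesystem::exists(candidate) ? candidate : file_name;
}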
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
index beabc9bd71b94..41a2e8777f61e 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
@@ -21,9 +21,8 @@
 #include "QnnLog.h"
 #include "QnnTypes.h"
 #include "System/QnnSystemInterface.h"
-#include "core/common/status.h"
-#include "core/common/logging/logging.h"
-#include "core/common/path_string.h"
+
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_def.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h b/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h
index 9dd9bbaa08d64..b581cd90537d9 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h
@@ -3,7 +3,8 @@
 #pragma once
 
-#include
+#include
+#include
 
 namespace onnxruntime {
 namespace qnn {
@@ -49,9 +50,9 @@ class QnnConfigsBuilder {
  *
  * \return A reference to a default CustomConfigType object.
  */
-  CustomConfigType& PushCustomConfig() {
-    custom_configs_.push_back(custom_config_init_);
-    return custom_configs_.back();
+  gsl::not_null<CustomConfigType*> PushCustomConfig() {
+    custom_configs_.push_back(std::make_unique<CustomConfigType>(custom_config_init_));
+    return custom_configs_.back().get();
   }
 
  /**
@@ -60,15 +61,15 @@ class QnnConfigsBuilder {
  *
  * \return A reference to a default BaseConfigType object.
  */
-  BaseConfigType& PushConfig() {
-    configs_.push_back(base_config_init_);
-    BaseConfigType& config = configs_.back();
+  gsl::not_null<BaseConfigType*> PushConfig() {
+    configs_.push_back(std::make_unique<BaseConfigType>(base_config_init_));
+    BaseConfigType* config = configs_.back().get();
 
     // Add pointer to this new config to the list of config pointers.
     if (IsNullTerminated()) {
-      config_ptrs_.back() = &config;  // Replace last nullptr entry.
+      config_ptrs_.back() = config;  // Replace last nullptr entry.
     } else {
-      config_ptrs_.push_back(&config);
+      config_ptrs_.push_back(config);
     }
 
     return config;
@@ -81,9 +82,14 @@ class QnnConfigsBuilder {
   BaseConfigType base_config_init_;
   CustomConfigType custom_config_init_;
-  InlinedVector<CustomConfigType> custom_configs_;
-  InlinedVector<BaseConfigType> configs_;
-  InlinedVector<BaseConfigType*> config_ptrs_;
+
+  // Store elements as unique_ptrs instead of by value because std::vector reallocation would change the
+  // location of elements in memory. BaseConfigType objects may contain pointers to CustomConfigType objects,
+  // so we need to make sure that pointers to these objects are stable in memory.
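+  //
+  // Illustration (sketch, not part of the change) of the dangling reference the
+  // indirection avoids:
+  //   BaseConfigType& cfg = configs_.back();  // (old code) reference into the vector
+  //   configs_.push_back(base_config_init_);  // a later push may reallocate the buffer...
+  //   cfg = base_config_init_;                // ...making this write a use-after-free.
+  // With std::unique_ptr elements the pointees are heap-allocated and never move, so
+  // every pointer previously returned by PushConfig()/PushCustomConfig() stays valid.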
+  std::vector<std::unique_ptr<CustomConfigType>> custom_configs_;
+  std::vector<std::unique_ptr<BaseConfigType>> configs_;
+
+  std::vector<BaseConfigType*> config_ptrs_;
 };
 
 }  // namespace qnn
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_def.h b/onnxruntime/core/providers/qnn/builder/qnn_def.h
index ffd2dc9b11010..705212ae52c77 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_def.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_def.h
@@ -9,8 +9,7 @@
 #include
 #include
 #include
-#include "core/graph/basic_types.h"
-#include "core/common/common.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc
index 4f73e4c532ed4..de8fa816efdb1 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc
@@ -6,12 +6,9 @@
 #include
 
 #include "QnnOpDef.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_node_group.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/utils.h"
-#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
-#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 
 namespace onnxruntime {
@@ -104,7 +101,7 @@ Status QnnModel::ComposeGraph(const GraphViewer& graph_viewer,
   // valid throughout the lifetime of the ModelBuilder
   std::vector<std::unique_ptr<NodeUnit>> node_unit_holder;
   std::unordered_map<const Node*, const NodeUnit*> node_unit_map;
-  std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(graph_viewer, logger);
+  std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(&graph_viewer, logger);
 
   // This name must be same with the EPContext node name
   const auto& graph_name = fused_node.Name();
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.h b/onnxruntime/core/providers/qnn/builder/qnn_model.h
index 2e0935391ca78..489acaacde4fe 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model.h
@@ -3,16 +3,13 @@
 #pragma once
 
+#include
 #include
-#include "core/common/status.h"
-#include "core/framework/node_unit.h"
-#include "core/graph/graph_viewer.h"
-#include
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_def.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/qnn_backend_manager.h"
-#include "core/session/onnxruntime_cxx_api.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
index 2c7f3c8b22ddd..79e9e1408a9ca 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
+#include "qnn_model_wrapper.h" + #include #include #include @@ -8,10 +10,7 @@ #include #include -#include "qnn_model_wrapper.h" -#include "core/common/safeint.h" -#include "core/framework/tensorprotoutils.h" -#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" namespace onnxruntime { @@ -445,7 +444,7 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& ORT_RETURN_IF(iter == graph_initializers.end(), "Unable to find initializer for scale(s): ", scale_name.c_str()); gsl::not_null scale_tensor_proto = iter->second; - TensorShape scale_shape = onnxruntime::utils::GetTensorShapeFromTensorProto(*scale_tensor_proto); + TensorShape scale_shape(qnn::utils::GetInitializerShape(*scale_tensor_proto)); // Check the number of scale values to determine if the tensor is per-channel. // This is consistent with CPU EP's Quant/Dequant logic. We can't use the presence of an axis because even a @@ -624,29 +623,13 @@ Status QnnModelWrapper::UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& // If this is an int4, we need to unpack it because QNN treats int4 as a full int8. if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT4) { - TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); - const size_t num_elems = shape.Size(); - std::vector packed_int4_bytes = std::move(unpacked_tensor); - unpacked_tensor = std::vector(num_elems); - - auto dst = gsl::make_span(reinterpret_cast(unpacked_tensor.data()), unpacked_tensor.size()); - auto src = gsl::make_span(reinterpret_cast(packed_int4_bytes.data()), packed_int4_bytes.size()); - ORT_RETURN_IF_NOT(Int4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN"); - - // NOTE: Masking off top 4 bits to workaround a QNN INT4 accuracy bug. - // Docs explicitly state that masking off top 4 bits should not be required. 
- for (size_t i = 0; i < dst.size(); i++) { - dst[i] &= 0x0F; // -3 (0b1111_1101) becomes 13 (0b0000_1101) - } + TensorShape shape(qnn::utils::GetInitializerShape(initializer)); + const size_t num_int4_elems = shape.Size(); + ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(num_int4_elems, unpacked_tensor)); } else if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_UINT4) { - TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer); - const size_t num_elems = shape.Size(); - std::vector packed_int4_bytes = std::move(unpacked_tensor); - unpacked_tensor = std::vector(num_elems); - - auto dst = gsl::make_span(reinterpret_cast(unpacked_tensor.data()), unpacked_tensor.size()); - auto src = gsl::make_span(reinterpret_cast(packed_int4_bytes.data()), packed_int4_bytes.size()); - ORT_RETURN_IF_NOT(UInt4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN"); + TensorShape shape(qnn::utils::GetInitializerShape(initializer)); + const size_t num_uint4_elems = shape.Size(); + ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8(num_uint4_elems, unpacked_tensor)); } return Status::OK(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h index f3e52050e79e0..8cd7360606d71 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h @@ -7,13 +7,10 @@ #include #include -#include "core/common/status.h" #include "QnnInterface.h" #include "qnn_def.h" -#include "core/common/logging/logging.h" -#include "core/framework/node_unit.h" -#include "core/graph/graph_viewer.h" -#include "core/providers/shared/utils/utils.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group.h index f9ef01411310f..276fbaae3b3c9 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group.h @@ -8,8 +8,7 @@ #include #include -#include "core/common/logging/logging.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc index 813bba8a5952b..567dd5c1d6567 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc @@ -6,9 +6,8 @@ #include #include #include -#include "core/graph/graph_utils.h" -#include "core/framework/node_unit.h" -#include "core/providers/shared/utils/utils.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group/utils.h" @@ -110,8 +109,8 @@ static bool CanClipBeRemoved(const QnnModelWrapper& qnn_model_wrapper, float clip_min = std::numeric_limits::lowest(); float clip_max = std::numeric_limits::max(); - if (!onnxruntime::GetClipMinMax(qnn_model_wrapper.GetGraphViewer(), clip_node_unit.GetNode(), - clip_min, clip_max, logger)) { + if (!qnn::utils::GetClipMinMax(qnn_model_wrapper.GetGraphViewer(), clip_node_unit.GetNode(), + clip_min, clip_max, logger)) { return false; 
} @@ -174,7 +173,7 @@ static bool CanActivationBeRemoved(const QnnModelWrapper& qnn_model_wrapper, static std::vector FindParentDQNodes(const GraphViewer& graph_viewer, const Node& node) { // Get all parent DQ nodes sorted by destination argument index. std::vector parents(node.InputDefs().size(), nullptr); - for (auto it = node.InputEdgesBegin(); it != node.InputEdgesEnd(); it++) { + for (auto it = node.InputEdgesBegin(); it != node.InputEdgesEnd(); ++it) { if (it->GetNode().OpType().compare(DEQUANTIZE_LINEAR) == 0) { parents[it->GetDstArgIndex()] = &(it->GetNode()); } @@ -318,7 +317,7 @@ static Status CreateOrValidateOnQnn(QnnModelWrapper& qnn_model_wrapper, std::optional axis; if (auto entry = dq_attrs.find("axis"); entry != dq_attrs.end()) { - axis = entry->second.i(); + axis = entry->second().i(); } // quantization scale and zp are always the input[1, 2] @@ -328,7 +327,8 @@ static Status CreateOrValidateOnQnn(QnnModelWrapper& qnn_model_wrapper, // Populate NodeUnit outputs and output edges std::vector outputs; - Node::EdgeSet output_edges; + std::vector> output_edges_holder; + std::vector output_edges; for (const Node* q_node : q_nodes) { const auto q_inputs = q_node->InputDefs(); const auto& q_attrs = q_node->GetAttributes(); @@ -336,7 +336,7 @@ static Status CreateOrValidateOnQnn(QnnModelWrapper& qnn_model_wrapper, std::optional axis; if (auto entry = q_attrs.find("axis"); entry != q_attrs.end()) { - axis = entry->second.i(); + axis = entry->second().i(); } // quantization scale and zp are always the input[1, 2] @@ -347,22 +347,25 @@ static Status CreateOrValidateOnQnn(QnnModelWrapper& qnn_model_wrapper, auto q_cur_edge = q_node->OutputEdgesBegin(); auto q_end_edge = q_node->OutputEdgesEnd(); for (; q_cur_edge != q_end_edge; ++q_cur_edge) { - output_edges.insert(Node::EdgeEnd{q_cur_edge->GetNode(), 0, q_cur_edge->GetDstArgIndex()}); + auto output_edge = Node_EdgeEnd::Create(q_cur_edge->GetNode(), 0, q_cur_edge->GetDstArgIndex()); + output_edges.push_back(output_edge.get()); + output_edges_holder.push_back(std::move(output_edge)); } } - NodeUnit custom_node_unit(dq_nodes, target_node, q_nodes, NodeUnit::Type::QDQGroup, - inputs, outputs, num_dqs, output_edges); - const auto* conv_op_builder = qnn::GetOpBuilder(custom_node_unit.OpType()); + std::unique_ptr custom_node_unit = NodeUnit::Create(dq_nodes, target_node, + q_nodes, NodeUnit::Type::QDQGroup, + inputs, outputs, num_dqs, output_edges); + const auto* conv_op_builder = qnn::GetOpBuilder(custom_node_unit->OpType()); if (conv_op_builder == nullptr) { return Status::OK(); } if (validate) { - return conv_op_builder->IsOpSupported(qnn_model_wrapper, custom_node_unit, logger); + return conv_op_builder->IsOpSupported(qnn_model_wrapper, *custom_node_unit, logger); } - return conv_op_builder->AddToModelBuilder(qnn_model_wrapper, custom_node_unit, logger, validate); + return conv_op_builder->AddToModelBuilder(qnn_model_wrapper, *custom_node_unit, logger, validate); } // Traverses graph to check if the given NodeUnit is part of a valid DQ* -> Conv -> Relu/Clip -> Q sequence. 
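A note on the ownership pattern introduced above: NodeUnit::Create() and Node_EdgeEnd::Create() hand back heap-allocated objects across the provider bridge, so the fusion code keeps a vector of unique_ptr as the owner and passes a parallel vector of raw pointers to the factory. A minimal self-contained sketch of that holder-plus-view pattern (Edge is a stand-in type, not the real bridge API):

#include <memory>
#include <vector>

struct Edge { int src; int dst; };  // Stand-in for Node_EdgeEnd.

int main() {
  std::vector<std::unique_ptr<Edge>> holder;  // Owns the objects.
  std::vector<const Edge*> view;              // Non-owning view handed to consumers.

  for (int i = 0; i < 3; ++i) {
    auto e = std::make_unique<Edge>(Edge{i, i + 1});
    view.push_back(e.get());         // Pointer stays valid because...
    holder.push_back(std::move(e));  // ...ownership moves into the holder.
  }
  // Reallocation of either vector moves only pointers, never the Edge objects,
  // so `view` remains usable for as long as `holder` lives.
  return view.size() == 3 ? 0 : 1;
}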
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.h index b604b25e943e6..a211c86c2301e 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.h @@ -9,7 +9,7 @@ #include #include -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc index caf4725626338..3af2fdd1f0276 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc @@ -6,9 +6,8 @@ #include #include #include -#include "core/graph/graph_utils.h" -#include "core/framework/node_unit.h" -#include "core/providers/shared/utils/utils.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group/utils.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h index 90fe44c3af059..d3d552bc172ec 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h @@ -7,8 +7,7 @@ #include #include -#include "core/common/common.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc index 76b1726646486..0b2d7451553e7 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc @@ -6,9 +6,8 @@ #include #include #include -#include "core/graph/graph_utils.h" -#include "core/framework/node_unit.h" -#include "core/providers/shared/utils/utils.h" + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" @@ -39,7 +38,7 @@ std::unique_ptr HardSigmoidMulFusion::TryFusion( return nullptr; } - NodeAttrHelper hs_attr_helper(hardsigmoid_node_unit); + utils::NodeAttrHelper hs_attr_helper(hardsigmoid_node_unit); float alpha = hs_attr_helper.Get("alpha", 0.2f); float beta = hs_attr_helper.Get("beta", 0.5f); constexpr float req_alpha = 1.0f / 6.0f; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h index 3b67f13492a46..0a1b16d24ffcd 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h @@ -7,8 +7,7 @@ #include #include -#include "core/common/common.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include 
"core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc index 9fb9e815321c0..56413b781b246 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc @@ -10,8 +10,7 @@ #include #include #include -#include "core/graph/graph_utils.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc index 5548d7d37c378..93b2fca296389 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc @@ -4,8 +4,7 @@ #include #include -#include "core/graph/graph_viewer.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h index 0d11d21906ccb..c4cf4e8a20a92 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h @@ -7,8 +7,7 @@ #include #include -#include "core/graph/graph_viewer.h" -#include "core/framework/node_unit.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h index 23330f5616d73..01c15cf4bebe6 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h @@ -4,10 +4,10 @@ #pragma once #include #include -#include "QnnTypes.h" -#include "core/common/common.h" #include -#include "core/framework/node_unit.h" + +#include "core/providers/qnn/ort_api.h" +#include "QnnTypes.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index 8d2cb5bdb6da0..74cdf2ad17f42 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -1,15 +1,15 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#include "qnn_utils.h" + #include #include #include #include #include -#include "core/common/common.h" -#include "core/framework/data_types.h" -#include "qnn_utils.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_def.h" namespace onnxruntime { @@ -63,6 +63,42 @@ size_t GetElementSizeByType(ONNXTensorElementDataType elem_type) { return pos->second; } +size_t GetElementSizeByType(ONNX_NAMESPACE::TensorProto_DataType onnx_type) { + switch (onnx_type) { + case ONNX_NAMESPACE::TensorProto_DataType_INT4: + return sizeof(Int4x2); + case ONNX_NAMESPACE::TensorProto_DataType_UINT4: + return sizeof(UInt4x2); + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + return sizeof(int8_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT8: + return sizeof(uint8_t); + case ONNX_NAMESPACE::TensorProto_DataType_INT16: + return sizeof(int16_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT16: + return sizeof(uint16_t); + case ONNX_NAMESPACE::TensorProto_DataType_INT32: + return sizeof(int32_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT32: + return sizeof(uint32_t); + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + return sizeof(int64_t); + case ONNX_NAMESPACE::TensorProto_DataType_UINT64: + return sizeof(uint64_t); + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: + return 2; + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: + return sizeof(float); + case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE: + return sizeof(double); + case ONNX_NAMESPACE::TensorProto_DataType_BOOL: + return sizeof(bool); + default: + return 0; + } + // Unreachable +} + std::ostream& operator<<(std::ostream& out, const Qnn_Scalar_t& scalar) { switch (scalar.dataType) { case QNN_DATATYPE_INT_8: @@ -487,39 +523,22 @@ bool OnnxDataTypeToQnnDataType(const int32_t onnx_data_type, Qnn_DataType_t& qnn } std::pair CheckMinMax(float rmin, float rmax) { - // Ensure a minimum range of 0.0001 (required by QNN) - rmax = std::max(rmax, rmin + 0.0001f); - // Both QNN and ORT require the range to include 0.0f rmin = std::min(rmin, 0.0f); rmax = std::max(rmax, 0.0f); + // Ensure a minimum range of 0.0001 (required by QNN) + rmax = std::max(rmax, rmin + 0.0001f); + return std::make_pair(rmin, rmax); } -template -Status GetQminQmax(const Qnn_DataType_t qnn_data_type, - T& qmin, - T& qmax) { - if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_8) { - qmin = static_cast(std::numeric_limits::min()); - qmax = static_cast(std::numeric_limits::max()); - } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_8) { - qmin = static_cast(std::numeric_limits::min()); - qmax = static_cast(std::numeric_limits::max()); - } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16) { - qmin = static_cast(std::numeric_limits::min()); - qmax = static_cast(std::numeric_limits::max()); - } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) { - qmin = static_cast(std::numeric_limits::min()); - qmax = static_cast(std::numeric_limits::max()); - } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { - qmin = static_cast(std::numeric_limits::min()); - qmax = static_cast(std::numeric_limits::max()); - } else { - ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", qnn_data_type); +inline float RoundHalfToEven(float input) { + if (!std::isfinite(input)) { + return input; } - return Status::OK(); + // std::remainder returns x - n, where n is the integral value nearest to x. 
When |x - n| = 0.5, n is chosen to be even + return input - std::remainderf(input, 1.f); } Status GetQuantParams(float rmin, @@ -535,20 +554,22 @@ Status GetQuantParams(float rmin, rmin = -abs_max; } - float qmin = 0.0f; - float qmax = 255.0f; - ORT_RETURN_IF_ERROR(GetQminQmax(qnn_data_type, qmin, qmax)); + double rmin_dbl = static_cast(rmin); + double rmax_dbl = static_cast(rmax); + double qmin = 0.0; + double qmax = 0.0; + ORT_RETURN_IF_ERROR(GetQminQmax(qnn_data_type, qmin, qmax, symmetric)); - scale = (rmax - rmin) / (qmax - qmin); - float initial_zero_point = 0.0f; + double scale_dbl = (rmax_dbl - rmin_dbl) / (qmax - qmin); + double initial_zero_point = 0.0; if (symmetric) { - initial_zero_point = std::round(rmin + rmax) / 2; + initial_zero_point = std::round(rmin_dbl + rmax_dbl) / 2; } else { - initial_zero_point = qmin - (rmin / scale); + initial_zero_point = qmin - (rmin_dbl / scale_dbl); } - zero_point = static_cast(RoundHalfToEven(Saturate(qmax, qmin, initial_zero_point))); - // To match QNN quantization definition - zero_point = 0 - zero_point; + zero_point = static_cast(RoundHalfToEven(static_cast(Saturate(qmax, qmin, initial_zero_point)))); + zero_point = -zero_point; // Negate to match QNN quantization definition. + scale = static_cast(scale_dbl); return Status::OK(); } @@ -570,6 +591,373 @@ Status Quantize(const double double_value, return Status::OK(); } +size_t ShapeSizeCalc(gsl::span shape, size_t start, size_t end) { + size_t size = 1; + for (size_t i = start; i < end; i++) { + size *= shape[i]; + } + return size; +} + +Status GetDataQuantParams(gsl::span data, gsl::span shape, + /*out*/ gsl::span scales, /*out*/ gsl::span offsets, + Qnn_DataType_t data_type, bool symmetric, std::optional axis) { + const size_t num_dims = shape.size(); + const size_t num_elems = ShapeSizeCalc(shape, 0, num_dims); + ORT_RETURN_IF_NOT(num_elems == data.size(), "Shape mismatch with data to quantize"); + + size_t block_count = 1; + size_t broadcast_dim = 1; + size_t block_size = num_elems; + + if (axis.has_value()) { + size_t axis_no_neg = *axis < 0 ? 
static_cast<size_t>(*axis) + num_dims : static_cast<size_t>(*axis);
+    block_count = ShapeSizeCalc(shape, 0, axis_no_neg);
+    broadcast_dim = shape[axis_no_neg];
+    block_size = ShapeSizeCalc(shape, axis_no_neg + 1, num_dims);
+  }
+
+  ORT_RETURN_IF_NOT(scales.size() == broadcast_dim, "Unexpected size of scales output buffer");
+  ORT_RETURN_IF_NOT(offsets.size() == broadcast_dim, "Unexpected size of offsets output buffer");
+
+  size_t i = 0;
+  for (size_t n = 0; n < block_count; n++) {
+    for (size_t bd = 0; bd < broadcast_dim; bd++) {
+      float rmin = std::numeric_limits<float>::max();
+      float rmax = std::numeric_limits<float>::lowest();
+      for (size_t j = 0; j < block_size; j++) {
+        rmin = std::min(rmin, data[i]);
+        rmax = std::max(rmax, data[i]);
+        i++;
+      }
+
+      scales[bd] = 1.0f;
+      offsets[bd] = 0;
+      ORT_RETURN_IF_ERROR(GetQuantParams(rmin, rmax, data_type, scales[bd], offsets[bd], symmetric));
+    }
+  }
+
+  assert(i == data.size());
+  return Status::OK();
+}
+
+Status QuantizeData(gsl::span<const float> data, gsl::span<const size_t> shape,
+                    gsl::span<const float> scales, gsl::span<const int32_t> offsets,
+                    /*out*/ gsl::span<uint8_t> quant_bytes, Qnn_DataType_t data_type,
+                    std::optional<int64_t> axis) {
+  const size_t num_dims = shape.size();
+  const size_t num_elems = ShapeSizeCalc(shape, 0, num_dims);
+  ORT_RETURN_IF_NOT(num_elems == data.size(), "Shape mismatch with data to quantize");
+  size_t expected_num_quant_bytes = GetElementSizeByType(data_type) * data.size();
+  ORT_RETURN_IF_NOT(quant_bytes.size() == expected_num_quant_bytes,
+                    "Cannot quantize data because output buffer is not the correct size");
+
+  size_t block_count = 1;
+  size_t broadcast_dim = 1;
+  size_t block_size = num_elems;
+
+  if (axis.has_value()) {
+    size_t axis_no_neg = *axis < 0 ? static_cast<size_t>(*axis) + num_dims : static_cast<size_t>(*axis);
+    block_count = ShapeSizeCalc(shape, 0, axis_no_neg);
+    broadcast_dim = shape[axis_no_neg];
+    block_size = ShapeSizeCalc(shape, axis_no_neg + 1, num_dims);
+  }
+
+  ORT_RETURN_IF_NOT(scales.size() == broadcast_dim, "Unexpected size of scales output buffer");
+  ORT_RETURN_IF_NOT(offsets.size() == broadcast_dim, "Unexpected size of offsets output buffer");
+
+  size_t i = 0;
+  for (size_t n = 0; n < block_count; n++) {
+    for (size_t bd = 0; bd < broadcast_dim; bd++) {
+      switch (data_type) {
+        case QNN_DATATYPE_SFIXED_POINT_8: {
+          auto input_span = gsl::make_span(&data[i], block_size);
+          auto output_span = gsl::make_span(&quant_bytes[i * sizeof(int8_t)], sizeof(int8_t) * block_size);
+          ORT_RETURN_IF_ERROR(QuantizeData<int8_t>(input_span, scales[bd], offsets[bd], output_span));
+          break;
+        }
+        case QNN_DATATYPE_UFIXED_POINT_8: {
+          auto input_span = gsl::make_span(&data[i], block_size);
+          auto output_span = gsl::make_span(&quant_bytes[i * sizeof(uint8_t)], sizeof(uint8_t) * block_size);
+          ORT_RETURN_IF_ERROR(QuantizeData<uint8_t>(input_span, scales[bd], offsets[bd], output_span));
+          break;
+        }
+        case QNN_DATATYPE_SFIXED_POINT_16: {
+          auto input_span = gsl::make_span(&data[i], block_size);
+          auto output_span = gsl::make_span(&quant_bytes[i * sizeof(int16_t)], sizeof(int16_t) * block_size);
+          ORT_RETURN_IF_ERROR(QuantizeData<int16_t>(input_span, scales[bd], offsets[bd], output_span));
+          break;
+        }
+        case QNN_DATATYPE_UFIXED_POINT_16: {
+          auto input_span = gsl::make_span(&data[i], block_size);
+          auto output_span = gsl::make_span(&quant_bytes[i * sizeof(uint16_t)], sizeof(uint16_t) * block_size);
+          ORT_RETURN_IF_ERROR(QuantizeData<uint16_t>(input_span, scales[bd], offsets[bd], output_span));
+          break;
+        }
+        case QNN_DATATYPE_SFIXED_POINT_32: {
+          auto input_span = gsl::make_span(&data[i], block_size);
+          auto output_span = gsl::make_span(&quant_bytes[i * sizeof(int32_t)], sizeof(int32_t) * block_size);
+          ORT_RETURN_IF_ERROR(QuantizeData<int32_t>(input_span, scales[bd], offsets[bd], output_span));
+          break;
+        }
+        default:
+          return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported quantization data type for QuantizeData");
+      }
+      i += block_size;
+    }
+  }
+  assert(i == data.size());
+
+  return Status::OK();
+}
+
+static bool GetType(const NodeArg& node_arg, int32_t& type, const logging::Logger& logger) {
+  type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED;
+  const auto* type_proto = node_arg.TypeAsProto();
+  if (!type_proto || !type_proto->has_tensor_type() || !type_proto->tensor_type().has_elem_type()) {
+    LOGS(logger, WARNING) << "NodeArg [" << node_arg.Name() << "] has no input type";
+    return false;
+  }
+
+  type = type_proto->tensor_type().elem_type();
+  return true;
+}
+
+NodeAttrHelper::NodeAttrHelper(const onnxruntime::Node& node)
+    : node_attributes_(node.GetAttributes()) {}
+
+NodeAttrHelper::NodeAttrHelper(const NodeUnit& node_unit)
+    : node_attributes_(node_unit.GetNode().GetAttributes()) {}
+
+float NodeAttrHelper::Get(const std::string& key, float def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return entry->second().f();
+  }
+
+  return def_val;
+}
+
+int32_t NodeAttrHelper::Get(const std::string& key, int32_t def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return narrow<int32_t>(entry->second().i());
+  }
+
+  return def_val;
+}
+
+uint32_t NodeAttrHelper::Get(const std::string& key, uint32_t def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return narrow<uint32_t>(entry->second().i());
+  }
+
+  return def_val;
+}
+
+int64_t NodeAttrHelper::Get(const std::string& key, int64_t def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return entry->second().i();
+  }
+
+  return def_val;
+}
+
+const std::string& NodeAttrHelper::Get(const std::string& key, const std::string& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return entry->second().s();
+  }
+
+  return def_val;
+}
+
+std::vector<int32_t> NodeAttrHelper::Get(const std::string& key, const std::vector<int32_t>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = entry->second().ints();
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    std::vector<int32_t> v;
+    v.reserve(static_cast<size_t>(values.size()));
+    std::transform(cbegin, cend, std::back_inserter(v),
+                   [](int64_t val) -> int32_t { return narrow<int32_t>(val); });
+    return v;
+  }
+
+  return def_val;
+}
+
+std::vector<uint32_t> NodeAttrHelper::Get(const std::string& key, const std::vector<uint32_t>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = entry->second().ints();
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    std::vector<uint32_t> v;
+    v.reserve(static_cast<size_t>(values.size()));
+    std::transform(cbegin, cend, std::back_inserter(v),
+                   [](int64_t val) -> uint32_t { return narrow<uint32_t>(val); });
+    return v;
+  }
+
+  return def_val;
+}
+
+std::vector<int64_t> NodeAttrHelper::Get(const std::string& key, const std::vector<int64_t>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = entry->second().ints();
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    return std::vector<int64_t>{cbegin, cend};
+  }
+
+  return def_val;
+}
+
+std::vector<float> NodeAttrHelper::Get(const std::string& key, const std::vector<float>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = entry->second().floats();
+    const float* cbegin = values.data();
+    const float* cend = values.data() + values.size();
+    return std::vector<float>{cbegin, cend};
+  }
+
+  return def_val;
+}
+
+std::optional<float> NodeAttrHelper::GetFloat(const std::string& key) const {
+  std::optional<float> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    result = entry->second().f();
+  }
+
+  return result;
+}
+
+std::optional<int64_t> NodeAttrHelper::GetInt64(const std::string& key) const {
+  std::optional<int64_t> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    result = entry->second().i();
+  }
+
+  return result;
+}
+
+std::optional<std::vector<float>> NodeAttrHelper::GetFloats(const std::string& key) const {
+  std::optional<std::vector<float>> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = entry->second().floats();
+    const float* cbegin = values.data();
+    const float* cend = values.data() + values.size();
+    result = std::vector<float>(cbegin, cend);
+  }
+
+  return result;
+}
+
+std::optional<std::vector<int64_t>> NodeAttrHelper::GetInt64s(const std::string& key) const {
+  std::optional<std::vector<int64_t>> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = entry->second().ints();
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    result = std::vector<int64_t>(cbegin, cend);
+  }
+
+  return result;
+}
+
+std::optional<std::string> NodeAttrHelper::GetString(const std::string& key) const {
+  std::optional<std::string> result;
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    result = entry->second().s();
+  }
+
+  return result;
+}
+
+bool NodeAttrHelper::HasAttr(const std::string& key) const {
+  return node_attributes_.find(key) != node_attributes_.end();
+}
+
+static bool GetClipMinMaxImpl(const GraphViewer& graph_viewer, const Node& node, float& min, float& max,
+                              const logging::Logger& logger) {
+  const auto& node_name = node.Name();
+  int32_t input_type;
+  if (!GetType(*node.InputDefs()[0], input_type, logger)) {
+    return false;
+  }
+
+  min = std::numeric_limits<float>::lowest();
+  max = std::numeric_limits<float>::max();
+
+  if (node.SinceVersion() < 11) {  // Clip opset 1, 6 is using attributes for min/max
+    NodeAttrHelper helper(node);
+    // attributes will be always float
+    min = helper.Get("min", std::numeric_limits<float>::lowest());
+    max = helper.Get("max", std::numeric_limits<float>::max());
+  } else {
+    auto get_value =
+        [&](const ONNX_NAMESPACE::TensorProto* initializer, std::string_view type, float& value) -> bool {
+      if (!initializer) {
+        LOGS(logger, VERBOSE) << type << " input of Clip must be a constant initializer";
+        return false;
+      }
+
+      switch (input_type) {
+        case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: {
+          std::vector<uint8_t> bytes(sizeof(float));
+          auto status = onnxruntime::utils::UnpackInitializerData(*initializer, graph_viewer.ModelPath(), bytes);
+          if (!status.IsOK()) {
+            LOGS(logger, ERROR) << "GetClipMinMax() failed to unpack float initializer: " << status.ErrorMessage();
+            return false;
+          }
+          value = *reinterpret_cast<float*>(bytes.data());
+          break;
+        }
+        case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: {
+          std::vector<uint8_t> bytes(sizeof(MLFloat16));
+          auto status = onnxruntime::utils::UnpackInitializerData(*initializer, graph_viewer.ModelPath(), bytes);
+          if (!status.IsOK()) {
+            LOGS(logger, ERROR) << "GetClipMinMax() failed to unpack float16 initializer: " << status.ErrorMessage();
+            return false;
+          }
+          value = reinterpret_cast<MLFloat16*>(bytes.data())->ToFloat();
+          break;
+        }
+        default:
+          LOGS(logger, VERBOSE) << "GetClipMinMax() only supports float and float16 as min and max inputs for now."
+                                << " The node [" << node_name << "] has input type: " << input_type;
+          return false;
+      }
+
+      return true;
+    };
+
+    // min and max are both optional. could have neither, one or both.
+    if (node.InputDefs().size() > 1 && node.InputDefs()[1]->Exists()) {
+      // we have input min
+      const auto& min_name = node.InputDefs()[1]->Name();
+      const auto* min_value = graph_viewer.GetConstantInitializer(min_name);
+      if (!get_value(min_value, "Min", min)) {
+        return false;
+      }
+    }
+
+    if (node.InputDefs().size() > 2 && node.InputDefs()[2]->Exists()) {
+      // we have input max
+      const auto& max_name = node.InputDefs()[2]->Name();
+      const auto* max_value = graph_viewer.GetConstantInitializer(max_name);
+      if (!get_value(max_value, "Max", max)) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+bool GetClipMinMax(const GraphViewer& graph_viewer, const Node& node, float& min, float& max,
+                   const logging::Logger& logger) {
+  return GetClipMinMaxImpl(graph_viewer, node, min, max, logger);
+}
+
 }  // namespace utils
 }  // namespace qnn
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
index aa4a27460563f..1a30b10105fbf 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
@@ -9,9 +9,7 @@
 #include
 #include "QnnTypes.h"
-#include "core/session/onnxruntime_cxx_api.h"
-#include "core/framework/node_unit.h"
-#include "core/util/qmath.h"
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime {
 namespace qnn {
@@ -22,6 +20,8 @@ size_t GetElementSizeByType(const Qnn_DataType_t& data_type);
 
 size_t GetElementSizeByType(ONNXTensorElementDataType elem_type);
 
+size_t GetElementSizeByType(ONNX_NAMESPACE::TensorProto_DataType onnx_type);
+
 // TODO: make these work with Wrappers?
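Reviewer note: the flattened loops in GetDataQuantParams/QuantizeData above can be hard to picture. The following standalone sketch (not part of this PR) shows how the (block_count, broadcast_dim, block_size) decomposition tiles a tensor for per-channel parameters; it assumes uint8 asymmetric quantization and simplifies the rounding that the EP's GetQuantParams performs.

    // Standalone sketch of the per-axis decomposition (illustrative only).
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <limits>
    #include <vector>

    int main() {
      const std::vector<size_t> shape = {2, 3};  // e.g. [out_channels, features]
      const size_t axis = 0;                     // one (scale, offset) per shape[axis]
      const std::vector<float> data = {-1.f, 0.f, 1.f, 0.f, 2.f, 4.f};

      size_t block_count = 1, block_size = 1;
      for (size_t d = 0; d < axis; ++d) block_count *= shape[d];             // dims before axis
      for (size_t d = axis + 1; d < shape.size(); ++d) block_size *= shape[d];  // dims after axis
      const size_t broadcast_dim = shape[axis];

      size_t i = 0;
      for (size_t n = 0; n < block_count; ++n) {
        for (size_t bd = 0; bd < broadcast_dim; ++bd) {
          float rmin = std::numeric_limits<float>::max();
          float rmax = std::numeric_limits<float>::lowest();
          for (size_t j = 0; j < block_size; ++j, ++i) {  // scan one contiguous block
            rmin = std::min(rmin, data[i]);
            rmax = std::max(rmax, data[i]);
          }
          const float scale = (rmax - rmin) / 255.0f;  // uint8: qmax - qmin = 255
          const int32_t offset = static_cast<int32_t>(std::round(rmin / scale));  // simplified
          std::printf("channel %zu: scale=%f offset=%d\n", bd, scale, offset);
        }
      }
    }

With axis = 0 this prints one (scale, offset) pair per output channel, which is exactly the broadcast_dim-sized scales/offsets buffers the real functions validate up front.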
std::ostream& operator<<(std::ostream& out, const Qnn_Param_t& qnn_param);
 std::ostream& operator<<(std::ostream& out, const Qnn_Tensor_t& tensor);
 
@@ -74,7 +74,30 @@ static bool ArrayHasString(const std::array<std::string_view, N>& strings, std::
 std::pair<float, float> CheckMinMax(float rmin, float rmax);
 
 template <typename T>
-Status GetQminQmax(const Qnn_DataType_t qnn_data_type, T& qmin, T& qmax);
+Status GetQminQmax(const Qnn_DataType_t qnn_data_type,
+                   T& qmin,
+                   T& qmax,
+                   bool symmetric = false) {
+  if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_8) {
+    qmin = static_cast<T>(std::numeric_limits<int8_t>::min() + static_cast<int8_t>(symmetric));
+    qmax = static_cast<T>(std::numeric_limits<int8_t>::max());
+  } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_8) {
+    qmin = static_cast<T>(std::numeric_limits<uint8_t>::min());
+    qmax = static_cast<T>(std::numeric_limits<uint8_t>::max());
+  } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16) {
+    qmin = static_cast<T>(std::numeric_limits<int16_t>::min() + static_cast<int16_t>(symmetric));
+    qmax = static_cast<T>(std::numeric_limits<int16_t>::max());
+  } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) {
+    qmin = static_cast<T>(std::numeric_limits<uint16_t>::min());
+    qmax = static_cast<T>(std::numeric_limits<uint16_t>::max());
+  } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) {
+    qmin = static_cast<T>(std::numeric_limits<int32_t>::min() + static_cast<int32_t>(symmetric));
+    qmax = static_cast<T>(std::numeric_limits<int32_t>::max());
+  } else {
+    ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", qnn_data_type);
+  }
+  return Status::OK();
+}
 
 template <typename T>
 inline T Saturate(const T qmax,
@@ -104,6 +127,145 @@ Status Quantize(const double double_value,
                 const Qnn_DataType_t qnn_data_type,
                 int& quant_value);
 
+size_t ShapeSizeCalc(gsl::span<const size_t> shape, size_t start, size_t end);
+
+Status GetDataQuantParams(gsl::span<const float> data, gsl::span<const size_t> shape,
+                          /*out*/ gsl::span<float> scales, /*out*/ gsl::span<int32_t> offsets,
+                          Qnn_DataType_t data_type, bool symmetric = false,
+                          std::optional<int64_t> axis = std::nullopt);
+
+Status QuantizeData(gsl::span<const float> data, gsl::span<const size_t> shape,
+                    gsl::span<const float> scales, gsl::span<const int32_t> offsets,
+                    /*out*/ gsl::span<uint8_t> quant_bytes, Qnn_DataType_t data_type,
+                    std::optional<int64_t> axis = std::nullopt);
+
+template <typename QuantType>
+inline Status QuantizeData(gsl::span<const float> data, float scale, int32_t offset,
+                           /*out*/ gsl::span<uint8_t> quant_bytes) {
+  const size_t num_elems = data.size();
+  const size_t expected_output_bytes = sizeof(QuantType) * num_elems;
+  ORT_RETURN_IF_NOT(expected_output_bytes == quant_bytes.size(),
+                    "Output buffer is not large enough to hold quantized bytes.");
+  const double clip_min = static_cast<double>(std::numeric_limits<QuantType>::lowest());
+  const double clip_max = static_cast<double>(std::numeric_limits<QuantType>::max());
+
+  QuantType* output = reinterpret_cast<QuantType*>(quant_bytes.data());
+  for (size_t i = 0; i < num_elems; ++i) {
+    const double scale_dbl = static_cast<double>(scale);
+    const double offset_dbl = static_cast<double>(offset);
+    double float_val = std::nearbyint(static_cast<double>(data[i]) / scale_dbl) - offset_dbl;
+    float_val = std::max(float_val, clip_min);
+    float_val = std::min(float_val, clip_max);
+    output[i] = static_cast<QuantType>(float_val);
+  }
+  return Status::OK();
+}
+
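Reviewer note: a standalone numeric check (not part of this PR) of the per-element affine mapping the templated QuantizeData above applies, q = saturate(nearbyint(x / scale) - offset). The subtraction reflects QNN's offset being the negated ONNX zero point; the constants below are made up for illustration.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const float scale = 0.1f;
      const int32_t offset = -128;  // QNN-style offset for a uint8 zero point of 128
      const float x = 1.0f;
      double q = std::nearbyint(static_cast<double>(x) / scale) - offset;  // 10 - (-128)
      q = std::clamp(q, 0.0, 255.0);                                       // uint8 clip range
      std::printf("quantized %.2f -> %d\n", x, static_cast<int>(q));       // prints 138
    }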
+// Re-writes a buffer of packed 4-bit elements to a buffer of unpacked 8-bit elements.
+// QNN requires that 4-bit weights are unpacked to 8-bit.
+template <bool Signed>
+Status UnpackInt4ToInt8(size_t num_int4_elems, std::vector<uint8_t>& data_bytes) {
+  if constexpr (Signed) {  // INT4
+    std::vector<uint8_t> packed_int4_bytes = std::move(data_bytes);
+    data_bytes = std::vector<uint8_t>(num_int4_elems);
+
+    auto dst = gsl::make_span(reinterpret_cast<int8_t*>(data_bytes.data()), data_bytes.size());
+    auto src = gsl::make_span(reinterpret_cast<const Int4x2*>(packed_int4_bytes.data()), packed_int4_bytes.size());
+    ORT_RETURN_IF_NOT(Int4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN");
+
+    // NOTE: Masking off top 4 bits to workaround a QNN INT4 accuracy bug.
+    // Docs explicitly state that masking off top 4 bits should not be required, but we have to do it.
+    for (size_t i = 0; i < dst.size(); i++) {
+      dst[i] &= 0x0F;  // -3 (0b1111_1101) becomes 13 (0b0000_1101)
+    }
+  } else {  // UINT4
+    std::vector<uint8_t> packed_uint4_bytes = std::move(data_bytes);
+    data_bytes = std::vector<uint8_t>(num_int4_elems);
+
+    auto dst = gsl::make_span(reinterpret_cast<uint8_t*>(data_bytes.data()), data_bytes.size());
+    auto src = gsl::make_span(reinterpret_cast<const UInt4x2*>(packed_uint4_bytes.data()), packed_uint4_bytes.size());
+    ORT_RETURN_IF_NOT(UInt4x2::Unpack(dst, src), "Failed to unpack Tensor for QNN");
+  }
+
+  return Status::OK();
+}
+
+template <typename T>
+std::vector<T> GetInitializerShape(const ONNX_NAMESPACE::TensorProto& tensor_proto) {
+  const auto& dims = tensor_proto.dims();
+  std::vector<T> tensor_shape_vec(static_cast<size_t>(dims.size()));
+  for (int i = 0; i < dims.size(); ++i) {
+    tensor_shape_vec[i] = static_cast<T>(dims[i]);
+  }
+
+  return tensor_shape_vec;
+}
+
+template <typename T>
+Status PermuteShape(gsl::span<const uint32_t> input_shape, gsl::span<const T> perm, gsl::span<uint32_t> output_shape) {
+  const size_t rank = input_shape.size();
+  ORT_RETURN_IF_NOT(rank == perm.size() && rank == output_shape.size(),
+                    "PermuteShape(): expect all arguments to have the same rank.");
+
+  for (size_t i = 0; i < rank; ++i) {
+    size_t p = static_cast<size_t>(perm[i]);
+    output_shape[i] = input_shape[p];
+  }
+
+  return Status::OK();
+}
+
+/**
+ * Wrapping onnxruntime::Node for retrieving attribute values
+ */
+class NodeAttrHelper {
+ public:
+  explicit NodeAttrHelper(const Node& node);
+
+  // Get the attributes from the target node of the node_unit
+  explicit NodeAttrHelper(const NodeUnit& node_unit);
+
+  /*
+   * Get with default
+   */
+  float Get(const std::string& key, float def_val) const;
+  std::vector<float> Get(const std::string& key, const std::vector<float>& def_val) const;
+
+  int64_t Get(const std::string& key, int64_t def_val) const;
+  std::vector<int64_t> Get(const std::string& key, const std::vector<int64_t>& def_val) const;
+
+  const std::string& Get(const std::string& key, const std::string& def_val) const;
+
+  // Convert the i() or ints() of the attribute from int64_t to int32_t
+  int32_t Get(const std::string& key, int32_t def_val) const;
+  std::vector<int32_t> Get(const std::string& key, const std::vector<int32_t>& def_val) const;
+
+  // Convert the i() or ints() of the attribute from int64_t to uint32_t
+  uint32_t Get(const std::string& key, uint32_t def_val) const;
+  std::vector<uint32_t> Get(const std::string& key, const std::vector<uint32_t>& def_val) const;
+
+  /*
+   * Get without default.
+   */
+  std::optional<float> GetFloat(const std::string& key) const;
+  std::optional<std::vector<float>> GetFloats(const std::string& key) const;
+
+  std::optional<int64_t> GetInt64(const std::string& key) const;
+  std::optional<std::vector<int64_t>> GetInt64s(const std::string& key) const;
+
+  std::optional<std::string> GetString(const std::string& key) const;
+
+  bool HasAttr(const std::string& key) const;
+
+ private:
+  const NodeAttributes& node_attributes_;
+};
+
+// Get the min/max of a Clip operator.
Reads values from attributes for opset < 11 and inputs after that. +// For opset 11+, if min/max are not constant initializers, will return false. +// For now we only support getting float min/max. +bool GetClipMinMax(const GraphViewer& graph_viewer, const Node& node, + float& min, float& max, const logging::Logger& logger); } // namespace utils } // namespace qnn } // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/ort_api.h b/onnxruntime/core/providers/qnn/ort_api.h new file mode 100644 index 0000000000000..96f6ab76f113f --- /dev/null +++ b/onnxruntime/core/providers/qnn/ort_api.h @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License + +#pragma once + +#define BUILD_QNN_EP_STATIC 0 + +#if BUILD_QNN_EP_STATIC +#ifdef _WIN32 +#include +#include "core/platform/tracing.h" +#include "core/platform/windows/logging/etw_sink.h" +#endif + +// Includes when building QNN EP statically +#include "onnx/defs/data_type_utils.h" +#include "core/common/common.h" +#include "core/common/status.h" +#include "core/common/safeint.h" +#include "core/common/logging/logging.h" +#include "core/common/logging/capture.h" +#include "core/common/path_string.h" +#include "core/platform/env.h" +#include "core/framework/data_types.h" +#include "core/framework/float16.h" +#include "core/framework/run_options.h" +#include "core/framework/execution_provider.h" +#include "core/framework/model_metadef_id_generator.h" +#include "core/framework/compute_capability.h" +#include "core/framework/tensor_shape.h" +#include "core/framework/node_unit.h" +#include "core/framework/tensorprotoutils.h" +#include "core/framework/utils.h" +#include "core/graph/constants.h" +#include "core/graph/basic_types.h" +#include "core/graph/model.h" +#include "core/graph/graph_viewer.h" +#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" +#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" +#include "core/providers/common.h" +#include "core/providers/partitioning_utils.h" +#include "core/session/onnxruntime_cxx_api.h" +#else +// Includes when building QNN EP as a shared library +#include "core/providers/shared_library/provider_api.h" +#define ORT_API_MANUAL_INIT +#include "core/session/onnxruntime_cxx_api.h" +#endif + +#include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/session/onnxruntime_run_options_config_keys.h" diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 1d9242f8a5939..41a7f780a34f0 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -5,58 +5,18 @@ #include #include -#include "core/framework/compute_capability.h" -#include "core/graph/graph_viewer.h" -#include "core/session/onnxruntime_session_options_config_keys.h" -#include "core/session/onnxruntime_run_options_config_keys.h" -#include "core/session/onnxruntime_cxx_api.h" -#include "core/framework/kernel_registry.h" -#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" -#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" -#include "core/platform/env.h" -#include "core/providers/common.h" -#include "core/providers/partitioning_utils.h" -#include "core/providers/partitioning_utils.h" +#include "core/providers/qnn/ort_api.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include 
"core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_node_group.h" #include "core/providers/qnn/builder/qnn_def.h" #include "core/providers/qnn/builder/onnx_ctx_model_helper.h" -#include "core/framework/run_options.h" - -#ifdef _WIN32 -#include -#include "core/platform/windows/logging/etw_sink.h" -#endif namespace onnxruntime { constexpr const char* QNN = "QNN"; -static std::unique_ptr>> s_run_on_unload_; - -void RunOnUnload(std::function function) { - static std::mutex mutex; - std::lock_guard guard(mutex); - if (!s_run_on_unload_) { - s_run_on_unload_ = std::make_unique>>(); - } - s_run_on_unload_->push_back(std::move(function)); -} - -struct OnUnload { - ~OnUnload() { - if (!s_run_on_unload_) - return; - - for (auto& function : *s_run_on_unload_) - function(); - - s_run_on_unload_.reset(); - } - -} g_on_unload; - static void ParseProfilingLevel(std::string profiling_level_string, qnn::ProfilingLevel& profiling_level) { std::transform(profiling_level_string.begin(), @@ -193,17 +153,20 @@ qnn::ProfilingLevel QNNExecutionProvider::GetProfilingLevelFromETWLevel(unsigned } QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_options_map, - const SessionOptions* session_options) + const ConfigOptions* config_options) : IExecutionProvider{onnxruntime::kQnnExecutionProvider} { - if (session_options) { - disable_cpu_ep_fallback_ = session_options->config_options.GetConfigOrDefault( + InitProviderOrtApi(); + metadef_id_generator_ = ModelMetadefIdGenerator::Create(); + + if (config_options) { + disable_cpu_ep_fallback_ = config_options->GetConfigOrDefault( kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; - context_cache_enabled_ = session_options->config_options.GetConfigOrDefault( + context_cache_enabled_ = config_options->GetConfigOrDefault( kOrtSessionOptionEpContextEnable, "0") == "1"; LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_; - std::string embed_mode = session_options->config_options.GetConfigOrDefault( + std::string embed_mode = config_options->GetConfigOrDefault( kOrtSessionOptionEpContextEmbedMode, "0"); if ("1" == embed_mode) { qnn_context_embed_mode_ = true; @@ -214,18 +177,18 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << qnn_context_embed_mode_; - context_cache_path_cfg_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); + context_cache_path_cfg_ = config_options->GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_; // For the case that workaround QNN context PD memory limit, user need split the model into pieces and // generate the QNN context model separately. // It could happen that the generated EPContext node in separate graph has same node name. // User can set this context_node_name_prefix for each split pieces to avoid that happens. 
- context_node_name_prefix_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextNodeNamePrefix, ""); + context_node_name_prefix_ = config_options->GetConfigOrDefault(kOrtSessionOptionEpContextNodeNamePrefix, ""); LOGS_DEFAULT(VERBOSE) << "User specified QNN context node name prefix: " << context_node_name_prefix_; share_ep_contexts_ = - session_options->config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; + config_options->GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; LOGS_DEFAULT(VERBOSE) << "User specified option - share EP contexts across sessions: " << share_ep_contexts_; } @@ -246,7 +209,8 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio // separate out the profiling level for ETW in case it gets disabled later when we extract the events // set to invalid to indicate that ETW is no enabled when we setup QNN qnn::ProfilingLevel profiling_level_etw = qnn::ProfilingLevel::INVALID; - const Env& env = Env::Default(); + + const Env& env = GetDefaultEnv(); auto& provider = env.GetTelemetryProvider(); if (provider.IsEnabled()) { auto level = provider.Level(); @@ -402,7 +366,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio soc_model, enable_htp_weight_sharing); -#ifdef _WIN32 +#if defined(_WIN32) && defined(ETW_TRACE_LOGGING_SUPPORTED) auto& etwRegistrationManager = logging::EtwRegistrationManager::Instance(); // Register callback for ETW capture state (rundown) callback_ETWSink_provider_ = onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback( @@ -456,7 +420,7 @@ QNNExecutionProvider::~QNNExecutionProvider() { } // Unregister the ETW callback -#ifdef _WIN32 +#if defined(_WIN32) && defined(ETW_TRACE_LOGGING_SUPPORTED) if (callback_ETWSink_provider_ != nullptr) { logging::EtwRegistrationManager::Instance().UnregisterInternalCallback(callback_ETWSink_provider_); } @@ -488,9 +452,10 @@ static void LogNodeSupport(const logging::Logger& logger, oss << "\tREASON : " << support_status.ErrorMessage() << std::endl; } - logging::Capture(logger, log_severity, logging::Category::onnxruntime, - log_data_type, call_site) - .Stream() + std::unique_ptr log_capture = logging::Capture::Create(logger, log_severity, + logging::Category::onnxruntime, + log_data_type, call_site); + log_capture->Stream() << (support_status.IsOK() ? 
"Validation PASSED " : "Validation FAILED ") << "for " << num_nodes << " nodes in " << qnn_node_group.Type() << " (" << qnn_node_group.GetTargetNodeUnit()->OpType() << ") :" << std::endl @@ -569,7 +534,7 @@ QNNExecutionProvider::GetSupportedNodes(const GraphViewer& graph_viewer, static bool EpSharedContextsHasAllGraphs(const onnxruntime::GraphViewer& graph_viewer, const logging::Logger& logger) { for (const auto& node : graph_viewer.Nodes()) { - NodeAttrHelper node_helper(node); + qnn::utils::NodeAttrHelper node_helper(node); std::string cache_source = node_helper.Get(qnn::SOURCE, ""); std::transform(cache_source.begin(), @@ -594,11 +559,11 @@ static bool EpSharedContextsHasAllGraphs(const std::vectorName(); + const std::string& graph_name = ep_context_node.Name(); bool has_shared_qnn_model = SharedContext::GetInstance().HasQnnModel(graph_name); if (!has_shared_qnn_model) { LOGS(logger, VERBOSE) << "Graph: " << graph_name << " from EpContext node not found from shared EP contexts."; @@ -613,13 +578,13 @@ static bool EpSharedContextsHasAllGraphs(const std::vector>& result, - const utils::GenerateMetadefNameFn& gen_metadef_name, + const std::function& gen_metadef_name, const logging::Logger& logger) { std::unordered_set supported_nodes{}; std::vector> supported_groups{}; for (const auto& node : graph_viewer.Nodes()) { - NodeAttrHelper node_helper(node); + qnn::utils::NodeAttrHelper node_helper(node); std::string cache_source = node_helper.Get(qnn::SOURCE, ""); std::transform(cache_source.begin(), @@ -673,7 +638,7 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer const auto gen_metadef_name = [&]() { uint64_t model_hash; - int metadef_id = metadef_id_generator_.GenerateId(graph_viewer, model_hash); + int metadef_id = metadef_id_generator_->GenerateId(graph_viewer, model_hash); return MakeString(QNN, context_node_name_prefix_, "_", model_hash, "_", metadef_id); }; @@ -718,7 +683,7 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer std::vector> node_unit_holder; std::unordered_map node_unit_map; - std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(graph_viewer, logger); + std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(&graph_viewer, logger); // remove is_qnn_ctx_model related code const auto supported_nodes = GetSupportedNodes(graph_viewer, node_unit_map, @@ -761,11 +726,11 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer bool is_valid_partition = true; size_t nodes_in_partition = 0; - if (partition && partition->sub_graph) { - nodes_in_partition = partition->sub_graph->nodes.size(); + if (partition && partition->SubGraph()) { + nodes_in_partition = partition->SubGraph()->Nodes().size(); if (nodes_in_partition == 1 && !is_qnn_ctx_model) { - const Node* node = graph_viewer.GetNode(partition->sub_graph->nodes[0]); + const Node* node = graph_viewer.GetNode(partition->SubGraph()->Nodes()[0]); if (!node) { LOGS(logger, ERROR) << "QNN EP: Invalid node in partition of one node."; @@ -834,34 +799,34 @@ Status QNNExecutionProvider::CreateComputeFunc(std::vector& nod void QNNExecutionProvider::InitQnnGraphConfigs(qnn::QnnConfigsBuilder& configs_builder) const { if (qnn_backend_manager_->GetQnnBackendType() == qnn::QnnBackendType::HTP) { if (htp_graph_finalization_opt_mode_ != qnn::HtpGraphFinalizationOptimizationMode::kDefault) { - QnnHtpGraph_CustomConfig_t& htp_graph_opt_config = configs_builder.PushCustomConfig(); - htp_graph_opt_config.option = 
QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - htp_graph_opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - htp_graph_opt_config.optimizationOption.floatValue = static_cast(htp_graph_finalization_opt_mode_); - - QnnGraph_Config_t& graph_opt_config = configs_builder.PushConfig(); - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &htp_graph_opt_config; + gsl::not_null htp_graph_opt_config = configs_builder.PushCustomConfig(); + htp_graph_opt_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + htp_graph_opt_config->optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + htp_graph_opt_config->optimizationOption.floatValue = static_cast(htp_graph_finalization_opt_mode_); + + gsl::not_null graph_opt_config = configs_builder.PushConfig(); + graph_opt_config->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config->customConfig = htp_graph_opt_config; } if (vtcm_size_in_mb_ > 0) { - QnnHtpGraph_CustomConfig_t& htp_graph_opt_config_vtcm = configs_builder.PushCustomConfig(); - htp_graph_opt_config_vtcm.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - htp_graph_opt_config_vtcm.vtcmSizeInMB = static_cast(vtcm_size_in_mb_); + gsl::not_null htp_graph_opt_config_vtcm = configs_builder.PushCustomConfig(); + htp_graph_opt_config_vtcm->option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + htp_graph_opt_config_vtcm->vtcmSizeInMB = static_cast(vtcm_size_in_mb_); - QnnGraph_Config_t& graph_opt_config_vtcm = configs_builder.PushConfig(); - graph_opt_config_vtcm.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config_vtcm.customConfig = &htp_graph_opt_config_vtcm; + gsl::not_null graph_opt_config_vtcm = configs_builder.PushConfig(); + graph_opt_config_vtcm->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config_vtcm->customConfig = htp_graph_opt_config_vtcm; } if (enable_HTP_FP16_precision_) { - QnnHtpGraph_CustomConfig_t& htp_graph_precision_config = configs_builder.PushCustomConfig(); - htp_graph_precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; - htp_graph_precision_config.precision = QNN_PRECISION_FLOAT16; + gsl::not_null htp_graph_precision_config = configs_builder.PushCustomConfig(); + htp_graph_precision_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; + htp_graph_precision_config->precision = QNN_PRECISION_FLOAT16; - QnnGraph_Config_t& graph_precision_config = configs_builder.PushConfig(); - graph_precision_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_precision_config.customConfig = &htp_graph_precision_config; + gsl::not_null graph_precision_config = configs_builder.PushConfig(); + graph_precision_config->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_precision_config->customConfig = htp_graph_precision_config; } } } @@ -918,10 +883,10 @@ Status QNNExecutionProvider::Compile(const std::vector& fused if (EpSharedContextsHasAllGraphs(fused_nodes_and_graphs, logger)) { for (auto fused_node_and_graph : fused_nodes_and_graphs) { const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph); - const auto& ep_context_node = graph_viewer.Nodes().begin(); + const Node& ep_context_node = *graph_viewer.Nodes().begin(); const Node& fused_node = fused_node_and_graph.fused_node; const std::string& graph_meta_id = fused_node.Name(); - std::string key = ep_context_node->Name(); + std::string key = ep_context_node.Name(); auto qnn_model_shared = SharedContext::GetInstance().GetSharedQnnModel(key); ORT_RETURN_IF(nullptr == 
qnn_model_shared, "Graph: " + key + " not found from shared EP contexts."); ORT_RETURN_IF_ERROR(qnn_model_shared->SetGraphInputOutputInfo(graph_viewer, fused_node, logger)); @@ -963,10 +928,10 @@ Status QNNExecutionProvider::Compile(const std::vector& fused for (auto fused_node_and_graph : fused_nodes_and_graphs) { const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph); - const auto& ep_context_node = graph_viewer.Nodes().begin(); + const Node& ep_context_node = *graph_viewer.Nodes().begin(); const Node& fused_node = fused_node_and_graph.fused_node; const std::string& graph_meta_id = fused_node.Name(); - std::string key = ep_context_node->Name(); + std::string key = ep_context_node.Name(); ORT_RETURN_IF(qnn_models.find(key) == qnn_models.end(), key + " key name not exist in table qnn_models."); auto qnn_model = std::move(qnn_models[key]); ORT_RETURN_IF_ERROR(qnn_model->SetGraphInputOutputInfo(graph_viewer, fused_node, logger)); @@ -1007,7 +972,7 @@ Status QNNExecutionProvider::Compile(const std::vector& fused buffer_size, max_spill_fill_buffer_size)); } - qnn_ep_context_model_ = std::make_unique("qnn_ep_context_model", false, logger); + qnn_ep_context_model_ = Model::Create("qnn_ep_context_model", false, logger); ORT_RETURN_IF_ERROR(qnn::CreateEPContextNodes(qnn_ep_context_model_.get(), context_buffer.get(), buffer_size, @@ -1026,8 +991,8 @@ const InlinedVector QNNExecutionProvider::GetEpContextNodes() const InlinedVector ep_context_nodes; if (qnn_ep_context_model_) { const auto& graph = qnn_ep_context_model_->MainGraph(); - for (const auto& node : graph.Nodes()) { - ep_context_nodes.push_back(graph.GetNode(node.Index())); + for (const Node* node : graph.Nodes()) { + ep_context_nodes.push_back(graph.GetNode(node->Index())); } } @@ -1118,22 +1083,34 @@ void QNNExecutionProvider::ReleasePerThreadContext() const { per_thread_context_cache->erase(cached_context_it); } +static bool TryGetConfigEntry(const ConfigOptions& config_options, const std::string& key, std::string& value) { + std::optional new_value = config_options.GetConfigEntry(key); + if (!new_value.has_value()) { + return false; + } + + value = *new_value; + return true; +} + Status QNNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) { auto backend_type = qnn_backend_manager_->GetQnnBackendType(); if (qnn::QnnBackendType::HTP != backend_type && qnn::QnnBackendType::DSP != backend_type) { return Status::OK(); } + const ConfigOptions& config_options = run_options.GetConfigOptions(); + std::string htp_perf_mode = ""; qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault; - if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnPerfMode, htp_perf_mode)) { + if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnPerfMode, htp_perf_mode)) { // set power mode ParseHtpPerformanceMode(htp_perf_mode, htp_performance_mode); } std::string rpc_latency = ""; uint32_t rpc_control_latency = 0; - if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnRpcControlLatency, rpc_latency)) { + if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnRpcControlLatency, rpc_latency)) { rpc_control_latency = static_cast(std::stoul(rpc_latency)); LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << rpc_control_latency; } @@ -1159,9 +1136,11 @@ Status QNNExecutionProvider::OnRunEnd(bool /*sync_stream*/, const onnxruntime::R return Status::OK(); } + const ConfigOptions& config_options = run_options.GetConfigOptions(); + std::string 
htp_perf_mode = ""; qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault; - if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnPerfModePostRun, htp_perf_mode)) { + if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnPerfModePostRun, htp_perf_mode)) { // set power mode ParseHtpPerformanceMode(htp_perf_mode, htp_performance_mode); } diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index a0577e8fd87f2..cd3ccd96e31ab 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -3,26 +3,19 @@ #pragma once -#include "core/framework/execution_provider.h" -#include "core/framework/session_options.h" -#include "core/framework/model_metadef_id_generator.h" -#include "core/graph/model.h" +#include #include +#include +#include + +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/qnn_backend_manager.h" #include "core/providers/qnn/builder/qnn_model.h" #include "core/providers/qnn/builder/qnn_configs_helper.h" #include "HTP/QnnHtpGraph.h" -#include -#include -#include -#ifdef _WIN32 -#include "core/platform/windows/logging/etw_sink.h" -#endif namespace onnxruntime { -void RunOnUnload(std::function function); - class SharedContext { public: static SharedContext& GetInstance() { @@ -87,7 +80,7 @@ class SharedContext { // Logical device representation. class QNNExecutionProvider : public IExecutionProvider { public: - explicit QNNExecutionProvider(const ProviderOptions& provider_options_map, const SessionOptions* session_options); + explicit QNNExecutionProvider(const ProviderOptions& provider_options_map, const ConfigOptions* config_options); virtual ~QNNExecutionProvider(); ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QNNExecutionProvider); @@ -143,14 +136,14 @@ class QNNExecutionProvider : public IExecutionProvider { bool qnn_context_embed_mode_ = true; int32_t vtcm_size_in_mb_ = 0; std::unique_ptr qnn_ep_context_model_; - ModelMetadefIdGenerator metadef_id_generator_; + std::unique_ptr metadef_id_generator_; uint32_t device_id_ = 0; qnn::HtpPerformanceMode default_htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault; uint32_t default_rpc_control_latency_ = 0; bool enable_HTP_FP16_precision_ = true; bool share_ep_contexts_ = false; bool enable_spill_fill_buffer_ = false; -#ifdef _WIN32 +#if defined(_WIN32) && defined(ETW_TRACE_LOGGING_SUPPORTED) onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback callback_ETWSink_provider_ = nullptr; #endif qnn::ModelSettings model_settings_ = {}; diff --git a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc index 4095d7ff02a33..2407a7c83bfeb 100644 --- a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc +++ b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc @@ -2,32 +2,54 @@ // Licensed under the MIT License #include "core/providers/qnn/qnn_provider_factory_creator.h" - -#include "core/session/abi_session_options_impl.h" #include "core/providers/qnn/qnn_execution_provider.h" -#include "core/session/ort_apis.h" namespace onnxruntime { struct QNNProviderFactory : IExecutionProviderFactory { - QNNProviderFactory(const ProviderOptions& provider_options_map, const SessionOptions* session_options) - : provider_options_map_(provider_options_map), session_options_(session_options) { + QNNProviderFactory(const ProviderOptions& provider_options_map, const 
ConfigOptions* config_options)
+      : provider_options_map_(provider_options_map), config_options_(config_options) {
   }
 
   ~QNNProviderFactory() override {
   }
 
   std::unique_ptr<IExecutionProvider> CreateProvider() override {
-    return std::make_unique<QNNExecutionProvider>(provider_options_map_, session_options_);
+    return std::make_unique<QNNExecutionProvider>(provider_options_map_, config_options_);
   }
 
  private:
   ProviderOptions provider_options_map_;
-  const SessionOptions* session_options_;
+  const ConfigOptions* config_options_;
 };
 
-std::shared_ptr<IExecutionProviderFactory> QNNProviderFactoryCreator::Create(const ProviderOptions& provider_options_map,
-                                                                             const SessionOptions* session_options) {
-  return std::make_shared<QNNProviderFactory>(provider_options_map, session_options);
-}
+struct QNN_Provider : Provider {
+  std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory(const void* param) override {
+    if (param == nullptr) {
+      LOGS_DEFAULT(ERROR) << "[QNN EP] Passed NULL options to CreateExecutionProviderFactory()";
+      return nullptr;
+    }
+
+    std::array<const void*, 2> pointers_array = *reinterpret_cast<const std::array<const void*, 2>*>(param);
+    const ProviderOptions* provider_options = reinterpret_cast<const ProviderOptions*>(pointers_array[0]);
+    const ConfigOptions* config_options = reinterpret_cast<const ConfigOptions*>(pointers_array[1]);
+
+    if (provider_options == nullptr) {
+      LOGS_DEFAULT(ERROR) << "[QNN EP] Passed NULL ProviderOptions to CreateExecutionProviderFactory()";
+      return nullptr;
+    }
+
+    return std::make_shared<QNNProviderFactory>(*provider_options, config_options);
+  }
+
+  void Initialize() override {}
+  void Shutdown() override {}
+} g_provider;
 
 }  // namespace onnxruntime
+
+extern "C" {
+
+ORT_API(onnxruntime::Provider*, GetProvider) {
+  return &onnxruntime::g_provider;
+}
+}
diff --git a/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h b/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h
index 80f9d99b804e7..859152752893e 100644
--- a/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h
+++ b/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h
@@ -11,6 +11,7 @@
 namespace onnxruntime {
 struct SessionOptions;
 
+// defined in core/session/provider_bridge_ort.cc
 struct QNNProviderFactoryCreator {
   static std::shared_ptr<IExecutionProviderFactory> Create(const ProviderOptions& provider_options_map,
                                                            const SessionOptions* session_options);
diff --git a/onnxruntime/core/providers/qnn/symbols.def b/onnxruntime/core/providers/qnn/symbols.def
new file mode 100644
index 0000000000000..4ec2f7914c208
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/symbols.def
@@ -0,0 +1,2 @@
+EXPORTS
+  GetProvider
diff --git a/onnxruntime/core/providers/qnn/version_script.lds b/onnxruntime/core/providers/qnn/version_script.lds
new file mode 100644
index 0000000000000..094abb3329781
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/version_script.lds
@@ -0,0 +1,9 @@
+#_init and _fini should be local
+VERS_1.0 {
+  global:
+    GetProvider;
+
+    # Hide everything else.
+ local: + *; +}; diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 45f81ed22b7f7..d4295b88faa79 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -9,6 +9,25 @@ #pragma once #define SHARED_PROVIDER 1 +#ifdef _WIN32 +#include + +// ETW requires Windows 10 SDK or later +// https://stackoverflow.com/questions/2665755/how-can-i-determine-the-version-of-the-windows-sdk-installed-on-my-computer +#if VER_PRODUCTBUILD > 9600 +// ETW trace logging uses Windows 10 SDK's TraceLoggingProvider.h +#define ETW_TRACE_LOGGING_SUPPORTED 1 +#endif // VER_PRODUCTBUILD > 9600 + +#ifdef ETW_TRACE_LOGGING_SUPPORTED +#include +// TraceLoggingProvider.h must follow Windows.h +#include +#include +#include +#endif // defined(ETW_TRACE_LOGGING_SUPPORTED) +#endif // defined(_WIN32) + #include #include #include @@ -136,6 +155,17 @@ enum class DataType { USER = 1 ///< Contains potentially sensitive user data. }; +enum class ORTTraceLoggingKeyword : uint64_t { + Session = 0x1, // ORT Session TraceLoggingWrite + Logs = 0x2, // LOGS() Macro ORT logs. Pair with an appropriate level depending on detail required + Reserved1 = 0x4, // Reserved if we want to add some specific sub-categories instead of just LOGS() or other uses + Reserved2 = 0x8, + Reserved3 = 0x10, + Reserved4 = 0x20, + Reserved5 = 0x40, + Reserved6 = 0x80, + Profiling = 0x100 // Enables profiling. At higher levels >5 can impact inference performance +}; } // namespace logging // OnnxRuntime Types (these are the internal types) @@ -143,6 +173,13 @@ struct CPUIDInfo; namespace logging { struct Logger; struct Capture; +#ifdef ETW_TRACE_LOGGING_SUPPORTED +struct EtwRegistrationManager; +using EtwRegistrationManager_EtwInternalCallback = std::function; +#endif } // namespace logging struct ComputeCapability; struct ConfigOptions; @@ -157,10 +194,12 @@ struct KernelRegistry; struct Function; struct Graph; class GraphViewer; +struct ConstGraphNodes; enum class DataLayout; struct Model; struct Path; struct Node; +struct Node_EdgeEnd; struct NodeArg; struct NodeAttributes; struct NodeUnitIODef; @@ -215,6 +254,7 @@ using DeleteFunc = void (*)(void*); using NodeArgInfo = ONNX_NAMESPACE::ValueInfoProto; using NameMLValMap = std::unordered_map; + } // namespace onnxruntime #include "core/platform/threadpool.h" @@ -368,6 +408,28 @@ template <> constexpr ONNXTensorElementDataType GetONNXTensorElementDataType() { return ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT4; } + +inline std::vector> +CreateSupportedPartitions(const GraphViewer& graph_viewer, + const std::unordered_set& supported_nodes, + const std::unordered_set& stop_ops, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + const std::string& execution_provider_type, + const std::unordered_map* node_unit_map, + bool drop_constant_initializers = false) { + return g_host->Utils__CreateSupportedPartitions(graph_viewer, supported_nodes, stop_ops, generate_metadef_name, + execution_provider_name, execution_provider_type, node_unit_map, + drop_constant_initializers); +} +inline std::unique_ptr MakeComputeCapability(const GraphViewer& graph_viewer, + const std::vector& group, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + bool drop_constant_initializers) { + return g_host->Utils__MakeComputeCapability(graph_viewer, group, generate_metadef_name, + execution_provider_name, 
drop_constant_initializers); +} } // namespace utils namespace QDQ { @@ -381,6 +443,10 @@ GetAllNodeUnits(const GraphViewer* graph_viewer, const logging::Logger& logger) // So the C API (and C++) becomes available when ORT_API_MANUAL_INIT is used. void InitProviderOrtApi(); +// This is a replacement for Env::Default(). Returns a reference to the default ORT Environment. +inline Env& GetDefaultEnv() { + return g_host->Env__Default(); +} } // namespace onnxruntime #define CREATE_MESSAGE(logger, severity, category, datatype) \ diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index aa8c367d25d51..4c050534456da 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -505,6 +505,9 @@ Status UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& tensor, const st /*out*/ std::vector& unpacked_tensor) { return g_host->UnpackInitializerData(tensor, model_path, unpacked_tensor); } +Status UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& tensor, /*out*/ std::vector& unpacked_tensor) { + return g_host->UnpackInitializerData(tensor, std::filesystem::path(), unpacked_tensor); +} } // namespace utils @@ -788,5 +791,5 @@ std::string ToUTF8String(const std::wstring& s) { std::wstring ToWideString(const std::string& s) { return g_host->ToWideString(s); } -#endif +#endif // _WIN32 } // namespace onnxruntime diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 5a179ec622f8c..a4fcc313b19fe 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -120,11 +120,20 @@ struct Node__EdgeIterator { virtual bool operator!=(const Node__EdgeIterator& p) const = 0; virtual void operator++() = 0; + virtual const Node_EdgeEnd& operator*() const = 0; virtual const Node& GetNode() const = 0; virtual int GetSrcArgIndex() const = 0; virtual int GetDstArgIndex() const = 0; }; +struct ConstGraphNodes_Iterator { + virtual ~ConstGraphNodes_Iterator() {} + + virtual bool operator!=(const ConstGraphNodes_Iterator& other) const = 0; + virtual void operator++() = 0; + virtual const Node& operator*() = 0; +}; + // There are two ways to route a function, one is a virtual method and the other is a function pointer (or pointer to // member function). 
// The function pointers are nicer in that they directly call the target function, but they cannot be used in cases @@ -273,6 +282,7 @@ struct ProviderHost { // logging::Logger virtual bool logging__Logger__OutputIsEnabled(const logging::Logger* p, logging::Severity severity, logging::DataType data_type) = 0; + virtual logging::Severity logging__Logger__GetSeverity(const logging::Logger* p) = 0; // logging::LoggingManager virtual const logging::Logger& logging__LoggingManager__DefaultLogger() = 0; @@ -281,12 +291,26 @@ struct ProviderHost { virtual std::unique_ptr logging__Capture__construct(const logging::Logger& logger, logging::Severity severity, const char* category, logging::DataType dataType, const CodeLocation& location) = 0; virtual void logging__Capture__operator_delete(logging::Capture* p) noexcept = 0; virtual std::ostream& logging__Capture__Stream(logging::Capture* p) noexcept = 0; + virtual void logging__Capture__ProcessPrintf(logging::Capture* p, const char* format, va_list args) = 0; + +#if defined(ETW_TRACE_LOGGING_SUPPORTED) + // logging::EtwRegistrationManager + virtual logging::EtwRegistrationManager& logging__EtwRegistrationManager__Instance() = 0; + virtual logging::Severity logging__EtwRegistrationManager__MapLevelToSeverity(logging::EtwRegistrationManager* p) = 0; + virtual void logging__EtwRegistrationManager__RegisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) = 0; + virtual void logging__EtwRegistrationManager__UnregisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) = 0; +#endif // defined(ETW_TRACE_LOGGING_SUPPORTED) // Env virtual Env& Env__Default() = 0; // Utils::DataTypeUtils virtual const std::string* Utils__DataTypeUtils__ToType(const ONNX_NAMESPACE::TypeProto& type_proto) = 0; + virtual const std::string* Utils__DataTypeUtils__ToType(const std::string& type_str) = 0; // int64s virtual int int64s__size(const ONNX_NAMESPACE::int64s* p) = 0; @@ -328,6 +352,7 @@ struct ProviderHost { virtual bool TypeProto_Tensor__has_shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual const ONNX_NAMESPACE::TensorShapeProto& TypeProto_Tensor__shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual ONNX_NAMESPACE::TensorShapeProto* TypeProto_Tensor__mutable_shape(ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; + virtual bool TypeProto_Tensor__has_elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual int32_t TypeProto_Tensor__elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0; virtual void TypeProto_Tensor__set_elem_type(ONNX_NAMESPACE::TypeProto_Tensor* p, int32_t value) = 0; @@ -342,6 +367,7 @@ struct ProviderHost { // TypeProto virtual std::unique_ptr TypeProto__construct() = 0; virtual void TypeProto__CopyFrom(ONNX_NAMESPACE::TypeProto* p, const ONNX_NAMESPACE::TypeProto* other) = 0; + virtual bool TypeProto__has_tensor_type(const ONNX_NAMESPACE::TypeProto* p) = 0; virtual const ONNX_NAMESPACE::TypeProto_Tensor& TypeProto__tensor_type(const ONNX_NAMESPACE::TypeProto* p) = 0; virtual ONNX_NAMESPACE::TypeProto_Tensor* TypeProto__mutable_tensor_type(ONNX_NAMESPACE::TypeProto* p) = 0; @@ -462,6 +488,7 @@ struct ProviderHost { virtual bool TensorProto__has_raw_data(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual const std::string& TensorProto__raw_data(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual std::string* TensorProto__mutable_raw_data(ONNX_NAMESPACE::TensorProto* p) = 0; + 
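Reviewer note: the virtuals added in this header all follow the same provider-bridge pattern described in the comment above. A minimal standalone mock (not ORT code; the names are illustrative) of how one such method is routed from a plugin-side wrapped type through g_host:

    #include <cstdio>

    namespace demo {
    struct TensorProtoImpl;  // real type lives in the host; opaque to the plugin

    struct ProviderHost {
      virtual bool TensorProto__has_data_type(const TensorProtoImpl* p) = 0;
      virtual ~ProviderHost() = default;
    };

    ProviderHost* g_host = nullptr;  // set once when the provider library is loaded

    // The wrapper never touches data members, only forwards through g_host, so the
    // host's real class layout can change without breaking the plugin's ABI.
    struct TensorProto {
      bool has_data_type() const {
        return g_host->TensorProto__has_data_type(reinterpret_cast<const TensorProtoImpl*>(this));
      }
    };
    }  // namespace demo

This is why moving QNN out of the static build requires every Graph/Node/TensorProto accessor the EP touches to gain a `Xxx__Yyy` virtual here plus a forwarding wrapper in provider_wrappedtypes.h.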
virtual bool TensorProto__has_data_type(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual int32_t TensorProto__data_type(const ONNX_NAMESPACE::TensorProto* p) = 0; virtual void TensorProto__set_data_type(ONNX_NAMESPACE::TensorProto* p, int32_t type) = 0; virtual void TensorProto__CopyFrom(ONNX_NAMESPACE::TensorProto* p, const ONNX_NAMESPACE::TensorProto* other) = 0; @@ -495,6 +522,7 @@ struct ProviderHost { // TensorShapeProto_Dimensions virtual std::unique_ptr TensorShapeProto_Dimensions__begin(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0; virtual std::unique_ptr TensorShapeProto_Dimensions__end(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0; + virtual size_t TensorShapeProto_Dimensions__size(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0; // TensorShapeProto virtual int TensorShapeProto__dim_size(const ONNX_NAMESPACE::TensorShapeProto* p) = 0; @@ -823,6 +851,8 @@ struct ProviderHost { virtual const NodeAttributes& Node__GetAttributes(const Node* p) noexcept = 0; virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) = 0; + virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, const std::string& value) = 0; + virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, int64_t value) = 0; virtual size_t Node__GetInputEdgesCount(const Node* p) noexcept = 0; virtual size_t Node__GetOutputEdgesCount(const Node* p) noexcept = 0; @@ -842,6 +872,14 @@ struct ProviderHost { virtual const std::unordered_map>& Node__GetAttributeNameToMutableSubgraphMap(Node* p) = 0; virtual std::unordered_map> Node__GetAttributeNameToSubgraphMap(const Node* p) const = 0; + // Node_EdgeEnd + virtual std::unique_ptr Node_EdgeEnd__construct(const Node& node, int src_arg_index, int dst_arg_index) = 0; + virtual void Node_EdgeEnd__operator_delete(Node_EdgeEnd* p) noexcept = 0; + + virtual const Node& Node_EdgeEnd__GetNode(const Node_EdgeEnd* p) = 0; + virtual int Node_EdgeEnd__GetSrcArgIndex(const Node_EdgeEnd* p) = 0; + virtual int Node_EdgeEnd__GetDstArgIndex(const Node_EdgeEnd* p) = 0; + // NodeArg virtual const std::string& NodeArg__Name(const NodeArg* p) noexcept = 0; virtual const ONNX_NAMESPACE::TensorShapeProto* NodeArg__Shape(const NodeArg* p) = 0; @@ -872,6 +910,12 @@ struct ProviderHost { virtual void NodeAttributes__reserve(NodeAttributes* p, size_t size) = 0; // NodeUnit + virtual std::unique_ptr NodeUnit__construct(gsl::span dq_nodes, const Node& target_node, + gsl::span q_nodes, uint8_t unit_type, + gsl::span inputs, gsl::span outputs, + size_t input_edge_count, gsl::span output_edges) = 0; + virtual void NodeUnit__operator_delete(NodeUnit* p) noexcept = 0; + virtual int NodeUnit__UnitType(const NodeUnit* p) noexcept = 0; virtual const std::vector& NodeUnit__Inputs(const NodeUnit* p) noexcept = 0; @@ -897,10 +941,29 @@ struct ProviderHost { virtual std::pair>, std::unordered_map> QDQ__GetAllNodeUnits(const GraphViewer* graph_viewer, const logging::Logger& logger) = 0; + // Partitioning utils + virtual std::vector> + Utils__CreateSupportedPartitions(const GraphViewer& graph_viewer, + const std::unordered_set& supported_nodes, + const std::unordered_set& stop_ops, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + const std::string& execution_provider_type, + const std::unordered_map* node_unit_map, + bool drop_constant_initializers) = 0; + + virtual std::unique_ptr + Utils__MakeComputeCapability(const GraphViewer& graph_viewer, + const 
std::vector& group, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + bool drop_constant_initializers) = 0; // Model virtual std::unique_ptr Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) = 0; + virtual std::unique_ptr Model__construct(const std::string& graph_name, bool is_onnx_domain_only, + const logging::Logger& logger) = 0; virtual void Model__operator_delete(Model* p) = 0; virtual Graph& Model__MainGraph(Model* p) = 0; virtual std::unique_ptr Model__ToProto(Model* p) = 0; @@ -974,6 +1037,7 @@ struct ProviderHost { virtual const std::string& GraphViewer__Name(const GraphViewer* p) noexcept = 0; virtual const std::filesystem::path& GraphViewer__ModelPath(const GraphViewer* p) noexcept = 0; + virtual const ConstGraphNodes& GraphViewer__Nodes(const GraphViewer* p) noexcept = 0; virtual const Node* GraphViewer__GetNode(const GraphViewer* p, NodeIndex node_index) = 0; virtual const NodeArg* GraphViewer__GetNodeArg(const GraphViewer* p, const std::string& name) = 0; @@ -989,6 +1053,7 @@ struct ProviderHost { virtual const std::vector& GraphViewer__GetInputs(const GraphViewer* p) noexcept = 0; virtual const std::vector& GraphViewer__GetOutputs(const GraphViewer* p) noexcept = 0; + virtual bool GraphViewer__NodeProducesGraphOutput(const GraphViewer* p, const Node& node) = 0; virtual const std::unordered_set& GraphViewer__GetValueInfo(const GraphViewer* p) noexcept = 0; virtual const InitializedTensorSet& GraphViewer__GetAllInitializedTensors(const GraphViewer* p) = 0; @@ -1007,6 +1072,13 @@ struct ProviderHost { virtual const Node* GraphViewer__GetProducerNode(const GraphViewer* p, const std::string& node_arg_name) const = 0; virtual IOnnxRuntimeOpSchemaCollectionPtr GraphViewer__GetSchemaRegistry(const GraphViewer* p) const = 0; + // ConstGraphNodes + virtual std::unique_ptr ConstGraphNodes__begin(const ConstGraphNodes* p) = 0; + virtual std::unique_ptr ConstGraphNodes__end(const ConstGraphNodes* p) = 0; + virtual std::unique_ptr ConstGraphNodes__cbegin(const ConstGraphNodes* p) = 0; + virtual std::unique_ptr ConstGraphNodes__cend(const ConstGraphNodes* p) = 0; + virtual bool ConstGraphNodes__empty(const ConstGraphNodes* p) noexcept = 0; + // OpKernel virtual const Node& OpKernel__Node(const OpKernel* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 76b6d8063fd66..9dd0ab5f19c84 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -23,6 +23,9 @@ namespace logging { struct Logger final { bool OutputIsEnabled(Severity severity, DataType data_type) const noexcept { return g_host->logging__Logger__OutputIsEnabled(this, severity, data_type); } + Severity GetSeverity() const noexcept { + return g_host->logging__Logger__GetSeverity(this); + } PROVIDER_DISALLOW_ALL(Logger) }; @@ -39,11 +42,27 @@ struct Capture final { static void operator delete(void* p) { g_host->logging__Capture__operator_delete(reinterpret_cast(p)); } std::ostream& Stream() noexcept { return g_host->logging__Capture__Stream(this); } + void ProcessPrintf(const char* format, va_list args) { g_host->logging__Capture__ProcessPrintf(this, format, args); } Capture() = delete; Capture(const Capture&) = delete; void operator=(const Capture&) = delete; }; + 
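Reviewer note: a hedged sketch (not in this PR) of how a shared-library EP can use the EtwRegistrationManager wrapper declared just below; the function name, lambda body, and the Profiling-keyword check are illustrative only, loosely mirroring the QNN EP registration earlier in this diff.

    #if defined(_WIN32) && defined(ETW_TRACE_LOGGING_SUPPORTED)
    void RegisterEtwCallbackExample() {
      using onnxruntime::logging::EtwRegistrationManager;
      auto& etw_manager = EtwRegistrationManager::Instance();
      EtwRegistrationManager::EtwInternalCallback callback =
          [](LPCGUID /*source_id*/, ULONG is_enabled, UCHAR /*level*/,
             ULONGLONG match_any_keyword, ULONGLONG /*match_all_keyword*/,
             PEVENT_FILTER_DESCRIPTOR /*filter_data*/, PVOID /*context*/) {
            const auto profiling = static_cast<ULONGLONG>(
                onnxruntime::logging::ORTTraceLoggingKeyword::Profiling);
            if (is_enabled == EVENT_CONTROL_CODE_ENABLE_PROVIDER &&
                (match_any_keyword & profiling) != 0) {
              // e.g. raise the QNN profiling level while an ETW session is capturing
            }
          };
      etw_manager.RegisterInternalCallback(callback);
      // Pair with UnregisterInternalCallback(callback) at teardown, as the
      // QNN EP destructor does above.
    }
    #endif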
+#if defined(ETW_TRACE_LOGGING_SUPPORTED) +struct EtwRegistrationManager final { + using EtwInternalCallback = EtwRegistrationManager_EtwInternalCallback; + static EtwRegistrationManager& Instance() { return g_host->logging__EtwRegistrationManager__Instance(); } + Severity MapLevelToSeverity() { return g_host->logging__EtwRegistrationManager__MapLevelToSeverity(this); } + void RegisterInternalCallback(const EtwInternalCallback& callback) { + g_host->logging__EtwRegistrationManager__RegisterInternalCallback(this, callback); + } + void UnregisterInternalCallback(const EtwInternalCallback& callback) { + g_host->logging__EtwRegistrationManager__UnregisterInternalCallback(this, callback); + } +}; +#endif // defined(ETW_TRACE_LOGGING_SUPPORTED) + } // namespace logging } // namespace onnxruntime @@ -234,6 +253,7 @@ struct TensorProto final { const std::string& raw_data() const { return g_host->TensorProto__raw_data(this); } std::string* mutable_raw_data() { return g_host->TensorProto__mutable_raw_data(this); } + bool has_data_type() const { return g_host->TensorProto__has_data_type(this); } int32_t data_type() const { return g_host->TensorProto__data_type(this); } void set_data_type(int32_t type) { return g_host->TensorProto__set_data_type(this, type); } @@ -286,6 +306,7 @@ struct TensorShapeProto_Dimension final { struct TensorShapeProto_Dimensions final { IteratorHolder begin() const { return g_host->TensorShapeProto_Dimensions__begin(this); } IteratorHolder end() const { return g_host->TensorShapeProto_Dimensions__end(this); } + size_t size() const { return g_host->TensorShapeProto_Dimensions__size(this); } PROVIDER_DISALLOW_ALL(TensorShapeProto_Dimensions) }; @@ -305,6 +326,7 @@ struct TypeProto_Tensor final { bool has_shape() const { return g_host->TypeProto_Tensor__has_shape(this); } const TensorShapeProto& shape() const { return g_host->TypeProto_Tensor__shape(this); } TensorShapeProto* mutable_shape() { return g_host->TypeProto_Tensor__mutable_shape(this); } + bool has_elem_type() const { return g_host->TypeProto_Tensor__has_elem_type(this); } int32_t elem_type() const { return g_host->TypeProto_Tensor__elem_type(this); } void set_elem_type(int32_t value) { g_host->TypeProto_Tensor__set_elem_type(this, value); } @@ -339,6 +361,7 @@ struct TypeProto_Sequence final { struct TypeProto final { static std::unique_ptr Create() { return g_host->TypeProto__construct(); } + bool has_tensor_type() const { return g_host->TypeProto__has_tensor_type(this); } const TypeProto_Tensor& tensor_type() const { return g_host->TypeProto__tensor_type(this); } TypeProto_Tensor* mutable_tensor_type() { return g_host->TypeProto__mutable_tensor_type(this); } @@ -475,6 +498,7 @@ namespace Utils { struct DataTypeUtils final { static const std::string* ToType(const ONNX_NAMESPACE::TypeProto& type_proto) { return g_host->Utils__DataTypeUtils__ToType(type_proto); } + static const std::string* ToType(const std::string& type_str) { return g_host->Utils__DataTypeUtils__ToType(type_str); } PROVIDER_DISALLOW_ALL(DataTypeUtils) }; @@ -770,6 +794,21 @@ struct Function final { PROVIDER_DISALLOW_ALL(Function) }; +struct Node_EdgeEnd final { + static std::unique_ptr Create(const Node& node, int src_arg_index, int dst_arg_index) { + return g_host->Node_EdgeEnd__construct(node, src_arg_index, dst_arg_index); + } + static void operator delete(void* p) { g_host->Node_EdgeEnd__operator_delete(reinterpret_cast(p)); } + + const Node& GetNode() const { return g_host->Node_EdgeEnd__GetNode(this); } + int GetSrcArgIndex() const { return 
g_host->Node_EdgeEnd__GetSrcArgIndex(this); }
+  int GetDstArgIndex() const { return g_host->Node_EdgeEnd__GetDstArgIndex(this); }
+
+  Node_EdgeEnd() = delete;
+  Node_EdgeEnd(const Node_EdgeEnd&) = delete;
+  void operator=(const Node_EdgeEnd&) = delete;
+};
+
 struct Node final {
   enum class Type {
     Primitive = 0,
@@ -801,6 +840,12 @@ struct Node final {
   void AddAttribute(const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) {
     g_host->Node__AddAttribute(this, attr_name, value);
   }
+  void AddAttribute(const std::string& attr_name, const std::string& value) {
+    g_host->Node__AddAttribute(this, attr_name, value);
+  }
+  void AddAttribute(const std::string& attr_name, int64_t value) {
+    g_host->Node__AddAttribute(this, attr_name, value);
+  }
   size_t GetInputEdgesCount() const noexcept { return g_host->Node__GetInputEdgesCount(this); }
   size_t GetOutputEdgesCount() const noexcept { return g_host->Node__GetOutputEdgesCount(this); }
@@ -832,6 +877,7 @@ struct Node final {
     }
     void operator++() { impl_->operator++(); }
+    const Node_EdgeEnd& operator*() { return impl_->operator*(); }
     const Node__EdgeIterator* operator->() const { return impl_.get(); }
     std::unique_ptr<Node__EdgeIterator> impl_;
@@ -906,6 +952,15 @@ struct NodeUnit final {
     QDQGroup,  // The NodeUnit contains a QDQ group of nodes, such as "DQ->Sigmoid->Q"
   };
+  static std::unique_ptr<NodeUnit> Create(gsl::span<const Node* const> dq_nodes, const Node& target_node,
+                                          gsl::span<const Node* const> q_nodes, Type unit_type,
+                                          gsl::span<const NodeUnitIODef> inputs, gsl::span<const NodeUnitIODef> outputs,
+                                          size_t input_edge_count, gsl::span<const Node_EdgeEnd* const> output_edges) {
+    return g_host->NodeUnit__construct(dq_nodes, target_node, q_nodes, static_cast<uint8_t>(unit_type),
+                                       inputs, outputs, input_edge_count, output_edges);
+  }
+  static void operator delete(void* p) { g_host->NodeUnit__operator_delete(reinterpret_cast<NodeUnit*>(p)); }
+
   Type UnitType() const noexcept { return static_cast<Type>(g_host->NodeUnit__UnitType(this)); }
   const std::vector<NodeUnitIODef>& Inputs() const noexcept { return g_host->NodeUnit__Inputs(this); }
@@ -932,6 +987,10 @@
   // output. any Q nodes are hidden.
Node::EdgeConstIterator OutputEdgesBegin() const { return g_host->NodeUnit__OutputEdgesBegin(this); } Node::EdgeConstIterator OutputEdgesEnd() const { return g_host->NodeUnit__OutputEdgesEnd(this); } + + NodeUnit() = delete; + NodeUnit(const NodeUnit&) = delete; + void operator=(const NodeUnit& v) = delete; }; struct ModelSavingOptions; @@ -941,6 +1000,9 @@ struct Model final { const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) { return g_host->Model__construct(std::move(model_proto), model_path, local_registries, logger); } + static std::unique_ptr Create(const std::string& graph_name, bool is_onnx_domain_only, const logging::Logger& logger) { + return g_host->Model__construct(graph_name, is_onnx_domain_only, logger); + } static void operator delete(void* p) { g_host->Model__operator_delete(reinterpret_cast(p)); } static Status Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) { return g_host->Model__Load(file_path, model_proto); } @@ -1041,6 +1103,7 @@ class GraphViewer final { const std::string& Name() const noexcept { return g_host->GraphViewer__Name(this); } const std::filesystem::path& ModelPath() const noexcept { return g_host->GraphViewer__ModelPath(this); } + const ConstGraphNodes& Nodes() const noexcept { return g_host->GraphViewer__Nodes(this); } const Node* GetNode(NodeIndex node_index) const { return g_host->GraphViewer__GetNode(this, node_index); } const NodeArg* GetNodeArg(const std::string& name) const { return g_host->GraphViewer__GetNodeArg(this, name); } @@ -1058,6 +1121,9 @@ class GraphViewer final { const std::vector& GetInputs() const noexcept { return g_host->GraphViewer__GetInputs(this); } const std::vector& GetOutputs() const noexcept { return g_host->GraphViewer__GetOutputs(this); } + bool NodeProducesGraphOutput(const Node& node) const { + return g_host->GraphViewer__NodeProducesGraphOutput(this, node); + } const std::unordered_set& GetValueInfo() const noexcept { return g_host->GraphViewer__GetValueInfo(this); } const InitializedTensorSet& GetAllInitializedTensors() const noexcept { return g_host->GraphViewer__GetAllInitializedTensors(this); } @@ -1085,6 +1151,25 @@ class GraphViewer final { void operator=(const GraphViewer&) = delete; }; +struct ConstGraphNodes final { + IteratorHolder begin() const { + return g_host->ConstGraphNodes__begin(this); + } + IteratorHolder end() const { + return g_host->ConstGraphNodes__end(this); + } + IteratorHolder cbegin() const { + return g_host->ConstGraphNodes__cbegin(this); + } + IteratorHolder cend() const { + return g_host->ConstGraphNodes__cend(this); + } + + bool empty() const noexcept { return g_host->ConstGraphNodes__empty(this); } + + PROVIDER_DISALLOW_ALL(ConstGraphNodes) +}; + struct OpKernelContext final { template const T& RequiredInput(int index) const; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index af39edae2074d..518304e5c7fa9 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -37,7 +37,6 @@ #include "core/framework/model_metadef_id_generator.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" #include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" -#include "core/session/onnxruntime_session_options_config_keys.h" #include "core/session/onnxruntime_c_api.h" #include "core/common/string_helper.h" @@ -62,6 +61,10 @@ #include 
"orttraining/core/framework/distributed_run_context.h" #endif +#ifdef _WIN32 +#include "core/platform/windows/logging/etw_sink.h" +#endif + namespace ONNX_NAMESPACE { // We use these names in the provider API because we don't have the protobuf definitions of the RepeatedField* types using int64s = google::protobuf::RepeatedField; @@ -76,11 +79,18 @@ using FunctionProtos = google::protobuf::RepeatedPtrField; namespace onnxruntime { using IndexedSubGraph_MetaDef = IndexedSubGraph::MetaDef; using IndexedSubGraph_SourceOfSchema = IndexedSubGraph::SourceOfSchema; +using Node_EdgeEnd = Node::EdgeEnd; +#ifdef ETW_TRACE_LOGGING_SUPPORTED +namespace logging { +using EtwRegistrationManager_EtwInternalCallback = EtwRegistrationManager::EtwInternalCallback; +} +#endif } // namespace onnxruntime #include "core/common/cpuid_info.h" #include "core/common/logging/logging.h" #include "core/providers/shared_library/provider_interfaces.h" +#include "core/providers/partitioning_utils.h" #include "core/providers/cuda/cuda_provider_factory_creator.h" #include "core/providers/cann/cann_provider_factory_creator.h" @@ -90,6 +100,7 @@ using IndexedSubGraph_SourceOfSchema = IndexedSubGraph::SourceOfSchema; #include "core/providers/openvino/openvino_provider_factory_creator.h" #include "core/providers/tensorrt/tensorrt_provider_factory_creator.h" #include "core/providers/vitisai/vitisai_provider_factory_creator.h" +#include "core/providers/qnn/qnn_provider_factory_creator.h" #include "core/providers/cuda/cuda_provider_factory.h" #include "core/providers/cann/cann_provider_factory.h" @@ -181,6 +192,7 @@ struct Node__EdgeIterator_Impl : Node__EdgeIterator { bool operator!=(const Node__EdgeIterator& p) const override { return v_ != static_cast(&p)->v_; } void operator++() override { v_.operator++(); } + const Node_EdgeEnd& operator*() const override { return v_.operator*(); } const Node& GetNode() const override { return v_->GetNode(); } int GetSrcArgIndex() const override { return v_->GetSrcArgIndex(); } int GetDstArgIndex() const override { return v_->GetDstArgIndex(); } @@ -188,6 +200,18 @@ struct Node__EdgeIterator_Impl : Node__EdgeIterator { Node::EdgeConstIterator v_; }; +struct ConstGraphNodes_Iterator_Impl : ConstGraphNodes_Iterator { + ConstGraphNodes_Iterator_Impl(ConstGraphNodes::ConstNodeIterator&& v) : v_{std::move(v)} {} + + bool operator!=(const ConstGraphNodes_Iterator& other) const override { + return v_ != static_cast(&other)->v_; + } + void operator++() override { v_.operator++(); } + const Node& operator*() override { return *v_; } + + ConstGraphNodes::ConstNodeIterator v_; +}; + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) common::Status LoadDynamicLibraryFromProvider(onnxruntime::PathString library_name) { const auto& platform_env = onnxruntime::Env::Default(); @@ -367,6 +391,9 @@ struct ProviderHostImpl : ProviderHost { // logging::Logger (wrapped) bool logging__Logger__OutputIsEnabled(const logging::Logger* p, logging::Severity severity, logging::DataType data_type) override { return p->OutputIsEnabled(severity, data_type); } + logging::Severity logging__Logger__GetSeverity(const logging::Logger* p) override { + return p->GetSeverity(); + } // logging::LoggingManager (wrapped) const logging::Logger& logging__LoggingManager__DefaultLogger() override { return logging::LoggingManager::DefaultLogger(); } @@ -377,12 +404,38 @@ struct ProviderHostImpl : ProviderHost { } void logging__Capture__operator_delete(logging::Capture* p) noexcept override { delete p; } std::ostream& 
logging__Capture__Stream(logging::Capture* p) noexcept override { return p->Stream(); } + void logging__Capture__ProcessPrintf(logging::Capture* p, const char* format, va_list args) override { + p->ProcessPrintf(format, args); + } + +#if defined(ETW_TRACE_LOGGING_SUPPORTED) + // logging::EtwRegistrationManager + logging::EtwRegistrationManager& logging__EtwRegistrationManager__Instance() override { + return logging::EtwRegistrationManager::Instance(); + } + logging::Severity logging__EtwRegistrationManager__MapLevelToSeverity(logging::EtwRegistrationManager* p) override { + return p->MapLevelToSeverity(); + } + void logging__EtwRegistrationManager__RegisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) override { + p->RegisterInternalCallback(callback); + } + void logging__EtwRegistrationManager__UnregisterInternalCallback( + logging::EtwRegistrationManager* p, + const logging::EtwRegistrationManager_EtwInternalCallback& callback) override { + p->UnregisterInternalCallback(callback); + } +#endif // defined(ETW_TRACE_LOGGING_SUPPORTED) // Env Env& Env__Default() override { return Env::Default(); } // Utils::DataTypeUtils (wrapped) const std::string* Utils__DataTypeUtils__ToType(const ONNX_NAMESPACE::TypeProto& type_proto) override { return ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(type_proto); } + const std::string* Utils__DataTypeUtils__ToType(const std::string& type_str) override { + return ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(type_str); + } // int64s (wrapped) int int64s__size(const ONNX_NAMESPACE::int64s* p) override { return p->size(); } @@ -424,6 +477,7 @@ struct ProviderHostImpl : ProviderHost { bool TypeProto_Tensor__has_shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->has_shape(); } const ONNX_NAMESPACE::TensorShapeProto& TypeProto_Tensor__shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->shape(); } ONNX_NAMESPACE::TensorShapeProto* TypeProto_Tensor__mutable_shape(ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->mutable_shape(); } + bool TypeProto_Tensor__has_elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->has_elem_type(); } int32_t TypeProto_Tensor__elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->elem_type(); } void TypeProto_Tensor__set_elem_type(ONNX_NAMESPACE::TypeProto_Tensor* p, int32_t value) override { p->set_elem_type(value); }; @@ -444,6 +498,7 @@ struct ProviderHostImpl : ProviderHost { // TypeProto (wrapped) std::unique_ptr TypeProto__construct() override { return std::make_unique(); } void TypeProto__CopyFrom(ONNX_NAMESPACE::TypeProto* p, const ONNX_NAMESPACE::TypeProto* other) override { p->CopyFrom(*other); } + bool TypeProto__has_tensor_type(const ONNX_NAMESPACE::TypeProto* p) override { return p->has_tensor_type(); } const ONNX_NAMESPACE::TypeProto_Tensor& TypeProto__tensor_type(const ONNX_NAMESPACE::TypeProto* p) override { return p->tensor_type(); } ONNX_NAMESPACE::TypeProto_Tensor* TypeProto__mutable_tensor_type(ONNX_NAMESPACE::TypeProto* p) override { return p->mutable_tensor_type(); } int TypeProto__value_case(const ONNX_NAMESPACE::TypeProto* p) override { return p->value_case(); } @@ -572,6 +627,7 @@ struct ProviderHostImpl : ProviderHost { const std::string& TensorProto__raw_data(const ONNX_NAMESPACE::TensorProto* p) override { return p->raw_data(); } std::string* TensorProto__mutable_raw_data(ONNX_NAMESPACE::TensorProto* p) override { return p->mutable_raw_data(); } + 
bool TensorProto__has_data_type(const ONNX_NAMESPACE::TensorProto* p) override { return p->has_data_type(); } int32_t TensorProto__data_type(const ONNX_NAMESPACE::TensorProto* p) override { return p->data_type(); } void TensorProto__set_data_type(ONNX_NAMESPACE::TensorProto* p, int32_t type) override { p->set_data_type(type); } @@ -610,6 +666,10 @@ struct ProviderHostImpl : ProviderHost { return std::make_unique(p->end()); } + size_t TensorShapeProto_Dimensions__size(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) override { + return p->size(); + } + // TensorShapeProto (wrapped) int TensorShapeProto__dim_size(const ONNX_NAMESPACE::TensorShapeProto* p) override { return p->dim_size(); } const ONNX_NAMESPACE::TensorShapeProto_Dimensions& TensorShapeProto__dim(const ONNX_NAMESPACE::TensorShapeProto* p) override { return p->dim(); } @@ -960,6 +1020,12 @@ struct ProviderHostImpl : ProviderHost { void Node__AddAttribute(Node* p, const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) override { p->AddAttribute(attr_name, value); } + void Node__AddAttribute(Node* p, const ::std::string& attr_name, const std::string& value) override { + p->AddAttribute(attr_name, value); + } + void Node__AddAttribute(Node* p, const ::std::string& attr_name, int64_t value) override { + p->AddAttribute(attr_name, value); + } size_t Node__GetInputEdgesCount(const Node* p) noexcept override { return p->GetInputEdgesCount(); } size_t Node__GetOutputEdgesCount(const Node* p) noexcept override { return p->GetOutputEdgesCount(); } @@ -982,6 +1048,16 @@ struct ProviderHostImpl : ProviderHost { std::unordered_map> Node__GetAttributeNameToSubgraphMap(const Node* p) const override { return p->GetAttributeNameToSubgraphMap(); } int Node__NodeType(const Node* p) const noexcept override { return int(p->NodeType()); } + // Node_EdgeEnd (wrapped). Maps to Node::EdgeEnd struct. 
+ std::unique_ptr Node_EdgeEnd__construct(const Node& node, int src_arg_index, int dst_arg_index) override { + return std::make_unique(node, src_arg_index, dst_arg_index); + } + void Node_EdgeEnd__operator_delete(Node_EdgeEnd* p) noexcept override { delete p; } + + const Node& Node_EdgeEnd__GetNode(const Node_EdgeEnd* p) override { return p->GetNode(); } + int Node_EdgeEnd__GetSrcArgIndex(const Node_EdgeEnd* p) override { return p->GetSrcArgIndex(); } + int Node_EdgeEnd__GetDstArgIndex(const Node_EdgeEnd* p) override { return p->GetDstArgIndex(); } + // NodeArg (wrapped) const std::string& NodeArg__Name(const NodeArg* p) noexcept override { return p->Name(); } const ONNX_NAMESPACE::TensorShapeProto* NodeArg__Shape(const NodeArg* p) override { return p->Shape(); } @@ -1017,6 +1093,24 @@ struct ProviderHostImpl : ProviderHost { void NodeAttributes__reserve(NodeAttributes* p, size_t size) override { p->reserve(size); } // NodeUnit (wrapped) + std::unique_ptr NodeUnit__construct(gsl::span dq_nodes, + const Node& target_node, + gsl::span q_nodes, + uint8_t unit_type, + gsl::span inputs, + gsl::span outputs, + size_t input_edge_count, + gsl::span output_edges) override { + Node::EdgeSet output_edge_set; + for (const Node_EdgeEnd* edge_end : output_edges) { + output_edge_set.insert(*edge_end); + } + + return std::make_unique(dq_nodes, target_node, q_nodes, static_cast(unit_type), + inputs, outputs, input_edge_count, output_edge_set); + } + void NodeUnit__operator_delete(NodeUnit* p) noexcept override { delete p; } + int NodeUnit__UnitType(const NodeUnit* p) noexcept override { return static_cast(p->UnitType()); } const std::vector& NodeUnit__Inputs(const NodeUnit* p) noexcept override { @@ -1064,12 +1158,46 @@ struct ProviderHostImpl : ProviderHost { return QDQ::GetAllNodeUnits(*graph_viewer, logger); } + // Partitioning utils + std::vector> + Utils__CreateSupportedPartitions(const GraphViewer& graph_viewer, + const std::unordered_set& supported_nodes, + const std::unordered_set& stop_ops, + const utils::GenerateMetadefNameFn& generate_metadef_name, + const std::string& execution_provider_name, + const std::string& execution_provider_type, + const std::unordered_map* node_unit_map, + bool drop_constant_initializers) override { + return onnxruntime::utils::CreateSupportedPartitions(graph_viewer, + supported_nodes, + stop_ops, + generate_metadef_name, + execution_provider_name, + execution_provider_type, + node_unit_map, + drop_constant_initializers); + } + + std::unique_ptr + Utils__MakeComputeCapability(const GraphViewer& graph_viewer, + const std::vector& group, + const std::function& generate_metadef_name, + const std::string& execution_provider_name, + bool drop_constant_initializers) override { + return onnxruntime::utils::MakeComputeCapability(graph_viewer, group, generate_metadef_name, + execution_provider_name, drop_constant_initializers); + } + // Model (wrapped) std::unique_ptr Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) override { return std::make_unique(model_proto, model_path, local_registries, logger); } + std::unique_ptr Model__construct(const std::string& graph_name, bool is_onnx_domain_only, + const logging::Logger& logger) override { + return std::make_unique(graph_name, is_onnx_domain_only, logger); + } void Model__operator_delete(Model* p) override { delete p; } Graph& Model__MainGraph(Model* p) override { return p->MainGraph(); } std::unique_ptr 
Model__ToProto(Model* p) override { return std::make_unique<ONNX_NAMESPACE::ModelProto>(p->ToProto()); }
@@ -1179,6 +1307,7 @@ struct ProviderHostImpl : ProviderHost {
   const std::string& GraphViewer__Name(const GraphViewer* p) noexcept override { return p->Name(); }
   const std::filesystem::path& GraphViewer__ModelPath(const GraphViewer* p) noexcept override { return p->ModelPath(); }
+  const ConstGraphNodes& GraphViewer__Nodes(const GraphViewer* p) noexcept override { return p->Nodes(); }
   const Node* GraphViewer__GetNode(const GraphViewer* p, NodeIndex node_index) override { return p->GetNode(node_index); }
   const NodeArg* GraphViewer__GetNodeArg(const GraphViewer* p, const std::string& name) override { return p->GetNodeArg(name); }
@@ -1196,6 +1325,9 @@ struct ProviderHostImpl : ProviderHost {
   const std::vector<const NodeArg*>& GraphViewer__GetInputs(const GraphViewer* p) noexcept override { return p->GetInputs(); }
   const std::vector<const NodeArg*>& GraphViewer__GetOutputs(const GraphViewer* p) noexcept override { return p->GetOutputs(); }
+  bool GraphViewer__NodeProducesGraphOutput(const GraphViewer* p, const Node& node) override {
+    return p->NodeProducesGraphOutput(node);
+  }
   const std::unordered_set<const NodeArg*>& GraphViewer__GetValueInfo(const GraphViewer* p) noexcept override { return p->GetValueInfo(); }
   const InitializedTensorSet& GraphViewer__GetAllInitializedTensors(const GraphViewer* p) override { return p->GetAllInitializedTensors(); }
@@ -1224,6 +1356,21 @@ struct ProviderHostImpl : ProviderHost {
   const Node* GraphViewer__GetProducerNode(const GraphViewer* p, const std::string& node_arg_name) const override { return p->GetProducerNode(node_arg_name); }
   IOnnxRuntimeOpSchemaCollectionPtr GraphViewer__GetSchemaRegistry(const GraphViewer* p) const override { return p->GetSchemaRegistry(); }
+  // ConstGraphNodes
+  std::unique_ptr<ConstGraphNodes_Iterator> ConstGraphNodes__begin(const ConstGraphNodes* p) override {
+    return std::make_unique<ConstGraphNodes_Iterator_Impl>(p->begin());
+  }
+  std::unique_ptr<ConstGraphNodes_Iterator> ConstGraphNodes__end(const ConstGraphNodes* p) override {
+    return std::make_unique<ConstGraphNodes_Iterator_Impl>(p->end());
+  }
+  std::unique_ptr<ConstGraphNodes_Iterator> ConstGraphNodes__cbegin(const ConstGraphNodes* p) override {
+    return std::make_unique<ConstGraphNodes_Iterator_Impl>(p->cbegin());
+  }
+  std::unique_ptr<ConstGraphNodes_Iterator> ConstGraphNodes__cend(const ConstGraphNodes* p) override {
+    return std::make_unique<ConstGraphNodes_Iterator_Impl>(p->cend());
+  }
+  bool ConstGraphNodes__empty(const ConstGraphNodes* p) noexcept override { return p->empty(); }
+
   // OpKernel (direct)
   const Node& OpKernel__Node(const OpKernel* p) override { return p->OpKernel::Node(); }
@@ -1650,6 +1797,12 @@ static ProviderLibrary s_library_tensorrt(LIBRARY_PREFIX ORT_TSTR("onnxruntime_p
 #endif
 );
 static ProviderLibrary s_library_migraphx(LIBRARY_PREFIX ORT_TSTR("onnxruntime_providers_migraphx") LIBRARY_EXTENSION);
+static ProviderLibrary s_library_qnn(LIBRARY_PREFIX ORT_TSTR("onnxruntime_providers_qnn") LIBRARY_EXTENSION
+#ifndef _WIN32
+                                     ,
+                                     false /* unload - On Linux if we unload the QNN shared provider we crash */
+#endif
+);
 void UnloadSharedProviders() {
   s_library_dnnl.Unload();
@@ -1662,6 +1815,7 @@ void UnloadSharedProviders() {
   s_library_rocm.Unload();
   s_library_shared.Unload();
   s_library_migraphx.Unload();
+  s_library_qnn.Unload();
 }
 // Used by test code
@@ -1832,6 +1986,18 @@ ProviderOptions OrtOpenVINOProviderOptionsToOrtOpenVINOProviderOptionsV2(const O
   return ov_options_converted_map;
 }
+std::shared_ptr<IExecutionProviderFactory> QNNProviderFactoryCreator::Create(const ProviderOptions& provider_options_map,
+                                                                             const SessionOptions* session_options) {
+  const ConfigOptions* config_options = nullptr;
+  if (session_options != nullptr) {
+    config_options = &session_options->config_options;
+  }
+
+  std::array<const void*, 2> configs_array = {&provider_options_map, config_options};
+  const void* arg = reinterpret_cast<const void*>(&configs_array);
+  return s_library_qnn.Get().CreateExecutionProviderFactory(arg);
+}
+
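`QNNProviderFactoryCreator::Create` above threads two unrelated structures through the provider bridge's single `const void*` argument. A sketch of the matching unpack on the provider-DLL side, using stub types (the real QNN factory performs the equivalent casts; the names here are stand-ins):

```cpp
#include <array>

struct ProviderOptionsStub {};  // stand-in for onnxruntime::ProviderOptions
struct ConfigOptionsStub {};    // stand-in for onnxruntime::ConfigOptions

// Mirrors the packing order used above: slot 0 holds the provider options map,
// slot 1 holds the (possibly null) session ConfigOptions.
void UnpackFactoryArg(const void* arg) {
  const auto& configs = *static_cast<const std::array<const void*, 2>*>(arg);
  const auto* provider_options = static_cast<const ProviderOptionsStub*>(configs[0]);
  const auto* config_options = static_cast<const ConfigOptionsStub*>(configs[1]);
  (void)provider_options;
  (void)config_options;  // null when Create() was called without SessionOptions
}
```

Note the lifetime constraint this design imposes: `configs_array` lives on the caller's stack, so the factory on the other side of the boundary must consume both pointers before `Create` returns.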
 std::shared_ptr<IExecutionProviderFactory> OpenVINOProviderFactoryCreator::Create(
     const ProviderOptions* provider_options_map, const SessionOptions* session_options) {
   // Append session options applicable for EP to EP Provider options.
diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
index a3f0ed55b83f2..38fde332ca992 100644
--- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
+++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
@@ -7,7 +7,6 @@
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/session/onnxruntime_session_options_config_keys.h"
 #include "core/session/inference_session.h"
-#include "core/providers/shared/utils/utils.h"
 #include "test/providers/qnn/qnn_test_utils.h"
@@ -25,6 +24,24 @@ namespace test {
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+static int64_t GetNodeAttr(const Node& node, const std::string& attr_name, int64_t default_val) {
+  const auto& attributes = node.GetAttributes();
+  if (auto entry = attributes.find(attr_name); entry != attributes.end()) {
+    return entry->second.i();
+  }
+
+  return default_val;
+}
+
+static const std::string& GetNodeAttr(const Node& node, const std::string& attr_name, const std::string& default_val) {
+  const auto& attributes = node.GetAttributes();
+  if (auto entry = attributes.find(attr_name); entry != attributes.end()) {
+    return entry->second.s();
+  }
+
+  return default_val;
+}
+
 // Create a model with FusedMatMul + Add (quantized)
 // input1 -> Add -> Q -> DQ \
 //                           FusedMatMul -> Q -> DQ -> output
@@ -873,10 +890,9 @@ static void GetLastContextBinaryFileName(const std::string last_onnx_ctx_file,
   auto& ctx_graph = ctx_model->MainGraph();
   for (auto& node : ctx_graph.Nodes()) {
     if (node.OpType() == "EPContext") {
-      NodeAttrHelper node_helper(node);
-      int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0));
+      int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0));
       if (1 == is_main_context) {
-        last_ctx_bin_file = node_helper.Get("ep_cache_context", "");
+        last_ctx_bin_file = GetNodeAttr(node, "ep_cache_context", "");
         return;
       }
     }
@@ -899,10 +915,9 @@ static void UpdateEpContextModel(const std::vector<std::string>& ep_ctx_files,
   for (auto& node : ctx_graph.Nodes()) {
     if (node.OpType() == "EPContext") {
-      NodeAttrHelper node_helper(node);
-      int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0));
+      int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0));
       if (1 == is_main_context) {
-        std::string old_qnn_ctx_binary_file_name = node_helper.Get("ep_cache_context", "");
+        std::string old_qnn_ctx_binary_file_name = GetNodeAttr(node, "ep_cache_context", "");
         auto file_path = path.replace_filename(old_qnn_ctx_binary_file_name);
         std::remove(file_path.string().c_str());
         node.ClearAttribute("ep_cache_context");
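With `core/providers/shared/utils/utils.h` (and its `NodeAttrHelper`) no longer linkable once the QNN EP moves out of the core library, the test above grows local `GetNodeAttr` overloads; the same pair reappears in `qnn_ctx_gen/main.cc` below. The lookup could be written once as a template; a sketch, not part of the patch, generic over the attribute container:

```cpp
#include <string>

// Works with any map-like attribute container such as onnxruntime::NodeAttributes;
// the projection selects .i(), .s(), etc. from the found AttributeProto.
template <typename AttrMap, typename T, typename Getter>
T GetNodeAttrOr(const AttrMap& attributes, const std::string& attr_name, T default_val, Getter getter) {
  if (auto entry = attributes.find(attr_name); entry != attributes.end()) {
    return getter(entry->second);
  }
  return default_val;
}

// Hypothetical usage mirroring the overloads above:
//   GetNodeAttrOr(node.GetAttributes(), "main_context", int64_t{0},
//                 [](const auto& attr) { return attr.i(); });
```

Test-local duplication is a deliberate trade-off here: it keeps the test binaries independent of the shared-utils target the patch is untangling.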
diff --git a/onnxruntime/test/qnn_ctx_gen/main.cc b/onnxruntime/test/qnn_ctx_gen/main.cc
index 3be0bd253c8a4..bb5007b40b072 100644
--- a/onnxruntime/test/qnn_ctx_gen/main.cc
+++ b/onnxruntime/test/qnn_ctx_gen/main.cc
@@ -16,7 +16,6 @@
 #include "core/common/logging/sinks/clog_sink.h"
 #include "core/graph/model.h"
-#include "core/providers/shared/utils/utils.h"
 #include "core/session/environment.h"
 #include "core/common/logging/logging.h"
@@ -31,6 +30,24 @@ static void CheckStatus(const Status& status) {
   }
 }
+static int64_t GetNodeAttr(const Node& node, const std::string& attr_name, int64_t default_val) {
+  const auto& attributes = node.GetAttributes();
+  if (auto entry = attributes.find(attr_name); entry != attributes.end()) {
+    return entry->second.i();
+  }
+
+  return default_val;
+}
+
+static const std::string& GetNodeAttr(const Node& node, const std::string& attr_name, const std::string& default_val) {
+  const auto& attributes = node.GetAttributes();
+  if (auto entry = attributes.find(attr_name); entry != attributes.end()) {
+    return entry->second.s();
+  }
+
+  return default_val;
+}
+
 // from the last context cache Onnx model, find the EPContext node with main_context=1,
 // and get the QNN context binary file name, this context binary contains all graphs from all Onnx models
 // get the max spill fill buffer size
@@ -44,11 +61,10 @@ static void GetLastContextBinaryFileName(const std::basic_string<ORTCHAR_T> last
   auto& ctx_graph = ctx_model->MainGraph();
   for (auto& node : ctx_graph.Nodes()) {
     if (node.OpType() == "EPContext") {
-      NodeAttrHelper node_helper(node);
-      int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0));
-      max_size = node_helper.Get("max_size", static_cast<int64_t>(0));
+      int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0));
+      max_size = GetNodeAttr(node, "max_size", static_cast<int64_t>(0));
       if (1 == is_main_context) {
-        last_ctx_bin_file = node_helper.Get("ep_cache_context", "");
+        last_ctx_bin_file = GetNodeAttr(node, "ep_cache_context", "");
         return;
       }
     }
@@ -72,10 +88,9 @@ static void UpdateEpContextModel(const std::vector<std::basic_string<ORTCHAR_T>>
   for (auto& node : ctx_graph.Nodes()) {
     if (node.OpType() == "EPContext") {
-      NodeAttrHelper node_helper(node);
-      int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0));
+      int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0));
       if (1 == is_main_context) {
-        std::string old_qnn_ctx_binary_file_name = node_helper.Get("ep_cache_context", "");
+        std::string old_qnn_ctx_binary_file_name = GetNodeAttr(node, "ep_cache_context", "");
         auto file_path = path.replace_filename(old_qnn_ctx_binary_file_name);
         std::remove(file_path.string().c_str());
         node.ClearAttribute("ep_cache_context");
diff --git a/setup.py b/setup.py
index c1580eeb9e8f9..5c464eec537ec 100644
--- a/setup.py
+++ b/setup.py
@@ -311,17 +311,20 @@ def finalize_options(self):
 providers_tensorrt_or_migraphx = "onnxruntime_providers_" + ("migraphx" if is_migraphx else "tensorrt")
 providers_openvino = "onnxruntime_providers_openvino"
 providers_cann = "onnxruntime_providers_cann"
+providers_qnn = "onnxruntime_providers_qnn"
 if platform.system() == "Linux":
     providers_cuda_or_rocm = "lib" + providers_cuda_or_rocm + ".so"
     providers_tensorrt_or_migraphx = "lib" + providers_tensorrt_or_migraphx + ".so"
     providers_openvino = "lib" + providers_openvino + ".so"
     providers_cann = "lib" + providers_cann + ".so"
+    providers_qnn = "lib" + providers_qnn + ".so"
 elif platform.system() == "Windows":
     providers_cuda_or_rocm = providers_cuda_or_rocm + ".dll"
     providers_tensorrt_or_migraphx = providers_tensorrt_or_migraphx + ".dll"
     providers_openvino = providers_openvino + ".dll"
     providers_cann = providers_cann + ".dll"
+    providers_qnn = providers_qnn + ".dll"
 # Additional binaries
 dl_libs = []
@@ -341,8 +344,9 @@ def finalize_options(self):
     dl_libs.append(providers_cuda_or_rocm)
     dl_libs.append(providers_tensorrt_or_migraphx)
     dl_libs.append(providers_cann)
+    dl_libs.append(providers_qnn)
dl_libs.append("libonnxruntime.so*") - # DNNL, TensorRT & OpenVINO EPs are built as shared libs + # DNNL, TensorRT, OpenVINO, and QNN EPs are built as shared libs libs.extend(["libonnxruntime_providers_shared.so"]) libs.extend(["libonnxruntime_providers_dnnl.so"]) libs.extend(["libonnxruntime_providers_openvino.so"]) @@ -350,6 +354,7 @@ def finalize_options(self): libs.append(providers_cuda_or_rocm) libs.append(providers_tensorrt_or_migraphx) libs.append(providers_cann) + libs.append(providers_qnn) # QNN qnn_deps = [ "libQnnCpu.so", @@ -388,13 +393,14 @@ def finalize_options(self): providers_cann, "onnxruntime.dll", ] - # DNNL, TensorRT & OpenVINO EPs are built as shared libs + # DNNL, TensorRT, OpenVINO, and QNN EPs are built as shared libs libs.extend(["onnxruntime_providers_shared.dll"]) libs.extend(["onnxruntime_providers_dnnl.dll"]) libs.extend(["onnxruntime_providers_tensorrt.dll"]) libs.extend(["onnxruntime_providers_openvino.dll"]) libs.extend(["onnxruntime_providers_cuda.dll"]) libs.extend(["onnxruntime_providers_vitisai.dll"]) + libs.extend(["onnxruntime_providers_qnn.dll"]) # DirectML Libs libs.extend(["DirectML.dll"]) # QNN V68/V73 dependencies diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index 1b34b3d302e57..7294e0548e1b2 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -129,7 +129,10 @@ def _build_aar(args): # to jnilibs/[abi] for later compiling the aar package abi_jnilibs_dir = os.path.join(jnilibs_dir, abi) os.makedirs(abi_jnilibs_dir, exist_ok=True) - for lib_name in ["libonnxruntime.so", "libonnxruntime4j_jni.so"]: + sym_link_libs = ["libonnxruntime.so", "libonnxruntime4j_jni.so"] + if qnn_android_build: + sym_link_libs.extend(["libonnxruntime_providers_shared.so", "libonnxruntime_providers_qnn.so"]) + for lib_name in sym_link_libs: target_lib_name = os.path.join(abi_jnilibs_dir, lib_name) # If the symbolic already exists, delete it first # For some reason, os.path.exists will return false for a symbolic link in Linux, @@ -141,7 +144,10 @@ def _build_aar(args): # copy executables for each abi, in case we want to publish those as well # some of them might not exist, e.g., if we skip building the tests abi_exe_dir = os.path.join(exe_dir, abi) - for exe_name in ["libonnxruntime.so", "onnxruntime_perf_test", "onnx_test_runner"]: + execs_to_copy = ["libonnxruntime.so", "onnxruntime_perf_test", "onnx_test_runner"] + if qnn_android_build: + execs_to_copy.extend(["libonnxruntime_providers_shared.so", "libonnxruntime_providers_qnn.so"]) + for exe_name in execs_to_copy: src_exe_path = os.path.join(abi_build_dir, build_config, exe_name) if not os.path.exists(src_exe_path): continue diff --git a/tools/ci_build/github/android/default_qnn_aar_build_settings.json b/tools/ci_build/github/android/default_qnn_aar_build_settings.json index 599c108f830e7..66ae7d25153f0 100644 --- a/tools/ci_build/github/android/default_qnn_aar_build_settings.json +++ b/tools/ci_build/github/android/default_qnn_aar_build_settings.json @@ -10,6 +10,7 @@ "--parallel", "--cmake_generator=Ninja", "--build_java", + "--android_cpp_shared", "--build_shared_lib", "--use_qnn", "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF", diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index c3dbee336b69d..d080f68ca292f 100644 --- 
a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -72,6 +72,7 @@ jobs: --android_abi=x86_64 \ --android_api=31 \ --parallel \ + --build_shared_lib \ --use_qnn \ --qnn_home $(QnnSDKRootDir) \ --cmake_generator=Ninja \ diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml index d3826d90f9073..78bd2e20a4763 100644 --- a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml @@ -65,6 +65,7 @@ jobs: --config Release \ --use_binskim_compliant_compile_flags \ --build_java \ + --build_shared_lib \ --use_qnn \ --qnn_home $(QnnSDKRootDir) \ --cmake_generator=Ninja \ @@ -77,6 +78,7 @@ jobs: --config Release \ --use_binskim_compliant_compile_flags \ --build_java \ + --build_shared_lib \ --use_qnn \ --qnn_home $(QnnSDKRootDir) \ --cmake_generator=Ninja \ diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml index e07f0afa6109c..da58b70be7f83 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml @@ -94,6 +94,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "$(VSGenerator)" + --build_shared_lib --use_qnn --qnn_home $(QnnSDKRootDir) --enable_pybind diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml index 8cc647c2464f3..e64a184d8ebeb 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml @@ -92,6 +92,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "$(VSGenerator)" + --build_shared_lib --use_qnn --qnn_home $(QnnSDKRootDir) --enable_pybind diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml index 466fee92d0d5e..a61bfc7706818 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml @@ -92,6 +92,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_generator "$(VSGenerator)" + --build_shared_lib --use_qnn --qnn_home $(QnnSDKRootDir) --enable_pybind diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index aa0b6bf6d391e..4a437be325e7a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -93,12 +93,18 @@ stages: workingFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}' createLogFile: true + - task: CmdLine@2 + displayName: 'Print contents of binaries directory' + inputs: + script: | + dir $(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }} + - template: win-esrp-dll.yml parameters: FolderPath: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }}' DisplayName: 'ESRP - Sign dlls' DoEsrp: ${{ parameters.DoEsrp }} - Pattern: 'onnxruntime.dll' 
+ Pattern: 'onnxruntime*.dll' - task: MSBuild@1 displayName: 'Restore NuGet Packages and create project.assets.json' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index 5c013fae6be0b..826c43ebd9a15 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -79,6 +79,7 @@ jobs: --config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --cmake_generator "Visual Studio 17 2022" + --build_shared_lib --use_qnn --qnn_home $(QnnSDKRootDir) --update --build --parallel @@ -88,6 +89,7 @@ jobs: --config $(BuildConfig) ^ --build_dir $(Build.BinariesDirectory) ^ --cmake_generator "Visual Studio 17 2022" ^ + --build_shared_lib ^ --use_qnn ^ --qnn_home $(QnnSDKRootDir) ^ --test --enable_onnx_tests diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index 53700c58c7e7d..485c06fdbed04 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -72,6 +72,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --cmake_generator "Visual Studio 17 2022" --build_java + --build_shared_lib --use_qnn --qnn_home $(QnnSDKRootDir) --use_binskim_compliant_compile_flags @@ -87,6 +88,7 @@ jobs: --build_dir $(Build.BinariesDirectory) ^ --cmake_generator "Visual Studio 17 2022" ^ --build_java ^ + --build_shared_lib ^ --use_qnn ^ --qnn_home $(QnnSDKRootDir) ^ --use_binskim_compliant_compile_flags ^ diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index e2e0cea69efb5..11997382d119c 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -94,7 +94,7 @@ fi if [ "$BUILD_DEVICE" == "NPU" ]; then #Enable QNN EP - BUILD_ARGS+=("--use_qnn" "--qnn_home=/qnn_sdk") + BUILD_ARGS+=("--build_shared_lib" "--use_qnn" "--qnn_home=/qnn_sdk") fi export ONNX_ML=1 diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index 11842f34ce45b..980455ccddb0e 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -382,6 +382,7 @@ def generate_files(line_list, args): "tensorrt_ep_shared_lib": "onnxruntime_providers_tensorrt.dll", "openvino_ep_shared_lib": "onnxruntime_providers_openvino.dll", "cuda_ep_shared_lib": "onnxruntime_providers_cuda.dll", + "qnn_ep_shared_lib": "onnxruntime_providers_qnn.dll", "onnxruntime_perf_test": "onnxruntime_perf_test.exe", "onnx_test_runner": "onnx_test_runner.exe", } @@ -777,6 +778,24 @@ def generate_files(line_list, args): + '\\native" />' ) + if args.execution_provider == "qnn" or is_qnn_package and not is_ado_packaging_build: + files_list.append( + "' + ) + files_list.append( + "' + ) + # process all other library dependencies if is_cpu_package or is_cuda_gpu_package or is_dml_package or is_mklml_package: # Process dnnl dependency
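End to end, the effect of the patch is that the QNN EP now ships and loads like the other shared-library EPs (DNNL, TensorRT, OpenVINO): the core runtime resolves `onnxruntime_providers_qnn` on demand through the provider bridge registered above. A hedged usage sketch from the application side; the model path and backend option value are illustrative, while `AppendExecutionProvider("QNN", ...)` is the existing public C++ API:

```cpp
#include <onnxruntime_cxx_api.h>
#include <string>
#include <unordered_map>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "qnn_shared_ep_demo");
  Ort::SessionOptions session_options;

  // "backend_path" selects the QNN backend library: "QnnHtp.dll" on Windows,
  // "libQnnHtp.so" on Linux/Android.
  std::unordered_map<std::string, std::string> qnn_options{{"backend_path", "QnnHtp.dll"}};
  session_options.AppendExecutionProvider("QNN", qnn_options);

  // The provider bridge loads onnxruntime_providers_shared and
  // onnxruntime_providers_qnn here, rather than at process start.
  Ort::Session session(env, ORT_TSTR("model.onnx"), session_options);
  return 0;
}
```

Nothing changes in application code relative to the previous statically linked QNN build; the difference is purely in packaging (the extra `onnxruntime_providers_shared` and `onnxruntime_providers_qnn` binaries copied by the CMake, setup.py, AAR, and NuGet changes above) and in when the EP's code is loaded.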