From 2ff66b80e0e075696e34c78ab59b351bc8590d56 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Mon, 16 Dec 2024 09:05:12 -0800 Subject: [PATCH 01/25] Fix a deadlock bug in EigenNonBlockingThreadPool.h (#23098) ### Description This PR fixes a deadlock bug in EigenNonBlockingThreadPool.h. It only happens on platforms with weakly ordered memory model, such as ARM64. --- .../platform/EigenNonBlockingThreadPool.h | 124 ++++++++++-------- 1 file changed, 66 insertions(+), 58 deletions(-) diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h index 27b14f008a8ba..a7c63c507d1ba 100644 --- a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h +++ b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h @@ -1467,11 +1467,14 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter status = ThreadStatus::Spinning; } - void SetBlocked(std::function should_block, + bool SetBlocked(std::function should_block, std::function post_block) { std::unique_lock lk(mutex); - assert(GetStatus() == ThreadStatus::Spinning); - status.store(ThreadStatus::Blocking, std::memory_order_relaxed); + auto old_status = status.exchange(ThreadStatus::Blocking, std::memory_order_seq_cst); + if (old_status != ThreadStatus::Spinning) { + // Encountered a logical error + return false; + } if (should_block()) { status.store(ThreadStatus::Blocked, std::memory_order_relaxed); do { @@ -1480,6 +1483,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter post_block(); } status.store(ThreadStatus::Spinning, std::memory_order_relaxed); + return true; } private: @@ -1558,62 +1562,66 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter // Attempt to block if (!t) { - td.SetBlocked( // Pre-block test - [&]() -> bool { - bool should_block = true; - // Check whether work was pushed to us while attempting to block. We make - // this test while holding the per-thread status lock, and after setting - // our status to ThreadStatus::Blocking. - // - // This synchronizes with ThreadPool::Schedule which pushes work to the queue - // and then tests for ThreadStatus::Blocking/Blocked (via EnsureAwake): - // - // Main thread: Worker: - // #1 Push work #A Set status blocking - // #2 Read worker status #B Check queue - // #3 Wake if blocking/blocked - // - // If #A is before #2 then main sees worker blocked and wakes - // - // If #A if after #2 then #B will see #1, and we abandon blocking - assert(!t); - t = q.PopFront(); - if (t) { - should_block = false; - } - - // No work pushed to us, continue attempting to block. The remaining - // test is to synchronize with termination requests. If we are - // shutting down and all worker threads blocked without work, that's - // we are done. - if (should_block) { - blocked_++; - if (done_ && blocked_ == num_threads_) { - should_block = false; - // Almost done, but need to re-check queues. - // Consider that all queues are empty and all worker threads are preempted - // right after incrementing blocked_ above. Now a free-standing thread - // submits work and calls destructor (which sets done_). If we don't - // re-check queues, we will exit leaving the work unexecuted. - if (NonEmptyQueueIndex() != -1) { - // Note: we must not pop from queues before we decrement blocked_, - // otherwise the following scenario is possible. Consider that instead - // of checking for emptiness we popped the only element from queues. 
- // Now other worker threads can start exiting, which is bad if the - // work item submits other work. So we just check emptiness here, - // which ensures that all worker threads exit at the same time. - blocked_--; - } else { - should_exit = true; + if (!td.SetBlocked( // Pre-block test + [&]() -> bool { + bool should_block = true; + // Check whether work was pushed to us while attempting to block. We make + // this test while holding the per-thread status lock, and after setting + // our status to ThreadStatus::Blocking. + // + // This synchronizes with ThreadPool::Schedule which pushes work to the queue + // and then tests for ThreadStatus::Blocking/Blocked (via EnsureAwake): + // + // Main thread: Worker: + // #1 Push work #A Set status blocking + // #2 Read worker status #B Check queue + // #3 Wake if blocking/blocked + // + // If #A is before #2 then main sees worker blocked and wakes + // + // If #A if after #2 then #B will see #1, and we abandon blocking + assert(!t); + t = q.PopFront(); + if (t) { + should_block = false; + } + + // No work pushed to us, continue attempting to block. The remaining + // test is to synchronize with termination requests. If we are + // shutting down and all worker threads blocked without work, that's + // we are done. + if (should_block) { + blocked_++; + if (done_ && blocked_ == num_threads_) { + should_block = false; + // Almost done, but need to re-check queues. + // Consider that all queues are empty and all worker threads are preempted + // right after incrementing blocked_ above. Now a free-standing thread + // submits work and calls destructor (which sets done_). If we don't + // re-check queues, we will exit leaving the work unexecuted. + if (NonEmptyQueueIndex() != -1) { + // Note: we must not pop from queues before we decrement blocked_, + // otherwise the following scenario is possible. Consider that instead + // of checking for emptiness we popped the only element from queues. + // Now other worker threads can start exiting, which is bad if the + // work item submits other work. So we just check emptiness here, + // which ensures that all worker threads exit at the same time. + blocked_--; + } else { + should_exit = true; + } + } } - } - } - return should_block; - }, - // Post-block update (executed only if we blocked) - [&]() { - blocked_--; - }); + return should_block; + }, + // Post-block update (executed only if we blocked) + [&]() { + blocked_--; + })) { + // Encountered a fatal logic error in SetBlocked + should_exit = true; + break; + } // Thread just unblocked. Unless we picked up work while // blocking, or are exiting, then either work was pushed to // us, or it was pushed to an overloaded queue From a4eb8f27b6e51dec41f943b614702dd114731e13 Mon Sep 17 00:00:00 2001 From: tianf-fff <80665242+tianfang-fafafa@users.noreply.github.com> Date: Mon, 16 Dec 2024 11:09:48 -0600 Subject: [PATCH 02/25] [VitisAI] Add profiler interface for vitisai (#23032) ### Description Add common interfaces for vitis ep profiler. ### Motivation and Context Vitis ep can collect and record api and kernel timestamps in file when onnxruntime '-p' is enabled. 
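For illustration only (not part of this change), a provider library that wants to feed this interface could export a `profiler_collect` symbol of the shape the EP resolves via `GetSymbolFromLibrary` below. The `EventInfo` layout (name, pid, tid, timestamp, duration) comes from the new `global_api.h`; the C linkage, sample values, and time units are assumptions.

```cpp
// Hypothetical provider-side stub. Only the symbol name and the tuple layout
// are taken from this patch; the event values are made up for illustration.
#include <string>
#include <tuple>
#include <vector>

using EventInfo = std::tuple<std::string, int, int, long long, long long>;

extern "C" void profiler_collect(std::vector<EventInfo>& api_events,
                                 std::vector<EventInfo>& kernel_events) {
  // name, pid, tid, timestamp, duration
  api_events.emplace_back("vitisai::compile", 1000, 1, 500000LL, 1200LL);
  kernel_events.emplace_back("dpu_subgraph_0", 1000, 2, 501500LL, 800LL);
}
```

`VitisaiProfiler::EndProfiling` then converts these tuples into ORT profiling events, offsetting each timestamp by the profiling start time.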
--- .../core/providers/vitisai/imp/global_api.cc | 12 +++++ .../vitisai/include/vaip/global_api.h | 15 ++++++ .../vitisai/vitisai_execution_provider.cc | 5 ++ .../vitisai/vitisai_execution_provider.h | 2 + .../providers/vitisai/vitisai_profiler.cc | 49 +++++++++++++++++++ .../core/providers/vitisai/vitisai_profiler.h | 23 +++++++++ 6 files changed, 106 insertions(+) create mode 100644 onnxruntime/core/providers/vitisai/vitisai_profiler.cc create mode 100644 onnxruntime/core/providers/vitisai/vitisai_profiler.h diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index 51dc79c569589..cccaa65de45f2 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -58,6 +58,9 @@ struct OrtVitisAIEpAPI { const std::vector>& eps, const char* const* keys, const char* const* values, size_t kv_len) = nullptr; + void (*profiler_collect)( + std::vector& api_events, + std::vector& kernel_events); void Ensure() { if (handle_) return; @@ -81,6 +84,7 @@ struct OrtVitisAIEpAPI { } std::ignore = env.GetSymbolFromLibrary(handle_, "vaip_get_version", (void**)&vaip_get_version); + std::ignore = env.GetSymbolFromLibrary(handle_, "profiler_collect", (void**)&profiler_collect); ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "create_ep_context_nodes", (void**)&create_ep_context_nodes)); ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_on_run_start", (void**)&vitisai_ep_on_run_start)); ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_set_ep_dynamic_options", (void**)&vitisai_ep_set_ep_dynamic_options)); @@ -97,6 +101,14 @@ static vaip_core::OrtApiForVaip the_global_api; std::shared_ptr get_kernel_registry_vitisaiep() { return s_kernel_registry_vitisaiep; } const std::vector& get_domains_vitisaiep() { return s_domains_vitisaiep; } +void profiler_collect( + std::vector& api_events, + std::vector& kernel_events) { + if (s_library_vitisaiep.profiler_collect) { + s_library_vitisaiep.profiler_collect(api_events, kernel_events); + } +} + vaip_core::DllSafe>> compile_onnx_model( const onnxruntime::GraphViewer& graph_viewer, const logging::Logger& logger, const ProviderOptions& options) { auto model_path = graph_viewer.ModelPath().string(); diff --git a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h index b0353bd6adae9..704b156dff57f 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h @@ -24,3 +24,18 @@ int vitisai_ep_set_ep_dynamic_options( const std::vector>& eps, const char* const* keys, const char* const* values, size_t kv_len); +/** + * Replace EventRecord with std::tuple, + * because EventRecord is defined in profiler_common.h which is used inside onnxruntime. + * However, profiler_collect function will call vitis ep which can't include profiler_common.h. 
+ */ +using EventInfo = std::tuple< + std::string, // name + int, // pid + int, // tid + long long, // timestamp + long long // duration + >; +void profiler_collect( + std::vector& api_events, + std::vector& kernel_events); diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 023a954c83d70..3a99f56bb732a 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -1,6 +1,7 @@ // Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. // Licensed under the MIT License. #include "vitisai_execution_provider.h" +#include "vitisai_profiler.h" // Standard headers/libs. #include @@ -135,4 +136,8 @@ common::Status VitisAIExecutionProvider::SetEpDynamicOptions(gsl::span VitisAIExecutionProvider::GetProfiler() { + return std::make_unique(); +} } // namespace onnxruntime diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h index 77dede6035b4c..f0d1a289a2a73 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h @@ -36,6 +36,8 @@ class VitisAIExecutionProvider : public IExecutionProvider { std::vector& node_compute_funcs) override; std::shared_ptr GetKernelRegistry() const override; + std::unique_ptr GetProfiler() override; + // This method is called after both `GetComputeCapabilityOps()` and `Compile()`. // This timing is required to work with both compliation-based EPs and non-compilation-based EPs. const InlinedVector GetEpContextNodes() const override; diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.cc b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc new file mode 100644 index 0000000000000..d84507ec6ad02 --- /dev/null +++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Licensed under the MIT License. + +#include "vitisai_profiler.h" + +namespace onnxruntime { +namespace profiling { + +#if defined(USE_VITISAI) + +bool VitisaiProfiler::StartProfiling(TimePoint tp) { + return true; +} + +void VitisaiProfiler::EndProfiling(TimePoint tp, Events& events) { + auto time_point = + std::chrono::duration_cast(tp.time_since_epoch()).count(); + + std::vector api_events; + std::vector kernel_events; + profiler_collect(api_events, kernel_events); + + std::unordered_map event_args; + + for (auto& a : api_events) { + events.emplace_back(EventCategory::API_EVENT, + std::get<1>(a), // pid + std::get<2>(a), // tid + std::get<0>(a), // name + std::get<3>(a) - time_point, // timestamp + std::get<4>(a), // duration + event_args); + } + + for (auto& k : kernel_events) { + events.emplace_back(EventCategory::KERNEL_EVENT, + std::get<1>(k), + std::get<2>(k), + std::get<0>(k), + std::get<3>(k) - time_point, + std::get<4>(k), + event_args); + } +} + +#endif + +} // namespace profiling +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.h b/onnxruntime/core/providers/vitisai/vitisai_profiler.h new file mode 100644 index 0000000000000..aedbda31f7b1d --- /dev/null +++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.h @@ -0,0 +1,23 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/vitisai/include/vaip/global_api.h" + +namespace onnxruntime { +namespace profiling { + +#if defined(USE_VITISAI) +class VitisaiProfiler final : public EpProfiler { + public: + VitisaiProfiler() = default; + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(VitisaiProfiler); + ~VitisaiProfiler() {} + bool StartProfiling(TimePoint) override; + void EndProfiling(TimePoint, Events&) override; + void Start(uint64_t) override{}; + void Stop(uint64_t) override{}; +}; +#endif + +} // namespace profiling +} // namespace onnxruntime From ae970681372e20c3df3f1b40bfb0ee06a02c39c8 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 16 Dec 2024 10:38:23 -0800 Subject: [PATCH 03/25] Fix Pybind memory leak (#23105) ### Description Array GETITEM returns new reference which is a leak ### Motivation and Context Address https://github.com/microsoft/onnxruntime/issues/22271 --- onnxruntime/python/onnxruntime_pybind_mlvalue.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index 92396bb09bd4c..5742b4db42512 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -280,7 +280,7 @@ void DmlToCpuMemCpy(void* dst, const void* src, size_t num_bytes) { uint32_t readback_heap_size = gsl::narrow_cast(sizeof(readback_heap)); ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(dml_readback_heap_guid, &readback_heap_size, &readback_heap)); - // ReadbackFromGpu already syncs with the CPU and waits for the copy to be completed, so we don't need to sync after + // ReadbackFromGpu already syncs with the CPU and waits for the copy to be completed, so we dont need to sync after // this call readback_heap->ReadbackFromGpu( gsl::make_span(static_cast(dst), num_bytes), @@ -428,7 +428,7 @@ MLDataType NumpyTypeToOnnxRuntimeTensorType(int numpy_type) { // Special, not a C type expands to enum value of 16 {NPY_FLOAT16, DataTypeImpl::GetType()}, {NPY_DOUBLE, DataTypeImpl::GetType()}, - // We don't want to use size specific types such + // We dont want to use size specific types such // as NPY_INT32 bc they are not enums but hash defines // which may map into other enums and may conflict with other entries here // also NPY docs define these sizes as platform specific, thus we @@ -581,6 +581,7 @@ static void CopyDataToTensor(PyArrayObject* darray, int npy_type, Tensor& tensor for (int i = 0; i < total_items; ++i, src += item_size) { // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8. PyObject* item = PyArray_GETITEM(darray, src); + UniqueDecRefPtr itemGuard(item, DecRefFn()); PyObject* pStr = PyObject_Str(item); UniqueDecRefPtr strGuard(pStr, DecRefFn()); dst[i] = py::reinterpret_borrow(pStr); From 9115682d69d381e6f31f1431cf6b037bfd458536 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 16 Dec 2024 15:35:47 -0800 Subject: [PATCH 04/25] [js/webgpu] disable failed tests temporarily (#23127) ### Description Those test cases start to fail for unknown reasons. To unblock the CI, I disabled those tests temporarily to earn time to investigate the root cause. 
--- js/web/test/data/ops/conv.jsonc | 160 +++++++++++++------------- js/web/test/data/ops/fused-conv.jsonc | 152 ++++++++++++------------ 2 files changed, 156 insertions(+), 156 deletions(-) diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc index 262503214a50a..f514ae5fa75e6 100644 --- a/js/web/test/data/ops/conv.jsonc +++ b/js/web/test/data/ops/conv.jsonc @@ -391,48 +391,48 @@ } ] }, - { - "name": "conv - vectorize group - B", - "operator": "Conv", - "inputShapeDefinitions": "rankOnly", - "opset": { "domain": "", "version": 17 }, - "attributes": [ - { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - { "name": "group", "data": 3, "type": "int" } - ], - "cases": [ - { - "name": "T[0]", - "inputs": [ - { - "data": [ - 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, - 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0 - ], - "dims": [1, 3, 3, 3], - "type": "float32" - }, - { - "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], - "dims": [3, 1, 2, 2], - "type": "float32" - }, - { - "data": [0.1, 0.2, 0.3], - "dims": [3], - "type": "float32" - } - ], - "outputs": [ - { - "data": [27.1, 37.1, 57.1, 67.1, 293.2, 319.2, 371.2, 397.2, 847.3, 889.3, 409.3, 428.3], - "dims": [1, 3, 2, 2], - "type": "float32" - } - ] - } - ] - }, + // { + // "name": "conv - vectorize group - B", + // "operator": "Conv", + // "inputShapeDefinitions": "rankOnly", + // "opset": { "domain": "", "version": 17 }, + // "attributes": [ + // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + // { "name": "group", "data": 3, "type": "int" } + // ], + // "cases": [ + // { + // "name": "T[0]", + // "inputs": [ + // { + // "data": [ + // 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + // 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0 + // ], + // "dims": [1, 3, 3, 3], + // "type": "float32" + // }, + // { + // "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + // "dims": [3, 1, 2, 2], + // "type": "float32" + // }, + // { + // "data": [0.1, 0.2, 0.3], + // "dims": [3], + // "type": "float32" + // } + // ], + // "outputs": [ + // { + // "data": [27.1, 37.1, 57.1, 67.1, 293.2, 319.2, 371.2, 397.2, 847.3, 889.3, 409.3, 428.3], + // "dims": [1, 3, 2, 2], + // "type": "float32" + // } + // ] + // } + // ] + // }, { "name": "conv - vectorize group - C", "operator": "Conv", @@ -470,44 +470,44 @@ } ] }, - { - "name": "conv - vectorize group - D", - "operator": "Conv", - "inputShapeDefinitions": "rankOnly", - "opset": { "domain": "", "version": 17 }, - "attributes": [ - { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - { "name": "group", "data": 3, "type": "int" }, - { "name": "strides", "data": [2, 2], "type": "ints" } - ], - "cases": [ - { - "name": "T[0] strides = [2, 2]", - "inputs": [ - { - "data": [ - 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, - 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 - ], - "dims": [1, 3, 3, 4], - "type": "float32" - }, - { - "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], - "dims": [3, 1, 2, 2], - "type": "float32" - } - ], - "outputs": [ - { - "data": [34, 54, 386, 438, 1122, 1206], - "dims": [1, 3, 1, 2], - "type": "float32" - } - ] - } - ] - }, + // { + // "name": "conv - vectorize group - D", + // "operator": "Conv", + // "inputShapeDefinitions": "rankOnly", + // 
"opset": { "domain": "", "version": 17 }, + // "attributes": [ + // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + // { "name": "group", "data": 3, "type": "int" }, + // { "name": "strides", "data": [2, 2], "type": "ints" } + // ], + // "cases": [ + // { + // "name": "T[0] strides = [2, 2]", + // "inputs": [ + // { + // "data": [ + // 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + // 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 + // ], + // "dims": [1, 3, 3, 4], + // "type": "float32" + // }, + // { + // "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + // "dims": [3, 1, 2, 2], + // "type": "float32" + // } + // ], + // "outputs": [ + // { + // "data": [34, 54, 386, 438, 1122, 1206], + // "dims": [1, 3, 1, 2], + // "type": "float32" + // } + // ] + // } + // ] + // }, { "name": "conv - pointwise", "operator": "Conv", diff --git a/js/web/test/data/ops/fused-conv.jsonc b/js/web/test/data/ops/fused-conv.jsonc index d88c91ebc9de7..ebb0b5d3e1f58 100644 --- a/js/web/test/data/ops/fused-conv.jsonc +++ b/js/web/test/data/ops/fused-conv.jsonc @@ -249,44 +249,44 @@ } ] }, - { - "name": "NHWC group-conv with HardSigmoid", - "operator": "Conv", - "attributes": [ - { "name": "activation", "data": "HardSigmoid", "type": "string" }, - { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - { "name": "group", "data": 3, "type": "int" }, - { "name": "activation_params", "data": [2.0, 5.0], "type": "floats" } - ], - "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, - "cases": [ - { - "name": "T[0]", - "inputs": [ - { - "data": [ - 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, - 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 - ], - "dims": [1, 3, 3, 3], - "type": "float32" - }, - { - "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], - "dims": [3, 1, 2, 2], - "type": "float32" - } - ], - "outputs": [ - { - "data": [0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], - "dims": [1, 2, 2, 3], - "type": "float32" - } - ] - } - ] - }, + // { + // "name": "NHWC group-conv with HardSigmoid", + // "operator": "Conv", + // "attributes": [ + // { "name": "activation", "data": "HardSigmoid", "type": "string" }, + // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + // { "name": "group", "data": 3, "type": "int" }, + // { "name": "activation_params", "data": [2.0, 5.0], "type": "floats" } + // ], + // "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, + // "cases": [ + // { + // "name": "T[0]", + // "inputs": [ + // { + // "data": [ + // 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, + // 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 + // ], + // "dims": [1, 3, 3, 3], + // "type": "float32" + // }, + // { + // "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + // "dims": [3, 1, 2, 2], + // "type": "float32" + // } + // ], + // "outputs": [ + // { + // "data": [0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], + // "dims": [1, 2, 2, 3], + // "type": "float32" + // } + // ] + // } + // ] + // }, { "name": "fused group-conv with LeakyRelu", "operator": "FusedConv", @@ -325,44 +325,44 @@ } ] }, - { - "name": "NHWC group-conv with LeakyRelu", - "operator": "Conv", - "attributes": [ - { "name": "activation", "data": "LeakyRelu", "type": "string" }, - { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - { "name": "group", "data": 
3, "type": "int" }, - { "name": "activation_params", "data": [2.0], "type": "floats" } - ], - "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, - "cases": [ - { - "name": "T[0]", - "inputs": [ - { - "data": [ - 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, - 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 - ], - "dims": [1, 3, 3, 3], - "type": "float32" - }, - { - "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], - "dims": [3, 1, 2, 2], - "type": "float32" - } - ], - "outputs": [ - { - "data": [-162, 63, -158, 33, 281, 85, 105, 337, 455, 177, 515, 609], - "dims": [1, 2, 2, 3], - "type": "float32" - } - ] - } - ] - }, + // { + // "name": "NHWC group-conv with LeakyRelu", + // "operator": "Conv", + // "attributes": [ + // { "name": "activation", "data": "LeakyRelu", "type": "string" }, + // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + // { "name": "group", "data": 3, "type": "int" }, + // { "name": "activation_params", "data": [2.0], "type": "floats" } + // ], + // "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, + // "cases": [ + // { + // "name": "T[0]", + // "inputs": [ + // { + // "data": [ + // 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, + // 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 + // ], + // "dims": [1, 3, 3, 3], + // "type": "float32" + // }, + // { + // "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + // "dims": [3, 1, 2, 2], + // "type": "float32" + // } + // ], + // "outputs": [ + // { + // "data": [-162, 63, -158, 33, 281, 85, 105, 337, 455, 177, 515, 609], + // "dims": [1, 2, 2, 3], + // "type": "float32" + // } + // ] + // } + // ] + // }, { "name": "fused conv with LeakyRelu", "operator": "FusedConv", From 0981bbf4ca4af4d7216299f15de784f19ce6123a Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Tue, 17 Dec 2024 12:47:40 +0800 Subject: [PATCH 05/25] [webgpu] Optimize matmulnbits with M > 1 (#23102) This is the webgpu native ep implementation of #23092. I used https://github.com/fs-eire/ort-webgpu-nodejs-chatapp-prototype to test. Meanwhile, applied https://github.com/fs-eire/ort-webgpu-nodejs-chatapp-prototype/pull/2 to print the first token time. The result is like below: The latest main branch: Intel Arc Graphics ``` 659 tokens in 24.8sec, 26.57 tokens/sec Decoding first token with input 449 tokens: 13.0 sec Decoding remaining 210 tokens: 11.8 sec 17.79 tokens/sec ``` NV RTX 2000 ``` 659 tokens in 14.4sec, 45.85 tokens/sec Decoding first token with input 449 tokens: 7.3 sec Decoding remaining 210 tokens: 7.0 sec 29.81 tokens/sec ``` ------------------------------------------------------------------------- With this PR: Intel Arc Graphics ``` 657 tokens in 20.6sec, 31.92 tokens/sec Decoding first token with input 449 tokens: 8.5 sec Decoding remaining 208 tokens: 12.1 sec 17.23 tokens/sec ``` NV RTX 2000 ``` 659 tokens in 11.4sec, 57.93 tokens/sec Decoding first token with input 449 tokens: 4.1 sec Decoding remaining 210 tokens: 7.2 sec 28.98 tokens/sec ``` From above data, you can see that with this PR, both intel (13s -> 8.5s) and NV (7.3s -> 4.1s) GPUs for the first token time are performing better. 
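As a rough sketch of why the large-M (first token) case benefits: the new tiled path below uses an 8x8 workgroup with tile_m = 4, so each workgroup covers a 4x8 tile of the output and every dequantized block of B is reused across 4 rows of A. The dispatch math below mirrors the constants in the code; the matrix shapes are placeholders, not measured values.

```cpp
// Illustrative dispatch geometry only; constants mirror the diff, shapes are hypothetical.
#include <cstdint>
#include <cstdio>

int main() {
  constexpr uint32_t workgroup_y = 8;  // output columns per workgroup
  constexpr uint32_t tile_m = 4;       // output rows per workgroup
  uint32_t M = 449, N = 4096, batch_count = 1;  // placeholder prefill shape
  uint32_t dispatch_x = (N + workgroup_y - 1) / workgroup_y;
  uint32_t dispatch_y = (M + tile_m - 1) / tile_m;
  std::printf("dispatch = (%u, %u, %u)\n", dispatch_x, dispatch_y, batch_count);
  return 0;
}
```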
--- .../webgpu/quantization/matmul_nbits.cc | 363 +++++++----------- .../webgpu/quantization/matmul_nbits.h | 30 +- 2 files changed, 151 insertions(+), 242 deletions(-) diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc index be18f820e2747..9a49adf347a29 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc @@ -39,7 +39,7 @@ std::string QuantizedDataType(int components) { } } -constexpr unsigned int kMinSequenceLengthForPrefillOptimization = 16; +constexpr unsigned int kMinMForTileOptimization = 4; } // namespace ONNX_OPERATOR_KERNEL_EX( @@ -60,33 +60,59 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& scales = shader.AddInput("scales", ShaderUsage::UseUniform); const auto& y = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias | ShaderUsage::UseIndicesTypeAlias); - if (use_block32_) { + if ((is_intel_ || tile_m_ > 1) && block_size_ == 32) { const uint32_t workgroup_size = WorkgroupSizeX() * WorkgroupSizeY(); const uint32_t tile_size = WorkgroupSizeX() * components_b_ * 8; // each uint32 has 8 data. const uint32_t a_length_per_tile = tile_size / a.NumComponents(); - constexpr uint32_t block_size = 32; - const uint32_t blocks_per_tile = tile_size / block_size; - shader.AdditionalImplementation() << "var sub_a: array;\n" - << "var inter_results: array, " << WorkgroupSizeY() << ">;\n"; - std::string offset = "workgroup_idx * " + std::to_string(WorkgroupSizeY()); - shader.MainFunctionBody() << " let output_indices = " << y.OffsetToIndices(offset) << ";\n" - << " let col = output_indices[2];\n" - " let row = output_indices[1];\n" - " let batch = output_indices[0];\n" - " let n_blocks_per_col = uniforms.input_b_shape[1];\n" + const uint32_t blocks_per_tile = tile_size / block_size_; + if (tile_m_ == 1) { + shader.AdditionalImplementation() << "fn mm_readA(batch : u32, row : u32, col : u32) -> input_a_value_t {\n" + " if (col < uniforms.input_a_shape[2]) {\n" + << " return " << a.GetByIndices("input_a_indices_t(batch, row, col)") << ";\n" + << " } else {\n" + " return input_a_value_t(0);\n" + " }\n" + "}\n" + << "var sub_a: array;\n" + << "var inter_results: array, " << WorkgroupSizeY() << ">;\n"; + std::string offset = "workgroup_idx * " + std::to_string(WorkgroupSizeY()); + shader.MainFunctionBody() << " let output_indices = " << y.OffsetToIndices(offset) << ";\n" + << " let col = output_indices[2];\n" + " let row = output_indices[1];\n" + " let batch = output_indices[0];\n"; + } else { + ORT_ENFORCE(tile_m_ < WorkgroupSizeY(), "tile_m must be less than or equal to WorkgroupSizeY."); + ORT_ENFORCE(WorkgroupSizeX() == WorkgroupSizeY(), "WorkgroupSizeX must be equal to WorkgroupSizeY."); + + shader.AdditionalImplementation() << "fn mm_readA(batch : u32, row : u32, col : u32) -> input_a_value_t {\n" + " if (row < uniforms.input_a_shape[1] && col < uniforms.input_a_shape[2]) {\n" + << " return " << a.GetByIndices("input_a_indices_t(batch, row, col)") << ";\n" + << " } else {\n" + " return input_a_value_t(0);\n" + " }\n" + "}\n" + << "var sub_a: array," << tile_m_ << ">;\n" + << "var inter_results: array, " << WorkgroupSizeY() << ">," << tile_m_ << ">;\n"; + shader.MainFunctionBody() << " let col = workgroup_id.x * " << WorkgroupSizeY() << ";\n" + << " let row = workgroup_id.y * " << tile_m_ << ";\n" + << " let batch = 
workgroup_id.z;\n"; + } + shader.MainFunctionBody() << " let n_blocks_per_col = uniforms.input_b_shape[1];\n" << " let num_tiles = (n_blocks_per_col - 1) / " << blocks_per_tile << " + 1;\n" // Loop over shared dimension. << " for (var tile: u32 = 0; tile < num_tiles; tile += 1) {\n" << " let a_col_start = tile * " << a_length_per_tile << ";\n" << " // load one tile A data into shared memory.\n" << " for (var a_offset = local_idx; a_offset < " << a_length_per_tile << "; a_offset += " << workgroup_size << ") {\n" - << " let a_col = a_col_start + a_offset;\n" - " if (a_col < uniforms.input_a_shape[2]) {\n" - << " sub_a[a_offset] = " << a.GetByIndices("input_a_indices_t(batch, row, a_col)") << ";\n" - << " } else {\n" - " sub_a[a_offset] = input_a_value_t(0);\n" - " }\n" - " }\n" + << " let a_col = a_col_start + a_offset;\n"; + if (tile_m_ == 1) { + shader.MainFunctionBody() << " sub_a[a_offset] = mm_readA(batch, row, a_col);\n"; + } else { + for (uint32_t i = 0; i < tile_m_; i++) { + shader.MainFunctionBody() << " sub_a[" << i << "][a_offset] = mm_readA(batch, row + " << i << ", a_col);\n"; + } + } + shader.MainFunctionBody() << " }\n" " workgroupBarrier();\n" // Each thread processes one block. " let b_row = col + local_id.y;\n" @@ -111,24 +137,8 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { << " scale = " << scales.GetByOffset("b_row * n_blocks_per_col + block") << ";\n" << " b_data = " << b.GetByIndices("input_b_indices_t(b_row, block, 0)") << ";\n" << " }\n" - << " var word_offset = local_id.x * " << block_size / a.NumComponents() << ";\n" + << " var word_offset = local_id.x * " << block_size_ / a.NumComponents() << ";\n" << " for (var i: u32 = 0; i < " << components_b_ << "; i++) {\n"; - switch (a.NumComponents()) { - case 1: - shader.MainFunctionBody() << " let a_data0 = vec4(sub_a[word_offset], sub_a[word_offset + 1], sub_a[word_offset + 2], sub_a[word_offset + 3]);\n" - " let a_data1 = vec4(sub_a[word_offset + 4], sub_a[word_offset + 5], sub_a[word_offset + 6], sub_a[word_offset + 7]);\n"; - break; - case 2: - shader.MainFunctionBody() << " let a_data0 = vec4(sub_a[word_offset], sub_a[word_offset + 1]);\n" - " let a_data1 = vec4(sub_a[word_offset + 2], sub_a[word_offset + 3]);\n"; - break; - case 4: - shader.MainFunctionBody() << " let a_data0 = sub_a[word_offset];\n" - " let a_data1 = sub_a[word_offset + 1];\n"; - break; - default: - break; - } shader.MainFunctionBody() << " let b_value = b_data"; if (components_b_ > 1) { shader.MainFunctionBody() << "[i]"; @@ -144,21 +154,63 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.MainFunctionBody() << ", "; } } - shader.MainFunctionBody() << ")) * scale;\n" - " inter_results[local_id.y][local_id.x] += dot(a_data0, b_dequantized_values[0]) + dot(a_data1, b_dequantized_values[1]);\n" - << " word_offset += " << 8 / a.NumComponents() << ";\n" + shader.MainFunctionBody() << ")) * scale;\n"; + if (tile_m_ == 1) { + switch (a.NumComponents()) { + case 1: + shader.MainFunctionBody() << " inter_results[local_id.y][local_id.x] += dot(vec4(sub_a[word_offset], sub_a[word_offset + 1], sub_a[word_offset + 2], sub_a[word_offset + 3]), b_dequantized_values[0]) + dot(vec4(sub_a[word_offset + 4], sub_a[word_offset + 5], sub_a[word_offset + 6], sub_a[word_offset + 7]), b_dequantized_values[1]);\n"; + break; + case 2: + shader.MainFunctionBody() << " inter_results[local_id.y][local_id.x] += dot(vec4(sub_a[word_offset], sub_a[word_offset + 1]), b_dequantized_values[0]) + 
dot(vec4(sub_a[word_offset + 2], sub_a[word_offset + 3]), b_dequantized_values[1]);\n"; + break; + case 4: + shader.MainFunctionBody() << " inter_results[local_id.y][local_id.x] += dot(sub_a[word_offset], b_dequantized_values[0]) + dot(sub_a[word_offset + 1], b_dequantized_values[1]);\n"; + break; + default: + break; + } + } else { + for (uint32_t i = 0; i < tile_m_; i++) { + switch (a.NumComponents()) { + case 1: + shader.MainFunctionBody() << " inter_results[" << i << "][local_id.y][local_id.x] += dot(vec4(sub_a[" << i << "][word_offset], sub_a[" << i << "][word_offset + 1], sub_a[" << i << "][word_offset + 2], sub_a[" << i << "][word_offset + 3]), b_dequantized_values[0]) + dot(vec4(sub_a[" << i << "][word_offset + 4], sub_a[" << i << "][word_offset + 5], sub_a[" << i << "][word_offset + 6], sub_a[" << i << "][word_offset + 7]), b_dequantized_values[1]);\n"; + break; + case 2: + shader.MainFunctionBody() << " inter_results[" << i << "][local_id.y][local_id.x] += dot(vec4(sub_a[" << i << "][word_offset], sub_a[" << i << "][word_offset + 1]), b_dequantized_values[0]) + dot(vec4(sub_a[" << i << "][word_offset + 2], sub_a[" << i << "][word_offset + 3]), b_dequantized_values[1]);\n"; + break; + case 4: + shader.MainFunctionBody() << " inter_results[" << i << "][local_id.y][local_id.x] += dot(sub_a[" << i << "][word_offset], b_dequantized_values[0]) + dot(sub_a[" << i << "][word_offset + 1], b_dequantized_values[1]);\n"; + break; + default: + break; + } + } + } + shader.MainFunctionBody() << " word_offset += " << 8 / a.NumComponents() << ";\n" << " }\n" " workgroupBarrier();\n" - " }\n" - << " if (local_idx < " << WorkgroupSizeY() << ") {\n" - << " var output_value = output_value_t(0);\n" - << " for (var b = 0u; b < " << WorkgroupSizeX() << "; b++) {\n" - << " output_value += inter_results[local_idx][b];\n" - " }\n" - " if (col + local_idx < uniforms.output_shape[2]) {\n" - << " " << y.SetByIndices("output_indices_t(batch, row, col + local_idx)", "output_value") << ";\n" - << " }\n" " }\n"; + if (tile_m_ == 1) { + shader.MainFunctionBody() << " if (local_idx < " << WorkgroupSizeY() << ") {\n" + << " var output_value = output_value_t(0);\n" + << " for (var b = 0u; b < " << WorkgroupSizeX() << "; b++) {\n" + << " output_value += inter_results[local_idx][b];\n" + " }\n" + " if (col + local_idx < uniforms.output_shape[2]) {\n" + << " " << y.SetByIndices("output_indices_t(batch, row, col + local_idx)", "output_value") << ";\n" + << " }\n" + " }\n"; + } else { + shader.MainFunctionBody() << " if (local_id.y < " << tile_m_ << ") {\n" + << " var output_value = output_value_t(0);\n" + << " for (var b = 0u; b < " << WorkgroupSizeX() << "; b++) {\n" + << " output_value += inter_results[local_id.y][local_id.x][b];\n" + " }\n" + " if (row + local_id.y < uniforms.output_shape[1] && col + local_id.x < uniforms.output_shape[2]) {\n" + << " " << y.SetByIndices("output_indices_t(batch, row + local_id.y, col + local_id.x)", "output_value") << ";\n" + << " }\n" + " }\n"; + } } else { const std::string quantized_data_type = QuantizedDataType(a.NumComponents()); const int output_element_number = y.NumComponents() * gsl::narrow(output_number_); @@ -322,121 +374,6 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { return Status::OK(); } -Status MatMulNBitsProgramPrefill::GenerateShaderCode(ShaderHelper& shader) const { - shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); - shader.AddInput("input_b", 
ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); - shader.AddInput("scales", ShaderUsage::UseUniform); - shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias | ShaderUsage::UseIndicesTypeAlias); - // This shader uses uniforms with the M,N,K convention from traditional matrix multiplicatiion - // M is the number of rows in A and M rows in the output. - // N is the number of columns in B and N columns in the output. - // K is the hidden/shared dimension number of columns in A and K rows in B. - // Note in matmulnbits, B matrix is already transposed, however the following remains true - // for the shader below M describes A, N describes B and K is the hidden/shared dimension. - // K4/K8 are simply K divided by 4 or 8 respectively. - shader.AdditionalImplementation() << R"INIT_SECTION( -// Matrix dimensions and quantization parameters -const TILE_SIZE : u32 = 16u; -const VALUES_PER_VEC4 : u32 = 4u; -const QUANTIZATION_BLOCK_SIZE : u32 = 32; -// We want INNER_DIMENSION_ITEMS_PER_CYCLE to be the number of lanes in an EU/SM, -// so we use BLOCKS_PER_CYCLE as 2u, or process weights 2 blocks at a time. -// This uses all 16 lanes on 12th gen intel chips. -const BLOCKS_PER_CYCLE : u32 = 2u; -const INNER_DIMENSION_ITEMS_PER_CYCLE : u32 = 16u; // (QUANTIZATION_BLOCK_SIZE/VALUES_PER_VEC4)*BLOCKS_PER_CYCLE -const VECTORIZED_QUANTIZATION_BLOCK_SIZE: u32 = 8u; // QUANTIZATION_BLOCK_SIZE / VALUES_PER_VEC4; - -//Shared memory -var tile_A : array, TILE_SIZE>; -var tile_B : array, TILE_SIZE>; -var tile_O : array, TILE_SIZE>; - -fn loadA(slot: u32, a_global : u32, step_idx : u32, parallel_id : u32) -{ - if (a_global >= uniforms.M) { - return; - } - let local_A = input_a[a_global*uniforms.K4+step_idx*INNER_DIMENSION_ITEMS_PER_CYCLE+parallel_id]; - tile_A[slot][parallel_id] = local_A; -} - -fn getBScale(slot: u32, b_global : u32, vec_step_idx : u32, scale_idx: u32) -> output_value_t -{ - // Since scales are output_value_t holding 1 for every 32 values, vec_step_idx jumps over 64 weights at - // a time or 2 scales at every step. - let scale_offset = vec_step_idx*2; - let idx = u32(b_global*(uniforms.K/QUANTIZATION_BLOCK_SIZE)+scale_offset); - return scales[idx+scale_idx]; -} - -fn loadB(slot: u32, b_global : u32, vec_step_idx : u32, parallel_id : u32) -{ - if (b_global >= uniforms.N) { - return; - } - let scale = getBScale(slot, b_global, vec_step_idx, u32(parallel_id/VECTORIZED_QUANTIZATION_BLOCK_SIZE)); - let idx:u32 = parallel_id; - if (idx % 2 == 0) - { - // Weights are u32 holding 8 values each, each step (vec_step_idx) jumps over 64 weights at a time. - // Therefore the weight_offset begin for the current step would be vec_step_idx * 64 if weight - // elements were holding one element each. For the case of each element holding 8 values, begin - // would become vec_step_idx * 64/8 or vec_step_idx * 8. 
- var weight_offset:u32 = (vec_step_idx*8)+ u32(idx/2); - let b_value = input_b[b_global*uniforms.K8+weight_offset]; - let b_value_lower = unpack4xU8(b_value & 0x0F0F0F0Fu); - let b_value_upper = unpack4xU8((b_value >> 4) & 0x0F0F0F0Fu); - tile_B[slot][idx].x = (output_value_t(b_value_lower[0]) - 8.0) * scale; - tile_B[slot][idx].y = (output_value_t(b_value_upper[0]) - 8.0) * scale; - tile_B[slot][idx].z = (output_value_t(b_value_lower[1]) - 8.0) * scale; - tile_B[slot][idx].w = (output_value_t(b_value_upper[1]) - 8.0) * scale; - tile_B[slot][idx+1].x = (output_value_t(b_value_lower[2]) - 8.0)* scale; - tile_B[slot][idx+1].y = (output_value_t(b_value_upper[2]) - 8.0)* scale; - tile_B[slot][idx+1].z = (output_value_t(b_value_lower[3]) - 8.0)* scale; - tile_B[slot][idx+1].w = (output_value_t(b_value_upper[3]) - 8.0)* scale; - } -} - -fn computeDotProduct(slot_a: u32, slot_b:u32) -> output_value_t -{ - var sum:output_value_t = 0; - for (var idx:u32 = 0 ; idx < INNER_DIMENSION_ITEMS_PER_CYCLE; idx++) - { - sum += dot(tile_A[slot_a][idx], tile_B[slot_b][idx]); - } - return sum; -} -)INIT_SECTION"; - - shader.MainFunctionBody() << R"MAIN_FN( - // Indexing with idx,idy instead of using a 2d dispatch of TILE_SIZE, TILE_SIZE - // appears to give a performance win on Intel Gen12LP architecture. - // This is likley because of locality of memory access, idy below in this approach - // is the same as subgroup_id or lane id, while idx is the wave_id. - // The work distribution therefore keeps memory accesses close together in - // a single wave in this approach of indexing. - let idx = u32(local_idx / TILE_SIZE); - let idy = u32(local_idx % TILE_SIZE); - let a_global_base = workgroup_id.x * TILE_SIZE; - let b_global_base = workgroup_id.y * TILE_SIZE; - let step_count:u32 = u32(uniforms.K/(BLOCKS_PER_CYCLE*QUANTIZATION_BLOCK_SIZE)); - for (var vec_step:u32 = 0; vec_step < step_count; vec_step++) - { - workgroupBarrier(); - loadA(idx, a_global_base+idx, vec_step, idy); - loadB(idx, b_global_base+idx, vec_step, idy); - workgroupBarrier(); - let result = computeDotProduct(idx, idy); - tile_O[idx][idy]+=result; - } - workgroupBarrier(); - if (a_global_base+idx < uniforms.M && b_global_base+idy < uniforms.N) { - output[(a_global_base+idx) * uniforms.N + b_global_base + idy] = tile_O[idx][idy]; - } -)MAIN_FN"; - return Status::OK(); -} - Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const { const Tensor* a = context.Input(0); const Tensor* b = context.Input(1); @@ -471,70 +408,52 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context const uint32_t components_b = GetMaxComponents(blob_size_in_words); uint32_t components = GetMaxComponents(N); - // Use block32 for Intel Gen12LP architecture. - const bool use_block32 = context.AdapterInfo().vendor == std::string_view{"intel"} && - context.AdapterInfo().architecture == std::string_view{"gen-12lp"} && - block_size == 32; + const bool is_intel = context.AdapterInfo().vendor == std::string_view{"intel"} && + context.AdapterInfo().architecture == std::string_view{"gen-12lp"}; const bool has_zero_points = zero_points != nullptr; - if (use_block32 && batch_count == 1 && - components_a == 4 && components_b == 4 && - !has_zero_points && M >= kMinSequenceLengthForPrefillOptimization) { - MatMulNBitsProgramPrefill program; - constexpr int32_t tile_size = 16; - // subgroup_size here controls how many elements of the hidden dimension we load in a cycle. 
- // MatMulNBitsProgramPrefill does not use any of the subgroup wgsl instructions. The subgroup - // size just helps with optimal lane usage in the shader. - constexpr int32_t subgroup_size = 16; - program.SetWorkgroupSize(tile_size * subgroup_size); - program.SetDispatchGroupSize((M + tile_size - 1) / tile_size, - (N + tile_size - 1) / tile_size, - 1); - program - .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(4)}, - {b, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(4)}, - {scales, ProgramTensorMetadataDependency::None}}) - .AddUniformVariables({{static_cast(M)}, - {static_cast(N)}, - {static_cast(K)}, - {static_cast(K / 4)}, - {static_cast(K / 8)}}) - .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(1)}); - return context.RunProgram(program); + // TODO: Support output_number > 1. Some cases are failed when output_number > 1. + constexpr uint32_t output_number = 1; + const uint32_t tile_m = M > kMinMForTileOptimization ? 4 : 1; + MatMulNBitsProgram program{output_number, block_size, tile_m, gsl::narrow(components_b), has_zero_points, is_intel}; + if (M > kMinMForTileOptimization && block_size == 32) { + components = 1; + constexpr uint32_t workgroup_size = 64; + constexpr uint32_t workgroup_y = 8; + constexpr uint32_t workgroup_x = workgroup_size / workgroup_y; + program.SetWorkgroupSize(workgroup_x, workgroup_y, 1); + program.SetDispatchGroupSize((N + workgroup_y - 1) / workgroup_y, + (M + tile_m - 1) / tile_m, + batch_count); + program.CacheHint("T_M" + std::to_string(tile_m)); + } else if (is_intel && block_size == 32) { + components = 1; + constexpr uint32_t workgroup_size = 128; + const uint32_t workgroup_y = N % 8 == 0 ? 8 : N % 4 == 0 ? 4 + : 1; + const uint32_t workgroup_x = workgroup_size / workgroup_y; + program.SetWorkgroupSize(workgroup_x, workgroup_y, 1); + program.SetDispatchGroupSize(data_size / components / workgroup_y); + program.CacheHint("T_M" + std::to_string(tile_m)); } else { - // TODO: Support output_number > 1. Some cases are failed when output_number > 1. - // const uint32_t output_number = M > 1 && (N / components) % 2 == 0 ? 2 : 1; - constexpr uint32_t output_number = 1; - MatMulNBitsProgram program{output_number, gsl::narrow(components_b), has_zero_points, use_block32}; - - if (use_block32) { - components = 1; - constexpr uint32_t workgroup_size = 128; - const uint32_t workgroup_y = N % 8 == 0 ? 8 : N % 4 == 0 ? 4 - : 1; - const uint32_t workgroup_x = workgroup_size / workgroup_y; - program.SetWorkgroupSize(workgroup_x, workgroup_y, 1); - program.SetDispatchGroupSize(data_size / components / workgroup_y); - } else { - program.SetDispatchGroupSize(data_size / components / output_number); - } - - TensorShape reshaped_a_shape{batch_count, M, K / components_a}; - TensorShape reshaped_b_shape{N, n_blocks_per_col, blob_size_in_words / components_b}; - TensorShape reshaped_y_shape{batch_count, M, N / components}; + program.SetDispatchGroupSize(data_size / components / output_number); + program.CacheHint("O_N" + std::to_string(output_number)); + } - program - .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, reshaped_a_shape, gsl::narrow(components_a)}, - {b, ProgramTensorMetadataDependency::TypeAndRank, reshaped_b_shape, gsl::narrow(components_b * 4 /** b will be accessed as uint32 which includs 4 uint8. 
So here we need to multiply 4.*/)}, - {scales, ProgramTensorMetadataDependency::None}}) - .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, gsl::narrow(components)}) - .AddUniformVariable({block_size}) - .CacheHint(std::to_string(output_number)); - if (has_zero_points) { - program.AddInput({zero_points, ProgramTensorMetadataDependency::None, {(zero_points->Shape().Size() + 3) / 4}, 4}); - } - return context.RunProgram(program); + TensorShape reshaped_a_shape{batch_count, M, K / components_a}; + TensorShape reshaped_b_shape{N, n_blocks_per_col, blob_size_in_words / components_b}; + TensorShape reshaped_y_shape{batch_count, M, N / components}; + + program + .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, reshaped_a_shape, gsl::narrow(components_a)}, + {b, ProgramTensorMetadataDependency::TypeAndRank, reshaped_b_shape, gsl::narrow(components_b * 4 /** b will be accessed as uint32 which includs 4 uint8. So here we need to multiply 4.*/)}, + {scales, ProgramTensorMetadataDependency::None}}) + .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, gsl::narrow(components)}) + .AddUniformVariable({block_size}); + if (has_zero_points) { + program.AddInput({zero_points, ProgramTensorMetadataDependency::None, {(zero_points->Shape().Size() + 3) / 4}, 4}); } + return context.RunProgram(program); } } // namespace webgpu diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h index 5f785c03f6a5e..8a4626083419c 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h @@ -14,11 +14,13 @@ using namespace onnxruntime::webgpu; class MatMulNBitsProgram final : public Program { public: - MatMulNBitsProgram(uint32_t output_number, int components_b, bool has_zero_points, bool use_block32) : Program{"MatMulNBits"}, - output_number_{output_number}, - components_b_{components_b}, - has_zero_points_{has_zero_points}, - use_block32_{use_block32} { + MatMulNBitsProgram(uint32_t output_number, uint32_t block_size, uint32_t tile_m, int components_b, bool has_zero_points, bool is_intel) : Program{"MatMulNBits"}, + output_number_{output_number}, + block_size_{block_size}, + tile_m_{tile_m}, + components_b_{components_b}, + has_zero_points_{has_zero_points}, + is_intel_{is_intel} { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -26,23 +28,11 @@ class MatMulNBitsProgram final : public Program { private: uint32_t output_number_; + uint32_t block_size_; + uint32_t tile_m_; int components_b_; bool has_zero_points_; - bool use_block32_; -}; - -class MatMulNBitsProgramPrefill final : public Program { - public: - MatMulNBitsProgramPrefill() : Program{"MatMulNBitsPrefill"} { - } - - Status GenerateShaderCode(ShaderHelper& sh) const override; - WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES( - {"M", ProgramUniformVariableDataType::Uint32}, - {"N", ProgramUniformVariableDataType::Uint32}, - {"K", ProgramUniformVariableDataType::Uint32}, - {"K4", ProgramUniformVariableDataType::Uint32}, - {"K8", ProgramUniformVariableDataType::Uint32}); + bool is_intel_; }; class MatMulNBits final : public WebGpuKernel { From 5afab787db9489cc4210bc4b1a809ab29037c1a5 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 17 Dec 2024 10:59:20 -0800 Subject: [PATCH 06/25] Update python version metadata (remove 3.7, 3.8, 3.9; add 3.13). 
(#23067) ### Description * Update python version metadata to be in sync with latest python packages (onnxruntime, onnxruntime-gpu and onnxruntime-qnn). * Update black format target-version to 3.10, and use lintrunner to format all files. * Update the lintrunner installation command line to be consistent. * Include `requirements-lintrunner.txt` in `requirements-dev.txt` to avoid duplicated settings. ### Motivation and Context https://github.com/microsoft/onnxruntime/issues/22993 Python support by numpy: https://numpy.org/neps/nep-0029-deprecation_policy.html#drop-schedule ``` On Apr 05, 2024 drop support for Python 3.9 On Apr 04, 2025 drop support for Python 3.10 ``` --- .github/workflows/lint.yml | 4 +--- .lintrunner.toml | 18 +++++------------- docs/Coding_Conventions_and_Standards.md | 12 +++--------- .../orttraining/python/training/artifacts.py | 11 +++++++---- .../ortmodule/_graph_transition_manager.py | 5 +++-- pyproject.toml | 3 ++- requirements-dev.txt | 3 +-- setup.py | 8 +++----- 8 files changed, 25 insertions(+), 39 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 64785574c7728..8d966d358de01 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -45,7 +45,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - # Version range or exact version of Python to use, using SemVer's version range syntax. Reads from .python-version if unset. + # Use the version configured in target-version of [tool.black] section in pyproject.toml. python-version: "3.10" - name: Setup Rust uses: actions-rs/toolchain@v1 @@ -55,12 +55,10 @@ jobs: - name: Update PATH run: | echo "$HOME/.local/bin" >> "$GITHUB_PATH" - - name: Install dependencies run: | set -e -x python -m pip install --user -r requirements-dev.txt - python -m pip install --user lintrunner lintrunner-adapters lintrunner init - name: Run lintrunner on all files run: | diff --git a/.lintrunner.toml b/.lintrunner.toml index be46ba0baabdb..5ef9ad9337f57 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -2,31 +2,23 @@ # You can install the dependencies and initialize with # # ```sh -# pip install lintrunner lintrunner-adapters +# pip install -r requirements-lintrunner.txt # lintrunner init # ``` # # This will install lintrunner on your system and download all the necessary # dependencies to run linters locally. -# If you want to see what lintrunner init will install, run -# `lintrunner init --dry-run`. # -# To lint local changes: +# To format local changes: # # ```bash -# lintrunner +# lintrunner -a # ``` # -# To lint all files: +# To format all files: # # ```bash -# lintrunner --all-files -# ``` -# -# To format files: -# -# ```bash -# lintrunner f --all-files +# lintrunner -a --all-files # ``` # # To read more about lintrunner, see [wiki](https://github.com/pytorch/pytorch/wiki/lintrunner). diff --git a/docs/Coding_Conventions_and_Standards.md b/docs/Coding_Conventions_and_Standards.md index f18f1036efee8..02af7ddaa49be 100644 --- a/docs/Coding_Conventions_and_Standards.md +++ b/docs/Coding_Conventions_and_Standards.md @@ -164,22 +164,16 @@ dependencies to run linters locally. If you want to see what lintrunner init will install, run `lintrunner init --dry-run`. 
-To lint local changes: - -```bash -lintrunner -``` - -To format files and apply suggestions: +To format local changes: ```bash lintrunner -a ``` -To lint all files: +To format all files: ```bash -lintrunner --all-files +lintrunner -a --all-files ``` To show help text: diff --git a/orttraining/orttraining/python/training/artifacts.py b/orttraining/orttraining/python/training/artifacts.py index c98e5bcd97092..31591c0156b14 100644 --- a/orttraining/orttraining/python/training/artifacts.py +++ b/orttraining/orttraining/python/training/artifacts.py @@ -185,10 +185,13 @@ def build(self, *inputs_to_loss): logging.info("Custom op library provided: %s", custom_op_library) custom_op_library_path = pathlib.Path(custom_op_library) - with onnxblock.base(loaded_model, model_path), ( - onnxblock.custom_op_library(custom_op_library_path) - if custom_op_library is not None - else contextlib.nullcontext() + with ( + onnxblock.base(loaded_model, model_path), + ( + onnxblock.custom_op_library(custom_op_library_path) + if custom_op_library is not None + else contextlib.nullcontext() + ), ): _ = training_block(*[output.name for output in loaded_model.graph.output]) training_model, eval_model = training_block.to_model_proto() diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_transition_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_transition_manager.py index 22627749c316c..d9cae8e1f99e8 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_transition_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_transition_manager.py @@ -867,8 +867,9 @@ def _get_exported_model( assert model_info_for_export.export_mode is not None, "Please use a concrete instance of ExecutionManager" try: - with torch.no_grad(), stage3_export_context( - enable_zero_stage3_support, stage3_param_handle, flattened_module + with ( + torch.no_grad(), + stage3_export_context(enable_zero_stage3_support, stage3_param_handle, flattened_module), ): required_export_kwargs = { "input_names": model_info_for_export.onnx_graph_input_names, # did not contains parameters as its input yet diff --git a/pyproject.toml b/pyproject.toml index 6429df2722b2d..40e6eb96dff94 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,8 @@ line-length = 120 # NOTE: Do not extend the exclude list. Edit .lintrunner.toml instead extend-exclude = "cmake|onnxruntime/core/flatbuffers/" -target-version = ["py37", "py38", "py39", "py310", "py311"] +# NOTE: use the minimum supported python version as target-version +target-version = ["py310"] [tool.isort] # NOTE: Do not extend the exclude list. 
Edit .lintrunner.toml instead diff --git a/requirements-dev.txt b/requirements-dev.txt index 1b5ca65cf8037..b95b85781a398 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,6 @@ -black>=22.3 +-r requirements-lintrunner.txt cerberus flatbuffers -isort jinja2 numpy onnx diff --git a/setup.py b/setup.py index 1ca31cb0019f0..c1580eeb9e8f9 100644 --- a/setup.py +++ b/setup.py @@ -529,6 +529,8 @@ def finalize_options(self): "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Operating System :: MacOS", "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Mathematics", "Topic :: Scientific/Engineering :: Artificial Intelligence", @@ -537,14 +539,10 @@ def finalize_options(self): "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", - "Operating System :: Microsoft :: Windows", - "Operating System :: MacOS", + "Programming Language :: Python :: 3.13", ] if enable_training or enable_training_apis: From 54edb43e7768c79794495079287f5906fb05991a Mon Sep 17 00:00:00 2001 From: Enrico Galli Date: Tue, 17 Dec 2024 12:51:16 -0800 Subject: [PATCH 07/25] [WebNN] Fixes MLTensor caching across different contexts (#23100) We weren't checking that MLTensors were from the same context before reusing them. Found while debugging microsoft/webnn-developer-preview#69 --- js/web/lib/wasm/jsep/webnn/tensor-manager.ts | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/js/web/lib/wasm/jsep/webnn/tensor-manager.ts b/js/web/lib/wasm/jsep/webnn/tensor-manager.ts index 4932691bda65b..45b5b8b4fa932 100644 --- a/js/web/lib/wasm/jsep/webnn/tensor-manager.ts +++ b/js/web/lib/wasm/jsep/webnn/tensor-manager.ts @@ -141,8 +141,9 @@ class TensorWrapper { return this.mlContext.readTensor(this.mlTensor); } - public sameTypeAndShape(dataType: MLOperandDataType, shape: readonly number[]): boolean { + public canReuseTensor(context: MLContext, dataType: MLOperandDataType, shape: readonly number[]): boolean { return ( + this.mlContext === context && this.dataType === dataType && this.tensorShape.length === shape.length && this.tensorShape.every((v, i) => v === shape[i]) @@ -176,12 +177,13 @@ class TensorIdTracker { } public async ensureTensor( + context: MLContext, dataType: MLOperandDataType, shape: readonly number[], copyOld: boolean, ): Promise { if (this.wrapper) { - if (this.wrapper.sameTypeAndShape(dataType, shape)) { + if (this.wrapper.canReuseTensor(context, dataType, shape)) { return this.wrapper.tensor; } else { if (copyOld) { @@ -288,7 +290,7 @@ class TensorManagerImpl implements TensorManager { if (!tensor) { throw new Error('Tensor not found.'); } - return tensor.ensureTensor(dataType, shape, copyOld); + return tensor.ensureTensor(this.backend.currentContext, dataType, shape, copyOld); } public upload(tensorId: TensorId, data: Uint8Array): void { @@ -354,15 +356,15 @@ class TensorManagerImpl implements TensorManager { readable: boolean, ): Promise { const sessionId = this.backend.currentSessionId; + const context = this.backend.currentContext; for (const [index, tensor] of this.freeTensors.entries()) { - if 
(tensor.sameTypeAndShape(dataType, shape)) { + if (tensor.canReuseTensor(context, dataType, shape)) { LOG_DEBUG('verbose', () => `[WebNN] Reusing tensor {dataType: ${dataType}, shape: ${shape}}`); const wrapper = this.freeTensors.splice(index, 1)[0]; wrapper.sessionId = sessionId; return wrapper; } } - const context = this.backend.currentContext; LOG_DEBUG('verbose', () => `[WebNN] MLContext.createTensor {dataType: ${dataType}, shape: ${shape}}`); const tensor = await context.createTensor({ dataType, From a5b60ec03fcf06bfcf8e1e732b87b11793bb64b7 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Wed, 18 Dec 2024 04:52:08 +0800 Subject: [PATCH 08/25] [WebNN] Add limit to QDQ ops (#23076) WebNN requires the `scale_shape` to be a subsample of the `input_shape`. --- js/web/docs/webnn-operators.md | 4 +-- .../webnn/builders/impl/qdq_op_builder.cc | 34 +++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index e0012e70a7dec..af7348dba532f 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -27,7 +27,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim | Cos | ai.onnx(7+) | cos | ✓ | ✓ | | | CumSum | ai.onnx(11-13, 14+) | cumulativeSum | ✓ | ✓ | 'axis' input should be a constant | | Div | ai.onnx(7-12, 13, 14+) | div | ✓ | ✓ | | -| DequantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | dequantizeLinear | ✗ | ✓ | | +| DequantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | dequantizeLinear | ✓ | ✓ | The shape of x_scale should be a subsample of the shape of input | | Dropout | ai.onnx(7-9, 10-11, 12, 13-21, 22+) | identity | ✓ | ✓ | Only supports test mode | | Einsum | ai.onnx(12+) | reshape, transpose, matmul, reduceSum, mul, triangular | ✓ | ✓ | | | Elu | ai.onnx(7+) | elu | ✓ | ✓ | WebNN CPU backend only supports 'alpha' value is 1.0 | @@ -71,7 +71,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim | Pad | ai.onnx(7-10, 11-12, 13-17, 18, 19-20, 21+) | pad | ✓ | ✓ | modes == 'wrap' is not supported | | Pow | ai.onnx(7-11, 12, 13-14, 15+) | pow | ✓ | ✓ | | | PRelu | ai.onnx(7-8, 9-15, 16+) | prelu | ✓ | ✓ | WebNN CPU backend restricts the last dimension of input and slope to be same (Chromium issue: https://issues.chromium.org/issues/335517470) | -| QuantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | quantizeLinear | ✗ | ✓ | | +| QuantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | quantizeLinear | ✓ | ✓ | The shape of x_scale should be a subsample of the shape of input | | Reciprocal | ai.onnx(7-12, 13+) | reciprocal | ✓ | ✓ | | | ReduceL1 | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceL1 | ✓ | ✓ | Input 'axes' if present should be a constant | | ReduceL2 | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceL2 | ✓ | ✓ | Input 'axes' if present should be a constant | diff --git a/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc index b71507a871bf6..bd7c23d75eba4 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc @@ -22,6 +22,8 @@ class QDQOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override ORT_MUST_USE_RESULT; // Operator support related. 
+ bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const override; }; @@ -118,6 +120,38 @@ Status QDQOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, return Status::OK(); } +// Operator support related. +bool QDQOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, + const Node& node, + const WebnnDeviceType /* device_type */, + const logging::Logger& logger) const { + const auto& input_defs = node.InputDefs(); + + std::vector input_shape; + std::vector scale_shape; + + if (!GetShape(*input_defs[0], input_shape, logger) || !GetShape(*input_defs[1], scale_shape, logger)) { + return false; + } + + // WebNN requires the scale_shape to be a subsample of the input_shape. + if (scale_shape.size() > input_shape.size()) { + LOGS(logger, VERBOSE) << "The rank of scale is larger than the rank of input"; + return false; + } + + for (size_t i = 0; i < scale_shape.size(); ++i) { + auto scale_dim = scale_shape[scale_shape.size() - i - 1]; + auto input_dim = input_shape[input_shape.size() - i - 1]; + if (input_dim % scale_dim != 0) { + LOGS(logger, VERBOSE) << "The shape of scale is not a subsample of the shape of input"; + return false; + } + } + + return true; +} + bool QDQOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node, const emscripten::val& wnn_limits, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); From e76bd2f5e98dda71b96e93d23ca275ca8a3eec47 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 17 Dec 2024 13:39:13 -0800 Subject: [PATCH 09/25] Update CODEOWNERS: remove onnxruntime-es (#21677) Removing this restriction for now. --- CODEOWNERS | 4 ---- 1 file changed, 4 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index f7dfa419500d0..a55067ed798d8 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -9,10 +9,6 @@ /onnxruntime/core/graph/contrib_ops/quantization_defs.* @microsoft/onnxruntime-mlas /onnxruntime/core/mlas/** @microsoft/onnxruntime-mlas -# build pipelines and workflows -/tools/ci_build/github/azure-pipelines @microsoft/onnxruntime-es -/.github/workflows @microsoft/onnxruntime-es - # Dependencies requirements-dev.txt @microsoft/onnxruntime-admin requirements-doc.txt @microsoft/onnxruntime-admin From 5d7030e4c6b3af907ad792cad3d14068202d91c1 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 18 Dec 2024 10:42:10 -0800 Subject: [PATCH 10/25] Revert DML pipeline changes (#23135) ### Description Previously we wanted to add DirectML EP to existing onnxruntime Windows CUDA packages. After careful consideration, we will postpone the change. This PR reverts some pipeline changes previously made by @mszhanyi and @jchen351 . 
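For context on what this revert touches beyond the pipeline YAML: the diff below also strips the `NO_CUDA_TEST`/`NO_DML_TEST` runtime guards out of the test helpers (see the removed lines in `default_providers.cc` and `cuda_op_test_utils.h`). A minimal, hypothetical sketch of that guard pattern follows; it uses `std::getenv` and a placeholder provider type as stand-ins for onnxruntime's `Env::Default().GetEnvironmentVar` and the real EP factory, so it illustrates the idea rather than the actual implementation.

```cpp
#include <cstdlib>
#include <memory>
#include <string>

// Placeholder standing in for the real CUDA/DML execution provider type.
struct DummyExecutionProvider {};

// Sketch of the guard being reverted: when the combined CUDA+DML pipeline
// exported NO_CUDA_TEST=1, the default-provider helper returned nullptr and
// callers skipped or early-returned instead of registering the CUDA EP.
std::unique_ptr<DummyExecutionProvider> DefaultCudaExecutionProviderSketch() {
  const char* no_cuda_test = std::getenv("NO_CUDA_TEST");
  if (no_cuda_test != nullptr && std::string(no_cuda_test) == "1") {
    return nullptr;  // test code treats nullptr as "skip CUDA on this run"
  }
  return std::make_unique<DummyExecutionProvider>();
}
```

After the revert, the helpers construct their providers unconditionally again and the tests no longer branch on these environment variables.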
--- .../java/ai/onnxruntime/InferenceTest.java | 1 - .../providers/ProviderOptionsTest.java | 3 - onnxruntime/test/common/cuda_op_test_utils.h | 9 --- .../test/contrib_ops/beam_search_test.cc | 6 -- .../test/contrib_ops/bias_dropout_op_test.cc | 3 - .../contrib_ops/bitmask_dropout_op_test.cc | 7 +-- .../test/contrib_ops/layer_norm_test.cc | 13 +---- .../test/contrib_ops/matmul_4bits_test.cc | 28 +++------- .../matmul_integer_to_float_test.cc | 2 +- .../test/contrib_ops/tensor_op_test.cc | 20 +------ .../test/framework/allocation_planner_test.cc | 56 ------------------- .../test/framework/cuda/fence_cuda_test.cc | 9 --- .../test/framework/inference_session_test.cc | 23 -------- .../test/framework/memcpy_transformer_test.cc | 25 --------- .../test/framework/sparse_kernels_test.cc | 6 -- onnxruntime/test/lora/lora_test.cc | 10 ---- onnxruntime/test/providers/base_tester.cc | 11 ---- .../providers/compare_provider_test_utils.cc | 5 -- onnxruntime/test/providers/cpu/model_tests.cc | 12 ---- .../providers/cpu/tensor/gather_op_test.cc | 6 -- .../providers/cpu/tensor/grid_sample_test.cc | 8 +-- .../test/providers/cuda/cuda_provider_test.cc | 2 +- .../cuda/test_cases/allocator_cuda_test.cc | 4 +- .../attention_kernel_options_test.cc | 6 +- .../cuda/test_cases/beam_search_topk.cc | 2 +- .../test_cases/blkq4_fp16_gemm_sm80_test.cc | 10 ++-- .../cuda_execution_provider_test.cc | 4 +- .../cuda/test_cases/cuda_utils_test.cc | 2 +- .../cuda/test_cases/gemm_options_test.cc | 12 ++-- .../cuda/test_cases/greedy_search_top_one.cc | 2 +- .../test_cases/reduction_functions_test.cc | 12 ++-- .../test/python/onnx_backend_test_series.py | 41 +++++--------- .../onnx_backend_test_series_filters.jsonc | 7 --- onnxruntime/test/util/default_providers.cc | 18 ------ .../jobs/steps/py_packaging_test_step.yml | 21 ------- .../stages/nuget-combine-cuda-stage.yml | 2 - .../stages/nuget-win-cuda-packaging-stage.yml | 7 +-- .../stages/py-gpu-packaging-stage.yml | 2 +- .../stages/py-win-gpu-stage.yml | 27 +++++---- .../templates/jobs/win-ci-vs-2022-job.yml | 28 ++-------- .../azure-pipelines/templates/win-ci.yml | 48 +++------------- .../win-gpu-cuda-ci-pipeline.yml | 26 +-------- .../win-gpu-dml-ci-pipeline.yml | 4 +- 43 files changed, 94 insertions(+), 456 deletions(-) delete mode 100644 tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java index 15d89b536b39a..e11537492d3a7 100644 --- a/java/src/test/java/ai/onnxruntime/InferenceTest.java +++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java @@ -737,7 +737,6 @@ public void testCoreML() throws OrtException { runProvider(OrtProvider.CORE_ML); } - @Disabled("DirectML Java API hasn't been supported yet") @Test @EnabledIfSystemProperty(named = "USE_DML", matches = "1") public void testDirectML() throws OrtException { diff --git a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java index fa0b6fd0ef9d9..57c4eb3577fd0 100644 --- a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java +++ b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java @@ -27,7 +27,6 @@ import java.util.HashMap; import java.util.Map; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.condition.DisabledIfSystemProperty; import org.junit.jupiter.api.condition.EnabledIfSystemProperty; public class ProviderOptionsTest { @@ -35,7 
+34,6 @@ public class ProviderOptionsTest { @Test @EnabledIfSystemProperty(named = "USE_CUDA", matches = "1") - @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1") public void testCUDAOptions() throws OrtException { // Test standard options OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0); @@ -63,7 +61,6 @@ public void testCUDAOptions() throws OrtException { @Test @EnabledIfSystemProperty(named = "USE_TENSORRT", matches = "1") - @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1") public void testTensorRT() throws OrtException { // Test standard options OrtTensorRTProviderOptions rtOpts = new OrtTensorRTProviderOptions(0); diff --git a/onnxruntime/test/common/cuda_op_test_utils.h b/onnxruntime/test/common/cuda_op_test_utils.h index d3e069237217e..6f3e460628566 100644 --- a/onnxruntime/test/common/cuda_op_test_utils.h +++ b/onnxruntime/test/common/cuda_op_test_utils.h @@ -5,11 +5,6 @@ #include "test/util/include/default_providers.h" -#define SKIP_CUDA_TEST_WITH_DML \ - if (DefaultCudaExecutionProvider() == nullptr) { \ - GTEST_SKIP() << "CUDA Tests are not supported while DML is enabled"; \ - } - namespace onnxruntime { namespace test { @@ -18,10 +13,6 @@ namespace test { int GetCudaArchitecture(); inline bool HasCudaEnvironment(int min_cuda_architecture) { - if (DefaultCudaExecutionProvider() == nullptr) { - return false; - } - if (DefaultCudaExecutionProvider().get() == nullptr) { return false; } diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc b/onnxruntime/test/contrib_ops/beam_search_test.cc index 8c69e2d9810b8..9f4ee071925b4 100644 --- a/onnxruntime/test/contrib_ops/beam_search_test.cc +++ b/onnxruntime/test/contrib_ops/beam_search_test.cc @@ -75,9 +75,6 @@ TEST(BeamSearchTest, GptBeamSearchFp32) { const char* const output_names[] = {"sequences"}; Ort::SessionOptions session_options; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif #ifdef USE_CUDA OrtCUDAProviderOptionsV2 cuda_options; cuda_options.use_tf32 = false; @@ -171,9 +168,6 @@ TEST(BeamSearchTest, GptBeamSearchFp16) { bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get()); if (enable_cuda || enable_rocm) { Ort::SessionOptions session_options; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif #ifdef USE_CUDA OrtCUDAProviderOptionsV2 cuda_options; cuda_options.use_tf32 = false; diff --git a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc index 297629b015796..027d4b3fff1b0 100644 --- a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc +++ b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc @@ -181,9 +181,6 @@ void RunBiasDropoutTest(const bool use_mask, const std::vector& input_s t.SetCustomOutputVerifier(output_verifier); std::vector> t_eps; #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } t_eps.emplace_back(DefaultCudaExecutionProvider()); #elif USE_ROCM t_eps.emplace_back(DefaultRocmExecutionProvider()); diff --git a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc index 26b0e3a4dd7a9..7ca4e1004066c 100644 --- a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc +++ b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc @@ -61,9 +61,7 @@ void RunTestForInference(const std::vector& input_dims, bool has_ratio std::vector> test_eps; #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - 
test_eps.emplace_back(DefaultCudaExecutionProvider()); - } + test_eps.emplace_back(DefaultCudaExecutionProvider()); #elif USE_ROCM test_eps.emplace_back(DefaultRocmExecutionProvider()); #endif @@ -124,9 +122,6 @@ void RunTestForTraining(const std::vector& input_dims) { std::vector> dropout_eps; #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } dropout_eps.emplace_back(DefaultCudaExecutionProvider()); #elif USE_ROCM dropout_eps.emplace_back(DefaultRocmExecutionProvider()); diff --git a/onnxruntime/test/contrib_ops/layer_norm_test.cc b/onnxruntime/test/contrib_ops/layer_norm_test.cc index b414a98c4e756..46082e1b0cd31 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_test.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. #include "test/providers/compare_provider_test_utils.h" -#include "test/util/include/default_providers.h" namespace onnxruntime { namespace test { @@ -80,20 +79,14 @@ static void TestLayerNorm(const std::vector& x_dims, #endif #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - test.CompareWithCPU(kCudaExecutionProvider); - } + test.CompareWithCPU(kCudaExecutionProvider); #elif USE_ROCM test.CompareWithCPU(kRocmExecutionProvider); +#elif USE_DML + test.CompareWithCPU(kDmlExecutionProvider); #elif USE_WEBGPU test.CompareWithCPU(kWebGpuExecutionProvider); #endif - -#ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - test.CompareWithCPU(kDmlExecutionProvider); - } -#endif } TEST(CudaKernelTest, LayerNorm_NullInput) { diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 6dedce24e7e07..eebe9197573c6 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -490,17 +490,13 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura std::vector> execution_providers; if (use_float16) { #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - execution_providers.push_back(DefaultCudaExecutionProvider()); - } + execution_providers.push_back(DefaultCudaExecutionProvider()); #endif #ifdef USE_ROCM execution_providers.push_back(DefaultRocmExecutionProvider()); #endif #ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - execution_providers.push_back(DefaultDmlExecutionProvider()); - } + execution_providers.push_back(DefaultDmlExecutionProvider()); #endif #ifdef USE_WEBGPU execution_providers.push_back(DefaultWebGpuExecutionProvider()); @@ -518,11 +514,8 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura } // namespace TEST(MatMulNBits, Float16Cuda) { -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) - std::vector has_gidx_options = {true, false}; - if (DefaultDmlExecutionProvider() != nullptr) { - has_gidx_options.assign(1, false); - } +#if defined(USE_CUDA) || defined(USE_ROCM) + auto has_gidx_options = {true, false}; #else auto has_gidx_options = {false}; #endif @@ -533,9 +526,7 @@ TEST(MatMulNBits, Float16Cuda) { for (auto block_size : {16, 32, 64, 128}) { for (auto has_gidx : has_gidx_options) { #ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); - } + RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); #else RunTest(M, N, K, block_size, 0, false, true, has_gidx); RunTest(M, N, K, block_size, 0, true, true, has_gidx, false); @@ -548,16 
+539,12 @@ TEST(MatMulNBits, Float16Cuda) { } TEST(MatMulNBits, Float16Large) { -#if defined(USE_CUDA) || defined(USE_DML) +#ifdef USE_DML // For some reason, the A10 machine that runs these tests during CI has a much bigger error than all retail // machines we tested on. All consumer-grade machines from Nvidia/AMD/Intel seem to pass these tests with an // absolute error of 0.08, but the A10 has errors going as high as 0.22. Ultimately, given the large number // of elements in this test, ULPs should probably be used instead of absolute/relative tolerances. - float abs_error = 0.05f; - if (DefaultDmlExecutionProvider() != nullptr) { - // it means the ep is dml in runtime, the abs_error is changed to 0.3f - abs_error = 0.3f; - } + float abs_error = 0.3f; #elif USE_WEBGPU // See Intel A770 to pass these tests with an absolute error of 0.08. float abs_error = 0.08f; @@ -573,6 +560,7 @@ TEST(MatMulNBits, Float16Large) { } } } + #endif // defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index d88c3131a4ca5..8d7629b5fda1c 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -227,7 +227,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) { } // DML EP supports Float16 output type and Signed A Matrix and Unsigned B Matric for Float32 output -#if defined(USE_DML) && !defined(USE_CUDA) +#if defined(USE_DML) TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) { RunMatMulIntegerToFloatTest(); diff --git a/onnxruntime/test/contrib_ops/tensor_op_test.cc b/onnxruntime/test/contrib_ops/tensor_op_test.cc index d5e2ddebfe67f..bc2ff5f4f724d 100644 --- a/onnxruntime/test/contrib_ops/tensor_op_test.cc +++ b/onnxruntime/test/contrib_ops/tensor_op_test.cc @@ -121,15 +121,7 @@ void MeanVarianceNormalizationAcrossChannels(bool across_channels, bool normaliz test.AddAttribute("normalize_variance", normalize_variance ? one : zero); test.AddInput("input", {N, C, H, W}, X); test.AddOutput("output", {N, C, H, W}, result); -#if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider}); - } else if (DefaultDmlExecutionProvider() == nullptr) { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider}); - } -#else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider}); // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator. -#endif } void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_variance) { @@ -196,15 +188,7 @@ void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_va test.AddAttribute("normalize_variance", normalize_variance ? 
one : zero); test.AddInput("input", {N, C, H, W}, X); test.AddOutput("output", {N, C, H, W}, result); -#if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider}); - } else if (DefaultDmlExecutionProvider() == nullptr) { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider}); - } -#else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider}); // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator. -#endif } TEST(MVNContribOpTest, MeanVarianceNormalizationCPUTest_Version1_TO_8) { @@ -246,9 +230,7 @@ TEST(UnfoldTensorOpTest, LastDim) { std::vector> execution_providers; #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - execution_providers.push_back(DefaultCudaExecutionProvider()); - } + execution_providers.push_back(DefaultCudaExecutionProvider()); #endif execution_providers.push_back(DefaultCpuExecutionProvider()); tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index adab93908cdc4..eaebac177ca91 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -28,7 +28,6 @@ using json = nlohmann::json; #ifdef USE_CUDA #include "core/providers/cuda/cuda_execution_provider.h" #include "core/providers/cuda/cuda_provider_factory.h" -#include "test/common/cuda_op_test_utils.h" #endif // USE_CUDA #include "core/session/onnxruntime_session_options_config_keys.h" using namespace ONNX_NAMESPACE; @@ -897,9 +896,6 @@ TEST_F(PlannerTest, LocationPlanningForPassThroughExplicitAndImplicitSubgraphInp SessionOptions so; InferenceSession sess{so, GetEnvironment()}; - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1042,9 +1038,6 @@ TEST_F(PlannerTest, LocationPlanningForInitializersOnlyUsedInANestedSubgraph) { SessionOptions so; InferenceSession sess{so, GetEnvironment()}; - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1152,9 +1145,6 @@ TEST_F(PlannerTest, LocationPlanningForInitializersUsedOnDifferentDevicesInMainG SessionOptions so; InferenceSession sess{so, GetEnvironment()}; - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1247,9 +1237,6 @@ TEST_F(PlannerTest, LocationPlanningForImplicitInputsWithoutExplicitConsumersInM SessionOptions so; InferenceSession sess{so, GetEnvironment()}; - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1282,10 +1269,6 @@ TEST_F(PlannerTest, LocationPlanningForImplicitInputsWithoutExplicitConsumersInM // Test MultiStream scenario for the graph: // node1(CPU ep)->node2(CPU ep)->node3(CUDA ep)->node4(CPU ep) TEST_F(PlannerTest, MultiStream) { -#if defined(USE_CUDA) && defined(USE_DML) - 
SKIP_CUDA_TEST_WITH_DML; -#endif - ONNX_NAMESPACE::TensorProto tensor; tensor.add_dims(1); tensor.add_float_data(1.0f); @@ -1304,7 +1287,6 @@ TEST_F(PlannerTest, MultiStream) { onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA(); auto epFactory = ep.CreateExecutionProviderFactory(epi); std::unique_ptr execution_provider = epFactory->CreateProvider(); - ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider))); CreatePlan({}, false); @@ -1332,9 +1314,6 @@ TEST_F(PlannerTest, MultiStream) { // node3 // All 3 nodes are CUDA EP, node1 is in stream0, node2 is in stream1, node3 is in stream2 TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); std::unique_ptr<::onnxruntime::KernelDef> cudaKernelAdd = KernelDefBuilder().SetName("Add").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3"); @@ -1376,9 +1355,6 @@ TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) { // stream 1: node2 (CPU EP) // node1's output, which is consumed by both node2 and node3, is in CPU. TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif MemcpyToHostInCuda_TransposeInCudaAndCpu("./testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json"); EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams"; EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan[0]->steps_.size(), 5) << "stream 0 has 5 steps"; @@ -1400,11 +1376,6 @@ TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) { // TODO(leca): there is a bug in the corresponding graph that node2 will be visited twice when traversing node1's output nodes // (see: for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) in BuildExecutionPlan()). 
We can just break the loop and don't need the extra variables once it is fixed TEST_F(PlannerTest, MultiStreamMultiOutput) { -#if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } -#endif std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("RNN").Provider(kCudaExecutionProvider).SinceVersion(7).Build(); std::string Graph_input1("Graph_input1"), Graph_input2("Graph_input2"), Graph_input3("Graph_input3"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"); std::vector input1{Arg(Graph_input1), Arg(Graph_input2), Arg(Graph_input3)}, output1{Arg(Arg1), Arg(Arg2)}, input2{Arg(Arg1), Arg(Arg2)}, output2{Arg(Arg3)}; @@ -1442,9 +1413,6 @@ TEST_F(PlannerTest, MultiStreamMultiOutput) { // TODO(leca): the ideal case is there is only 1 wait step before launching node3, // as there is a specific order between node1 and node2 if they are in the same stream, thus node3 will only need to wait the latter one TEST_F(PlannerTest, MultiStream2NodesSameStreamConsumedBy1NodeInDifferentStream) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); std::string Graph_input1("Graph_input1"), Graph_input2("Graph_input2"), Graph_input3("Graph_input3"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3"); std::vector input1{Arg(Graph_input1)}, input2{Arg(Graph_input2)}, output1{Arg(Arg1)}, output2{Arg(Arg2)}, input3{Arg(Arg1), Arg(Arg2)}, output3{Arg(Arg3)}; @@ -1482,9 +1450,6 @@ TEST_F(PlannerTest, MultiStream2NodesSameStreamConsumedBy1NodeInDifferentStream) #if !defined(__wasm__) && defined(ORT_ENABLE_STREAM) TEST_F(PlannerTest, ParaPlanCreation) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif TypeProto graph_in_type; graph_in_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT); auto* graph_in_shape = graph_in_type.mutable_tensor_type()->mutable_shape(); @@ -1926,10 +1891,6 @@ TEST_F(PlannerTest, ParaPlanCreation) { } TEST_F(PlannerTest, TestMultiStreamConfig) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif - const char* type = "DeviceBasedPartitioner"; constexpr size_t type_len = 22; @@ -2003,10 +1964,6 @@ TEST_F(PlannerTest, TestMultiStreamSaveConfig) { // Load with partition config where a node is missing, session load expected to fail. 
TEST_F(PlannerTest, TestMultiStreamMissingNodeConfig) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif - const char* config_file_path = "./testdata/multi_stream_models/conv_add_relu_single_stream_missing_node.json"; SessionOptions sess_opt; sess_opt.graph_optimization_level = TransformerLevel::Default; @@ -2027,9 +1984,6 @@ TEST_F(PlannerTest, TestMultiStreamMissingNodeConfig) { // Load with partition config where streams and devices has mismatch TEST_F(PlannerTest, TestMultiStreamMismatchDevice) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif const char* config_file_path = "./testdata/multi_stream_models/conv_add_relu_single_stream_mismatch_device.json"; SessionOptions sess_opt; sess_opt.graph_optimization_level = TransformerLevel::Default; @@ -2055,9 +2009,6 @@ TEST_F(PlannerTest, TestCpuIf) { sess_opt.graph_optimization_level = TransformerLevel::Default; InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/multi_stream_models/cpu_if.onnx")); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(sess.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_STATUS_OK(sess.Load()); ASSERT_STATUS_OK(sess.Initialize()); @@ -2118,17 +2069,10 @@ TEST_F(PlannerTest, TestCpuIf) { // onnx.save(model, 'issue_19480.onnx') // TEST(AllocationPlannerTest, ReusedInputCrossDifferentStreams) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif - SessionOptions sess_opt; sess_opt.graph_optimization_level = TransformerLevel::Default; InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/multi_stream_models/issue_19480.onnx")); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); status = sess.Load(); status = sess.Initialize(); diff --git a/onnxruntime/test/framework/cuda/fence_cuda_test.cc b/onnxruntime/test/framework/cuda/fence_cuda_test.cc index 3e5ef30e7ebef..e28327941dda4 100644 --- a/onnxruntime/test/framework/cuda/fence_cuda_test.cc +++ b/onnxruntime/test/framework/cuda/fence_cuda_test.cc @@ -115,9 +115,6 @@ TEST(CUDAFenceTests, DISABLED_PartOnCPU) { SessionOptions so; FenceCudaTestInferenceSession session(so, GetEnvironment()); ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model)); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_TRUE(session.Initialize().IsOK()); ASSERT_TRUE(1 == CountCopyNodes(graph)); @@ -167,9 +164,6 @@ TEST(CUDAFenceTests, TileWithInitializer) { SessionOptions so; FenceCudaTestInferenceSession session(so, GetEnvironment()); ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model)); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_STATUS_OK(session.Initialize()); @@ -230,9 +224,6 @@ TEST(CUDAFenceTests, TileWithComputedInput) { SessionOptions so; FenceCudaTestInferenceSession session(so, GetEnvironment()); ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model)); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_TRUE(session.Initialize().IsOK()); diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 
7f4616c964e33..740c566794f15 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -34,7 +34,6 @@ #ifdef USE_CUDA #include "core/providers/cuda/cuda_provider_factory.h" #include "core/providers/cuda/gpu_data_transfer.h" -#include "test/common/cuda_op_test_utils.h" #endif #ifdef USE_TENSORRT #include "core/providers/tensorrt/tensorrt_provider_options.h" @@ -636,9 +635,6 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) { InferenceSession session_object(so, GetEnvironment()); #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #endif #ifdef USE_ROCM @@ -693,9 +689,6 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions2) { InferenceSession session_object(so, GetEnvironment()); #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #endif #ifdef USE_ROCM @@ -1049,9 +1042,6 @@ static void TestBindHelper(const std::string& log_str, if (bind_provider_type == kCudaExecutionProvider || bind_provider_type == kRocmExecutionProvider) { #ifdef USE_CUDA auto provider = DefaultCudaExecutionProvider(); - if (provider == nullptr) { - return; - } gpu_provider = provider.get(); ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(provider))); #endif @@ -1647,9 +1637,6 @@ TEST(InferenceSessionTests, Test3LayerNestedSubgraph) { #if USE_TENSORRT ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultTensorrtExecutionProvider())); #elif USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #elif USE_ROCM ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultRocmExecutionProvider())); @@ -1802,9 +1789,6 @@ TEST(InferenceSessionTests, Test2LayerNestedSubgraph) { #if USE_TENSORRT ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultTensorrtExecutionProvider())); #elif USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #elif USE_ROCM ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultRocmExecutionProvider())); @@ -2160,9 +2144,6 @@ TEST(InferenceSessionTests, TestStrictShapeInference) { #ifdef USE_CUDA // disable it, since we are going to enable parallel execution with cuda ep TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; SessionOptions so; @@ -2186,10 +2167,6 @@ TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) { } TEST(InferenceSessionTests, TestArenaShrinkageAfterRun) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif - OrtArenaCfg arena_cfg; arena_cfg.arena_extend_strategy = 1; // kSameAsRequested diff --git a/onnxruntime/test/framework/memcpy_transformer_test.cc b/onnxruntime/test/framework/memcpy_transformer_test.cc index 2313f00e4d123..6e86e5b58aead 100644 --- a/onnxruntime/test/framework/memcpy_transformer_test.cc +++ b/onnxruntime/test/framework/memcpy_transformer_test.cc @@ -9,9 +9,6 @@ #include "default_providers.h" #include "gtest/gtest.h" 
#include "test_utils.h" -#ifdef USE_CUDA -#include "test/common/cuda_op_test_utils.h" -#endif #include "test/test_environment.h" #include "asserts.h" @@ -77,9 +74,6 @@ void ExpectCopy(const onnxruntime::Node& source, const std::string copy_op, #ifdef USE_CUDA TEST(TransformerTest, MemcpyTransformerTest) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif std::unordered_map domain_to_version; domain_to_version[kOnnxDomain] = 7; auto model = std::make_shared("test", false, ModelMetaData(), PathString(), @@ -112,9 +106,7 @@ TEST(TransformerTest, MemcpyTransformerTest) { KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; -#if defined(USE_CUDA) ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); -#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -137,9 +129,6 @@ TEST(TransformerTest, MemcpyTransformerTest) { } TEST(TransformerTest, MemcpyTransformerTestCudaFirst) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif std::unordered_map domain_to_version; domain_to_version[kOnnxDomain] = 7; auto model = std::make_shared("test", false, ModelMetaData(), PathString(), @@ -172,9 +161,7 @@ TEST(TransformerTest, MemcpyTransformerTestCudaFirst) { KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -294,11 +281,7 @@ TEST(TransformerTest, TestInitializerDuplicationInSubgraph) { KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -340,11 +323,7 @@ TEST(TransformerTest, MemcpyTransformerTestGraphInputConsumedOnMultipleDevices) KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -446,11 +425,7 @@ TEST(TransformerTest, MemcpyTransformerTestImplicitInputConsumedOnMultipleDevice KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc 
index db9592c293fd0..7bd6b47f52b7d 100644 --- a/onnxruntime/test/framework/sparse_kernels_test.cc +++ b/onnxruntime/test/framework/sparse_kernels_test.cc @@ -1457,9 +1457,6 @@ TEST(SparseTensorConversionTests, CsrConversion) { #ifdef USE_CUDA auto cuda_provider = DefaultCudaExecutionProvider(); - if (cuda_provider == nullptr) { - return; - } auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[0]; { auto cuda_transfer = cuda_provider->GetDataTransfer(); @@ -1687,9 +1684,6 @@ TEST(SparseTensorConversionTests, CooConversion) { #ifdef USE_CUDA auto cuda_provider = DefaultCudaExecutionProvider(); - if (cuda_provider == nullptr) { - return; - } auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[0]; { auto cuda_transfer = cuda_provider->GetDataTransfer(); diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc index 9d8febb453739..e8291a36447ca 100644 --- a/onnxruntime/test/lora/lora_test.cc +++ b/onnxruntime/test/lora/lora_test.cc @@ -201,16 +201,6 @@ TEST(LoraAdapterTest, Load) { #ifdef USE_CUDA TEST(LoraAdapterTest, VerifyDeviceCopy) { - // These checks for CUDA/DML combined Package, Be careful when you want to remove it! - if (DefaultCudaExecutionProvider() == nullptr) { - GTEST_SKIP() << "Skip This Test Due to this EP is null"; - } -#ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - GTEST_FAIL() << "It should not run with DML EP"; - } -#endif - auto cpu_ep = DefaultCpuExecutionProvider(); auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0]; auto cuda_ep = DefaultCudaExecutionProvider(); diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index b0958e05dc373..aa68f68f3e735 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -532,17 +532,6 @@ void BaseTester::Run(ExpectResult expect_result, const std::string& expected_fai so.use_deterministic_compute = use_determinism_; so.graph_optimization_level = TransformerLevel::Default; // 'Default' == off - // remove nullptr in execution_providers. - // it's a little ugly but we need to do this because DefaultXXXExecutionProvider() can return nullptr in Runtime. - // And there're many places adding DefaultXXXExecutionProvider() to execution_providers directly. 
- if (execution_providers != nullptr) { - execution_providers->erase(std::remove(execution_providers->begin(), execution_providers->end(), nullptr), execution_providers->end()); - if (execution_providers->size() == 0) { - // In fact, no ep is needed to run - return; - } - } - Run(so, expect_result, expected_failure_string, excluded_provider_types, run_options, execution_providers, options); } diff --git a/onnxruntime/test/providers/compare_provider_test_utils.cc b/onnxruntime/test/providers/compare_provider_test_utils.cc index 9acb37c24ddd0..386a5656d8a01 100644 --- a/onnxruntime/test/providers/compare_provider_test_utils.cc +++ b/onnxruntime/test/providers/compare_provider_test_utils.cc @@ -53,11 +53,6 @@ void CompareOpTester::CompareWithCPU(const std::string& target_provider_type, SetTestFunctionCalled(); std::unique_ptr target_execution_provider = GetExecutionProvider(target_provider_type); -#if defined(USE_CUDA) && defined(USE_DML) - if (target_execution_provider == nullptr) { - return; - } -#endif ASSERT_TRUE(target_execution_provider != nullptr) << "provider_type " << target_provider_type << " is not supported."; diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index b46c253fb8ed9..e3c86a137484f 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -491,18 +491,6 @@ ::std::vector<::std::basic_string> GetParameterStrings() { // the number of times these are run to reduce the CI time. provider_names.erase(provider_name_cpu); #endif - -#if defined(USE_CUDA) && defined(USE_DML) - const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); - if (no_cuda_ep_test == "1") { - provider_names.erase(provider_name_cuda); - } - const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); - if (no_dml_ep_test == "1") { - provider_names.erase(provider_name_dml); - } -#endif - std::vector> v; // Permanently exclude following tests because ORT support only opset starting from 7, // Please make no more changes to the list diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc index 0f23e4c39d7e2..be79a6d29d539 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc @@ -3,9 +3,6 @@ #include "core/session/onnxruntime_session_options_config_keys.h" #include "gtest/gtest.h" -#if USE_CUDA -#include "test/common/cuda_op_test_utils.h" -#endif #include "test/providers/provider_test_utils.h" #include "test/util/include/default_providers.h" @@ -125,9 +122,6 @@ TEST(GatherOpTest, Gather_invalid_index_gpu) { 4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.0f, 0.0f, 0.0f}); -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif // On GPU, just set the value to 0 instead of report error. 
exclude all other providers test #if defined(USE_CUDA) diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc index 7e1a2384d7fc6..05cfb5c13d689 100644 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc @@ -15,13 +15,11 @@ std::vector> GetExecutionProviders(int opset execution_providers.emplace_back(DefaultCpuExecutionProvider()); #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - if (opset_version < 20) { - execution_providers.emplace_back(DefaultCudaExecutionProvider()); + if (opset_version < 20) { + execution_providers.emplace_back(DefaultCudaExecutionProvider()); #ifdef ENABLE_CUDA_NHWC_OPS - execution_providers.push_back(DefaultCudaNHWCExecutionProvider()); + execution_providers.push_back(DefaultCudaNHWCExecutionProvider()); #endif - } } #endif diff --git a/onnxruntime/test/providers/cuda/cuda_provider_test.cc b/onnxruntime/test/providers/cuda/cuda_provider_test.cc index e745e1bcb8171..e57cdd2350fab 100644 --- a/onnxruntime/test/providers/cuda/cuda_provider_test.cc +++ b/onnxruntime/test/providers/cuda/cuda_provider_test.cc @@ -11,7 +11,7 @@ ProviderInfo_CUDA& GetProviderInfo_CUDA_Test(); namespace test { namespace cuda { -TEST(CudaEpUnittest, All) { +TEST(CUDA_EP_Unittest, All) { onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA_Test(); ep.TestAll(); } diff --git a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc index ec7c6ec4e1605..b413d04fe81e8 100644 --- a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc @@ -11,7 +11,7 @@ namespace onnxruntime { namespace test { -TEST(CudaEpAllocatorTest, CUDAAllocatorTest) { +TEST(AllocatorTest, CUDAAllocatorTest) { OrtDevice::DeviceId cuda_device_id = 0; // ensure CUDA device is available. @@ -77,7 +77,7 @@ TEST(CudaEpAllocatorTest, CUDAAllocatorTest) { } // test that we fallback to smaller allocations if the growth of the arena exceeds the available memory -TEST(CudaEpAllocatorTest, CUDAAllocatorFallbackTest) { +TEST(AllocatorTest, CUDAAllocatorFallbackTest) { OrtDevice::DeviceId cuda_device_id = 0; size_t free = 0; diff --git a/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc index ccdc56de5937d..b2e986f680763 100644 --- a/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc @@ -17,7 +17,7 @@ using onnxruntime::contrib::attention::AttentionBackend; namespace onnxruntime { namespace test { -TEST(CudaEpAttentionKernelOptionsTest, NonZeroValue) { +TEST(AttentionKernelOptionsTest, NonZeroValue) { { AttentionKernelOptions options; int value = static_cast(AttentionBackend::FLASH_ATTENTION) | static_cast(AttentionBackend::EFFICIENT_ATTENTION); @@ -156,7 +156,7 @@ TEST(CudaEpAttentionKernelOptionsTest, NonZeroValue) { } // Test all environment variables take effect when option value is 0. 
-TEST(CudaEpAttentionKernelOptionsTest, DefaultOptionWithEnvVar) { +TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) { constexpr int value = 0; ScopedEnvironmentVariables scoped_env_vars{ EnvVarMap{ @@ -186,7 +186,7 @@ TEST(CudaEpAttentionKernelOptionsTest, DefaultOptionWithEnvVar) { } // Test default min sequence lengths when environment variables are not set. -TEST(CudaEpAttentionKernelOptionsTest, DefaultMinSeqLens) { +TEST(AttentionKernelOptionsTest, DefaultMinSeqLens) { constexpr int value = 0; ScopedEnvironmentVariables scoped_env_vars{ EnvVarMap{ diff --git a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc index 97d50398a5550..a0d115c41c14b 100644 --- a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc +++ b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc @@ -68,7 +68,7 @@ void ComputeTopKReference(const std::vector& values, } } -TEST(CudaEpTestBeamSearch, TopK) { +TEST(TestBeamSearch, TopK) { int32_t batch_size = 4; int32_t beam_size = 4; int32_t vocab_size = 50257; diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc index d8fb3c8256012..3fcb9045ee7e6 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc @@ -230,7 +230,7 @@ void testPrepack(int rows, int columns) { } // TODO: code runs on CPU, but this is for sm80 only, maybe enable only when test on sm80 -TEST(CudaEpBlkQ4_GEMM, PrepackSm80Test) { +TEST(BlkQ4_GEMM, PrepackSm80Test) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported @@ -263,7 +263,7 @@ TEST(CudaEpBlkQ4_GEMM, PrepackSm80Test) { testPrepack(256, 256); } -TEST(CudaEpBlkQ4_GEMM, Sm80RowBlockingTest) { +TEST(BlkQ4_GEMM, Sm80RowBlockingTest) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported @@ -292,7 +292,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80RowBlockingTest) { onnxruntime::cuda::test::run_blkq4_gemm<64, false, false, true>(256, 1024, 576); } -TEST(CudaEpBlkQ4_GEMM, Sm80ColBlockingTest) { +TEST(BlkQ4_GEMM, Sm80ColBlockingTest) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported @@ -305,7 +305,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80ColBlockingTest) { onnxruntime::cuda::test::run_blkq4_gemm<64, true, false, true>(256, 1024, 576); } -TEST(CudaEpBlkQ4_GEMM, Sm80SmallMTest) { +TEST(BlkQ4_GEMM, Sm80SmallMTest) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported @@ -326,7 +326,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80SmallMTest) { onnxruntime::cuda::test::run_blkq4_gemm<64, true, true, true>(16, 1024, 576); } -TEST(CudaEpBlkQ4_GEMM, Sm80SmallTileKernelTest) { +TEST(BlkQ4_GEMM, Sm80SmallTileKernelTest) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc index f3222c6f683b5..72357ec7e02d2 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc +++ 
b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc @@ -19,7 +19,7 @@ namespace cuda { namespace test { // TODO: Since the "DeferredRelease" has been migrated to CudaStream class, // we should migrate this test from CudaEP unit test to CudaStream unit test. -TEST(CudaEpTestDeferredRelease, WithArena) { +TEST(TestDeferredRelease, WithArena) { // Create CUDA EP. CUDAExecutionProviderInfo info; CUDAExecutionProvider ep(info); @@ -52,7 +52,7 @@ TEST(CudaEpTestDeferredRelease, WithArena) { ORT_THROW_IF_ERROR(ep.OnRunEnd(true, run_opts)); } -TEST(CudaEpTestDeferredRelease, WithoutArena) { +TEST(TestDeferredRelease, WithoutArena) { // Create CUDA EP. CUDAExecutionProviderInfo info; CUDAExecutionProvider ep(info); diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc index 3538c7add94d0..7468a5718425e 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc @@ -40,7 +40,7 @@ void TestFillCorrectness(size_t num_elements, TElement value) { } } // namespace -TEST(CudaEpUnittest, FillCorrectness) { +TEST(CudaUtilsTest, FillCorrectness) { TestFillCorrectness(1 << 20, 1); TestFillCorrectness(1 << 20, 2); TestFillCorrectness(1 << 20, 3); diff --git a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc index 518fde5804b23..6636e15040393 100644 --- a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc @@ -10,7 +10,7 @@ namespace onnxruntime { namespace cuda { namespace test { -TEST(CudaEpGemmOptions, TestDefaultOptions) { +TEST(CudaGemmOptions, TestDefaultOptions) { HalfGemmOptions gemm_options; ASSERT_FALSE(gemm_options.IsCompute16F()); #if defined(USE_CUDA) @@ -22,7 +22,7 @@ TEST(CudaEpGemmOptions, TestDefaultOptions) { #endif } -TEST(CudaEpGemmOptions, TestCompute16F) { +TEST(CudaGemmOptions, TestCompute16F) { HalfGemmOptions gemm_options; gemm_options.Initialize(1); ASSERT_TRUE(gemm_options.IsCompute16F()); @@ -35,7 +35,7 @@ TEST(CudaEpGemmOptions, TestCompute16F) { #endif } -TEST(CudaEpGemmOptions, NoReducedPrecision) { +TEST(CudaGemmOptions, NoReducedPrecision) { HalfGemmOptions gemm_options; gemm_options.Initialize(2); ASSERT_FALSE(gemm_options.IsCompute16F()); @@ -48,7 +48,7 @@ TEST(CudaEpGemmOptions, NoReducedPrecision) { #endif } -TEST(CudaEpGemmOptions, Pedantic) { +TEST(CudaGemmOptions, Pedantic) { HalfGemmOptions gemm_options; gemm_options.Initialize(4); ASSERT_FALSE(gemm_options.IsCompute16F()); @@ -61,7 +61,7 @@ TEST(CudaEpGemmOptions, Pedantic) { #endif } -TEST(CudaEpGemmOptions, Compute16F_Pedantic) { +TEST(CudaGemmOptions, Compute16F_Pedantic) { HalfGemmOptions gemm_options; gemm_options.Initialize(5); ASSERT_TRUE(gemm_options.IsCompute16F()); @@ -74,7 +74,7 @@ TEST(CudaEpGemmOptions, Compute16F_Pedantic) { #endif } -TEST(CudaEpGemmOptions, Compute16F_NoReducedPrecision) { +TEST(CudaGemmOptions, Compute16F_NoReducedPrecision) { HalfGemmOptions gemm_options; gemm_options.Initialize(3); ASSERT_TRUE(gemm_options.IsCompute16F()); diff --git a/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc b/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc index ba24cf858e80f..6b8cd68de0fca 100644 --- a/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc +++ 
b/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc @@ -41,7 +41,7 @@ void ComputeTop1Reference(const std::vector& values, } } -TEST(CudaEpTestGreedySearch, TopOne) { +TEST(TestGreedySearch, TopOne) { int32_t batch_size = 4; int32_t vocab_size = 50257; int32_t batch_x_vocab = batch_size * vocab_size; diff --git a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc index 09c9c1e5f8f6a..ec7e98528504e 100644 --- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc @@ -179,7 +179,7 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e } } // namespace -TEST(CudaEpReductionFunctionsTest, ReduceRowToScalar) { +TEST(ReductionFunctionsTest, ReduceRowToScalar) { TestReduceRowToScalarApis(3); TestReduceRowToScalarApis(19); TestReduceRowToScalarApis(123); @@ -188,7 +188,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceRowToScalar) { TestReduceRowToScalarApis(941736, 2e-4f); } -TEST(CudaEpReductionFunctionsTest, ReduceRowsToRow) { +TEST(ReductionFunctionsTest, ReduceRowsToRow) { for (int m : {3, 193, 2945}) { for (int n : {3, 193, 2945}) { TestReduceRowsToRow(m, n, true); @@ -197,7 +197,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceRowsToRow) { } } -TEST(CudaEpReductionFunctionsTest, ReduceColumnsToColumn) { +TEST(ReductionFunctionsTest, ReduceColumnsToColumn) { for (int m : {3, 193, 2945}) { for (int n : {3, 193, 2945}) { TestReduceColumnsToColumn(m, n); @@ -205,7 +205,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceColumnsToColumn) { } } -TEST(CudaEpReductionFunctionsTest, BufferOffsets) { +TEST(ReductionFunctionsTest, BufferOffsets) { const int m = 2048; const int n = 1024; const TensorShape shape{m, n}; @@ -240,7 +240,7 @@ TEST(CudaEpReductionFunctionsTest, BufferOffsets) { } } -TEST(CudaEpReductionFunctionsTest, InvalidBufferSize) { +TEST(ReductionFunctionsTest, InvalidBufferSize) { const int m = 2048; const int n = 1024; const TensorShape shape{m, n}; @@ -262,7 +262,7 @@ TEST(CudaEpReductionFunctionsTest, InvalidBufferSize) { ASSERT_FALSE(status.IsOK()); } -TEST(CudaEpReductionFunctionsTest, GetApplicableMatrixReduction) { +TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) { auto test_get_applicable_matrix_reduction = [](cudnnReduceTensorOp_t cudnn_op, const std::vector& dims, const std::vector& axes, diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index a274b90dc042f..8fc76da3495a8 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -105,7 +105,7 @@ def load_jsonc(basename: str): return json.loads("\n".join(lines)) -def create_backend_test(devices: list[str], test_name=None): +def create_backend_test(test_name=None): """Creates an OrtBackendTest and adds its TestCase's to global scope so unittest will find them.""" overrides = load_jsonc("onnx_backend_test_series_overrides.jsonc") @@ -126,29 +126,30 @@ def create_backend_test(devices: list[str], test_name=None): else: filters = load_jsonc("onnx_backend_test_series_filters.jsonc") current_failing_tests = apply_filters(filters, "current_failing_tests") + if platform.architecture()[0] == "32bit": current_failing_tests += apply_filters(filters, "current_failing_tests_x86") - if backend.supports_device("DNNL") or "DNNL" in devices: + if backend.supports_device("DNNL"): 
current_failing_tests += apply_filters(filters, "current_failing_tests_DNNL") - if backend.supports_device("NNAPI") or "NNAPI" in devices: + if backend.supports_device("NNAPI"): current_failing_tests += apply_filters(filters, "current_failing_tests_NNAPI") - if backend.supports_device("OPENVINO_GPU") or "OPENVINO_GPU" in devices: + if backend.supports_device("OPENVINO_GPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_GPU") - if backend.supports_device("OPENVINO_CPU") or "OPENVINO_CPU" in devices: + if backend.supports_device("OPENVINO_CPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_CPU_FP32") current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_CPU_FP16") - if backend.supports_device("OPENVINO_NPU") or "OPENVINO_NPU" in devices: + if backend.supports_device("OPENVINO_NPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_NPU") - if backend.supports_device("OPENVINO") or "OPENVINO" in devices: + if backend.supports_device("OPENVINO"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_opset18") - if backend.supports_device("MIGRAPHX") or "MIGRAPHX" in devices: + if backend.supports_device("MIGRAPHX"): current_failing_tests += apply_filters(filters, "current_failing_tests_MIGRAPHX") if backend.supports_device("WEBGPU"): @@ -157,16 +158,8 @@ def create_backend_test(devices: list[str], test_name=None): # Skip these tests for a "pure" DML onnxruntime python wheel. We keep these tests enabled for instances where both DML and CUDA # EPs are available (Windows GPU CI pipeline has this config) - these test will pass because CUDA has higher precedence than DML # and the nodes are assigned to only the CUDA EP (which supports these tests) - if (backend.supports_device("DML") and not backend.supports_device("GPU")) or "DML" in devices: + if backend.supports_device("DML") and not backend.supports_device("GPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_pure_DML") - # exclude CUDA EP when DML test is running. - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider,CUDAExecutionProvider" - elif backend.supports_device("DML") and "DML" not in devices: - # exclude DML EP when CUDA test is running. - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider,DmlExecutionProvider" - else: - # exclude TRT EP temporarily and only test CUDA EP to retain previous behavior - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider" filters = ( current_failing_tests @@ -179,6 +172,9 @@ def create_backend_test(devices: list[str], test_name=None): backend_test.exclude("(" + "|".join(filters) + ")") print("excluded tests:", filters) + # exclude TRT EP temporarily and only test CUDA EP to retain previous behavior + os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider" + # import all test cases at global scope to make # them visible to python.unittest. globals().update(backend_test.enable_report().test_cases) @@ -203,15 +199,6 @@ def parse_args(): help="Only run tests that match this value. 
Matching is regex based, and '.*' is automatically appended", ) - parser.add_argument( - "--devices", - type=str, - choices=["CPU", "CUDA", "MIGRAPHX", "DNNL", "DML", "OPENVINO_GPU", "OPENVINO_CPU", "OPENVINO_NPU", "OPENVINO"], - nargs="+", # allows multiple values - default=["CPU"], # default to ["CPU"] if no input is given - help="Select one or more devices CPU, CUDA, MIGRAPHX, DNNL, DML, OPENVINO_GPU, OPENVINO_CPU, OPENVINO_NPU, OPENVINO", - ) - # parse just our args. python unittest has its own args and arg parsing, and that runs inside unittest.main() parsed, unknown = parser.parse_known_args() sys.argv = sys.argv[:1] + unknown @@ -222,5 +209,5 @@ def parse_args(): if __name__ == "__main__": args = parse_args() - create_backend_test(args.devices, args.test_name) + create_backend_test(args.test_name) unittest.main() diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index 7ecaab6fedb02..f083ab14ad133 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -750,13 +750,6 @@ "^test_reduce_log_sum_empty_set_cpu", "^test_reduce_log_sum_exp_empty_set_cpu", "^test_reduce_prod_empty_set_cpu", - // Bug: DML EP some how executes these CUDA tests and failed - // TODO: Remove these tests when DML EP is fixed - "^test_convtranspose_autopad_same_cuda", - "^test_asin_example_cuda", - "^test_dynamicquantizelinear_cuda", - "^test_dynamicquantizelinear_expanded_cuda", - "^test_reduce_min_empty_set_cuda", //Bug: DML EP does not execute operators with an empty input tensor //TODO: Resolve as a graph implementation that returns a constant inf tensor with appropriate strides "^test_reduce_min_empty_set_cpu" diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index 59926bbcd1c6f..c1564997c42b8 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -122,12 +122,6 @@ std::unique_ptr DefaultOpenVINOExecutionProvider() { std::unique_ptr DefaultCudaExecutionProvider() { #ifdef USE_CUDA -#ifdef USE_DML - const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); - if (no_cuda_ep_test == "1") { - return nullptr; - } -#endif OrtCUDAProviderOptionsV2 provider_options{}; provider_options.do_copy_in_default_stream = true; provider_options.use_tf32 = false; @@ -140,12 +134,6 @@ std::unique_ptr DefaultCudaExecutionProvider() { #ifdef ENABLE_CUDA_NHWC_OPS std::unique_ptr DefaultCudaNHWCExecutionProvider() { #if defined(USE_CUDA) -#ifdef USE_DML - const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); - if (no_cuda_ep_test == "1") { - return nullptr; - } -#endif OrtCUDAProviderOptionsV2 provider_options{}; provider_options.do_copy_in_default_stream = true; provider_options.use_tf32 = false; @@ -332,12 +320,6 @@ std::unique_ptr DefaultCannExecutionProvider() { std::unique_ptr DefaultDmlExecutionProvider() { #ifdef USE_DML -#ifdef USE_CUDA - const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); - if (no_dml_ep_test == "1") { - return nullptr; - } -#endif ConfigOptions config_options{}; if (auto factory = DMLProviderFactoryCreator::CreateFromDeviceOptions(config_options, nullptr, false, false)) { return factory->CreateProvider(); diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml 
b/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml deleted file mode 100644 index 9a721c65de332..0000000000000 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml +++ /dev/null @@ -1,21 +0,0 @@ -parameters: -- name: EP_NAME - type: string - default: CPU - -- name: PYTHON_VERSION - type: string - -steps: -- powershell: | - python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq - Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*cp${{ replace(parameters.PYTHON_VERSION,'.','') }}*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} - mkdir -p $(Agent.TempDirectory)\ort_test_data - Copy-Item -Path $(Build.sourcesDirectory)/onnxruntime/test/python/onnx_backend_test_series.py -Destination $(Agent.TempDirectory)\ort_test_data - Copy-Item -Recurse -Path $(Build.sourcesDirectory)/onnxruntime/test/testdata -Destination $(Agent.TempDirectory)\ort_test_data - cd $(Agent.TempDirectory)\ort_test_data - python onnx_backend_test_series.py --devices ${{ parameters.EP_NAME }} -v - cd $(Agent.TempDirectory) - Remove-Item -Path $(Agent.TempDirectory)\ort_test_data -Recurse -Force - workingDirectory: '$(Build.sourcesDirectory)' - displayName: 'Run Python Tests with ${{ parameters.EP_NAME }} EP' \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml index 0b3eac0110abc..9c7fbc24ab1b6 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml @@ -50,8 +50,6 @@ stages: win_trt_home: ${{ parameters.win_trt_home }} win_cuda_home: ${{ parameters.win_cuda_home }} buildJava: ${{ parameters.buildJava }} - SpecificArtifact: ${{ parameters.SpecificArtifact }} - BuildId: ${{ parameters.BuildId }} - template: nuget-cuda-packaging-stage.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index d6b25c98936f0..445066f08995a 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -34,7 +34,7 @@ parameters: displayName: Specific Artifact's BuildId type: string default: '0' - + - name: buildJava type: boolean @@ -50,14 +50,13 @@ stages: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --build_csharp --parallel + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} - ComboTests: true # Windows CUDA with TensorRT Packaging - template: ../templates/win-ci.yml parameters: @@ -69,7 +68,7 @@ stages: msbuildPlatform: x64 CudaVersion: ${{ parameters.CudaVersion }} packageName: x64-tensorrt - buildparameter: 
--use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --parallel + buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index f7235e3ad2076..947e4f99b984f 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -56,7 +56,7 @@ stages: PYTHON_VERSION: ${{ python_version }} EP_NAME: gpu CudaVersion: ${{ parameters.cuda_version }} - EP_BUILD_FLAGS: --use_dml --enable_lto --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --enable_lto --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" use_tensorrt: True - ${{ if eq(parameters.enable_linux_cuda, true) }}: diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml index dd0539f751c89..aa7f2845fc0fa 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml @@ -33,7 +33,7 @@ parameters: - Release - RelWithDebInfo - MinSizeRel - + - name: use_tensorrt type: boolean default: false @@ -134,7 +134,7 @@ stages: --cmake_generator "$(VSGenerator)" --enable_pybind --enable_onnx_tests - --parallel 4 --use_binskim_compliant_compile_flags --update --build + --parallel --use_binskim_compliant_compile_flags --update --build $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} ${{ variables.trt_build_flag }} workingDirectory: '$(Build.BinariesDirectory)' @@ -206,20 +206,19 @@ stages: DownloadTRT: ${{ parameters.use_tensorrt }} - task: PowerShell@2 - displayName: 'Install Third Party Dependencies' + displayName: 'Install ONNX' inputs: filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/windows/install_third_party_deps.ps1' workingDirectory: '$(Build.BinariesDirectory)' arguments: -cpu_arch x64 -install_prefix $(Build.BinariesDirectory)\${{ parameters.cmake_build_type }}\installed -build_config ${{ parameters.cmake_build_type }} - - template: jobs/steps/py_packaging_test_step.yml - parameters: - EP_NAME: DML - PYTHON_VERSION: ${{ parameters.PYTHON_VERSION }} - - - template: jobs/steps/py_packaging_test_step.yml - parameters: - EP_NAME: CUDA - PYTHON_VERSION: ${{ parameters.PYTHON_VERSION }} - - + - powershell: | + python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq + Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*cp${{ replace(parameters.PYTHON_VERSION,'.','') }}*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} + mkdir -p $(Agent.TempDirectory)\ort_test_data + Copy-Item -Path $(Build.sourcesDirectory)/onnxruntime/test/python/onnx_backend_test_series.py -Destination $(Agent.TempDirectory)\ort_test_data + Copy-Item 
-Recurse -Path $(Build.sourcesDirectory)/onnxruntime/test/testdata -Destination $(Agent.TempDirectory)\ort_test_data + cd $(Agent.TempDirectory)\ort_test_data + python onnx_backend_test_series.py + workingDirectory: '$(Build.sourcesDirectory)' + displayName: 'Run Python Tests' diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml index 7bdd069de711b..e8f391a73fa7b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml @@ -218,32 +218,16 @@ jobs: - powershell: | python3 -m pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml -qq Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname} + workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' displayName: 'Install onnxruntime wheel' - ${{ if eq(parameters.RunOnnxRuntimeTests, true) }}: - - ${{ if and(contains(parameters.additionalBuildFlags, 'use_cuda'), contains(parameters.additionalBuildFlags, 'use_dml')) }}: - - powershell: | - python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }} - workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' - displayName: 'Run tests excluding CUDA tests' - env: - NO_CUDA_TEST: '1' - GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*:*cpu_*models*' # Exclude CUDA EP tests under providers/cuda/ and cpu models test - PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)' # For onnxruntime4j_test to find dependent dlls - - powershell: | - python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }} - workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' - displayName: 'Run tests excluding DML tests' - env: - NO_DML_TEST: '1' - GTEST_FILTER: '-*cpu_*models*' - PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)' - - ${{ else }}: - - powershell: | - python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }} - workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' - displayName: 'Run tests' + - powershell: | + python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }} + + workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' + displayName: 'Run tests' - ${{ if 
eq(parameters.GenerateDocumentation, true) }}: - task: PythonScript@0 diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index e046997b4f49a..59950433b3d40 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -25,7 +25,7 @@ parameters: - name: runTests type: boolean - default: false + default: true - name: buildJava type: boolean @@ -71,10 +71,6 @@ parameters: - 11.8 - 12.2 -- name: ComboTests - type: boolean - default: false - - name: SpecificArtifact displayName: Use Specific Artifact type: boolean @@ -226,7 +222,7 @@ stages: condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --test --skip_submodule_sync --build_shared_lib --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}' + arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}' workingDirectory: '$(Build.BinariesDirectory)' - ${{ else }}: - powershell: | @@ -338,10 +334,6 @@ stages: displayName: 'Clean Agent Directories' condition: always() - - script: - echo ${{ parameters.SpecificArtifact }} - displayName: 'Print Specific Artifact' - - checkout: self clean: true submodules: none @@ -407,35 +399,13 @@ stages: displayName: 'Append dotnet x86 Directory to PATH' condition: and(succeeded(), eq('${{ parameters.buildArch}}', 'x86')) - - ${{ if eq(parameters.ComboTests, 'true') }}: - - task: PythonScript@0 - displayName: 'test excludes CUDA' - condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' - workingDirectory: '$(Build.BinariesDirectory)' - env: - NO_CUDA_TEST: '1' - GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*' # Exclude CUDA EP tests under providers/cuda/ - - task: PythonScript@0 - displayName: 'test excludes DML' - condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' - workingDirectory: '$(Build.BinariesDirectory)' - env: - NO_DML_TEST: '1' - - ${{ else }}: - - task: PythonScript@0 - displayName: 'test' - condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' - workingDirectory: 
'$(Build.BinariesDirectory)' - + - task: PythonScript@0 + displayName: 'test' + condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' + workingDirectory: '$(Build.BinariesDirectory)' # Previous stage only assembles the java binaries, testing will be done in this stage with GPU machine - ${{ if eq(parameters.buildJava, 'true') }}: - template: make_java_win_binaries.yml diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml index 67fd47c3150af..47ece37e66e09 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml @@ -62,28 +62,4 @@ stages: RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} ORT_EP_NAME: CUDA WITH_CACHE: true - MachinePool: onnxruntime-Win2022-GPU-A10 - -- stage: cuda_dml - dependsOn: [] - jobs: - - template: templates/jobs/win-ci-vs-2022-job.yml - parameters: - BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env_cuda.bat - buildArch: x64 - additionalBuildFlags: >- - --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" - --enable_cuda_profiling --enable_transformers_tool_test - --use_dml - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 - --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON - --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON - msbuildPlatform: x64 - isX86: false - job_name_suffix: x64_RelWithDebInfo - RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - ORT_EP_NAME: CUDA - EnablePython: false - WITH_CACHE: true - MachinePool: onnxruntime-Win2022-GPU-A10 + MachinePool: onnxruntime-Win2022-GPU-A10 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml index 911d99cd2adf3..94b0aa680d54d 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml @@ -43,11 +43,11 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml + additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} ORT_EP_NAME: DML WITH_CACHE: false - MachinePool: onnxruntime-Win2022-GPU-dml-A10 + MachinePool: onnxruntime-Win2022-GPU-dml-A10 \ No newline at end of file From 31e6e1010c9a51ba908f01fd03cf01cd55a75b83 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Wed, 18 Dec 2024 16:29:26 -0800 Subject: [PATCH 11/25] gather elements webgpu implementation (#23137) Increases operator coverage for WebGPU EP. 
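For reference, a minimal CPU-side sketch of the GatherElements semantics the new kernel implements (illustrative only; `GatherElements2D` and the rank-2 restriction are hypothetical simplifications, not code from this patch). The output takes the shape of `indices`, and each element is read from `input` with the coordinate along `axis` replaced by the index value, wrapping negative indices by the axis dimension:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Reference GatherElements, restricted to rank-2 tensors for brevity.
// axis == 0: output[i][j] = input[indices[i][j]][j]
// axis == 1: output[i][j] = input[i][indices[i][j]]
std::vector<float> GatherElements2D(const std::vector<float>& input, int64_t rows, int64_t cols,
                                    const std::vector<int64_t>& indices, int64_t ind_rows, int64_t ind_cols,
                                    int64_t axis) {
  assert(axis == 0 ? ind_cols <= cols : ind_rows <= rows);  // non-axis dims must fit inside the input
  std::vector<float> output(static_cast<size_t>(ind_rows * ind_cols));
  const int64_t axis_dim_limit = (axis == 0) ? rows : cols;
  for (int64_t i = 0; i < ind_rows; ++i) {
    for (int64_t j = 0; j < ind_cols; ++j) {
      int64_t idx = indices[static_cast<size_t>(i * ind_cols + j)];
      if (idx < 0) {
        idx += axis_dim_limit;  // same negative-index wrapping done via uniforms.axis_dim_limit in the shader
      }
      assert(idx >= 0 && idx < axis_dim_limit);
      const int64_t r = (axis == 0) ? idx : i;
      const int64_t c = (axis == 0) ? j : idx;
      output[static_cast<size_t>(i * ind_cols + j)] = input[static_cast<size_t>(r * cols + c)];
    }
  }
  return output;
}
```

The WGSL body generated below performs the same wrapping before overwriting the `axis` coordinate of the output indices with `u32(idx)`.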
--- .../webgpu/tensor/gather_elements.cc | 86 +++++++++++++++++++ .../providers/webgpu/tensor/gather_elements.h | 36 ++++++++ .../webgpu/webgpu_execution_provider.cc | 4 +- .../cpu/tensor/gather_elements_op_test.cc | 3 +- 4 files changed, 126 insertions(+), 3 deletions(-) create mode 100644 onnxruntime/core/providers/webgpu/tensor/gather_elements.cc create mode 100644 onnxruntime/core/providers/webgpu/tensor/gather_elements.h diff --git a/onnxruntime/core/providers/webgpu/tensor/gather_elements.cc b/onnxruntime/core/providers/webgpu/tensor/gather_elements.cc new file mode 100644 index 0000000000000..00d8caf2624a9 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/gather_elements.cc @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/common/inlined_containers.h" +#include "core/providers/webgpu/tensor/gather_elements.h" +#include "core/providers/cpu/tensor/utils.h" +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" + +namespace onnxruntime { +namespace webgpu { + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + GatherElements, + kOnnxDomain, + 11, 12, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedFloatTypes()), + GatherElements); + +ONNX_OPERATOR_KERNEL_EX( + GatherElements, + kOnnxDomain, + 13, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedFloatTypes()), + GatherElements); + +Status GatherElementsProgram::GenerateShaderCode(ShaderHelper& shader) const { + const ShaderVariableHelper& input = shader.AddInput("input", ShaderUsage::UseUniform); + const ShaderVariableHelper& indices = shader.AddInput("indices", ShaderUsage::UseUniform); + const ShaderVariableHelper& output = shader.AddOutput("output", ShaderUsage::UseUniform); + + shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") + << "let output_indices = " << output.OffsetToIndices("global_idx") << ";\n" + << "var idx = " << indices.GetByOffset("global_idx") << ";\n" + << "if (idx < 0) {\n" + << " idx = idx + uniforms.axis_dim_limit;\n" + << "}\n" + << "var input_indices = output_indices;\n" + << input.IndicesSet("input_indices", "uniforms.axis", "u32(idx)") << ";\n" + << "let value = " << input.GetByIndices("input_indices") << ";\n" + << output.SetByOffset("global_idx", "value") << ";\n"; + + return Status::OK(); +} + +Status GatherElements::ComputeInternal(ComputeContext& context) const { + const auto* input_tensor = context.Input(0); + const TensorShape& input_shape = input_tensor->Shape(); + int64_t input_rank = input_shape.NumDimensions(); + + const auto* indices_tensor = context.Input(1); + const TensorShape& indices_shape = indices_tensor->Shape(); + + // Handle negative axis + int64_t axis = axis_; + if (axis < 0) { + axis += input_rank; + } + + auto axis_dim_limit = input_shape[axis]; + + auto output_dims = indices_shape.AsShapeVector(); + TensorShape output_shape(output_dims); + auto* output_tensor = context.Output(0, output_shape); + int64_t output_size = output_tensor->Shape().Size(); + + if (output_size == 0) { + return Status::OK(); + } + + GatherElementsProgram program{}; + program + .AddInputs({{input_tensor, ProgramTensorMetadataDependency::TypeAndRank}}) + .AddInputs({{indices_tensor, ProgramTensorMetadataDependency::TypeAndRank}}) + .AddOutputs({output_tensor}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + 
.AddUniformVariables({{static_cast(output_size)}, + {static_cast(axis_dim_limit)}, + {static_cast(axis)}}); + return context.RunProgram(program); +} + +} // namespace webgpu +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/webgpu/tensor/gather_elements.h b/onnxruntime/core/providers/webgpu/tensor/gather_elements.h new file mode 100644 index 0000000000000..f70bbda84c933 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/gather_elements.h @@ -0,0 +1,36 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/webgpu/webgpu_kernel.h" +#include "core/providers/webgpu/program.h" + +namespace onnxruntime { +namespace webgpu { + +class GatherElementsProgram final : public Program { + public: + GatherElementsProgram() : Program{"GatherElements"} {} + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32}, + {"axis_dim_limit", ProgramUniformVariableDataType::Int32}, + {"axis", ProgramUniformVariableDataType::Int32}); +}; + +class GatherElements final : public WebGpuKernel { + public: + GatherElements(const OpKernelInfo& info) : WebGpuKernel(info) { + axis_ = info.GetAttrOrDefault("axis", 0); + } + + Status ComputeInternal(ComputeContext& context) const override; + + private: + int64_t axis_; +}; + +} // namespace webgpu +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index 66209adf6f1a9..295a8de31ed50 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -649,8 +649,8 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, diff --git a/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc index 5b2d00bb956bf..81e51375b9992 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc @@ -389,9 +389,10 @@ TEST(GatherElementsOpTest, IndicesOutOfBounds) { // skip openvino which will not throw error message but will ensure no out-of-bound access // skip TensorRT because it doesn't support out of bounds indices // skip QNN because it doesn't support out of bounds indices + // skip WebGPU because it doesn't support out of bounds indices test.Run(OpTester::ExpectResult::kExpectFailure, "", {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider, kOpenVINOExecutionProvider, - kTensorrtExecutionProvider, kDmlExecutionProvider, kQnnExecutionProvider}); + kTensorrtExecutionProvider, kDmlExecutionProvider, kQnnExecutionProvider, kWebGpuExecutionProvider}); } TEST(GatherElementsOpTest, BigIndices) { From ae6dcc839eccb689524e07e4bc6a577d37ba9e33 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 18 Dec 2024 18:07:50 -0800 Subject: [PATCH 12/25] Revert "[js/webgpu] disable failed tests temporarily (#23127)" (#23130) ### Description This reverts commit 9115682d69d381e6f31f1431cf6b037bfd458536. 
### Motivation and Context --- js/web/test/data/ops/conv.jsonc | 160 +++++++++++++------------- js/web/test/data/ops/fused-conv.jsonc | 152 ++++++++++++------------ 2 files changed, 156 insertions(+), 156 deletions(-) diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc index f514ae5fa75e6..262503214a50a 100644 --- a/js/web/test/data/ops/conv.jsonc +++ b/js/web/test/data/ops/conv.jsonc @@ -391,48 +391,48 @@ } ] }, - // { - // "name": "conv - vectorize group - B", - // "operator": "Conv", - // "inputShapeDefinitions": "rankOnly", - // "opset": { "domain": "", "version": 17 }, - // "attributes": [ - // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - // { "name": "group", "data": 3, "type": "int" } - // ], - // "cases": [ - // { - // "name": "T[0]", - // "inputs": [ - // { - // "data": [ - // 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, - // 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0 - // ], - // "dims": [1, 3, 3, 3], - // "type": "float32" - // }, - // { - // "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], - // "dims": [3, 1, 2, 2], - // "type": "float32" - // }, - // { - // "data": [0.1, 0.2, 0.3], - // "dims": [3], - // "type": "float32" - // } - // ], - // "outputs": [ - // { - // "data": [27.1, 37.1, 57.1, 67.1, 293.2, 319.2, 371.2, 397.2, 847.3, 889.3, 409.3, 428.3], - // "dims": [1, 3, 2, 2], - // "type": "float32" - // } - // ] - // } - // ] - // }, + { + "name": "conv - vectorize group - B", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + }, + { + "data": [0.1, 0.2, 0.3], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [27.1, 37.1, 57.1, 67.1, 293.2, 319.2, 371.2, 397.2, 847.3, 889.3, 409.3, 428.3], + "dims": [1, 3, 2, 2], + "type": "float32" + } + ] + } + ] + }, { "name": "conv - vectorize group - C", "operator": "Conv", @@ -470,44 +470,44 @@ } ] }, - // { - // "name": "conv - vectorize group - D", - // "operator": "Conv", - // "inputShapeDefinitions": "rankOnly", - // "opset": { "domain": "", "version": 17 }, - // "attributes": [ - // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - // { "name": "group", "data": 3, "type": "int" }, - // { "name": "strides", "data": [2, 2], "type": "ints" } - // ], - // "cases": [ - // { - // "name": "T[0] strides = [2, 2]", - // "inputs": [ - // { - // "data": [ - // 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, - // 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 - // ], - // "dims": [1, 3, 3, 4], - // "type": "float32" - // }, - // { - // "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], - // "dims": [3, 1, 2, 2], - // "type": "float32" - // } - // ], - // "outputs": [ - // { - // "data": [34, 54, 386, 438, 1122, 1206], - // "dims": [1, 3, 1, 2], - // "type": "float32" - // } - // ] - // } - 
// ] - // }, + { + "name": "conv - vectorize group - D", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "strides", "data": [2, 2], "type": "ints" } + ], + "cases": [ + { + "name": "T[0] strides = [2, 2]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 + ], + "dims": [1, 3, 3, 4], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [34, 54, 386, 438, 1122, 1206], + "dims": [1, 3, 1, 2], + "type": "float32" + } + ] + } + ] + }, { "name": "conv - pointwise", "operator": "Conv", diff --git a/js/web/test/data/ops/fused-conv.jsonc b/js/web/test/data/ops/fused-conv.jsonc index ebb0b5d3e1f58..d88c91ebc9de7 100644 --- a/js/web/test/data/ops/fused-conv.jsonc +++ b/js/web/test/data/ops/fused-conv.jsonc @@ -249,44 +249,44 @@ } ] }, - // { - // "name": "NHWC group-conv with HardSigmoid", - // "operator": "Conv", - // "attributes": [ - // { "name": "activation", "data": "HardSigmoid", "type": "string" }, - // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - // { "name": "group", "data": 3, "type": "int" }, - // { "name": "activation_params", "data": [2.0, 5.0], "type": "floats" } - // ], - // "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, - // "cases": [ - // { - // "name": "T[0]", - // "inputs": [ - // { - // "data": [ - // 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, - // 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 - // ], - // "dims": [1, 3, 3, 3], - // "type": "float32" - // }, - // { - // "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], - // "dims": [3, 1, 2, 2], - // "type": "float32" - // } - // ], - // "outputs": [ - // { - // "data": [0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], - // "dims": [1, 2, 2, 3], - // "type": "float32" - // } - // ] - // } - // ] - // }, + { + "name": "NHWC group-conv with HardSigmoid", + "operator": "Conv", + "attributes": [ + { "name": "activation", "data": "HardSigmoid", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "activation_params", "data": [2.0, 5.0], "type": "floats" } + ], + "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "dims": [1, 2, 2, 3], + "type": "float32" + } + ] + } + ] + }, { "name": "fused group-conv with LeakyRelu", "operator": "FusedConv", @@ -325,44 +325,44 @@ } ] }, - // { - // "name": "NHWC group-conv with LeakyRelu", - // "operator": "Conv", - // "attributes": [ - // { "name": "activation", "data": "LeakyRelu", "type": "string" }, - // { "name": "kernel_shape", "data": [2, 2], "type": 
"ints" }, - // { "name": "group", "data": 3, "type": "int" }, - // { "name": "activation_params", "data": [2.0], "type": "floats" } - // ], - // "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, - // "cases": [ - // { - // "name": "T[0]", - // "inputs": [ - // { - // "data": [ - // 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, - // 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 - // ], - // "dims": [1, 3, 3, 3], - // "type": "float32" - // }, - // { - // "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], - // "dims": [3, 1, 2, 2], - // "type": "float32" - // } - // ], - // "outputs": [ - // { - // "data": [-162, 63, -158, 33, 281, 85, 105, 337, 455, 177, 515, 609], - // "dims": [1, 2, 2, 3], - // "type": "float32" - // } - // ] - // } - // ] - // }, + { + "name": "NHWC group-conv with LeakyRelu", + "operator": "Conv", + "attributes": [ + { "name": "activation", "data": "LeakyRelu", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "activation_params", "data": [2.0], "type": "floats" } + ], + "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [-162, 63, -158, 33, 281, 85, 105, 337, 455, 177, 515, 609], + "dims": [1, 2, 2, 3], + "type": "float32" + } + ] + } + ] + }, { "name": "fused conv with LeakyRelu", "operator": "FusedConv", From 780735098d8ecc90f0dc74a442e448f90c227d45 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 19 Dec 2024 10:23:27 -0800 Subject: [PATCH 13/25] [nodejs binding] Fix building in latest clang (#23146) ### Description This change fixes the build break for Node.js binding on latest AppleClang: ``` ...tensor_helper.cc:65:5 error: integer value -1 is outside of the valid range of values [0,15] for the enumeration type 'napi_typedarray_type' [-Wenum-constexpr-conversion] ``` Use the underlying type of enum `napi_typedarray_type` for `DATA_TYPE_TYPEDARRAY_MAP` to solve this issue. Because the underlying type is implementation defined (it's `int` for MSVC and `unsigned int` for Clang), we use `std::underlying_type_t` to get the correct type. 
--- js/node/src/tensor_helper.cc | 55 ++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/js/node/src/tensor_helper.cc b/js/node/src/tensor_helper.cc index 27eb9b65c62d3..12b1a79793ff3 100644 --- a/js/node/src/tensor_helper.cc +++ b/js/node/src/tensor_helper.cc @@ -53,24 +53,24 @@ constexpr size_t DATA_TYPE_ELEMENT_SIZE_MAP[] = { static_assert(sizeof(DATA_TYPE_ELEMENT_SIZE_MAP) == sizeof(size_t) * ONNX_TENSOR_ELEMENT_DATA_TYPE_COUNT, "definition not matching"); -constexpr napi_typedarray_type DATA_TYPE_TYPEDARRAY_MAP[] = { - (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED not supported - napi_float32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT - napi_uint8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8 - napi_int8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8 - napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16 - napi_int16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16 - napi_int32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32 - napi_bigint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 - (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING not supported - napi_uint8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL - napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 uses Uint16Array - napi_float64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE - napi_uint32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32 - napi_biguint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 - (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported - (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported - (napi_typedarray_type)(-1) // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported +constexpr std::underlying_type_t DATA_TYPE_TYPEDARRAY_MAP[] = { + std::underlying_type_t(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED not supported + napi_float32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT + napi_uint8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8 + napi_int8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8 + napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16 + napi_int16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16 + napi_int32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32 + napi_bigint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 + std::underlying_type_t(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING not supported + napi_uint8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL + napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 uses Uint16Array + napi_float64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE + napi_uint32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32 + napi_biguint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 + std::underlying_type_t(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported + std::underlying_type_t(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported + std::underlying_type_t(-1) // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported }; static_assert(sizeof(DATA_TYPE_TYPEDARRAY_MAP) == sizeof(napi_typedarray_type) * ONNX_TENSOR_ELEMENT_DATA_TYPE_COUNT, "definition not matching"); @@ -98,7 +98,20 @@ static_assert(sizeof(DATA_TYPE_ID_TO_NAME_MAP) == sizeof(const char*) * ONNX_TEN "definition not matching"); const std::unordered_map DATA_TYPE_NAME_TO_ID_MAP = { - {"float32", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT}, {"uint8", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8}, {"int8", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8}, {"uint16", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16}, {"int16", 
ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16}, {"int32", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32}, {"int64", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64}, {"string", ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING}, {"bool", ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL}, {"float16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16}, {"float64", ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE}, {"uint32", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32}, {"uint64", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64}}; + {"float32", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT}, + {"uint8", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8}, + {"int8", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8}, + {"uint16", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16}, + {"int16", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16}, + {"int32", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32}, + {"int64", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64}, + {"string", ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING}, + {"bool", ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL}, + {"float16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16}, + {"float64", ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE}, + {"uint32", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32}, + {"uint64", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64}, +}; // currently only support tensor Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo* cpu_memory_info, OrtMemoryInfo* webgpu_memory_info) { @@ -181,7 +194,7 @@ Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo* "Tensor.data must be a typed array for numeric tensor."); auto tensorDataTypedArray = tensorDataValue.As(); - auto typedArrayType = tensorDataValue.As().TypedArrayType(); + std::underlying_type_t typedArrayType = tensorDataValue.As().TypedArrayType(); ORT_NAPI_THROW_TYPEERROR_IF(DATA_TYPE_TYPEDARRAY_MAP[elemType] != typedArrayType, env, "Tensor.data must be a typed array (", DATA_TYPE_TYPEDARRAY_MAP[elemType], ") for ", tensorTypeString, " tensors, but got typed array (", typedArrayType, ")."); @@ -294,7 +307,7 @@ Napi::Value OrtValueToNapiValue(Napi::Env env, Ort::Value&& value) { } napi_value typedArrayData; napi_status status = - napi_create_typedarray(env, DATA_TYPE_TYPEDARRAY_MAP[elemType], size, arrayBuffer, 0, &typedArrayData); + napi_create_typedarray(env, (napi_typedarray_type)DATA_TYPE_TYPEDARRAY_MAP[elemType], size, arrayBuffer, 0, &typedArrayData); NAPI_THROW_IF_FAILED(env, status, Napi::Value); // new Tensor(type, typedArrayData, dims) From 8680244ebc4457ac3fef7bb504d3560259766ae6 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 19 Dec 2024 10:23:48 -0800 Subject: [PATCH 14/25] Fix delay load for WebGPU EP and DML EP (#23111) ### Description This change fixes the DLL delay load problem for the WebGPU EP and DirectML EP. See detailed explanation below. ### Problem When onnxruntime.dll uses delay loading for its dependencies, the dependencies are loaded using `LoadLibraryEx()`, which search the directory of process (.exe) instead of this library (onnxruntime.dll). This is a problem for usages of Node.js binding and python binding, because Windows will try to find the dependencies in the directory of node.exe or python.exe, which is not the directory of onnxruntime.dll. There was previous attempt to fix this by loading DirectML.dll in the initialization of onnxruntime nodejs binding, which works for DML EP but is not a good solution because it does not really "delay" the load. For WebGPU, the situation became worse because webgpu_dawn.dll depends on dxil.dll and dxcompiler.dll, which are explicitly dynamically loaded in the code using `LoadLibraryA()`. 
This has the same problem of the DLL search. ### Solutions For onnxruntime.dll loading its direct dependencies, it can be resolved by set the [`__pfnDliNotifyHook2` hook](https://learn.microsoft.com/en-us/cpp/build/reference/understanding-the-helper-function?view=msvc-170#structure-and-constant-definitions) to load from an absolute path that constructed from the onnxruntime.dll folder and the DLL name. For webgpu_dawn.dll loading dxil.dll and dxcompiler.dll, since they are explicitly loaded in the code, the hook does not work. Instead, it can be resolved by ~~using WIN32 API `SetDllDirectory()` to add the onnxruntime.dll folder to the search path.~~ preloading the 2 DLLs from the onnxruntime.dll folder . --- cmake/onnxruntime.cmake | 1 + cmake/onnxruntime_nodejs.cmake | 20 ++- cmake/onnxruntime_providers_webgpu.cmake | 36 +++-- cmake/onnxruntime_unittests.cmake | 12 ++ js/node/CMakeLists.txt | 10 +- js/node/script/build.ts | 5 + js/node/src/directml_load_helper.cc | 37 ----- js/node/src/directml_load_helper.h | 6 - js/node/src/inference_session_wrap.cc | 4 - onnxruntime/core/dll/delay_load_hook.cc | 83 ++++++++++ onnxruntime/core/dll/dllmain.cc | 2 +- .../core/providers/webgpu/webgpu_context.cc | 26 ++++ .../core/providers/webgpu/webgpu_context.h | 3 + onnxruntime/test/webgpu/delay_load/main.cc | 142 ++++++++++++++++++ onnxruntime/test/webgpu/external_dawn/main.cc | 1 - .../win-gpu-webgpu-ci-pipeline.yml | 2 +- 16 files changed, 324 insertions(+), 66 deletions(-) delete mode 100644 js/node/src/directml_load_helper.cc delete mode 100644 js/node/src/directml_load_helper.h create mode 100644 onnxruntime/core/dll/delay_load_hook.cc create mode 100644 onnxruntime/test/webgpu/delay_load/main.cc diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 732c0511d400f..d72b61a0859b2 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -77,6 +77,7 @@ if(WIN32) onnxruntime_add_shared_library(onnxruntime ${SYMBOL_FILE} "${ONNXRUNTIME_ROOT}/core/dll/dllmain.cc" + "${ONNXRUNTIME_ROOT}/core/dll/delay_load_hook.cc" "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc" ) elseif(onnxruntime_BUILD_APPLE_FRAMEWORK) diff --git a/cmake/onnxruntime_nodejs.cmake b/cmake/onnxruntime_nodejs.cmake index 376d895be34a9..355575be3bcf7 100644 --- a/cmake/onnxruntime_nodejs.cmake +++ b/cmake/onnxruntime_nodejs.cmake @@ -60,15 +60,26 @@ else() endif() endif() +# a list of DLLs that the Node.js binding depends on +set(NODEJS_DLL_DEPS) + # setup providers if (onnxruntime_USE_CUDA) set(NODEJS_BINDING_USE_CUDA "--use_cuda") endif() if (onnxruntime_USE_DML) set(NODEJS_BINDING_USE_DML "--use_dml") + list(APPEND NODEJS_DLL_DEPS "$/DirectML.dll") endif() if (onnxruntime_USE_WEBGPU) set(NODEJS_BINDING_USE_WEBGPU "--use_webgpu") + if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12) + list(APPEND NODEJS_DLL_DEPS "$/dxil.dll") + list(APPEND NODEJS_DLL_DEPS "$/dxcompiler.dll") + endif() + if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) + list(APPEND NODEJS_DLL_DEPS "$") + endif() endif() if (onnxruntime_USE_TENSORRT) set(NODEJS_BINDING_USE_TENSORRT "--use_tensorrt") @@ -94,9 +105,12 @@ add_custom_target(js_common_npm_ci ALL add_custom_target(nodejs_binding_wrapper ALL COMMAND ${NPM_CLI} ci - COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --onnxruntime-generator=${CMAKE_GENERATOR} - --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU} ${NODEJS_BINDING_USE_TENSORRT} - 
${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN} + COMMAND ${NPM_CLI} run build -- "--onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR}" + --config=${CMAKE_BUILD_TYPE} + "--onnxruntime-generator=${CMAKE_GENERATOR}" + "--dll_deps=${NODEJS_DLL_DEPS}" + --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU} + ${NODEJS_BINDING_USE_TENSORRT} ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN} WORKING_DIRECTORY ${JS_NODE_ROOT} COMMENT "Using cmake-js to build OnnxRuntime Node.js binding") diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake index fea5964f0dda9..e527d538d8757 100644 --- a/cmake/onnxruntime_providers_webgpu.cmake +++ b/cmake/onnxruntime_providers_webgpu.cmake @@ -23,19 +23,18 @@ onnxruntime_add_include_to_target(onnxruntime_providers_webgpu onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface) + set(onnxruntime_providers_webgpu_dll_deps) + if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn) - if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS) - list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll") - endif() + if (WIN32) + if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS) + list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll") + endif() - # Copy webgpu_dawn.dll to the output directory - add_custom_command( - TARGET onnxruntime_providers_webgpu - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different "$" "$" - VERBATIM ) + list(APPEND onnxruntime_providers_webgpu_dll_deps "$") + endif() else() if (NOT onnxruntime_USE_EXTERNAL_DAWN) target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native) @@ -43,4 +42,23 @@ target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc) endif() + if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12) + # Ensure dxil.dll and dxcompiler.dll exist in the output directory $ + add_dependencies(onnxruntime_providers_webgpu copy_dxil_dll) + add_dependencies(onnxruntime_providers_webgpu dxcompiler) + + list(APPEND onnxruntime_providers_webgpu_dll_deps "$/dxil.dll") + list(APPEND onnxruntime_providers_webgpu_dll_deps "$/dxcompiler.dll") + endif() + + if (onnxruntime_providers_webgpu_dll_deps) + # Copy dependency DLLs to the output directory + add_custom_command( + TARGET onnxruntime_providers_webgpu + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${onnxruntime_providers_webgpu_dll_deps}" "$" + COMMAND_EXPAND_LISTS + VERBATIM ) + endif() + set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime") diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index e822f0a3655fc..9e3ab4d41f416 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -525,6 +525,9 @@ set (onnxruntime_global_thread_pools_test_SRC set (onnxruntime_webgpu_external_dawn_test_SRC ${TEST_SRC_DIR}/webgpu/external_dawn/main.cc) +set (onnxruntime_webgpu_delay_load_test_SRC + ${TEST_SRC_DIR}/webgpu/delay_load/main.cc) + # tests from lowest level library up. 
# the order of libraries should be maintained, with higher libraries being added first in the list @@ -1864,4 +1867,13 @@ if (onnxruntime_USE_WEBGPU AND onnxruntime_USE_EXTERNAL_DAWN) onnxruntime_add_include_to_target(onnxruntime_webgpu_external_dawn_test dawn::dawncpp_headers dawn::dawn_headers) endif() +if (onnxruntime_USE_WEBGPU AND WIN32 AND onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT onnxruntime_MINIMAL_BUILD) + AddTest(DYN + TARGET onnxruntime_webgpu_delay_load_test + SOURCES ${onnxruntime_webgpu_delay_load_test_SRC} + LIBS ${SYS_PATH_LIB} + DEPENDS ${all_dependencies} + ) +endif() + include(onnxruntime_fuzz_test.cmake) diff --git a/js/node/CMakeLists.txt b/js/node/CMakeLists.txt index d79a82c572dc2..c78b40a3e7429 100644 --- a/js/node/CMakeLists.txt +++ b/js/node/CMakeLists.txt @@ -113,10 +113,12 @@ endif() if (WIN32) file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.dll DESTINATION ${dist_folder}) - if (USE_DML) - file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/DirectML.dll - DESTINATION ${dist_folder}) - endif () + if (ORT_NODEJS_DLL_DEPS) + foreach(dll ${ORT_NODEJS_DLL_DEPS}) + file(COPY ${dll} DESTINATION ${dist_folder}) + endforeach() + endif() + elseif (APPLE) file(COPY ${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.dylib DESTINATION ${dist_folder} FOLLOW_SYMLINK_CHAIN) diff --git a/js/node/script/build.ts b/js/node/script/build.ts index dcdcb93377b4c..b557368ed58c6 100644 --- a/js/node/script/build.ts +++ b/js/node/script/build.ts @@ -39,6 +39,8 @@ const USE_TENSORRT = !!buildArgs.use_tensorrt; const USE_COREML = !!buildArgs.use_coreml; // --use_qnn const USE_QNN = !!buildArgs.use_qnn; +// --dll_deps= +const DLL_DEPS = buildArgs.dll_deps; // build path const ROOT_FOLDER = path.join(__dirname, '..'); @@ -82,6 +84,9 @@ if (USE_COREML) { if (USE_QNN) { args.push('--CDUSE_QNN=ON'); } +if (DLL_DEPS) { + args.push(`--CDORT_NODEJS_DLL_DEPS=${DLL_DEPS}`); +} // set CMAKE_OSX_ARCHITECTURES for macOS build if (os.platform() === 'darwin') { diff --git a/js/node/src/directml_load_helper.cc b/js/node/src/directml_load_helper.cc deleted file mode 100644 index 6aafe4d5fa788..0000000000000 --- a/js/node/src/directml_load_helper.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#ifdef _WIN32 -#include "common.h" -#include "windows.h" - -void LoadDirectMLDll(Napi::Env env) { - DWORD pathLen = MAX_PATH; - std::wstring path(pathLen, L'\0'); - HMODULE moduleHandle = nullptr; - - GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, - reinterpret_cast(&LoadDirectMLDll), &moduleHandle); - - DWORD getModuleFileNameResult = GetModuleFileNameW(moduleHandle, const_cast(path.c_str()), pathLen); - while (getModuleFileNameResult == 0 || getModuleFileNameResult == pathLen) { - int ret = GetLastError(); - if (ret == ERROR_INSUFFICIENT_BUFFER && pathLen < 32768) { - pathLen *= 2; - path.resize(pathLen); - getModuleFileNameResult = GetModuleFileNameW(moduleHandle, const_cast(path.c_str()), pathLen); - } else { - ORT_NAPI_THROW_ERROR(env, "Failed getting path to load DirectML.dll, error code: ", ret); - } - } - - path.resize(path.rfind(L'\\') + 1); - path.append(L"DirectML.dll"); - HMODULE libraryLoadResult = LoadLibraryW(path.c_str()); - - if (!libraryLoadResult) { - int ret = GetLastError(); - ORT_NAPI_THROW_ERROR(env, "Failed loading bundled DirectML.dll, error code: ", ret); - } -} -#endif diff --git a/js/node/src/directml_load_helper.h b/js/node/src/directml_load_helper.h deleted file mode 100644 index 074a4f95ed476..0000000000000 --- a/js/node/src/directml_load_helper.h +++ /dev/null @@ -1,6 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#if defined(USE_DML) && defined(_WIN32) -void LoadDirectMLDll(Napi::Env env); -#endif diff --git a/js/node/src/inference_session_wrap.cc b/js/node/src/inference_session_wrap.cc index 23d859351f426..04ab71dc48ec2 100644 --- a/js/node/src/inference_session_wrap.cc +++ b/js/node/src/inference_session_wrap.cc @@ -4,7 +4,6 @@ #include "onnxruntime_cxx_api.h" #include "common.h" -#include "directml_load_helper.h" #include "inference_session_wrap.h" #include "run_options_helper.h" #include "session_options_helper.h" @@ -19,9 +18,6 @@ Napi::FunctionReference& InferenceSessionWrap::GetTensorConstructor() { } Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) { -#if defined(USE_DML) && defined(_WIN32) - LoadDirectMLDll(env); -#endif // create ONNX runtime env Ort::InitApi(); ORT_NAPI_THROW_ERROR_IF( diff --git a/onnxruntime/core/dll/delay_load_hook.cc b/onnxruntime/core/dll/delay_load_hook.cc new file mode 100644 index 0000000000000..23fc8bca7368e --- /dev/null +++ b/onnxruntime/core/dll/delay_load_hook.cc @@ -0,0 +1,83 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// == workaround for delay loading of dependencies of onnxruntime.dll == +// +// Problem: +// +// When onnxruntime.dll uses delay loading for its dependencies, the dependencies are loaded using LoadLibraryEx, +// which search the directory of process (.exe) instead of this library (onnxruntime.dll). This is a problem for +// usages of Node.js binding and python binding, because Windows will try to find the dependencies in the directory +// of node.exe or python.exe, which is not the directory of onnxruntime.dll. +// +// Solution: +// +// By using the delay load hook `__pfnDliNotifyHook2`, we can intervene the loading procedure by loading from an +// absolute path. The absolute path is constructed by appending the name of the DLL to load to the directory of +// onnxruntime.dll. This way, we can ensure that the dependencies are loaded from the same directory as onnxruntime.dll. 
+// +// See also: +// - https://learn.microsoft.com/en-us/cpp/build/reference/understanding-the-helper-function?view=msvc-170#structure-and-constant-definitions +// - https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order#alternate-search-order-for-unpackaged-apps +// +// The DLL DelayLoad hook is only enabled when the compiler is MSVC and at least one of the following is True: +// - both USE_WEBGPU and BUILD_DAWN_MONOLITHIC_LIBRARY are defined +// - USE_DML is defined +// +#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL (defined(USE_WEBGPU) && defined(BUILD_DAWN_MONOLITHIC_LIBRARY)) +#define ORT_DELAY_LOAD_DIRECTML_DLL defined(USE_DML) +#if defined(_MSC_VER) && (ORT_DELAY_LOAD_WEBGPU_DAWN_DLL || ORT_DELAY_LOAD_DIRECTML_DLL) + +#include +#include +#include +#include + +#include "core/platform/env.h" + +namespace { + +#define DEFINE_KNOWN_DLL(name) {#name ".dll", L#name L".dll"} + +constexpr struct { + const char* str; + const wchar_t* wstr; +} known_dlls[] = { +#if ORT_DELAY_LOAD_WEBGPU_DAWN_DLL + DEFINE_KNOWN_DLL(webgpu_dawn), +#endif +#if ORT_DELAY_LOAD_DIRECTML_DLL + DEFINE_KNOWN_DLL(DirectML), +#endif +}; +} // namespace + +FARPROC WINAPI delay_load_hook(unsigned dliNotify, PDelayLoadInfo pdli) { + if (dliNotify == dliNotePreLoadLibrary) { + for (size_t i = 0; i < _countof(known_dlls); ++i) { + if (_stricmp(pdli->szDll, known_dlls[i].str) == 0) { + // Try to load the DLL from the same directory as onnxruntime.dll + + // First, get the path to onnxruntime.dll + auto path = Env::Default().GetRuntimePath(); + if (path.empty()) { + // Failed to get the path to onnxruntime.dll. In this case, we will just return NULL and let the system + // search for the DLL in the default search order. + return NULL; + } + + // Append the name of the DLL. Now `path` is the absolute path to the DLL to load. + path.append(known_dlls[i].wstr); + + // Load the DLL + return FARPROC(LoadLibraryExW(path.c_str(), NULL, + LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR)); + } + } + } + return NULL; +} + +extern "C" const PfnDliHook __pfnDliNotifyHook2 = delay_load_hook; + +#endif diff --git a/onnxruntime/core/dll/dllmain.cc b/onnxruntime/core/dll/dllmain.cc index 2e7bdafd0599f..ac5dcd9c96084 100644 --- a/onnxruntime/core/dll/dllmain.cc +++ b/onnxruntime/core/dll/dllmain.cc @@ -13,7 +13,7 @@ #pragma GCC diagnostic pop #endif -// dllmain.cpp : Defines the entry point for the DLL application. +// dllmain.cc : Defines the entry point for the DLL application. BOOL APIENTRY DllMain(HMODULE /*hModule*/, DWORD ul_reason_for_call, LPVOID /*lpReserved*/ diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index d66c2a79d28a8..c85a15017659c 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -10,6 +10,8 @@ #endif #include "core/common/common.h" +#include "core/common/path_string.h" +#include "core/platform/env.h" #include "core/providers/webgpu/compute_context.h" #include "core/providers/webgpu/webgpu_context.h" @@ -50,6 +52,30 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info // Initialization.Step.2 - Create wgpu::Adapter if (adapter_ == nullptr) { +#if !defined(__EMSCRIPTEN__) && defined(_MSC_VER) && defined(DAWN_ENABLE_D3D12) && !defined(USE_EXTERNAL_DAWN) + // If we are using the D3D12 backend on Windows and the build does not use external Dawn, dxil.dll and dxcompiler.dll are required. 
+ // + // Dawn will try to load them later, but if they are in the different directory to the executable, it may fail to find them. + // To avoid this issue, we try to load them from the same directory as current module (usually onnxruntime.dll). + auto runtime_path = Env::Default().GetRuntimePath(); + if (!runtime_path.empty()) { + Status status; + void* module_handle = nullptr; + + PathString dxil_path = runtime_path + ToPathString(L"dxil.dll"); + status = Env::Default().LoadDynamicLibrary(dxil_path, false, &module_handle); + if (status.IsOK() && module_handle != nullptr) { + modules_.Add(dxil_path, module_handle); + } + + PathString dxcompiler_path = runtime_path + ToPathString(L"dxcompiler.dll"); + status = Env::Default().LoadDynamicLibrary(dxcompiler_path, false, &module_handle); + if (status.IsOK() && module_handle != nullptr) { + modules_.Add(dxcompiler_path, module_handle); + } + } +#endif + wgpu::RequestAdapterOptions req_adapter_options = {}; wgpu::DawnTogglesDescriptor adapter_toggles_desc = {}; req_adapter_options.nextInChain = &adapter_toggles_desc; diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h index be05b06523b9c..c41ef3e211264 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.h +++ b/onnxruntime/core/providers/webgpu/webgpu_context.h @@ -13,6 +13,7 @@ #include #include "core/common/common.h" +#include "core/framework/library_handles.h" #include "core/providers/webgpu/webgpu_execution_provider.h" #include "core/providers/webgpu/buffer_manager.h" #include "core/providers/webgpu/program_manager.h" @@ -153,6 +154,8 @@ class WebGpuContext final { std::once_flag init_flag_; + LibraryHandles modules_; + wgpu::Instance instance_; wgpu::Adapter adapter_; wgpu::Device device_; diff --git a/onnxruntime/test/webgpu/delay_load/main.cc b/onnxruntime/test/webgpu/delay_load/main.cc new file mode 100644 index 0000000000000..f909b4a6916b4 --- /dev/null +++ b/onnxruntime/test/webgpu/delay_load/main.cc @@ -0,0 +1,142 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include +#include +#define ORT_API_MANUAL_INIT +#include "core/session/onnxruntime_cxx_api.h" + +// This program is to test the delay loading of onnxruntime.dll. +// +// To verify the delay loading actually works, we need to do the test in 2 steps: +// +// 1. Prepare a folder structure like below: +// +// ├── webgpu_delay_load_test_root (newly created folder) +// │ ├── dlls +// │ │ ├── onnxruntime.dll +// │ │ ├── webgpu_dawn.dll +// │ │ ├── dxil.dll +// │ │ └── dxcompiler.dll +// │ └── test.exe +// └── onnxruntime_webgpu_delay_load_test.exe (this binary) +// +// This folder structure ensures no DLLs are in the same folder as the executable (test.exe). +// +// 2. Launch the test binary from the root folder of the above structure. +// +// So, there are 2 modes of this program: +// 1. "Prepare" mode: Do the step 1 above. (default) +// 2. "Test" mode: Do the step 2 above. 
(specified by --test argument) + +int prepare_main(); +int test_main(); + +int wmain(int argc, wchar_t* argv[]) { + if (argc == 2 && wcscmp(argv[1], L"--test") == 0) { + return test_main(); + } else { + return prepare_main(); + } +} + +int prepare_main() { + std::wstring path_str(32768, L'\0'); + GetModuleFileNameW(NULL, path_str.data(), static_cast(path_str.size())); + + namespace fs = std::filesystem; + fs::path exe_full_path{path_str}; // /onnxruntime_webgpu_delay_load_test.exe + fs::path test_dir = exe_full_path.parent_path(); // / + fs::path exe_name = exe_full_path.filename(); // onnxruntime_webgpu_delay_load_test.exe + fs::path root_folder = test_dir / L"webgpu_delay_load_test_root\\"; // /webgpu_delay_load_test_root/ + fs::path dlls_folder = root_folder / L"dlls\\"; // /webgpu_delay_load_test_root/dlls/ + + // ensure the test folder exists and is empty + if (fs::exists(root_folder)) { + fs::remove_all(root_folder); + } + fs::create_directories(dlls_folder); + + fs::current_path(test_dir); + + // copy the required DLLs to the dlls folder + fs::copy_file(L"onnxruntime.dll", dlls_folder / L"onnxruntime.dll"); + fs::copy_file(L"dxil.dll", dlls_folder / L"dxil.dll"); + fs::copy_file(L"dxcompiler.dll", dlls_folder / L"dxcompiler.dll"); + if (fs::exists(L"webgpu_dawn.dll")) { + fs::copy_file(L"webgpu_dawn.dll", dlls_folder / L"webgpu_dawn.dll"); + } + + // copy the test binary to the root folder + fs::copy_file(exe_full_path, root_folder / L"test.exe"); + + // run "test.exe --test" from the test root folder + fs::current_path(root_folder); + return _wsystem(L"test.exe --test"); +} + +int run() { + Ort::Env env{nullptr}; + int retval = 0; + try { + env = Ort::Env{ORT_LOGGING_LEVEL_WARNING, "Default"}; + + // model is https://github.com/onnx/onnx/blob/v1.15.0/onnx/backend/test/data/node/test_abs/model.onnx + constexpr uint8_t MODEL_DATA[] = {8, 7, 18, 12, 98, 97, 99, 107, 101, 110, + 100, 45, 116, 101, 115, 116, 58, 73, 10, 11, + 10, 1, 120, 18, 1, 121, 34, 3, 65, 98, + 115, 18, 8, 116, 101, 115, 116, 95, 97, 98, + 115, 90, 23, 10, 1, 120, 18, 18, 10, 16, + 8, 1, 18, 12, 10, 2, 8, 3, 10, 2, + 8, 4, 10, 2, 8, 5, 98, 23, 10, 1, + 121, 18, 18, 10, 16, 8, 1, 18, 12, 10, + 2, 8, 3, 10, 2, 8, 4, 10, 2, 8, + 5, 66, 4, 10, 0, 16, 13}; + + Ort::SessionOptions session_options; + session_options.DisableMemPattern(); + std::unordered_map provider_options; + session_options.AppendExecutionProvider("WebGPU", provider_options); + Ort::Session session{env, MODEL_DATA, sizeof(MODEL_DATA), session_options}; + + // successfully initialized + std::cout << "Successfully initialized WebGPU EP." << std::endl; + retval = 0; + } catch (const std::exception& ex) { + std::cerr << ex.what() << std::endl; + + std::cerr << "Unexpected exception." 
<< std::endl; + retval = -1; + } + + return retval; +} + +int test_main() { + HMODULE hModule = LoadLibraryA("dlls\\onnxruntime.dll"); + if (hModule == NULL) { + std::cout << "Failed to load dlls\\onnxruntime.dll" << std::endl; + return 1; + } + + int retval = 0; + + using OrtGetApiBaseFunction = decltype(&OrtGetApiBase); + auto fnOrtGetApiBase = (OrtGetApiBaseFunction)GetProcAddress(hModule, "OrtGetApiBase"); + if (fnOrtGetApiBase == NULL) { + std::cout << "Failed to get OrtGetApiBase" << std::endl; + retval = 1; + goto cleanup; + } + Ort::InitApi(fnOrtGetApiBase()->GetApi(ORT_API_VERSION)); + + retval = run(); + +cleanup: + if (hModule != NULL) { + FreeLibrary(hModule); + } + return retval; +} diff --git a/onnxruntime/test/webgpu/external_dawn/main.cc b/onnxruntime/test/webgpu/external_dawn/main.cc index ed8d2eab94ce9..1cb22b131d76b 100644 --- a/onnxruntime/test/webgpu/external_dawn/main.cc +++ b/onnxruntime/test/webgpu/external_dawn/main.cc @@ -1,5 +1,4 @@ // Copyright (c) Microsoft Corporation. All rights reserved. -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates // Licensed under the MIT License. #include diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml index 06f374afca57a..8460df2ec3799 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml @@ -48,7 +48,7 @@ stages: --enable_pybind --build_nodejs --use_webgpu - --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY=ON msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo From a3bb3f148768cea41b59a2860a57264e85398dc7 Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Thu, 19 Dec 2024 10:30:39 -0800 Subject: [PATCH 15/25] [TensorRT EP] New CIs to test TRT+minimal CUDA build (#23028) ### Description New CI: [Linux_TRT_Minimal_CUDA_Test_CI](https://dev.azure.com/onnxruntime/onnxruntime/_build?definitionId=230&_a=summary) and [Win_TRT_Minimal_CUDA_Test_CI ](https://dev.azure.com/onnxruntime/onnxruntime/_build?definitionId=231) Setting config for new CI to monitor if there's no issue to build ORT-TRTEP with minimal CUDA * yaml content is following Linux TRT CI yaml, with different build arg/cache name * build arg is following [[TensorRT EP] Enable a minimal CUDA EP compilation without kernels](https://github.com/microsoft/onnxruntime/pull/19052#issuecomment-1888066851) ### Motivation and Context Monitor if user is able to build ORT-TRTEP-minimalCUDA without any blocker (which takes ~30min to build) --- tools/ci_build/build.py | 2 + ...-gpu-tensorrt-cuda-minimal-ci-pipeline.yml | 108 ++++++++++++++++++ ...-gpu-tensorrt-cuda-minimal-ci-pipeline.yml | 86 ++++++++++++++ .../github/linux/build_tensorrt_ci.sh | 13 +++ 4 files changed, 209 insertions(+) create mode 100644 tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml create mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 3527a89ca7a7b..53dcdc6e0c6fa 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -260,6 +260,7 @@ def convert_arg_line_to_args(self, arg_line): ) parser.add_argument("--disable_cuda_nhwc_ops", action="store_true", help="Disable 
CUDA NHWC ops in build.") + parser.add_argument("--enable_cuda_minimal_build", action="store_true", help="Enable CUDA minimal build.") # Python bindings parser.add_argument("--enable_pybind", action="store_true", help="Enable Python Bindings.") @@ -1093,6 +1094,7 @@ def generate_build_tree( "-Donnxruntime_DISABLE_FLOAT8_TYPES=" + ("ON" if disable_float8_types else "OFF"), "-Donnxruntime_DISABLE_SPARSE_TENSORS=" + ("ON" if disable_sparse_tensors else "OFF"), "-Donnxruntime_DISABLE_OPTIONAL_TYPE=" + ("ON" if disable_optional_type else "OFF"), + "-Donnxruntime_CUDA_MINIMAL=" + ("ON" if args.enable_cuda_minimal_build else "OFF"), ] if args.rv64: diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml new file mode 100644 index 0000000000000..2a32dd1a62408 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -0,0 +1,108 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +### please do rerun set-trigger-rules.py ### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### +parameters: + - name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 + +variables: + - template: templates/common-variables.yml + - name: docker_base_image + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241120.3 + - name: linux_trt_version + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: ${{ variables.linux_trt_version_cuda11 }} + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: ${{ variables.linux_trt_version_cuda12 }} + +jobs: +- job: Linux_Build + timeoutInMinutes: 180 + variables: + skipComponentGovernanceDetection: true + ALLOW_RELEASED_ONNX_OPSET_ONLY: '1' + ORT_CACHE_DIR: '$(Agent.TempDirectory)/ort/ccache' + TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + workspace: + clean: all + pool: onnxruntime-tensorrt-linuxbuild-T4 + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: none + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=${{ variables.docker_base_image }} + --build-arg TRT_VERSION=${{ variables.linux_trt_version }} + --build-arg BUILD_UID=$( id -u ) + " + Repository: onnxruntimetensorrtcudaminimalbuild + + - template: templates/linux-build-step-with-cache.yml + parameters: + WithCache: true + Today: $(TODAY) + AdditionalKey: gpu_tensorrt_cuda_minimal + CacheDir: '$(ORT_CACHE_DIR)' + BuildStep: + - task: CmdLine@2 + inputs: + script: | + docker run --gpus all --rm \ + --volume 
/data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + --volume $(ORT_CACHE_DIR):/cache \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + -e CCACHE_DIR=/cache -w /onnxruntime_src \ + onnxruntimetensorrtcudaminimalbuild tools/ci_build/github/linux/build_tensorrt_ci.sh --cuda_minimal=ON + workingDirectory: $(Build.SourcesDirectory) + + - template: templates/explicitly-defined-final-tasks.yml diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml new file mode 100644 index 0000000000000..c68ba01485db2 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -0,0 +1,86 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +### please do rerun set-trigger-rules.py ### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### +parameters: +- name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 + +variables: + - template: templates/common-variables.yml + - name: win_trt_folder + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: ${{ variables.win_trt_folder_cuda11 }} + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: ${{ variables.win_trt_folder_cuda12 }} + +jobs: +- job: 'build' + pool: 'onnxruntime-Win2022-GPU-A10' + variables: + MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary' + EnvSetupScript: setup_env_trt.bat + skipComponentGovernanceDetection: true + TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + timeoutInMinutes: 150 + workspace: + clean: all + steps: + - template: templates/jobs/win-ci-prebuild-steps.yml + parameters: + EnvSetupScript: $(EnvSetupScript) + DownloadCUDA: true + DownloadTRT: true + BuildArch: 'x64' + BuildConfig: RelWithDebInfo + MachinePool: 'onnxruntime-Win2022-GPU-A10' + WithCache: true + Today: $(Today) + + - template: templates/jobs/win-ci-build-steps.yml + parameters: + WithCache: True + Today: $(TODAY) + AdditionalKey: "gpu_tensorrt_cuda_minimal | RelWithDebInfo" + BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\${{ variables.win_trt_folder }}" --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --enable_cuda_minimal_build' + MsbuildArguments: $(MsbuildArguments) + BuildArch: 'x64' + Platform: 'x64' + BuildConfig: RelWithDebInfo + + - task: PythonScript@0 + displayName: 'Build wheel' + inputs: + scriptPath: '$(Build.SourcesDirectory)\setup.py' + arguments: 'bdist_wheel' + workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' diff --git 
a/tools/ci_build/github/linux/build_tensorrt_ci.sh b/tools/ci_build/github/linux/build_tensorrt_ci.sh index 5b206bc0a92d9..ccf7a6f4ea630 100755 --- a/tools/ci_build/github/linux/build_tensorrt_ci.sh +++ b/tools/ci_build/github/linux/build_tensorrt_ci.sh @@ -21,6 +21,19 @@ BUILD_ARGS=('--config' 'Release' "CMAKE_CUDA_ARCHITECTURES=75" "onnxruntime_BUILD_UNIT_TESTS=ON" "onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON") + +# Parse external args +for arg in "$@"; do + case $arg in + --cuda_minimal=ON) + # Replace onnxruntime_BUILD_UNIT_TESTS=ON with OFF + BUILD_ARGS=("${BUILD_ARGS[@]/onnxruntime_BUILD_UNIT_TESTS=ON/onnxruntime_BUILD_UNIT_TESTS=OFF}") + BUILD_ARGS+=("--enable_cuda_minimal_build") + BUILD_ARGS+=("--skip_tests") + ;; + esac +done + if [ -x "$(command -v ninja)" ]; then BUILD_ARGS+=('--cmake_generator' 'Ninja') fi From d9d07ad8ae5c5fded75b307b2bd83ed3f44dd186 Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Thu, 19 Dec 2024 10:39:15 -0800 Subject: [PATCH 16/25] [TensorRT EP] support TensorRT 10.7-GA (#23011) ### Description Update CIs to TRT10.7 ### Motivation and Context --- cgmanifests/generated/cgmanifest.json | 2 +- cmake/deps.txt | 4 ++-- .../python/tools/tensorrt/perf/build/build_image.py | 8 ++++---- .../linux-gpu-tensorrt-daily-perf-pipeline.yml | 12 ++++++------ .../py-cuda-alt-package-test-pipeline.yml | 2 +- .../azure-pipelines/templates/common-variables.yml | 2 +- .../azure-pipelines/templates/download-deps.yml | 4 ++-- .../templates/jobs/download_win_gpu_library.yml | 6 +++--- .../azure-pipelines/templates/jobs/set-winenv.yml | 4 ++-- .../docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 | 2 +- .../Dockerfile.package_ubi8_cuda_tensorrt10_0_torch | 2 +- .../linux/docker/Dockerfile.package_ubuntu_2004_gpu | 2 +- .../docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg | 2 +- .../docker/Dockerfile.package_ubuntu_2204_gpu_opencv | 2 +- .../linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 | 2 +- .../linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 | 2 +- .../docker/inference/x86_64/python/cuda/Dockerfile | 2 +- tools/ci_build/github/windows/setup_env_gpu.bat | 4 ++-- tools/ci_build/github/windows/setup_env_trt.bat | 2 +- 19 files changed, 33 insertions(+), 33 deletions(-) diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 07dff50f9a3bd..ad4195f31aa7c 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -196,7 +196,7 @@ "component": { "type": "git", "git": { - "commitHash": "bc0d2e35909b8456abe32f3b30a49bb0c125e8b7", + "commitHash": "9c69a24bc2e20c8a511a4e6b06fd49639ec5300a", "repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git" }, "comments": "onnx_tensorrt" diff --git a/cmake/deps.txt b/cmake/deps.txt index 21f9ee1701c46..04a306e0ee657 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -36,8 +36,8 @@ microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.z mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.1.zip;2eb9198bb352757d5ff13977cbe0634898e0837c -# Use the latest commit of 10.6-GA-ORT-DDS -onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bc0d2e35909b8456abe32f3b30a49bb0c125e8b7.zip;f233ae871ad82c023da62e5dd620639f00bc2d15 +# Use the latest commit of 10.7-GA 
+onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/9c69a24bc2e20c8a511a4e6b06fd49639ec5300a.zip;ff1fe9af78eb129b4a4cdcb7450b7390b4436dd3 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874 diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py index 3ebc33c02592d..541dc4978dad1 100644 --- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py @@ -15,10 +15,10 @@ from typing import List, Optional TRT_DOCKER_FILES = { - "8.6.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6", - "8.6.cuda_12_3_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6", - "10.5.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10", - "10.5.cuda_12_5_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10", + "8.6_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6", + "8.6_cuda12.3_cudnn9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6", + "10.7_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10", + "10.7_cuda12.5_cudnn9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10", "BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin", } diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index 83cf26614a285..9286b5a54ac27 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -8,12 +8,12 @@ parameters: - name: TrtVersion displayName: TensorRT Version type: string - default: 10.5.cuda_12_5_cudnn_9 + default: 10.7_cuda12.5_cudnn9 values: - - 8.6.cuda_11_8_cudnn_8 - - 8.6.cuda_12_3_cudnn_9 - - 10.5.cuda_11_8_cudnn_8 - - 10.5.cuda_12_5_cudnn_9 + - 8.6_cuda11.8_cudnn8 + - 8.6_cuda12.3_cudnn9 + - 10.7_cuda11.8_cudnn8 + - 10.7_cuda12.5_cudnn9 - BIN - name: UseTensorrtOssParser @@ -198,4 +198,4 @@ jobs: parameters : condition : 'succeeded' - - template: templates/clean-agent-build-directory-step.yml + - template: templates/clean-agent-build-directory-step.yml \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml index 9296928ad97e0..cf434e4eadf0d 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml @@ -19,6 +19,6 @@ stages: python_wheel_suffix: '_gpu' timeout: 480 docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 - trt_version: '10.6.0.26-1.cuda11.8' + trt_version: '10.7.0.23-1.cuda11.8' cuda_version: '11.8' diff --git 
a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml index d35bed69ee409..3d4e5326ae7c6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml +++ b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml @@ -1,5 +1,5 @@ variables: - common_trt_version: '10.6.0.26' + common_trt_version: '10.7.0.23' # As for Debian installation, replace '-1.' by '-1+' when assigning trt version below linux_trt_version_cuda11: ${{ variables.common_trt_version }}-1.cuda11.8 linux_trt_version_cuda12: ${{ variables.common_trt_version }}-1.cuda12.6 diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index 949479fb8b5e4..8409edb4d0429 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.201 + version: 1.0.202 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.201 + version: 1.0.202 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index ae54b3849a862..14b9c378bec14 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -13,10 +13,10 @@ parameters: - 12.2 - name: TrtVersion type: string - default: '10.6.0.26' + default: '10.7.0.23' values: - 8.6.1.6 - - 10.6.0.26 + - 10.7.0.23 steps: - ${{ if eq(parameters.DownloadCUDA, true) }}: @@ -42,7 +42,7 @@ steps: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.6.0.26')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.7.0.23')) }}: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.6" displayName: Set trtCudaVersion diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index dfaf237a711fe..45572416350c3 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -15,10 +15,10 @@ parameters: default: '11.8' - name: win_trt_folder_cuda11 type: string - default: 'TensorRT-10.6.0.26.Windows10.x86_64.cuda-11.8' + default: 'TensorRT-10.7.0.23.Windows10.x86_64.cuda-11.8' - name: win_trt_folder_cuda12 type: string - default: 'TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6' + default: 'TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6' steps: - ${{ if eq(parameters.DownloadCUDA, 'true') }}: diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 index c2bae5fd7ee59..df5112dc38af4 100644 --- 
a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:12.5.1-cudnn-devel-ubi8 -ARG TRT_VERSION=10.6.0.26-1.cuda12.6 +ARG TRT_VERSION=10.7.0.23-1.cuda12.6 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch index 2ecc6d1918b1a..fef95b8574520 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -ARG TRT_VERSION=10.6.0.26-1.cuda11.8 +ARG TRT_VERSION=10.7.0.23-1.cuda11.8 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 81aeada6a4a46..e91f14ff955b9 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 -ARG TRT_VERSION=10.6.0.26-1+cuda11.8 +ARG TRT_VERSION=10.7.0.23-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg index 4298dd53e4c66..0b08d4b3024b8 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 -ARG TRT_VERSION=10.6.0.26-1+cuda11.8 +ARG TRT_VERSION=10.7.0.23-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv index 1312475ceca3a..3a7e064686ae5 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 -ARG TRT_VERSION=10.6.0.26-1+cuda11.8 +ARG TRT_VERSION=10.7.0.23-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 index 22d5e3b0248a8..01f08ff41e2cc 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 +++ 
b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 @@ -31,7 +31,7 @@ RUN pip install --upgrade pip RUN pip install psutil setuptools>=68.2.2 # Install TensorRT -RUN TRT_VERSION="10.6.0.26-1+cuda11.8" &&\ +RUN TRT_VERSION="10.7.0.23-1+cuda11.8" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 index 819d9bab7be75..781f0647a084b 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 @@ -31,7 +31,7 @@ RUN pip install --upgrade pip RUN pip install setuptools>=68.2.2 psutil # Install TensorRT -RUN TRT_VERSION="10.6.0.26-1+cuda12.6" &&\ +RUN TRT_VERSION="10.7.0.23-1+cuda12.6" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile index a69b98f86ba1b..5f10607b11626 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile @@ -5,7 +5,7 @@ ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 FROM $BASEIMAGE -ARG TRT_VERSION=10.6.0.26-1.cuda11.8 +ARG TRT_VERSION=10.7.0.23-1.cuda11.8 #Install TensorRT only if TRT_VERSION is not empty RUN if [ -n "${TRT_VERSION}" ]; then \ diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat index 34ddd75da16fc..4e2bd8f8386e2 100644 --- a/tools/ci_build/github/windows/setup_env_gpu.bat +++ b/tools/ci_build/github/windows/setup_env_gpu.bat @@ -6,10 +6,10 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( ) else ( set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH% ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6\lib;%PATH% @REM The default version is still cuda v12.2, because set cuda v11.8 after it -set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-11.8\lib +set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-11.8\lib if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64 ) else ( diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat index 03734293be5c4..6a602e46661e7 100644 --- a/tools/ci_build/github/windows/setup_env_trt.bat +++ b/tools/ci_build/github/windows/setup_env_trt.bat @@ -6,6 +6,6 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( ) else ( set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64 ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6\lib;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false set 
CUDA_MODULE_LOADING=LAZY From b4a6a0d51100e0fdb3d1c4b818d897b398150b46 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 19 Dec 2024 15:33:40 -0800 Subject: [PATCH 17/25] [WebGPU EP] allows GPUDevice to be released after use (#23144) ### Description This change allows the `WebGpuContext` class to be released after all active inference sessions are released. This will cause: - for default context (ID=0), the underlying `wgpu::Device` and `wgpu::Adapter` to be released, together with all resources created by the Device. - for custom context (ID>0), the reference counts of passed in Instance, Adapter and Device will decrement correctly. --- .../core/providers/webgpu/webgpu_context.cc | 103 +++++---- .../core/providers/webgpu/webgpu_context.h | 39 +++- .../webgpu/webgpu_execution_provider.cc | 9 +- .../webgpu/webgpu_execution_provider.h | 24 +- .../webgpu/webgpu_provider_factory.cc | 217 ++++++++++-------- 5 files changed, 225 insertions(+), 167 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index c85a15017659c..fae329835067f 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -25,32 +25,9 @@ namespace onnxruntime { namespace webgpu { -void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info, const void* dawn_proc_table) { - std::call_once(init_flag_, [this, &webgpu_ep_info, dawn_proc_table]() { - // Initialization.Step.1 - Create wgpu::Instance - if (instance_ == nullptr) { - const DawnProcTable* dawn_procs = reinterpret_cast(dawn_proc_table); -#if defined(BUILD_DAWN_MONOLITHIC_LIBRARY) - ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn."); -#else -#if !defined(USE_EXTERNAL_DAWN) - if (dawn_procs == nullptr) { - dawn_procs = &dawn::native::GetProcs(); - } -#else - ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided."); -#endif - dawnProcSetProcs(dawn_procs); -#endif - - wgpu::InstanceDescriptor instance_desc{}; - instance_desc.features.timedWaitAnyEnable = true; - instance_ = wgpu::CreateInstance(&instance_desc); - - ORT_ENFORCE(instance_ != nullptr, "Failed to create wgpu::Instance."); - } - - // Initialization.Step.2 - Create wgpu::Adapter +void WebGpuContext::Initialize(const WebGpuBufferCacheConfig& buffer_cache_config, int backend_type) { + std::call_once(init_flag_, [this, &buffer_cache_config, backend_type]() { + // Create wgpu::Adapter if (adapter_ == nullptr) { #if !defined(__EMSCRIPTEN__) && defined(_MSC_VER) && defined(DAWN_ENABLE_D3D12) && !defined(USE_EXTERNAL_DAWN) // If we are using the D3D12 backend on Windows and the build does not use external Dawn, dxil.dll and dxcompiler.dll are required. 
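The release behavior described in this change relies on the `wgpu::Instance`, `wgpu::Adapter` and `wgpu::Device` C++ wrappers being reference-counted handles: copying one adds a reference and destroying or null-assigning one drops it, so erasing the owning `WebGpuContext` from the factory releases the underlying objects once no session holds them. The snippet below is only an illustration of that assumed wrapper behavior; it is not code from this patch.

```cpp
// Illustrative only -- not part of this patch. Demonstrates the ref-counted behavior
// of the Dawn/WebGPU C++ wrappers that releasing a WebGpuContext relies on.
#include <webgpu/webgpu_cpp.h>

void wrapper_refcount_sketch(wgpu::Device device) {
  wgpu::Device extra = device;  // copy: adds a reference to the same WGPUDevice
  extra = nullptr;              // drops that reference; the device stays alive via `device`
}                               // `device` is destroyed here: the last local reference is released
```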
@@ -79,7 +56,7 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info wgpu::RequestAdapterOptions req_adapter_options = {}; wgpu::DawnTogglesDescriptor adapter_toggles_desc = {}; req_adapter_options.nextInChain = &adapter_toggles_desc; - req_adapter_options.backendType = static_cast(webgpu_ep_info.backend_type); + req_adapter_options.backendType = static_cast(backend_type); req_adapter_options.powerPreference = wgpu::PowerPreference::HighPerformance; auto enabled_adapter_toggles = GetEnabledAdapterToggles(); @@ -98,7 +75,7 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info ORT_ENFORCE(adapter_ != nullptr, "Failed to get a WebGPU adapter."); } - // Initialization.Step.3 - Create wgpu::Device + // Create wgpu::Device if (device_ == nullptr) { wgpu::DeviceDescriptor device_desc = {}; wgpu::DawnTogglesDescriptor device_toggles_desc = {}; @@ -150,7 +127,10 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info device_limits_ = device_supported_limits.limits; // create buffer manager - buffer_mgr_ = BufferManagerFactory::Create(*this, webgpu_ep_info.storage_buffer_cache_mode, webgpu_ep_info.uniform_buffer_cache_mode, webgpu_ep_info.query_resolve_buffer_cache_mode); + buffer_mgr_ = BufferManagerFactory::Create(*this, + buffer_cache_config.storage.mode, + buffer_cache_config.uniform.mode, + buffer_cache_config.query_resolve.mode); // create program manager program_mgr_ = std::make_unique(Device(), DeviceLimits()); @@ -661,18 +641,46 @@ void WebGpuContext::Flush() { num_pending_dispatches_ = 0; } -std::unordered_map> WebGpuContextFactory::contexts_; +std::unordered_map WebGpuContextFactory::contexts_; std::mutex WebGpuContextFactory::mutex_; +std::once_flag WebGpuContextFactory::init_default_flag_; +wgpu::Instance WebGpuContextFactory::default_instance_; + +WebGpuContext& WebGpuContextFactory::CreateContext(const WebGpuContextConfig& config) { + const int context_id = config.context_id; + WGPUInstance instance = config.instance; + WGPUAdapter adapter = config.adapter; + WGPUDevice device = config.device; -WebGpuContext& WebGpuContextFactory::CreateContext(int context_id, - WGPUInstance instance, - WGPUAdapter adapter, - WGPUDevice device, - ValidationMode validation_mode) { if (context_id == 0) { // context ID is preserved for the default context. User cannot use context ID 0 as a custom context. 
ORT_ENFORCE(instance == nullptr && adapter == nullptr && device == nullptr, "WebGPU EP default context (contextId=0) must not have custom WebGPU instance, adapter or device."); + + std::call_once(init_default_flag_, [dawn_proc_table = config.dawn_proc_table]() { + // Step.1 - setup dawn proc table + const DawnProcTable* dawn_procs = reinterpret_cast(dawn_proc_table); +#if defined(BUILD_DAWN_MONOLITHIC_LIBRARY) + ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn."); +#else +#if !defined(USE_EXTERNAL_DAWN) + if (dawn_procs == nullptr) { + dawn_procs = &dawn::native::GetProcs(); + } +#else + ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided."); +#endif + dawnProcSetProcs(dawn_procs); +#endif + + // Step.2 - Create wgpu::Instance + wgpu::InstanceDescriptor instance_desc{}; + instance_desc.features.timedWaitAnyEnable = true; + default_instance_ = wgpu::CreateInstance(&instance_desc); + + ORT_ENFORCE(default_instance_ != nullptr, "Failed to create wgpu::Instance."); + }); + instance = default_instance_.Get(); } else { // for context ID > 0, user must provide custom WebGPU instance, adapter and device. ORT_ENFORCE(instance != nullptr && adapter != nullptr && device != nullptr, @@ -684,13 +692,16 @@ WebGpuContext& WebGpuContextFactory::CreateContext(int context_id, auto it = contexts_.find(context_id); if (it == contexts_.end()) { GSL_SUPPRESS(r.11) - auto context = std::unique_ptr(new WebGpuContext(instance, adapter, device, validation_mode)); - it = contexts_.emplace(context_id, std::move(context)).first; + auto context = std::unique_ptr(new WebGpuContext(instance, adapter, device, config.validation_mode)); + it = contexts_.emplace(context_id, WebGpuContextFactory::WebGpuContextInfo{std::move(context), 0}).first; } else if (context_id != 0) { - ORT_ENFORCE(it->second->instance_.Get() == instance && it->second->adapter_.Get() == adapter && it->second->device_.Get() == device, + ORT_ENFORCE(it->second.context->instance_.Get() == instance && + it->second.context->adapter_.Get() == adapter && + it->second.context->device_.Get() == device, "WebGPU EP context ID ", context_id, " is already created with different WebGPU instance, adapter or device."); } - return *it->second; + it->second.ref_count++; + return *it->second.context; } WebGpuContext& WebGpuContextFactory::GetContext(int context_id) { @@ -699,12 +710,24 @@ WebGpuContext& WebGpuContextFactory::GetContext(int context_id) { auto it = contexts_.find(context_id); ORT_ENFORCE(it != contexts_.end(), "WebGPU EP context ID ", context_id, " is not found."); - return *it->second; + return *it->second.context; +} + +void WebGpuContextFactory::ReleaseContext(int context_id) { + std::lock_guard lock(mutex_); + + auto it = contexts_.find(context_id); + ORT_ENFORCE(it != contexts_.end(), "WebGPU EP context ID ", context_id, " is not found."); + + if (--it->second.ref_count == 0) { + contexts_.erase(it); + } } void WebGpuContextFactory::Cleanup() { std::lock_guard lock(mutex_); contexts_.clear(); + default_instance_ = nullptr; } void CleanupWebGpuContexts() { diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h index c41ef3e211264..d1f43cdc4ddff 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.h +++ b/onnxruntime/core/providers/webgpu/webgpu_context.h @@ -26,28 +26,53 @@ class WebGpuContext; class ComputeContext; class ProgramBase; +struct WebGpuContextConfig { + int context_id; + WGPUInstance instance; 
+ WGPUAdapter adapter; + WGPUDevice device; + const void* dawn_proc_table; + ValidationMode validation_mode; +}; + +struct WebGpuBufferCacheConfig { + struct ConfigEntry { + BufferCacheMode mode; + std::string config_string; + }; + ConfigEntry storage; + ConfigEntry uniform; + ConfigEntry query_resolve; + ConfigEntry default_entry; +}; + class WebGpuContextFactory { public: - static WebGpuContext& CreateContext(int context_id, - WGPUInstance instance, - WGPUAdapter adapter, - WGPUDevice device, - ValidationMode validation_mode); + struct WebGpuContextInfo { + std::unique_ptr context; + int ref_count; + }; + + static WebGpuContext& CreateContext(const WebGpuContextConfig& config); static WebGpuContext& GetContext(int context_id); + static void ReleaseContext(int context_id); + static void Cleanup(); private: WebGpuContextFactory() {} - static std::unordered_map> contexts_; + static std::unordered_map contexts_; static std::mutex mutex_; + static std::once_flag init_default_flag_; + static wgpu::Instance default_instance_; }; // Class WebGpuContext includes all necessary resources for the context. class WebGpuContext final { public: - void Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info, const void* dawn_proc_table); + void Initialize(const WebGpuBufferCacheConfig& buffer_cache_config, int backend_type); Status Wait(wgpu::Future f); diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index 295a8de31ed50..76a55b7ce4f2e 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -743,13 +743,13 @@ using namespace webgpu; WebGpuExecutionProvider::WebGpuExecutionProvider(int context_id, WebGpuContext& context, - WebGpuExecutionProviderInfo&& info) + WebGpuExecutionProviderConfig&& config) : IExecutionProvider{kWebGpuExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0)}, context_id_{context_id}, context_{context}, - preferred_data_layout_{info.data_layout}, - force_cpu_node_names_{std::move(info.force_cpu_node_names)}, - enable_graph_capture_{info.enable_graph_capture} { + preferred_data_layout_{config.data_layout}, + force_cpu_node_names_{std::move(config.force_cpu_node_names)}, + enable_graph_capture_{config.enable_graph_capture} { } std::vector WebGpuExecutionProvider::CreatePreferredAllocators() { @@ -824,6 +824,7 @@ std::unique_ptr WebGpuExecutionProvider::GetDataTran } WebGpuExecutionProvider::~WebGpuExecutionProvider() { + WebGpuContextFactory::ReleaseContext(context_id_); } std::unique_ptr WebGpuExecutionProvider::GetProfiler() { diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h index f9c43c6bfd7d0..ad81924e06901 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h @@ -22,32 +22,22 @@ enum class BufferCacheMode; class WebGpuProfiler; } // namespace webgpu -struct WebGpuExecutionProviderInfo { - WebGpuExecutionProviderInfo(DataLayout data_layout, bool enable_graph_capture) +struct WebGpuExecutionProviderConfig { + WebGpuExecutionProviderConfig(DataLayout data_layout, bool enable_graph_capture) : data_layout{data_layout}, - enable_graph_capture{enable_graph_capture}, - backend_type{}, - storage_buffer_cache_mode{}, - uniform_buffer_cache_mode{}, - query_resolve_buffer_cache_mode{}, - 
default_buffer_cache_mode{} {} - WebGpuExecutionProviderInfo(WebGpuExecutionProviderInfo&&) = default; - WebGpuExecutionProviderInfo& operator=(WebGpuExecutionProviderInfo&&) = default; - ORT_DISALLOW_COPY_AND_ASSIGNMENT(WebGpuExecutionProviderInfo); + enable_graph_capture{enable_graph_capture} {} + WebGpuExecutionProviderConfig(WebGpuExecutionProviderConfig&&) = default; + WebGpuExecutionProviderConfig& operator=(WebGpuExecutionProviderConfig&&) = default; + ORT_DISALLOW_COPY_AND_ASSIGNMENT(WebGpuExecutionProviderConfig); DataLayout data_layout; bool enable_graph_capture; - int backend_type; - webgpu::BufferCacheMode storage_buffer_cache_mode; - webgpu::BufferCacheMode uniform_buffer_cache_mode; - webgpu::BufferCacheMode query_resolve_buffer_cache_mode; - webgpu::BufferCacheMode default_buffer_cache_mode; std::vector force_cpu_node_names; }; class WebGpuExecutionProvider : public IExecutionProvider { public: - WebGpuExecutionProvider(int context_id, webgpu::WebGpuContext& context, WebGpuExecutionProviderInfo&& info); + WebGpuExecutionProvider(int context_id, webgpu::WebGpuContext& context, WebGpuExecutionProviderConfig&& config); ~WebGpuExecutionProvider() override; std::vector> GetCapability( diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc index 6cfe9aac0b0e9..64eb80b26fbf9 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc @@ -17,25 +17,25 @@ using namespace onnxruntime::webgpu::options; namespace onnxruntime { struct WebGpuProviderFactory : IExecutionProviderFactory { - WebGpuProviderFactory(int context_id, webgpu::WebGpuContext& context, WebGpuExecutionProviderInfo&& webgpu_ep_info) - : context_id_{context_id}, context_{context}, info_{std::move(webgpu_ep_info)} { + WebGpuProviderFactory(int context_id, webgpu::WebGpuContext& context, WebGpuExecutionProviderConfig&& webgpu_ep_config) + : context_id_{context_id}, context_{context}, config_{std::move(webgpu_ep_config)} { } std::unique_ptr CreateProvider() override { - return std::make_unique(context_id_, context_, std::move(info_)); + return std::make_unique(context_id_, context_, std::move(config_)); } private: int context_id_; webgpu::WebGpuContext& context_; - WebGpuExecutionProviderInfo info_; + WebGpuExecutionProviderConfig config_; }; std::shared_ptr WebGpuProviderFactoryCreator::Create(const ConfigOptions& config_options) { // - // STEP.1 - prepare WebGpuExecutionProviderInfo + // STEP.1 - prepare WebGpuExecutionProviderConfig // - WebGpuExecutionProviderInfo webgpu_ep_info{ + WebGpuExecutionProviderConfig webgpu_ep_config{ // preferred layout is NHWC by default DataLayout::NHWC, // graph capture feature is disabled by default @@ -45,109 +45,33 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( std::string preferred_layout_str; if (config_options.TryGetConfigEntry(kPreferredLayout, preferred_layout_str)) { if (preferred_layout_str == kPreferredLayout_NHWC) { - webgpu_ep_info.data_layout = DataLayout::NHWC; + webgpu_ep_config.data_layout = DataLayout::NHWC; } else if (preferred_layout_str == kPreferredLayout_NCHW) { - webgpu_ep_info.data_layout = DataLayout::NCHW; + webgpu_ep_config.data_layout = DataLayout::NCHW; } else { ORT_THROW("Invalid preferred layout: ", preferred_layout_str); } } - LOGS_DEFAULT(VERBOSE) << "WebGPU EP preferred layout: " << int(webgpu_ep_info.data_layout) << " (parsed from \"" + LOGS_DEFAULT(VERBOSE) << "WebGPU EP preferred 
layout: " << int(webgpu_ep_config.data_layout) << " (parsed from \"" << preferred_layout_str << "\")"; std::string enable_graph_capture_str; if (config_options.TryGetConfigEntry(kEnableGraphCapture, enable_graph_capture_str)) { if (enable_graph_capture_str == kEnableGraphCapture_ON) { - webgpu_ep_info.enable_graph_capture = true; + webgpu_ep_config.enable_graph_capture = true; } else if (enable_graph_capture_str == kEnableGraphCapture_OFF) { - webgpu_ep_info.enable_graph_capture = false; + webgpu_ep_config.enable_graph_capture = false; } else { ORT_THROW("Invalid enable graph capture: ", enable_graph_capture_str); } } - LOGS_DEFAULT(VERBOSE) << "WebGPU EP graph capture enable: " << webgpu_ep_info.enable_graph_capture; - - std::string backend_type_str; - if (config_options.TryGetConfigEntry(kDawnBackendType, backend_type_str)) { -#ifdef _WIN32 - // Setup Windows default backend type based on the build configuration -#if defined(onnxruntime_ENABLE_DAWN_BACKEND_D3D12) - webgpu_ep_info.backend_type = static_cast(WGPUBackendType_D3D12); -#elif defined(onnxruntime_ENABLE_DAWN_BACKEND_VULKAN) - webgpu_ep_info.backend_type = static_cast(WGPUBackendType_Vulkan); -#endif -#endif - if (backend_type_str == kDawnBackendType_D3D12) { - webgpu_ep_info.backend_type = static_cast(WGPUBackendType_D3D12); - } else if (backend_type_str == kDawnBackendType_Vulkan) { - webgpu_ep_info.backend_type = static_cast(WGPUBackendType_Vulkan); - } else { - ORT_THROW("Invalid Dawn backend type: ", backend_type_str); - } - } - LOGS_DEFAULT(VERBOSE) << "WebGPU EP Dawn backend type: " << webgpu_ep_info.backend_type; - - auto parse_buffer_cache_mode = [&config_options](const std::string& config_entry_str, - webgpu::BufferCacheMode default_value) -> webgpu::BufferCacheMode { - std::string buffer_cache_mode_str; - if (config_options.TryGetConfigEntry(config_entry_str, buffer_cache_mode_str)) { - if (buffer_cache_mode_str == kBufferCacheMode_Disabled) { - return webgpu::BufferCacheMode::Disabled; - } else if (buffer_cache_mode_str == kBufferCacheMode_LazyRelease) { - return webgpu::BufferCacheMode::LazyRelease; - } else if (buffer_cache_mode_str == kBufferCacheMode_Simple) { - return webgpu::BufferCacheMode::Simple; - } else if (buffer_cache_mode_str == kBufferCacheMode_Bucket) { - return webgpu::BufferCacheMode::Bucket; - } else { - ORT_THROW("Invalid buffer cache mode: ", config_entry_str); - } - } else { - return default_value; - } - }; - - webgpu_ep_info.storage_buffer_cache_mode = parse_buffer_cache_mode(kStorageBufferCacheMode, webgpu::BufferCacheMode::Bucket); - LOGS_DEFAULT(VERBOSE) << "WebGPU EP storage buffer cache mode: " << webgpu_ep_info.storage_buffer_cache_mode; - - webgpu_ep_info.uniform_buffer_cache_mode = parse_buffer_cache_mode(kUniformBufferCacheMode, webgpu::BufferCacheMode::Simple); - LOGS_DEFAULT(VERBOSE) << "WebGPU EP uniform buffer cache mode: " << webgpu_ep_info.uniform_buffer_cache_mode; - - webgpu_ep_info.query_resolve_buffer_cache_mode = parse_buffer_cache_mode(kQueryResolveBufferCacheMode, webgpu::BufferCacheMode::Disabled); - LOGS_DEFAULT(VERBOSE) << "WebGPU EP query resolve buffer cache mode: " << webgpu_ep_info.query_resolve_buffer_cache_mode; - - webgpu_ep_info.default_buffer_cache_mode = parse_buffer_cache_mode(kDefaultBufferCacheMode, webgpu::BufferCacheMode::Disabled); - LOGS_DEFAULT(VERBOSE) << "WebGPU EP default buffer cache mode: " << webgpu_ep_info.default_buffer_cache_mode; - - webgpu::ValidationMode validation_mode = -#ifndef NDEBUG - webgpu::ValidationMode::Full // for debug build, 
enable full validation by default -#else - webgpu::ValidationMode::Basic // for release build, enable basic validation by default -#endif // !NDEBUG - ; - std::string validation_mode_str; - if (config_options.TryGetConfigEntry(kValidationMode, validation_mode_str)) { - if (validation_mode_str == kValidationMode_Disabled) { - validation_mode = webgpu::ValidationMode::Disabled; - } else if (validation_mode_str == kValidationMode_wgpuOnly) { - validation_mode = webgpu::ValidationMode::WGPUOnly; - } else if (validation_mode_str == kValidationMode_basic) { - validation_mode = webgpu::ValidationMode::Basic; - } else if (validation_mode_str == kValidationMode_full) { - validation_mode = webgpu::ValidationMode::Full; - } else { - ORT_THROW("Invalid validation mode: ", validation_mode_str); - } - } + LOGS_DEFAULT(VERBOSE) << "WebGPU EP graph capture enable: " << webgpu_ep_config.enable_graph_capture; // parse force CPU node names // The force CPU node names are separated by EOL (\n or \r\n) in the config entry. // each line is a node name that will be forced to run on CPU. std::string force_cpu_node_names_str; if (config_options.TryGetConfigEntry(kForceCpuNodeNames, force_cpu_node_names_str)) { - std::vector force_cpu_node_names; - // split the string by EOL (\n or \r\n) std::istringstream ss(force_cpu_node_names_str); std::string line; @@ -157,14 +81,13 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( continue; } - force_cpu_node_names.push_back(line); + webgpu_ep_config.force_cpu_node_names.push_back(line); } - - webgpu_ep_info.force_cpu_node_names = std::move(force_cpu_node_names); } + LOGS_DEFAULT(VERBOSE) << "WebGPU EP force CPU node count: " << webgpu_ep_config.force_cpu_node_names.size(); // - // STEP.2 - prepare WebGpuContext + // STEP.2 - prepare WebGpuContextConfig // int context_id = 0; std::string context_id_str; @@ -204,14 +127,110 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( std::from_chars(dawn_proc_table_str.data(), dawn_proc_table_str.data() + dawn_proc_table_str.size(), dawn_proc_table).ec); } - auto& context = webgpu::WebGpuContextFactory::CreateContext(context_id, - reinterpret_cast(webgpu_instance), - reinterpret_cast(webgpu_adapter), - reinterpret_cast(webgpu_device), - validation_mode); - context.Initialize(webgpu_ep_info, reinterpret_cast(dawn_proc_table)); + webgpu::ValidationMode validation_mode = +#ifndef NDEBUG + webgpu::ValidationMode::Full // for debug build, enable full validation by default +#else + webgpu::ValidationMode::Basic // for release build, enable basic validation by default +#endif // !NDEBUG + ; + std::string validation_mode_str; + if (config_options.TryGetConfigEntry(kValidationMode, validation_mode_str)) { + if (validation_mode_str == kValidationMode_Disabled) { + validation_mode = webgpu::ValidationMode::Disabled; + } else if (validation_mode_str == kValidationMode_wgpuOnly) { + validation_mode = webgpu::ValidationMode::WGPUOnly; + } else if (validation_mode_str == kValidationMode_basic) { + validation_mode = webgpu::ValidationMode::Basic; + } else if (validation_mode_str == kValidationMode_full) { + validation_mode = webgpu::ValidationMode::Full; + } else { + ORT_THROW("Invalid validation mode: ", validation_mode_str); + } + } + + webgpu::WebGpuContextConfig context_config{ + context_id, + reinterpret_cast(webgpu_instance), + reinterpret_cast(webgpu_adapter), + reinterpret_cast(webgpu_device), + reinterpret_cast(dawn_proc_table), + validation_mode, + }; + + // + // STEP.3 - prepare parameters for WebGPU context initialization. 
+ // + + int backend_type = 0; +#ifdef _WIN32 + // Setup Windows default backend type based on the build configuration +#if defined(DAWN_ENABLE_D3D12) + backend_type = static_cast(WGPUBackendType_D3D12); +#elif defined(DAWN_ENABLE_VULKAN) + backend_type = static_cast(WGPUBackendType_Vulkan); +#endif +#endif + + std::string backend_type_str; + if (config_options.TryGetConfigEntry(kDawnBackendType, backend_type_str)) { + if (backend_type_str == kDawnBackendType_D3D12) { + backend_type = static_cast(WGPUBackendType_D3D12); + } else if (backend_type_str == kDawnBackendType_Vulkan) { + backend_type = static_cast(WGPUBackendType_Vulkan); + } else { + ORT_THROW("Invalid Dawn backend type: ", backend_type_str); + } + } + LOGS_DEFAULT(VERBOSE) << "WebGPU EP Dawn backend type: " << backend_type; + + // buffer cache modes + auto parse_buffer_cache_mode = [&config_options](const std::string& config_entry_str, + webgpu::BufferCacheMode default_value) -> webgpu::BufferCacheMode { + std::string buffer_cache_mode_str; + if (config_options.TryGetConfigEntry(config_entry_str, buffer_cache_mode_str)) { + if (buffer_cache_mode_str == kBufferCacheMode_Disabled) { + return webgpu::BufferCacheMode::Disabled; + } else if (buffer_cache_mode_str == kBufferCacheMode_LazyRelease) { + return webgpu::BufferCacheMode::LazyRelease; + } else if (buffer_cache_mode_str == kBufferCacheMode_Simple) { + return webgpu::BufferCacheMode::Simple; + } else if (buffer_cache_mode_str == kBufferCacheMode_Bucket) { + return webgpu::BufferCacheMode::Bucket; + } else { + ORT_THROW("Invalid buffer cache mode: ", config_entry_str); + } + } else { + return default_value; + } + }; + + webgpu::WebGpuBufferCacheConfig buffer_cache_config; + + buffer_cache_config.storage.mode = parse_buffer_cache_mode(kStorageBufferCacheMode, webgpu::BufferCacheMode::Bucket); + LOGS_DEFAULT(VERBOSE) << "WebGPU EP storage buffer cache mode: " << buffer_cache_config.storage.mode; + + buffer_cache_config.uniform.mode = parse_buffer_cache_mode(kUniformBufferCacheMode, webgpu::BufferCacheMode::Simple); + LOGS_DEFAULT(VERBOSE) << "WebGPU EP uniform buffer cache mode: " << buffer_cache_config.uniform.mode; + + buffer_cache_config.query_resolve.mode = parse_buffer_cache_mode(kQueryResolveBufferCacheMode, webgpu::BufferCacheMode::Disabled); + LOGS_DEFAULT(VERBOSE) << "WebGPU EP query resolve buffer cache mode: " << buffer_cache_config.query_resolve.mode; + + buffer_cache_config.default_entry.mode = parse_buffer_cache_mode(kDefaultBufferCacheMode, webgpu::BufferCacheMode::Disabled); + LOGS_DEFAULT(VERBOSE) << "WebGPU EP default buffer cache mode: " << buffer_cache_config.default_entry.mode; + + // + // STEP.4 - start initialization. + // + + // Load the Dawn library and create the WebGPU instance and adapter. + auto& context = webgpu::WebGpuContextFactory::CreateContext(context_config); + + // Create WebGPU device and initialize the context. + context.Initialize(buffer_cache_config, backend_type); - return std::make_shared(context_id, context, std::move(webgpu_ep_info)); + // Create WebGPU EP factory. 
+ return std::make_shared(context_id, context, std::move(webgpu_ep_config)); } } // namespace onnxruntime From 7c782f674179480c30860cb8f85ca9cc9c596253 Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Fri, 20 Dec 2024 08:22:53 +0800 Subject: [PATCH 18/25] [webgpu] Always use tile matmulnbits for block_size = 32 (#23140) ### Description After the optimization of prefill time with #23102, it seems that always using the tile matmulnibits with block_size = 32 can bring better performance even for discrete gpu for phi3 model. Phi3 becomes 42.64 tokens/sec from 32.82 tokens/sec in easy mode on my NV RTX 2000 GPU. --- .../webgpu/quantization/matmul_nbits.cc | 8 +++----- .../contrib_ops/webgpu/quantization/matmul_nbits.h | 14 ++++++-------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc index 9a49adf347a29..8abcd78bfff4c 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc @@ -60,7 +60,7 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& scales = shader.AddInput("scales", ShaderUsage::UseUniform); const auto& y = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias | ShaderUsage::UseIndicesTypeAlias); - if ((is_intel_ || tile_m_ > 1) && block_size_ == 32) { + if (block_size_ == 32) { const uint32_t workgroup_size = WorkgroupSizeX() * WorkgroupSizeY(); const uint32_t tile_size = WorkgroupSizeX() * components_b_ * 8; // each uint32 has 8 data. const uint32_t a_length_per_tile = tile_size / a.NumComponents(); @@ -408,14 +408,12 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context const uint32_t components_b = GetMaxComponents(blob_size_in_words); uint32_t components = GetMaxComponents(N); - const bool is_intel = context.AdapterInfo().vendor == std::string_view{"intel"} && - context.AdapterInfo().architecture == std::string_view{"gen-12lp"}; const bool has_zero_points = zero_points != nullptr; // TODO: Support output_number > 1. Some cases are failed when output_number > 1. constexpr uint32_t output_number = 1; const uint32_t tile_m = M > kMinMForTileOptimization ? 4 : 1; - MatMulNBitsProgram program{output_number, block_size, tile_m, gsl::narrow(components_b), has_zero_points, is_intel}; + MatMulNBitsProgram program{output_number, block_size, tile_m, gsl::narrow(components_b), has_zero_points}; if (M > kMinMForTileOptimization && block_size == 32) { components = 1; constexpr uint32_t workgroup_size = 64; @@ -426,7 +424,7 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context (M + tile_m - 1) / tile_m, batch_count); program.CacheHint("T_M" + std::to_string(tile_m)); - } else if (is_intel && block_size == 32) { + } else if (block_size == 32) { components = 1; constexpr uint32_t workgroup_size = 128; const uint32_t workgroup_y = N % 8 == 0 ? 8 : N % 4 == 0 ? 
4 diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h index 8a4626083419c..57615d3ddabcf 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h @@ -14,13 +14,12 @@ using namespace onnxruntime::webgpu; class MatMulNBitsProgram final : public Program { public: - MatMulNBitsProgram(uint32_t output_number, uint32_t block_size, uint32_t tile_m, int components_b, bool has_zero_points, bool is_intel) : Program{"MatMulNBits"}, - output_number_{output_number}, - block_size_{block_size}, - tile_m_{tile_m}, - components_b_{components_b}, - has_zero_points_{has_zero_points}, - is_intel_{is_intel} { + MatMulNBitsProgram(uint32_t output_number, uint32_t block_size, uint32_t tile_m, int components_b, bool has_zero_points) : Program{"MatMulNBits"}, + output_number_{output_number}, + block_size_{block_size}, + tile_m_{tile_m}, + components_b_{components_b}, + has_zero_points_{has_zero_points} { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -32,7 +31,6 @@ class MatMulNBitsProgram final : public Program { uint32_t tile_m_; int components_b_; bool has_zero_points_; - bool is_intel_; }; class MatMulNBits final : public WebGpuKernel { From 4aca8f33df95f2326ea3f6b8337b5e2ca53f0b96 Mon Sep 17 00:00:00 2001 From: mingyue <131847423+mingyueliuh@users.noreply.github.com> Date: Fri, 20 Dec 2024 08:47:13 +0800 Subject: [PATCH 19/25] [Bug Fix] Missing CustomOp SchemaRegister when generator EPContext ONNX model (#23091) ### Description Enhancements to EPContext Operations: 1. Introduced support for the bfloat16 data type in EPContext operations. 2. Bug Fix: Missing Custom OP Schema Registration when generator EPContext ONNX model --------- Co-authored-by: mingyue Co-authored-by: Hector Li --- docs/ContribOperators.md | 2 +- onnxruntime/core/framework/graph_partitioner.cc | 2 +- onnxruntime/core/graph/contrib_ops/contrib_defs.cc | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 6ea3f93cdea12..2290030073e5c 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -1625,7 +1625,7 @@ This version of the operator has been available since version 1 of the 'com.micr #### Type Constraints
-<dt><tt>T</tt> : tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double)</dt>
+<dt><tt>T</tt> : tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)</dt>
 <dd>Constrain input and output types.</dd>
diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index 406fc1b15effc..b97cf03e3bf59 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -681,7 +681,7 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers context_cache_path, "' exist already."); } - Model ep_context_model(graph.Name(), false, graph.GetModel().MetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + Model ep_context_model(graph.Name(), false, graph.GetModel().MetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList{graph.GetSchemaRegistry()}, graph.DomainToVersionMap(), {}, logger); auto& ep_graph = ep_context_model.MainGraph(); ep_graph.SetDescription(graph.Description()); diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index c7a0793c4748f..d78fe7111c9be 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -3371,7 +3371,8 @@ void RegisterContribSchemas() { "tensor(uint64)", "tensor(float16)", "tensor(float)", - "tensor(double)"}, + "tensor(double)", + "tensor(bfloat16)"}, "Constrain input and output types."); static const char* BitmaskDropout_ver1_doc = R"DOC( From 29bccad96dfd17174388abef9443086e84808e2d Mon Sep 17 00:00:00 2001 From: xhcao Date: Sat, 21 Dec 2024 01:05:23 +0800 Subject: [PATCH 20/25] [webgpu] fix compiling error (#23139) ### Description ### Motivation and Context --- onnxruntime/core/providers/webgpu/buffer_manager.cc | 4 ++-- onnxruntime/core/providers/webgpu/webgpu_context.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/buffer_manager.cc b/onnxruntime/core/providers/webgpu/buffer_manager.cc index 45eb123943de9..233bb24083289 100644 --- a/onnxruntime/core/providers/webgpu/buffer_manager.cc +++ b/onnxruntime/core/providers/webgpu/buffer_manager.cc @@ -321,8 +321,8 @@ void BufferManager::Download(WGPUBuffer src, void* dst, size_t size) { // TODO: revise wait in whole project - ORT_ENFORCE(context_.Wait(staging_buffer.MapAsync(wgpu::MapMode::Read, 0, buffer_size, wgpu::CallbackMode::WaitAnyOnly, [](wgpu::MapAsyncStatus status, const char* message) { - ORT_ENFORCE(status == wgpu::MapAsyncStatus::Success, "Failed to download data from buffer: ", message); + ORT_ENFORCE(context_.Wait(staging_buffer.MapAsync(wgpu::MapMode::Read, 0, buffer_size, wgpu::CallbackMode::WaitAnyOnly, [](wgpu::MapAsyncStatus status, wgpu::StringView message) { + ORT_ENFORCE(status == wgpu::MapAsyncStatus::Success, "Failed to download data from buffer: ", std::string_view{message}); })) == Status::OK()); auto mapped_data = staging_buffer.GetConstMappedRange(); diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index fae329835067f..b2f7748a54743 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -532,8 +532,8 @@ void WebGpuContext::CollectProfilingData(profiling::Events& events) { 0, query_read_buffer.GetSize(), wgpu::CallbackMode::WaitAnyOnly, - [](wgpu::MapAsyncStatus status, const char* message) { - ORT_ENFORCE(status == wgpu::MapAsyncStatus::Success, "Failed to download data from buffer: ", message); + [](wgpu::MapAsyncStatus status, wgpu::StringView message) { + ORT_ENFORCE(status == wgpu::MapAsyncStatus::Success, "Failed to download 
data from buffer: ", std::string_view{message}); })) == Status::OK()); auto mapped_data = static_cast(query_read_buffer.GetConstMappedRange());

From 00b262dbb4328173bf0e11cbcd1d62e18df3d686 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov
Date: Fri, 20 Dec 2024 10:49:08 -0800
Subject: [PATCH 21/25] Implement pre-packed blobs serialization on disk and their memory mapping on load (#23069)

### Description
Pre-packing is a feature that allows kernels to re-arrange weight data to gain performance at inference time.

Currently, pre-packed blobs are shared only when cross-session weight sharing is enabled, and only for those weights that the user marks as shared. Otherwise, the data resides on the heap and each kernel owns its copy, which may be duplicated.

This change enables pre-packed data to be stored on disk alongside the external initializers. The pre-packed blobs are memory mapped and are loaded into either the cross-session shared container or a new container that shares pre-packed blobs within the session.

With the new approach, pre-packed blobs are always owned by a shared container using the existing pre-pack mechanism for sharing. When cross-session sharing is enabled, the external container owns the data. When it is not, a separate container owned by the root `SessionState` owns and shares the data.

To facilitate this, we introduce a new container that works in two modes. When an optimized model is being saved and pre-packed weights saving is enabled, the new container records pre-packed blobs and serializes them to disk using the existing `ToGraphProtoWithExternalInitializers` function. To externalize the pre-packed weights, we introduce a new session option `kOrtSessionOptionsSavePrePackedConstantInitializers`. Note that pre-packing must be enabled (the default) for this to work.

`ToGraphProtoWithExternalInitializers` is modified to recurse into subgraphs to make sure we properly account for local initializer names.

In the second mode, the container simply holds the pre-packed weights memory-mapped from disk and shares them with the kernels.

### Motivation and Context
Reduce memory usage by pre-packed initializers and externalize them.
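For illustration, a minimal, hypothetical usage sketch (not part of this patch) of how the new option could be combined with the existing external-initializer options through the ONNX Runtime C++ API; the model and data file names are placeholders:

```cpp
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env;
  Ort::SessionOptions so;

  // Save the optimized model and externalize large initializers to a data file.
  so.SetOptimizedModelFilePath(ORT_TSTR("model_opt.onnx"));
  so.AddConfigEntry("session.optimized_model_external_initializers_file_name", "model_opt.bin");
  so.AddConfigEntry("session.optimized_model_external_initializers_min_size_in_bytes", "1024");

  // New option introduced by this change (kOrtSessionOptionsSavePrePackedConstantInitializers):
  // also serialize pre-packed constant initializers to the external data file.
  so.AddConfigEntry("session.save_external_prepacked_constant_initializers", "1");

  // Creating the session runs optimization, pre-packing, and saving.
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);
  return 0;
}
```

Under the behavior described above, a later session loading `model_opt.onnx` would memory map the saved pre-packed blobs instead of recomputing them on the heap.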
--- .../onnxruntime/core/framework/op_kernel.h | 1 + include/onnxruntime/core/graph/graph.h | 92 ++++-- .../core/graph/model_saving_options.h | 44 +++ .../onnxruntime_session_options_config_keys.h | 11 + .../core/framework/prepacked_weights.h | 10 +- .../framework/prepacked_weights_container.cc | 58 ++++ .../framework/prepacked_weights_container.h | 117 ++++++- onnxruntime/core/framework/session_state.cc | 212 +++++++++--- onnxruntime/core/framework/session_state.h | 14 +- .../core/framework/session_state_utils.cc | 31 +- .../core/framework/session_state_utils.h | 4 +- .../framework/tensor_external_data_info.cc | 123 ++++++- .../framework/tensor_external_data_info.h | 61 +++- .../core/framework/tensorprotoutils.cc | 44 ++- onnxruntime/core/framework/tensorprotoutils.h | 17 +- onnxruntime/core/graph/graph.cc | 204 ++++++++---- onnxruntime/core/graph/model.cc | 25 +- onnxruntime/core/graph/model.h | 35 +- .../shared_library/provider_interfaces.h | 8 +- .../shared_library/provider_wrappedtypes.h | 9 +- .../core/providers/vitisai/imp/graph.cc | 5 +- onnxruntime/core/session/inference_session.cc | 8 +- .../core/session/provider_bridge_ort.cc | 9 +- .../save_model_with_external_initializers.cc | 44 ++- .../test/framework/session_state_test.cc | 306 +++++++++++++++++- .../test/framework/tensorutils_test.cc | 73 +++++ .../core/session/training_session.cc | 4 +- .../orttraining/training_api/module.cc | 5 +- 28 files changed, 1308 insertions(+), 266 deletions(-) create mode 100644 include/onnxruntime/core/graph/model_saving_options.h diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h index 07625c38d8474..375f0a4dc8dd2 100644 --- a/include/onnxruntime/core/framework/op_kernel.h +++ b/include/onnxruntime/core/framework/op_kernel.h @@ -7,6 +7,7 @@ // It is safe to include the below header even if SHARED_PROVIDER macro is enabled // as it doesn't include any pb headers. +#include "core/framework/buffer_deleter.h" #include "core/framework/prepacked_weights_container.h" #ifndef SHARED_PROVIDER diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index eb9581e8018d1..7798394b045dc 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -3,14 +3,15 @@ #pragma once +#include #include #include #include +#include #include #include #include #include -#include #include "core/common/flatbuffers.h" @@ -19,13 +20,14 @@ #include "core/common/common.h" #include "core/common/path_string.h" #include "core/common/const_pointer_container.h" +#include "core/common/inlined_containers_fwd.h" #if !defined(ORT_MINIMAL_BUILD) #include "core/common/inlined_containers.h" #endif -#include "core/common/inlined_containers_fwd.h" #include "core/common/span_utils.h" #include "core/common/status.h" #include "core/common/logging/logging.h" +#include "core/framework/prepacked_weights_container.h" #include "core/graph/onnx_protobuf.h" #include "core/graph/basic_types.h" #include "core/graph/constants.h" @@ -41,6 +43,7 @@ namespace onnxruntime { class Graph; struct IndexedSubGraph; class Model; +struct ModelSavingOptions; class OpSignature; #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -1153,29 +1156,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi const ONNX_NAMESPACE::GraphProto& ToGraphProto(); ONNX_NAMESPACE::GraphProto ToGraphProto() const; - // Options to align external initializer offset. 
- // For models running on CPU, ORT will try to use mmap to load external initializers. - // To use mmap, external initializer need to be offset aligned. - // ORT saves external initializers into signle data file, each initializer is accessed with - // offset(start position of initializer) and length(byte length of initializer) of the data file. - // To use mmap, each offset need to be aligned which means offset need to divisible by - // allocation granularity(64KB for windows and 4K for other OSes). - // With align_offset to true, ORT will align offset for large initializer when - // save ONNX model with external data file. - struct OffsetAlignmentInfo { - // Offset will always be page aligned and allocation granularity aligned for mmap support. - // This is done by padding previous tensor data with zeros keeping same length. - bool align_offset = false; - // Alignment threshold for size of data. - // Having a low threshold will waste file space for small initializers. - // Only when tensor's data size is > the page_align_threshold it will be force aligned. - // Default to 1MB. - int64_t align_threshold = 1048576; - // The allocation Granularity for mmap() support. - // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB. - int64_t allocation_granularity = 65536; - }; - /** Gets the GraphProto representation of this Graph @param external_file_path File path of the binary file to use for initializers. @param model_file_path path of the model file. @@ -1186,15 +1166,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi */ ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path, const std::filesystem::path& model_file_path, - size_t initializer_size_threshold, - const OffsetAlignmentInfo& align_info) const; - - ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path, - const std::filesystem::path& model_file_path, - size_t initializer_size_threshold) const { - OffsetAlignmentInfo default_options; - return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options); - } + const ModelSavingOptions& model_saving_options) const; /** Gets the ISchemaRegistry instances being used with this Graph. */ IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const; @@ -1400,6 +1372,18 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi #endif // !defined(ORT_MINIMAL_BUILD) + // This function constructs PrepackedSharedContainer in the root graph only + // and initializes a reference to it in all (sub)graphs + void ConstructPrepackedSharedContainerAndSetMode(bool saving_mode_on); + + const PrepackedWeightsForGraph& GetPrepacked() const noexcept { + return *prepacked_weights_for_graph_; + } + + PrepackedWeightsForGraph& GetPrepacked() noexcept { + return *prepacked_weights_for_graph_; + } + /** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */ const Node* ParentNode() const { return parent_node_; } @@ -1519,6 +1503,31 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto, std::optional new_name); + /// + /// This function traverses the graph bottom up and externalizes + /// constant initializers along with their pre-packed blobs from different + /// kernels. 
Writes constant initializers to the external file with any pre-packed + /// blobs (if enabled and produced for this initializer) and then modifies TensorProto + /// entry with external data references. + /// + /// model file path from Model + /// a binary file path for relative to the model file path + /// where the initializers data is written + /// model file folder path with external file path appended + /// model saving options including alignment and pre-packs + /// The graph proto to be modified + /// external file stream + /// current external file offset updated with each write + /// Status instance + Status AddExternalInitializersToGraphProtoImpl( + const std::filesystem::path& model_path, + const std::filesystem::path& external_file_path, + const std::filesystem::path& model_external_file_path, + const ModelSavingOptions& model_saving_options, + ONNX_NAMESPACE::GraphProto& output_graph_proto, + std::ostream& external_stream, + int64_t& external_offset) const; + #endif Version IrVersion() const noexcept { @@ -1703,6 +1712,21 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi std::hash, std::equal_to> sparse_tensor_names_; + // Prepacked blobs container that stored pre-packed initializers + // data that is: + // - mem-mapped from disk + // - shared within the session + // - shared across sessions by transferring the ownership of loaded data entries to + // SessionState::PrepackedWeightsContainer* if one is present. + // This container is optional because it is present only in the root graph. + std::optional prepacked_key_to_blobs_; + + // This container contains a reference to the root prepacked_key_to_blobs_ + // and also (in the save mode) records association between the initializer + // names and their pre-packed blobs (via keys). + // This is optional due to delayed construction. + std::optional prepacked_weights_for_graph_; + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // Runtime optimization storage. // Note: runtime_optimizations_ == *runtime_optimizations_ptr_ and must be initialized diff --git a/include/onnxruntime/core/graph/model_saving_options.h b/include/onnxruntime/core/graph/model_saving_options.h new file mode 100644 index 0000000000000..924799f15b247 --- /dev/null +++ b/include/onnxruntime/core/graph/model_saving_options.h @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +namespace onnxruntime { + +class PrepackedWeightsForGraph; + +// These options affect how the model initializers are written to the external file. +// This includes options to align external initializer offset. +// For models running on CPU, ORT will try to use mmap to load external +// initializers. To use mmap, external initializer need to be offset aligned. +// ORT saves external initializers into single data file, each initializer is +// accessed with offset(start position of initializer) and length(byte length of +// initializer) of the data file. To use mmap, each offset need to be aligned +// which means offset need to divisible by allocation granularity(64KB for +// windows and 4K for other OSes). With align_offset to true, ORT will align +// offset for large initializer when save ONNX model with external data file. 
+struct ModelSavingOptions { + explicit ModelSavingOptions(size_t size_threshold) + : initializer_size_threshold(size_threshold) {} + + // Mimimal initializer size in bytes to be externalized on disk + size_t initializer_size_threshold; + // Offset will always be page aligned and allocation granularity aligned for + // mmap support. This is done by padding previous tensor data with zeros + // keeping same length. + bool align_offset = false; + // Alignment threshold for size of data. + // Having a low threshold will waste file space for small initializers. + // Only when tensor's data size is > the page_align_threshold it will be force + // aligned. Default to 1MB. + int64_t align_threshold = 1048576; + // The allocation Granularity for mmap() support. + // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB. +#ifdef _WIN32 + int64_t allocation_granularity = 65536; +#else + int64_t allocation_granularity = 4096; +#endif +}; + +} // namespace onnxruntime diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index 8f1bc98ce7b49..64a4dd19c12b0 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -250,6 +250,17 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes = "session.optimized_model_external_initializers_min_size_in_bytes"; +// Use this config when saving pre-packed constant initializers to an external data file. +// This allows you to memory map pre-packed initializers on model load and leave it to +// to the OS the amount of memory consumed by the pre-packed initializers. Otherwise, +// pre-packed data resides on the heap. +// +// - "0": Default is not save pre-packed initializers to a data file. +// - "1": Save pre-packed constant initializers to an external data file. +// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsSavePrePackedConstantInitializers, "1") +static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers = + "session.save_external_prepacked_constant_initializers"; + // Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file. // The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead. // "0": disable. (default) diff --git a/onnxruntime/core/framework/prepacked_weights.h b/onnxruntime/core/framework/prepacked_weights.h index fbf99b81937ee..9695be1e0554c 100644 --- a/onnxruntime/core/framework/prepacked_weights.h +++ b/onnxruntime/core/framework/prepacked_weights.h @@ -6,7 +6,8 @@ #include #include "core/common/basic_types.h" -#include "core/framework/buffer_deleter.h" +#include "core/common/inlined_containers_fwd.h" +#include "core/framework/allocator.h" #include "core/framework/tensor_shape.h" namespace onnxruntime { @@ -16,11 +17,14 @@ struct PrePackedWeights final { // Hence we hold them in container. It is upto the developer implementing each PrePack() // method to define what gets stored in which position of the container. 
- std::vector> buffers_; // cache pre-packed buffers associated with the kernel - std::vector buffer_sizes_; // cache sizes of pre-packed buffers (in bytes) + InlinedVector> buffers_; // cache pre-packed buffers associated with the kernel + InlinedVector buffer_sizes_; // cache sizes of pre-packed buffers (in bytes) // Produces a hash of the buffers stored in the given instance of this class HashValue GetHash() const; + + // The function creates a copy with non-owning BufferUniquePtrs. + PrePackedWeights CreateReferringCopy() const; }; } // namespace onnxruntime diff --git a/onnxruntime/core/framework/prepacked_weights_container.cc b/onnxruntime/core/framework/prepacked_weights_container.cc index b6d44dd248bdd..7c832a0ac2691 100644 --- a/onnxruntime/core/framework/prepacked_weights_container.cc +++ b/onnxruntime/core/framework/prepacked_weights_container.cc @@ -3,9 +3,21 @@ #include "core/framework/prepacked_weights_container.h" #include "core/framework/allocator_utils.h" +#include "core/graph/graph.h" namespace onnxruntime { +PrePackedWeights PrePackedWeights::CreateReferringCopy() const { + PrePackedWeights copy; + for (const auto& prepacked_buffer : buffers_) { + // No deleter is needed as the buffer is not owned by the unique_ptr + copy.buffers_.emplace_back(prepacked_buffer.get(), [](void*) {}); + } + + copy.buffer_sizes_ = buffer_sizes_; + return copy; +} + AllocatorPtr PrepackedWeightsContainer::GetOrCreateAllocator(const std::string& device_name) { auto iter = allocators_.find(device_name); @@ -49,4 +61,50 @@ size_t PrepackedWeightsContainer::GetNumberOfElements() const { return prepacked_weights_map_.size(); } +void PrepackedWeightsForGraph::InsertPrepackedWeights(const std::string& key, PrePackedWeights&& packed_weight) { + // We may have duplicate entries mapped from disk if the same weight is pre-packed from subgraphs and + // up the tree by the same kernel with the same result. The map prevents this from happening. 
+ key_to_blobs_.emplace(key, std::move(packed_weight)); +} + +void PrepackedWeightsForGraph::WritePackedMaybeForSave(const std::string& weight_name, const std::string& key, + PrePackedWeights&& packed_weight) { + key_to_blobs_.insert_or_assign(key, std::move(packed_weight)); + + if (save_mode_on_) { + weight_prepacks_for_saving_[weight_name].insert(key); + } +} + +const PrePackedWeights* PrepackedWeightsForGraph::GetPrepackedWeights(const std::string& key) const { + auto it = key_to_blobs_.find(key); + if (it == key_to_blobs_.end()) { + return nullptr; + } + return &it->second; +} + +std::optional PrepackedWeightsForGraph::ReplaceWithReferenceIfSaving( + const std::string& weight_name, + const std::string& key, + const PrePackedWeights& refer_to_if_absent) { + auto it = key_to_blobs_.find(key); + if (it == key_to_blobs_.end()) { + if (save_mode_on_) { + key_to_blobs_.emplace(key, refer_to_if_absent.CreateReferringCopy()); + weight_prepacks_for_saving_[weight_name].insert(key); + } + return std::nullopt; + } + + PrePackedWeights result = std::move(it->second); + if (save_mode_on_) { + it->second = result.CreateReferringCopy(); + weight_prepacks_for_saving_[weight_name].insert(key); + } else { + key_to_blobs_.erase(it); + } + return result; +} + } // namespace onnxruntime diff --git a/onnxruntime/core/framework/prepacked_weights_container.h b/onnxruntime/core/framework/prepacked_weights_container.h index 37fc01c05f2ae..f48c790eb4126 100644 --- a/onnxruntime/core/framework/prepacked_weights_container.h +++ b/onnxruntime/core/framework/prepacked_weights_container.h @@ -3,19 +3,26 @@ #pragma once -#include -#include -#include -#include - -#include "core/framework/buffer_deleter.h" - +#include "core/common/common.h" #include "core/framework/allocator.h" -#include #include "prepacked_weights.h" +#include +#include +#include +#include +#include +#include +#include + namespace onnxruntime { +#ifndef SHARED_PROVIDER +class Graph; +#else +struct Graph; +#endif + class PrepackedWeightsContainer final { public: PrepackedWeightsContainer() { @@ -66,4 +73,98 @@ class PrepackedWeightsContainer final { std::unordered_map prepacked_weights_map_; }; +// Maps a pre-packed weight blob key to PrepackedWeights instance +using PrepackedKeyToBlobMap = std::unordered_map; + +/// +/// This class has a dual purpose. +/// If saving is OFF (IsSaveModeOn() false), it is used to contain the weights memory mapped from disk. +/// Those weights are then moved to the shared container if weight sharing is enabled. +/// If cross-session weight sharing is not enabled, the weights are stored in this container, +/// and shared with the interested kernels. +/// +/// When saving to disk is ON (IsSaveModeOn() true) +/// It records the pre-packed weights blobs and associates them with the weight name. +/// When saving the model with external initializers, the weights are written to disk along +/// with the pre-packed blobs. 
+/// +/// +class PrepackedWeightsForGraph { + public: + PrepackedWeightsForGraph(PrepackedKeyToBlobMap& key_blobs, bool save_mode_on_) + : key_to_blobs_(key_blobs), save_mode_on_(save_mode_on_) { + } + + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PrepackedWeightsForGraph); + + // WeightToPrePacksMap maps weight name to a set of pre-packed + // keys contained in the KeyToBlobMap + using KeysPerWeight = std::unordered_set; // blob keys + using WeightToPrePacksMap = std::unordered_map; + + void InsertPrepackedWeights(const std::string& key, PrePackedWeights&& packed_weight); + + // Overwrites the existing weights and associates key with weight_name + void WritePackedMaybeForSave(const std::string& weight_name, const std::string& key, + PrePackedWeights&& packed_weight); + + const PrePackedWeights* GetPrepackedWeights(const std::string& key) const; + + // The function would add or replace existing entry with references to it. + // If the entry is present, it would replace it with references to the existing entry. + // If the entry is not present, it would add reference to refer_if_absent + // If the entry is present it would return the existing entry otherwise std::nullopt + // Reference in this context means a non-owning smart pointer. Essentially, this function + // replaces the existing entry with the same entry, but transfers the ownership outside + // the container. + std::optional ReplaceWithReferenceIfSaving(const std::string& weight_name, + const std::string& key, + const PrePackedWeights& refer_to_if_absent); + + bool IsSaveModeOn() const noexcept { + return save_mode_on_; + } + + void SetSaveMode(bool value) noexcept { + save_mode_on_ = value; + } + + const KeysPerWeight* GetKeysForWeightForSaving(const std::string& weight_name) const { + auto hit = weight_prepacks_for_saving_.find(weight_name); + if (hit != weight_prepacks_for_saving_.end()) { + return &hit->second; + } + return nullptr; + } + + size_t GetNumberOfWeightsForWriting() const noexcept { + return weight_prepacks_for_saving_.size(); + } + + size_t GetNumberOfKeyedBlobsForWriting() const noexcept { + size_t result = 0; + for (const auto& [_, keys] : weight_prepacks_for_saving_) { + result += keys.size(); + } + return result; + } + + const WeightToPrePacksMap& GetWeightToPrepack() const noexcept { + return weight_prepacks_for_saving_; + } + + PrepackedKeyToBlobMap& GetKeyToBlob() noexcept { + return key_to_blobs_; + } + + const PrepackedKeyToBlobMap& GetKeyToBlob() const noexcept { + return key_to_blobs_; + } + + private: + PrepackedKeyToBlobMap& key_to_blobs_; + bool save_mode_on_; + WeightToPrePacksMap weight_prepacks_for_saving_; +}; + } // namespace onnxruntime diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 0ac2271ba09f1..d7059bf848e83 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -13,6 +13,7 @@ #include "core/framework/node_index_info.h" #include "core/framework/op_kernel.h" #include "core/framework/ort_value_pattern_planner.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/session_state_utils.h" #include "core/framework/utils.h" #include "core/providers/cpu/controlflow/utils.h" @@ -122,7 +123,9 @@ void SessionState::UpdateAllocatorsWithEnvAllocators(const std::vector& SessionState::GetConstantInitializedTen return constant_initialized_tensors_; } +const PrepackedWeightsForGraph& onnxruntime::SessionState::GetPrepackedIniitializersForGraph() const { + return 
graph_.GetPrepacked(); +} + #if !defined(DISABLE_SPARSE_TENSORS) bool SessionState::IsSparseInitializer(int ort_value_index) const { return sparse_initialized_tensors_.count(ort_value_index) > 0; @@ -396,8 +403,9 @@ static std::string GenerateKeyForPrepackedWeightsMap(const std::string& op_type, return ss_1.str(); } -Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap& constant_initializers_use_count, - const std::unordered_map& initializers_to_share_map) { +Status SessionState::PrepackConstantInitializedTensors( + InlinedHashMap& constant_initializers_use_count, + const std::unordered_map& initializers_to_share_map) { auto prepacked_constant_weights = [this, &constant_initializers_use_count, &initializers_to_share_map]( bool should_cache_prepacked_weights_for_shared_initializers) -> Status { for (auto& node : GetGraphViewer().Nodes()) { @@ -407,6 +415,7 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapExists()) { const std::string& input_name = input_def->Name(); SessionState* st = this; + auto* prepacked_for_graph = &graph_.GetPrepacked(); // subgraph can use the value from outer scope, // so it needs to check if current node uses constant initialized tensor from current and outer graphs do { @@ -423,7 +432,8 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapGetOrCreateAllocator(CPU); ORT_ENFORCE(allocator_for_caching.get() != nullptr); @@ -431,16 +441,19 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapPrePack(const_initialized_tensor, input_idx, allocator_for_caching, is_packed, &weights_to_be_filled_in)); if (is_packed) { - // BUG CHECK: Ensure that the kernel has filled in the pre-packed weight to be cached if the weight was pre-packed - ORT_ENFORCE(weights_to_be_filled_in.buffers_.size() > 0, "The kernel corresponding to the node ", node.Name(), + // BUG CHECK: Ensure that the kernel has filled in the pre-packed weight + // to be cached if the weight was pre-packed + ORT_ENFORCE(weights_to_be_filled_in.buffers_.size() > 0, + "The kernel corresponding to the node ", node.Name(), " doesn't have an implementation that can cache computed pre-packed weights"); const auto& op_type = node.OpType(); @@ -452,40 +465,117 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapHasWeight(prepacked_weights_container_key); + bool container_contains_packed_weight = prepacked_weights_container_->HasWeight( + prepacked_weights_container_key); if (container_contains_packed_weight) { - LOGS(logger_, INFO) << "Using cached version of pre-packed weight for constant initializer: " << input_name - << " used in the node: " << node.Name() << " which is of op type: " << node.OpType(); + LOGS(logger_, INFO) << "Using cached version of pre-packed weight for constant initializer: " + << input_name + << " used in the node: " << node.Name() << " which is of op type: " + << node.OpType(); + const auto& prepacked_shared = prepacked_weights_container_->GetWeight( + prepacked_weights_container_key); ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx, - prepacked_weights_container_->GetWeight(prepacked_weights_container_key), + prepacked_shared, node.Name())); ++used_shared_pre_packed_weights_counter_; - } else { // container doesn't contain the pre-packed weight - so write into it for sharing across kernel instances - if (!prepacked_weights_container_->WriteWeight(prepacked_weights_container_key, std::move(weights_to_be_filled_in))) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unable to write 
the provided PrePackedWeights instance into the container"); + // Write references to what is stored in the shared container + // and release memory mapped entries this container may have loaded from disk + std::ignore = prepacked_for_graph->ReplaceWithReferenceIfSaving(input_name, + prepacked_weights_container_key, + prepacked_shared); + + } else { + // container doesn't contain the pre-packed weight - so write into it for sharing across + // kernel instances + + // Check if we loaded it from disk, then put it into the shared container so + // everybody can share the same memory mapped entry + // the shared container takes ownership of the memory mapped entries + + // The next line replaces the existing entry with references to it + // and returns the container that holds the memory mapped entries + // so we can transfer it to shared container. + // if there is not an entry, we replace it with references to weights_to_be_filled_in + // in saving mode and return std::nullopt + auto prepacked_from_disk = prepacked_for_graph->ReplaceWithReferenceIfSaving( + input_name, + prepacked_weights_container_key, + weights_to_be_filled_in); + + if (prepacked_from_disk.has_value()) { + weights_to_be_filled_in = std::move(*prepacked_from_disk); } + if (!prepacked_weights_container_->WriteWeight(prepacked_weights_container_key, + std::move(weights_to_be_filled_in))) { + return ORT_MAKE_STATUS( + ONNXRUNTIME, FAIL, + "Unable to write the provided PrePackedWeights instance into the container"); + } + + const auto& shared_prepacked = prepacked_weights_container_->GetWeight( + prepacked_weights_container_key); ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx, - prepacked_weights_container_->GetWeight(prepacked_weights_container_key), + shared_prepacked, node.Name())); } } - } else { // caching of pre-packed weights' turned OFF + } else { + // cross session caching of pre-packed weights' turned OFF + // we use serialization container to share weights loaded from disk + // within this session. Or if the weight is not present on disk, + // we store the newly minted pre-packed data. + AllocatorPtr session_cpu_alloc = GetAllocator(kernel->Info().GetDevice(OrtMemType::OrtMemTypeDefault)); - ORT_RETURN_IF_ERROR(kernel->PrePack(const_initialized_tensor, input_idx, - session_cpu_alloc, // use allocator tied to this session + PrePackedWeights weights_to_be_filled_in; + // The reason we invoke PrePack() before looking into the container for any pre-packed weight + // cached by another instance of the same op_type (for the same constant initializer) is because + // to truly know if we can use a cached pre-packed weight, we would have to compare the cached + // pre-packed weight with the pre-packed weight generated by this instance of the same op_type because + // other static properties of the node like node attributes could play a role in the pre-packed + // weights' contents. + ORT_RETURN_IF_ERROR(kernel->PrePack(const_initialized_tensor, input_idx, session_cpu_alloc, is_packed, - nullptr // no caching required - )); + &weights_to_be_filled_in)); + + // Some kernels (matmul_nbits and non-CPU related kernels) do not share their pre-packed results + // even though they set is_packed = true so we leave it up to them. + // We can change their behavior if we wish do so in a separate PR + // XXX: Interestingly enough, matmul_nbits does accept shared pre-packs, but does not + // produce them. 
+ if (is_packed && !weights_to_be_filled_in.buffers_.empty()) { + const auto& op_type = node.OpType(); + const std::string prepacked_weights_container_key = GenerateKeyForPrepackedWeightsMap( + op_type, + weights_to_be_filled_in); + + // See if we can use pre-packed data from disk + const auto* weights_to_use = prepacked_for_graph->GetPrepackedWeights( + prepacked_weights_container_key); + + if (weights_to_use == nullptr) { + // In this case pre-packed container owns the data + prepacked_for_graph->WritePackedMaybeForSave(input_name, prepacked_weights_container_key, + std::move(weights_to_be_filled_in)); + weights_to_use = prepacked_for_graph->GetPrepackedWeights(prepacked_weights_container_key); + assert(weights_to_use != nullptr); + } + + ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx, + *weights_to_use, + node.Name())); + } } + if (is_packed) { ++number_of_prepacks_counter_; @@ -504,6 +594,7 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapParent(); + prepacked_for_graph = &st->graph_.GetPrepacked(); } while (st); } input_idx++; @@ -525,7 +616,8 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap& tensor_inputs) { +static int64_t +CalculateMemoryPatternsKey(const gsl::span& tensor_inputs) { int64_t key = 0; for (const auto& input : tensor_inputs) { for (auto dim : input.Get().Shape().GetDims()) key ^= dim; @@ -1068,9 +1160,12 @@ Status SessionState::CreateSubgraphSessionState() { // Calculate the use count of a constant initialized tensor, including the use in subgraph. // Note: This function doesn't handle the case below: -// The main graph has a constant initializer called X, and the subgraph also has a constant initializer called X, which overrides the X from main graph. -// For case like this, the current implementation will calculate the use count as 2, but they could contain completely different values so each should have a use count of 1. -// This is a very rare case. If it happens and X is prepacked, the consequence is that X won't be released and memory usage of X won't be saved. This will be fine. +// The main graph has a constant initializer called X, and the subgraph also has a constant initializer called X, +// which overrides the X from main graph. +// For case like this, the current implementation will calculate the use count as 2, but they could contain completely +// different values so each should have a use count of 1. +// This is a very rare case. If it happens and X is prepacked, the consequence is that X won't be released and memory +// usage of X won't be saved. This will be fine. 
static void ComputeConstantInitializerUseCount(const Graph& graph, InlinedHashMap& constant_initializers_use_count) { for (const auto& node : graph.Nodes()) { for (const auto* arg : node.InputDefs()) { @@ -1189,7 +1284,30 @@ Status SessionState::FinalizeSessionState(const std::basic_string constant_initializers_use_count; ComputeConstantInitializerUseCount(graph_, constant_initializers_use_count); return FinalizeSessionStateImpl(graph_location, kernel_registry_manager, nullptr, sess_options_, - remove_initializers, constant_initializers_use_count); + remove_initializers, + GetSaveModeForPrepacks(!remove_initializers, saving_ort_format), + constant_initializers_use_count); +} + +bool SessionState::GetSaveModeForPrepacks(bool saving_model, bool saving_ort_format) { + bool save_prepacked_constant_initializers = + sess_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsSavePrePackedConstantInitializers, + "0") == "1"; + + if (save_prepacked_constant_initializers && !saving_model) { + save_prepacked_constant_initializers = false; + LOGS(logger_, WARNING) + << "SavePrePackedConstantInitializers is set to true but the model is not being saved. Ignoring the flag."; + } + + if (save_prepacked_constant_initializers && saving_ort_format) { + save_prepacked_constant_initializers = false; + LOGS(logger_, WARNING) + << "Serializing optimized model in ORT format with external pre-packed constant initializers is not supported." + << " Ignoring the flag."; + } + + return save_prepacked_constant_initializers; } static Status Index(const OrtValueNameIdxMap& ort_value_name_idx_map, @@ -1322,11 +1440,12 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string& constant_initializers_use_count, const InlinedHashMap& outer_scope_node_arg_to_location_map, bool graph_info_already_created) { if (!graph_info_already_created) { - CreateGraphInfo(); + CreateGraphInfo(save_prepacked_initializers); } #if defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -1475,21 +1594,20 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string Status { - ORT_RETURN_IF_ERROR(AddInitializedTensor(idx, value, &d, constant, sparse)); - if (remove_initializers) { - graph_.RemoveInitializedTensor(name); - } - return Status::OK(); - }, - logger_, data_transfer_mgr_, external_data_loader_mgr_, *p_seq_exec_plan_, session_options, - memory_profile_func, name_to_buffered_tensor_)); + ORT_RETURN_IF_ERROR(session_state_utils::SaveInitializedTensors( + Env::Default(), graph_location, *graph_viewer_, + GetAllocator(OrtDevice()), + ort_value_name_idx_map_, initializer_allocation_order, *tensor_allocator, + [this, remove_initializers](const std::string& name, int idx, const OrtValue& value, const OrtCallback& d, + bool constant, bool sparse) -> Status { + ORT_RETURN_IF_ERROR(AddInitializedTensor(idx, value, &d, constant, sparse)); + if (remove_initializers) { + graph_.RemoveInitializedTensor(name); + } + return Status::OK(); + }, + logger_, data_transfer_mgr_, external_data_loader_mgr_, *p_seq_exec_plan_, session_options, + memory_profile_func, name_to_buffered_tensor_, graph_.GetPrepacked())); #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) // Record Weight allocation info on device @@ -1537,15 +1655,17 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string subgraph_outer_scope_node_arg_to_location_map; ORT_RETURN_IF_ERROR(OuterScopeNodeArgLocationAccumulator(*p_seq_exec_plan_, GetOrtValueNameIdxMap(), node, subgraph_session_state.GetGraphViewer(), 
subgraph_outer_scope_node_arg_to_location_map)); + ORT_RETURN_IF_ERROR(subgraph_session_state.FinalizeSessionStateImpl( graph_location, kernel_registry_manager, &node, subgraph_session_options, remove_initializers, + save_prepacked_initializers, constant_initializers_use_count, subgraph_outer_scope_node_arg_to_location_map, true)); // setup all the info for handling the feeds and fetches used in subgraph execution diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h index e1674ba4b690b..82f520f4a4252 100644 --- a/onnxruntime/core/framework/session_state.h +++ b/onnxruntime/core/framework/session_state.h @@ -164,6 +164,8 @@ class SessionState { */ const std::unordered_map& GetConstantInitializedTensors() const; + const PrepackedWeightsForGraph& GetPrepackedIniitializersForGraph() const; + #if !defined(DISABLE_SPARSE_TENSORS) bool IsSparseInitializer(int ort_value_index) const; #endif @@ -364,11 +366,20 @@ class SessionState { const SessionOptions& GetSessionOptions() const { return sess_options_; } + /// + /// Deduce the flag whether we need to enable or disable + /// saving for pre-packed weights serialization. + /// + /// + /// + /// true of false + bool GetSaveModeForPrepacks(bool saving_model, bool saving_ort_format); + private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SessionState); // Populate OrtValueNameIdxMap and create the graph viewer. - void CreateGraphInfo(); + void CreateGraphInfo(bool save_prepacked_on); // create kernels using info in kernel_create_info_map_ Status CreateKernels(const KernelRegistryManager& custom_registry_manager); @@ -399,6 +410,7 @@ class SessionState { _In_opt_ const Node* parent_node, const SessionOptions& session_options, bool remove_initializers, + bool save_prepacked_initializers, InlinedHashMap& constant_initializers_use_count, const InlinedHashMap& outer_scope_node_arg_to_location_map = {}, bool graph_info_already_created = false); diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc index 2c74805c57dce..83a353615bc35 100644 --- a/onnxruntime/core/framework/session_state_utils.cc +++ b/onnxruntime/core/framework/session_state_utils.cc @@ -68,18 +68,19 @@ struct ExtDataValueDeleter { // buffered_tensor is not null, buffered_tensor holds the real buffer pointed // by tensor_proto. buffered_tensor must be the owner of the buffer and deleter // should release the buffer when tensor_proto is released. 
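// A minimal sketch of the decision encoded by SessionState::GetSaveModeForPrepacks() above
// (illustration only; the helper name is hypothetical and the warning logging is omitted):
// the kOrtSessionOptionsSavePrePackedConstantInitializers option only takes effect when an
// optimized model is actually being written and the target is not the ORT format.
static bool DeduceSavePrepackedMode(bool option_enabled, bool saving_model, bool saving_ort_format) {
  return option_enabled && saving_model && !saving_ort_format;
}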
-static inline common::Status ExtDataTensorProtoToTensor(const Env& env, - const std::basic_string& proto_path, - const ONNX_NAMESPACE::TensorProto& tensor_proto, - Tensor& tensor, OrtCallback& ext_data_deleter, - Tensor* buffered_tensor = nullptr) { +static common::Status ExtDataTensorProtoToTensor(const Env& env, + const std::basic_string& proto_path, + const ONNX_NAMESPACE::TensorProto& tensor_proto, + Tensor& tensor, OrtCallback& ext_data_deleter, + PrepackedWeightsForGraph& prepacked_for_graph, + Tensor* buffered_tensor = nullptr) { ORT_ENFORCE(utils::HasExternalData(tensor_proto)); void* ext_data_buf = nullptr; SafeInt ext_data_len = 0; ORT_RETURN_IF_ERROR(utils::GetExtDataFromTensorProto(env, proto_path.c_str(), tensor_proto, ext_data_buf, ext_data_len, ext_data_deleter, - buffered_tensor)); + buffered_tensor, &prepacked_for_graph)); // NB: creating a do-nothing allocator per tensor is wasteful; can perhaps be // avoided if the Tensor class implements the do-nothing behavior when given a @@ -100,6 +101,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st const AllocatorPtr& alloc, const AllocatorPtr& default_cpu_alloc, OrtValue& ort_value, const DataTransferManager& data_transfer_mgr, const ExternalDataLoaderManager& external_data_loader_mgr, + PrepackedWeightsForGraph& prepacked_for_graph, bool use_device_allocator_for_initializers = false, Tensor* buffered_tensor = nullptr) { if (bool(alloc) == (m != nullptr)) { @@ -127,8 +129,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st ORT_RETURN_IF_ERROR(utils::LoadExtDataToTensorFromTensorProto(env, proto_path, tensor_proto, *external_data_loader, *p_tensor)); - auto ml_tensor = DataTypeImpl::GetType(); - ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc()); + Tensor::InitOrtValue(std::move(*p_tensor), ort_value); return common::Status::OK(); } else if (device_type == OrtDevice::CPU) { // for external initializer on CPU we will use mmap for large initializers so don't need to allocate memory in advance @@ -139,7 +140,8 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st // TensorProtoToTensor it would copy the data, causing unnecessary overhead OrtCallback ext_data_deleter; ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_tensor, - ext_data_deleter, buffered_tensor)); + ext_data_deleter, prepacked_for_graph, + buffered_tensor)); ExtDataValueDeleter deleter{ext_data_deleter, p_tensor.get()}; MLDataType ml_tensor_type = DataTypeImpl::GetType(); @@ -163,8 +165,9 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st OrtCallback ext_data_deleter; std::optional scoped_ort_callback_invoker; ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_deserialize_tensor, - ext_data_deleter, buffered_tensor)); - scoped_ort_callback_invoker = ScopedOrtCallbackInvoker(ext_data_deleter); + ext_data_deleter, prepacked_for_graph, + buffered_tensor)); + scoped_ort_callback_invoker.emplace(ext_data_deleter); // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation. 
return CopyTensorFromCPUToDevice(data_transfer_mgr, p_deserialize_tensor, p_tensor, ort_value); @@ -272,13 +275,14 @@ common::Status SaveInitializedTensors( const ExecutionPlanBase& exec_plan, const SessionOptions& session_options, const MemoryProfileFunction& memory_profile_func, - std::unordered_map>& buffered_tensors) { + std::unordered_map>& buffered_tensors, + PrepackedWeightsForGraph& prepacked_for_graph) { LOGS(logger, INFO) << "Saving initialized tensors."; ORT_ENFORCE(ort_value_name_idx_map.MaxIdx() > -1, "OrtValue indexes should have been populated."); // Determine if an intializer was supplied by the user for the purpose of sharing and if it requires a cross-device // copy. In case a cross-device copy is required, sharing cannot be accomplished since we allocate our own buffer - // for the destn device which cannot be shared between sessions. + // for the destination device which cannot be shared between sessions. auto use_user_supplied_initializer = [&session_options, &exec_plan, &logger, &ort_value_name_idx_map](const std::string& name) -> bool { bool retval = false; @@ -401,6 +405,7 @@ common::Status SaveInitializedTensors( Status st = DeserializeTensorProto(env, graph_loc, tensor_proto, (m.has_value()) ? &*m : nullptr, alloc, default_cpu_alloc, ort_value, data_transfer_mgr, external_data_loader_mgr, + prepacked_for_graph, use_device_allocator_for_initializers, p_tensor); if (!st.IsOK()) { std::ostringstream oss; diff --git a/onnxruntime/core/framework/session_state_utils.h b/onnxruntime/core/framework/session_state_utils.h index af27f5caba0f4..17400c45e5f32 100644 --- a/onnxruntime/core/framework/session_state_utils.h +++ b/onnxruntime/core/framework/session_state_utils.h @@ -9,6 +9,7 @@ #include "core/common/const_pointer_container.h" #include "core/framework/allocator.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/tensor.h" #include "core/framework/tensor_allocator.h" #include "core/framework/session_options.h" @@ -50,7 +51,8 @@ common::Status SaveInitializedTensors( const ExecutionPlanBase& exec_plan, const SessionOptions& session_options, const MemoryProfileFunction& memory_profile_func, - std::unordered_map>& buffered_tensors); + std::unordered_map>& buffered_tensors, + PrepackedWeightsForGraph& prepacked_for_graph); common::Status AllocateTensor( const onnxruntime::MemBuffer* m, diff --git a/onnxruntime/core/framework/tensor_external_data_info.cc b/onnxruntime/core/framework/tensor_external_data_info.cc index 93146e66d9f24..ec8b25e9f4afe 100644 --- a/onnxruntime/core/framework/tensor_external_data_info.cc +++ b/onnxruntime/core/framework/tensor_external_data_info.cc @@ -3,8 +3,13 @@ #include "tensor_external_data_info.h" #include "core/common/common.h" +#include "core/common/narrow.h" +#include "core/common/safeint.h" +#include "core/common/string_utils.h" #include "core/platform/path_lib.h" +#include + #ifdef _WIN32 #include #endif @@ -14,8 +19,24 @@ using ::ONNX_NAMESPACE::StringStringEntryProto; namespace onnxruntime { Status ExternalDataInfo::Create(const RepeatedPtrField& input, std::unique_ptr& out) { + auto str_to_int = [](const std::string& s, OFFSET_TYPE& result) -> Status { + char* end; +#ifdef _WIN32 + result = _strtoi64(s.c_str(), &end, 10); +#else + result = OrtStrToPtrDiff(s.c_str(), &end); +#endif + if (end != s.c_str() + s.length()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", s, " failed"); + } + return Status::OK(); + }; + out = std::make_unique(); + PrepackedInfos prepacked_infos; + const int 
input_size = input.size(); + for (int i = 0; i != input_size; ++i) { StringStringEntryProto stringmap = input[i]; if (!stringmap.has_key()) @@ -25,28 +46,112 @@ Status ExternalDataInfo::Create(const RepeatedPtrField& if (stringmap.key() == "location" && !stringmap.value().empty()) { out->rel_path_ = ToWideString(stringmap.value()); } else if (stringmap.key() == "offset" && !stringmap.value().empty()) { - char* end; -#ifdef _WIN32 - out->offset_ = _strtoi64(stringmap.value().c_str(), &end, 10); -#else - out->offset_ = OrtStrToPtrDiff(stringmap.value().c_str(), &end); -#endif - if (end != stringmap.value().c_str() + stringmap.value().length()) - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", stringmap.value(), " failed"); + ORT_RETURN_IF_ERROR(str_to_int(stringmap.value(), out->offset_)); } else if (stringmap.key() == "length" && !stringmap.value().empty()) { char* end; - out->length_ = static_cast(OrtStrToPtrDiff(stringmap.value().c_str(), &end)); + out->length_ = narrow(OrtStrToPtrDiff(stringmap.value().c_str(), &end)); if (end != stringmap.value().c_str() + stringmap.value().length()) return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", stringmap.value(), " failed"); } else if (stringmap.key() == "checksum" && !stringmap.value().empty()) { out->checksum_ = stringmap.value(); + } else if (stringmap.key().find("prepacked", 0) == 0) { + // Starts with 'prepacked', each has its own key. + // Each prepacked entry may have multiple blobs with the same key + // we output them with the same key + // format = key|offset;length;checksum[|offset;length;checksum] + // We are ignoring invalid entries (should not be any), and rely + // on in memory pre-packs regenerated in this case. + // users can over-write this file with the correct pre-packed info. + const std::string& prepacked = stringmap.value(); + if (!prepacked.empty()) { + auto split_fields = utils::SplitString(prepacked, "|", false); + if (split_fields.size() > 1) { + const std::string key{split_fields[0]}; + auto& blob_infos = prepacked_infos[key]; + for (size_t f = 1; f < split_fields.size(); ++f) { + const auto& blob = split_fields[f]; + auto blob_fields = utils::SplitString(blob, ";", false); + if (blob_fields.size() == 3) { + OFFSET_TYPE offset, len; + ORT_RETURN_IF_ERROR(str_to_int(std::string(blob_fields[0]), offset)); + ORT_RETURN_IF_ERROR(str_to_int(std::string(blob_fields[1]), len)); + blob_infos.push_back(std::make_tuple(offset, narrow(len), std::string(blob_fields[2]))); + } + } + if (blob_infos.empty()) { + prepacked_infos.erase(key); + } + } + } } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model format error!"); } } + if (out->rel_path_.empty()) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model format error! 
Missing 'location'"); } + + if (!prepacked_infos.empty()) { + out->prepacked_infos_ = std::move(prepacked_infos); + } + return Status::OK(); } +void ExternalDataInfo::SetExternalLocationToProto(const std::filesystem::path& external_file_path, + int64_t external_offset, size_t tensor_bytes_size, + ::ONNX_NAMESPACE::TensorProto& proto) { + proto.set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL); + + auto* location = proto.add_external_data(); + location->set_key("location"); + location->set_value(ToUTF8String(external_file_path.native())); + + auto* offset = proto.add_external_data(); + offset->set_key("offset"); + offset->set_value(std::to_string(external_offset)); + + auto* length = proto.add_external_data(); + length->set_key("length"); + length->set_value(std::to_string(tensor_bytes_size)); +} + +std::ostream& ExternalDataInfo::WritePrepackedToFileAndAddToProto( + const PrepackedWeightsForGraph& prepacked_for_graph, + const InlinedHashSet& blob_keys, bool align, + int64_t align_threshold, int64_t allocation_granularity, + std::ostream& os, int64_t& external_offset, ::ONNX_NAMESPACE::TensorProto& proto) { + size_t key_count = 0; + for (const auto& key : blob_keys) { + size_t prepack_count = 0; + const auto* prepacked_weights = prepacked_for_graph.GetPrepackedWeights(key); + ORT_ENFORCE(prepacked_weights != nullptr, "Prepacked weights not found for key ", key); + std::stringstream prepacked_entry; + prepacked_entry << key << "|"; + for (size_t i = 0, size = prepacked_weights->buffers_.size(); i < size; ++i) { + const auto size_in_bytes = prepacked_weights->buffer_sizes_[i]; + if (align && static_cast(size_in_bytes) > align_threshold) { + // return early on error + if (!AlignAndPad(os, allocation_granularity, external_offset)) { + return os; + } + } + if (prepack_count++ > 0) { + prepacked_entry << "|"; + } + // Checksum is currently not validated + prepacked_entry << external_offset << ";" << size_in_bytes << ";0"; + if (!os.write(reinterpret_cast(prepacked_weights->buffers_[i].get()), size_in_bytes)) { + return os; + } + external_offset = SafeInt(external_offset) + size_in_bytes; + } + auto* prepacked = proto.add_external_data(); + std::string prepacked_key("prepacked_"); + prepacked_key.append(std::to_string(key_count++)); + prepacked->set_key(std::move(prepacked_key)); + prepacked->set_value(prepacked_entry.str()); + } + return os; +} } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/framework/tensor_external_data_info.h b/onnxruntime/core/framework/tensor_external_data_info.h index afc8fda6c3037..1b185b8c5da7d 100644 --- a/onnxruntime/core/framework/tensor_external_data_info.h +++ b/onnxruntime/core/framework/tensor_external_data_info.h @@ -2,12 +2,21 @@ // Licensed under the MIT License. 
#pragma once +#include +#include +#include #include +#include + +#include +#include "core/common/path_string.h" +#include "core/common/safeint.h" #include "core/common/status.h" +#include "core/framework/prepacked_weights_container.h" #include "core/graph/onnx_protobuf.h" -#include "core/session/onnxruntime_c_api.h" namespace onnxruntime { + class ExternalDataInfo { public: #ifdef _WIN32 @@ -16,7 +25,7 @@ class ExternalDataInfo { using OFFSET_TYPE = off_t; #endif - const std::basic_string& GetRelPath() const { return rel_path_; } + const PathString& GetRelPath() const { return rel_path_; } OFFSET_TYPE GetOffset() const { return offset_; } size_t GetLength() const { return length_; } @@ -29,12 +38,58 @@ class ExternalDataInfo { const ::google::protobuf::RepeatedPtrField<::ONNX_NAMESPACE::StringStringEntryProto>& input, std::unique_ptr& out); + static void SetExternalLocationToProto(const std::filesystem::path& external_file_path, + int64_t offset, + size_t tensor_bytes_size, + ::ONNX_NAMESPACE::TensorProto& proto); + + // Pads the output with zeros according to the specified allocation_granularity + // It updates external_offset for alignment. + // need to do padding before write actual tensor data as we do offset alignment at the begin of + // large tensors (offset need to be page aligned and allocation granularity aligned) like below: + // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX + // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->| + static std::ostream& AlignAndPad(std::ostream& stream, int64_t allocation_granularity, int64_t& external_offset) { + // Align to the larger of the page size or the allocation granularity + int64_t alignment_factor = std::max(static_cast(4096), allocation_granularity); + // Align to the next page or alloc granularity boundary + SafeInt safe_external_offset = external_offset; + int64_t new_external_offset = ((safe_external_offset + alignment_factor - 1) / alignment_factor) * + alignment_factor; + + // padding tensor with zeros for alignment + for (int64_t index = external_offset; index != new_external_offset; ++index) { + stream << '\0'; + } + external_offset = new_external_offset; + return stream; + } + + static std::ostream& WritePrepackedToFileAndAddToProto( + const PrepackedWeightsForGraph& prepacked_for_graph, + const InlinedHashSet& blob_keys, + bool align, int64_t align_threshold, int64_t allocation_granularity, + std::ostream& os, + int64_t& external_offset, + ::ONNX_NAMESPACE::TensorProto& proto); + + using PrepackedInfo = std::tuple; + using PrepackedInfos = std::unordered_map>; + + bool HasPrepackedInfo() const noexcept { return !prepacked_infos_.empty(); } + + PrepackedInfos&& TakePrepackedInfos() { return std::move(prepacked_infos_); } + private: - std::basic_string rel_path_; + PathString rel_path_; OFFSET_TYPE offset_ = 0; // 0 means the whole file size_t length_ = 0; std::string checksum_; + + // Pre-packed blobs found associated with this TensorProto if present + // format key, offset, length, checksum + PrepackedInfos prepacked_infos_; }; } // namespace onnxruntime diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 2af9f95ad059e..097ce436f4419 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -234,7 +234,8 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, const 
std::filesystem::path& tensor_proto_dir, std::basic_string& external_file_path, onnxruntime::FileOffsetType& file_offset, - SafeInt& tensor_byte_size) { + SafeInt& tensor_byte_size, + ExternalDataInfo::PrepackedInfos* prepacked_infos) { ORT_RETURN_IF_NOT(onnxruntime::utils::HasExternalData(tensor_proto), "Tensor does not have external data to read from."); @@ -258,6 +259,10 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, file_offset = external_data_info->GetOffset(); + if (prepacked_infos != nullptr && external_data_info->HasPrepackedInfo()) { + *prepacked_infos = external_data_info->TakePrepackedInfos(); + } + return Status::OK(); } @@ -988,7 +993,8 @@ static Status GetFileContent(const Env& env, const std::filesystem::path& file_p Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& model_path, const ONNX_NAMESPACE::TensorProto& tensor_proto, void*& ext_data_buf, SafeInt& ext_data_len, OrtCallback& ext_data_deleter, - Tensor* buffered_tensor) { + Tensor* buffered_tensor, + PrepackedWeightsForGraph* prepacked_info) { ORT_ENFORCE(utils::HasExternalData(tensor_proto)); std::basic_string tensor_proto_dir; if (!model_path.empty()) { @@ -997,8 +1003,13 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo std::basic_string external_data_file_path; FileOffsetType file_offset; SafeInt raw_data_safe_len = 0; + std::optional prepacked_infos; + if (prepacked_info != nullptr) { + prepacked_infos.emplace(); + } ORT_RETURN_IF_ERROR( - GetExternalDataInfo(tensor_proto, tensor_proto_dir, external_data_file_path, file_offset, raw_data_safe_len)); + GetExternalDataInfo(tensor_proto, tensor_proto_dir, external_data_file_path, file_offset, + raw_data_safe_len, (prepacked_info != nullptr) ? 
&*prepacked_infos : nullptr)); if (external_data_file_path == onnxruntime::utils::kTensorProtoMemoryAddressTag) { // the value in location is the memory address of the data @@ -1042,6 +1053,33 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo ORT_RETURN_IF_ERROR(GetFileContent(env, external_data_file_path.c_str(), file_offset, raw_data_safe_len, ext_data_buf, ext_data_deleter)); ext_data_len = raw_data_safe_len; + + if (prepacked_info != nullptr && !prepacked_infos->empty()) { + for (const auto& [key, blobs] : *prepacked_infos) { + PrePackedWeights prepacked_weights; + prepacked_weights.buffers_.reserve(blobs.size()); + prepacked_weights.buffer_sizes_.reserve(blobs.size()); + for (const auto& blob : blobs) { + const auto blob_offset = std::get<0>(blob); + const auto blob_length = std::get<1>(blob); + SafeInt end_of_blob{blob_offset}; + end_of_blob += blob_length; + ORT_RETURN_IF(blob_offset < 0 || static_cast(end_of_blob) > file_length, + "Pre-packed blob: ", key, " offset: ", blob_offset, " file_length: ", file_length, + " is out of bounds and can not read in full"); + void* data_ptr; + OrtCallback data_deleter; + ORT_RETURN_IF_ERROR(GetFileContent(env, external_data_file_path.c_str(), blob_offset, blob_length, + data_ptr, data_deleter)); + IAllocatorUniquePtr data_ptr_unique{data_ptr, OrtCallbackInvoker(data_deleter)}; + prepacked_weights.buffers_.push_back(std::move(data_ptr_unique)); + prepacked_weights.buffer_sizes_.push_back(blob_length); + } + if (!blobs.empty()) { + prepacked_info->InsertPrepackedWeights(key, std::move(prepacked_weights)); + } + } + } #endif } diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index 262f7adaca1cb..7b9a47842388c 100644 --- a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -3,20 +3,21 @@ #pragma once -#include -#include -#include #include +#include +#include +#include #ifndef SHARED_PROVIDER #include "core/common/common.h" #include "core/common/status.h" #include "core/common/safeint.h" -#include "core/framework/endian_utils.h" #include "core/framework/allocator.h" +#include "core/framework/endian_utils.h" #include "core/framework/external_data_loader.h" -#include "core/framework/ort_value.h" #include "core/framework/mem_buffer.h" +#include "core/framework/ort_value.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/tensor_external_data_info.h" #include "core/graph/onnx_protobuf.h" #include "core/platform/env.h" @@ -36,7 +37,8 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, const std::filesystem::path& tensor_proto_dir, std::basic_string& external_file_path, onnxruntime::FileOffsetType& file_offset, - SafeInt& tensor_byte_size); + SafeInt& tensor_byte_size, + ExternalDataInfo::PrepackedInfos* prepacked_infos = nullptr); /** * This function is used to convert the endianess of Tensor data. * Mostly, will be used in big endian system to support the model file @@ -172,7 +174,8 @@ common::Status GetExtDataFromTensorProto(const Env& env, const std::filesystem:: const ONNX_NAMESPACE::TensorProto& tensor_proto, void*& ext_data_buf, SafeInt& ext_data_len, OrtCallback& ext_data_deleter, - Tensor* buffered_tensor = nullptr); + Tensor* buffered_tensor = nullptr, + PrepackedWeightsForGraph* prepacked_for_graph = nullptr); // Given a tensor proto with external data obtain a tensor using the specified custom external data loader. 
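// A self-contained sketch (hypothetical helper names, not ORT's API) of parsing one
// "prepacked_*" external_data value of the form
//   key|offset;length;checksum[|offset;length;checksum]
// as documented in ExternalDataInfo::Create and consumed by GetExtDataFromTensorProto above.
// Error handling is simplified: std::stoll will throw on malformed numbers.
#include <cstddef>
#include <cstdint>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>

static std::vector<std::string> SplitOn(const std::string& s, char sep) {
  std::vector<std::string> parts;
  std::string item;
  std::istringstream in(s);
  while (std::getline(in, item, sep)) parts.push_back(item);
  return parts;
}

// Returns (offset, length, checksum) triples; fields that do not have exactly three
// ';'-separated components are skipped, mirroring the tolerant parsing above.
static std::vector<std::tuple<int64_t, size_t, std::string>> ParsePrepackedEntry(
    const std::string& value, std::string& key) {
  std::vector<std::tuple<int64_t, size_t, std::string>> blobs;
  const auto fields = SplitOn(value, '|');
  if (fields.size() < 2) return blobs;
  key = fields[0];
  for (size_t i = 1; i < fields.size(); ++i) {
    const auto triple = SplitOn(fields[i], ';');
    if (triple.size() != 3) continue;
    blobs.emplace_back(std::stoll(triple[0]), static_cast<size_t>(std::stoll(triple[1])), triple[2]);
  }
  return blobs;
}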
common::Status LoadExtDataToTensorFromTensorProto(const Env& env, const std::filesystem::path& model_path, diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index e8a5855b36496..0b6610db5e007 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -18,6 +18,7 @@ #include "core/flatbuffers/flatbuffers_utils.h" #include "core/flatbuffers/schema/ort.fbs.h" #include "core/framework/tensor_shape.h" +#include "core/framework/tensor_external_data_info.h" #include "core/framework/tensorprotoutils.h" #include "core/framework/utils.h" #include "core/graph/graph_flatbuffers_utils.h" @@ -25,6 +26,7 @@ #include "core/graph/indexed_sub_graph.h" #include "core/graph/model.h" #include "core/graph/model_load_utils.h" +#include "core/graph/model_saving_options.h" #include "core/graph/node_attr_utils.h" #include "core/graph/op.h" #include "core/graph/runtime_optimization_record_container.h" @@ -1543,6 +1545,17 @@ Status Graph::VerifyNoDuplicateName() { #endif // !defined(ORT_MINIMAL_BUILD) +void Graph::ConstructPrepackedSharedContainerAndSetMode(bool saving_mode_on) { + if (parent_graph_ == nullptr) { + prepacked_key_to_blobs_.emplace(); + prepacked_weights_for_graph_.emplace(*prepacked_key_to_blobs_, saving_mode_on); + } else { + // Subgraph + prepacked_weights_for_graph_.emplace(parent_graph_->prepacked_weights_for_graph_->GetKeyToBlob(), + saving_mode_on); + } +} + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) void Graph::AddEdge(NodeIndex src_node_index, NodeIndex dst_node_index, int src_arg_slot, int dst_arg_slot) { if (nodes_.size() <= src_node_index || src_arg_slot < 0 || nodes_.size() <= dst_node_index || dst_arg_slot < 0 || @@ -4084,82 +4097,103 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProto() const { return result; } -ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path, - const std::filesystem::path& model_file_path, - size_t initializer_size_threshold, - const OffsetAlignmentInfo& align_info) const { - GraphProto result; - ToGraphProtoInternal(result); - ORT_ENFORCE(external_file_path.is_relative()); - // If model_file_path is just a file name without a path separator, for example: "model.onnx". Its parent path could - // be empty. Else, save external data file in same directory as the model. - const std::filesystem::path modified_external_file_path = model_file_path.parent_path() / external_file_path; +Status Graph::AddExternalInitializersToGraphProtoImpl( + const std::filesystem::path& model_path, + const std::filesystem::path& external_file_path, + const std::filesystem::path& model_external_file_path, + const ModelSavingOptions& model_saving_options, + ONNX_NAMESPACE::GraphProto& output_graph_proto, + std::ostream& external_stream, + int64_t& external_offset) const { + // Process initializers in a subgraph, check their size and + // write to an external file. This function also saves pre-packed + // blobs for the initializer being saved to disk, if the initializer has any pre-packs. + // This function is invoked by ToGraphProtoWithExternalInitiallizers() and processes subgraphs + // bottom up. 
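// Illustrative reading of the recursion below: subgraphs are visited first and write their
// large initializers (and any pre-packed blobs) into the same external_stream, advancing the
// shared external_offset, so every offset recorded in the output GraphProto refers to the one
// external data file created by the caller.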
+ for (const auto& node : Nodes()) { + if (node.ContainsSubgraph()) { + // Let find this node in the output_graph_proto + auto hit = std::find_if(output_graph_proto.mutable_node()->begin(), + output_graph_proto.mutable_node()->end(), + [&node](const ONNX_NAMESPACE::NodeProto& proto) { + return proto.name() == node.Name(); + }); + ORT_RETURN_IF_NOT(hit != output_graph_proto.mutable_node()->end(), "Node ", node.Name(), + " not found in output_graph_proto"); + auto& result_node = *hit; + for (const auto& e : node.GetAttributeNameToSubgraphMap()) { + const auto& name = e.first; + const auto& subgraph = e.second; + // Lets find this subgraph in the result_node + auto sub_hit = std::find_if(result_node.mutable_attribute()->begin(), + result_node.mutable_attribute()->end(), + [&name](const ONNX_NAMESPACE::AttributeProto& proto) { + return proto.name() == name; + }); + ORT_RETURN_IF_NOT(sub_hit != result_node.mutable_attribute()->end() && utils::HasGraph(*sub_hit), + "Subgraph ", name, " is referred to in GetAttributeNameToSubgraphMap, but not found in node ", + node.Name(), " while attempting to recurse into it."); + auto& result_subgraph = *sub_hit->mutable_g(); + ORT_RETURN_IF_ERROR(subgraph->AddExternalInitializersToGraphProtoImpl( + model_path, external_file_path, + model_external_file_path, model_saving_options, + result_subgraph, + external_stream, external_offset)); + } + } + } - std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary); - ORT_ENFORCE(external_stream.is_open()); - int64_t external_offset = 0; + // Used only when pre-packed weights are serialized + InlinedHashSet processed_weights; + // prepacked_weights_for_graph_ is present only when SessionState is finalized. + const bool process_prepacks = prepacked_weights_for_graph_.has_value() && + prepacked_weights_for_graph_->GetNumberOfWeightsForWriting() > 0; + if (process_prepacks) { + processed_weights.reserve(graph_proto_->initializer_size()); + } // Add the initializers to the result graph. - const auto& model_path = ModelPath(); -#if !defined(DISABLE_SPARSE_TENSORS) - const auto sparse_end = sparse_tensor_names_.end(); -#endif - for (const auto& initializer : graph_proto_->initializer()) { #if !defined(DISABLE_SPARSE_TENSORS) - if (sparse_end != sparse_tensor_names_.find(initializer.name())) { + if (IsSparseInitializer(initializer.name())) { // Sparse tensors are added to the ONNX file. - auto& sparse_initializer = *result.add_sparse_initializer(); + auto& sparse_initializer = *output_graph_proto.add_sparse_initializer(); auto status = utils::DenseTensorToSparseTensorProto(initializer, model_path, sparse_initializer); - ORT_ENFORCE(status.IsOK(), "Failed to convert dense initializer to sparse"); + ORT_RETURN_IF_NOT(status.IsOK(), "Failed to convert dense initializer to sparse"); } else { #endif // Dense tensors larger than the threshold are added to the external file. 
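// Worked example of the alignment performed by ExternalDataInfo::AlignAndPad (a sketch
// assuming the same arithmetic; names are illustrative and offset is assumed non-negative):
#include <algorithm>
#include <cstdint>

static int64_t AlignedOffset(int64_t offset, int64_t allocation_granularity) {
  // Align to the larger of the page size (4096) or the allocation granularity,
  // rounding the current offset up to the next boundary.
  const int64_t factor = std::max<int64_t>(4096, allocation_granularity);
  return ((offset + factor - 1) / factor) * factor;
}
// e.g. AlignedOffset(5000, 4096) == 8192; the writer pads the gap with '\0' bytes and records
// the aligned value as the tensor's external "offset".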
- TensorProto* output_proto = result.add_initializer(); + TensorProto* output_proto = output_graph_proto.add_initializer(); std::vector raw_data; - ORT_THROW_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data)); + ORT_RETURN_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data)); size_t tensor_bytes_size = raw_data.size(); - if (tensor_bytes_size < initializer_size_threshold) { + if (tensor_bytes_size < model_saving_options.initializer_size_threshold) { *output_proto = initializer; + if (process_prepacks) { + // These pre-packs will reside in memory + processed_weights.insert(initializer.name()); + } continue; } // update external_offset for alignment // need to do padding before write actual tensor data as we do offset alignment at the begin of - // large tensors (offset need to be page aligned and alloction granularity aligned) like below: + // large tensors (offset need to be page aligned and allocation granularity aligned) like below: // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX - // |<---small tensor---->|<---padding--->|<------------------large tensor----------------------------->| - if (align_info.align_offset && static_cast(tensor_bytes_size) > align_info.align_threshold) { - // Align to the larger of the page size or the allocation granularity - int64_t alignment_factor = std::max(static_cast(4096), align_info.allocation_granularity); - // Align to the next page or alloc granularity boundary - int64_t new_external_offset = static_cast( - std::floor((external_offset + alignment_factor - 1) / alignment_factor)) * - alignment_factor; - - // padding tensor with zeros for alignment - for (int64_t index = external_offset; index != new_external_offset; ++index) { - external_stream << '0'; - } - - external_offset = new_external_offset; + // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->| + if (model_saving_options.align_offset && static_cast(tensor_bytes_size) > + model_saving_options.align_threshold) { + ORT_RETURN_IF_NOT(ExternalDataInfo::AlignAndPad(external_stream, model_saving_options.allocation_granularity, + external_offset), + "Failed writing external data to: ", model_external_file_path); } - for (size_t index = 0; index != tensor_bytes_size; ++index) { - external_stream << raw_data[index]; - } + ORT_RETURN_IF_NOT(external_stream.write(reinterpret_cast(raw_data.data()), tensor_bytes_size), + "Failed to write external initializers to file: ", model_external_file_path); - output_proto->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL); - ONNX_NAMESPACE::StringStringEntryProto* location = output_proto->add_external_data(); - location->set_key("location"); - location->set_value(ToUTF8String(external_file_path.native())); - ONNX_NAMESPACE::StringStringEntryProto* offset = output_proto->add_external_data(); - offset->set_key("offset"); - offset->set_value(std::to_string(external_offset)); - ONNX_NAMESPACE::StringStringEntryProto* length = output_proto->add_external_data(); - length->set_key("length"); - length->set_value(std::to_string(tensor_bytes_size)); + ExternalDataInfo::SetExternalLocationToProto(external_file_path, external_offset, + tensor_bytes_size, *output_proto); output_proto->set_name(initializer.name()); output_proto->set_data_type(initializer.data_type()); @@ -4168,12 +4202,74 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std } 
output_proto->set_doc_string(initializer.doc_string()); - external_offset += tensor_bytes_size; + external_offset = SafeInt(external_offset) + tensor_bytes_size; + + if (process_prepacks) { + // check if this weight was referred to in subgraphs + InlinedHashSet blob_keys_to_external_data; + + // See if this weight has any pre-prepacks referred to in this graph. + const auto* blobs_keys_for_weight = prepacked_weights_for_graph_->GetKeysForWeightForSaving(initializer.name()); + if (blobs_keys_for_weight != nullptr && !blobs_keys_for_weight->empty()) { + // Add all the blob_keys to the set of keys to process + blob_keys_to_external_data.insert(blobs_keys_for_weight->begin(), blobs_keys_for_weight->end()); + } + + if (!blob_keys_to_external_data.empty()) { + auto& os = ExternalDataInfo::WritePrepackedToFileAndAddToProto( + *prepacked_weights_for_graph_, blob_keys_to_external_data, + model_saving_options.align_offset, model_saving_options.align_threshold, + model_saving_options.allocation_granularity, + external_stream, external_offset, *output_proto); + ORT_RETURN_IF_NOT(os.good(), "Failed to write pre-packed blobs to external file"); + } + + processed_weights.insert(initializer.name()); + } + #if !defined(DISABLE_SPARSE_TENSORS) } #endif } + // Check if there are any pre-packed weights this graph refers to, but they have + // not been processed. + if (process_prepacks) { + const auto& sorted_by_weights = prepacked_weights_for_graph_->GetWeightToPrepack(); + for (const auto& [weight_name, blob_keys] : sorted_by_weights) { + ORT_ENFORCE(processed_weights.find(weight_name) != processed_weights.end()); + } + } + + return Status::OK(); +} + +ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers( + const std::filesystem::path& external_file_path, + const std::filesystem::path& model_file_path, + const ModelSavingOptions& model_saving_options) const { + GraphProto result; + ToGraphProtoInternal(result); + ORT_ENFORCE(external_file_path.is_relative()); + // If model_file_path is just a file name without a path separator, for example: "model.onnx". Its parent path could + // be empty. Else, save external data file in same directory as the model. + const std::filesystem::path modified_external_file_path = model_file_path.parent_path() / external_file_path; + const auto& model_path = ModelPath(); + + // Create the external file. 
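// Illustrative call site for the ModelSavingOptions-based saving API introduced in this patch,
// mirroring the call sites updated later in the diff. The file names are hypothetical and the
// snippet is assumed to live in a function returning Status with an onnxruntime::Model in scope.
ModelSavingOptions model_saving_options{/*initializer_size_threshold=*/1024};
model_saving_options.align_offset = true;
ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(model,
                                                        ORT_TSTR("optimized_model.onnx"),
                                                        ORT_TSTR("optimized_model.bin"),
                                                        model_saving_options));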
+ std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary); + ORT_ENFORCE(external_stream.is_open(), "Failed to open for writing:", modified_external_file_path); + int64_t external_offset = 0; + + ORT_THROW_IF_ERROR(AddExternalInitializersToGraphProtoImpl(model_path, external_file_path, + modified_external_file_path, model_saving_options, + result, + external_stream, external_offset)); + + if (!external_stream.flush()) { + ORT_THROW("Failed to flush file with external initializers: ", modified_external_file_path); + } + return result; } diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc index 1bae63b510563..be0531e6473fb 100644 --- a/onnxruntime/core/graph/model.cc +++ b/onnxruntime/core/graph/model.cc @@ -383,14 +383,12 @@ ModelProto Model::ToProto() const { ModelProto Model::ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) const { + const ModelSavingOptions& model_saving_options) const { ModelProto result(model_proto_); const auto& graph = *graph_; *(result.mutable_graph()) = graph.ToGraphProtoWithExternalInitializers(external_file_name, file_path, - initializer_size_threshold, - align_info); + model_saving_options); return result; } @@ -607,16 +605,13 @@ template static Status SaveModelWithExternalInitializers(Model& model, const T& file_path, const std::filesystem::path& external_file_name, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { + const ModelSavingOptions& save_options) { int fd = 0; Status status = Env::Default().FileOpenWr(file_path, fd); ORT_RETURN_IF_ERROR(status); ORT_TRY { - status = Model::SaveWithExternalInitializers(model, fd, file_path, external_file_name, - initializer_size_threshold, - align_info); + status = Model::SaveWithExternalInitializers(model, fd, file_path, external_file_name, save_options); } ORT_CATCH(const std::exception& ex) { ORT_HANDLE_EXCEPTION([&]() { @@ -646,10 +641,8 @@ Status Model::Load(const PathString& file_path, std::shared_ptr& p_model, Status Model::SaveWithExternalInitializers(Model& model, const std::filesystem::path& file_path, const std::filesystem::path& external_file_name, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { - return SaveModelWithExternalInitializers(model, file_path, external_file_name, initializer_size_threshold, - align_info); + const ModelSavingOptions& save_options) { + return SaveModelWithExternalInitializers(model, file_path, external_file_name, save_options); } Status Model::LoadFromBytes(int count, const void* p_bytes, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) { @@ -765,8 +758,7 @@ Status Model::SaveWithExternalInitializers(Model& model, int fd, const std::filesystem::path& file_path, const std::filesystem::path& external_file_name, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { + const ModelSavingOptions& model_saving_options) { if (fd < 0) { return Status(ONNXRUNTIME, INVALID_ARGUMENT, " is less than 0."); } @@ -774,8 +766,7 @@ Status Model::SaveWithExternalInitializers(Model& model, ORT_RETURN_IF_ERROR(model.MainGraph().Resolve()); auto model_proto = model.ToGraphProtoWithExternalInitializers(external_file_name, file_path, - initializer_size_threshold, - align_info); + model_saving_options); google::protobuf::io::FileOutputStream output(fd); const bool 
result = model_proto.SerializeToZeroCopyStream(&output) && output.Flush(); if (result) { diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h index 9bcec6f78ca08..2d2086aef41fd 100644 --- a/onnxruntime/core/graph/model.h +++ b/onnxruntime/core/graph/model.h @@ -20,6 +20,8 @@ namespace onnxruntime { +class PrepackedShareableWeightsContainer; + namespace fbs { struct Model; } // namespace fbs @@ -190,15 +192,7 @@ class Model { // initializer offset could be page aligned and allocation granularity aligned for mmap support. ONNX_NAMESPACE::ModelProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) const; - - ONNX_NAMESPACE::ModelProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, - const std::filesystem::path& file_path, - size_t initializer_size_threshold) const { - Graph::OffsetAlignmentInfo default_align_info; - return ToGraphProtoWithExternalInitializers(external_file_name, file_path, initializer_size_threshold, default_align_info); - } + const ModelSavingOptions& model_saving_options) const; static common::Status Save(Model& model, const PathString& file_path); @@ -209,32 +203,13 @@ class Model { static common::Status SaveWithExternalInitializers(Model& model, const std::filesystem::path& file_path, const std::filesystem::path& external_file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info); - - static common::Status SaveWithExternalInitializers(Model& model, - const std::filesystem::path& file_path, - const std::filesystem::path& external_file_path, - size_t initializer_size_threshold) { - Graph::OffsetAlignmentInfo default_align_info; - return SaveWithExternalInitializers(model, file_path, external_file_path, initializer_size_threshold, default_align_info); - } - - static common::Status SaveWithExternalInitializers(Model& model, - int fd, - const std::filesystem::path& file_path, - const std::filesystem::path& external_file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info); + const ModelSavingOptions& save_options); static common::Status SaveWithExternalInitializers(Model& model, int fd, const std::filesystem::path& file_path, const std::filesystem::path& external_file_path, - size_t initializer_size_threshold) { - Graph::OffsetAlignmentInfo default_align_info; - return SaveWithExternalInitializers(model, fd, file_path, external_file_path, initializer_size_threshold, default_align_info); - } + const ModelSavingOptions& save_options); static common::Status Load(std::istream& model_istream, ONNX_NAMESPACE::ModelProto* p_model_proto); diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index d182d0b9173bd..8bd4067e59492 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -42,6 +42,8 @@ using ProviderType = const std::string&; class RandomGenerator; class IOnnxRuntimeOpSchemaCollection; +struct ModelSavingOptions; + #ifdef ENABLE_TRAINING_TORCH_INTEROP namespace contrib { class PythonOpBase; @@ -901,7 +903,11 @@ struct ProviderHost { virtual void Model__operator_delete(Model* p) = 0; virtual Graph& Model__MainGraph(Model* p) = 0; virtual std::unique_ptr Model__ToProto(Model* p) = 0; - virtual 
std::unique_ptr Model__ToGraphProtoWithExternalInitializers(Model* p, const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) = 0; + virtual std::unique_ptr Model__ToGraphProtoWithExternalInitializers( + Model* p, + const std::filesystem::path& external_file_name, + const std::filesystem::path& file_path, + const ModelSavingOptions&) = 0; virtual const ModelMetaData& Model__MetaData(const Model* p) const noexcept = 0; virtual Status Model__Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 54249f0864cd7..d8516d5858a2f 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -934,6 +934,8 @@ struct NodeUnit final { Node::EdgeConstIterator OutputEdgesEnd() const { return g_host->NodeUnit__OutputEdgesEnd(this); } }; +struct ModelSavingOptions; + struct Model final { static std::unique_ptr Create(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) { @@ -945,7 +947,12 @@ struct Model final { Graph& MainGraph() { return g_host->Model__MainGraph(this); } std::unique_ptr ToProto() { return g_host->Model__ToProto(this); } - std::unique_ptr ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) { return g_host->Model__ToGraphProtoWithExternalInitializers(this, external_file_name, file_path, initializer_size_threshold); } + std::unique_ptr ToGraphProtoWithExternalInitializers( + const std::filesystem::path& external_file_name, + const std::filesystem::path& file_path, const ModelSavingOptions& model_saving_options) { + return g_host->Model__ToGraphProtoWithExternalInitializers(this, external_file_name, file_path, + model_saving_options); + } const ModelMetaData& MetaData() const noexcept { return g_host->Model__MetaData(this); } Model() = delete; diff --git a/onnxruntime/core/providers/vitisai/imp/graph.cc b/onnxruntime/core/providers/vitisai/imp/graph.cc index 191d26f3ab269..e7b39546fda6a 100644 --- a/onnxruntime/core/providers/vitisai/imp/graph.cc +++ b/onnxruntime/core/providers/vitisai/imp/graph.cc @@ -9,6 +9,7 @@ #include #include +#include "core/graph/model_saving_options.h" #include "core/providers/shared_library/provider_api.h" #include "./vai_assert.h" @@ -111,7 +112,9 @@ void graph_save(const Graph& graph, const std::string& filename, const std::stri if (initializer_size_threshold == std::numeric_limits::max()) { model_proto = model->ToProto(); } else { - model_proto = model->ToGraphProtoWithExternalInitializers(ToPathString(filename_dat), ToPathString(filename), initializer_size_threshold); + ModelSavingOptions model_saving_options{initializer_size_threshold}; + model_proto = model->ToGraphProtoWithExternalInitializers(ToPathString(filename_dat), ToPathString(filename), + model_saving_options); } auto& metadata = model->MetaData(); if (!metadata.empty()) { diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index a60ee500a9898..223eed248800e 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -38,6 +38,7 @@ #include 
"core/framework/utils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/optimizer/graph_transformer_utils.h" #include "core/optimizer/graph_transformer.h" #include "core/optimizer/layout_transformation/layout_transformation.h" @@ -2099,13 +2100,12 @@ common::Status InferenceSession::Initialize() { const size_t optimized_model_external_initializers_min_size_in_bytes = ParseStringWithClassicLocale(session_options_.config_options.GetConfigOrDefault( kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes, "1024")); - Graph::OffsetAlignmentInfo align_info; - align_info.align_offset = true; + ModelSavingOptions model_saving_options{optimized_model_external_initializers_min_size_in_bytes}; + model_saving_options.align_offset = true; ORT_RETURN_IF_ERROR_SESSIONID_(Model::SaveWithExternalInitializers(*model_, session_options_.optimized_model_filepath, optimized_model_external_initializers_file_name, - optimized_model_external_initializers_min_size_in_bytes, - align_info)); + model_saving_options)); } } } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 1444c1976d447..a40fabd6a607c 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1072,7 +1072,14 @@ struct ProviderHostImpl : ProviderHost { void Model__operator_delete(Model* p) override { delete p; } Graph& Model__MainGraph(Model* p) override { return p->MainGraph(); } std::unique_ptr Model__ToProto(Model* p) override { return std::make_unique(p->ToProto()); } - std::unique_ptr Model__ToGraphProtoWithExternalInitializers(Model* p, const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) override { return std::make_unique(p->ToGraphProtoWithExternalInitializers(external_file_name, file_path, initializer_size_threshold)); }; + std::unique_ptr Model__ToGraphProtoWithExternalInitializers(Model* p, + const std::filesystem::path& external_file_name, + const std::filesystem::path& file_path, + const ModelSavingOptions& model_saving_options) override { + return std::make_unique(p->ToGraphProtoWithExternalInitializers(external_file_name, + file_path, + model_saving_options)); + }; const ModelMetaData& Model__MetaData(const Model* p) const noexcept override { return p->MetaData(); }; Status Model__Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) override { return Model::Load(file_path, model_proto); } diff --git a/onnxruntime/test/framework/save_model_with_external_initializers.cc b/onnxruntime/test/framework/save_model_with_external_initializers.cc index d0bc088175755..98874874d50e9 100644 --- a/onnxruntime/test/framework/save_model_with_external_initializers.cc +++ b/onnxruntime/test/framework/save_model_with_external_initializers.cc @@ -6,6 +6,7 @@ #include "core/common/path_string.h" #include "core/framework/data_types.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/framework/tensorprotoutils.h" #include "test/test_environment.h" #include "test_utils.h" @@ -23,15 +24,14 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, const std::filesystem::path& input_external_init_file, const std::filesystem::path& output_onnx, const std::filesystem::path& output_external_init_file, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { + const 
ModelSavingOptions& model_saving_options) { auto logger = DefaultLoggingManager().CreateLogger("LoadSaveAndCompareModel"); std::shared_ptr model; ORT_RETURN_IF_ERROR(Model::Load(input_onnx, model, nullptr, *logger)); std::filesystem::remove(output_onnx); std::filesystem::remove(output_external_init_file); - ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(*model, output_onnx, output_external_init_file, initializer_size_threshold, - align_info)); + ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(*model, output_onnx, output_external_init_file, + model_saving_options)); std::shared_ptr model_from_external; ORT_RETURN_IF_ERROR(Model::Load(output_onnx.native(), model_from_external, nullptr, *logger)); @@ -67,7 +67,7 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, ORT_RETURN_IF_ERROR(utils::UnpackInitializerData(*from_external_tensor_proto, model_path, from_external_tensor_proto_data)); size_t from_external_tensor_proto_size = from_external_tensor_proto_data.size(); - if (from_external_tensor_proto_size < initializer_size_threshold) { + if (from_external_tensor_proto_size < model_saving_options.initializer_size_threshold) { // 'Small' tensors should be embedded in the onnx file. ORT_RETURN_IF_NOT(from_external_tensor_proto->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_DEFAULT, "location mismatch"); } else { @@ -78,13 +78,14 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, ORT_RETURN_IF_NOT(tensor_proto_size == from_external_tensor_proto_size, "size mismatch"); ORT_RETURN_IF_NOT(memcmp(tensor_proto_data.data(), from_external_tensor_proto_data.data(), tensor_proto_size) == 0, "data mismatch"); - if (align_info.align_offset) { + if (model_saving_options.align_offset) { for (const StringStringEntryProto& entry : from_external_tensor_proto->external_data()) { if (entry.has_key() && entry.has_value() && entry.key() == "offset") { size_t tensor_offset; std::stringstream stream(entry.value()); stream >> tensor_offset; - ORT_RETURN_IF_NOT(tensor_offset % align_info.allocation_granularity == 0, "tensor offset not align"); + ORT_RETURN_IF_NOT(tensor_offset % model_saving_options.allocation_granularity == 0, + "tensor offset not align"); } } } @@ -97,22 +98,35 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, // Original model does not have external initializers TEST(SaveWithExternalInitializers, Mnist) { - Graph::OffsetAlignmentInfo align_info; - ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/mnist.onnx"), ORT_TSTR(""), ORT_TSTR("testdata/mnist_with_external_initializers.onnx"), ORT_TSTR("mnist_external_initializers.bin"), 100, align_info)); + ModelSavingOptions model_saving_options{100}; + ASSERT_STATUS_OK(LoadSaveAndCompareModel( + ORT_TSTR("testdata/mnist.onnx"), + ORT_TSTR(""), ORT_TSTR("testdata/mnist_with_external_initializers.onnx"), + ORT_TSTR("mnist_external_initializers.bin"), + model_saving_options)); } // Original model has external initializers TEST(SaveWithExternalInitializers, ModelWithOriginalExternalData) { - Graph::OffsetAlignmentInfo align_info; - ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), ORT_TSTR("model_with_orig_ext_data.onnx.data"), ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), ORT_TSTR("model_with_new_external_initializers.bin"), 0, align_info)); + ModelSavingOptions model_saving_options{0}; + ASSERT_STATUS_OK(LoadSaveAndCompareModel( + 
ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), + ORT_TSTR("model_with_orig_ext_data.onnx.data"), + ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), + ORT_TSTR("model_with_new_external_initializers.bin"), + model_saving_options)); } // Original model has external initializers, align offset TEST(SaveWithExternalInitializers, ModelWithOriginalExternalDataAlignOffset) { - Graph::OffsetAlignmentInfo align_info; - align_info.align_offset = true; - align_info.align_threshold = 0; - ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), ORT_TSTR("model_with_orig_ext_data.onnx.data"), ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), ORT_TSTR("model_with_new_external_initializers.bin"), 0, align_info)); + ModelSavingOptions model_saving_options{0}; + model_saving_options.align_offset = true; + model_saving_options.align_threshold = 0; + ASSERT_STATUS_OK(LoadSaveAndCompareModel( + ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), + ORT_TSTR("model_with_orig_ext_data.onnx.data"), + ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), + ORT_TSTR("model_with_new_external_initializers.bin"), model_saving_options)); } } // namespace test diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index 3e694020f796b..e7f8b1aaa49d8 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -15,6 +15,7 @@ #include "core/graph/graph_utils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/graph/op.h" #include "core/providers/cpu/cpu_execution_provider.h" #include "core/session/onnxruntime_session_options_config_keys.h" @@ -22,13 +23,101 @@ #include "gtest/gtest.h" #include "test/test_environment.h" #include "test/util/include/default_providers.h" +#include "test/util/include/file_util.h" #include "core/optimizer/layout_transformation/layout_transformation.h" using namespace ONNX_NAMESPACE; -using namespace std; namespace onnxruntime { - namespace test { + +#ifndef ENABLE_TRAINING_CORE +#ifndef __wasm__ +static void TestSavedPrepacks(const Model& model) { + auto inspect = [](const Graph& graph) { + const auto& prepacked_for_graph = graph.GetPrepacked(); + const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob(); + ASSERT_EQ(1U, key_to_blob.size()); + const size_t expected_prepacks_for_writing = (graph.ParentGraph() == nullptr) ? 1U : 0U; + ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting()); + + const size_t expected_blobs_for_writing = (graph.ParentGraph() == nullptr) ? 
1U : 0U; + ASSERT_EQ(expected_blobs_for_writing, prepacked_for_graph.GetNumberOfKeyedBlobsForWriting()); + + if (graph.ParentGraph() == nullptr) { + const auto* blob_keys = prepacked_for_graph.GetKeysForWeightForSaving("if_shared"); + ASSERT_TRUE(blob_keys != nullptr); + ASSERT_EQ(blob_keys->size(), 1U); + const auto* prepacked_weights = prepacked_for_graph.GetPrepackedWeights(*blob_keys->cbegin()); + ASSERT_TRUE(prepacked_weights != nullptr); + ASSERT_EQ(prepacked_weights->buffer_sizes_.size(), 1U); + ASSERT_EQ(prepacked_weights->buffer_sizes_[0], sizeof(float) * 2); + } + }; + + const auto& main_graph = model.MainGraph(); + inspect(main_graph); + + const auto& nodes = main_graph.Nodes(); + auto if_node_hit = std::find_if(nodes.begin(), nodes.end(), + [](const Node& node) { return node.Name() == "if"; }); + ASSERT_FALSE(if_node_hit == nodes.end()); + const Node& if_node = *if_node_hit; + for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) { + inspect(*subgraph); + } +} + +static void TestLoadedSharedUserSupplied(const Model& model) { + auto inspect = [](const Graph& graph) { + const auto& prepacked_for_graph = graph.GetPrepacked(); + constexpr size_t expected_prepacks_for_writing = 0U; + ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting()); + + // We have not loaded anything since this initializer is user supplied + const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob(); + ASSERT_EQ(0U, key_to_blob.size()); + }; + + const auto& main_graph = model.MainGraph(); + inspect(main_graph); + + const auto& nodes = main_graph.Nodes(); + auto if_node_hit = std::find_if(nodes.begin(), nodes.end(), + [](const Node& node) { return node.Name() == "if"; }); + ASSERT_FALSE(if_node_hit == nodes.end()); + const Node& if_node = *if_node_hit; + for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) { + inspect(*subgraph); + } +} + +static void TestLoadedSharedNoUserSupplied(const Model& model) { + auto inspect = [](const Graph& graph) { + const auto& prepacked_for_graph = graph.GetPrepacked(); + constexpr size_t expected_prepacks_for_writing = 0U; + ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting()); + + // We have not loaded anything since this initializer is user supplied + const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob(); + ASSERT_EQ(1U, key_to_blob.size()); + }; + + const auto& main_graph = model.MainGraph(); + inspect(main_graph); + + const auto& nodes = main_graph.Nodes(); + auto if_node_hit = std::find_if(nodes.begin(), nodes.end(), + [](const Node& node) { return node.Name() == "if"; }); + ASSERT_FALSE(if_node_hit == nodes.end()); + const Node& if_node = *if_node_hit; + for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) { + inspect(*subgraph); + } +} + +#endif // __wasm__ +#endif // ENABLE_TRAINING_CORE + class TestOpKernel : public OpKernel { public: TestOpKernel(const OpKernelInfo& p) : OpKernel(p) { @@ -378,7 +467,7 @@ class PrePackingTestOpKernel : public OpKernel { ORT_UNUSED_PARAMETER(tensor); ORT_UNUSED_PARAMETER(input_idx); - size_t weight_packed_len = 8; + constexpr const size_t weight_packed_len = sizeof(float) * 2; weight_packed_ = IAllocator::MakeUniquePtr(alloc, weight_packed_len, true); float* data_weights_packed = reinterpret_cast(weight_packed_.get()); data_weights_packed[0] = 1.2345f; @@ -647,7 +736,8 @@ class SessionStateTestSharedInitalizersWithPrePacking : public ::testing::Test { } }; -// Pre-packing enabled + no shared 
initializers = no pre-packed weights caching +// Pre-packing enabled + no shared initializers, however, we put all the pre-packs +// in a session_state container for ownership. TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) { SessionOptions sess_options; sess_options.enable_mem_pattern = true; @@ -679,10 +769,11 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) { const auto* kernel = reinterpret_cast(session_state_1.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made. However, they sharing call is still made from a serialized container. ASSERT_EQ(session_state_1.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + // In this case the sharing comes from the serialized container + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); // Second session/model Model model_2("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), @@ -706,10 +797,11 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) { kernel = reinterpret_cast(session_state_2.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made. The weights are still shared from the serialized container + // either because they are loaded from disk or because the container takes ownership of them. ASSERT_EQ(session_state_2.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); } // Pre-packing enabled + shared initializers + no pre-packed weights container = no pre-packed weights caching @@ -754,10 +846,10 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test2) { const auto* kernel = reinterpret_cast(session_state_1.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made, but sharing still takes place from the serialized container ASSERT_EQ(session_state_1.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); // Second session/model Model model_2("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), @@ -781,10 +873,10 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test2) { kernel = reinterpret_cast(session_state_2.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made, but sharing still takes place from the serialized container ASSERT_EQ(session_state_2.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); } // Pre-packing enabled + shared initializers + pre-packed weights container = pre-packed weights caching enabled @@ -999,6 +1091,196 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test4) { ASSERT_EQ(if_node_branches_shared_prepack_counter_2, static_cast(2)); } +#ifndef 
__wasm__ +// sharing is on +TEST_F(SessionStateTestSharedInitalizersWithPrePacking, TestPrepackedSerialization) { + const std::filesystem::path model_with_external_initializers = + "testdata/test_prepacked_serialization_optimized_model.onnx"; + + const std::filesystem::path external_initializers_file = + "test_prepacked_serialization_optimized_model.bin"; + + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + sess_options.optimized_model_filepath = model_with_external_initializers; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + // Enable saving model with pre-packed weights + sess_options.config_options.configurations[kOrtSessionOptionsSavePrePackedConstantInitializers] = "1"; + + // Enable shared initializer + OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator); + std::vector float_data(1, 1); + auto value = std::make_unique(); + Tensor::InitOrtValue(DataTypeImpl::GetType(), TensorShape(std::vector{1}), + reinterpret_cast(float_data.data()), mem_info, *value); + + ASSERT_STATUS_OK(sess_options.AddInitializer("if_shared", value.get())); + + // Enable pre-packed weights container for shared initializers + PrepackedWeightsContainer prepacked_weights_container; + Model model_1("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + + CreateGraphWithSubgraph(model_1.MainGraph()); + PlaceAllNodesToCPUEP(model_1.MainGraph()); + SessionState session_state_1(model_1.MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + &prepacked_weights_container); + + constexpr const bool saving_model_true = true; + + ASSERT_STATUS_OK(session_state_1.FinalizeSessionState(std::basic_string(), + kernel_registry_manager, + !saving_model_true)); + + TestSavedPrepacks(model_1); + + ModelSavingOptions model_saving_options{4}; + model_saving_options.align_offset = true; + + ASSERT_STATUS_OK(Model::SaveWithExternalInitializers(model_1, model_with_external_initializers, + external_initializers_file, + model_saving_options)); + } + ScopedFileDeleter test_model_deleter(model_with_external_initializers); + ScopedFileDeleter binary_file_deleter(external_initializers_file); + + // Now let's load the model along with the initializers + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + + // We are expecting this weight to be loaded from disk along + // with its pre-packed version + // Enable shared initializer + OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator); + std::vector float_data(1, 1); + auto value = std::make_unique(); + Tensor::InitOrtValue(DataTypeImpl::GetType(), TensorShape(std::vector{1}), + reinterpret_cast(float_data.data()), mem_info, *value); + + ASSERT_STATUS_OK(sess_options.AddInitializer("if_shared", value.get())); + + // Enable pre-packed weights container for shared initializers + PrepackedWeightsContainer prepacked_weights_container; + + 
std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr, + DefaultLoggingManager().DefaultLogger())); + + PlaceAllNodesToCPUEP(model->MainGraph()); + SessionState session_state(model->MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + &prepacked_weights_container); + + ASSERT_STATUS_OK(session_state.FinalizeSessionState(std::basic_string(), + kernel_registry_manager, + false)); + + TestLoadedSharedUserSupplied(*model); + } + + // Load again, this time sharing is enabled, but no shared initializer in the map + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + + // Enable pre-packed weights container for shared initializers + PrepackedWeightsContainer prepacked_weights_container; + + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr, + DefaultLoggingManager().DefaultLogger())); + + PlaceAllNodesToCPUEP(model->MainGraph()); + SessionState session_state(model->MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + &prepacked_weights_container); + + ASSERT_STATUS_OK(session_state.FinalizeSessionState(model_with_external_initializers, + kernel_registry_manager, + false)); + + TestLoadedSharedNoUserSupplied(*model); + } + // Load again, sharing is disabled + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr, + DefaultLoggingManager().DefaultLogger())); + + PlaceAllNodesToCPUEP(model->MainGraph()); + SessionState session_state(model->MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + nullptr); + + ASSERT_STATUS_OK(session_state.FinalizeSessionState(model_with_external_initializers, + kernel_registry_manager, + false)); + + const auto& prepacked_for_main_graph = model->MainGraph().GetPrepacked(); + ASSERT_FALSE(prepacked_for_main_graph.IsSaveModeOn()); + ASSERT_EQ(1U, prepacked_for_main_graph.GetKeyToBlob().size()); + } +} +#endif // __wasm__ + INSTANTIATE_TEST_SUITE_P(SessionStateTests, SessionStatePrepackingTest, testing::Values(PrepackingTestParam{false, false}, diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc index 6821f582ce2de..229f4f95b8394 100644 --- a/onnxruntime/test/framework/tensorutils_test.cc +++ b/onnxruntime/test/framework/tensorutils_test.cc @@ -1,6 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#include "core/common/inlined_containers.h" +#include "core/framework/prepacked_weights.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/tensorprotoutils.h" #include "core/graph/onnx_protobuf.h" #include "test/util/include/asserts.h" @@ -19,6 +22,76 @@ using namespace ONNX_NAMESPACE; namespace onnxruntime { namespace test { +// Test ExternalData functionality +TEST(TensorProtoUtilsTest, SetExternalDataInformation) { + ONNX_NAMESPACE::TensorProto tensor_proto; + const std::filesystem::path kExternalDataPath("test.bin"); + constexpr const int64_t init_offset = 100; + constexpr const size_t init_length = 200; + + ExternalDataInfo::SetExternalLocationToProto(kExternalDataPath, init_offset, init_length, tensor_proto); + + ASSERT_EQ(tensor_proto.data_location(), ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL); + ASSERT_EQ(tensor_proto.external_data_size(), 3); + ASSERT_EQ(tensor_proto.external_data(0).key(), "location"); + ASSERT_EQ(tensor_proto.external_data(0).value(), ToUTF8String(kExternalDataPath.native())); + ASSERT_EQ(tensor_proto.external_data(1).key(), "offset"); + ASSERT_EQ(tensor_proto.external_data(1).value(), std::to_string(init_offset)); + ASSERT_EQ(tensor_proto.external_data(2).key(), "length"); + ASSERT_EQ(tensor_proto.external_data(2).value(), std::to_string(init_length)); + + PrepackedKeyToBlobMap key_to_blob; + constexpr bool save_mode_on = true; + PrepackedWeightsForGraph prepacked_for_graph(key_to_blob, save_mode_on); + PrePackedWeights prepacked_weights; + const std::string init_name = "test_initializer"; + const std::string blob_key = "test_key"; + + std::array kData = {1.2345f, 2.4690f}; + const size_t buffer_size = kData.size() * sizeof(float); + + prepacked_weights.buffers_.push_back(BufferUniquePtr(kData.data(), BufferDeleter(nullptr))); + prepacked_weights.buffer_sizes_.push_back(buffer_size); + // Write a second entry like this + prepacked_weights.buffers_.push_back(BufferUniquePtr(kData.data(), BufferDeleter(nullptr))); + prepacked_weights.buffer_sizes_.push_back(buffer_size); + + prepacked_for_graph.WritePackedMaybeForSave(init_name, blob_key, std::move(prepacked_weights)); + + constexpr const int64_t starting_offset = 300; + int64_t external_offset = starting_offset; + std::stringstream ss; + const auto* blobs_for_weight = prepacked_for_graph.GetKeysForWeightForSaving(init_name); + ASSERT_TRUE(blobs_for_weight != nullptr); + InlinedHashSet blob_keys{blobs_for_weight->begin(), blobs_for_weight->end()}; + ASSERT_TRUE(ExternalDataInfo::WritePrepackedToFileAndAddToProto(prepacked_for_graph, + blob_keys, + true, 1024 * 1024, 0, + ss, external_offset, + tensor_proto)); + + auto external_data_info = std::make_unique(); + ASSERT_STATUS_OK(ExternalDataInfo::Create(tensor_proto.external_data(), external_data_info)); + + // This should have prepacked_data entry with two blobs for a single key. 
+ ASSERT_TRUE(external_data_info->HasPrepackedInfo()); + auto prepacked_infos = external_data_info->TakePrepackedInfos(); + ASSERT_EQ(prepacked_infos.size(), 1U); + ASSERT_TRUE(prepacked_infos.count(blob_key) > 0); + + int64_t final_offset = starting_offset; + for (const auto& blob_info : prepacked_infos[blob_key]) { + int64_t offset = std::get<0>(blob_info); + ASSERT_EQ(offset, final_offset); + size_t length = std::get<1>(blob_info); + std::string checksum = std::get<2>(blob_info); // currently "0" + final_offset = offset + length; + ASSERT_EQ(length, buffer_size); + ASSERT_EQ(checksum, "0"); + } + ASSERT_EQ(final_offset, external_offset); +} + // T must be float for double, and it must match with the 'type' argument template void TestUnpackFloatTensor(TensorProto_DataType type, const std::filesystem::path& model_path) { diff --git a/orttraining/orttraining/core/session/training_session.cc b/orttraining/orttraining/core/session/training_session.cc index f1545e96481fa..b03f1b1eadb3b 100644 --- a/orttraining/orttraining/core/session/training_session.cc +++ b/orttraining/orttraining/core/session/training_session.cc @@ -5,6 +5,7 @@ #include "core/framework/data_transfer_utils.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/session/IOBinding.h" #include "core/optimizer/rule_based_graph_transformer.h" #include "core/providers/cpu/controlflow/utils.h" @@ -1003,7 +1004,8 @@ Status TrainingSession::SaveWithExternalInitializers(const PathString& model_uri std::remove(ToUTF8String(model_uri).c_str()); std::remove(external_file_name.c_str()); - return Model::SaveWithExternalInitializers(*model_, model_uri, external_file_name, initializer_size_threshold); + ModelSavingOptions model_saving_options{initializer_size_threshold}; + return Model::SaveWithExternalInitializers(*model_, model_uri, external_file_name, model_saving_options); } Status TrainingSession::Save(const PathString& model_uri, TrainingSession::SaveOption opt) { diff --git a/orttraining/orttraining/training_api/module.cc b/orttraining/orttraining/training_api/module.cc index 939e1de334e52..60708b05626c5 100644 --- a/orttraining/orttraining/training_api/module.cc +++ b/orttraining/orttraining/training_api/module.cc @@ -11,6 +11,7 @@ #include "core/session/inference_session.h" #include "core/session/environment.h" #include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/graph/model_saving_options.h" #include "core/graph/graph_utils.h" #include "orttraining/training_api/checkpoint.h" @@ -689,8 +690,10 @@ Status Module::ExportModelForInferencing(const std::string& inference_model_path std::string external_data_name = ORT_TSTR_CONVERT_TO_PRINTABLE_STRING(ExternalCheckpointDataPath(ToPathString(inference_model_path))); PathString inference_model_pathstring = ToPathString(inference_model_path); + ModelSavingOptions model_saving_options{64}; ORT_THROW_IF_ERROR( - Model::SaveWithExternalInitializers(*inference_model, inference_model_pathstring, external_data_name, 64)); + Model::SaveWithExternalInitializers(*inference_model, inference_model_pathstring, external_data_name, + model_saving_options)); } else { ORT_THROW_IF_ERROR(Model::Save(*inference_model, ToPathString(inference_model_path))); } From fcc34da5e9ad804c48b3beedb99c5e129df10334 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Fri, 20 Dec 2024 11:48:43 -0800 Subject: [PATCH 22/25] Fix a tiny problem in winml.cmake (#23173) ### Description CMake's 
[target_link_libraries](https://cmake.org/cmake/help/latest/command/target_link_libraries.html#id2) function accepts plain library names (like `re2`), target names (like `re2::re2`), and some other kinds of names. "Plain library names" are old-fashioned and supported for compatibility only; we should use target names. ### Motivation and Context To make vcpkg work with the WinML build. See #23158 --- cmake/winml.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/winml.cmake b/cmake/winml.cmake index ff6b71217ad87..63f356fcf831d 100644 --- a/cmake/winml.cmake +++ b/cmake/winml.cmake @@ -782,7 +782,7 @@ add_dependencies(winml_dll winml_api_native) add_dependencies(winml_dll winml_api_native_internal) # Link libraries -target_link_libraries(winml_dll PRIVATE re2) +target_link_libraries(winml_dll PRIVATE re2::re2) target_link_libraries(winml_dll PRIVATE ${WIL_TARGET}) target_link_libraries(winml_dll PRIVATE winml_lib_api) if (NOT winml_is_inbox) From 6806174096a24691c0838dff2e25e064eb0d6d2d Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 20 Dec 2024 13:37:12 -0800 Subject: [PATCH 23/25] fix webgpu delay load test (#23157) ### Description This change fixes the WebGPU delay load test.
Fix UB in macro: the following C++ code outputs `2, 1` in MSVC, while it outputs `1, 1` in GCC (see the sketch after this description for the corrected pattern):

```c++
#include <iostream>
#define A 1
#define B 1
#define ENABLE defined(A) && defined(B)
#if ENABLE
int x = 1;
#else
int x = 2;
#endif
#if defined(A) && defined(B)
int y = 1;
#else
int y = 2;
#endif
int main() { std::cout << x << ", " << y << "\n"; }
```

Clang reports `macro expansion producing 'defined' has undefined behavior [-Wexpansion-to-defined]`.
Fix condition of build option onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS: delay loading is currently disabled explicitly when the Python binding is being built. This change modifies the condition so the option no longer depends on onnxruntime_ENABLE_PYTHON.
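For reference, this is a minimal sketch of the portable preprocessor pattern the patch switches to for `ORT_DELAY_LOAD_WEBGPU_DAWN_DLL` and `ORT_DELAY_LOAD_DIRECTML_DLL` in delay_load_hook.cc; the names `A`, `B`, and `ENABLE` are illustrative only, carried over from the snippet above. The `defined()` checks are evaluated directly inside an `#if`, and the helper macro expands to a plain `0`/`1` constant, so MSVC, GCC, and Clang all agree on the result:

```c++
// Test defined() directly in an #if and define the helper macro as a plain
// 0/1 value, instead of hiding defined() inside a macro that is expanded
// later in an #if (which is undefined behavior).
#include <iostream>

#define A 1
#define B 1

#if defined(A) && defined(B)
#define ENABLE 1
#else
#define ENABLE 0
#endif

#if ENABLE
int x = 1;  // taken on every compiler, since ENABLE is now a plain constant
#else
int x = 2;
#endif

int main() { std::cout << x << "\n"; }  // prints 1 everywhere
```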
--- cmake/CMakeLists.txt | 3 +-- onnxruntime/core/dll/delay_load_hook.cc | 14 +++++++++++--- onnxruntime/test/webgpu/delay_load/main.cc | 1 + 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index d2fe7e7457983..febefff6756e7 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -130,8 +130,7 @@ option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node cmake_dependent_option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB "Build dump debug information about node inputs and outputs with support for sql database." OFF "onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS" OFF) # When loading a delay loaded DLL, Windows searches the main EXE's folder first. -# In a Python process, it searches where python.exe lives, but it doesn't search the python package's installation folder. Therefore we cannot enable this flag when Python is enabled. -cmake_dependent_option(onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS "Delay load some of the dependent DLls that are part of the OS" ON "WIN32;NOT GDK_PLATFORM;NOT onnxruntime_ENABLE_PYTHON" OFF) +cmake_dependent_option(onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS "Delay load some of the dependent DLls that are part of the OS" ON "WIN32;NOT GDK_PLATFORM" OFF) option(onnxruntime_USE_DML "Build with DirectML support" OFF) option(onnxruntime_USE_MIGRAPHX "Build with AMDMIGraphX support" OFF) option(onnxruntime_USE_WINML "Build with WinML support" OFF) diff --git a/onnxruntime/core/dll/delay_load_hook.cc b/onnxruntime/core/dll/delay_load_hook.cc index 23fc8bca7368e..bc5e1aa662721 100644 --- a/onnxruntime/core/dll/delay_load_hook.cc +++ b/onnxruntime/core/dll/delay_load_hook.cc @@ -24,8 +24,16 @@ // - both USE_WEBGPU and BUILD_DAWN_MONOLITHIC_LIBRARY are defined // - USE_DML is defined // -#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL (defined(USE_WEBGPU) && defined(BUILD_DAWN_MONOLITHIC_LIBRARY)) -#define ORT_DELAY_LOAD_DIRECTML_DLL defined(USE_DML) +#if defined(USE_WEBGPU) && defined(BUILD_DAWN_MONOLITHIC_LIBRARY) +#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL 1 +#else +#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL 0 +#endif +#if defined(USE_DML) +#define ORT_DELAY_LOAD_DIRECTML_DLL 1 +#else +#define ORT_DELAY_LOAD_DIRECTML_DLL 0 +#endif #if defined(_MSC_VER) && (ORT_DELAY_LOAD_WEBGPU_DAWN_DLL || ORT_DELAY_LOAD_DIRECTML_DLL) #include @@ -59,7 +67,7 @@ FARPROC WINAPI delay_load_hook(unsigned dliNotify, PDelayLoadInfo pdli) { // Try to load the DLL from the same directory as onnxruntime.dll // First, get the path to onnxruntime.dll - auto path = Env::Default().GetRuntimePath(); + auto path = onnxruntime::Env::Default().GetRuntimePath(); if (path.empty()) { // Failed to get the path to onnxruntime.dll. In this case, we will just return NULL and let the system // search for the DLL in the default search order. diff --git a/onnxruntime/test/webgpu/delay_load/main.cc b/onnxruntime/test/webgpu/delay_load/main.cc index f909b4a6916b4..14300f3b3751b 100644 --- a/onnxruntime/test/webgpu/delay_load/main.cc +++ b/onnxruntime/test/webgpu/delay_load/main.cc @@ -118,6 +118,7 @@ int test_main() { HMODULE hModule = LoadLibraryA("dlls\\onnxruntime.dll"); if (hModule == NULL) { std::cout << "Failed to load dlls\\onnxruntime.dll" << std::endl; + std::cout << "Error code: " << GetLastError() << std::endl; return 1; } From ebdbbb7531f6be3a4df7901ff5482a8174b51bd7 Mon Sep 17 00:00:00 2001 From: Yueqing Zhang Date: Fri, 20 Dec 2024 22:03:27 -0800 Subject: [PATCH 24/25] [VitisAI] Int4 support (#22850) ### Description 1. 
Add support for throwing error when hardware is not supported for VitisAI. 2. Add support for unloading VitisAI EP. 3. Add API for Win25. ### Motivation and Context This is requirement for Win25 --- .../shared_library/provider_interfaces.h | 1 + .../shared_library/provider_wrappedtypes.h | 3 ++ .../core/providers/vitisai/imp/global_api.cc | 52 ++++++++++++++++--- .../providers/vitisai/imp/tensor_proto.cc | 13 +++++ .../core/providers/vitisai/imp/tensor_proto.h | 4 ++ .../vitisai/include/vaip/global_api.h | 1 + .../providers/vitisai/include/vaip/my_ort.h | 1 + .../vitisai/include/vaip/vaip_ort_api.h | 10 +++- .../vitisai/vitisai_provider_factory.cc | 2 +- .../core/session/provider_bridge_ort.cc | 1 + 10 files changed, 80 insertions(+), 8 deletions(-) diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 8bd4067e59492..5a179ec622f8c 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -589,6 +589,7 @@ struct ProviderHost { virtual const ConfigOptions& RunOptions__GetConfigOptions(const RunOptions* p) = 0; // OrtSessionOptions virtual const std::unordered_map& SessionOptions__GetConfigOptionsMap(const OrtSessionOptions* p) = 0; + virtual bool SessionOptions__GetEnableProfiling(const OrtSessionOptions* p) = 0; // ComputeCapability virtual std::unique_ptr ComputeCapability__construct(std::unique_ptr t_sub_graph) = 0; virtual void ComputeCapability__operator_delete(ComputeCapability* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index d8516d5858a2f..76b6d8063fd66 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -1476,5 +1476,8 @@ struct OrtSessionOptions final { const std::unordered_map& GetConfigOptions() const { return onnxruntime::g_host->SessionOptions__GetConfigOptionsMap(this); } + bool GetEnableProfiling() const { + return onnxruntime::g_host->SessionOptions__GetEnableProfiling(this); + } PROVIDER_DISALLOW_ALL(OrtSessionOptions) }; diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index cccaa65de45f2..8111ee3c1fe61 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -47,6 +47,8 @@ struct OrtVitisAIEpAPI { void (*initialize_onnxruntime_vitisai_ep)(vaip_core::OrtApiForVaip* api, std::vector& ret_domain); std::vector>* (*compile_onnx_model_with_options)( const std::string& model_path, const onnxruntime::Graph& graph, const onnxruntime::ProviderOptions& options); + std::vector>* (*compile_onnx_model_vitisai_ep_with_error_handling)( + const std::string& model_path, const onnxruntime::Graph& graph, const onnxruntime::ProviderOptions& options, void* status, vaip_core::error_report_func func); uint32_t (*vaip_get_version)(); void (*create_ep_context_nodes)( const std::vector>& eps, @@ -77,10 +79,11 @@ struct OrtVitisAIEpAPI { ORT_THROW_IF_ERROR(env.LoadDynamicLibrary(full_path, true, &handle_)); #endif ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "initialize_onnxruntime_vitisai_ep", (void**)&initialize_onnxruntime_vitisai_ep)); - auto status = env.GetSymbolFromLibrary(handle_, "compile_onnx_model_vitisai_ep_with_options", 
(void**)&compile_onnx_model_with_options); - if (!status.IsOK()) { - ::onnxruntime::LogRuntimeError(0, status, __FILE__, static_cast(__FUNCTION__), __LINE__); - ORT_THROW(status); + auto status1 = env.GetSymbolFromLibrary(handle_, "compile_onnx_model_vitisai_ep_with_error_handling", (void**)&compile_onnx_model_vitisai_ep_with_error_handling); + auto status2 = env.GetSymbolFromLibrary(handle_, "compile_onnx_model_vitisai_ep_with_options", (void**)&compile_onnx_model_with_options); + if ((!status1.IsOK()) && (!status2.IsOK())) { + ::onnxruntime::LogRuntimeError(0, status2, __FILE__, static_cast(__FUNCTION__), __LINE__); + ORT_THROW(status2); } std::ignore = env.GetSymbolFromLibrary(handle_, "vaip_get_version", (void**)&vaip_get_version); @@ -89,6 +92,14 @@ struct OrtVitisAIEpAPI { ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_on_run_start", (void**)&vitisai_ep_on_run_start)); ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_set_ep_dynamic_options", (void**)&vitisai_ep_set_ep_dynamic_options)); } + void Clear() { + if (handle_) { + auto& env = Provider_GetHost()->Env__Default(); + auto status = env.UnloadDynamicLibrary(handle_); + vai_assert(status.IsOK(), status.ErrorMessage()); + handle_ = nullptr; + } + } private: void* handle_{}; @@ -109,10 +120,25 @@ void profiler_collect( } } +void change_status_with_error(void* status_ptr, int error_code, const char* error_msg) { + auto status = reinterpret_cast(status_ptr); + *status = Status(onnxruntime::common::ONNXRUNTIME, error_code, error_msg); +} + vaip_core::DllSafe>> compile_onnx_model( - const onnxruntime::GraphViewer& graph_viewer, const logging::Logger& logger, const ProviderOptions& options) { + const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::logging::Logger& logger, const onnxruntime::ProviderOptions& options) { auto model_path = graph_viewer.ModelPath().string(); - return vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_with_options(model_path, graph_viewer.GetGraph(), options)); + if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_with_error_handling) { + Status status = Status::OK(); + auto status_ptr = reinterpret_cast(&status); + auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_with_error_handling(model_path, graph_viewer.GetGraph(), options, status_ptr, change_status_with_error)); + if (!status.IsOK()) { + ORT_THROW(status); + } + return ret; + } else { + return vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_with_options(model_path, graph_viewer.GetGraph(), options)); + } } std::optional> create_ep_context_nodes( @@ -396,10 +422,12 @@ vaip_core::OrtApiForVaip* create_org_api_hook() { the_global_api.tensor_proto_get_shape_unsafe = vaip::tensor_proto_get_shape; the_global_api.tensor_proto_data_type = [](const ONNX_NAMESPACE::TensorProto& t) -> int { return t.data_type(); }; the_global_api.tensor_proto_delete = [](ONNX_NAMESPACE::TensorProto* tp) { delete tp; }; + the_global_api.tensor_proto_new_i4 = vaip::tensor_proto_new_i4; the_global_api.tensor_proto_new_i8 = vaip::tensor_proto_new_i8; the_global_api.tensor_proto_new_i16 = vaip::tensor_proto_new_i16; the_global_api.tensor_proto_new_i32 = vaip::tensor_proto_new_i32; the_global_api.tensor_proto_new_i64 = vaip::tensor_proto_new_i64; + the_global_api.tensor_proto_new_u4 = vaip::tensor_proto_new_u4; the_global_api.tensor_proto_new_u8 = vaip::tensor_proto_new_u8; the_global_api.tensor_proto_new_u16 = vaip::tensor_proto_new_u16; the_global_api.tensor_proto_new_u32 = 
vaip::tensor_proto_new_u32; @@ -468,9 +496,21 @@ vaip_core::OrtApiForVaip* create_org_api_hook() { return vaip_core::DllSafe(std::move(local_str)); }; + the_global_api.is_profiling_enabled = [](void* session_options) { + auto options = reinterpret_cast(session_options); + return options->GetEnableProfiling(); + }; + the_global_api.graph_remove_initialized_tensor = [](Graph& graph, const std::string& tensor_name) { + graph.RemoveInitializedTensor(tensor_name); + }; if (!s_library_vitisaiep.vaip_get_version) { return reinterpret_cast(&(the_global_api.host_)); } else { return &the_global_api; } } + +void deinitialize_vitisai_ep() { + s_library_vitisaiep.Clear(); + s_kernel_registry_vitisaiep.reset(); +} diff --git a/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc b/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc index 872d022e85264..bb942c69003a1 100644 --- a/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc +++ b/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc @@ -87,6 +87,12 @@ static ONNX_NAMESPACE::TensorProto* tensor_proto_new(const std::string& name, co return tensor_proto.release(); } +ONNX_NAMESPACE::TensorProto* tensor_proto_new_i4(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT4, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} + ONNX_NAMESPACE::TensorProto* tensor_proto_new_i8(const std::string& name, const std::vector& shape, const std::vector& data) { return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT8, @@ -108,6 +114,13 @@ ONNX_NAMESPACE::TensorProto* tensor_proto_new_i64(const std::string& name, const return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT64, reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); } + +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u4(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_UINT4, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} + ONNX_NAMESPACE::TensorProto* tensor_proto_new_u8(const std::string& name, const std::vector& shape, const std::vector& data) { return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_UINT8, diff --git a/onnxruntime/core/providers/vitisai/imp/tensor_proto.h b/onnxruntime/core/providers/vitisai/imp/tensor_proto.h index 618d9c4728e2f..73015d3411a54 100644 --- a/onnxruntime/core/providers/vitisai/imp/tensor_proto.h +++ b/onnxruntime/core/providers/vitisai/imp/tensor_proto.h @@ -9,6 +9,10 @@ namespace vaip { gsl::span tensor_proto_as_raw(const onnxruntime::Graph& graph, const ONNX_NAMESPACE::TensorProto& tensor); vaip_core::DllSafe> tensor_proto_get_shape(const ONNX_NAMESPACE::TensorProto& tensor); const std::string& tensor_proto_get_name(const ONNX_NAMESPACE::TensorProto& tensor); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_i4(const std::string& name, const std::vector& shape, + const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u4(const std::string& name, const std::vector& shape, + const std::vector& data); ONNX_NAMESPACE::TensorProto* tensor_proto_new_i8(const std::string& name, const std::vector& shape, const std::vector& data); ONNX_NAMESPACE::TensorProto* tensor_proto_new_u8(const std::string& name, const std::vector& shape, diff --git a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h 
b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h index 704b156dff57f..7791ea430054a 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h @@ -11,6 +11,7 @@ #include "vaip/custom_op.h" #include void initialize_vitisai_ep(); +void deinitialize_vitisai_ep(); vaip_core::DllSafe>> compile_onnx_model(const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::logging::Logger& logger, const onnxruntime::ProviderOptions& options); std::shared_ptr get_kernel_registry_vitisaiep(); const std::vector& get_domains_vitisaiep(); diff --git a/onnxruntime/core/providers/vitisai/include/vaip/my_ort.h b/onnxruntime/core/providers/vitisai/include/vaip/my_ort.h index 7628e45d2b933..85a1262d8489b 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/my_ort.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/my_ort.h @@ -122,4 +122,5 @@ using InitializedTensorSet = std::unordered_map; using ModelMetaData = std::unordered_map; +using error_report_func = void (*)(void*, int, const char*); } // namespace vaip_core diff --git a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h index 9425c08dceebc..6a51ef862280b 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h @@ -13,7 +13,7 @@ struct OrtApi; namespace vaip_core { -#define VAIP_ORT_API_MAJOR (12u) +#define VAIP_ORT_API_MAJOR (13u) #define VAIP_ORT_API_MINOR (0u) #define VAIP_ORT_API_PATCH (0u) struct OrtApiForVaip { @@ -235,6 +235,14 @@ struct OrtApiForVaip { DllSafe (*model_proto_serialize_as_string)(ModelProto& model_proto); // [96] void (*model_proto_delete)(ModelProto* p); // [97] DllSafe (*attr_proto_release_string)(AttributeProto* attr); // [98] + bool (*is_profiling_enabled)(void* session_options); // [99] // [98] + TensorProto* (*tensor_proto_new_i4)(const std::string& name, + const std::vector& shape, + const std::vector& data); // [100] + TensorProto* (*tensor_proto_new_u4)(const std::string& name, + const std::vector& shape, + const std::vector& data); // [101] + void (*graph_remove_initialized_tensor)(Graph& graph, const std::string& tensor_name); // [102] }; #ifndef USE_VITISAI diff --git a/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc b/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc index 453db30e1320f..99d9845302d9a 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc @@ -50,7 +50,7 @@ struct VitisAI_Provider : Provider { // Called right after loading the shared library, if this throws any errors Shutdown() will be called and the library unloaded void Initialize() override { initialize_vitisai_ep(); } // Called right before unloading the shared library - void Shutdown() override {} + void Shutdown() override { deinitialize_vitisai_ep(); } } g_provider; } // namespace onnxruntime diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index a40fabd6a607c..af39edae2074d 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -720,6 +720,7 @@ struct ProviderHostImpl : ProviderHost { // OrtSessionOptions (wrapped) const std::unordered_map& SessionOptions__GetConfigOptionsMap(const OrtSessionOptions* p) override { return 
p->value.config_options.configurations; } + bool SessionOptions__GetEnableProfiling(const OrtSessionOptions* p) override { return p->value.enable_profiling; }; // ComputeCapability (wrapped) std::unique_ptr ComputeCapability__construct(std::unique_ptr t_sub_graph) override { return std::make_unique(std::move(t_sub_graph)); } void ComputeCapability__operator_delete(ComputeCapability* p) override { delete p; } From c6ba7edd830087bc52311a3b10b1f0692ef64b3b Mon Sep 17 00:00:00 2001 From: amancini-N <63410090+amancini-N@users.noreply.github.com> Date: Mon, 23 Dec 2024 06:30:49 +0100 Subject: [PATCH 25/25] Enable pointer-generator T5 models in BeamSearch (#23134) ### Description Introduces a new optional input (encoder_input_ids) in the decoder graph of the T5 implementation for BeamSearch. This allows usage of pointer-generator networks in the decoder graph. ### Motivation and Context - Fixes #23123 --- .../cpu/transformers/subgraph_t5_decoder.cc | 65 ++- .../cpu/transformers/subgraph_t5_decoder.h | 10 +- .../test/contrib_ops/beam_search_test.cc | 22 + .../test/testdata/dummy_t5_model_generator.py | 377 ++++++++++++++++++ .../testdata/dummy_t5_pointer_generator.onnx | Bin 0 -> 7100 bytes 5 files changed, 448 insertions(+), 26 deletions(-) create mode 100644 onnxruntime/test/testdata/dummy_t5_model_generator.py create mode 100644 onnxruntime/test/testdata/dummy_t5_pointer_generator.onnx diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc index f4e7173c917c1..997beb198f450 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.cc @@ -20,8 +20,9 @@ namespace transformers { Inputs: input_ids: int32 (B, 1) + encoder_input_ids: int32 (B, encode_sequence_length) (optional) encoder_attention_mask: int32 (B, encode_sequence_length) - encoder_hidden_states: (B, encode_sequence_length, encoder_hidden_size) + encoder_hidden_states: (B, encode_sequence_length, encoder_hidden_size) (optional) past_key_self_0: (B, num_heads, past_decode_sequence_length, head_size) past_value_self_0: (B, num_heads, past_decode_sequence_length, head_size) @@ -49,11 +50,9 @@ namespace transformers { Status T5DecoderSubgraph::Validate(const std::vector& subgraph_inputs, const std::vector& subgraph_outputs) { - bool has_hidden_state = subgraph_inputs[2]->Name() == "encoder_hidden_states" ?
true : false; - SetPastInputIndex(has_hidden_state); - - ORT_RETURN_IF(first_past_input_index_ != 2 && first_past_input_index_ != 3, - "kFirstPastInputIndex currently only supports 2 or 3"); + bool has_encoder_input_ids = subgraph_inputs[1]->Name() == "encoder_input_ids"; + bool has_hidden_state = subgraph_inputs[2 + has_encoder_input_ids]->Name() == "encoder_hidden_states"; + SetPastInputIndex(has_hidden_state, has_encoder_input_ids); if (!past_present_share_buffer_) { ORT_RETURN_IF(has_decoder_masked_attention_, "decoder_masked_attention shall use with past_present_share_buffer"); @@ -75,13 +74,17 @@ Status T5DecoderSubgraph::Validate(const std::vector& subgraph_i ORT_RETURN_IF(subgraph_inputs[0]->Name() != "input_ids", "decoder subgraph input 0 shall be named as input_ids, got: ", subgraph_inputs[0]->Name()); - ORT_RETURN_IF(subgraph_inputs[1]->Name() != "encoder_attention_mask", - "decoder subgraph input 1 shall be named as encoder_attention_mask, got: ", - subgraph_inputs[1]->Name()); - if (first_past_input_index_ == 3) { - ORT_RETURN_IF(subgraph_inputs[2]->Name() != "encoder_hidden_states", - "decoder subgraph input 2 shall be named as encoder_hidden_states, got: ", - subgraph_inputs[2]->Name()); + const int enc_attn_mask_index = 1 + has_encoder_input_ids_; + const int enc_hidden_state_index = enc_attn_mask_index + 1; + ORT_RETURN_IF(subgraph_inputs[enc_attn_mask_index]->Name() != "encoder_attention_mask", + "decoder subgraph input ", std::to_string(enc_attn_mask_index), + " shall be named as encoder_attention_mask, got: ", + subgraph_inputs[enc_attn_mask_index]->Name()); + if (has_hidden_state_) { + ORT_RETURN_IF(subgraph_inputs[enc_hidden_state_index]->Name() != "encoder_hidden_states", + "decoder subgraph input ", std::to_string(enc_hidden_state_index), + " shall be named as encoder_hidden_states, got: ", + subgraph_inputs[enc_hidden_state_index]->Name()); } // check subgraph outputs @@ -108,12 +111,19 @@ Status T5DecoderSubgraph::Validate(const std::vector& subgraph_i ORT_RETURN_IF(subgraph_inputs[0]->TypeAsProto()->tensor_type().elem_type() != int32_type, "decoder subgraph input 0 (input_ids) shall have int32 type"); - ORT_RETURN_IF(subgraph_inputs[1]->TypeAsProto()->tensor_type().elem_type() != int32_type, - "decoder subgraph input 1 (encoder_attention_mask) shall have int32 type"); - - auto float_type = subgraph_inputs[2]->TypeAsProto()->tensor_type().elem_type(); - ORT_RETURN_IF(float_type != float32_type && float_type != float16_type, - "decoder subgraph input 2 (encoder_hidden_states) shall have float or float16 type"); + if (has_encoder_input_ids_) { + ORT_RETURN_IF(subgraph_inputs[1]->TypeAsProto()->tensor_type().elem_type() != int32_type, + "decoder subgraph input 1 (encoder_input_ids) shall have int32 type"); + } + ORT_RETURN_IF(subgraph_inputs[enc_attn_mask_index]->TypeAsProto()->tensor_type().elem_type() != int32_type, + "decoder subgraph input ", std::to_string(enc_attn_mask_index), + " (encoder_attention_mask) shall have int32 type"); + + auto float_type = subgraph_inputs[enc_hidden_state_index]->TypeAsProto()->tensor_type().elem_type(); + if (has_hidden_state_) { + ORT_RETURN_IF(float_type != float32_type && float_type != float16_type, + "decoder subgraph input ", std::to_string(enc_hidden_state_index), " (encoder_hidden_states) shall have float or float16 type"); + } for (int i = first_past_input_index_; i < first_past_input_index_ + 4 * num_layers; i++) { ORT_RETURN_IF(subgraph_inputs[i]->TypeAsProto()->tensor_type().elem_type() != float_type, @@ -219,6 +229,19 
@@ Status T5DecoderSubgraph::CreateInitialFeeds( decoder_feeds.reserve(static_cast(num_subgraph_inputs) + static_cast(num_implicit_inputs)); decoder_feeds.push_back(input_ids); + if (has_encoder_input_ids_) { + // The encoder_input_ids is copied from the first input of encoder. + OrtValue expanded_encoder_input_ids; + ORT_RETURN_IF_ERROR(expand_buffer_int32_func(stream, + encoder_feeds[0], + num_beam, + allocator, + expanded_encoder_input_ids, + false, + 0 /*max_sequence_length*/)); + decoder_feeds.push_back(expanded_encoder_input_ids); + } + // The encoder_attention_mask is copied from the second input of encoder. OrtValue expanded_decoder_attention_masks; ORT_RETURN_IF_ERROR(expand_buffer_int32_func(stream, @@ -238,7 +261,9 @@ Status T5DecoderSubgraph::CreateInitialFeeds( // When first_past_input_index_ == 3, the encoder_hidden_states and past states are copied from the second output // of encoder. // When first_past_input_index_ == 2, the past states are copied from the second output of encoder. - for (size_t j = static_cast(4) - first_past_input_index_; j < encoder_fetches.size(); j++) { + // TODO - probably more robust to introduce a encoder_out/decoder_in mapping instead of relying on positions. + // What happens if encoder_hidden_states is present in the encoder_fetches but not in the decoder_feeds? + for (size_t j = static_cast(2) - has_hidden_state_; j < encoder_fetches.size(); j++) { if (j == 1) { ORT_RETURN_IF(has_hidden_state_ == false, "Invalid hidden_states expension: has_hidden_state_ == false"); OrtValue expanded_hidden_states; diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.h b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.h index a72ce37a93aba..b5d727b67924c 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.h +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_t5_decoder.h @@ -54,13 +54,10 @@ class T5DecoderSubgraph : public Subgraph { Status Validate(const std::vector& subgraph_inputs, const std::vector& subgraph_outputs) override; - void SetPastInputIndex(bool has_hidden_state) { + void SetPastInputIndex(bool has_hidden_state, bool has_encoder_input_ids) { has_hidden_state_ = has_hidden_state; - if (!has_hidden_state_) { - first_past_input_index_ = 2; - } else { - first_past_input_index_ = 3; - } + has_encoder_input_ids_ = has_encoder_input_ids; + first_past_input_index_ = 2 + has_hidden_state_ + has_encoder_input_ids_; } int GetFirstPastInputIndex() const { @@ -79,6 +76,7 @@ class T5DecoderSubgraph : public Subgraph { int first_past_input_index_; int first_present_output_index_; bool has_hidden_state_; + bool has_encoder_input_ids_; bool use_sequence_as_input_ids_; }; diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc b/onnxruntime/test/contrib_ops/beam_search_test.cc index 9f4ee071925b4..1ae15afdf7482 100644 --- a/onnxruntime/test/contrib_ops/beam_search_test.cc +++ b/onnxruntime/test/contrib_ops/beam_search_test.cc @@ -394,6 +394,8 @@ TEST(BeamSearchTest, DummyT5) { #if defined(USE_CUDA) && defined(USE_DML) SKIP_CUDA_TEST_WITH_DML; #endif + // dummy_t5.onnx model generated using following command: + // python onnxruntime/test/testdata/dummy_t5_generator.py --output-path dummy_t5.onnx ModelTester tester(CurrentTestName(), ORT_TSTR("testdata/dummy_t5.onnx")); tester.ConfigEp(DefaultCpuExecutionProvider()); tester.AddInput("encoder_input_ids", {1, 5}, {14, 6, 13, 9, 7}); @@ -408,6 +410,8 @@ TEST(BeamSearchTest, DummyT5WithOuterScopeInitializers) { #if defined(USE_CUDA) && defined(USE_DML) 
SKIP_CUDA_TEST_WITH_DML; #endif + // dummy_t5_with_outer_scope_initializers.onnx model generated using following command: + // python onnxruntime/test/testdata/dummy_t5_generator.py --output-path dummy_t5_with_outer_scope_initializers.onnx --move-initializers ModelTester tester(CurrentTestName(), ORT_TSTR("testdata/dummy_t5_with_outer_scope_initializers.onnx")); tester.ConfigEp(DefaultCpuExecutionProvider()); tester.AddInput("encoder_input_ids", {1, 5}, {14, 6, 13, 9, 7}); @@ -422,6 +426,8 @@ TEST(BeamSearchTest, DummyT5WithSequenceInputIds) { #if defined(USE_CUDA) && defined(USE_DML) SKIP_CUDA_TEST_WITH_DML; #endif + // dummy_t5_with_sequence_input_ids.onnx model generated using following command: + // python onnxruntime/test/testdata/dummy_t5_generator.py --output-path dummy_t5_with_sequence_input_ids.onnx --sequence-as-input ModelTester tester(CurrentTestName(), ORT_TSTR("testdata/dummy_t5_with_sequence_input_ids.onnx")); tester.ConfigEp(DefaultCpuExecutionProvider()); tester.AddInput("encoder_input_ids", {1, 5}, {16, 17, 1, 0, 8}); @@ -432,5 +438,21 @@ TEST(BeamSearchTest, DummyT5WithSequenceInputIds) { tester.RunWithConfig(); } +TEST(BeamSearchTest, DummyT5PointerGenerator) { +#if defined(USE_CUDA) && defined(USE_DML) + SKIP_CUDA_TEST_WITH_DML; +#endif + // dummy_t5_pointer_generator.onnx model generated using following command: + // python onnxruntime/test/testdata/dummy_t5_generator.py --output-path dummy_t5_pointer_generator.onnx --decoder-needs-input-ids + ModelTester tester(CurrentTestName(), ORT_TSTR("testdata/dummy_t5_pointer_generator.onnx")); + tester.ConfigEp(DefaultCpuExecutionProvider()); + tester.AddInput("encoder_input_ids", {1, 5}, {14, 6, 13, 9, 7}); + tester.AddOutput("sequences", {1, 3, 10}, {2, 3, 6, 7, 3, 6, 7, 18, 3, 6, 2, 3, 6, 7, 18, 3, 6, 7, 18, 3, 2, 3, 6, 7, 3, 6, 7, 3, 6, 7}); +#ifdef USE_CUDA + tester.ConfigEp(DefaultCudaExecutionProvider()); +#endif + tester.RunWithConfig(); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/testdata/dummy_t5_model_generator.py b/onnxruntime/test/testdata/dummy_t5_model_generator.py new file mode 100644 index 0000000000000..1ecd8b9ee9c92 --- /dev/null +++ b/onnxruntime/test/testdata/dummy_t5_model_generator.py @@ -0,0 +1,377 @@ +""" Script to generate a dummy ONNX model emulating T5 model with BeamSearch op. 
""" + +import argparse + +import numpy as np +import onnx + +import onnxruntime as ort +from onnxruntime.transformers.convert_generation import move_initializers + + +def create_model( + vocab_size: int, + embed_dim: int, + num_heads: int, + head_size: int, + beam_size: int, + min_length: int, + max_length: int, + length_penalty: float, + sequence_as_input: bool, + decoder_needs_input_ids: bool, +) -> onnx.ModelProto: + encoder_graph = create_encoder(vocab_size, embed_dim, num_heads, head_size) + decoder_graph = create_decoder( + vocab_size, embed_dim, num_heads, head_size, sequence_as_input, decoder_needs_input_ids + ) + + # Inputs: encoder_input_ids + encoder_input_ids = onnx.helper.make_tensor_value_info( + "encoder_input_ids", onnx.TensorProto.INT32, ["batch_size", "encode_sequence_length"] + ) + + # Outputs: sequences, scores + sequences = onnx.helper.make_tensor_value_info( + "sequences", onnx.TensorProto.INT32, ["batch_size", beam_size, "decode_sequence_length"] + ) + scores = onnx.helper.make_tensor_value_info("scores", onnx.TensorProto.FLOAT, ["batch_size", beam_size]) + + # Tensors + max_length_t = onnx.numpy_helper.from_array(np.array(max_length, dtype=np.int32), name="max_length") + min_length_t = onnx.numpy_helper.from_array(np.array(min_length, dtype=np.int32), name="min_length") + num_beams_t = onnx.numpy_helper.from_array(np.array(beam_size, dtype=np.int32), name="num_beams") + length_penalty_t = onnx.numpy_helper.from_array( + np.array(length_penalty, dtype=np.float32), name="length_penalty_as_tensor" + ) + + # Nodes + beam_search = onnx.helper.make_node( + "BeamSearch", + ["encoder_input_ids", "max_length", "min_length", "num_beams", "num_beams", "length_penalty_as_tensor"], + ["sequences", "scores"], + decoder_start_token_id=2, + eos_token_id=2, + early_stopping=0, + model_type=1, + pad_token_id=1, + decoder=decoder_graph, + encoder=encoder_graph, + domain="com.microsoft", + ) + + # Graph + graph = onnx.helper.make_graph( + [beam_search], + "model", + [encoder_input_ids], + [sequences, scores], + [max_length_t, min_length_t, num_beams_t, length_penalty_t], + ) + + # Model + model = onnx.helper.make_model( + graph, opset_imports=[onnx.helper.make_opsetid("", 17), onnx.helper.make_opsetid("com.microsoft", 1)] + ) + + return model + + +def create_encoder(vocab_size, embed_dim, num_heads, head_size) -> onnx.GraphProto: + # Inputs: encoder_input_ids, encoder_attention_mask, decoder_input_ids + encoder_input_ids = onnx.helper.make_tensor_value_info( + "encoder_input_ids", onnx.TensorProto.INT32, ["batch_size", "encode_sequence_length"] + ) + encoder_attention_mask = onnx.helper.make_tensor_value_info( + "encoder_attention_mask", onnx.TensorProto.INT32, ["batch_size", "encode_sequence_length"] + ) + decoder_input_ids = onnx.helper.make_tensor_value_info( + "decoder_input_ids", onnx.TensorProto.INT32, ["batch_size", 1] + ) + + # Outputs: logits, present_key_self_0, present_value_self_0, present_key_cross_0, present_value_cross_0, encoder_hidden_states + logits = onnx.helper.make_tensor_value_info( + "logits", onnx.TensorProto.FLOAT, ["batch_size", "decode_sequence_length", vocab_size] + ) + present_key_self_0 = onnx.helper.make_tensor_value_info( + "present_key_self_0", onnx.TensorProto.FLOAT, ["batch_size", num_heads, 1, head_size] + ) + present_value_self_0 = onnx.helper.make_tensor_value_info( + "present_value_self_0", onnx.TensorProto.FLOAT, ["batch_size", num_heads, 1, head_size] + ) + present_key_cross_0 = onnx.helper.make_tensor_value_info( + "present_key_cross_0", 
onnx.TensorProto.FLOAT, ["batch_size", num_heads, "encode_sequence_length", head_size] + ) + present_value_cross_0 = onnx.helper.make_tensor_value_info( + "present_value_cross_0", onnx.TensorProto.FLOAT, ["batch_size", num_heads, "encode_sequence_length", head_size] + ) + encoder_hidden_states = onnx.helper.make_tensor_value_info( + "encoder_hidden_states", onnx.TensorProto.FLOAT, ["batch_size", "encode_sequence_length", embed_dim] + ) + + # Tensors + encoder_embeddings_tensor = onnx.numpy_helper.from_array( + np.random.randn(vocab_size, embed_dim).astype(np.float32), name="encoder_embeddings" + ) + num_heads_and_size_tensor = onnx.numpy_helper.from_array( + np.array([num_heads, head_size], dtype=np.int64), name="num_heads_and_size" + ) + final_proj_tensor = onnx.numpy_helper.from_array( + np.random.randn(embed_dim, vocab_size).astype(np.float32), name="init_final_proj" + ) + self_state_before_tranpose_shape_tensor = onnx.numpy_helper.from_array( + np.array([-1, 1, num_heads, head_size], dtype=np.int64), name="self_state_before_tranpose_shape" + ) + + # Nodes + nodes = [ + onnx.helper.make_node("Gather", ["encoder_embeddings", "encoder_input_ids"], ["encoder_hidden_states"]), + onnx.helper.make_node("Shape", ["encoder_hidden_states"], ["encoder_batch_seq_len"], end=2), + onnx.helper.make_node( + "Concat", ["encoder_batch_seq_len", "num_heads_and_size"], ["encoder_final_shape"], axis=0 + ), + onnx.helper.make_node( + "Reshape", ["encoder_hidden_states", "encoder_final_shape"], ["encoder_hidden_states_reshaped"] + ), + onnx.helper.make_node( + "Transpose", ["encoder_hidden_states_reshaped"], ["present_key_cross_0"], perm=[0, 2, 1, 3] + ), + onnx.helper.make_node( + "Transpose", ["encoder_hidden_states_reshaped"], ["present_value_cross_0"], perm=[0, 2, 1, 3] + ), + onnx.helper.make_node("Gather", ["encoder_embeddings", "decoder_input_ids"], ["decoder_hidden_states"]), + onnx.helper.make_node("ReduceMean", ["encoder_hidden_states"], ["encoder_hidden_states_mean"], axes=[1]), + onnx.helper.make_node("Add", ["decoder_hidden_states", "encoder_hidden_states_mean"], ["encoder_decoder_sum"]), + onnx.helper.make_node("MatMul", ["encoder_decoder_sum", "init_final_proj"], ["logits"]), + onnx.helper.make_node( + "Reshape", ["encoder_decoder_sum", "self_state_before_tranpose_shape"], ["self_state_before_tranpose"] + ), + onnx.helper.make_node("Transpose", ["self_state_before_tranpose"], ["present_key_self_0"], perm=[0, 2, 1, 3]), + onnx.helper.make_node("Transpose", ["self_state_before_tranpose"], ["present_value_self_0"], perm=[0, 2, 1, 3]), + ] + + # Graph + graph = onnx.helper.make_graph( + nodes, + "encoder", + [encoder_input_ids, encoder_attention_mask, decoder_input_ids], + [ + logits, + encoder_hidden_states, + present_key_self_0, + present_value_self_0, + present_key_cross_0, + present_value_cross_0, + ], + [ + encoder_embeddings_tensor, + num_heads_and_size_tensor, + final_proj_tensor, + self_state_before_tranpose_shape_tensor, + ], + ) + return graph + + +def create_decoder( + vocab_size, embed_dim, num_heads, head_size, sequence_as_input, decoder_needs_input_ids +) -> onnx.GraphProto: + # Inputs: input_ids, encoder_input_ids (optional), encoder_attention_mask, past_self_key_0, past_self_value_0, past_cross_key_0, past_cross_value_0 + inputs = [] + inputs.append( + onnx.helper.make_tensor_value_info( + "input_ids", onnx.TensorProto.INT32, ["batch_size", "decode_sequence_length" if sequence_as_input else 1] + ) + ) + if decoder_needs_input_ids: + inputs.append( + 
onnx.helper.make_tensor_value_info( + "encoder_input_ids", onnx.TensorProto.INT32, ["batch_size", "encode_sequence_length"] + ) + ) + inputs.append( + onnx.helper.make_tensor_value_info( + "encoder_attention_mask", onnx.TensorProto.INT32, ["batch_size", "encode_sequence_length"] + ) + ) + inputs.append( + onnx.helper.make_tensor_value_info( + "past_self_key_0", onnx.TensorProto.FLOAT, ["batch_size", num_heads, "decode_sequence_length", head_size] + ) + ) + inputs.append( + onnx.helper.make_tensor_value_info( + "past_self_value_0", onnx.TensorProto.FLOAT, ["batch_size", num_heads, "decode_sequence_length", head_size] + ) + ) + inputs.append( + onnx.helper.make_tensor_value_info( + "past_cross_key_0", onnx.TensorProto.FLOAT, ["batch_size", num_heads, "encode_sequence_length", head_size] + ) + ) + inputs.append( + onnx.helper.make_tensor_value_info( + "past_cross_value_0", onnx.TensorProto.FLOAT, ["batch_size", num_heads, "encode_sequence_length", head_size] + ) + ) + + # Outputs: logits, present_key_self_0, present_value_self_0 + outputs = [ + onnx.helper.make_tensor_value_info("logits", onnx.TensorProto.FLOAT, ["batch_size", 1, vocab_size]), + onnx.helper.make_tensor_value_info( + "present_key_self_0", + onnx.TensorProto.FLOAT, + ["batch_size", num_heads, "present_decode_sequence_length", head_size], + ), + onnx.helper.make_tensor_value_info( + "present_value_self_0", + onnx.TensorProto.FLOAT, + ["batch_size", num_heads, "present_decode_sequence_length", head_size], + ), + ] + + # Tensors: decoder_embeddings, final_proj, self_state_before_tranpose_shape_no_batch, hidden_states_mean + initializers = [ + onnx.numpy_helper.from_array( + np.random.randn(vocab_size, embed_dim).astype(np.float32), name="decoder_embeddings" + ), + onnx.numpy_helper.from_array(np.random.randn(embed_dim, vocab_size).astype(np.float32), name="final_proj"), + onnx.numpy_helper.from_array( + np.array([-1, num_heads, head_size], dtype=np.int64), name="self_state_before_tranpose_shape_no_batch" + ), + onnx.numpy_helper.from_array(np.array([-1, 1, embed_dim], dtype=np.int64), name="hidden_states_mean_shape"), + ] + + # Nodes + nodes = [] + nodes.append(onnx.helper.make_node("Gather", ["decoder_embeddings", "input_ids"], ["decoder_hidden_states"])) + if decoder_needs_input_ids: + nodes.append( + onnx.helper.make_node("Gather", ["decoder_embeddings", "encoder_input_ids"], ["encoder_input_embeddings"]) + ) + nodes.append( + onnx.helper.make_node( + "ReduceMean", ["encoder_input_embeddings"], ["encoder_input_embeddings_mean"], axes=[1] + ) + ) + nodes.append( + onnx.helper.make_node( + "Mul", ["decoder_hidden_states", "encoder_input_embeddings_mean"], ["combined_hidden_states"] + ) + ) + else: + nodes.append(onnx.helper.make_node("Identity", ["decoder_hidden_states"], ["combined_hidden_states"])) + nodes.append(onnx.helper.make_node("ReduceMean", ["past_cross_key_0"], ["encoder_hidden_states_mean"], axes=[2])) + nodes.append( + onnx.helper.make_node( + "Reshape", + ["encoder_hidden_states_mean", "hidden_states_mean_shape"], + ["encoder_hidden_states_mean_reshaped"], + ) + ) + if sequence_as_input: + nodes.append( + onnx.helper.make_node("ReduceMean", ["combined_hidden_states"], ["decoder_hidden_states_mean"], axes=[1]) + ) + nodes.append( + onnx.helper.make_node( + "Add", ["decoder_hidden_states_mean", "encoder_hidden_states_mean_reshaped"], ["encoder_decoder_sum"] + ) + ) + else: + nodes.append( + onnx.helper.make_node( + "Add", ["combined_hidden_states", "encoder_hidden_states_mean_reshaped"], ["encoder_decoder_sum"] + ) + 
+        )
+    nodes.append(onnx.helper.make_node("Shape", ["combined_hidden_states"], ["decoder_batch"], end=1))
+    nodes.append(
+        onnx.helper.make_node(
+            "Concat",
+            ["decoder_batch", "self_state_before_tranpose_shape_no_batch"],
+            ["self_state_before_tranpose_shape_dec"],
+            axis=0,
+        )
+    )
+    nodes.append(onnx.helper.make_node("MatMul", ["encoder_decoder_sum", "final_proj"], ["logits"]))
+    nodes.append(
+        onnx.helper.make_node(
+            "Reshape", ["encoder_decoder_sum", "self_state_before_tranpose_shape_dec"], ["self_state_before_tranpose"]
+        )
+    )
+    nodes.append(
+        onnx.helper.make_node("Transpose", ["self_state_before_tranpose"], ["single_self_key_0"], perm=[0, 2, 1, 3])
+    )
+    nodes.append(
+        onnx.helper.make_node("Transpose", ["self_state_before_tranpose"], ["single_self_value_0"], perm=[0, 2, 1, 3])
+    )
+    nodes.append(
+        onnx.helper.make_node("Concat", ["past_self_key_0", "single_self_key_0"], ["present_key_self_0"], axis=2)
+    )
+    nodes.append(
+        onnx.helper.make_node("Concat", ["past_self_value_0", "single_self_value_0"], ["present_value_self_0"], axis=2)
+    )
+
+    # Graph
+    graph = onnx.helper.make_graph(nodes, "decoder", inputs, outputs, initializers)
+    return graph
+
+
+def run_model(model_path):
+    ort_session = ort.InferenceSession(model_path)
+    encoder_input_ids = np.array([[14, 6, 13, 9, 7]]).astype(np.int32)
+    print("encoder_input_ids: ", encoder_input_ids)
+    sequence, scores = ort_session.run(None, {"encoder_input_ids": encoder_input_ids})
+    print("sequence: ", sequence)
+    print("scores: ", scores)
+
+
+def move_initializers_on_outer_scope(model) -> None:
+    main_graph = model.graph
+    beam_search_node = model.graph.node[0]
+    decoder_graph = next(attr for attr in beam_search_node.attribute if attr.name == "decoder").g
+    encoder_graph = next(attr for attr in beam_search_node.attribute if attr.name == "encoder").g
+    main_graph.initializer.extend(move_initializers(decoder_graph, min_elements=10))
+    main_graph.initializer.extend(move_initializers(encoder_graph, min_elements=10))
+
+
+def arg_parser():
+    parser = argparse.ArgumentParser(description="Generate a dummy ONNX model emulating T5 model with BeamSearch op.")
+    parser.add_argument("--output-path", type=str, default="model.onnx", help="Model output path")
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    parser.add_argument("--vocab-size", type=int, default=20, help="Vocab size")
+    parser.add_argument("--embed-dim", type=int, default=8, help="Embedding dimension")
+    parser.add_argument("--num-heads", type=int, default=2, help="Number of heads")
+    parser.add_argument("--head-size", type=int, default=4, help="Head size")
+    parser.add_argument("--beam-size", type=int, default=3, help="Beam size")
+    parser.add_argument("--min-length", type=int, default=1, help="Min length")
+    parser.add_argument("--max-length", type=int, default=10, help="Max length")
+    parser.add_argument("--length-penalty", type=float, default=1.1, help="Length penalty")
+    parser.add_argument("--move-initializers", action="store_true", help="Move initializers to outer scope")
+    parser.add_argument("--sequence-as-input", action="store_true", help="Use sequence as input")
+    parser.add_argument("--decoder-needs-input-ids", action="store_true", help="Decoder needs model/encoder input ids")
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = arg_parser()
+    np.random.seed(args.seed)
+
+    model = create_model(
+        args.vocab_size,
+        args.embed_dim,
+        args.num_heads,
+        args.head_size,
+        args.beam_size,
+        args.min_length,
+        args.max_length,
+        args.length_penalty,
+        args.sequence_as_input,
+        args.decoder_needs_input_ids,
+    )
+    if args.move_initializers:
+        move_initializers_on_outer_scope(model)
+    onnx.save(model, args.output_path)
+
+    run_model(args.output_path)
diff --git a/onnxruntime/test/testdata/dummy_t5_pointer_generator.onnx b/onnxruntime/test/testdata/dummy_t5_pointer_generator.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..f7fee773cffe17cafa21af369ad06887d7cb38b4
GIT binary patch
literal 7100