From 3a0b958586e93119b64de5a67b540d3f96219dba Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Fri, 13 Dec 2024 16:05:48 -0800
Subject: [PATCH 1/4] add 2 CMake build options of Dawn (#23096)
### Description
This change adds the following CMake build options for Dawn:
- onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY
- OFF by default
- when enabled, builds Dawn as a monolithic library (webgpu_dawn.dll)
- onnxruntime_ENABLE_DAWN_BACKEND_VULKAN
- OFF by default
- when enabled, builds with the Vulkan backend for Dawn on Windows
- onnxruntime_ENABLE_DAWN_BACKEND_D3D12
- ON by default
- when enabled, builds with the DirectX 12 backend for Dawn on Windows
### File Size Comparison (Windows)
| Build | cmdline | File Size |
|---|---|---|
| Baseline | --config Release --build_shared_lib | `12,755,456 onnxruntime.dll` |
| WebGPU D3D12 (default) | --use_webgpu --config Release --build_shared_lib | `17,082,368 dxcompiler.dll` ` 1,508,472 dxil.dll` `18,708,480 onnxruntime.dll` |
| WebGPU D3D12+Vulkan | --use_webgpu --config Release --build_shared_lib --cmake_extra_defines onnxruntime_ENABLE_DAWN_BACKEND_D3D12=1 onnxruntime_ENABLE_DAWN_BACKEND_VULKAN=1 | `17,081,344 dxcompiler.dll` ` 1,508,472 dxil.dll` `19,388,416 onnxruntime.dll` |
| WebGPU Vulkan | --use_webgpu --config Release --build_shared_lib --cmake_extra_defines onnxruntime_ENABLE_DAWN_BACKEND_D3D12=0 onnxruntime_ENABLE_DAWN_BACKEND_VULKAN=1 | `17,615,872 onnxruntime.dll` |
| Monolithic | --use_webgpu --config Release --build_shared_lib --cmake_extra_defines onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY=1 | `17,082,368 dxcompiler.dll` ` 1,508,472 dxil.dll` `13,277,696 onnxruntime.dll` ` 5,616,640 webgpu_dawn.dll` |
| External Dawn | --use_webgpu --config Release --build_shared_lib --cmake_extra_defines onnxruntime_USE_EXTERNAL_DAWN=1 --skip_tests | `17,081,344 dxcompiler.dll` ` 1,508,472 dxil.dll` `13,277,184 onnxruntime.dll` |
---
cmake/CMakeLists.txt | 13 ++++++
.../external/onnxruntime_external_deps.cmake | 41 +++++++++++++++----
cmake/onnxruntime_providers_webgpu.cmake | 22 ++++++++--
.../core/providers/webgpu/webgpu_context.cc | 8 ++--
.../webgpu/webgpu_execution_provider.h | 2 +
.../webgpu/webgpu_provider_factory.cc | 20 +++++++++
.../webgpu/webgpu_provider_options.h | 5 +++
7 files changed, 97 insertions(+), 14 deletions(-)
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 7710ab2f4cac7..d2fe7e7457983 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -149,6 +149,10 @@ option(onnxruntime_USE_WEBNN "Build with WebNN support. Enable hardware accelera
option(onnxruntime_USE_WEBGPU "Build with WebGPU support. Enable WebGPU via C/C++ interface." OFF)
option(onnxruntime_USE_EXTERNAL_DAWN "Build with treating Dawn as external dependency. Will not link Dawn at build time." OFF)
option(onnxruntime_CUSTOM_DAWN_SRC_PATH "Path to custom Dawn src dir.")
+option(onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY "Build Dawn as a monolithic library" OFF)
+# The following 2 options are only for Windows
+option(onnxruntime_ENABLE_DAWN_BACKEND_VULKAN "Enable Vulkan backend for Dawn (on Windows)" OFF)
+option(onnxruntime_ENABLE_DAWN_BACKEND_D3D12 "Enable D3D12 backend for Dawn (on Windows)" ON)
# Options related to reducing the binary size produced by the build
# XNNPACK EP requires the internal NHWC contrib ops to be available, so this option must be OFF when onnxruntime_USE_XNNPACK is ON
@@ -955,9 +959,18 @@ if (onnxruntime_USE_WEBGPU)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_WEBGPU=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_WEBGPU=1)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES webgpu)
+ if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
+ list(APPEND ORT_PROVIDER_FLAGS -DBUILD_DAWN_MONOLITHIC_LIBRARY=1)
+ endif()
if (onnxruntime_USE_EXTERNAL_DAWN)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_EXTERNAL_DAWN=1)
endif()
+ if (onnxruntime_ENABLE_DAWN_BACKEND_VULKAN)
+ list(APPEND ORT_PROVIDER_FLAGS -DDAWN_ENABLE_VULKAN=1)
+ endif()
+ if (onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
+ list(APPEND ORT_PROVIDER_FLAGS -DDAWN_ENABLE_D3D12=1)
+ endif()
endif()
if (onnxruntime_USE_CANN)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_CANN=1)
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index ee7abcbad025c..aeaaa7b51d595 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -635,10 +635,19 @@ if (onnxruntime_USE_WEBGPU)
)
endif()
- # use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size
- set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE)
+ if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
+ set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE BOOL "" FORCE)
+ set(DAWN_ENABLE_INSTALL ON CACHE BOOL "" FORCE)
+
+ if (onnxruntime_USE_EXTERNAL_DAWN)
+ message(FATAL_ERROR "onnxruntime_USE_EXTERNAL_DAWN and onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY cannot be enabled at the same time.")
+ endif()
+ else()
+ # use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size
+ set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE)
+ set(DAWN_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
+ endif()
set(DAWN_BUILD_SAMPLES OFF CACHE BOOL "" FORCE)
- set(DAWN_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
set(DAWN_ENABLE_NULL OFF CACHE BOOL "" FORCE)
set(DAWN_FETCH_DEPENDENCIES ON CACHE BOOL "" FORCE)
@@ -667,18 +676,34 @@ if (onnxruntime_USE_WEBGPU)
set(DAWN_USE_BUILT_DXC ON CACHE BOOL "" FORCE)
set(TINT_BUILD_HLSL_WRITER ON CACHE BOOL "" FORCE)
- # Vulkan may optionally be included in a Windows build. Exclude until we have an explicit use case that requires it.
- set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE)
+ if ((NOT onnxruntime_ENABLE_DAWN_BACKEND_VULKAN) AND (NOT onnxruntime_ENABLE_DAWN_BACKEND_D3D12))
+ message(FATAL_ERROR "At least one of onnxruntime_ENABLE_DAWN_BACKEND_VULKAN or onnxruntime_ENABLE_DAWN_BACKEND_D3D12 must be enabled when using Dawn on Windows.")
+ endif()
+ if (onnxruntime_ENABLE_DAWN_BACKEND_VULKAN)
+ set(DAWN_ENABLE_VULKAN ON CACHE BOOL "" FORCE)
+ set(TINT_BUILD_SPV_WRITER ON CACHE BOOL "" FORCE)
+ else()
+ set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE)
+ endif()
+ if (onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
+ set(DAWN_ENABLE_D3D12 ON CACHE BOOL "" FORCE)
+ else()
+ set(DAWN_ENABLE_D3D12 OFF CACHE BOOL "" FORCE)
+ endif()
# We are currently always using the D3D12 backend.
set(DAWN_ENABLE_D3D11 OFF CACHE BOOL "" FORCE)
endif()
onnxruntime_fetchcontent_makeavailable(dawn)
- if (NOT onnxruntime_USE_EXTERNAL_DAWN)
- list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_native)
+ if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
+ list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::webgpu_dawn)
+ else()
+ if (NOT onnxruntime_USE_EXTERNAL_DAWN)
+ list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_native)
+ endif()
+ list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_proc)
endif()
- list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_proc)
endif()
set(onnxruntime_LINK_DIRS)
diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake
index 02c2a5aee481c..fea5964f0dda9 100644
--- a/cmake/onnxruntime_providers_webgpu.cmake
+++ b/cmake/onnxruntime_providers_webgpu.cmake
@@ -22,9 +22,25 @@
onnxruntime_add_static_library(onnxruntime_providers_webgpu ${onnxruntime_providers_webgpu_cc_srcs})
onnxruntime_add_include_to_target(onnxruntime_providers_webgpu
onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface)
- if (NOT onnxruntime_USE_EXTERNAL_DAWN)
- target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native)
+
+ if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
+ target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn)
+
+ if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
+ list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll")
+ endif()
+
+ # Copy webgpu_dawn.dll to the output directory
+ add_custom_command(
+ TARGET onnxruntime_providers_webgpu
+ POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different "$" "$"
+ VERBATIM )
+ else()
+ if (NOT onnxruntime_USE_EXTERNAL_DAWN)
+ target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native)
+ endif()
+ target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc)
endif()
- target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc)
set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime")
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
index ea0cbddb0205d..d66c2a79d28a8 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -28,6 +28,9 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info
// Initialization.Step.1 - Create wgpu::Instance
if (instance_ == nullptr) {
const DawnProcTable* dawn_procs = reinterpret_cast(dawn_proc_table);
+#if defined(BUILD_DAWN_MONOLITHIC_LIBRARY)
+ ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn.");
+#else
#if !defined(USE_EXTERNAL_DAWN)
if (dawn_procs == nullptr) {
dawn_procs = &dawn::native::GetProcs();
@@ -36,6 +39,7 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info
ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided.");
#endif
dawnProcSetProcs(dawn_procs);
+#endif
wgpu::InstanceDescriptor instance_desc{};
instance_desc.features.timedWaitAnyEnable = true;
@@ -49,9 +53,7 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info
wgpu::RequestAdapterOptions req_adapter_options = {};
wgpu::DawnTogglesDescriptor adapter_toggles_desc = {};
req_adapter_options.nextInChain = &adapter_toggles_desc;
-#ifdef _WIN32
- req_adapter_options.backendType = wgpu::BackendType::D3D12;
-#endif
+ req_adapter_options.backendType = static_cast(webgpu_ep_info.backend_type);
req_adapter_options.powerPreference = wgpu::PowerPreference::HighPerformance;
auto enabled_adapter_toggles = GetEnabledAdapterToggles();
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
index 336395a1dd0dd..f9c43c6bfd7d0 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
@@ -26,6 +26,7 @@ struct WebGpuExecutionProviderInfo {
WebGpuExecutionProviderInfo(DataLayout data_layout, bool enable_graph_capture)
: data_layout{data_layout},
enable_graph_capture{enable_graph_capture},
+ backend_type{},
storage_buffer_cache_mode{},
uniform_buffer_cache_mode{},
query_resolve_buffer_cache_mode{},
@@ -36,6 +37,7 @@ struct WebGpuExecutionProviderInfo {
DataLayout data_layout;
bool enable_graph_capture;
+ int backend_type;
webgpu::BufferCacheMode storage_buffer_cache_mode;
webgpu::BufferCacheMode uniform_buffer_cache_mode;
webgpu::BufferCacheMode query_resolve_buffer_cache_mode;
diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
index 6115464cefa6d..6cfe9aac0b0e9 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
@@ -67,6 +67,26 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create(
}
LOGS_DEFAULT(VERBOSE) << "WebGPU EP graph capture enable: " << webgpu_ep_info.enable_graph_capture;
+ std::string backend_type_str;
+ if (config_options.TryGetConfigEntry(kDawnBackendType, backend_type_str)) {
+#ifdef _WIN32
+ // Setup Windows default backend type based on the build configuration
+#if defined(onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
+ webgpu_ep_info.backend_type = static_cast(WGPUBackendType_D3D12);
+#elif defined(onnxruntime_ENABLE_DAWN_BACKEND_VULKAN)
+ webgpu_ep_info.backend_type = static_cast(WGPUBackendType_Vulkan);
+#endif
+#endif
+ if (backend_type_str == kDawnBackendType_D3D12) {
+ webgpu_ep_info.backend_type = static_cast(WGPUBackendType_D3D12);
+ } else if (backend_type_str == kDawnBackendType_Vulkan) {
+ webgpu_ep_info.backend_type = static_cast(WGPUBackendType_Vulkan);
+ } else {
+ ORT_THROW("Invalid Dawn backend type: ", backend_type_str);
+ }
+ }
+ LOGS_DEFAULT(VERBOSE) << "WebGPU EP Dawn backend type: " << webgpu_ep_info.backend_type;
+
auto parse_buffer_cache_mode = [&config_options](const std::string& config_entry_str,
webgpu::BufferCacheMode default_value) -> webgpu::BufferCacheMode {
std::string buffer_cache_mode_str;
diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_options.h b/onnxruntime/core/providers/webgpu/webgpu_provider_options.h
index 63befedffea84..12bb4b32e6a35 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_provider_options.h
+++ b/onnxruntime/core/providers/webgpu/webgpu_provider_options.h
@@ -14,6 +14,8 @@ constexpr const char* kEnableGraphCapture = "WebGPU:enableGraphCapture";
constexpr const char* kDawnProcTable = "WebGPU:dawnProcTable";
+constexpr const char* kDawnBackendType = "WebGPU:dawnBackendType";
+
constexpr const char* kDeviceId = "WebGPU:deviceId";
constexpr const char* kWebGpuInstance = "WebGPU:webgpuInstance";
constexpr const char* kWebGpuAdapter = "WebGPU:webgpuAdapter";
@@ -30,6 +32,9 @@ constexpr const char* kForceCpuNodeNames = "WebGPU:forceCpuNodeNames";
// The following are the possible values for the provider options.
+constexpr const char* kDawnBackendType_D3D12 = "D3D12";
+constexpr const char* kDawnBackendType_Vulkan = "Vulkan";
+
constexpr const char* kPreferredLayout_NCHW = "NCHW";
constexpr const char* kPreferredLayout_NHWC = "NHWC";
From 2ff66b80e0e075696e34c78ab59b351bc8590d56 Mon Sep 17 00:00:00 2001
From: Changming Sun
Date: Mon, 16 Dec 2024 09:05:12 -0800
Subject: [PATCH 2/4] Fix a deadlock bug in EigenNonBlockingThreadPool.h
(#23098)
### Description
This PR fixes a deadlock bug in EigenNonBlockingThreadPool.h. It only happens on platforms with a weakly ordered memory model, such as ARM64.
---
.../platform/EigenNonBlockingThreadPool.h | 124 ++++++++++--------
1 file changed, 66 insertions(+), 58 deletions(-)
diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h
index 27b14f008a8ba..a7c63c507d1ba 100644
--- a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h
+++ b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h
@@ -1467,11 +1467,14 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
status = ThreadStatus::Spinning;
}
- void SetBlocked(std::function should_block,
+ bool SetBlocked(std::function should_block,
std::function post_block) {
std::unique_lock lk(mutex);
- assert(GetStatus() == ThreadStatus::Spinning);
- status.store(ThreadStatus::Blocking, std::memory_order_relaxed);
+ auto old_status = status.exchange(ThreadStatus::Blocking, std::memory_order_seq_cst);
+ if (old_status != ThreadStatus::Spinning) {
+ // Encountered a logical error
+ return false;
+ }
if (should_block()) {
status.store(ThreadStatus::Blocked, std::memory_order_relaxed);
do {
@@ -1480,6 +1483,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
post_block();
}
status.store(ThreadStatus::Spinning, std::memory_order_relaxed);
+ return true;
}
private:
@@ -1558,62 +1562,66 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
// Attempt to block
if (!t) {
- td.SetBlocked( // Pre-block test
- [&]() -> bool {
- bool should_block = true;
- // Check whether work was pushed to us while attempting to block. We make
- // this test while holding the per-thread status lock, and after setting
- // our status to ThreadStatus::Blocking.
- //
- // This synchronizes with ThreadPool::Schedule which pushes work to the queue
- // and then tests for ThreadStatus::Blocking/Blocked (via EnsureAwake):
- //
- // Main thread: Worker:
- // #1 Push work #A Set status blocking
- // #2 Read worker status #B Check queue
- // #3 Wake if blocking/blocked
- //
- // If #A is before #2 then main sees worker blocked and wakes
- //
- // If #A if after #2 then #B will see #1, and we abandon blocking
- assert(!t);
- t = q.PopFront();
- if (t) {
- should_block = false;
- }
-
- // No work pushed to us, continue attempting to block. The remaining
- // test is to synchronize with termination requests. If we are
- // shutting down and all worker threads blocked without work, that's
- // we are done.
- if (should_block) {
- blocked_++;
- if (done_ && blocked_ == num_threads_) {
- should_block = false;
- // Almost done, but need to re-check queues.
- // Consider that all queues are empty and all worker threads are preempted
- // right after incrementing blocked_ above. Now a free-standing thread
- // submits work and calls destructor (which sets done_). If we don't
- // re-check queues, we will exit leaving the work unexecuted.
- if (NonEmptyQueueIndex() != -1) {
- // Note: we must not pop from queues before we decrement blocked_,
- // otherwise the following scenario is possible. Consider that instead
- // of checking for emptiness we popped the only element from queues.
- // Now other worker threads can start exiting, which is bad if the
- // work item submits other work. So we just check emptiness here,
- // which ensures that all worker threads exit at the same time.
- blocked_--;
- } else {
- should_exit = true;
+ if (!td.SetBlocked( // Pre-block test
+ [&]() -> bool {
+ bool should_block = true;
+ // Check whether work was pushed to us while attempting to block. We make
+ // this test while holding the per-thread status lock, and after setting
+ // our status to ThreadStatus::Blocking.
+ //
+ // This synchronizes with ThreadPool::Schedule which pushes work to the queue
+ // and then tests for ThreadStatus::Blocking/Blocked (via EnsureAwake):
+ //
+ // Main thread: Worker:
+ // #1 Push work #A Set status blocking
+ // #2 Read worker status #B Check queue
+ // #3 Wake if blocking/blocked
+ //
+ // If #A is before #2 then main sees worker blocked and wakes
+ //
+ // If #A if after #2 then #B will see #1, and we abandon blocking
+ assert(!t);
+ t = q.PopFront();
+ if (t) {
+ should_block = false;
+ }
+
+ // No work pushed to us, continue attempting to block. The remaining
+ // test is to synchronize with termination requests. If we are
+ // shutting down and all worker threads blocked without work, that's
+ // we are done.
+ if (should_block) {
+ blocked_++;
+ if (done_ && blocked_ == num_threads_) {
+ should_block = false;
+ // Almost done, but need to re-check queues.
+ // Consider that all queues are empty and all worker threads are preempted
+ // right after incrementing blocked_ above. Now a free-standing thread
+ // submits work and calls destructor (which sets done_). If we don't
+ // re-check queues, we will exit leaving the work unexecuted.
+ if (NonEmptyQueueIndex() != -1) {
+ // Note: we must not pop from queues before we decrement blocked_,
+ // otherwise the following scenario is possible. Consider that instead
+ // of checking for emptiness we popped the only element from queues.
+ // Now other worker threads can start exiting, which is bad if the
+ // work item submits other work. So we just check emptiness here,
+ // which ensures that all worker threads exit at the same time.
+ blocked_--;
+ } else {
+ should_exit = true;
+ }
+ }
}
- }
- }
- return should_block;
- },
- // Post-block update (executed only if we blocked)
- [&]() {
- blocked_--;
- });
+ return should_block;
+ },
+ // Post-block update (executed only if we blocked)
+ [&]() {
+ blocked_--;
+ })) {
+ // Encountered a fatal logic error in SetBlocked
+ should_exit = true;
+ break;
+ }
// Thread just unblocked. Unless we picked up work while
// blocking, or are exiting, then either work was pushed to
// us, or it was pushed to an overloaded queue
From a4eb8f27b6e51dec41f943b614702dd114731e13 Mon Sep 17 00:00:00 2001
From: tianf-fff <80665242+tianfang-fafafa@users.noreply.github.com>
Date: Mon, 16 Dec 2024 11:09:48 -0600
Subject: [PATCH 3/4] [VitisAI] Add profiler interface for vitisai (#23032)
### Description
Add common interfaces for the Vitis AI EP profiler.
### Motivation and Context
The Vitis AI EP can collect and record API and kernel timestamps in a file when
onnxruntime profiling ('-p') is enabled.
---
.../core/providers/vitisai/imp/global_api.cc | 12 +++++
.../vitisai/include/vaip/global_api.h | 15 ++++++
.../vitisai/vitisai_execution_provider.cc | 5 ++
.../vitisai/vitisai_execution_provider.h | 2 +
.../providers/vitisai/vitisai_profiler.cc | 49 +++++++++++++++++++
.../core/providers/vitisai/vitisai_profiler.h | 23 +++++++++
6 files changed, 106 insertions(+)
create mode 100644 onnxruntime/core/providers/vitisai/vitisai_profiler.cc
create mode 100644 onnxruntime/core/providers/vitisai/vitisai_profiler.h
diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc
index 51dc79c569589..cccaa65de45f2 100644
--- a/onnxruntime/core/providers/vitisai/imp/global_api.cc
+++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc
@@ -58,6 +58,9 @@ struct OrtVitisAIEpAPI {
const std::vector>& eps,
const char* const* keys,
const char* const* values, size_t kv_len) = nullptr;
+ void (*profiler_collect)(
+ std::vector& api_events,
+ std::vector& kernel_events);
void Ensure() {
if (handle_)
return;
@@ -81,6 +84,7 @@ struct OrtVitisAIEpAPI {
}
std::ignore = env.GetSymbolFromLibrary(handle_, "vaip_get_version",
(void**)&vaip_get_version);
+ std::ignore = env.GetSymbolFromLibrary(handle_, "profiler_collect", (void**)&profiler_collect);
ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "create_ep_context_nodes", (void**)&create_ep_context_nodes));
ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_on_run_start", (void**)&vitisai_ep_on_run_start));
ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_set_ep_dynamic_options", (void**)&vitisai_ep_set_ep_dynamic_options));
@@ -97,6 +101,14 @@ static vaip_core::OrtApiForVaip the_global_api;
std::shared_ptr get_kernel_registry_vitisaiep() { return s_kernel_registry_vitisaiep; }
const std::vector& get_domains_vitisaiep() { return s_domains_vitisaiep; }
+void profiler_collect(
+ std::vector& api_events,
+ std::vector& kernel_events) {
+ if (s_library_vitisaiep.profiler_collect) {
+ s_library_vitisaiep.profiler_collect(api_events, kernel_events);
+ }
+}
+
vaip_core::DllSafe>> compile_onnx_model(
const onnxruntime::GraphViewer& graph_viewer, const logging::Logger& logger, const ProviderOptions& options) {
auto model_path = graph_viewer.ModelPath().string();
diff --git a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h
index b0353bd6adae9..704b156dff57f 100644
--- a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h
+++ b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h
@@ -24,3 +24,18 @@ int vitisai_ep_set_ep_dynamic_options(
const std::vector>& eps,
const char* const* keys,
const char* const* values, size_t kv_len);
+/**
+ * Replace EventRecord with std::tuple,
+ * because EventRecord is defined in profiler_common.h which is used inside onnxruntime.
+ * However, profiler_collect function will call vitis ep which can't include profiler_common.h.
+ */
+using EventInfo = std::tuple<
+ std::string, // name
+ int, // pid
+ int, // tid
+ long long, // timestamp
+ long long // duration
+ >;
+void profiler_collect(
+ std::vector& api_events,
+ std::vector& kernel_events);
diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
index 023a954c83d70..3a99f56bb732a 100644
--- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
+++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
@@ -1,6 +1,7 @@
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
// Licensed under the MIT License.
#include "vitisai_execution_provider.h"
+#include "vitisai_profiler.h"
// Standard headers/libs.
#include
@@ -135,4 +136,8 @@ common::Status VitisAIExecutionProvider::SetEpDynamicOptions(gsl::span VitisAIExecutionProvider::GetProfiler() {
+ return std::make_unique();
+}
} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h
index 77dede6035b4c..f0d1a289a2a73 100644
--- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h
+++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h
@@ -36,6 +36,8 @@ class VitisAIExecutionProvider : public IExecutionProvider {
std::vector& node_compute_funcs) override;
std::shared_ptr GetKernelRegistry() const override;
+ std::unique_ptr GetProfiler() override;
+
// This method is called after both `GetComputeCapabilityOps()` and `Compile()`.
// This timing is required to work with both compliation-based EPs and non-compilation-based EPs.
const InlinedVector GetEpContextNodes() const override;
diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.cc b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc
new file mode 100644
index 0000000000000..d84507ec6ad02
--- /dev/null
+++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+// Licensed under the MIT License.
+
+#include "vitisai_profiler.h"
+
+namespace onnxruntime {
+namespace profiling {
+
+#if defined(USE_VITISAI)
+
+bool VitisaiProfiler::StartProfiling(TimePoint tp) {
+ return true;
+}
+
+void VitisaiProfiler::EndProfiling(TimePoint tp, Events& events) {
+ auto time_point =
+ std::chrono::duration_cast(tp.time_since_epoch()).count();
+
+ std::vector api_events;
+ std::vector kernel_events;
+ profiler_collect(api_events, kernel_events);
+
+ std::unordered_map event_args;
+
+ for (auto& a : api_events) {
+ events.emplace_back(EventCategory::API_EVENT,
+ std::get<1>(a), // pid
+ std::get<2>(a), // tid
+ std::get<0>(a), // name
+ std::get<3>(a) - time_point, // timestamp
+ std::get<4>(a), // duration
+ event_args);
+ }
+
+ for (auto& k : kernel_events) {
+ events.emplace_back(EventCategory::KERNEL_EVENT,
+ std::get<1>(k),
+ std::get<2>(k),
+ std::get<0>(k),
+ std::get<3>(k) - time_point,
+ std::get<4>(k),
+ event_args);
+ }
+}
+
+#endif
+
+} // namespace profiling
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.h b/onnxruntime/core/providers/vitisai/vitisai_profiler.h
new file mode 100644
index 0000000000000..aedbda31f7b1d
--- /dev/null
+++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/vitisai/include/vaip/global_api.h"
+
+namespace onnxruntime {
+namespace profiling {
+
+#if defined(USE_VITISAI)
+class VitisaiProfiler final : public EpProfiler {
+ public:
+ VitisaiProfiler() = default;
+ ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(VitisaiProfiler);
+ ~VitisaiProfiler() {}
+ bool StartProfiling(TimePoint) override;
+ void EndProfiling(TimePoint, Events&) override;
+ void Start(uint64_t) override{};
+ void Stop(uint64_t) override{};
+};
+#endif
+
+} // namespace profiling
+} // namespace onnxruntime
From ae970681372e20c3df3f1b40bfb0ee06a02c39c8 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov
Date: Mon, 16 Dec 2024 10:38:23 -0800
Subject: [PATCH 4/4] Fix Pybind memory leak (#23105)
### Description
`PyArray_GETITEM` returns a new reference, which leaks if it is never released.
### Motivation and Context
Addresses https://github.com/microsoft/onnxruntime/issues/22271
---
onnxruntime/python/onnxruntime_pybind_mlvalue.cc | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
index 92396bb09bd4c..5742b4db42512 100644
--- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
+++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
@@ -280,7 +280,7 @@ void DmlToCpuMemCpy(void* dst, const void* src, size_t num_bytes) {
uint32_t readback_heap_size = gsl::narrow_cast(sizeof(readback_heap));
ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(dml_readback_heap_guid, &readback_heap_size, &readback_heap));
- // ReadbackFromGpu already syncs with the CPU and waits for the copy to be completed, so we don't need to sync after
+ // ReadbackFromGpu already syncs with the CPU and waits for the copy to be completed, so we dont need to sync after
// this call
readback_heap->ReadbackFromGpu(
gsl::make_span(static_cast(dst), num_bytes),
@@ -428,7 +428,7 @@ MLDataType NumpyTypeToOnnxRuntimeTensorType(int numpy_type) {
// Special, not a C type expands to enum value of 16
{NPY_FLOAT16, DataTypeImpl::GetType()},
{NPY_DOUBLE, DataTypeImpl::GetType()},
- // We don't want to use size specific types such
+ // We dont want to use size specific types such
// as NPY_INT32 bc they are not enums but hash defines
// which may map into other enums and may conflict with other entries here
// also NPY docs define these sizes as platform specific, thus we
@@ -581,6 +581,7 @@ static void CopyDataToTensor(PyArrayObject* darray, int npy_type, Tensor& tensor
for (int i = 0; i < total_items; ++i, src += item_size) {
// Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
PyObject* item = PyArray_GETITEM(darray, src);
+ UniqueDecRefPtr itemGuard(item, DecRefFn());
PyObject* pStr = PyObject_Str(item);
UniqueDecRefPtr strGuard(pStr, DecRefFn());
dst[i] = py::reinterpret_borrow(pStr);