From 3a0b958586e93119b64de5a67b540d3f96219dba Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Fri, 13 Dec 2024 16:05:48 -0800
Subject: [PATCH 1/4] add 2 CMake build options of Dawn (#23096)
### Description
This change adds the following CMake build options for Dawn:
- onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY
- OFF by default
- when enabled, builds Dawn as a monolithic library (webgpu_dawn.dll)
- onnxruntime_ENABLE_DAWN_BACKEND_VULKAN
- OFF by default
- when enabled, builds with the Vulkan backend for Dawn on Windows
- onnxruntime_ENABLE_DAWN_BACKEND_D3D12
- ON by default
- when enabled, builds with the DirectX 12 backend for Dawn on Windows
### File Size Comparison (Windows)
| Build | cmdline | File Size |
|---|---|---|
| Baseline | --config Release --build_shared_lib | `12,755,456 onnxruntime.dll` |
| WebGPU D3D12 (default) | --use_webgpu --config Release --build_shared_lib | `17,082,368 dxcompiler.dll` ` 1,508,472 dxil.dll` `18,708,480 onnxruntime.dll` |
| WebGPU D3D12+Vulkan | --use_webgpu --config Release --build_shared_lib --cmake_extra_defines onnxruntime_ENABLE_DAWN_BACKEND_D3D12=1 onnxruntime_ENABLE_DAWN_BACKEND_VULKAN=1 | `17,081,344 dxcompiler.dll` ` 1,508,472 dxil.dll` `19,388,416 onnxruntime.dll` |
| WebGPU Vulkan | --use_webgpu --config Release --build_shared_lib --cmake_extra_defines onnxruntime_ENABLE_DAWN_BACKEND_D3D12=0 onnxruntime_ENABLE_DAWN_BACKEND_VULKAN=1 | `17,615,872 onnxruntime.dll` |
| Monolithic | --use_webgpu --config Release --build_shared_lib --cmake_extra_defines onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY=1 | `17,082,368 dxcompiler.dll` ` 1,508,472 dxil.dll` `13,277,696 onnxruntime.dll` ` 5,616,640 webgpu_dawn.dll` |
| External Dawn | --use_webgpu --config Release --build_shared_lib --cmake_extra_defines onnxruntime_USE_EXTERNAL_DAWN=1 --skip_tests | `17,081,344 dxcompiler.dll` ` 1,508,472 dxil.dll` `13,277,184 onnxruntime.dll` |
---
cmake/CMakeLists.txt | 13 ++++++
.../external/onnxruntime_external_deps.cmake | 41 +++++++++++++++----
cmake/onnxruntime_providers_webgpu.cmake | 22 ++++++++--
.../core/providers/webgpu/webgpu_context.cc | 8 ++--
.../webgpu/webgpu_execution_provider.h | 2 +
.../webgpu/webgpu_provider_factory.cc | 20 +++++++++
.../webgpu/webgpu_provider_options.h | 5 +++
7 files changed, 97 insertions(+), 14 deletions(-)
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 7710ab2f4cac7..d2fe7e7457983 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -149,6 +149,10 @@ option(onnxruntime_USE_WEBNN "Build with WebNN support. Enable hardware accelera
option(onnxruntime_USE_WEBGPU "Build with WebGPU support. Enable WebGPU via C/C++ interface." OFF)
option(onnxruntime_USE_EXTERNAL_DAWN "Build with treating Dawn as external dependency. Will not link Dawn at build time." OFF)
option(onnxruntime_CUSTOM_DAWN_SRC_PATH "Path to custom Dawn src dir.")
+option(onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY "Build Dawn as a monolithic library" OFF)
+# The following 2 options are only for Windows
+option(onnxruntime_ENABLE_DAWN_BACKEND_VULKAN "Enable Vulkan backend for Dawn (on Windows)" OFF)
+option(onnxruntime_ENABLE_DAWN_BACKEND_D3D12 "Enable D3D12 backend for Dawn (on Windows)" ON)
# Options related to reducing the binary size produced by the build
# XNNPACK EP requires the internal NHWC contrib ops to be available, so this option must be OFF when onnxruntime_USE_XNNPACK is ON
@@ -955,9 +959,18 @@ if (onnxruntime_USE_WEBGPU)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_WEBGPU=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_WEBGPU=1)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES webgpu)
+ if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
+ list(APPEND ORT_PROVIDER_FLAGS -DBUILD_DAWN_MONOLITHIC_LIBRARY=1)
+ endif()
if (onnxruntime_USE_EXTERNAL_DAWN)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_EXTERNAL_DAWN=1)
endif()
+ if (onnxruntime_ENABLE_DAWN_BACKEND_VULKAN)
+ list(APPEND ORT_PROVIDER_FLAGS -DDAWN_ENABLE_VULKAN=1)
+ endif()
+ if (onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
+ list(APPEND ORT_PROVIDER_FLAGS -DDAWN_ENABLE_D3D12=1)
+ endif()
endif()
if (onnxruntime_USE_CANN)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_CANN=1)
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index ee7abcbad025c..aeaaa7b51d595 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -635,10 +635,19 @@ if (onnxruntime_USE_WEBGPU)
)
endif()
- # use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size
- set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE)
+ if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
+ set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE BOOL "" FORCE)
+ set(DAWN_ENABLE_INSTALL ON CACHE BOOL "" FORCE)
+
+ if (onnxruntime_USE_EXTERNAL_DAWN)
+ message(FATAL_ERROR "onnxruntime_USE_EXTERNAL_DAWN and onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY cannot be enabled at the same time.")
+ endif()
+ else()
+ # use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size
+ set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE)
+ set(DAWN_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
+ endif()
set(DAWN_BUILD_SAMPLES OFF CACHE BOOL "" FORCE)
- set(DAWN_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
set(DAWN_ENABLE_NULL OFF CACHE BOOL "" FORCE)
set(DAWN_FETCH_DEPENDENCIES ON CACHE BOOL "" FORCE)
@@ -667,18 +676,34 @@ if (onnxruntime_USE_WEBGPU)
set(DAWN_USE_BUILT_DXC ON CACHE BOOL "" FORCE)
set(TINT_BUILD_HLSL_WRITER ON CACHE BOOL "" FORCE)
- # Vulkan may optionally be included in a Windows build. Exclude until we have an explicit use case that requires it.
- set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE)
+ if ((NOT onnxruntime_ENABLE_DAWN_BACKEND_VULKAN) AND (NOT onnxruntime_ENABLE_DAWN_BACKEND_D3D12))
+ message(FATAL_ERROR "At least one of onnxruntime_ENABLE_DAWN_BACKEND_VULKAN or onnxruntime_ENABLE_DAWN_BACKEND_D3D12 must be enabled when using Dawn on Windows.")
+ endif()
+ if (onnxruntime_ENABLE_DAWN_BACKEND_VULKAN)
+ set(DAWN_ENABLE_VULKAN ON CACHE BOOL "" FORCE)
+ set(TINT_BUILD_SPV_WRITER ON CACHE BOOL "" FORCE)
+ else()
+ set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE)
+ endif()
+ if (onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
+ set(DAWN_ENABLE_D3D12 ON CACHE BOOL "" FORCE)
+ else()
+ set(DAWN_ENABLE_D3D12 OFF CACHE BOOL "" FORCE)
+ endif()
# We are currently always using the D3D12 backend.
set(DAWN_ENABLE_D3D11 OFF CACHE BOOL "" FORCE)
endif()
onnxruntime_fetchcontent_makeavailable(dawn)
- if (NOT onnxruntime_USE_EXTERNAL_DAWN)
- list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_native)
+ if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
+ list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::webgpu_dawn)
+ else()
+ if (NOT onnxruntime_USE_EXTERNAL_DAWN)
+ list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_native)
+ endif()
+ list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_proc)
endif()
- list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_proc)
endif()
set(onnxruntime_LINK_DIRS)
diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake
index 02c2a5aee481c..fea5964f0dda9 100644
--- a/cmake/onnxruntime_providers_webgpu.cmake
+++ b/cmake/onnxruntime_providers_webgpu.cmake
@@ -22,9 +22,25 @@
onnxruntime_add_static_library(onnxruntime_providers_webgpu ${onnxruntime_providers_webgpu_cc_srcs})
onnxruntime_add_include_to_target(onnxruntime_providers_webgpu
onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface)
- if (NOT onnxruntime_USE_EXTERNAL_DAWN)
- target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native)
+
+ if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
+ target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn)
+
+ if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
+ list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll")
+ endif()
+
+ # Copy webgpu_dawn.dll to the output directory
+ add_custom_command(
+ TARGET onnxruntime_providers_webgpu
+ POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different "$" "$"
+ VERBATIM )
+ else()
+ if (NOT onnxruntime_USE_EXTERNAL_DAWN)
+ target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native)
+ endif()
+ target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc)
endif()
- target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc)
set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime")
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
index ea0cbddb0205d..d66c2a79d28a8 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -28,6 +28,9 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info
// Initialization.Step.1 - Create wgpu::Instance
if (instance_ == nullptr) {
const DawnProcTable* dawn_procs = reinterpret_cast(dawn_proc_table);
+#if defined(BUILD_DAWN_MONOLITHIC_LIBRARY)
+ ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn.");
+#else
#if !defined(USE_EXTERNAL_DAWN)
if (dawn_procs == nullptr) {
dawn_procs = &dawn::native::GetProcs();
@@ -36,6 +39,7 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info
ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided.");
#endif
dawnProcSetProcs(dawn_procs);
+#endif
wgpu::InstanceDescriptor instance_desc{};
instance_desc.features.timedWaitAnyEnable = true;
@@ -49,9 +53,7 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info
wgpu::RequestAdapterOptions req_adapter_options = {};
wgpu::DawnTogglesDescriptor adapter_toggles_desc = {};
req_adapter_options.nextInChain = &adapter_toggles_desc;
-#ifdef _WIN32
- req_adapter_options.backendType = wgpu::BackendType::D3D12;
-#endif
+ req_adapter_options.backendType = static_cast(webgpu_ep_info.backend_type);
req_adapter_options.powerPreference = wgpu::PowerPreference::HighPerformance;
auto enabled_adapter_toggles = GetEnabledAdapterToggles();
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
index 336395a1dd0dd..f9c43c6bfd7d0 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
@@ -26,6 +26,7 @@ struct WebGpuExecutionProviderInfo {
WebGpuExecutionProviderInfo(DataLayout data_layout, bool enable_graph_capture)
: data_layout{data_layout},
enable_graph_capture{enable_graph_capture},
+ backend_type{},
storage_buffer_cache_mode{},
uniform_buffer_cache_mode{},
query_resolve_buffer_cache_mode{},
@@ -36,6 +37,7 @@ struct WebGpuExecutionProviderInfo {
DataLayout data_layout;
bool enable_graph_capture;
+ int backend_type;
webgpu::BufferCacheMode storage_buffer_cache_mode;
webgpu::BufferCacheMode uniform_buffer_cache_mode;
webgpu::BufferCacheMode query_resolve_buffer_cache_mode;
diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
index 6115464cefa6d..6cfe9aac0b0e9 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
@@ -67,6 +67,26 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create(
}
LOGS_DEFAULT(VERBOSE) << "WebGPU EP graph capture enable: " << webgpu_ep_info.enable_graph_capture;
+ std::string backend_type_str;
+ if (config_options.TryGetConfigEntry(kDawnBackendType, backend_type_str)) {
+#ifdef _WIN32
+ // Setup Windows default backend type based on the build configuration
+#if defined(onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
+ webgpu_ep_info.backend_type = static_cast(WGPUBackendType_D3D12);
+#elif defined(onnxruntime_ENABLE_DAWN_BACKEND_VULKAN)
+ webgpu_ep_info.backend_type = static_cast(WGPUBackendType_Vulkan);
+#endif
+#endif
+ if (backend_type_str == kDawnBackendType_D3D12) {
+ webgpu_ep_info.backend_type = static_cast(WGPUBackendType_D3D12);
+ } else if (backend_type_str == kDawnBackendType_Vulkan) {
+ webgpu_ep_info.backend_type = static_cast(WGPUBackendType_Vulkan);
+ } else {
+ ORT_THROW("Invalid Dawn backend type: ", backend_type_str);
+ }
+ }
+ LOGS_DEFAULT(VERBOSE) << "WebGPU EP Dawn backend type: " << webgpu_ep_info.backend_type;
+
auto parse_buffer_cache_mode = [&config_options](const std::string& config_entry_str,
webgpu::BufferCacheMode default_value) -> webgpu::BufferCacheMode {
std::string buffer_cache_mode_str;
diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_options.h b/onnxruntime/core/providers/webgpu/webgpu_provider_options.h
index 63befedffea84..12bb4b32e6a35 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_provider_options.h
+++ b/onnxruntime/core/providers/webgpu/webgpu_provider_options.h
@@ -14,6 +14,8 @@ constexpr const char* kEnableGraphCapture = "WebGPU:enableGraphCapture";
constexpr const char* kDawnProcTable = "WebGPU:dawnProcTable";
+constexpr const char* kDawnBackendType = "WebGPU:dawnBackendType";
+
constexpr const char* kDeviceId = "WebGPU:deviceId";
constexpr const char* kWebGpuInstance = "WebGPU:webgpuInstance";
constexpr const char* kWebGpuAdapter = "WebGPU:webgpuAdapter";
@@ -30,6 +32,9 @@ constexpr const char* kForceCpuNodeNames = "WebGPU:forceCpuNodeNames";
// The following are the possible values for the provider options.
+constexpr const char* kDawnBackendType_D3D12 = "D3D12";
+constexpr const char* kDawnBackendType_Vulkan = "Vulkan";
+
constexpr const char* kPreferredLayout_NCHW = "NCHW";
constexpr const char* kPreferredLayout_NHWC = "NHWC";
From 2ff66b80e0e075696e34c78ab59b351bc8590d56 Mon Sep 17 00:00:00 2001
From: Changming Sun
Date: Mon, 16 Dec 2024 09:05:12 -0800
Subject: [PATCH 2/4] Fix a deadlock bug in EigenNonBlockingThreadPool.h
(#23098)
### Description
This PR fixes a deadlock bug in EigenNonBlockingThreadPool.h. It only happens on platforms with a weakly ordered memory model, such as ARM64.
---
.../platform/EigenNonBlockingThreadPool.h | 124 ++++++++++--------
1 file changed, 66 insertions(+), 58 deletions(-)
diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h
index 27b14f008a8ba..a7c63c507d1ba 100644
--- a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h
+++ b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h
@@ -1467,11 +1467,14 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
status = ThreadStatus::Spinning;
}
- void SetBlocked(std::function should_block,
+ bool SetBlocked(std::function should_block,
std::function post_block) {
std::unique_lock lk(mutex);
- assert(GetStatus() == ThreadStatus::Spinning);
- status.store(ThreadStatus::Blocking, std::memory_order_relaxed);
+ auto old_status = status.exchange(ThreadStatus::Blocking, std::memory_order_seq_cst);
+ if (old_status != ThreadStatus::Spinning) {
+ // Encountered a logical error
+ return false;
+ }
if (should_block()) {
status.store(ThreadStatus::Blocked, std::memory_order_relaxed);
do {
@@ -1480,6 +1483,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
post_block();
}
status.store(ThreadStatus::Spinning, std::memory_order_relaxed);
+ return true;
}
private:
@@ -1558,62 +1562,66 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
// Attempt to block
if (!t) {
- td.SetBlocked( // Pre-block test
- [&]() -> bool {
- bool should_block = true;
- // Check whether work was pushed to us while attempting to block. We make
- // this test while holding the per-thread status lock, and after setting
- // our status to ThreadStatus::Blocking.
- //
- // This synchronizes with ThreadPool::Schedule which pushes work to the queue
- // and then tests for ThreadStatus::Blocking/Blocked (via EnsureAwake):
- //
- // Main thread: Worker:
- // #1 Push work #A Set status blocking
- // #2 Read worker status #B Check queue
- // #3 Wake if blocking/blocked
- //
- // If #A is before #2 then main sees worker blocked and wakes
- //
- // If #A if after #2 then #B will see #1, and we abandon blocking
- assert(!t);
- t = q.PopFront();
- if (t) {
- should_block = false;
- }
-
- // No work pushed to us, continue attempting to block. The remaining
- // test is to synchronize with termination requests. If we are
- // shutting down and all worker threads blocked without work, that's
- // we are done.
- if (should_block) {
- blocked_++;
- if (done_ && blocked_ == num_threads_) {
- should_block = false;
- // Almost done, but need to re-check queues.
- // Consider that all queues are empty and all worker threads are preempted
- // right after incrementing blocked_ above. Now a free-standing thread
- // submits work and calls destructor (which sets done_). If we don't
- // re-check queues, we will exit leaving the work unexecuted.
- if (NonEmptyQueueIndex() != -1) {
- // Note: we must not pop from queues before we decrement blocked_,
- // otherwise the following scenario is possible. Consider that instead
- // of checking for emptiness we popped the only element from queues.
- // Now other worker threads can start exiting, which is bad if the
- // work item submits other work. So we just check emptiness here,
- // which ensures that all worker threads exit at the same time.
- blocked_--;
- } else {
- should_exit = true;
+ if (!td.SetBlocked( // Pre-block test
+ [&]() -> bool {
+ bool should_block = true;
+ // Check whether work was pushed to us while attempting to block. We make
+ // this test while holding the per-thread status lock, and after setting
+ // our status to ThreadStatus::Blocking.
+ //
+ // This synchronizes with ThreadPool::Schedule which pushes work to the queue
+ // and then tests for ThreadStatus::Blocking/Blocked (via EnsureAwake):
+ //
+ // Main thread: Worker:
+ // #1 Push work #A Set status blocking
+ // #2 Read worker status #B Check queue
+ // #3 Wake if blocking/blocked
+ //
+ // If #A is before #2 then main sees worker blocked and wakes
+ //
+ // If #A if after #2 then #B will see #1, and we abandon blocking
+ assert(!t);
+ t = q.PopFront();
+ if (t) {
+ should_block = false;
+ }
+
+ // No work pushed to us, continue attempting to block. The remaining
+ // test is to synchronize with termination requests. If we are
+ // shutting down and all worker threads blocked without work, that's
+ // we are done.
+ if (should_block) {
+ blocked_++;
+ if (done_ && blocked_ == num_threads_) {
+ should_block = false;
+ // Almost done, but need to re-check queues.
+ // Consider that all queues are empty and all worker threads are preempted
+ // right after incrementing blocked_ above. Now a free-standing thread
+ // submits work and calls destructor (which sets done_). If we don't
+ // re-check queues, we will exit leaving the work unexecuted.
+ if (NonEmptyQueueIndex() != -1) {
+ // Note: we must not pop from queues before we decrement blocked_,
+ // otherwise the following scenario is possible. Consider that instead
+ // of checking for emptiness we popped the only element from queues.
+ // Now other worker threads can start exiting, which is bad if the
+ // work item submits other work. So we just check emptiness here,
+ // which ensures that all worker threads exit at the same time.
+ blocked_--;
+ } else {
+ should_exit = true;
+ }
+ }
}
- }
- }
- return should_block;
- },
- // Post-block update (executed only if we blocked)
- [&]() {
- blocked_--;
- });
+ return should_block;
+ },
+ // Post-block update (executed only if we blocked)
+ [&]() {
+ blocked_--;
+ })) {
+ // Encountered a fatal logic error in SetBlocked
+ should_exit = true;
+ break;
+ }
// Thread just unblocked. Unless we picked up work while
// blocking, or are exiting, then either work was pushed to
// us, or it was pushed to an overloaded queue
From a4eb8f27b6e51dec41f943b614702dd114731e13 Mon Sep 17 00:00:00 2001
From: tianf-fff <80665242+tianfang-fafafa@users.noreply.github.com>
Date: Mon, 16 Dec 2024 11:09:48 -0600
Subject: [PATCH 3/4] [VitisAI] Add profiler interface for vitisai (#23032)
### Description
Add common interfaces for the Vitis AI EP profiler.
### Motivation and Context
The Vitis AI EP can collect and record API and kernel timestamps in a file when
onnxruntime profiling ('-p') is enabled.
---
.../core/providers/vitisai/imp/global_api.cc | 12 +++++
.../vitisai/include/vaip/global_api.h | 15 ++++++
.../vitisai/vitisai_execution_provider.cc | 5 ++
.../vitisai/vitisai_execution_provider.h | 2 +
.../providers/vitisai/vitisai_profiler.cc | 49 +++++++++++++++++++
.../core/providers/vitisai/vitisai_profiler.h | 23 +++++++++
6 files changed, 106 insertions(+)
create mode 100644 onnxruntime/core/providers/vitisai/vitisai_profiler.cc
create mode 100644 onnxruntime/core/providers/vitisai/vitisai_profiler.h
diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc
index 51dc79c569589..cccaa65de45f2 100644
--- a/onnxruntime/core/providers/vitisai/imp/global_api.cc
+++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc
@@ -58,6 +58,9 @@ struct OrtVitisAIEpAPI {
const std::vector>& eps,
const char* const* keys,
const char* const* values, size_t kv_len) = nullptr;
+ void (*profiler_collect)(
+ std::vector& api_events,
+ std::vector& kernel_events);
void Ensure() {
if (handle_)
return;
@@ -81,6 +84,7 @@ struct OrtVitisAIEpAPI {
}
std::ignore = env.GetSymbolFromLibrary(handle_, "vaip_get_version",
(void**)&vaip_get_version);
+ std::ignore = env.GetSymbolFromLibrary(handle_, "profiler_collect", (void**)&profiler_collect);
ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "create_ep_context_nodes", (void**)&create_ep_context_nodes));
ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_on_run_start", (void**)&vitisai_ep_on_run_start));
ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_set_ep_dynamic_options", (void**)&vitisai_ep_set_ep_dynamic_options));
@@ -97,6 +101,14 @@ static vaip_core::OrtApiForVaip the_global_api;
std::shared_ptr get_kernel_registry_vitisaiep() { return s_kernel_registry_vitisaiep; }
const std::vector& get_domains_vitisaiep() { return s_domains_vitisaiep; }
+void profiler_collect(
+ std::vector& api_events,
+ std::vector& kernel_events) {
+ if (s_library_vitisaiep.profiler_collect) {
+ s_library_vitisaiep.profiler_collect(api_events, kernel_events);
+ }
+}
+
vaip_core::DllSafe>> compile_onnx_model(
const onnxruntime::GraphViewer& graph_viewer, const logging::Logger& logger, const ProviderOptions& options) {
auto model_path = graph_viewer.ModelPath().string();
diff --git a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h
index b0353bd6adae9..704b156dff57f 100644
--- a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h
+++ b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h
@@ -24,3 +24,18 @@ int vitisai_ep_set_ep_dynamic_options(
const std::vector>& eps,
const char* const* keys,
const char* const* values, size_t kv_len);
+/**
+ * Replace EventRecord with std::tuple,
+ * because EventRecord is defined in profiler_common.h which is used inside onnxruntime.
+ * However, profiler_collect function will call vitis ep which can't include profiler_common.h.
+ */
+using EventInfo = std::tuple<
+ std::string, // name
+ int, // pid
+ int, // tid
+ long long, // timestamp
+ long long // duration
+ >;
+void profiler_collect(
+ std::vector& api_events,
+ std::vector& kernel_events);
diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
index 023a954c83d70..3a99f56bb732a 100644
--- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
+++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
@@ -1,6 +1,7 @@
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
// Licensed under the MIT License.
#include "vitisai_execution_provider.h"
+#include "vitisai_profiler.h"
// Standard headers/libs.
#include
@@ -135,4 +136,8 @@ common::Status VitisAIExecutionProvider::SetEpDynamicOptions(gsl::span VitisAIExecutionProvider::GetProfiler() {
+ return std::make_unique();
+}
} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h
index 77dede6035b4c..f0d1a289a2a73 100644
--- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h
+++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h
@@ -36,6 +36,8 @@ class VitisAIExecutionProvider : public IExecutionProvider {
std::vector& node_compute_funcs) override;
std::shared_ptr GetKernelRegistry() const override;
+ std::unique_ptr GetProfiler() override;
+
// This method is called after both `GetComputeCapabilityOps()` and `Compile()`.
// This timing is required to work with both compliation-based EPs and non-compilation-based EPs.
const InlinedVector GetEpContextNodes() const override;
diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.cc b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc
new file mode 100644
index 0000000000000..d84507ec6ad02
--- /dev/null
+++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+// Licensed under the MIT License.
+
+#include "vitisai_profiler.h"
+
+namespace onnxruntime {
+namespace profiling {
+
+#if defined(USE_VITISAI)
+
+bool VitisaiProfiler::StartProfiling(TimePoint tp) {
+ return true;
+}
+
+void VitisaiProfiler::EndProfiling(TimePoint tp, Events& events) {
+ auto time_point =
+ std::chrono::duration_cast(tp.time_since_epoch()).count();
+
+ std::vector api_events;
+ std::vector kernel_events;
+ profiler_collect(api_events, kernel_events);
+
+ std::unordered_map event_args;
+
+ for (auto& a : api_events) {
+ events.emplace_back(EventCategory::API_EVENT,
+ std::get<1>(a), // pid
+ std::get<2>(a), // tid
+ std::get<0>(a), // name
+ std::get<3>(a) - time_point, // timestamp
+ std::get<4>(a), // duration
+ event_args);
+ }
+
+ for (auto& k : kernel_events) {
+ events.emplace_back(EventCategory::KERNEL_EVENT,
+ std::get<1>(k),
+ std::get<2>(k),
+ std::get<0>(k),
+ std::get<3>(k) - time_point,
+ std::get<4>(k),
+ event_args);
+ }
+}
+
+#endif
+
+} // namespace profiling
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.h b/onnxruntime/core/providers/vitisai/vitisai_profiler.h
new file mode 100644
index 0000000000000..aedbda31f7b1d
--- /dev/null
+++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/vitisai/include/vaip/global_api.h"
+
+namespace onnxruntime {
+namespace profiling {
+
+#if defined(USE_VITISAI)
+class VitisaiProfiler final : public EpProfiler {
+ public:
+ VitisaiProfiler() = default;
+ ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(VitisaiProfiler);
+ ~VitisaiProfiler() {}
+ bool StartProfiling(TimePoint) override;
+ void EndProfiling(TimePoint, Events&) override;
+ void Start(uint64_t) override{};
+ void Stop(uint64_t) override{};
+};
+#endif
+
+} // namespace profiling
+} // namespace onnxruntime
From ae970681372e20c3df3f1b40bfb0ee06a02c39c8 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov
Date: Mon, 16 Dec 2024 10:38:23 -0800
Subject: [PATCH 4/4] Fix Pybind memory leak (#23105)
### Description
`PyArray_GETITEM` returns a new reference, which leaks if it is never released.
### Motivation and Context
Addresses https://github.com/microsoft/onnxruntime/issues/22271
---
onnxruntime/python/onnxruntime_pybind_mlvalue.cc | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
index 92396bb09bd4c..5742b4db42512 100644
--- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
+++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
@@ -280,7 +280,7 @@ void DmlToCpuMemCpy(void* dst, const void* src, size_t num_bytes) {
uint32_t readback_heap_size = gsl::narrow_cast(sizeof(readback_heap));
ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(dml_readback_heap_guid, &readback_heap_size, &readback_heap));
- // ReadbackFromGpu already syncs with the CPU and waits for the copy to be completed, so we don't need to sync after
+ // ReadbackFromGpu already syncs with the CPU and waits for the copy to be completed, so we dont need to sync after
// this call
readback_heap->ReadbackFromGpu(
gsl::make_span(static_cast(dst), num_bytes),
@@ -428,7 +428,7 @@ MLDataType NumpyTypeToOnnxRuntimeTensorType(int numpy_type) {
// Special, not a C type expands to enum value of 16
{NPY_FLOAT16, DataTypeImpl::GetType()},
{NPY_DOUBLE, DataTypeImpl::GetType()},
- // We don't want to use size specific types such
+ // We dont want to use size specific types such
// as NPY_INT32 bc they are not enums but hash defines
// which may map into other enums and may conflict with other entries here
// also NPY docs define these sizes as platform specific, thus we
@@ -581,6 +581,7 @@ static void CopyDataToTensor(PyArrayObject* darray, int npy_type, Tensor& tensor
for (int i = 0; i < total_items; ++i, src += item_size) {
// Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
PyObject* item = PyArray_GETITEM(darray, src);
+ UniqueDecRefPtr itemGuard(item, DecRefFn());
PyObject* pStr = PyObject_Str(item);
UniqueDecRefPtr strGuard(pStr, DecRefFn());
dst[i] = py::reinterpret_borrow(pStr);