From 3a0b958586e93119b64de5a67b540d3f96219dba Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:05:48 -0800 Subject: [PATCH 1/4] add 2 CMake build options of Dawn (#23096) ### Description This change adds the following CMake build options for Dawn: - onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY - OFF by default - when enabled, builds Dawn as a monolithic library (webgpu_dawn.dll) - onnxruntime_ENABLE_DAWN_BACKEND_VULKAN - OFF by default - when enabled, builds Dawn with the Vulkan backend on Windows - onnxruntime_ENABLE_DAWN_BACKEND_D3D12 - ON by default - when enabled, builds Dawn with the DirectX 12 backend on Windows ### File Size Comparison (Windows) | Build | cmdline | File Size | |---|---|---| | Baseline | --config Release
--build_shared_lib | `12,755,456 onnxruntime.dll` | | WebGPU D3D12 (default) | --use_webgpu
--config Release
--build_shared_lib | `17,082,368 dxcompiler.dll`
` 1,508,472 dxil.dll`
`18,708,480 onnxruntime.dll` | | WebGPU D3D12+Vulkan | --use_webgpu
--config Release
--build_shared_lib
--cmake_extra_defines
onnxruntime_ENABLE_DAWN_BACKEND_D3D12=1
onnxruntime_ENABLE_DAWN_BACKEND_VULKAN=1 | `17,081,344 dxcompiler.dll`
` 1,508,472 dxil.dll`
`19,388,416 onnxruntime.dll` | | WebGPU Vulkan | --use_webgpu
--config Release
--build_shared_lib
--cmake_extra_defines
onnxruntime_ENABLE_DAWN_BACKEND_D3D12=0
onnxruntime_ENABLE_DAWN_BACKEND_VULKAN=1 | `17,615,872 onnxruntime.dll` | | Monolithic | --use_webgpu
--config Release
--build_shared_lib
--cmake_extra_defines
onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY=1 | `17,082,368 dxcompiler.dll`
` 1,508,472 dxil.dll`
`13,277,696 onnxruntime.dll`
` 5,616,640 webgpu_dawn.dll` | | External Dawn | --use_webgpu
--config Release
--build_shared_lib
--cmake_extra_defines
onnxruntime_USE_EXTERNAL_DAWN=1
--skip_tests | `17,081,344 dxcompiler.dll`
` 1,508,472 dxil.dll`
`13,277,184 onnxruntime.dll` --- cmake/CMakeLists.txt | 13 ++++++ .../external/onnxruntime_external_deps.cmake | 41 +++++++++++++++---- cmake/onnxruntime_providers_webgpu.cmake | 22 ++++++++-- .../core/providers/webgpu/webgpu_context.cc | 8 ++-- .../webgpu/webgpu_execution_provider.h | 2 + .../webgpu/webgpu_provider_factory.cc | 20 +++++++++ .../webgpu/webgpu_provider_options.h | 5 +++ 7 files changed, 97 insertions(+), 14 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 7710ab2f4cac7..d2fe7e7457983 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -149,6 +149,10 @@ option(onnxruntime_USE_WEBNN "Build with WebNN support. Enable hardware accelera option(onnxruntime_USE_WEBGPU "Build with WebGPU support. Enable WebGPU via C/C++ interface." OFF) option(onnxruntime_USE_EXTERNAL_DAWN "Build with treating Dawn as external dependency. Will not link Dawn at build time." OFF) option(onnxruntime_CUSTOM_DAWN_SRC_PATH "Path to custom Dawn src dir.") +option(onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY "Build Dawn as a monolithic library" OFF) +# The following 2 options are only for Windows +option(onnxruntime_ENABLE_DAWN_BACKEND_VULKAN "Enable Vulkan backend for Dawn (on Windows)" OFF) +option(onnxruntime_ENABLE_DAWN_BACKEND_D3D12 "Enable D3D12 backend for Dawn (on Windows)" ON) # Options related to reducing the binary size produced by the build # XNNPACK EP requires the internal NHWC contrib ops to be available, so this option must be OFF when onnxruntime_USE_XNNPACK is ON @@ -955,9 +959,18 @@ if (onnxruntime_USE_WEBGPU) list(APPEND ORT_PROVIDER_FLAGS -DUSE_WEBGPU=1) list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_WEBGPU=1) list(APPEND ONNXRUNTIME_PROVIDER_NAMES webgpu) + if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) + list(APPEND ORT_PROVIDER_FLAGS -DBUILD_DAWN_MONOLITHIC_LIBRARY=1) + endif() if (onnxruntime_USE_EXTERNAL_DAWN) list(APPEND ORT_PROVIDER_FLAGS -DUSE_EXTERNAL_DAWN=1) endif() + if 
(onnxruntime_ENABLE_DAWN_BACKEND_VULKAN) + list(APPEND ORT_PROVIDER_FLAGS -DDAWN_ENABLE_VULKAN=1) + endif() + if (onnxruntime_ENABLE_DAWN_BACKEND_D3D12) + list(APPEND ORT_PROVIDER_FLAGS -DDAWN_ENABLE_D3D12=1) + endif() endif() if (onnxruntime_USE_CANN) list(APPEND ORT_PROVIDER_FLAGS -DUSE_CANN=1) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index ee7abcbad025c..aeaaa7b51d595 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -635,10 +635,19 @@ if (onnxruntime_USE_WEBGPU) ) endif() - # use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size - set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE) + if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) + set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE BOOL "" FORCE) + set(DAWN_ENABLE_INSTALL ON CACHE BOOL "" FORCE) + + if (onnxruntime_USE_EXTERNAL_DAWN) + message(FATAL_ERROR "onnxruntime_USE_EXTERNAL_DAWN and onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY cannot be enabled at the same time.") + endif() + else() + # use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size + set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE) + set(DAWN_ENABLE_INSTALL OFF CACHE BOOL "" FORCE) + endif() set(DAWN_BUILD_SAMPLES OFF CACHE BOOL "" FORCE) - set(DAWN_ENABLE_INSTALL OFF CACHE BOOL "" FORCE) set(DAWN_ENABLE_NULL OFF CACHE BOOL "" FORCE) set(DAWN_FETCH_DEPENDENCIES ON CACHE BOOL "" FORCE) @@ -667,18 +676,34 @@ if (onnxruntime_USE_WEBGPU) set(DAWN_USE_BUILT_DXC ON CACHE BOOL "" FORCE) set(TINT_BUILD_HLSL_WRITER ON CACHE BOOL "" FORCE) - # Vulkan may optionally be included in a Windows build. Exclude until we have an explicit use case that requires it. 
- set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE) + if ((NOT onnxruntime_ENABLE_DAWN_BACKEND_VULKAN) AND (NOT onnxruntime_ENABLE_DAWN_BACKEND_D3D12)) + message(FATAL_ERROR "At least one of onnxruntime_ENABLE_DAWN_BACKEND_VULKAN or onnxruntime_ENABLE_DAWN_BACKEND_D3D12 must be enabled when using Dawn on Windows.") + endif() + if (onnxruntime_ENABLE_DAWN_BACKEND_VULKAN) + set(DAWN_ENABLE_VULKAN ON CACHE BOOL "" FORCE) + set(TINT_BUILD_SPV_WRITER ON CACHE BOOL "" FORCE) + else() + set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE) + endif() + if (onnxruntime_ENABLE_DAWN_BACKEND_D3D12) + set(DAWN_ENABLE_D3D12 ON CACHE BOOL "" FORCE) + else() + set(DAWN_ENABLE_D3D12 OFF CACHE BOOL "" FORCE) + endif() # We are currently always using the D3D12 backend. set(DAWN_ENABLE_D3D11 OFF CACHE BOOL "" FORCE) endif() onnxruntime_fetchcontent_makeavailable(dawn) - if (NOT onnxruntime_USE_EXTERNAL_DAWN) - list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_native) + if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::webgpu_dawn) + else() + if (NOT onnxruntime_USE_EXTERNAL_DAWN) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_native) + endif() + list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_proc) endif() - list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_proc) endif() set(onnxruntime_LINK_DIRS) diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake index 02c2a5aee481c..fea5964f0dda9 100644 --- a/cmake/onnxruntime_providers_webgpu.cmake +++ b/cmake/onnxruntime_providers_webgpu.cmake @@ -22,9 +22,25 @@ onnxruntime_add_static_library(onnxruntime_providers_webgpu ${onnxruntime_providers_webgpu_cc_srcs}) onnxruntime_add_include_to_target(onnxruntime_providers_webgpu onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface) - if (NOT onnxruntime_USE_EXTERNAL_DAWN) - 
target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native) + + if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) + target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn) + + if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS) + list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll") + endif() + + # Copy webgpu_dawn.dll to the output directory + add_custom_command( + TARGET onnxruntime_providers_webgpu + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different "$" "$" + VERBATIM ) + else() + if (NOT onnxruntime_USE_EXTERNAL_DAWN) + target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native) + endif() + target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc) endif() - target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc) set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime") diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index ea0cbddb0205d..d66c2a79d28a8 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -28,6 +28,9 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info // Initialization.Step.1 - Create wgpu::Instance if (instance_ == nullptr) { const DawnProcTable* dawn_procs = reinterpret_cast(dawn_proc_table); +#if defined(BUILD_DAWN_MONOLITHIC_LIBRARY) + ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn."); +#else #if !defined(USE_EXTERNAL_DAWN) if (dawn_procs == nullptr) { dawn_procs = &dawn::native::GetProcs(); @@ -36,6 +39,7 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided."); #endif dawnProcSetProcs(dawn_procs); +#endif wgpu::InstanceDescriptor instance_desc{}; instance_desc.features.timedWaitAnyEnable = true; @@ -49,9 +53,7 @@ 
void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info wgpu::RequestAdapterOptions req_adapter_options = {}; wgpu::DawnTogglesDescriptor adapter_toggles_desc = {}; req_adapter_options.nextInChain = &adapter_toggles_desc; -#ifdef _WIN32 - req_adapter_options.backendType = wgpu::BackendType::D3D12; -#endif + req_adapter_options.backendType = static_cast(webgpu_ep_info.backend_type); req_adapter_options.powerPreference = wgpu::PowerPreference::HighPerformance; auto enabled_adapter_toggles = GetEnabledAdapterToggles(); diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h index 336395a1dd0dd..f9c43c6bfd7d0 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h @@ -26,6 +26,7 @@ struct WebGpuExecutionProviderInfo { WebGpuExecutionProviderInfo(DataLayout data_layout, bool enable_graph_capture) : data_layout{data_layout}, enable_graph_capture{enable_graph_capture}, + backend_type{}, storage_buffer_cache_mode{}, uniform_buffer_cache_mode{}, query_resolve_buffer_cache_mode{}, @@ -36,6 +37,7 @@ struct WebGpuExecutionProviderInfo { DataLayout data_layout; bool enable_graph_capture; + int backend_type; webgpu::BufferCacheMode storage_buffer_cache_mode; webgpu::BufferCacheMode uniform_buffer_cache_mode; webgpu::BufferCacheMode query_resolve_buffer_cache_mode; diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc index 6115464cefa6d..6cfe9aac0b0e9 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc @@ -67,6 +67,26 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( } LOGS_DEFAULT(VERBOSE) << "WebGPU EP graph capture enable: " << webgpu_ep_info.enable_graph_capture; + std::string backend_type_str; + if 
(config_options.TryGetConfigEntry(kDawnBackendType, backend_type_str)) { +#ifdef _WIN32 + // Setup Windows default backend type based on the build configuration +#if defined(onnxruntime_ENABLE_DAWN_BACKEND_D3D12) + webgpu_ep_info.backend_type = static_cast(WGPUBackendType_D3D12); +#elif defined(onnxruntime_ENABLE_DAWN_BACKEND_VULKAN) + webgpu_ep_info.backend_type = static_cast(WGPUBackendType_Vulkan); +#endif +#endif + if (backend_type_str == kDawnBackendType_D3D12) { + webgpu_ep_info.backend_type = static_cast(WGPUBackendType_D3D12); + } else if (backend_type_str == kDawnBackendType_Vulkan) { + webgpu_ep_info.backend_type = static_cast(WGPUBackendType_Vulkan); + } else { + ORT_THROW("Invalid Dawn backend type: ", backend_type_str); + } + } + LOGS_DEFAULT(VERBOSE) << "WebGPU EP Dawn backend type: " << webgpu_ep_info.backend_type; + auto parse_buffer_cache_mode = [&config_options](const std::string& config_entry_str, webgpu::BufferCacheMode default_value) -> webgpu::BufferCacheMode { std::string buffer_cache_mode_str; diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_options.h b/onnxruntime/core/providers/webgpu/webgpu_provider_options.h index 63befedffea84..12bb4b32e6a35 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_provider_options.h +++ b/onnxruntime/core/providers/webgpu/webgpu_provider_options.h @@ -14,6 +14,8 @@ constexpr const char* kEnableGraphCapture = "WebGPU:enableGraphCapture"; constexpr const char* kDawnProcTable = "WebGPU:dawnProcTable"; +constexpr const char* kDawnBackendType = "WebGPU:dawnBackendType"; + constexpr const char* kDeviceId = "WebGPU:deviceId"; constexpr const char* kWebGpuInstance = "WebGPU:webgpuInstance"; constexpr const char* kWebGpuAdapter = "WebGPU:webgpuAdapter"; @@ -30,6 +32,9 @@ constexpr const char* kForceCpuNodeNames = "WebGPU:forceCpuNodeNames"; // The following are the possible values for the provider options. 
+constexpr const char* kDawnBackendType_D3D12 = "D3D12"; +constexpr const char* kDawnBackendType_Vulkan = "Vulkan"; + constexpr const char* kPreferredLayout_NCHW = "NCHW"; constexpr const char* kPreferredLayout_NHWC = "NHWC"; From 2ff66b80e0e075696e34c78ab59b351bc8590d56 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Mon, 16 Dec 2024 09:05:12 -0800 Subject: [PATCH 2/4] Fix a deadlock bug in EigenNonBlockingThreadPool.h (#23098) ### Description This PR fixes a deadlock bug in EigenNonBlockingThreadPool.h. It only happens on platforms with weakly ordered memory model, such as ARM64. --- .../platform/EigenNonBlockingThreadPool.h | 124 ++++++++++-------- 1 file changed, 66 insertions(+), 58 deletions(-) diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h index 27b14f008a8ba..a7c63c507d1ba 100644 --- a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h +++ b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h @@ -1467,11 +1467,14 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter status = ThreadStatus::Spinning; } - void SetBlocked(std::function should_block, + bool SetBlocked(std::function should_block, std::function post_block) { std::unique_lock lk(mutex); - assert(GetStatus() == ThreadStatus::Spinning); - status.store(ThreadStatus::Blocking, std::memory_order_relaxed); + auto old_status = status.exchange(ThreadStatus::Blocking, std::memory_order_seq_cst); + if (old_status != ThreadStatus::Spinning) { + // Encountered a logical error + return false; + } if (should_block()) { status.store(ThreadStatus::Blocked, std::memory_order_relaxed); do { @@ -1480,6 +1483,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter post_block(); } status.store(ThreadStatus::Spinning, std::memory_order_relaxed); + return true; } private: @@ -1558,62 +1562,66 @@ class ThreadPoolTempl : public 
onnxruntime::concurrency::ExtendedThreadPoolInter // Attempt to block if (!t) { - td.SetBlocked( // Pre-block test - [&]() -> bool { - bool should_block = true; - // Check whether work was pushed to us while attempting to block. We make - // this test while holding the per-thread status lock, and after setting - // our status to ThreadStatus::Blocking. - // - // This synchronizes with ThreadPool::Schedule which pushes work to the queue - // and then tests for ThreadStatus::Blocking/Blocked (via EnsureAwake): - // - // Main thread: Worker: - // #1 Push work #A Set status blocking - // #2 Read worker status #B Check queue - // #3 Wake if blocking/blocked - // - // If #A is before #2 then main sees worker blocked and wakes - // - // If #A if after #2 then #B will see #1, and we abandon blocking - assert(!t); - t = q.PopFront(); - if (t) { - should_block = false; - } - - // No work pushed to us, continue attempting to block. The remaining - // test is to synchronize with termination requests. If we are - // shutting down and all worker threads blocked without work, that's - // we are done. - if (should_block) { - blocked_++; - if (done_ && blocked_ == num_threads_) { - should_block = false; - // Almost done, but need to re-check queues. - // Consider that all queues are empty and all worker threads are preempted - // right after incrementing blocked_ above. Now a free-standing thread - // submits work and calls destructor (which sets done_). If we don't - // re-check queues, we will exit leaving the work unexecuted. - if (NonEmptyQueueIndex() != -1) { - // Note: we must not pop from queues before we decrement blocked_, - // otherwise the following scenario is possible. Consider that instead - // of checking for emptiness we popped the only element from queues. - // Now other worker threads can start exiting, which is bad if the - // work item submits other work. So we just check emptiness here, - // which ensures that all worker threads exit at the same time. 
- blocked_--; - } else { - should_exit = true; + if (!td.SetBlocked( // Pre-block test + [&]() -> bool { + bool should_block = true; + // Check whether work was pushed to us while attempting to block. We make + // this test while holding the per-thread status lock, and after setting + // our status to ThreadStatus::Blocking. + // + // This synchronizes with ThreadPool::Schedule which pushes work to the queue + // and then tests for ThreadStatus::Blocking/Blocked (via EnsureAwake): + // + // Main thread: Worker: + // #1 Push work #A Set status blocking + // #2 Read worker status #B Check queue + // #3 Wake if blocking/blocked + // + // If #A is before #2 then main sees worker blocked and wakes + // + // If #A if after #2 then #B will see #1, and we abandon blocking + assert(!t); + t = q.PopFront(); + if (t) { + should_block = false; + } + + // No work pushed to us, continue attempting to block. The remaining + // test is to synchronize with termination requests. If we are + // shutting down and all worker threads blocked without work, that's + // we are done. + if (should_block) { + blocked_++; + if (done_ && blocked_ == num_threads_) { + should_block = false; + // Almost done, but need to re-check queues. + // Consider that all queues are empty and all worker threads are preempted + // right after incrementing blocked_ above. Now a free-standing thread + // submits work and calls destructor (which sets done_). If we don't + // re-check queues, we will exit leaving the work unexecuted. + if (NonEmptyQueueIndex() != -1) { + // Note: we must not pop from queues before we decrement blocked_, + // otherwise the following scenario is possible. Consider that instead + // of checking for emptiness we popped the only element from queues. + // Now other worker threads can start exiting, which is bad if the + // work item submits other work. So we just check emptiness here, + // which ensures that all worker threads exit at the same time. 
+ blocked_--; + } else { + should_exit = true; + } + } } - } - } - return should_block; - }, - // Post-block update (executed only if we blocked) - [&]() { - blocked_--; - }); + return should_block; + }, + // Post-block update (executed only if we blocked) + [&]() { + blocked_--; + })) { + // Encountered a fatal logic error in SetBlocked + should_exit = true; + break; + } // Thread just unblocked. Unless we picked up work while // blocking, or are exiting, then either work was pushed to // us, or it was pushed to an overloaded queue From a4eb8f27b6e51dec41f943b614702dd114731e13 Mon Sep 17 00:00:00 2001 From: tianf-fff <80665242+tianfang-fafafa@users.noreply.github.com> Date: Mon, 16 Dec 2024 11:09:48 -0600 Subject: [PATCH 3/4] [VitisAI] Add profiler interface for vitisai (#23032) ### Description Add common interfaces for vitis ep profiler. ### Motivation and Context Vitis ep can collect and record api and kernel timestamps in file when onnxruntime '-p' is enabled. --- .../core/providers/vitisai/imp/global_api.cc | 12 +++++ .../vitisai/include/vaip/global_api.h | 15 ++++++ .../vitisai/vitisai_execution_provider.cc | 5 ++ .../vitisai/vitisai_execution_provider.h | 2 + .../providers/vitisai/vitisai_profiler.cc | 49 +++++++++++++++++++ .../core/providers/vitisai/vitisai_profiler.h | 23 +++++++++ 6 files changed, 106 insertions(+) create mode 100644 onnxruntime/core/providers/vitisai/vitisai_profiler.cc create mode 100644 onnxruntime/core/providers/vitisai/vitisai_profiler.h diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index 51dc79c569589..cccaa65de45f2 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -58,6 +58,9 @@ struct OrtVitisAIEpAPI { const std::vector>& eps, const char* const* keys, const char* const* values, size_t kv_len) = nullptr; + void (*profiler_collect)( + std::vector& api_events, + std::vector& 
kernel_events); void Ensure() { if (handle_) return; @@ -81,6 +84,7 @@ struct OrtVitisAIEpAPI { } std::ignore = env.GetSymbolFromLibrary(handle_, "vaip_get_version", (void**)&vaip_get_version); + std::ignore = env.GetSymbolFromLibrary(handle_, "profiler_collect", (void**)&profiler_collect); ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "create_ep_context_nodes", (void**)&create_ep_context_nodes)); ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_on_run_start", (void**)&vitisai_ep_on_run_start)); ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_set_ep_dynamic_options", (void**)&vitisai_ep_set_ep_dynamic_options)); @@ -97,6 +101,14 @@ static vaip_core::OrtApiForVaip the_global_api; std::shared_ptr get_kernel_registry_vitisaiep() { return s_kernel_registry_vitisaiep; } const std::vector& get_domains_vitisaiep() { return s_domains_vitisaiep; } +void profiler_collect( + std::vector& api_events, + std::vector& kernel_events) { + if (s_library_vitisaiep.profiler_collect) { + s_library_vitisaiep.profiler_collect(api_events, kernel_events); + } +} + vaip_core::DllSafe>> compile_onnx_model( const onnxruntime::GraphViewer& graph_viewer, const logging::Logger& logger, const ProviderOptions& options) { auto model_path = graph_viewer.ModelPath().string(); diff --git a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h index b0353bd6adae9..704b156dff57f 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h @@ -24,3 +24,18 @@ int vitisai_ep_set_ep_dynamic_options( const std::vector>& eps, const char* const* keys, const char* const* values, size_t kv_len); +/** + * Replace EventRecord with std::tuple, + * because EventRecord is defined in profiler_common.h which is used inside onnxruntime. + * However, profiler_collect function will call vitis ep which can't include profiler_common.h. 
+ */ +using EventInfo = std::tuple< + std::string, // name + int, // pid + int, // tid + long long, // timestamp + long long // duration + >; +void profiler_collect( + std::vector& api_events, + std::vector& kernel_events); diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 023a954c83d70..3a99f56bb732a 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -1,6 +1,7 @@ // Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. // Licensed under the MIT License. #include "vitisai_execution_provider.h" +#include "vitisai_profiler.h" // Standard headers/libs. #include @@ -135,4 +136,8 @@ common::Status VitisAIExecutionProvider::SetEpDynamicOptions(gsl::span VitisAIExecutionProvider::GetProfiler() { + return std::make_unique(); +} } // namespace onnxruntime diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h index 77dede6035b4c..f0d1a289a2a73 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h @@ -36,6 +36,8 @@ class VitisAIExecutionProvider : public IExecutionProvider { std::vector& node_compute_funcs) override; std::shared_ptr GetKernelRegistry() const override; + std::unique_ptr GetProfiler() override; + // This method is called after both `GetComputeCapabilityOps()` and `Compile()`. // This timing is required to work with both compliation-based EPs and non-compilation-based EPs. 
const InlinedVector GetEpContextNodes() const override; diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.cc b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc new file mode 100644 index 0000000000000..d84507ec6ad02 --- /dev/null +++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Licensed under the MIT License. + +#include "vitisai_profiler.h" + +namespace onnxruntime { +namespace profiling { + +#if defined(USE_VITISAI) + +bool VitisaiProfiler::StartProfiling(TimePoint tp) { + return true; +} + +void VitisaiProfiler::EndProfiling(TimePoint tp, Events& events) { + auto time_point = + std::chrono::duration_cast(tp.time_since_epoch()).count(); + + std::vector api_events; + std::vector kernel_events; + profiler_collect(api_events, kernel_events); + + std::unordered_map event_args; + + for (auto& a : api_events) { + events.emplace_back(EventCategory::API_EVENT, + std::get<1>(a), // pid + std::get<2>(a), // tid + std::get<0>(a), // name + std::get<3>(a) - time_point, // timestamp + std::get<4>(a), // duration + event_args); + } + + for (auto& k : kernel_events) { + events.emplace_back(EventCategory::KERNEL_EVENT, + std::get<1>(k), + std::get<2>(k), + std::get<0>(k), + std::get<3>(k) - time_point, + std::get<4>(k), + event_args); + } +} + +#endif + +} // namespace profiling +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/vitisai/vitisai_profiler.h b/onnxruntime/core/providers/vitisai/vitisai_profiler.h new file mode 100644 index 0000000000000..aedbda31f7b1d --- /dev/null +++ b/onnxruntime/core/providers/vitisai/vitisai_profiler.h @@ -0,0 +1,23 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/vitisai/include/vaip/global_api.h" + +namespace onnxruntime { +namespace profiling { + +#if defined(USE_VITISAI) +class VitisaiProfiler final : public EpProfiler { + public: + VitisaiProfiler() = default; + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(VitisaiProfiler); + ~VitisaiProfiler() {} + bool StartProfiling(TimePoint) override; + void EndProfiling(TimePoint, Events&) override; + void Start(uint64_t) override{}; + void Stop(uint64_t) override{}; +}; +#endif + +} // namespace profiling +} // namespace onnxruntime From ae970681372e20c3df3f1b40bfb0ee06a02c39c8 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 16 Dec 2024 10:38:23 -0800 Subject: [PATCH 4/4] Fix Pybind memory leak (#23105) ### Description Array GETITEM returns new reference which is a leak ### Motivation and Context Address https://github.com/microsoft/onnxruntime/issues/22271 --- onnxruntime/python/onnxruntime_pybind_mlvalue.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index 92396bb09bd4c..5742b4db42512 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -280,7 +280,7 @@ void DmlToCpuMemCpy(void* dst, const void* src, size_t num_bytes) { uint32_t readback_heap_size = gsl::narrow_cast(sizeof(readback_heap)); ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(dml_readback_heap_guid, &readback_heap_size, &readback_heap)); - // ReadbackFromGpu already syncs with the CPU and waits for the copy to be completed, so we don't need to sync after + // ReadbackFromGpu already syncs with the CPU and waits for the copy to be completed, so we dont need to sync after // this call readback_heap->ReadbackFromGpu( gsl::make_span(static_cast(dst), num_bytes), @@ -428,7 +428,7 @@ MLDataType NumpyTypeToOnnxRuntimeTensorType(int numpy_type) { // Special, not a C type expands to enum value of 16 
{NPY_FLOAT16, DataTypeImpl::GetType()}, {NPY_DOUBLE, DataTypeImpl::GetType()}, - // We don't want to use size specific types such + // We dont want to use size specific types such // as NPY_INT32 bc they are not enums but hash defines // which may map into other enums and may conflict with other entries here // also NPY docs define these sizes as platform specific, thus we @@ -581,6 +581,7 @@ static void CopyDataToTensor(PyArrayObject* darray, int npy_type, Tensor& tensor for (int i = 0; i < total_items; ++i, src += item_size) { // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8. PyObject* item = PyArray_GETITEM(darray, src); + UniqueDecRefPtr itemGuard(item, DecRefFn()); PyObject* pStr = PyObject_Str(item); UniqueDecRefPtr strGuard(pStr, DecRefFn()); dst[i] = py::reinterpret_borrow(pStr);