From e7b0074127c08947aeb9e41f7684f47f2be56f3c Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Mon, 25 Mar 2024 13:53:18 -0700 Subject: [PATCH 1/3] [DML] Hide command list reset latency with multiple threads --- .../src/CommandAllocatorRing.h | 74 ---------------- .../src/DmlCommandRecorder.cpp | 86 +++++++++++++++---- .../src/DmlCommandRecorder.h | 37 ++++++-- 3 files changed, 95 insertions(+), 102 deletions(-) delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandAllocatorRing.h diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandAllocatorRing.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandAllocatorRing.h deleted file mode 100644 index 2eee9c9a9e5a3..0000000000000 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandAllocatorRing.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "GpuEvent.h" - -namespace Dml -{ - // A fixed-size ring of command allocators. Each time an allocator is retrieved, the allocator will - // be reset if its previously recorded commands have finished executing on the GPU. - template - class CommandAllocatorRing - { - public: - CommandAllocatorRing( - ID3D12Device* device, - D3D12_COMMAND_LIST_TYPE commandListType, - GpuEvent initialEvent) - { - for (auto& info : m_commandAllocators) - { - ORT_THROW_IF_FAILED(device->CreateCommandAllocator( - commandListType, - IID_GRAPHICS_PPV_ARGS(info.allocator.ReleaseAndGetAddressOf()))); - - info.completionEvent = initialEvent; - } - } - - ID3D12CommandAllocator* GetNextAllocator(GpuEvent nextCompletionEvent) - { - size_t earliestOtherAllocator = (m_currentCommandAllocator + 1) % AllocatorCount; - - assert(!m_commandAllocators[m_currentCommandAllocator].completionEvent.IsSignaled() || - m_commandAllocators[earliestOtherAllocator].completionEvent.IsSignaled()); - - if (m_commandAllocators[earliestOtherAllocator].completionEvent.IsSignaled()) - { - ORT_THROW_IF_FAILED(m_commandAllocators[earliestOtherAllocator].Get()->Reset()); - m_currentCommandAllocator = earliestOtherAllocator; - } - - // Set the completion event for the current allocator so it can be reset eventually. - m_commandAllocators[m_currentCommandAllocator].completionEvent = nextCompletionEvent; - - return m_commandAllocators[m_currentCommandAllocator].Get(); - } - - // Updates the completion event of the current allocator to a different value. This is used when the caller - // decides to issue an unrelated call to the queue such as ExecuteCommandLists which updates its fence between calling - // GetNextAllocator and executing the work which it recorded using the allocator it received. - void UpdateCurrentAllocatorCompletionEvent(GpuEvent nextCompletionEvent) - { - m_commandAllocators[m_currentCommandAllocator].completionEvent = nextCompletionEvent; - } - - private: - struct CommandAllocatorInfo - { - ComPtr allocator; - - // The event which will be signaled when the last command list submitted using this allocator - // completes execution on the GPU. - GpuEvent completionEvent = {}; - - ID3D12CommandAllocator* Get() const { return allocator.Get(); } - }; - - std::array m_commandAllocators; - size_t m_currentCommandAllocator = 0; - - }; -} \ No newline at end of file diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index 5254b23f56376..5a5d769fd9995 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -15,13 +15,24 @@ DmlCommandRecorder::DmlCommandRecorder( : m_queue(std::move(commandQueue)), m_d3dDevice(d3dDevice), m_dmlDevice(dmlDevice), - m_descriptorPool(d3dDevice, 2048), - m_commandAllocatorRing(d3dDevice, m_queue->GetType(), m_queue->GetCurrentCompletionEvent()) + m_descriptorPool(d3dDevice, 2048) { ORT_THROW_IF_FAILED(dmlDevice->CreateOperatorInitializer(0, nullptr, IID_PPV_ARGS(&m_initializer))); ORT_THROW_IF_FAILED(dmlDevice->CreateCommandRecorder(IID_PPV_ARGS(&m_recorder))); } +DmlCommandRecorder::~DmlCommandRecorder() +{ + // Detach the threads to avoid crashes when terminating the program + for (auto& resetThread : m_resetThreads) + { + if (resetThread) + { + resetThread->detach(); + } + } +} + void DmlCommandRecorder::SetAllocator(std::weak_ptr allocator) { m_bufferAllocator = allocator; @@ -263,7 +274,7 @@ void DmlCommandRecorder::ExecuteCommandList( gsl::span(reinterpret_cast(&commandList), 1)); // The fence value at which the current command allocator may be re-used will now be higher - m_commandAllocatorRing.UpdateCurrentAllocatorCompletionEvent(m_queue->GetNextCompletionEvent()); + m_allocatorRing.back().completionEvent = m_queue->GetNextCompletionEvent(); // Fail early if something horrifying happens ORT_THROW_IF_FAILED(m_dmlDevice->GetDeviceRemovedReason()); @@ -313,23 +324,62 @@ void DmlCommandRecorder::Open() { assert(m_currentDescriptorHeap == nullptr); - ID3D12CommandAllocator* allocator = m_commandAllocatorRing.GetNextAllocator(m_queue->GetNextCompletionEvent()); - - if (!m_cachedCommandList) + if (m_currentCommandList) { - ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandList( - 0, - m_queue->GetType(), - allocator, - nullptr, - IID_GRAPHICS_PPV_ARGS(m_currentCommandList.ReleaseAndGetAddressOf()))); + if (m_resetThreads.front()) + { + m_resetThreads.front()->join(); + } + + // Rotate the reset threads to the left + for (uint32_t i = 0; i < m_resetThreads.size() - 1; ++i) { + m_resetThreads[i] = std::move(m_resetThreads[i + 1]); + } + + // Rotate the allocators to the left + auto firstAllocator = std::move(m_allocatorRing.front()); + for (uint32_t i = 0; i < m_allocatorRing.size() - 1; ++i) + { + m_allocatorRing[i] = std::move(m_allocatorRing[i + 1]); + } + m_allocatorRing.back() = std::move(firstAllocator); + + // Rotate the command lists to the left + auto firstCommandList = std::move(m_commandListRing.front()); + for (uint32_t i = 0; i < m_commandListRing.size() - 1; ++i) + { + m_commandListRing[i] = std::move(m_commandListRing[i + 1]); + } + m_commandListRing.back() = std::move(firstCommandList); + + // The newest dirty allocator is now located before the last element in the ring buffer, so start resetting it + m_resetThreads.back() = std::thread([cachedAllocator = m_allocatorRing[m_allocatorRing.size() - 2], cachedCommandList = m_commandListRing[m_commandListRing.size() - 2]]() { + cachedAllocator.completionEvent.WaitForSignal(); + ORT_THROW_IF_FAILED(cachedAllocator.allocator->Reset()); + ORT_THROW_IF_FAILED(cachedCommandList->Reset(cachedAllocator.allocator.Get(), nullptr)); + }); } else { - m_currentCommandList = m_cachedCommandList; - m_cachedCommandList = nullptr; - ORT_THROW_IF_FAILED(m_currentCommandList->Reset(allocator, nullptr)); + assert(m_commandListRing.size() == m_allocatorRing.size()); + + for (uint32_t i = 0; i < m_commandListRing.size(); ++i) + { + ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandAllocator( + m_queue->GetType(), + IID_GRAPHICS_PPV_ARGS(m_allocatorRing[i].allocator.ReleaseAndGetAddressOf()))); + + ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandList( + 0, + m_queue->GetType(), + m_allocatorRing[i].allocator.Get(), + nullptr, + IID_GRAPHICS_PPV_ARGS(m_commandListRing[i].ReleaseAndGetAddressOf()))); + } } + + m_currentCommandList = m_commandListRing.back(); + m_allocatorRing.back().completionEvent = m_queue->GetNextCompletionEvent(); } void DmlCommandRecorder::CloseAndExecute() @@ -338,7 +388,7 @@ void DmlCommandRecorder::CloseAndExecute() } void DmlCommandRecorder::CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* commandList) -{ +{ ORT_THROW_IF_FAILED(m_currentCommandList->Close()); ID3D12GraphicsCommandList* commandListsToExecute[2] = {}; @@ -359,9 +409,7 @@ void DmlCommandRecorder::CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* com m_queue->ExecuteCommandLists( gsl::span(reinterpret_cast(commandListsToExecute), commandListsToExecuteCount)); } - - m_cachedCommandList = m_currentCommandList; - m_currentCommandList = nullptr; + m_operationsRecordedInCurrentCommandList = false; // The descriptor heap must be set on the command list the next time it's opened. diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h index 83051c8ca4ff9..c25bd53ba9440 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h @@ -4,7 +4,6 @@ #pragma once #include "ICommandRecorder.h" -#include "CommandAllocatorRing.h" namespace Dml { @@ -16,9 +15,11 @@ namespace Dml public: DmlCommandRecorder( ID3D12Device* d3dDevice, - IDMLDevice* device, + IDMLDevice* device, std::shared_ptr commandQueue); + ~DmlCommandRecorder(); + void InitializeOperator( IDMLCompiledOperator* op, const DML_BINDING_DESC& persistentResourceBinding, @@ -47,13 +48,13 @@ namespace Dml _Out_ uint64_t* completionValue); ComPtr GetCommandList(); - + void ResourceBarrier(gsl::span barriers); void AddUAVBarrier(); void Open() final; void CloseAndExecute() final; - + void SetAllocator(std::weak_ptr allocator); bool HasUnsubmittedWork() override @@ -68,8 +69,19 @@ namespace Dml } private: + struct CommandAllocatorInfo + { + ComPtr allocator; + + // The event which will be signaled when the last command list submitted using this allocator + // completes execution on the GPU. + GpuEvent completionEvent = {}; + + ID3D12CommandAllocator* Get() const { return allocator.Get(); } + }; + void CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* commandList); - + std::shared_ptr m_queue; ComPtr m_d3dDevice; ComPtr m_dmlDevice; @@ -84,14 +96,21 @@ namespace Dml // The weak pointer avoids a circular reference from context->recorder->allocator->context std::weak_ptr m_bufferAllocator; - CommandAllocatorRing<2> m_commandAllocatorRing; - // The command list currently being recorded into, and whether any command have been recorded yet. ComPtr m_currentCommandList; bool m_operationsRecordedInCurrentCommandList = false; - // A cached command list which may be re-used. - ComPtr m_cachedCommandList; + static constexpr int commandListCount = 3; + + // We use enough command lists and allocators to allow command lists to be reset in a different thread while + // there is another command list ready to receive commands. When we execute and close a command list, we start + // the resetting process on a different thread and set m_currentCommandList to the next available one. + std::array, commandListCount> m_commandListRing; + std::array m_allocatorRing; + + // We should always have 1 less reset thread than command lists since we always need a clean command list, but + // the other ones can all be in the process of getting reset + std::array, commandListCount - 1> m_resetThreads; void SetDescriptorHeap(ID3D12DescriptorHeap* descriptorHeap); }; From 6c4a3d526eeae33230d6a3502448a6ef650bf91d Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 4 Apr 2024 21:15:54 -0700 Subject: [PATCH 2/3] Use thread pool --- .../src/DmlCommandRecorder.cpp | 103 +++++++----------- .../src/DmlCommandRecorder.h | 20 ++-- 2 files changed, 48 insertions(+), 75 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index 5a5d769fd9995..a3829ecc9b2f1 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -19,18 +19,13 @@ DmlCommandRecorder::DmlCommandRecorder( { ORT_THROW_IF_FAILED(dmlDevice->CreateOperatorInitializer(0, nullptr, IID_PPV_ARGS(&m_initializer))); ORT_THROW_IF_FAILED(dmlDevice->CreateCommandRecorder(IID_PPV_ARGS(&m_recorder))); -} -DmlCommandRecorder::~DmlCommandRecorder() -{ - // Detach the threads to avoid crashes when terminating the program - for (auto& resetThread : m_resetThreads) - { - if (resetThread) - { - resetThread->detach(); - } - } + m_threadPool = std::make_unique( + &onnxruntime::Env::Default(), + onnxruntime::ThreadOptions(), + ORT_TSTR("CommandListPool"), + threadPoolSize, + true); } void DmlCommandRecorder::SetAllocator(std::weak_ptr allocator) @@ -274,7 +269,7 @@ void DmlCommandRecorder::ExecuteCommandList( gsl::span(reinterpret_cast(&commandList), 1)); // The fence value at which the current command allocator may be re-used will now be higher - m_allocatorRing.back().completionEvent = m_queue->GetNextCompletionEvent(); + m_currentCommandListInfo->completionEvent = m_queue->GetNextCompletionEvent(); // Fail early if something horrifying happens ORT_THROW_IF_FAILED(m_dmlDevice->GetDeviceRemovedReason()); @@ -324,62 +319,35 @@ void DmlCommandRecorder::Open() { assert(m_currentDescriptorHeap == nullptr); - if (m_currentCommandList) + if (m_availableCommandLists.empty()) { - if (m_resetThreads.front()) - { - m_resetThreads.front()->join(); - } - - // Rotate the reset threads to the left - for (uint32_t i = 0; i < m_resetThreads.size() - 1; ++i) { - m_resetThreads[i] = std::move(m_resetThreads[i + 1]); - } - - // Rotate the allocators to the left - auto firstAllocator = std::move(m_allocatorRing.front()); - for (uint32_t i = 0; i < m_allocatorRing.size() - 1; ++i) - { - m_allocatorRing[i] = std::move(m_allocatorRing[i + 1]); - } - m_allocatorRing.back() = std::move(firstAllocator); - - // Rotate the command lists to the left - auto firstCommandList = std::move(m_commandListRing.front()); - for (uint32_t i = 0; i < m_commandListRing.size() - 1; ++i) - { - m_commandListRing[i] = std::move(m_commandListRing[i + 1]); - } - m_commandListRing.back() = std::move(firstCommandList); - - // The newest dirty allocator is now located before the last element in the ring buffer, so start resetting it - m_resetThreads.back() = std::thread([cachedAllocator = m_allocatorRing[m_allocatorRing.size() - 2], cachedCommandList = m_commandListRing[m_commandListRing.size() - 2]]() { - cachedAllocator.completionEvent.WaitForSignal(); - ORT_THROW_IF_FAILED(cachedAllocator.allocator->Reset()); - ORT_THROW_IF_FAILED(cachedCommandList->Reset(cachedAllocator.allocator.Get(), nullptr)); - }); + ComPtr allocator; + ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandAllocator( + m_queue->GetType(), + IID_GRAPHICS_PPV_ARGS(allocator.ReleaseAndGetAddressOf()))); + + ComPtr commandList; + ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandList( + 0, + m_queue->GetType(), + allocator.Get(), + nullptr, + IID_GRAPHICS_PPV_ARGS(commandList.ReleaseAndGetAddressOf()))); + + auto commandListInfo = std::make_shared(); + commandListInfo->allocator = std::move(allocator); + commandListInfo->commandList = std::move(commandList); + m_currentCommandListInfo = std::move(commandListInfo); } else { - assert(m_commandListRing.size() == m_allocatorRing.size()); - - for (uint32_t i = 0; i < m_commandListRing.size(); ++i) - { - ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandAllocator( - m_queue->GetType(), - IID_GRAPHICS_PPV_ARGS(m_allocatorRing[i].allocator.ReleaseAndGetAddressOf()))); - - ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandList( - 0, - m_queue->GetType(), - m_allocatorRing[i].allocator.Get(), - nullptr, - IID_GRAPHICS_PPV_ARGS(m_commandListRing[i].ReleaseAndGetAddressOf()))); - } + std::unique_lock lock(m_mutex); + m_currentCommandListInfo = m_availableCommandLists.back(); + m_availableCommandLists.pop_back(); } - m_currentCommandList = m_commandListRing.back(); - m_allocatorRing.back().completionEvent = m_queue->GetNextCompletionEvent(); + m_currentCommandList = m_currentCommandListInfo->commandList; + m_currentCommandListInfo->completionEvent = m_queue->GetNextCompletionEvent(); } void DmlCommandRecorder::CloseAndExecute() @@ -391,6 +359,17 @@ void DmlCommandRecorder::CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* com { ORT_THROW_IF_FAILED(m_currentCommandList->Close()); + onnxruntime::concurrency::ThreadPool::Schedule(m_threadPool.get(), [this, currentCommandListInfo = m_currentCommandListInfo]() { + currentCommandListInfo->completionEvent.WaitForSignal(); + ORT_THROW_IF_FAILED(currentCommandListInfo->allocator->Reset()); + ORT_THROW_IF_FAILED(currentCommandListInfo->commandList->Reset(currentCommandListInfo->allocator.Get(), nullptr)); + + { + std::unique_lock lock(m_mutex); + m_availableCommandLists.push_back(std::move(currentCommandListInfo)); + } + }); + ID3D12GraphicsCommandList* commandListsToExecute[2] = {}; uint32_t commandListsToExecuteCount = 0; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h index c25bd53ba9440..3d35c2c035db4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h @@ -18,8 +18,6 @@ namespace Dml IDMLDevice* device, std::shared_ptr commandQueue); - ~DmlCommandRecorder(); - void InitializeOperator( IDMLCompiledOperator* op, const DML_BINDING_DESC& persistentResourceBinding, @@ -69,8 +67,9 @@ namespace Dml } private: - struct CommandAllocatorInfo + struct CommandListInfo { + ComPtr commandList; ComPtr allocator; // The event which will be signaled when the last command list submitted using this allocator @@ -98,19 +97,14 @@ namespace Dml // The command list currently being recorded into, and whether any command have been recorded yet. ComPtr m_currentCommandList; + std::shared_ptr m_currentCommandListInfo; bool m_operationsRecordedInCurrentCommandList = false; - static constexpr int commandListCount = 3; - - // We use enough command lists and allocators to allow command lists to be reset in a different thread while - // there is another command list ready to receive commands. When we execute and close a command list, we start - // the resetting process on a different thread and set m_currentCommandList to the next available one. - std::array, commandListCount> m_commandListRing; - std::array m_allocatorRing; + static constexpr int threadPoolSize = 8; - // We should always have 1 less reset thread than command lists since we always need a clean command list, but - // the other ones can all be in the process of getting reset - std::array, commandListCount - 1> m_resetThreads; + std::unique_ptr m_threadPool; + std::mutex m_mutex; + std::list> m_availableCommandLists; void SetDescriptorHeap(ID3D12DescriptorHeap* descriptorHeap); }; From b0131b2e84b7047a3e699d219029e5c18868808b Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 4 Apr 2024 23:24:55 -0700 Subject: [PATCH 3/3] Fix --- .../src/DmlCommandRecorder.cpp | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp index a3829ecc9b2f1..376937bad9c68 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp @@ -359,23 +359,23 @@ void DmlCommandRecorder::CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* com { ORT_THROW_IF_FAILED(m_currentCommandList->Close()); - onnxruntime::concurrency::ThreadPool::Schedule(m_threadPool.get(), [this, currentCommandListInfo = m_currentCommandListInfo]() { - currentCommandListInfo->completionEvent.WaitForSignal(); - ORT_THROW_IF_FAILED(currentCommandListInfo->allocator->Reset()); - ORT_THROW_IF_FAILED(currentCommandListInfo->commandList->Reset(currentCommandListInfo->allocator.Get(), nullptr)); - - { - std::unique_lock lock(m_mutex); - m_availableCommandLists.push_back(std::move(currentCommandListInfo)); - } - }); - ID3D12GraphicsCommandList* commandListsToExecute[2] = {}; uint32_t commandListsToExecuteCount = 0; if (m_operationsRecordedInCurrentCommandList) { commandListsToExecute[commandListsToExecuteCount++] = m_currentCommandList.Get(); + + onnxruntime::concurrency::ThreadPool::Schedule(m_threadPool.get(), [this, currentCommandListInfo = m_currentCommandListInfo]() { + currentCommandListInfo->completionEvent.WaitForSignal(); + ORT_THROW_IF_FAILED(currentCommandListInfo->allocator->Reset()); + ORT_THROW_IF_FAILED(currentCommandListInfo->commandList->Reset(currentCommandListInfo->allocator.Get(), nullptr)); + + { + std::unique_lock lock(m_mutex); + m_availableCommandLists.push_back(std::move(currentCommandListInfo)); + } + }); } if (commandList)