From e7b0074127c08947aeb9e41f7684f47f2be56f3c Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Mon, 25 Mar 2024 13:53:18 -0700
Subject: [PATCH 1/3] [DML] Hide command list reset latency with multiple
 threads

---
 .../src/CommandAllocatorRing.h                | 74 ----------------
 .../src/DmlCommandRecorder.cpp                | 86 +++++++++++++++----
 .../src/DmlCommandRecorder.h                  | 37 ++++++--
 3 files changed, 95 insertions(+), 102 deletions(-)
 delete mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandAllocatorRing.h

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandAllocatorRing.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandAllocatorRing.h
deleted file mode 100644
index 2eee9c9a9e5a3..0000000000000
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandAllocatorRing.h
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-#include "GpuEvent.h"
-
-namespace Dml
-{
-    // A fixed-size ring of command allocators. Each time an allocator is retrieved, the allocator will
-    // be reset if its previously recorded commands have finished executing on the GPU.
-    template <size_t AllocatorCount>
-    class CommandAllocatorRing
-    {
-    public:
-        CommandAllocatorRing(
-            ID3D12Device* device, 
-            D3D12_COMMAND_LIST_TYPE commandListType, 
-            GpuEvent initialEvent)
-        {
-            for (auto& info : m_commandAllocators)
-            {
-                ORT_THROW_IF_FAILED(device->CreateCommandAllocator(
-                    commandListType,
-                    IID_GRAPHICS_PPV_ARGS(info.allocator.ReleaseAndGetAddressOf())));
-
-                info.completionEvent = initialEvent;
-            }
-        }
-
-        ID3D12CommandAllocator* GetNextAllocator(GpuEvent nextCompletionEvent)
-        {
-            size_t earliestOtherAllocator = (m_currentCommandAllocator + 1) % AllocatorCount;
-
-            assert(!m_commandAllocators[m_currentCommandAllocator].completionEvent.IsSignaled() ||
-                    m_commandAllocators[earliestOtherAllocator].completionEvent.IsSignaled());
-
-            if (m_commandAllocators[earliestOtherAllocator].completionEvent.IsSignaled())
-            {
-                ORT_THROW_IF_FAILED(m_commandAllocators[earliestOtherAllocator].Get()->Reset());
-                m_currentCommandAllocator = earliestOtherAllocator;
-            }
-
-            // Set the completion event for the current allocator so it can be reset eventually.
-            m_commandAllocators[m_currentCommandAllocator].completionEvent = nextCompletionEvent;
-
-            return m_commandAllocators[m_currentCommandAllocator].Get();
-        }
-
-        // Updates the completion event of the current allocator to a different value.  This is used when the caller
-        // decides to issue an unrelated call to the queue such as ExecuteCommandLists which updates its fence between calling 
-        // GetNextAllocator and executing the work which it recorded using the allocator it received.
-        void UpdateCurrentAllocatorCompletionEvent(GpuEvent nextCompletionEvent)
-        {
-            m_commandAllocators[m_currentCommandAllocator].completionEvent = nextCompletionEvent;
-        }
-
-    private:
-        struct CommandAllocatorInfo
-        {
-            ComPtr<ID3D12CommandAllocator> allocator;
-
-            // The event which will be signaled when the last command list submitted using this allocator
-            // completes execution on the GPU.
-            GpuEvent completionEvent = {};
-
-            ID3D12CommandAllocator* Get() const { return allocator.Get(); }
-        };
-
-        std::array<CommandAllocatorInfo, AllocatorCount> m_commandAllocators;
-        size_t m_currentCommandAllocator = 0;
-
-    };
-}
\ No newline at end of file
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index 5254b23f56376..5a5d769fd9995 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -15,13 +15,24 @@ DmlCommandRecorder::DmlCommandRecorder(
     : m_queue(std::move(commandQueue)),
       m_d3dDevice(d3dDevice),
       m_dmlDevice(dmlDevice),
-      m_descriptorPool(d3dDevice, 2048),
-      m_commandAllocatorRing(d3dDevice, m_queue->GetType(), m_queue->GetCurrentCompletionEvent())
+      m_descriptorPool(d3dDevice, 2048)
 {
     ORT_THROW_IF_FAILED(dmlDevice->CreateOperatorInitializer(0, nullptr, IID_PPV_ARGS(&m_initializer)));
     ORT_THROW_IF_FAILED(dmlDevice->CreateCommandRecorder(IID_PPV_ARGS(&m_recorder)));
 }
 
+DmlCommandRecorder::~DmlCommandRecorder()
+{
+    // Detach the threads to avoid crashes when terminating the program
+    for (auto& resetThread : m_resetThreads)
+    {
+        if (resetThread)
+        {
+            resetThread->detach();
+        }
+    }
+}
+
 void DmlCommandRecorder::SetAllocator(std::weak_ptr<BucketizedBufferAllocator> allocator)
 {
     m_bufferAllocator = allocator;
@@ -263,7 +274,7 @@ void DmlCommandRecorder::ExecuteCommandList(
         gsl::span<ID3D12CommandList*>(reinterpret_cast<ID3D12CommandList**>(&commandList), 1));
 
         // The fence value at which the current command allocator may be re-used will now be higher
-        m_commandAllocatorRing.UpdateCurrentAllocatorCompletionEvent(m_queue->GetNextCompletionEvent());
+        m_allocatorRing.back().completionEvent = m_queue->GetNextCompletionEvent();
 
         // Fail early if something horrifying happens
         ORT_THROW_IF_FAILED(m_dmlDevice->GetDeviceRemovedReason());
@@ -313,23 +324,62 @@ void DmlCommandRecorder::Open()
 {
     assert(m_currentDescriptorHeap == nullptr);
 
-    ID3D12CommandAllocator* allocator = m_commandAllocatorRing.GetNextAllocator(m_queue->GetNextCompletionEvent());
-
-    if (!m_cachedCommandList)
+    if (m_currentCommandList)
     {
-        ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandList(
-            0,
-            m_queue->GetType(),
-            allocator,
-            nullptr,
-            IID_GRAPHICS_PPV_ARGS(m_currentCommandList.ReleaseAndGetAddressOf())));
+        if (m_resetThreads.front())
+        {
+            m_resetThreads.front()->join();
+        }
+
+        // Rotate the reset threads to the left
+        for (uint32_t i = 0; i < m_resetThreads.size() - 1; ++i) {
+            m_resetThreads[i] = std::move(m_resetThreads[i + 1]);
+        }
+
+        // Rotate the allocators to the left
+        auto firstAllocator = std::move(m_allocatorRing.front());
+        for (uint32_t i = 0; i < m_allocatorRing.size() - 1; ++i)
+        {
+            m_allocatorRing[i] = std::move(m_allocatorRing[i + 1]);
+        }
+        m_allocatorRing.back() = std::move(firstAllocator);
+
+        // Rotate the command lists to the left
+        auto firstCommandList = std::move(m_commandListRing.front());
+        for (uint32_t i = 0; i < m_commandListRing.size() - 1; ++i)
+        {
+            m_commandListRing[i] = std::move(m_commandListRing[i + 1]);
+        }
+        m_commandListRing.back() = std::move(firstCommandList);
+
+        // The newest dirty allocator is now located before the last element in the ring buffer, so start resetting it
+        m_resetThreads.back() = std::thread([cachedAllocator = m_allocatorRing[m_allocatorRing.size() - 2], cachedCommandList = m_commandListRing[m_commandListRing.size() - 2]]() {
+            cachedAllocator.completionEvent.WaitForSignal();
+            ORT_THROW_IF_FAILED(cachedAllocator.allocator->Reset());
+            ORT_THROW_IF_FAILED(cachedCommandList->Reset(cachedAllocator.allocator.Get(), nullptr));
+        });
     }
     else
     {
-        m_currentCommandList = m_cachedCommandList;
-        m_cachedCommandList = nullptr;
-        ORT_THROW_IF_FAILED(m_currentCommandList->Reset(allocator, nullptr));
+        assert(m_commandListRing.size() == m_allocatorRing.size());
+
+        for (uint32_t i = 0; i < m_commandListRing.size(); ++i)
+        {
+            ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandAllocator(
+                m_queue->GetType(),
+                IID_GRAPHICS_PPV_ARGS(m_allocatorRing[i].allocator.ReleaseAndGetAddressOf())));
+
+            ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandList(
+                0,
+                m_queue->GetType(),
+                m_allocatorRing[i].allocator.Get(),
+                nullptr,
+                IID_GRAPHICS_PPV_ARGS(m_commandListRing[i].ReleaseAndGetAddressOf())));
+        }
     }
+
+    m_currentCommandList = m_commandListRing.back();
+    m_allocatorRing.back().completionEvent = m_queue->GetNextCompletionEvent();
 }
 
 void DmlCommandRecorder::CloseAndExecute()
@@ -338,7 +388,7 @@ void DmlCommandRecorder::CloseAndExecute()
 }
 
 void DmlCommandRecorder::CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* commandList)
-{   
+{
     ORT_THROW_IF_FAILED(m_currentCommandList->Close());
 
     ID3D12GraphicsCommandList* commandListsToExecute[2] = {};
@@ -359,9 +409,7 @@ void DmlCommandRecorder::CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* com
         m_queue->ExecuteCommandLists(
                 gsl::span<ID3D12CommandList*>(reinterpret_cast<ID3D12CommandList**>(commandListsToExecute), commandListsToExecuteCount));
     }
-    
-    m_cachedCommandList = m_currentCommandList;
-    m_currentCommandList = nullptr;
+
     m_operationsRecordedInCurrentCommandList = false;
 
     // The descriptor heap must be set on the command list the next time it's opened.
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
index 83051c8ca4ff9..c25bd53ba9440 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
@@ -4,7 +4,6 @@
 #pragma once
 
 #include "ICommandRecorder.h"
-#include "CommandAllocatorRing.h"
 
 namespace Dml
 {
@@ -16,9 +15,11 @@ namespace Dml
     public:
         DmlCommandRecorder(
             ID3D12Device* d3dDevice,
-            IDMLDevice* device, 
+            IDMLDevice* device,
             std::shared_ptr<CommandQueue> commandQueue);
 
+        ~DmlCommandRecorder();
+
         void InitializeOperator(
             IDMLCompiledOperator* op,
             const DML_BINDING_DESC& persistentResourceBinding,
@@ -47,13 +48,13 @@ namespace Dml
             _Out_ uint64_t* completionValue);
 
         ComPtr<ID3D12GraphicsCommandList> GetCommandList();
-        
+
         void ResourceBarrier(gsl::span<const D3D12_RESOURCE_BARRIER> barriers);
         void AddUAVBarrier();
 
         void Open() final;
         void CloseAndExecute() final;
-        
+
         void SetAllocator(std::weak_ptr<BucketizedBufferAllocator> allocator);
 
         bool HasUnsubmittedWork() override
@@ -68,8 +69,19 @@ namespace Dml
         }
 
     private:
+        struct CommandAllocatorInfo
+        {
+            ComPtr<ID3D12CommandAllocator> allocator;
+
+            // The event which will be signaled when the last command list submitted using this allocator
+            // completes execution on the GPU.
+            GpuEvent completionEvent = {};
+
+            ID3D12CommandAllocator* Get() const { return allocator.Get(); }
+        };
+
         void CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* commandList);
-    
+
         std::shared_ptr<CommandQueue> m_queue;
         ComPtr<ID3D12Device> m_d3dDevice;
         ComPtr<IDMLDevice> m_dmlDevice;
@@ -84,14 +96,21 @@ namespace Dml
         // The weak pointer avoids a circular reference from context->recorder->allocator->context
         std::weak_ptr<BucketizedBufferAllocator> m_bufferAllocator;
 
-        CommandAllocatorRing<2> m_commandAllocatorRing;
-
         // The command list currently being recorded into, and whether any command have been recorded yet.
         ComPtr<ID3D12GraphicsCommandList> m_currentCommandList;
         bool m_operationsRecordedInCurrentCommandList = false;
 
-        // A cached command list which may be re-used.
-        ComPtr<ID3D12GraphicsCommandList> m_cachedCommandList;
+        static constexpr int commandListCount = 3;
+
+        // We use enough command lists and allocators to allow command lists to be reset in a different thread while
+        // there is another command list ready to receive commands. When we execute and close a command list, we start
+        // the resetting process on a different thread and set m_currentCommandList to the next available one.
+        std::array<ComPtr<ID3D12GraphicsCommandList>, commandListCount> m_commandListRing;
+        std::array<CommandAllocatorInfo, commandListCount> m_allocatorRing;
+
+        // We should always have 1 less reset thread than command lists since we always need a clean command list, but
+        // the other ones can all be in the process of getting reset
+        std::array<std::optional<std::thread>, commandListCount - 1> m_resetThreads;
 
         void SetDescriptorHeap(ID3D12DescriptorHeap* descriptorHeap);
     };

From 6c4a3d526eeae33230d6a3502448a6ef650bf91d Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 4 Apr 2024 21:15:54 -0700
Subject: [PATCH 2/3] Use thread pool

---
 .../src/DmlCommandRecorder.cpp                | 103 +++++++-----------
 .../src/DmlCommandRecorder.h                  |  20 ++--
 2 files changed, 48 insertions(+), 75 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index 5a5d769fd9995..a3829ecc9b2f1 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -19,18 +19,13 @@ DmlCommandRecorder::DmlCommandRecorder(
 {
     ORT_THROW_IF_FAILED(dmlDevice->CreateOperatorInitializer(0, nullptr, IID_PPV_ARGS(&m_initializer)));
     ORT_THROW_IF_FAILED(dmlDevice->CreateCommandRecorder(IID_PPV_ARGS(&m_recorder)));
-}
 
-DmlCommandRecorder::~DmlCommandRecorder()
-{
-    // Detach the threads to avoid crashes when terminating the program
-    for (auto& resetThread : m_resetThreads)
-    {
-        if (resetThread)
-        {
-            resetThread->detach();
-        }
-    }
+    m_threadPool = std::make_unique<onnxruntime::concurrency::ThreadPool>(
+      &onnxruntime::Env::Default(),
+      onnxruntime::ThreadOptions(),
+      ORT_TSTR("CommandListPool"),
+      threadPoolSize,
+      true);
 }
 
 void DmlCommandRecorder::SetAllocator(std::weak_ptr<BucketizedBufferAllocator> allocator)
@@ -274,7 +269,7 @@ void DmlCommandRecorder::ExecuteCommandList(
         gsl::span<ID3D12CommandList*>(reinterpret_cast<ID3D12CommandList**>(&commandList), 1));
 
         // The fence value at which the current command allocator may be re-used will now be higher
-        m_allocatorRing.back().completionEvent = m_queue->GetNextCompletionEvent();
+        m_currentCommandListInfo->completionEvent = m_queue->GetNextCompletionEvent();
 
         // Fail early if something horrifying happens
         ORT_THROW_IF_FAILED(m_dmlDevice->GetDeviceRemovedReason());
@@ -324,62 +319,35 @@ void DmlCommandRecorder::Open()
 {
     assert(m_currentDescriptorHeap == nullptr);
 
-    if (m_currentCommandList)
+    if (std::unique_lock lock(m_mutex); m_availableCommandLists.empty())  // Pool workers push to this list concurrently.
     {
-        if (m_resetThreads.front())
-        {
-            m_resetThreads.front()->join();
-        }
-
-        // Rotate the reset threads to the left
-        for (uint32_t i = 0; i < m_resetThreads.size() - 1; ++i) {
-            m_resetThreads[i] = std::move(m_resetThreads[i + 1]);
-        }
-
-        // Rotate the allocators to the left
-        auto firstAllocator = std::move(m_allocatorRing.front());
-        for (uint32_t i = 0; i < m_allocatorRing.size() - 1; ++i)
-        {
-            m_allocatorRing[i] = std::move(m_allocatorRing[i + 1]);
-        }
-        m_allocatorRing.back() = std::move(firstAllocator);
-
-        // Rotate the command lists to the left
-        auto firstCommandList = std::move(m_commandListRing.front());
-        for (uint32_t i = 0; i < m_commandListRing.size() - 1; ++i)
-        {
-            m_commandListRing[i] = std::move(m_commandListRing[i + 1]);
-        }
-        m_commandListRing.back() = std::move(firstCommandList);
-
-        // The newest dirty allocator is now located before the last element in the ring buffer, so start resetting it
-        m_resetThreads.back() = std::thread([cachedAllocator = m_allocatorRing[m_allocatorRing.size() - 2], cachedCommandList = m_commandListRing[m_commandListRing.size() - 2]]() {
-            cachedAllocator.completionEvent.WaitForSignal();
-            ORT_THROW_IF_FAILED(cachedAllocator.allocator->Reset());
-            ORT_THROW_IF_FAILED(cachedCommandList->Reset(cachedAllocator.allocator.Get(), nullptr));
-        });
+        // No recycled command list is ready yet: create a fresh allocator/command-list pair.
+        lock.unlock();  // The creation below does not touch m_availableCommandLists, so don't stall the workers.
+        ComPtr<ID3D12CommandAllocator> allocator;
+        ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandAllocator(
+            m_queue->GetType(),
+            IID_GRAPHICS_PPV_ARGS(allocator.ReleaseAndGetAddressOf())));
+        ComPtr<ID3D12GraphicsCommandList> commandList;
+        ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandList(
+            0,
+            m_queue->GetType(),
+            allocator.Get(),
+            nullptr,
+            IID_GRAPHICS_PPV_ARGS(commandList.ReleaseAndGetAddressOf())));
+
+        m_currentCommandListInfo = std::make_shared<CommandListInfo>();
+        m_currentCommandListInfo->allocator = std::move(allocator);
+        m_currentCommandListInfo->commandList = std::move(commandList);
     }
     else
     {
-        assert(m_commandListRing.size() == m_allocatorRing.size());
-
-        for (uint32_t i = 0; i < m_commandListRing.size(); ++i)
-        {
-            ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandAllocator(
-                m_queue->GetType(),
-                IID_GRAPHICS_PPV_ARGS(m_allocatorRing[i].allocator.ReleaseAndGetAddressOf())));
-
-            ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandList(
-                0,
-                m_queue->GetType(),
-                m_allocatorRing[i].allocator.Get(),
-                nullptr,
-                IID_GRAPHICS_PPV_ARGS(m_commandListRing[i].ReleaseAndGetAddressOf())));
-        }
+        // m_mutex is still held here by the lock declared in the if-statement initializer.
+        m_currentCommandListInfo = std::move(m_availableCommandLists.back());
+        m_availableCommandLists.pop_back();
     }
 
-    m_currentCommandList = m_commandListRing.back();
-    m_allocatorRing.back().completionEvent = m_queue->GetNextCompletionEvent();
+    m_currentCommandList = m_currentCommandListInfo->commandList;
+    m_currentCommandListInfo->completionEvent = m_queue->GetNextCompletionEvent();
 }
 
 void DmlCommandRecorder::CloseAndExecute()
@@ -391,6 +359,17 @@ void DmlCommandRecorder::CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* com
 {
     ORT_THROW_IF_FAILED(m_currentCommandList->Close());
 
+    onnxruntime::concurrency::ThreadPool::Schedule(m_threadPool.get(), [this, currentCommandListInfo = m_currentCommandListInfo]() {
+        currentCommandListInfo->completionEvent.WaitForSignal();
+        ORT_THROW_IF_FAILED(currentCommandListInfo->allocator->Reset());
+        ORT_THROW_IF_FAILED(currentCommandListInfo->commandList->Reset(currentCommandListInfo->allocator.Get(), nullptr));
+
+        {
+            std::unique_lock lock(m_mutex);
+            m_availableCommandLists.push_back(std::move(currentCommandListInfo));
+        }
+    });
+
     ID3D12GraphicsCommandList* commandListsToExecute[2] = {};
     uint32_t commandListsToExecuteCount = 0;
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
index c25bd53ba9440..3d35c2c035db4 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
@@ -18,8 +18,6 @@ namespace Dml
             IDMLDevice* device,
             std::shared_ptr<CommandQueue> commandQueue);
 
-        ~DmlCommandRecorder();
-
         void InitializeOperator(
             IDMLCompiledOperator* op,
             const DML_BINDING_DESC& persistentResourceBinding,
@@ -69,8 +67,9 @@ namespace Dml
         }
 
     private:
-        struct CommandAllocatorInfo
+        struct CommandListInfo
         {
+            ComPtr<ID3D12GraphicsCommandList> commandList;
             ComPtr<ID3D12CommandAllocator> allocator;
 
             // The event which will be signaled when the last command list submitted using this allocator
@@ -98,19 +97,14 @@ namespace Dml
 
         // The command list currently being recorded into, and whether any command have been recorded yet.
         ComPtr<ID3D12GraphicsCommandList> m_currentCommandList;
+        std::shared_ptr<CommandListInfo> m_currentCommandListInfo;
         bool m_operationsRecordedInCurrentCommandList = false;
 
-        static constexpr int commandListCount = 3;
-
-        // We use enough command lists and allocators to allow command lists to be reset in a different thread while
-        // there is another command list ready to receive commands. When we execute and close a command list, we start
-        // the resetting process on a different thread and set m_currentCommandList to the next available one.
-        std::array<ComPtr<ID3D12GraphicsCommandList>, commandListCount> m_commandListRing;
-        std::array<CommandAllocatorInfo, commandListCount> m_allocatorRing;
+        static constexpr int threadPoolSize = 8;
 
-        // We should always have 1 less reset thread than command lists since we always need a clean command list, but
-        // the other ones can all be in the process of getting reset
-        std::array<std::optional<std::thread>, commandListCount - 1> m_resetThreads;
+        std::mutex m_mutex;  // Guards m_availableCommandLists, which thread pool tasks push to.
+        std::list<std::shared_ptr<CommandListInfo>> m_availableCommandLists;  // Reset command lists ready for reuse by Open().
+        std::unique_ptr<onnxruntime::concurrency::ThreadPool> m_threadPool;  // Keep last: destroyed first, before the mutex/list its tasks use (assumes ~ThreadPool joins workers - confirm).
 
         void SetDescriptorHeap(ID3D12DescriptorHeap* descriptorHeap);
     };

From b0131b2e84b7047a3e699d219029e5c18868808b Mon Sep 17 00:00:00 2001
From: Patrice Vignola <vignola.patrice@gmail.com>
Date: Thu, 4 Apr 2024 23:24:55 -0700
Subject: [PATCH 3/3] Only schedule command list reset when the list is executed

---
 .../src/DmlCommandRecorder.cpp                | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index a3829ecc9b2f1..376937bad9c68 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -359,23 +359,23 @@ void DmlCommandRecorder::CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* com
 {
     ORT_THROW_IF_FAILED(m_currentCommandList->Close());
 
-    onnxruntime::concurrency::ThreadPool::Schedule(m_threadPool.get(), [this, currentCommandListInfo = m_currentCommandListInfo]() {
-        currentCommandListInfo->completionEvent.WaitForSignal();
-        ORT_THROW_IF_FAILED(currentCommandListInfo->allocator->Reset());
-        ORT_THROW_IF_FAILED(currentCommandListInfo->commandList->Reset(currentCommandListInfo->allocator.Get(), nullptr));
-
-        {
-            std::unique_lock lock(m_mutex);
-            m_availableCommandLists.push_back(std::move(currentCommandListInfo));
-        }
-    });
-
     ID3D12GraphicsCommandList* commandListsToExecute[2] = {};
     uint32_t commandListsToExecuteCount = 0;
 
     if (m_operationsRecordedInCurrentCommandList)
     {
         commandListsToExecute[commandListsToExecuteCount++] = m_currentCommandList.Get();
+
+        // Recycle this list off-thread: wait for its completion event, reset it, then publish it for Open() to reuse.
+        onnxruntime::concurrency::ThreadPool::Schedule(m_threadPool.get(), [this, currentCommandListInfo = m_currentCommandListInfo]() mutable {
+            currentCommandListInfo->completionEvent.WaitForSignal();
+            // NOTE(review): the ORT_THROW_IF_FAILED calls below run on a pool thread; confirm a failure here is handled.
+            ORT_THROW_IF_FAILED(currentCommandListInfo->allocator->Reset());
+            ORT_THROW_IF_FAILED(currentCommandListInfo->commandList->Reset(currentCommandListInfo->allocator.Get(), nullptr));
+            // The lambda is 'mutable' so the captured shared_ptr is really moved (not copied) into the list.
+            std::unique_lock lock(m_mutex);
+            m_availableCommandLists.push_back(std::move(currentCommandListInfo));
+        });
     }
 
     if (commandList)