From bdaeebd6ff5d4667f288ebb9b8072f66b45de1c8 Mon Sep 17 00:00:00 2001
From: Jeff Bloomfield <38966965+jeffbloo@users.noreply.github.com>
Date: Mon, 18 Dec 2023 15:47:57 -0800
Subject: [PATCH] Fix bug in DML EP ExecuteCommandList fast path and simplify
 design (#18866)

### Description
This fixes a bug in a fast path that was added for submission of re-used command lists of fused graph kernels in the DML EP. The bug surfaced as a D3D debug layer error.

### Motivation and Context
The fast path in DmlCommandRecorder::ExecuteCommandList enabled the current non-reused command list, if empty, to be used for commands following submission of the fused command list. The fix ensures the associated command allocator is only re-used after the next fence value is completed, which is now higher due to submission of the other command list.

The command recorder design was intended to support batching of provided command list execution; however, as an implementation detail, it submits command lists immediately to maximize CPU/GPU parallelism. Removing that heuristic would expose additional issues in this same fast path. Because of this, and because of the complexity and inefficiency of the old batching mechanism, I removed the batching mechanism as well.
---
 .../src/CommandAllocatorRing.h |  8 ++
 .../src/DmlCommandRecorder.cpp | 89 +++++++------------
 .../src/DmlCommandRecorder.h   | 16 ++--
 3 files changed, 46 insertions(+), 67 deletions(-)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandAllocatorRing.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandAllocatorRing.h
index 570f62aac8105..2eee9c9a9e5a3 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandAllocatorRing.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandAllocatorRing.h
@@ -47,6 +47,14 @@ namespace Dml
             return m_commandAllocators[m_currentCommandAllocator].Get();
         }
 
+        // Updates the completion event of the current allocator to a different value. This is used when the caller
+        // decides to issue an unrelated call to the queue such as ExecuteCommandLists which updates its fence between calling
+        // GetNextAllocator and executing the work which it recorded using the allocator it received.
+        void UpdateCurrentAllocatorCompletionEvent(GpuEvent nextCompletionEvent)
+        {
+            m_commandAllocators[m_currentCommandAllocator].completionEvent = nextCompletionEvent;
+        }
+
     private:
         struct CommandAllocatorInfo
         {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
index 98345f37b68d4..5254b23f56376 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp
@@ -262,6 +262,9 @@ void DmlCommandRecorder::ExecuteCommandList(
         m_queue->ExecuteCommandLists(
             gsl::span<ID3D12CommandList*>(reinterpret_cast<ID3D12CommandList**>(&commandList), 1));
 
+        // The fence value at which the current command allocator may be re-used will now be higher
+        m_commandAllocatorRing.UpdateCurrentAllocatorCompletionEvent(m_queue->GetNextCompletionEvent());
+
         // Fail early if something horrifying happens
         ORT_THROW_IF_FAILED(m_dmlDevice->GetDeviceRemovedReason());
         ORT_THROW_IF_FAILED(m_d3dDevice->GetDeviceRemovedReason());
@@ -269,42 +272,20 @@ void DmlCommandRecorder::ExecuteCommandList(
         return;
     }
 
-    ORT_THROW_IF_FAILED(m_currentCommandList->Close());
-
-    if (m_operationsRecordedInCurrentCommandList)
-    {
-        m_pendingCommandLists.push_back(m_currentCommandList.Get());
-        m_pendingCommandListsCacheable.push_back(true);
-    }
-    else
-    {
-        m_cachedCommandLists.push_back(m_currentCommandList.Get());
-    }
-
-    m_currentCommandList = nullptr;
-    m_operationsRecordedInCurrentCommandList = false;
-
-    m_pendingCommandLists.push_back(commandList);
-    m_pendingCommandListsCacheable.push_back(false);
-
-    // Remember the descriptor heap and apply it to the next command list
+    // Remember the descriptor heap and apply it to the next command list. This avoids unnecessarily setting it onto
+    // the D3D object lazily at a point when the operation may not be parallelized with GPU work.
     auto heap = m_currentDescriptorHeap;
-    m_currentDescriptorHeap = nullptr;
-    Open();
-
-    // The caller can re-use relevant resources after the next set of work to be
-    // flushed has completed. Its command list hasn't been executed yet, just batched.
-    GpuEvent gpuEvent = m_queue->GetNextCompletionEvent();
-    gpuEvent.fence.CopyTo(fence);
-    *completionValue = gpuEvent.fenceValue;
 
-    // Trigger a flush of the command list, with the assumption that it contains enough GPU work that this
-    // will help parallelize GPU work with subsequent CPU work. This policy is related to the choice of
-    // minNodeCountToReuseCommandList within FusedGraphKernel, so both should be tuned together.
-    CloseAndExecute();
+    // Execute work in the current command list plus provided command list while closing the recorder.
+    CloseAndExecute(commandList);
     Open();
 
+    // Reset the descriptor heap opportunistically per above comment
     SetDescriptorHeap(heap);
+
+    GpuEvent gpuEvent = m_queue->GetCurrentCompletionEvent();
+    gpuEvent.fence.CopyTo(fence);
+    *completionValue = gpuEvent.fenceValue;
 }
 
 ComPtr<ID3D12GraphicsCommandList> DmlCommandRecorder::GetCommandList()
@@ -334,7 +315,7 @@ void DmlCommandRecorder::Open()
     ID3D12CommandAllocator* allocator =
        m_commandAllocatorRing.GetNextAllocator(m_queue->GetNextCompletionEvent());
 
-    if (m_cachedCommandLists.empty())
+    if (!m_cachedCommandList)
     {
         ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandList(
             0,
@@ -345,47 +326,43 @@ void DmlCommandRecorder::Open()
     }
     else
     {
-        m_currentCommandList = m_cachedCommandLists.front();
-        m_cachedCommandLists.pop_front();
+        m_currentCommandList = m_cachedCommandList;
+        m_cachedCommandList = nullptr;
         ORT_THROW_IF_FAILED(m_currentCommandList->Reset(allocator, nullptr));
     }
 }
 
 void DmlCommandRecorder::CloseAndExecute()
 {
+    CloseAndExecute(nullptr);
+}
+
+void DmlCommandRecorder::CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* commandList)
+{
     ORT_THROW_IF_FAILED(m_currentCommandList->Close());
 
+    ID3D12GraphicsCommandList* commandListsToExecute[2] = {};
+    uint32_t commandListsToExecuteCount = 0;
+
     if (m_operationsRecordedInCurrentCommandList)
     {
-        m_pendingCommandLists.push_back(m_currentCommandList.Get());
-        m_pendingCommandListsCacheable.push_back(true);
+        commandListsToExecute[commandListsToExecuteCount++] = m_currentCommandList.Get();
     }
-    else
+
+    if (commandList)
     {
-        m_cachedCommandLists.push_back(m_currentCommandList.Get());
+        commandListsToExecute[commandListsToExecuteCount++] = commandList;
     }
 
-    m_currentCommandList = nullptr;
-    m_operationsRecordedInCurrentCommandList = false;
-
-    if (!m_pendingCommandLists.empty())
+    if (commandListsToExecuteCount > 0)
     {
-        // Close and execute the command list
         m_queue->ExecuteCommandLists(
-            gsl::span<ID3D12CommandList*>(reinterpret_cast<ID3D12CommandList**>(m_pendingCommandLists.data()), m_pendingCommandLists.size()));
-
-        assert(m_pendingCommandLists.size() == m_pendingCommandListsCacheable.size());
-        for (size_t i = 0; i < m_pendingCommandLists.size(); ++i)
-        {
-            if (m_pendingCommandListsCacheable[i])
-            {
-                m_cachedCommandLists.push_back(m_pendingCommandLists[i]);
-            }
-        }
-
-        m_pendingCommandLists.clear();
-        m_pendingCommandListsCacheable.clear();
+            gsl::span<ID3D12CommandList*>(reinterpret_cast<ID3D12CommandList**>(commandListsToExecute), commandListsToExecuteCount));
     }
+
+    m_cachedCommandList = m_currentCommandList;
+    m_currentCommandList = nullptr;
+    m_operationsRecordedInCurrentCommandList = false;
 
     // The descriptor heap must be set on the command list the next time it's opened.
     m_currentDescriptorHeap = nullptr;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
index 7ad7032317d77..83051c8ca4ff9 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h
@@ -58,7 +58,7 @@ namespace Dml
 
         bool HasUnsubmittedWork() override
         {
-            return m_operationsRecordedInCurrentCommandList || !m_pendingCommandLists.empty();
+            return m_operationsRecordedInCurrentCommandList;
         }
 
         // Forces the descriptor heap to be reset to D3D before executing future operations
@@ -68,7 +68,8 @@ namespace Dml
         }
 
     private:
-
+        void CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* commandList);
+
         std::shared_ptr<CommandQueue> m_queue;
         ComPtr<ID3D12Device> m_d3dDevice;
         ComPtr<IDMLDevice> m_dmlDevice;
@@ -89,15 +90,8 @@ namespace Dml
         ComPtr<ID3D12GraphicsCommandList> m_currentCommandList;
         bool m_operationsRecordedInCurrentCommandList = false;
 
-        // Command lists which have been batched up for execution. The values in
-        // m_pendingCommandListsCacheable indicate whether they can be moved into this
-        // class's cache after execution, versus if they belong to the caller and were
-        // passed to ExecuteCommandList.
-        std::vector<ComPtr<ID3D12GraphicsCommandList>> m_pendingCommandLists;
-        std::vector<bool> m_pendingCommandListsCacheable;
-
-        // A pool of cached command lists which may be re-used.
-        std::deque<ComPtr<ID3D12GraphicsCommandList>> m_cachedCommandLists;
+        // A cached command list which may be re-used.
+        ComPtr<ID3D12GraphicsCommandList> m_cachedCommandList;
 
         void SetDescriptorHeap(ID3D12DescriptorHeap* descriptorHeap);
     };
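
The invariant behind the fix is easier to see outside the diff. Below is a minimal C++ sketch of a fence-gated allocator ring; the AllocatorRing type and its member names are hypothetical simplifications for illustration, not the DML EP's actual CommandAllocatorRing. Each allocator carries the fence value at which the GPU is known to be done with it, an allocator is reset only once the fence reaches that value, and the tag must be raised whenever an unrelated ExecuteCommandLists call advances the queue's fence after the allocator was handed out, which is the scenario this bug involved.

#include <array>
#include <cstdint>
#include <d3d12.h>
#include <wrl/client.h>

using Microsoft::WRL::ComPtr;

// Hypothetical, simplified ring of two command allocators. Assumes the allocators
// were already created via ID3D12Device::CreateCommandAllocator. Each entry records
// the fence value at which all GPU work recorded through it is known to be complete.
struct AllocatorRing
{
    struct Entry
    {
        ComPtr<ID3D12CommandAllocator> allocator;
        uint64_t completionFenceValue = 0;
    };

    std::array<Entry, 2> entries;
    size_t current = 0;

    // Rotates to the next allocator and resets it, but only once the fence proves
    // the GPU has finished the commands previously recorded through it.
    ID3D12CommandAllocator* GetNextAllocator(ID3D12Fence* fence, uint64_t nextCompletionValue)
    {
        current = (current + 1) % entries.size();
        Entry& entry = entries[current];

        // Busy-wait for brevity; real code would block on an event registered
        // with ID3D12Fence::SetEventOnCompletion.
        while (fence->GetCompletedValue() < entry.completionFenceValue)
        {
        }

        entry.allocator->Reset();
        entry.completionFenceValue = nextCompletionValue;
        return entry.allocator.Get();
    }

    // Analogous to UpdateCurrentAllocatorCompletionEvent in the patch: if another
    // submission advances the queue's fence after GetNextAllocator was called, the
    // current allocator must not be reset until that later fence value completes.
    void UpdateCurrentAllocatorCompletion(uint64_t newCompletionValue)
    {
        entries[current].completionFenceValue = newCompletionValue;
    }
};

In the patch, DmlCommandRecorder::ExecuteCommandList performs the equivalent of UpdateCurrentAllocatorCompletion immediately after submitting the fused command list, so the allocator handed out by the earlier Open() call cannot be reset and re-used until the now-higher fence value has completed.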