Skip to content

Commit

Permalink
Fix bug in DML EP ExecuteCommandList fast path and simplify design (m…
Browse files Browse the repository at this point in the history
…icrosoft#18866)

### Description
This addresses a bug in a fast path that was added for submission of
re-used command lists of fused graph kernels in the DML EP, addressing a
D3D debug layer error.

### Motivation and Context
The fast path in DmlCommandRecorder::ExecuteCommandList enabled a
current non-reused command list, if empty, to be used for commands
following submission of the fused command list. The fix ensures the
associated command allocator is only re-used after the next fence value
is completed, which is higher due to submission of the other command
list.

The command recorder design was intended to support batching of provided
command list execution, however it submits command lists immedately as
an implementation detail to maximize CPU/GPU parallelism. If that
heuristic was removed, it would expose additional issues in this same
fast path. Because of this and complexity and inefficiency of the old
batching mechanism, I also removed this.
  • Loading branch information
jeffbloo authored and jslap-ubi committed Apr 5, 2024
1 parent 4306f9a commit f5350ac
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 67 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ namespace Dml
return m_commandAllocators[m_currentCommandAllocator].Get();
}

// Updates the completion event of the current allocator to a different value. This is used when the caller
// decides to issue an unrelated call to the queue such as ExecuteCommandLists which updates its fence between calling
// GetNextAllocator and executing the work which it recorded using the allocator it received.
void UpdateCurrentAllocatorCompletionEvent(GpuEvent nextCompletionEvent)
{
m_commandAllocators[m_currentCommandAllocator].completionEvent = nextCompletionEvent;
}

private:
struct CommandAllocatorInfo
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,49 +262,30 @@ void DmlCommandRecorder::ExecuteCommandList(
m_queue->ExecuteCommandLists(
gsl::span<ID3D12CommandList*>(reinterpret_cast<ID3D12CommandList**>(&commandList), 1));

// The fence value at which the current command allocator may be re-used will now be higher
m_commandAllocatorRing.UpdateCurrentAllocatorCompletionEvent(m_queue->GetNextCompletionEvent());

// Fail early if something horrifying happens
ORT_THROW_IF_FAILED(m_dmlDevice->GetDeviceRemovedReason());
ORT_THROW_IF_FAILED(m_d3dDevice->GetDeviceRemovedReason());

return;
}

ORT_THROW_IF_FAILED(m_currentCommandList->Close());

if (m_operationsRecordedInCurrentCommandList)
{
m_pendingCommandLists.push_back(m_currentCommandList.Get());
m_pendingCommandListsCacheable.push_back(true);
}
else
{
m_cachedCommandLists.push_back(m_currentCommandList.Get());
}

m_currentCommandList = nullptr;
m_operationsRecordedInCurrentCommandList = false;

m_pendingCommandLists.push_back(commandList);
m_pendingCommandListsCacheable.push_back(false);

// Remember the descriptor heap and apply it to the next command list
// Remember the descriptor heap and apply it to the next command list. This avoids unnecessarily setting it onto
// the D3D object lazily at a point when the operation may not be parallelized with GPU work.
auto heap = m_currentDescriptorHeap;
m_currentDescriptorHeap = nullptr;
Open();

// The caller can re-use relevant resources after the next set of work to be
// flushed has completed. Its command list hasn't been executed yet, just batched.
GpuEvent gpuEvent = m_queue->GetNextCompletionEvent();
gpuEvent.fence.CopyTo(fence);
*completionValue = gpuEvent.fenceValue;

// Trigger a flush of the command list, with the assumption that it contains enough GPU work that this
// will help parallelize GPU work with subsequent CPU work. This policy is related to the choice of
// minNodeCountToReuseCommandList within FusedGraphKernel, so both should be tuned together.
CloseAndExecute();
// Execute work in the current command list plus provided command list while closing the recorder.
CloseAndExecute(commandList);
Open();

// Reset the descriptor heap opportunistically per above comment
SetDescriptorHeap(heap);

GpuEvent gpuEvent = m_queue->GetCurrentCompletionEvent();
gpuEvent.fence.CopyTo(fence);
*completionValue = gpuEvent.fenceValue;
}

ComPtr<ID3D12GraphicsCommandList> DmlCommandRecorder::GetCommandList()
Expand Down Expand Up @@ -334,7 +315,7 @@ void DmlCommandRecorder::Open()

ID3D12CommandAllocator* allocator = m_commandAllocatorRing.GetNextAllocator(m_queue->GetNextCompletionEvent());

if (m_cachedCommandLists.empty())
if (!m_cachedCommandList)
{
ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandList(
0,
Expand All @@ -345,47 +326,43 @@ void DmlCommandRecorder::Open()
}
else
{
m_currentCommandList = m_cachedCommandLists.front();
m_cachedCommandLists.pop_front();
m_currentCommandList = m_cachedCommandList;
m_cachedCommandList = nullptr;
ORT_THROW_IF_FAILED(m_currentCommandList->Reset(allocator, nullptr));
}
}

void DmlCommandRecorder::CloseAndExecute()
{
CloseAndExecute(nullptr);
}

void DmlCommandRecorder::CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* commandList)
{
ORT_THROW_IF_FAILED(m_currentCommandList->Close());

ID3D12GraphicsCommandList* commandListsToExecute[2] = {};
uint32_t commandListsToExecuteCount = 0;

if (m_operationsRecordedInCurrentCommandList)
{
m_pendingCommandLists.push_back(m_currentCommandList.Get());
m_pendingCommandListsCacheable.push_back(true);
commandListsToExecute[commandListsToExecuteCount++] = m_currentCommandList.Get();
}
else

if (commandList)
{
m_cachedCommandLists.push_back(m_currentCommandList.Get());
commandListsToExecute[commandListsToExecuteCount++] = commandList;
}

m_currentCommandList = nullptr;
m_operationsRecordedInCurrentCommandList = false;

if (!m_pendingCommandLists.empty())
if (commandListsToExecuteCount > 0)
{
// Close and execute the command list
m_queue->ExecuteCommandLists(
gsl::span<ID3D12CommandList*>(reinterpret_cast<ID3D12CommandList**>(m_pendingCommandLists.data()), m_pendingCommandLists.size()));

assert(m_pendingCommandLists.size() == m_pendingCommandListsCacheable.size());
for (size_t i = 0; i < m_pendingCommandLists.size(); ++i)
{
if (m_pendingCommandListsCacheable[i])
{
m_cachedCommandLists.push_back(m_pendingCommandLists[i]);
}
}

m_pendingCommandLists.clear();
m_pendingCommandListsCacheable.clear();
gsl::span<ID3D12CommandList*>(reinterpret_cast<ID3D12CommandList**>(commandListsToExecute), commandListsToExecuteCount));
}

m_cachedCommandList = m_currentCommandList;
m_currentCommandList = nullptr;
m_operationsRecordedInCurrentCommandList = false;

// The descriptor heap must be set on the command list the next time it's opened.
m_currentDescriptorHeap = nullptr;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ namespace Dml

bool HasUnsubmittedWork() override
{
return m_operationsRecordedInCurrentCommandList || !m_pendingCommandLists.empty();
return m_operationsRecordedInCurrentCommandList;
}

// Forces the descriptor heap to be reset to D3D before executing future operations
Expand All @@ -68,7 +68,8 @@ namespace Dml
}

private:

void CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* commandList);

std::shared_ptr<CommandQueue> m_queue;
ComPtr<ID3D12Device> m_d3dDevice;
ComPtr<IDMLDevice> m_dmlDevice;
Expand All @@ -89,15 +90,8 @@ namespace Dml
ComPtr<ID3D12GraphicsCommandList> m_currentCommandList;
bool m_operationsRecordedInCurrentCommandList = false;

// Command lists which have been batched up for execution. The values in
// m_pendingCommandListsCacheable indicate whether they can be moved into this
// class's cache after execution, versus if they belong to the caller and were
// passed to ExecuteCommandList.
std::vector<ComPtr<ID3D12GraphicsCommandList>> m_pendingCommandLists;
std::vector<bool> m_pendingCommandListsCacheable;

// A pool of cached command lists which may be re-used.
std::deque<ComPtr<ID3D12GraphicsCommandList>> m_cachedCommandLists;
// A cached command list which may be re-used.
ComPtr<ID3D12GraphicsCommandList> m_cachedCommandList;

void SetDescriptorHeap(ID3D12DescriptorHeap* descriptorHeap);
};
Expand Down

0 comments on commit f5350ac

Please sign in to comment.