Fix bug in DML EP ExecuteCommandList fast path and simplify design #18866

Merged

Changes from all commits
onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandAllocatorRing.h

@@ -47,6 +47,14 @@
         return m_commandAllocators[m_currentCommandAllocator].Get();
     }
 
+    // Updates the completion event of the current allocator to a different value. This is used when the caller
+    // decides to issue an unrelated call to the queue such as ExecuteCommandLists which updates its fence between calling
+    // GetNextAllocator and executing the work which it recorded using the allocator it received.
+    void UpdateCurrentAllocatorCompletionEvent(GpuEvent nextCompletionEvent)
+    {
+        m_commandAllocators[m_currentCommandAllocator].completionEvent = nextCompletionEvent;
+    }
+
 private:
     struct CommandAllocatorInfo
     {
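The ring above hands allocators out round-robin and only recycles a slot once the GPU has reached the fence value recorded for that slot. The sketch below models that bookkeeping with plain integers; SimpleGpuEvent, SimpleAllocatorRing, and the lastCompletedFenceValue parameter are illustrative stand-ins rather than the real D3D12/DML types, and the assert stands in for the wait the real ring performs.

// Minimal sketch of the allocator-ring bookkeeping, using plain integers in place of the real
// D3D12 fence and allocator objects (SimpleGpuEvent and the ring below are illustrative only).
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

struct SimpleGpuEvent { uint64_t fenceValue = 0; };  // stand-in for GpuEvent

template <size_t AllocatorCount>
class SimpleAllocatorRing
{
public:
    // Rotates to the next slot and tags it with the fence value at which the work about to be
    // recorded against it is expected to complete.
    size_t GetNextAllocator(SimpleGpuEvent completionEvent, uint64_t lastCompletedFenceValue)
    {
        m_current = (m_current + 1) % AllocatorCount;

        // The slot may only be recycled once the GPU has passed its previously recorded completion value.
        assert(m_completionValues[m_current] <= lastCompletedFenceValue);

        m_completionValues[m_current] = completionEvent.fenceValue;
        return m_current;
    }

    // Mirrors UpdateCurrentAllocatorCompletionEvent: if an extra ExecuteCommandLists call advanced the
    // queue's fence after GetNextAllocator, the current slot must wait for that later value instead.
    void UpdateCurrentAllocatorCompletionEvent(SimpleGpuEvent nextCompletionEvent)
    {
        m_completionValues[m_current] = nextCompletionEvent.fenceValue;
    }

private:
    std::array<uint64_t, AllocatorCount> m_completionValues{};
    size_t m_current = AllocatorCount - 1;  // the first GetNextAllocator call lands on slot 0
};

In this model, the new UpdateCurrentAllocatorCompletionEvent simply overwrites the value the slot was tagged with in GetNextAllocator, pushing its reuse point past any fence value consumed by an intervening ExecuteCommandLists call.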
onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.cpp

@@ -262,49 +262,30 @@
         m_queue->ExecuteCommandLists(
             gsl::span<ID3D12CommandList*>(reinterpret_cast<ID3D12CommandList**>(&commandList), 1));
 
+        // The fence value at which the current command allocator may be re-used will now be higher
+        m_commandAllocatorRing.UpdateCurrentAllocatorCompletionEvent(m_queue->GetNextCompletionEvent());
+
         // Fail early if something horrifying happens
         ORT_THROW_IF_FAILED(m_dmlDevice->GetDeviceRemovedReason());
         ORT_THROW_IF_FAILED(m_d3dDevice->GetDeviceRemovedReason());
 
         return;
     }
 
-    ORT_THROW_IF_FAILED(m_currentCommandList->Close());
-
-    if (m_operationsRecordedInCurrentCommandList)
-    {
-        m_pendingCommandLists.push_back(m_currentCommandList.Get());
-        m_pendingCommandListsCacheable.push_back(true);
-    }
-    else
-    {
-        m_cachedCommandLists.push_back(m_currentCommandList.Get());
-    }
-
-    m_currentCommandList = nullptr;
-    m_operationsRecordedInCurrentCommandList = false;
-
-    m_pendingCommandLists.push_back(commandList);
-    m_pendingCommandListsCacheable.push_back(false);
-
-    // Remember the descriptor heap and apply it to the next command list
+    // Remember the descriptor heap and apply it to the next command list. This avoids unnecessarily setting it onto
+    // the D3D object lazily at a point when the operation may not be parallelized with GPU work.
     auto heap = m_currentDescriptorHeap;
-    m_currentDescriptorHeap = nullptr;
-    Open();
-
-    // The caller can re-use relevant resources after the next set of work to be
-    // flushed has completed. Its command list hasn't been executed yet, just batched.
-    GpuEvent gpuEvent = m_queue->GetNextCompletionEvent();
-    gpuEvent.fence.CopyTo(fence);
-    *completionValue = gpuEvent.fenceValue;
-
-    // Trigger a flush of the command list, with the assumption that it contains enough GPU work that this
-    // will help parallelize GPU work with subsequent CPU work. This policy is related to the choice of
-    // minNodeCountToReuseCommandList within FusedGraphKernel, so both should be tuned together.
-    CloseAndExecute();
+
+    // Execute work in the current command list plus provided command list while closing the recorder.
+    CloseAndExecute(commandList);
+    Open();
+
+    // Reset the descriptor heap opportunistically per above comment
+    SetDescriptorHeap(heap);
+
+    GpuEvent gpuEvent = m_queue->GetCurrentCompletionEvent();
+    gpuEvent.fence.CopyTo(fence);
+    *completionValue = gpuEvent.fenceValue;
 }
 
 ComPtr<ID3D12GraphicsCommandList> DmlCommandRecorder::GetCommandList()
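The hunk above is the bug fix: on the fast path (taken when nothing has been recorded into the recorder's own command list) the caller's pre-built list is executed immediately, which consumes a fence value, so the current allocator's completion event must be moved to the queue's new next value. Below is a rough model of that arithmetic, assuming a queue whose fence is signaled once per submission; SimpleQueue and its members are invented for illustration and are not the ONNX Runtime CommandQueue API.

// Illustrative model of the fast-path fence bookkeeping (not the real ONNX Runtime API): a queue
// whose fence is signaled once per ExecuteCommandLists call, with the next value obtainable up front.
#include <cstdint>
#include <cstdio>

struct SimpleQueue
{
    uint64_t lastSignaledValue = 0;
    uint64_t GetNextCompletionValue() const { return lastSignaledValue + 1; }
    void ExecuteCommandLists() { ++lastSignaledValue; }  // submit + signal
};

int main()
{
    SimpleQueue queue;

    // Open(): the current allocator is tagged with the next completion value (1).
    uint64_t allocatorReusableAt = queue.GetNextCompletionValue();

    // Fast path: the caller's pre-recorded list is submitted directly and will signal value 1.
    queue.ExecuteCommandLists();

    // The fix: work later recorded with this allocator will be submitted at value 2 or beyond,
    // so the allocator must not be recycled as soon as value 1 completes.
    allocatorReusableAt = queue.GetNextCompletionValue();

    std::printf("allocator may be reused once the fence reaches %llu\n",
                static_cast<unsigned long long>(allocatorReusableAt));
    return 0;
}

On the slow path the same concern does not arise, because CloseAndExecute(commandList) submits the recorder's list and the caller's list together and Open() then tags a fresh allocator with a new completion event.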
@@ -334,7 +315,7 @@
 
     ID3D12CommandAllocator* allocator = m_commandAllocatorRing.GetNextAllocator(m_queue->GetNextCompletionEvent());
 
-    if (m_cachedCommandLists.empty())
+    if (!m_cachedCommandList)
     {
         ORT_THROW_IF_FAILED(m_d3dDevice->CreateCommandList(
             0,
@@ -345,47 +326,43 @@
     }
     else
     {
-        m_currentCommandList = m_cachedCommandLists.front();
-        m_cachedCommandLists.pop_front();
+        m_currentCommandList = m_cachedCommandList;
+        m_cachedCommandList = nullptr;
         ORT_THROW_IF_FAILED(m_currentCommandList->Reset(allocator, nullptr));
     }
 }
 
 void DmlCommandRecorder::CloseAndExecute()
 {
+    CloseAndExecute(nullptr);
+}
+
+void DmlCommandRecorder::CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* commandList)
+{
     ORT_THROW_IF_FAILED(m_currentCommandList->Close());
 
+    ID3D12GraphicsCommandList* commandListsToExecute[2] = {};
+    uint32_t commandListsToExecuteCount = 0;
+
     if (m_operationsRecordedInCurrentCommandList)
     {
-        m_pendingCommandLists.push_back(m_currentCommandList.Get());
-        m_pendingCommandListsCacheable.push_back(true);
+        commandListsToExecute[commandListsToExecuteCount++] = m_currentCommandList.Get();
     }
-    else
+
+    if (commandList)
     {
-        m_cachedCommandLists.push_back(m_currentCommandList.Get());
+        commandListsToExecute[commandListsToExecuteCount++] = commandList;
     }
 
-    m_currentCommandList = nullptr;
-    m_operationsRecordedInCurrentCommandList = false;
-
-    if (!m_pendingCommandLists.empty())
+    if (commandListsToExecuteCount > 0)
    {
-        // Close and execute the command list
         m_queue->ExecuteCommandLists(
-            gsl::span<ID3D12CommandList*>(reinterpret_cast<ID3D12CommandList**>(m_pendingCommandLists.data()), m_pendingCommandLists.size()));
-
-        assert(m_pendingCommandLists.size() == m_pendingCommandListsCacheable.size());
-        for (size_t i = 0; i < m_pendingCommandLists.size(); ++i)
-        {
-            if (m_pendingCommandListsCacheable[i])
-            {
-                m_cachedCommandLists.push_back(m_pendingCommandLists[i]);
-            }
-        }
-
-        m_pendingCommandLists.clear();
-        m_pendingCommandListsCacheable.clear();
+            gsl::span<ID3D12CommandList*>(reinterpret_cast<ID3D12CommandList**>(commandListsToExecute), commandListsToExecuteCount));
     }
 
+    m_cachedCommandList = m_currentCommandList;
+    m_currentCommandList = nullptr;
+    m_operationsRecordedInCurrentCommandList = false;
+
     // The descriptor heap must be set on the command list the next time it's opened.
     m_currentDescriptorHeap = nullptr;
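With the pending-list vectors gone, CloseAndExecute now submits at most two command lists, the recorder's own list (when it recorded anything) plus an optional caller-provided one, in a single ExecuteCommandLists call, and the recorder's list becomes the single cached list for the next Open(). Here is a compact sketch of that flow; FakeCommandList, SubmitToQueue, and CloseAndExecuteSketch are illustrative stand-ins, not the real D3D12 or recorder interfaces.

// Simplified sketch of the new CloseAndExecute batching (all types and functions here are
// illustrative stand-ins for the real D3D12 interfaces and recorder state).
#include <cstdint>

struct FakeCommandList { int id = 0; };

// Stand-in for CommandQueue::ExecuteCommandLists: a real queue would submit these in order.
static void SubmitToQueue(FakeCommandList* const* lists, uint32_t count)
{
    (void)lists;
    (void)count;
}

static void CloseAndExecuteSketch(
    FakeCommandList*& currentList,          // recorder's own, possibly empty, command list
    bool& operationsRecorded,
    FakeCommandList*& cachedList,           // single-entry cache replacing the old deque
    FakeCommandList* callerList)            // optional list passed in by ExecuteCommandList
{
    FakeCommandList* toExecute[2] = {};
    uint32_t toExecuteCount = 0;

    if (operationsRecorded)
    {
        toExecute[toExecuteCount++] = currentList;  // recorder's work goes first
    }
    if (callerList != nullptr)
    {
        toExecute[toExecuteCount++] = callerList;   // then the caller's pre-built list
    }

    if (toExecuteCount > 0)
    {
        SubmitToQueue(toExecute, toExecuteCount);   // single submission for both lists
    }

    // The recorder's list is always recycled through the one-element cache, even if it was empty.
    cachedList = currentList;
    currentList = nullptr;
    operationsRecorded = false;
}

Because nothing accumulates between flushes any more, HasUnsubmittedWork in the header below only needs to consult m_operationsRecordedInCurrentCommandList.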
onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommandRecorder.h

@@ -58,7 +58,7 @@
 
     bool HasUnsubmittedWork() override
     {
-        return m_operationsRecordedInCurrentCommandList || !m_pendingCommandLists.empty();
+        return m_operationsRecordedInCurrentCommandList;
     }
 
     // Forces the descriptor heap to be reset to D3D before executing future operations
@@ -68,7 +68,8 @@
     }
 
 private:
-
+    void CloseAndExecute(_In_opt_ ID3D12GraphicsCommandList* commandList);
+
     std::shared_ptr<CommandQueue> m_queue;
     ComPtr<ID3D12Device> m_d3dDevice;
     ComPtr<IDMLDevice> m_dmlDevice;
@@ -89,15 +90,8 @@
     ComPtr<ID3D12GraphicsCommandList> m_currentCommandList;
     bool m_operationsRecordedInCurrentCommandList = false;
 
-    // Command lists which have been batched up for execution. The values in
-    // m_pendingCommandListsCacheable indicate whether they can be moved into this
-    // class's cache after execution, versus if they belong to the caller and were
-    // passed to ExecuteCommandList.
-    std::vector<ComPtr<ID3D12GraphicsCommandList>> m_pendingCommandLists;
-    std::vector<bool> m_pendingCommandListsCacheable;
-
-    // A pool of cached command lists which may be re-used.
-    std::deque<ComPtr<ID3D12GraphicsCommandList>> m_cachedCommandLists;
+    // A cached command list which may be re-used.
+    ComPtr<ID3D12GraphicsCommandList> m_cachedCommandList;
 
     void SetDescriptorHeap(ID3D12DescriptorHeap* descriptorHeap);
 };