Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DML EP] Enable DML Graph Serialization #19505

Merged
merged 4 commits into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,12 @@ if (onnxruntime_USE_DML)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_DML=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_DML=1)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES dml)
if(onnxruntime_DML_ENABLE_SERIALIZATION)
list(APPEND ORT_PROVIDER_FLAGS -DDML_ENABLE_SERIALIZATION=1)
if(onnxruntime_DML_ENABLE_SERIALIZATION_DEBUG)
sumitsays marked this conversation as resolved.
Show resolved Hide resolved
list(APPEND ORT_PROVIDER_FLAGS -DDML_ENABLE_SERIALIZATION_DEBUG=1)
endif()
endif()
endif()
if (onnxruntime_USE_MIGRAPHX)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_MIGRAPHX=1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,10 @@ namespace Windows::AI::MachineLearning::Adapter
};

// This is the counterpart to the MLOperatorGraphDesc ABI struct which owns its memory and uses containers.
// Either nodesAsOperatorDesc or nodesAsIDMLOperator can have non-zero size.
struct DmlGraphNodeCreateInfo
{
uint32_t nodeCount = 0;
std::vector<std::unique_ptr<AbstractOperatorDesc>> nodesAsOperatorDesc;

// TODO (jeffbloo): Remove this
std::vector<Microsoft::WRL::ComPtr<IDMLOperator>> nodesAsIDMLOperator;

std::vector<std::unique_ptr<AbstractOperatorDesc>> nodes;
std::vector<DML_INPUT_GRAPH_EDGE_DESC> inputEdges;
std::vector<DML_OUTPUT_GRAPH_EDGE_DESC> outputEdges;
std::vector<DML_INTERMEDIATE_GRAPH_EDGE_DESC> intermediateEdges;
Expand Down
570 changes: 570 additions & 0 deletions onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,17 @@ namespace DmlGraphFusionHelper
gsl::span<std::unique_ptr<GraphPartition>> partitions
);

template <size_t AllocatorSize>
void ConvertGraphDesc(
const Dml::GraphDescBuilder::GraphDesc& graphDesc,
_Out_ DML_GRAPH_DESC& dmlGraphDesc,
const uint32_t inputCount,
const uint32_t outputCount,
_Inout_ std::vector<DML_OPERATOR_GRAPH_NODE_DESC>& dmlOperatorGraphNodes,
IDMLDevice* device,
StackAllocator<AllocatorSize>& allocator,
const std::unordered_map<uint32_t, uint32_t>* serializedGraphInputIndexToSubgraphInputIndex,
const std::unordered_map<std::string_view, uint32_t>* serializedGraphLargeConstantNameToSubgraphInputIndex,
_Out_ DML_GRAPH_DESC& dmlGraphDesc,
_Inout_ std::vector<ComPtr<IDMLOperator>>& dmlOperators,
_Inout_ std::vector<DML_GRAPH_NODE_DESC>& dmlGraphNodes,
_Inout_ std::vector<DML_GRAPH_EDGE_DESC>& dmlInputEdges,
_Inout_ std::vector<DML_GRAPH_EDGE_DESC>& dmlOutputEdges,
Expand All @@ -69,17 +74,22 @@ namespace DmlGraphFusionHelper
Microsoft::WRL::ComPtr<IDMLCompiledOperator> TryCreateCompiledOperator(
const GraphDescBuilder::GraphDesc& graphDesc,
const onnxruntime::IndexedSubGraph& indexedSubGraph,
const ExecutionProviderImpl* providerImpl);
const ExecutionProviderImpl* providerImpl,
const std::unordered_map<uint32_t, uint32_t>* serializedGraphInputIndexToSubgraphInputIndex,
const std::unordered_map<std::string_view, uint32_t>* serializedGraphLargeConstantNameToSubgraphInputIndex);

void FusePartitionAndRegisterKernel(
const uint32_t partitionIndex,
onnxruntime::Graph& graph,
onnxruntime::KernelRegistry* registryForPartitionKernels,
const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& initializerNameToInitializerMap,
const ExecutionProviderImpl* providerImpl,
const onnxruntime::IndexedSubGraph& indexedSubGraph,
std::vector<uint8_t>&& isInputsUploadedByDmlEP,
const GraphDescBuilder::GraphDesc& graphDesc,
Microsoft::WRL::ComPtr<IDMLCompiledOperator> compiledExecutionPlanOperator);
Microsoft::WRL::ComPtr<IDMLCompiledOperator> compiledExecutionPlanOperator,
const std::unordered_map<uint32_t, uint32_t>* serializedGraphInputIndexToSubgraphInputIndex = nullptr,
const std::unordered_map<std::string_view, uint32_t>* serializedGraphLargeConstantNameToSubgraphInputIndex = nullptr);

void RegisterDynamicKernel(
onnxruntime::Graph& graph,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ namespace Dml
std::vector<uint8_t> isInputsUploadedByDmlEP;
GraphDescBuilder::GraphDesc graphDesc;
std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>> isInitializerTransferable;
std::vector<std::unique_ptr<std::byte[]>> smallConstantData; // Need to keep it alive for maintaining lifetime
std::unordered_map<uint32_t, uint32_t> serializedGraphInputIndexToSubgraphInputIndex;
std::unordered_map<std::string_view, uint32_t> serializedGraphLargeConstantNameToSubgraphInputIndex;
};
}

Expand Down Expand Up @@ -227,23 +230,39 @@ namespace Dml

ComPtr<IDMLDevice> device;
ORT_THROW_IF_FAILED(m_providerImpl->GetDmlDevice(device.GetAddressOf()));
// This map is used to transfer the initializers to D3D12 system heap memory.
// 'serializedDmlGraphDesc' will have constant inputs as intermediate edges, which is why
// we need a mapping between the intermediate edge index and the input arg index of
// indexedSubGraph (a given partition).
// For example, if the intermediate edge index is idx, then
//   indexedSubGraphInputArgIdx = serializedGraphInputIndexToSubgraphInputIndex[idx];
//   corresponding constant tensor = initializerNameToInitializerMap[indexedSubGraph.GetMetaDef()->inputs[indexedSubGraphInputArgIdx]]
// The intermediate edge index is used as the key because the same constant tensor can be used by
// multiple nodes.
std::unordered_map<uint32_t, uint32_t> serializedGraphInputIndexToSubgraphInputIndex;
std::unordered_map<std::string_view, uint32_t> serializedGraphLargeConstantNameToSubgraphInputIndex;
std::vector<std::unique_ptr<std::byte[]>> smallConstantData;
GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc(
isInputsUploadedByDmlEP.data(),
isInputsUploadedByDmlEP.size(),
isInitializerTransferable,
partitionNodePropsMap,
device.Get(),
m_providerImpl,
modelPath,
subgraphNodes,
subgraphInputs,
subgraphOutputs);
subgraphOutputs,
serializedGraphInputIndexToSubgraphInputIndex,
serializedGraphLargeConstantNameToSubgraphInputIndex,
smallConstantData);

// Compile the operator
auto compiledPartition = DmlGraphFusionHelper::TryCreateCompiledOperator(
graphDesc,
indexedSubGraph,
m_providerImpl);
m_providerImpl,
&serializedGraphInputIndexToSubgraphInputIndex,
&serializedGraphLargeConstantNameToSubgraphInputIndex);

if (!compiledPartition)
{
Expand All @@ -264,27 +283,34 @@ namespace Dml
compiledPartitionInfo->isInputsUploadedByDmlEP = std::move(isInputsUploadedByDmlEP);
compiledPartitionInfo->graphDesc = std::move(graphDesc);
compiledPartitionInfo->isInitializerTransferable = std::move(isInitializerTransferable);
compiledPartitionInfo->smallConstantData = std::move(smallConstantData);
compiledPartitionInfo->serializedGraphInputIndexToSubgraphInputIndex = std::move(serializedGraphInputIndexToSubgraphInputIndex);
compiledPartitionInfo->serializedGraphLargeConstantNameToSubgraphInputIndex = std::move(serializedGraphLargeConstantNameToSubgraphInputIndex);
compiledPartitionInfos[partitionIndex] = std::move(compiledPartitionInfo);
}
}
}
}
while (!additionalSplittingNodes.empty());

uint32_t partitionIndex = 0;
for (auto&& compiledPartitionInfo : compiledPartitionInfos)
{
// Null compiled operators were not DML partitions
if (compiledPartitionInfo)
{
DmlGraphFusionHelper::FusePartitionAndRegisterKernel(
partitionIndex++,
graph,
m_providerImpl->GetKernelRegistry().get(),
compiledPartitionInfo->isInitializerTransferable,
m_providerImpl,
compiledPartitionInfo->indexedSubGraph,
std::move(compiledPartitionInfo->isInputsUploadedByDmlEP),
compiledPartitionInfo->graphDesc,
compiledPartitionInfo->compiledOperator);
compiledPartitionInfo->compiledOperator,
&compiledPartitionInfo->serializedGraphInputIndexToSubgraphInputIndex,
&compiledPartitionInfo->serializedGraphLargeConstantNameToSubgraphInputIndex);
}
}

Expand Down
Loading
Loading