Skip to content

Commit

Permalink
[DML EP] Enable DML Graph Serialization (#19505)
Browse files Browse the repository at this point in the history
### Description
This PR adds a feature to serialize all DML EP partitions into DML
currency individually for a given model. This feature can be
dynamically turned on by using the DML EP option
`ep.dml.enable_graph_serialization`.


### Motivation and Context
- Why is this change required? What problem does it solve?
Useful when a user wants to capture the DML EP-specific partitions into DML
currency to reduce the dependency on the framework.
<!-- - If it fixes an open issue, please link to the issue here. -->
  • Loading branch information
sumitsays authored Feb 26, 2024
1 parent 430a086 commit a956893
Show file tree
Hide file tree
Showing 41 changed files with 5,203 additions and 454 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,10 @@ namespace Windows::AI::MachineLearning::Adapter
};

// This is the counterpart to the MLOperatorGraphDesc ABI struct which owns its memory and uses containers.
// Either nodesAsOperatorDesc or nodesAsIDMLOperator can have non-zero size.
struct DmlGraphNodeCreateInfo
{
uint32_t nodeCount = 0;
std::vector<std::unique_ptr<AbstractOperatorDesc>> nodesAsOperatorDesc;

// TODO (jeffbloo): Remove this
std::vector<Microsoft::WRL::ComPtr<IDMLOperator>> nodesAsIDMLOperator;

std::vector<std::unique_ptr<AbstractOperatorDesc>> nodes;
std::vector<DML_INPUT_GRAPH_EDGE_DESC> inputEdges;
std::vector<DML_OUTPUT_GRAPH_EDGE_DESC> outputEdges;
std::vector<DML_INTERMEDIATE_GRAPH_EDGE_DESC> intermediateEdges;
Expand Down
570 changes: 570 additions & 0 deletions onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,17 @@ namespace DmlGraphFusionHelper
gsl::span<std::unique_ptr<GraphPartition>> partitions
);

template <size_t AllocatorSize>
void ConvertGraphDesc(
const Dml::GraphDescBuilder::GraphDesc& graphDesc,
_Out_ DML_GRAPH_DESC& dmlGraphDesc,
const uint32_t inputCount,
const uint32_t outputCount,
_Inout_ std::vector<DML_OPERATOR_GRAPH_NODE_DESC>& dmlOperatorGraphNodes,
IDMLDevice* device,
StackAllocator<AllocatorSize>& allocator,
const std::unordered_map<uint32_t, uint32_t>* serializedGraphInputIndexToSubgraphInputIndex,
const std::unordered_map<std::string_view, uint32_t>* serializedGraphLargeConstantNameToSubgraphInputIndex,
_Out_ DML_GRAPH_DESC& dmlGraphDesc,
_Inout_ std::vector<ComPtr<IDMLOperator>>& dmlOperators,
_Inout_ std::vector<DML_GRAPH_NODE_DESC>& dmlGraphNodes,
_Inout_ std::vector<DML_GRAPH_EDGE_DESC>& dmlInputEdges,
_Inout_ std::vector<DML_GRAPH_EDGE_DESC>& dmlOutputEdges,
Expand All @@ -69,17 +74,23 @@ namespace DmlGraphFusionHelper
Microsoft::WRL::ComPtr<IDMLCompiledOperator> TryCreateCompiledOperator(
const GraphDescBuilder::GraphDesc& graphDesc,
const onnxruntime::IndexedSubGraph& indexedSubGraph,
const ExecutionProviderImpl* providerImpl);
const ExecutionProviderImpl* providerImpl,
const std::unordered_map<uint32_t, uint32_t>* serializedGraphInputIndexToSubgraphInputIndex,
const std::unordered_map<std::string_view, uint32_t>* serializedGraphLargeConstantNameToSubgraphInputIndex);

void FusePartitionAndRegisterKernel(
const uint32_t partitionIndex,
onnxruntime::Graph& graph,
onnxruntime::KernelRegistry* registryForPartitionKernels,
const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& initializerNameToInitializerMap,
const ExecutionProviderImpl* providerImpl,
const onnxruntime::IndexedSubGraph& indexedSubGraph,
std::vector<uint8_t>&& isInputsUploadedByDmlEP,
const GraphDescBuilder::GraphDesc& graphDesc,
Microsoft::WRL::ComPtr<IDMLCompiledOperator> compiledExecutionPlanOperator);
Microsoft::WRL::ComPtr<IDMLCompiledOperator> compiledExecutionPlanOperator,
const bool graphSerializationEnabled,
const std::unordered_map<uint32_t, uint32_t>* serializedGraphInputIndexToSubgraphInputIndex = nullptr,
const std::unordered_map<std::string_view, uint32_t>* serializedGraphLargeConstantNameToSubgraphInputIndex = nullptr);

void RegisterDynamicKernel(
onnxruntime::Graph& graph,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,20 @@ namespace Dml
std::vector<uint8_t> isInputsUploadedByDmlEP;
GraphDescBuilder::GraphDesc graphDesc;
std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>> isInitializerTransferable;
std::vector<std::unique_ptr<std::byte[]>> smallConstantData; // Need to keep it alive for maintaining lifetime
std::unordered_map<uint32_t, uint32_t> serializedGraphInputIndexToSubgraphInputIndex;
std::unordered_map<std::string_view, uint32_t> serializedGraphLargeConstantNameToSubgraphInputIndex;
};
}

DmlGraphFusionTransformer::DmlGraphFusionTransformer(
const std::string& name,
const onnxruntime::IExecutionProvider* provider
const onnxruntime::IExecutionProvider* provider,
const bool graphSerializationEnabled
)
:onnxruntime::GraphTransformer(name),
m_providerImpl(static_cast<const ExecutionProvider*>(provider)->GetImpl())
m_providerImpl(static_cast<const ExecutionProvider*>(provider)->GetImpl()),
graphSerializationEnabled(graphSerializationEnabled)
{
}

Expand Down Expand Up @@ -227,23 +232,39 @@ namespace Dml

ComPtr<IDMLDevice> device;
ORT_THROW_IF_FAILED(m_providerImpl->GetDmlDevice(device.GetAddressOf()));
// This map will be used to transfer the initializer to D3D12 system heap memory.
// 'serializedDmlGraphDesc' will have constant inputs as intermediate edges, which is why
// we need a mapping between the intermediate edge index and the input arg index of
// indexedSubGraph (a given partition).
// For example, given an intermediate edge index idx:
// indexedSubGraphInputArgIdx = serializedGraphInputIndexToSubgraphInputIndex[idx];
// corresponding constant tensor = initializerNameToInitializerMap[indexedSubGraph.GetMetaDef()->inputs[indexedSubGraphInputArgIdx]]
// The intermediate edge index is used as the key because the same constant tensor can be
// used by multiple nodes.
std::unordered_map<uint32_t, uint32_t> serializedGraphInputIndexToSubgraphInputIndex;
std::unordered_map<std::string_view, uint32_t> serializedGraphLargeConstantNameToSubgraphInputIndex;
std::vector<std::unique_ptr<std::byte[]>> smallConstantData;
GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc(
isInputsUploadedByDmlEP.data(),
isInputsUploadedByDmlEP.size(),
isInitializerTransferable,
partitionNodePropsMap,
device.Get(),
m_providerImpl,
modelPath,
subgraphNodes,
subgraphInputs,
subgraphOutputs);
subgraphOutputs,
serializedGraphInputIndexToSubgraphInputIndex,
serializedGraphLargeConstantNameToSubgraphInputIndex,
smallConstantData);

// Compile the operator
auto compiledPartition = DmlGraphFusionHelper::TryCreateCompiledOperator(
graphDesc,
indexedSubGraph,
m_providerImpl);
m_providerImpl,
&serializedGraphInputIndexToSubgraphInputIndex,
&serializedGraphLargeConstantNameToSubgraphInputIndex);

if (!compiledPartition)
{
Expand All @@ -264,27 +285,35 @@ namespace Dml
compiledPartitionInfo->isInputsUploadedByDmlEP = std::move(isInputsUploadedByDmlEP);
compiledPartitionInfo->graphDesc = std::move(graphDesc);
compiledPartitionInfo->isInitializerTransferable = std::move(isInitializerTransferable);
compiledPartitionInfo->smallConstantData = std::move(smallConstantData);
compiledPartitionInfo->serializedGraphInputIndexToSubgraphInputIndex = std::move(serializedGraphInputIndexToSubgraphInputIndex);
compiledPartitionInfo->serializedGraphLargeConstantNameToSubgraphInputIndex = std::move(serializedGraphLargeConstantNameToSubgraphInputIndex);
compiledPartitionInfos[partitionIndex] = std::move(compiledPartitionInfo);
}
}
}
}
while (!additionalSplittingNodes.empty());

uint32_t partitionIndex = 0;
for (auto&& compiledPartitionInfo : compiledPartitionInfos)
{
// Null compiled operators were not DML partitions
if (compiledPartitionInfo)
{
DmlGraphFusionHelper::FusePartitionAndRegisterKernel(
partitionIndex++,
graph,
m_providerImpl->GetKernelRegistry().get(),
compiledPartitionInfo->isInitializerTransferable,
m_providerImpl,
compiledPartitionInfo->indexedSubGraph,
std::move(compiledPartitionInfo->isInputsUploadedByDmlEP),
compiledPartitionInfo->graphDesc,
compiledPartitionInfo->compiledOperator);
compiledPartitionInfo->compiledOperator,
graphSerializationEnabled,
&compiledPartitionInfo->serializedGraphInputIndexToSubgraphInputIndex,
&compiledPartitionInfo->serializedGraphLargeConstantNameToSubgraphInputIndex);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ class DmlGraphFusionTransformer : public onnxruntime::GraphTransformer
public:
DmlGraphFusionTransformer(
const std::string& name,
const onnxruntime::IExecutionProvider* provider
const onnxruntime::IExecutionProvider* provider,
const bool graphSerializationEnabled
);

public:
Expand All @@ -38,5 +39,6 @@ class DmlGraphFusionTransformer : public onnxruntime::GraphTransformer

private:
const ExecutionProviderImpl* m_providerImpl = nullptr;
const bool graphSerializationEnabled = false;
};
}
Loading

0 comments on commit a956893

Please sign in to comment.