diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp index 5516fc62cdda0..2b4f3bb96537b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/CommandQueue.cpp @@ -46,12 +46,12 @@ namespace Dml return GpuEvent{ m_lastFenceValue + 1, m_fence }; } - void CommandQueue::QueueReference(IUnknown* object, bool waitForUnsubmittedWork) + void CommandQueue::QueueReference(IUnknown* object, bool waitForUnsubmittedWork) { - // If the CommandQueue is closing, then m_queuedReferences is being cleared -- it is not OK - // to queue additional references at this time, since those references would be leaked. This - // affects any objects in m_queuedReferences whose destructors indirectly call QueueReference; - // for example, an allocation from BucketizedBufferAllocator attempts to queue a reference + // If the CommandQueue is closing, then m_queuedReferences is being cleared -- it is not OK + // to queue additional references at this time, since those references would be leaked. This + // affects any objects in m_queuedReferences whose destructors indirectly call QueueReference; + // for example, an allocation from BucketizedBufferAllocator attempts to queue a reference // to its underlying D3D resource when freed. Furthermore, these references are unnecessary // since Close() already blocks for scheduled GPU work before clearing m_queuedReferences. if (!m_closing) @@ -68,7 +68,7 @@ namespace Dml m_queuedReferences.push_back(queuedReference); } } - + void CommandQueue::Close() { // Wait for flushed work: @@ -79,7 +79,7 @@ namespace Dml m_queuedReferences.clear(); m_closing = false; } - + void CommandQueue::ReleaseCompletedReferences() { uint64_t completedValue = GetFence()->GetCompletedValue(); @@ -89,5 +89,4 @@ namespace Dml } } - } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp index a894d0660d6ff..bc82c7ab1c44a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.cpp @@ -15,7 +15,7 @@ namespace Dml : m_queue(std::make_shared(queue)) , m_dmlRecorder(d3d12Device, dmlDevice, m_queue) { - ORT_THROW_IF_FAILED(dmlDevice->GetParentDevice(IID_GRAPHICS_PPV_ARGS(m_d3dDevice.GetAddressOf()))); + ORT_THROW_IF_FAILED(dmlDevice->GetParentDevice(IID_GRAPHICS_PPV_ARGS(m_d3dDevice.GetAddressOf()))); } void ExecutionContext::SetAllocator(std::weak_ptr allocator) @@ -78,14 +78,14 @@ namespace Dml ID3D12GraphicsCommandList* commandList, _Outptr_ ID3D12Fence** fence, _Out_ uint64_t* completionValue - ) + ) { assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); m_dmlRecorder.ExecuteCommandList(commandList, fence, completionValue); } - + void ExecutionContext::InitializeOperator( IDMLCompiledOperator* op, const DML_BINDING_DESC& persistentResourceBinding, @@ -110,7 +110,7 @@ namespace Dml } void ExecutionContext::AddUAVBarrier() - { + { assert(!m_closed); SetCommandRecorder(&m_dmlRecorder); @@ -173,9 +173,9 @@ namespace Dml m_currentRecorder = nullptr; SetCommandRecorder(&m_dmlRecorder); } - - void ExecutionContext::QueueReference(IUnknown* object) - { + + void ExecutionContext::QueueReference(IUnknown* object) + { assert(!m_closed); // If something has been recorded into a command list but not submitted yet, it means that the *next* fence // value is the one to signal completion. @@ -186,14 +186,14 @@ namespace Dml void ExecutionContext::Close() { assert(!m_closed); - + // Discard unflushed work and clear queued references. This prevents the circular reference: // Kernel --> ProviderImpl --> Context --> QueuedRefs --> Kernel m_queue->Close(); m_currentRecorder = nullptr; m_closed = true; } - + GpuEvent ExecutionContext::GetCurrentCompletionEvent() { assert(!m_closed); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h index b06f11a5efd0a..ac8d3ff875786 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionContext.h @@ -20,13 +20,13 @@ namespace Dml public: // Constructs an ExecutionContext that executes on the supplied queue. ExecutionContext( - ID3D12Device* d3d12Device, - IDMLDevice* dmlDevice, + ID3D12Device* d3d12Device, + IDMLDevice* dmlDevice, ID3D12CommandQueue* queue); void SetAllocator(std::weak_ptr allocator); - // Waits for flushed work, discards unflushed work, and discards associated references to + // Waits for flushed work, discards unflushed work, and discards associated references to // prevent circular references. Must be the last call on the object before destruction. void Close(); @@ -75,12 +75,12 @@ namespace Dml // Returns an event which will become signaled when everything submitted to the execution context thus far has // completed execution on the GPU, including work that has yet to be flushed to the queue. GpuEvent GetCurrentCompletionEvent(); - + // Adds a reference which will be released when queued GPU work is completed void QueueReference(IUnknown* object); // Release any accumulated references who corresponding GPU fence values have - // been reached. + // been reached. void ReleaseCompletedReferences(); D3D12_COMMAND_LIST_TYPE GetCommandListTypeForQueue() const; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index f4e90bf769cca..1cb1cee5ae884 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -204,7 +204,7 @@ namespace Dml D3D12_FEATURE_DATA_D3D12_OPTIONS19 options19 = {}; // The call may fail in which case the default value is false - d3d12Device->CheckFeatureSupport((D3D12_FEATURE) 48 /*D3D12_FEATURE_D3D12_OPTIONS19*/, &options19, sizeof(options19)); + d3d12Device->CheckFeatureSupport((D3D12_FEATURE) 48 /*D3D12_FEATURE_D3D12_OPTIONS19*/, &options19, sizeof(options19)); m_areCustomHeapsSupported = options19.ComputeOnlyCustomHeapSupported; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index ab932fb8a4367..5617bc7bdcac6 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -150,7 +150,7 @@ namespace Dml } STDMETHOD_(bool, IsMcdmDevice)() const noexcept final; - STDMETHOD_(bool, CustomHeapsSupported)() const noexcept final; + STDMETHOD_(bool, CustomHeapsSupported)() const noexcept final; STDMETHOD_(bool, MetacommandsEnabled)() const noexcept final; bool DynamicGraphFusionEnabled() const noexcept; diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index 1696763c74791..d38656c0a5e55 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -321,7 +321,7 @@ static std::optional ParsePerformancePreference(con } static std::optional ParseFilter(const ProviderOptions& provider_options) { - static const std::string Filter = "filter"; + static const std::string Filter = "device_filter"; static const std::string Any = "any"; static const std::string Gpu = "gpu"; static const std::string Npu = "npu"; diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index a72a0d105eefc..11d7f19319d6e 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -58,6 +58,10 @@ namespace perftest { "\t-q [CUDA only] use separate stream for copy. \n" "\t-z: Set denormal as zero. When turning on this option reduces latency dramatically, a model may have denormals.\n" "\t-i: Specify EP specific runtime options as key value pairs. Different runtime options available are: \n" + "\t [DML only] [performance_preference]: DML device performance preference, options: 'default', 'minimum_power', 'high_performance', \n" + "\t [DML only] [device_filter]: DML device filter, options: 'any', 'gpu', 'npu', \n" + "\t [DML only] [disable_metacommands]: Options: 'true', 'false', \n" + "\t [DML only] [enable_dynamic_graph_fusion]: Options: 'true', 'false', \n" "\t [OpenVINO only] [device_type]: Overrides the accelerator hardware type and precision with these values at runtime.\n" "\t [OpenVINO only] [device_id]: Selects a particular hardware device for inference.\n" "\t [OpenVINO only] [enable_npu_fast_compile]: Optionally enabled to speeds up the model's compilation on NPU device targets.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index c2dd81ec9f359..106b5ce43bceb 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -16,6 +16,10 @@ #include "providers.h" #include "TestCase.h" +#ifdef USE_DML +#include "core/providers/dml/dml_provider_factory.h" +#endif + #ifdef _WIN32 #define strdup _strdup #endif @@ -42,8 +46,8 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device const TestModelInfo& m) : rand_engine_(rd()), input_names_(m.GetInputCount()), input_names_str_(m.GetInputCount()), input_length_(m.GetInputCount()) { Ort::SessionOptions session_options; - const std::string& provider_name = performance_test_config.machine_config.provider_type_name; - if (provider_name == onnxruntime::kDnnlExecutionProvider) { + provider_name_ = performance_test_config.machine_config.provider_type_name; + if (provider_name_ == onnxruntime::kDnnlExecutionProvider) { #ifdef USE_DNNL // Generate provider options OrtDnnlProviderOptions dnnl_options; @@ -96,7 +100,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device #else ORT_THROW("DNNL is not supported in this build\n"); #endif - } else if (provider_name == onnxruntime::kCudaExecutionProvider) { + } else if (provider_name_ == onnxruntime::kCudaExecutionProvider) { #ifdef USE_CUDA const auto& api = Ort::GetApi(); OrtCUDAProviderOptionsV2* cuda_options; @@ -161,7 +165,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device #else ORT_THROW("CUDA is not supported in this build\n"); #endif - } else if (provider_name == onnxruntime::kTensorrtExecutionProvider) { + } else if (provider_name_ == onnxruntime::kTensorrtExecutionProvider) { #ifdef USE_TENSORRT const auto& api = Ort::GetApi(); OrtTensorRTProviderOptionsV2* tensorrt_options; @@ -215,7 +219,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device #else ORT_THROW("TensorRT is not supported in this build\n"); #endif - } else if (provider_name == onnxruntime::kOpenVINOExecutionProvider) { + } else if (provider_name_ == onnxruntime::kOpenVINOExecutionProvider) { #ifdef USE_OPENVINO #ifdef _MSC_VER std::string ov_string = ToUTF8String(performance_test_config.run_config.ep_runtime_config_string); @@ -251,7 +255,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device ov_options[key] = value; } else { ORT_THROW( - "[ERROR] [OpenVINO] You have selcted wrong configuration value for the key 'device_type'. " + "[ERROR] [OpenVINO] You have selected a wrong configuration value for the key 'device_type'. " "Select from 'CPU_FP32', 'CPU_FP16', 'GPU_FP32', 'GPU.0_FP32', 'GPU.1_FP32', 'GPU_FP16', " "'GPU.0_FP16', 'GPU.1_FP16' or from" " HETERO/MULTI/AUTO options available. \n"); @@ -305,7 +309,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device #else ORT_THROW("OpenVINO is not supported in this build\n"); #endif - } else if (provider_name == onnxruntime::kQnnExecutionProvider) { + } else if (provider_name_ == onnxruntime::kQnnExecutionProvider) { #ifdef USE_QNN #ifdef _MSC_VER std::string option_string = ToUTF8String(performance_test_config.run_config.ep_runtime_config_string); @@ -384,7 +388,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device #else ORT_THROW("QNN is not supported in this build\n"); #endif - } else if (provider_name == onnxruntime::kSnpeExecutionProvider) { + } else if (provider_name_ == onnxruntime::kSnpeExecutionProvider) { #ifdef USE_SNPE #ifdef _MSC_VER std::string option_string = ToUTF8String(performance_test_config.run_config.ep_runtime_config_string); @@ -436,7 +440,7 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); #else ORT_THROW("SNPE is not supported in this build\n"); #endif - } else if (provider_name == onnxruntime::kNnapiExecutionProvider) { + } else if (provider_name_ == onnxruntime::kNnapiExecutionProvider) { #ifdef USE_NNAPI uint32_t nnapi_flags = 0; #ifdef _MSC_VER @@ -464,22 +468,81 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); #else ORT_THROW("NNAPI is not supported in this build\n"); #endif - } else if (provider_name == onnxruntime::kCoreMLExecutionProvider) { + } else if (provider_name_ == onnxruntime::kCoreMLExecutionProvider) { #ifdef USE_COREML Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(session_options, 0)); #else ORT_THROW("COREML is not supported in this build\n"); #endif - } else if (provider_name == onnxruntime::kDmlExecutionProvider) { + } else if (provider_name_ == onnxruntime::kDmlExecutionProvider) { #ifdef USE_DML std::unordered_map dml_options; dml_options["performance_preference"] = "high_performance"; dml_options["device_filter"] = "gpu"; + dml_options["disable_metacommands"] = "false"; + dml_options["enable_dynamic_graph_fusion"] = "false"; +#ifdef _MSC_VER + std::string ov_string = ToUTF8String(performance_test_config.run_config.ep_runtime_config_string); +#else + std::string ov_string = performance_test_config.run_config.ep_runtime_config_string; +#endif + std::istringstream ss(ov_string); + std::string token; + while (ss >> token) { + if (token == "") { + continue; + } + auto pos = token.find("|"); + if (pos == std::string::npos || pos == 0 || pos == token.length()) { + ORT_THROW("[ERROR] [DML] Use a '|' to separate the key and value for the run-time option you are trying to use.\n"); + } + + auto key = token.substr(0, pos); + auto value = token.substr(pos + 1); + + if (key == "device_filter") { + std::set ov_supported_device_types = {"gpu", "npu"}; + if (ov_supported_device_types.find(value) != ov_supported_device_types.end()) { + dml_options[key] = value; + } else { + ORT_THROW( + "[ERROR] [DML] You have selected a wrong configuration value for the key 'device_filter'. " + "Select from 'gpu', or 'npu' \n"); + } + } else if (key == "performance_preference") { + std::set ov_supported_values = {"default", "high_performance", "minimal_power"}; + if (ov_supported_values.find(value) != ov_supported_values.end()) { + dml_options[key] = value; + } else { + ORT_THROW( + "[ERROR] [DML] You have selected a wrong configuration value for the key 'performance_preference'. " + "Select from 'default', 'high_performance' or 'minimal_power' \n"); + } + } else if (key == "disable_metacommands") { + std::set ov_supported_values = {"true", "True", "false", "False"}; + if (ov_supported_values.find(value) != ov_supported_values.end()) { + dml_options[key] = value; + } else { + ORT_THROW( + "[ERROR] [DML] You have selcted wrong value for the key 'disable_metacommands'. " + "Select from 'true' or 'false' \n"); + } + } else if (key == "enable_dynamic_graph_fusion") { + std::set ov_supported_values = {"true", "True", "false", "False"}; + if (ov_supported_values.find(value) != ov_supported_values.end()) { + dml_options[key] = value; + } else { + ORT_THROW( + "[ERROR] [DML] You have selcted wrong value for the key 'enable_dynamic_graph_fusion'. " + "Select from 'true' or 'false' \n"); + } + } + } session_options.AppendExecutionProvider("DML", dml_options); #else ORT_THROW("DML is not supported in this build\n"); #endif - } else if (provider_name == onnxruntime::kAclExecutionProvider) { + } else if (provider_name_ == onnxruntime::kAclExecutionProvider) { #ifdef USE_ACL Ort::ThrowOnError( OrtSessionOptionsAppendExecutionProvider_ACL(session_options, @@ -487,14 +550,14 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); #else ORT_THROW("Acl is not supported in this build\n"); #endif - } else if (provider_name == onnxruntime::kArmNNExecutionProvider) { + } else if (provider_name_ == onnxruntime::kArmNNExecutionProvider) { #ifdef USE_ARMNN Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ArmNN(session_options, performance_test_config.run_config.enable_cpu_mem_arena ? 1 : 0)); #else ORT_THROW("ArmNN is not supported in this build\n"); #endif - } else if (provider_name == onnxruntime::kRocmExecutionProvider) { + } else if (provider_name_ == onnxruntime::kRocmExecutionProvider) { #ifdef USE_ROCM OrtROCMProviderOptions rocm_options; rocm_options.miopen_conv_exhaustive_search = performance_test_config.run_config.cudnn_conv_algo; @@ -504,7 +567,7 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); #else ORT_THROW("ROCM is not supported in this build\n"); #endif - } else if (provider_name == onnxruntime::kMIGraphXExecutionProvider) { + } else if (provider_name_ == onnxruntime::kMIGraphXExecutionProvider) { #ifdef USE_MIGRAPHX Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(session_options, 0)); OrtROCMProviderOptions rocm_options; @@ -514,7 +577,7 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); #else ORT_THROW("MIGraphX is not supported in this build\n"); #endif - } else if (provider_name == onnxruntime::kXnnpackExecutionProvider) { + } else if (provider_name_ == onnxruntime::kXnnpackExecutionProvider) { #ifdef USE_XNNPACK session_options.AddConfigEntry(kOrtSessionOptionsConfigAllowIntraOpSpinning, "0"); session_options.AppendExecutionProvider( @@ -522,7 +585,7 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); #else ORT_THROW("Xnnpack is not supported in this build\n"); #endif - } else if (provider_name == onnxruntime::kVitisAIExecutionProvider) { + } else if (provider_name_ == onnxruntime::kVitisAIExecutionProvider) { #ifdef USE_VITISAI #ifdef _MSC_VER std::string option_string = ToUTF8String(performance_test_config.run_config.ep_runtime_config_string); @@ -550,7 +613,7 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); #else ORT_THROW("VitisAI is not supported in this build\n"); #endif - } else if (!provider_name.empty() && provider_name != onnxruntime::kCpuExecutionProvider) { + } else if (!provider_name_.empty() && provider_name_ != onnxruntime::kCpuExecutionProvider) { ORT_THROW("This backend is not included in perf test runner.\n"); } diff --git a/onnxruntime/test/perftest/ort_test_session.h b/onnxruntime/test/perftest/ort_test_session.h index 208e3de53b1d2..f1a4220ab325e 100644 --- a/onnxruntime/test/perftest/ort_test_session.h +++ b/onnxruntime/test/perftest/ort_test_session.h @@ -45,6 +45,7 @@ class OnnxRuntimeTestSession : public TestSession { std::vector input_names_; std::vector input_names_str_; const int input_length_; + std::string provider_name_; }; } // namespace perftest diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index 65646a7286719..327dda3e52ea4 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -270,8 +270,9 @@ std::unique_ptr DefaultCannExecutionProvider() { std::unique_ptr DefaultDmlExecutionProvider() { #ifdef USE_DML - if (auto factory = DMLProviderFactoryCreator::Create(0, false, false, false)) + if (auto factory = DMLProviderFactoryCreator::CreateFromOptions(nullptr, false, false)) { return factory->CreateProvider(); + } #endif return nullptr; }