From 1a60d8a0409ee00b6d3fd512817a07db8922a79b Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov
Date: Wed, 2 Oct 2024 10:06:54 -0700
Subject: [PATCH] Add DML on-Device copy

---
 onnxruntime/core/session/lora_adapters.cc | 33 +++++++++++++++++
 onnxruntime/test/lora/lora_test.cc        | 45 +++++++++++++++++++++--
 2 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/session/lora_adapters.cc b/onnxruntime/core/session/lora_adapters.cc
index 466edce187a56..b40768945bb7c 100644
--- a/onnxruntime/core/session/lora_adapters.cc
+++ b/onnxruntime/core/session/lora_adapters.cc
@@ -16,6 +16,13 @@
 #include "core/providers/cuda/cuda_provider_factory.h"
 #endif
 
+#ifdef USE_DML
+#include "core/framework/execution_provider.h"
+#include "core/session/abi_session_options_impl.h"
+#include "core/providers/dml/dml_provider_factory_creator.h"
+#include "core/providers/dml/dml_provider_factory.h"
+#endif
+
 namespace onnxruntime {
 
 #ifdef USE_CUDA
@@ -63,6 +70,32 @@ static std::unique_ptr<IDataTransfer> GetDataTransfer(const OrtMemoryInfo& mem_info) {
     if (cuda_provider_info != nullptr) {
       data_transfer = cuda_provider_info->CreateGPUDataTransfer();
     }
+#endif
+  } else if (strcmp(mem_info.name, onnxruntime::DML) == 0) {
+#ifdef USE_DML
+    auto ep_factory = onnxruntime::DMLProviderFactoryCreator::Create(ConfigOptions{}, 0, false, false, false);
+    auto dml_ep = ep_factory->CreateProvider();
+    data_transfer = dml_ep->GetDataTransfer();
+
+    // constexpr uint32_t dml_api_version = 0;  // This is ignored
+    // const void* dml_api = nullptr;
+    // auto* ort_status = OrtApis::GetExecutionProviderApi("DML", dml_api_version, &dml_api);
+    // if (ort_status == nullptr) {
+    //   const auto* dml_provider_api = reinterpret_cast<const OrtDmlApi*>(dml_api);
+    //   OrtSessionOptions sess_options;
+    //   OrtDmlDeviceOptions dml_dev_options{OrtDmlPerformancePreference::Default, OrtDmlDeviceFilter::Gpu};
+    //   ort_status = dml_provider_api->SessionOptionsAppendExecutionProvider_DML2(&sess_options, &dml_dev_options);
+    //   if (ort_status) {
+    //     Ort::Status status(ort_status);
+    //     ORT_THROW(status.GetErrorMessage());
+    //   }
+    //   ORT_ENFORCE(sess_options.provider_factories.size() == 1, "Expecting a single factory");
+    //   auto dml_ep = sess_options.provider_factories[0]->CreateProvider();
+    //   data_transfer = dml_ep->GetDataTransfer();
+    // } else {
+    //   Ort::Status status(ort_status);
+    //   ORT_THROW(status.GetErrorMessage());
+    // }
 #endif
   }
 
diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc
index e8291a36447ca..acacc806778a8 100644
--- a/onnxruntime/test/lora/lora_test.cc
+++ b/onnxruntime/test/lora/lora_test.cc
@@ -200,13 +200,13 @@ TEST(LoraAdapterTest, Load) {
 }
 
 #ifdef USE_CUDA
-TEST(LoraAdapterTest, VerifyDeviceCopy) {
+TEST(LoraAdapterTest, VerifyCudaDeviceCopy) {
   auto cpu_ep = DefaultCpuExecutionProvider();
   auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0];
 
   auto cuda_ep = DefaultCudaExecutionProvider();
   auto cuda_allocator = cuda_ep->CreatePreferredAllocators()[0];
-  auto gpu_transfer = cuda_ep->GetDataTransfer();
+  auto dml_transfer = cuda_ep->GetDataTransfer();
 
   auto test_params = GenerateTestParameters<float>()();
   lora::LoraAdapter adapter(std::move(cuda_allocator));
@@ -222,9 +222,9 @@
     ASSERT_EQ(tensor_cpu.Shape().Size(), tensor_device.Shape().Size());
 
     Tensor copy(tensor_cpu.DataType(), tensor_cpu.Shape(), cpu_allocator);
-    ASSERT_TRUE(gpu_transfer->CanCopy(tensor_device.Location().device,
+    ASSERT_TRUE(dml_transfer->CanCopy(tensor_device.Location().device,
                                       copy.Location().device));
-    ASSERT_STATUS_OK(gpu_transfer->CopyTensor(tensor_device, copy));
+    ASSERT_STATUS_OK(dml_transfer->CopyTensor(tensor_device, copy));
 
     auto expected_span = tensor_cpu.DataAsSpan<float>();
     auto copy_span = copy.DataAsSpan<float>();
@@ -233,5 +233,42 @@
     ASSERT_EQ(expected_span, copy_span);
   }
 }
 #endif
+
+#ifdef USE_DML
+TEST(LoraAdapterTest, VerifyDmlDeviceCopy) {
+  auto cpu_ep = DefaultCpuExecutionProvider();
+  auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0];
+
+  auto dml_ep = DefaultDmlExecutionProvider();
+  auto dml_allocator = dml_ep->CreatePreferredAllocators()[0];
+
+  auto dml_transfer = dml_ep->GetDataTransfer();
+
+  auto test_params = GenerateTestParameters<float>()();
+  lora::LoraAdapter adapter(std::move(dml_allocator));
+  adapter.Load(std::move(test_params));
+
+  auto [begin, end] = adapter.GetParamIterators();
+  for (; begin != end; ++begin) {
+    const auto& [_, param] = *begin;
+    const auto& tensor_device = param.GetDeviceOrMapped().Get<Tensor>();
+    ASSERT_EQ(0, strcmp(tensor_device.Location().name, onnxruntime::DML));
+
+    const auto& tensor_cpu = param.GetMapped().Get<Tensor>();
+    ASSERT_EQ(tensor_cpu.Shape().Size(), tensor_device.Shape().Size());
+
+    Tensor copy(tensor_cpu.DataType(), tensor_cpu.Shape(), cpu_allocator);
+    ASSERT_TRUE(dml_transfer->CanCopy(tensor_device.Location().device,
+                                      copy.Location().device));
+    ASSERT_STATUS_OK(dml_transfer->CopyTensor(tensor_device, copy));
+
+    auto expected_span = tensor_cpu.DataAsSpan<float>();
+    auto copy_span = copy.DataAsSpan<float>();
+
+    ASSERT_EQ(expected_span, copy_span);
+  }
+}
+#endif
+
 }  // namespace test
 }  // namespace onnxruntime