From 4ff9a852fb075cadda622f3f3f6fe1d4f16bf37f Mon Sep 17 00:00:00 2001
From: Chi Lo
Date: Tue, 5 Dec 2023 07:11:54 +0000
Subject: [PATCH] code refactor and add cleanup for dds_output_allocator_map

---
 .../tensorrt/tensorrt_execution_provider.cc | 23 +++++++++++++++--------
 .../tensorrt/tensorrt_execution_provider.h  |  2 +-
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index 184a60aa041fb..e75904ee0539c 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1095,7 +1095,7 @@ Status BindKernelOutput(Ort::KernelContext& ctx,
       // The allocation buffer holds the INT32 output data since TRT doesn't support INT64, only INT32.
       // So, we need to cast the data from INT32 to INT64 and then set INT64 output data to kernel context.
       SafeInt<int64_t> output_dim_size(1);
-      for (int i = 0; i < shape.size(); ++i) {
+      for (size_t i = 0; i < shape.size(); ++i) {
         if (shape[i] == 0) {
           output_dim_size = 1;
           break;
@@ -1104,9 +1104,9 @@ Status BindKernelOutput(Ort::KernelContext& ctx,
         }
       }
       scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, output_dim_size * sizeof(int64_t)));
-      buffers[output_name] = scratch_buffers.back().get();
-      cuda::Impl_Cast<int32_t, int64_t>(stream, reinterpret_cast<int32_t*>(allocator->getBuffer()), reinterpret_cast<int64_t*>(buffers[output_name]), output_dim_size);
-      Ort::ThrowOnError(Ort::GetApi().CreateTensorWithDataAsOrtValue(mem_info, buffers[output_name], output_dim_size * sizeof(int64_t),
+      auto data = scratch_buffers.back().get();
+      cuda::Impl_Cast<int32_t, int64_t>(stream, reinterpret_cast<int32_t*>(allocator->getBuffer()), reinterpret_cast<int64_t*>(data), output_dim_size);
+      Ort::ThrowOnError(Ort::GetApi().CreateTensorWithDataAsOrtValue(mem_info, data, output_dim_size * sizeof(int64_t),
                                                                      shape.data(), shape.size(), Ort::TypeToTensorType<int64_t>::type, &out));
       break;
     }
@@ -1114,7 +1114,7 @@ Status BindKernelOutput(Ort::KernelContext& ctx,
       // The allocation buffer holds the FLOAT output data since TRT doesn't support DOUBLE, only FLOAT.
       // So, we need to cast the data from FLOAT to DOUBLE and then set DOUBLE output data to kernel context.
       SafeInt<int64_t> output_dim_size(1);
-      for (int i = 0; i < shape.size(); ++i) {
+      for (size_t i = 0; i < shape.size(); ++i) {
         if (shape[i] == 0) {
           output_dim_size = 1;
           break;
@@ -1123,9 +1123,9 @@ Status BindKernelOutput(Ort::KernelContext& ctx,
         }
       }
       scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, output_dim_size * sizeof(double)));
-      buffers[output_name] = scratch_buffers.back().get();
-      cuda::Impl_Cast<float, double>(stream, reinterpret_cast<float*>(allocator->getBuffer()), reinterpret_cast<double*>(buffers[output_name]), output_dim_size);
-      Ort::ThrowOnError(Ort::GetApi().CreateTensorWithDataAsOrtValue(mem_info, buffers[output_name], output_dim_size * sizeof(double),
+      auto data = scratch_buffers.back().get();
+      cuda::Impl_Cast<float, double>(stream, reinterpret_cast<float*>(allocator->getBuffer()), reinterpret_cast<double*>(data), output_dim_size);
+      Ort::ThrowOnError(Ort::GetApi().CreateTensorWithDataAsOrtValue(mem_info, data, output_dim_size * sizeof(double),
                                                                      shape.data(), shape.size(), Ort::TypeToTensorType<double>::type, &out));
       break;
     }
@@ -1659,6 +1659,13 @@ TensorrtExecutionProvider::~TensorrtExecutionProvider() {
     // We can't get api inside destructor so that's why we duplicate the code here.
     delete static_cast<OrtAllocatorImpl*>(alloc_);
   }
+
+  for (auto iter_outer = dds_output_allocator_map_.begin(); iter_outer != dds_output_allocator_map_.end(); ++iter_outer) {
+    auto inner_map = iter_outer->second;
+    for (auto iter_inner = inner_map.begin(); iter_inner != inner_map.end(); ++iter_inner) {
+      delete iter_inner->second;
+    }
+  }
 }
 
 bool TensorrtExecutionProvider::IsGraphCaptureEnabled() const {
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index 47947ab2598f2..269c1cde31c50 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -320,7 +320,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   std::unordered_map<std::string, std::unordered_map<std::string, std::vector<std::vector<int64_t>>>> profile_opt_shapes_;
   std::unordered_map<std::string, ShapeRangesMap> input_shape_ranges_;  // The profile shape ranges that the engine is built with
   std::unordered_map<std::string, std::vector<nvinfer1::IOptimizationProfile*>> profiles_;
-  std::unordered_map<std::string, DDSOutputAllocatorMap> dds_output_allocator_map_;  // For DDS output tensor
+  std::unordered_map<std::string, DDSOutputAllocatorMap> dds_output_allocator_map_;  // For DDS output tensor. TODO: Make DDSOutputAllocatorMap use unique_ptr
 
   // for external stream, we need to create its cudnn/cublas handle before cuda EP enables cuda graph capture
   cudnnHandle_t external_cudnn_handle_ = nullptr;
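
A note on the TODO added in tensorrt_execution_provider.h: if DDSOutputAllocatorMap stored std::unique_ptr instead of raw OutputAllocator pointers, the manual delete loop added to the destructor above would be unnecessary. Below is a minimal sketch of that idea; the simplified OutputAllocator is a hypothetical stand-in for the real class (which derives from nvinfer1::IOutputAllocator), not the actual ORT type.

#include <memory>
#include <string>
#include <unordered_map>

// Hypothetical stand-in for the TRT EP's OutputAllocator.
struct OutputAllocator {
  void* buffer = nullptr;  // device memory handle in the real class
};

// Owning variant of DDSOutputAllocatorMap: values are unique_ptr,
// so destroying the map also destroys the allocators it holds.
using DDSOutputAllocatorMap =
    std::unordered_map<std::string, std::unique_ptr<OutputAllocator>>;

int main() {
  // Outer key: fused node name; inner key: output tensor name.
  std::unordered_map<std::string, DDSOutputAllocatorMap> dds_output_allocator_map;
  dds_output_allocator_map["fused_node_0"]["output_0"] =
      std::make_unique<OutputAllocator>();

  // No explicit cleanup loop is needed: when dds_output_allocator_map goes
  // out of scope, each unique_ptr deletes its OutputAllocator automatically.
  return 0;
}

The trade-off is that every site storing into or handing out raw pointers from the map would have to change, which is presumably why this patch keeps raw pointers and records the switch as a TODO.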