From f6791928b5ce01d88d88a82bb56017dc194c6f80 Mon Sep 17 00:00:00 2001 From: Randy Shuai Date: Wed, 3 Jan 2024 17:07:46 -0800 Subject: [PATCH 1/8] surface cuda ep options --- .../core/providers/cuda/cuda_resource.h | 21 ++++- .../providers/cuda/cuda_execution_provider.cc | 3 +- .../core/providers/cuda/cuda_stream_handle.cc | 78 ++++++++++++++++--- .../core/providers/cuda/cuda_stream_handle.h | 8 +- 4 files changed, 96 insertions(+), 14 deletions(-) diff --git a/include/onnxruntime/core/providers/cuda/cuda_resource.h b/include/onnxruntime/core/providers/cuda/cuda_resource.h index 8c3ed46ade6a1..9f620c4f43877 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_resource.h +++ b/include/onnxruntime/core/providers/cuda/cuda_resource.h @@ -3,11 +3,30 @@ #include "core/providers/resource.h" -#define ORT_CUDA_RESOUCE_VERSION 2 +#define ORT_CUDA_RESOUCE_VERSION 3 enum CudaResource : int { cuda_stream_t = cuda_resource_offset, cudnn_handle_t, cublas_handle_t, deferred_cpu_allocator_t, + // below are cuda ep options + device_id_t, + has_user_compute_stream_t, + gpu_mem_limit_t, + arena_extend_strategy_t, + cudnn_conv_algo_search_t, + do_copy_in_default_stream_t, + gpu_external_alloc_t, + gpu_external_free_t, + gpu_external_empty_cache_t, + cudnn_conv_use_max_workspace_t, + enable_cuda_graph_t, + cudnn_conv1d_pad_to_nc1d_t, + tunable_op_enable_t, + tunable_op_tuning_enable_t, + tunable_op_max_tuning_duration_ms_t, + enable_skip_layer_norm_strict_mode_t, + prefer_nhwc_t, + use_ep_level_unified_stream_t }; \ No newline at end of file diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index d8a0792209b0f..f7b23f12e8193 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -2465,7 +2465,8 @@ void CUDAExecutionProvider::RegisterStreamHandlers(IStreamCommandHandleRegistry& stream_, use_ep_level_unified_stream_, GetPerThreadContext().CudnnHandle(), - GetPerThreadContext().CublasHandle()); + GetPerThreadContext().CublasHandle(), + info_); } OrtDevice CUDAExecutionProvider::GetOrtDeviceByMemType(OrtMemType mem_type) const { diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index 9aad461b1d1c1..9cbba166618cc 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -62,11 +62,13 @@ CudaStream::CudaStream(cudaStream_t stream, bool release_cpu_buffer_on_cuda_stream, bool own_flag, cudnnHandle_t external_cudnn_handle, - cublasHandle_t external_cublas_handle) : Stream(stream, device), - own_stream_(own_flag), - cpu_allocator_(cpu_allocator), - release_cpu_buffer_on_cuda_stream_(release_cpu_buffer_on_cuda_stream), - deferred_cpu_allocator_(*this) { + cublasHandle_t external_cublas_handle, + const CUDAExecutionProviderInfo& ep_info) : Stream(stream, device), + own_stream_(own_flag), + cpu_allocator_(cpu_allocator), + release_cpu_buffer_on_cuda_stream_(release_cpu_buffer_on_cuda_stream), + deferred_cpu_allocator_(*this), + ep_info_(ep_info) { if (own_flag) { CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_)); CUBLAS_CALL_THROW(cublasSetStream(cublas_handle_, stream)); @@ -185,6 +187,60 @@ void* CudaStream::GetResource(int version, int id) const { case CudaResource::deferred_cpu_allocator_t: return const_cast(&deferred_cpu_allocator_); break; + case CudaResource::device_id_t: + return reinterpret_cast(ep_info_.device_id); + break; + case CudaResource::has_user_compute_stream_t: + return reinterpret_cast(ep_info_.has_user_compute_stream); + break; + case CudaResource::gpu_mem_limit_t: + return reinterpret_cast(ep_info_.gpu_mem_limit); + break; + case CudaResource::arena_extend_strategy_t: + return reinterpret_cast(ep_info_.gpu_mem_limit); + break; + case CudaResource::cudnn_conv_algo_search_t: + return reinterpret_cast(ep_info_.cudnn_conv_algo_search); + break; + case CudaResource::do_copy_in_default_stream_t: + return reinterpret_cast(ep_info_.do_copy_in_default_stream); + break; + case CudaResource::gpu_external_alloc_t: + return reinterpret_cast(ep_info_.external_allocator_info.alloc); + break; + case CudaResource::gpu_external_free_t: + return reinterpret_cast(ep_info_.external_allocator_info.free); + break; + case CudaResource::gpu_external_empty_cache_t: + return reinterpret_cast(ep_info_.external_allocator_info.empty_cache); + break; + case CudaResource::cudnn_conv_use_max_workspace_t: + return reinterpret_cast(ep_info_.cudnn_conv_use_max_workspace); + break; + case CudaResource::enable_cuda_graph_t: + return reinterpret_cast(ep_info_.enable_cuda_graph); + break; + case CudaResource::cudnn_conv1d_pad_to_nc1d_t: + return reinterpret_cast(ep_info_.cudnn_conv1d_pad_to_nc1d); + break; + case CudaResource::tunable_op_enable_t: + return reinterpret_cast(ep_info_.tunable_op.enable); + break; + case CudaResource::tunable_op_tuning_enable_t: + return reinterpret_cast(ep_info_.tunable_op.tuning_enable); + break; + case CudaResource::tunable_op_max_tuning_duration_ms_t: + return reinterpret_cast(ep_info_.tunable_op.max_tuning_duration_ms); + break; + case CudaResource::enable_skip_layer_norm_strict_mode_t: + return reinterpret_cast(ep_info_.enable_skip_layer_norm_strict_mode); + break; + case CudaResource::prefer_nhwc_t: + return reinterpret_cast(ep_info_.prefer_nhwc); + break; + case CudaResource::use_ep_level_unified_stream_t: + return reinterpret_cast(ep_info_.use_ep_level_unified_stream); + break; default: break; } @@ -207,26 +263,28 @@ void RegisterCudaStreamHandles(IStreamCommandHandleRegistry& stream_handle_regis cudaStream_t external_stream, bool use_existing_stream, cudnnHandle_t external_cudnn_handle, - cublasHandle_t external_cublas_handle) { + cublasHandle_t external_cublas_handle, + const CUDAExecutionProviderInfo& ep_info) { // wait cuda notification on cuda ep stream_handle_registry.RegisterWaitFn(device_type, device_type, WaitCudaNotificationOnDevice); // wait cuda notification on cpu ep stream_handle_registry.RegisterWaitFn(device_type, OrtDevice::CPU, WaitCudaNotificationOnHost); if (!use_existing_stream) - stream_handle_registry.RegisterCreateStreamFn(device_type, [cpu_allocator, release_cpu_buffer_on_cuda_stream](const OrtDevice& device) { + stream_handle_registry.RegisterCreateStreamFn(device_type, [cpu_allocator, release_cpu_buffer_on_cuda_stream, ep_info](const OrtDevice& device) { CUDA_CALL_THROW(cudaSetDevice(device.Id())); cudaStream_t stream = nullptr; CUDA_CALL_THROW(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); // CUDA_CALL_THROW(cudaStreamCreate(&stream)); - return std::make_unique(stream, device, cpu_allocator, release_cpu_buffer_on_cuda_stream, true, nullptr, nullptr); + return std::make_unique(stream, device, cpu_allocator, release_cpu_buffer_on_cuda_stream, true, nullptr, nullptr, ep_info); }); else stream_handle_registry.RegisterCreateStreamFn(device_type, [cpu_allocator, release_cpu_buffer_on_cuda_stream, external_stream, external_cudnn_handle, - external_cublas_handle](const OrtDevice& device) { - return std::make_unique(external_stream, device, cpu_allocator, release_cpu_buffer_on_cuda_stream, false, external_cudnn_handle, external_cublas_handle); + external_cublas_handle, + ep_info](const OrtDevice& device) { + return std::make_unique(external_stream, device, cpu_allocator, release_cpu_buffer_on_cuda_stream, false, external_cudnn_handle, external_cublas_handle, ep_info); }); } diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.h b/onnxruntime/core/providers/cuda/cuda_stream_handle.h index 917702fae08f1..a2b5a5bc32f4f 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.h +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.h @@ -6,6 +6,7 @@ #include "core/providers/cuda/shared_inc/cuda_utils.h" #include "core/providers/cuda/shared_inc/cuda_call.h" #include "core/framework/stream_handles.h" +#include "core/providers/cuda/cuda_execution_provider_info.h" namespace onnxruntime { @@ -23,7 +24,8 @@ struct CudaStream : Stream { bool release_cpu_buffer_on_cuda_stream, bool own_flag, cudnnHandle_t external_cudnn_handle, - cublasHandle_t external_cublass_handle); + cublasHandle_t external_cublass_handle, + const CUDAExecutionProviderInfo& ep_info); ~CudaStream(); @@ -50,6 +52,7 @@ struct CudaStream : Stream { AllocatorPtr cpu_allocator_; bool release_cpu_buffer_on_cuda_stream_{true}; DeferredCpuAllocator deferred_cpu_allocator_; + const CUDAExecutionProviderInfo& ep_info_; }; void RegisterCudaStreamHandles(IStreamCommandHandleRegistry& stream_handle_registry, @@ -59,6 +62,7 @@ void RegisterCudaStreamHandles(IStreamCommandHandleRegistry& stream_handle_regis cudaStream_t external_stream, bool use_existing_stream, cudnnHandle_t external_cudnn_handle, - cublasHandle_t external_cublass_handle); + cublasHandle_t external_cublass_handle, + const CUDAExecutionProviderInfo& ep_info); void WaitCudaNotificationOnDevice(Stream& stream, synchronize::Notification& notification); } // namespace onnxruntime From 372b80b55da6daff3673432112fe5d6ff22bf362 Mon Sep 17 00:00:00 2001 From: Randy Shuai Date: Thu, 4 Jan 2024 15:43:50 -0800 Subject: [PATCH 2/8] add cxx wrapper --- .../core/providers/cuda/cuda_context.h | 79 +++++++++++++------ .../core/providers/cuda/cuda_resource.h | 12 +-- .../core/providers/cuda/cuda_stream_handle.cc | 2 +- onnxruntime/core/session/custom_ops.cc | 3 - 4 files changed, 60 insertions(+), 36 deletions(-) diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h b/include/onnxruntime/core/providers/cuda/cuda_context.h index d73d551920d47..bdc007f52f5d5 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -28,38 +28,65 @@ struct CudaContext : public CustomOpContext { cudnnHandle_t cudnn_handle = {}; cublasHandle_t cublas_handle = {}; OrtAllocator* deferred_cpu_allocator = {}; + // below are cuda ep options + int16_t device_id = 0; + bool has_user_compute_stream = false; + size_t gpu_mem_limit = 0; + int32_t arena_extend_strategy = 0; + int32_t cudnn_conv_algo_search = 0; + bool do_copy_in_default_stream = true; + void* gpu_external_alloc = {}; + void* gpu_external_free = {}; + void* gpu_external_empty_cache = {}; + bool cudnn_conv_use_max_workspace = true; + bool enable_cuda_graph = false; + bool cudnn_conv1d_pad_to_nc1d = false; + bool tunable_op_enable = false; + bool tunable_op_tuning_enable = false; + int32_t tunable_op_max_tuning_duration_ms = 0; + bool enable_skip_layer_norm_strict_mode = false; + bool prefer_nhwc = false; + bool use_ep_level_unified_stream = false; void Init(const OrtKernelContext& kernel_ctx) { + cuda_stream = FetchResource(kernel_ctx, CudaResource::cuda_stream_t); + cudnn_handle = FetchResource(kernel_ctx, CudaResource::cudnn_handle_t); + cublas_handle = FetchResource(kernel_ctx, CudaResource::cublas_handle_t); + deferred_cpu_allocator = FetchResource(kernel_ctx, CudaResource::deferred_cpu_allocator_t); + + device_id = FetchResource(kernel_ctx, CudaResource::device_id_t); + has_user_compute_stream = FetchResource(kernel_ctx, CudaResource::has_user_compute_stream_t); + gpu_mem_limit = FetchResource(kernel_ctx, CudaResource::gpu_mem_limit_t); + arena_extend_strategy = FetchResource(kernel_ctx, CudaResource::arena_extend_strategy_t); + + cudnn_conv_algo_search = FetchResource(kernel_ctx, CudaResource::cudnn_conv_algo_search_t); + do_copy_in_default_stream = FetchResource(kernel_ctx, CudaResource::do_copy_in_default_stream_t); + gpu_external_alloc = FetchResource(kernel_ctx, CudaResource::gpu_external_alloc_t); + gpu_external_free = FetchResource(kernel_ctx, CudaResource::gpu_external_free_t); + + gpu_external_empty_cache = FetchResource(kernel_ctx, CudaResource::gpu_external_empty_cache_t); + cudnn_conv_use_max_workspace = FetchResource(kernel_ctx, CudaResource::cudnn_conv_use_max_workspace_t); + enable_cuda_graph = FetchResource(kernel_ctx, CudaResource::enable_cuda_graph_t); + cudnn_conv1d_pad_to_nc1d = FetchResource(kernel_ctx, CudaResource::cudnn_conv1d_pad_to_nc1d_t); + + tunable_op_enable = FetchResource(kernel_ctx, CudaResource::tunable_op_enable_t); + tunable_op_tuning_enable = FetchResource(kernel_ctx, CudaResource::tunable_op_tuning_enable_t); + tunable_op_max_tuning_duration_ms = FetchResource(kernel_ctx, CudaResource::tunable_op_max_tuning_duration_ms_t); + enable_skip_layer_norm_strict_mode = FetchResource(kernel_ctx, CudaResource::enable_skip_layer_norm_strict_mode_t); + + prefer_nhwc = FetchResource(kernel_ctx, CudaResource::prefer_nhwc_t); + use_ep_level_unified_stream = FetchResource(kernel_ctx, CudaResource::use_ep_level_unified_stream_t); + } + + template + T FetchResource(const OrtKernelContext& kernel_ctx, CudaResource resource_type) { const auto& ort_api = Ort::GetApi(); void* resource = {}; - OrtStatus* status = nullptr; - - status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, CudaResource::cuda_stream_t, &resource); - if (status) { - ORT_CXX_API_THROW("failed to fetch cuda stream", OrtErrorCode::ORT_RUNTIME_EXCEPTION); - } - cuda_stream = reinterpret_cast(resource); - - resource = {}; - status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, CudaResource::cudnn_handle_t, &resource); - if (status) { - ORT_CXX_API_THROW("failed to fetch cudnn handle", OrtErrorCode::ORT_RUNTIME_EXCEPTION); - } - cudnn_handle = reinterpret_cast(resource); - - resource = {}; - status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, CudaResource::cublas_handle_t, &resource); - if (status) { - ORT_CXX_API_THROW("failed to fetch cublas handle", OrtErrorCode::ORT_RUNTIME_EXCEPTION); - } - cublas_handle = reinterpret_cast(resource); - - resource = {}; - status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, CudaResource::deferred_cpu_allocator_t, &resource); + OrtStatus* status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, resource_type, &resource); if (status) { - ORT_CXX_API_THROW("failed to fetch deferred cpu allocator", OrtErrorCode::ORT_RUNTIME_EXCEPTION); + ORT_CXX_API_THROW("Failed to fetch cuda ep resource, resouce type: " + std::to_string(resource_type), OrtErrorCode::ORT_RUNTIME_EXCEPTION); } - deferred_cpu_allocator = reinterpret_cast(resource); + return *reinterpret_cast(&resource); } void* AllocDeferredCpuMem(size_t size) const { diff --git a/include/onnxruntime/core/providers/cuda/cuda_resource.h b/include/onnxruntime/core/providers/cuda/cuda_resource.h index 9f620c4f43877..1013a97682bfd 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_resource.h +++ b/include/onnxruntime/core/providers/cuda/cuda_resource.h @@ -6,27 +6,27 @@ #define ORT_CUDA_RESOUCE_VERSION 3 enum CudaResource : int { - cuda_stream_t = cuda_resource_offset, + cuda_stream_t = cuda_resource_offset, // 10000 cudnn_handle_t, cublas_handle_t, deferred_cpu_allocator_t, // below are cuda ep options - device_id_t, + device_id_t, // 10004 has_user_compute_stream_t, gpu_mem_limit_t, arena_extend_strategy_t, - cudnn_conv_algo_search_t, + cudnn_conv_algo_search_t, // 10008 do_copy_in_default_stream_t, gpu_external_alloc_t, gpu_external_free_t, - gpu_external_empty_cache_t, + gpu_external_empty_cache_t, // 10012 cudnn_conv_use_max_workspace_t, enable_cuda_graph_t, cudnn_conv1d_pad_to_nc1d_t, - tunable_op_enable_t, + tunable_op_enable_t, // 10016 tunable_op_tuning_enable_t, tunable_op_max_tuning_duration_ms_t, enable_skip_layer_norm_strict_mode_t, - prefer_nhwc_t, + prefer_nhwc_t, // 10020 use_ep_level_unified_stream_t }; \ No newline at end of file diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index 9cbba166618cc..a1c6922fa74a0 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -197,7 +197,7 @@ void* CudaStream::GetResource(int version, int id) const { return reinterpret_cast(ep_info_.gpu_mem_limit); break; case CudaResource::arena_extend_strategy_t: - return reinterpret_cast(ep_info_.gpu_mem_limit); + return reinterpret_cast(ep_info_.arena_extend_strategy); break; case CudaResource::cudnn_conv_algo_search_t: return reinterpret_cast(ep_info_.cudnn_conv_algo_search); diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index b827c28f129b1..fa674360159f8 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -370,9 +370,6 @@ ORT_API_STATUS_IMPL(OrtApis::KernelContext_GetResource, _In_ const OrtKernelCont return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Failed to fetch a stream hosting the requested resource"); } *resource = stream->GetResource(resource_version, resource_id); - if (!(*resource)) { - return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Requested resource does not exist"); - } return nullptr; API_IMPL_END }; From ec04d271c6c079c6112373966e5d9b38cbf05809 Mon Sep 17 00:00:00 2001 From: Randy Shuai Date: Fri, 5 Jan 2024 11:55:46 -0800 Subject: [PATCH 3/8] tune ut --- include/onnxruntime/core/providers/cuda/cuda_context.h | 2 +- include/onnxruntime/core/session/onnxruntime_c_api.h | 2 +- onnxruntime/core/providers/cuda/cuda_stream_handle.h | 2 +- onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h b/include/onnxruntime/core/providers/cuda/cuda_context.h index bdc007f52f5d5..635849aa6a3b3 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -86,7 +86,7 @@ struct CudaContext : public CustomOpContext { if (status) { ORT_CXX_API_THROW("Failed to fetch cuda ep resource, resouce type: " + std::to_string(resource_type), OrtErrorCode::ORT_RUNTIME_EXCEPTION); } - return *reinterpret_cast(&resource); + return static_cast(*reinterpret_cast(&resource)); } void* AllocDeferredCpuMem(size_t size) const { diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index dbd5ad41255fa..134be42b9eb43 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -4417,7 +4417,7 @@ struct OrtApi { ORT_API2_STATUS(GetCUDAProviderOptionsByName, _In_ const OrtCUDAProviderOptionsV2* cuda_options, _In_ const char* key, _Outptr_ void** ptr); /** - * Get a EP resoure. + * Get a EP resource. * E.g. a cuda stream or a cublas handle * * \param context - Kernel context diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.h b/onnxruntime/core/providers/cuda/cuda_stream_handle.h index a2b5a5bc32f4f..b02c167e9e9ec 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.h +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.h @@ -52,7 +52,7 @@ struct CudaStream : Stream { AllocatorPtr cpu_allocator_; bool release_cpu_buffer_on_cuda_stream_{true}; DeferredCpuAllocator deferred_cpu_allocator_; - const CUDAExecutionProviderInfo& ep_info_; + const CUDAExecutionProviderInfo ep_info_; }; void RegisterCudaStreamHandles(IStreamCommandHandleRegistry& stream_handle_registry, diff --git a/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc b/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc index 3d561d378cb8c..2708b6d38aedb 100644 --- a/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc +++ b/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc @@ -28,14 +28,14 @@ void KernelOne(const Ort::Custom::CudaContext& cuda_ctx, const Ort::Custom::Tensor& X, const Ort::Custom::Tensor& Y, Ort::Custom::Tensor& Z) { - auto input_shape = X.Shape(); CUSTOM_ENFORCE(cuda_ctx.cuda_stream, "failed to fetch cuda stream"); CUSTOM_ENFORCE(cuda_ctx.cudnn_handle, "failed to fetch cudnn handle"); CUSTOM_ENFORCE(cuda_ctx.cublas_handle, "failed to fetch cublas handle"); + CUSTOM_ENFORCE(cuda_ctx.gpu_mem_limit == std::numeric_limits::max(), ""); void* deferred_cpu_mem = cuda_ctx.AllocDeferredCpuMem(sizeof(int32_t)); CUSTOM_ENFORCE(deferred_cpu_mem, "failed to allocate deferred cpu allocator"); cuda_ctx.FreeDeferredCpuMem(deferred_cpu_mem); - auto z_raw = Z.Allocate(input_shape); + auto z_raw = Z.Allocate(X.Shape()); cuda_add(Z.NumberOfElement(), z_raw, X.Data(), Y.Data(), cuda_ctx.cuda_stream); } From 1089b445b31fc61d9b7c0a26383b0f3886b9af30 Mon Sep 17 00:00:00 2001 From: Randy Shuai Date: Fri, 5 Jan 2024 13:28:28 -0800 Subject: [PATCH 4/8] tune cast --- onnxruntime/core/providers/cuda/cuda_stream_handle.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index a1c6922fa74a0..75917f7f27d35 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -230,7 +230,7 @@ void* CudaStream::GetResource(int version, int id) const { return reinterpret_cast(ep_info_.tunable_op.tuning_enable); break; case CudaResource::tunable_op_max_tuning_duration_ms_t: - return reinterpret_cast(ep_info_.tunable_op.max_tuning_duration_ms); + return reinterpret_cast(static_cast(ep_info_.tunable_op.max_tuning_duration_ms)); break; case CudaResource::enable_skip_layer_norm_strict_mode_t: return reinterpret_cast(ep_info_.enable_skip_layer_norm_strict_mode); From 8774631aaf51df42d2bfa53811de814d289cd109 Mon Sep 17 00:00:00 2001 From: Randy Shuai Date: Mon, 8 Jan 2024 15:40:43 -0800 Subject: [PATCH 5/8] type punning --- include/onnxruntime/core/providers/cuda/cuda_context.h | 7 ++++++- .../core/providers/tensorrt/tensorrt_execution_provider.cc | 3 ++- .../test/testdata/custom_op_library/cuda/cuda_ops.cc | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h b/include/onnxruntime/core/providers/cuda/cuda_context.h index 635849aa6a3b3..0b731c222f6a6 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -80,13 +80,18 @@ struct CudaContext : public CustomOpContext { template T FetchResource(const OrtKernelContext& kernel_ctx, CudaResource resource_type) { + if (sizeof(T) > sizeof(void*)) { + ORT_CXX_API_THROW("void* is not large enough to hold resource type: " + std::to_string(resource_type), OrtErrorCode::ORT_INVALID_ARGUMENT); + } const auto& ort_api = Ort::GetApi(); void* resource = {}; OrtStatus* status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, resource_type, &resource); if (status) { ORT_CXX_API_THROW("Failed to fetch cuda ep resource, resouce type: " + std::to_string(resource_type), OrtErrorCode::ORT_RUNTIME_EXCEPTION); } - return static_cast(*reinterpret_cast(&resource)); + T t = {}; + memcpy(&t, &resource, sizeof(T)); + return t; } void* AllocDeferredCpuMem(size_t size) const { diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 684303a8b6448..7397b84373db7 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -3473,7 +3473,8 @@ void TensorrtExecutionProvider::RegisterStreamHandlers(IStreamCommandHandleRegis stream_, external_stream_ /* use_existing_stream */, external_cudnn_handle_, - external_cublas_handle_); + external_cublas_handle_, + {}); } OrtDevice TensorrtExecutionProvider::GetOrtDeviceByMemType(OrtMemType mem_type) const { diff --git a/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc b/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc index 2708b6d38aedb..05fb5147e4815 100644 --- a/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc +++ b/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc @@ -31,7 +31,7 @@ void KernelOne(const Ort::Custom::CudaContext& cuda_ctx, CUSTOM_ENFORCE(cuda_ctx.cuda_stream, "failed to fetch cuda stream"); CUSTOM_ENFORCE(cuda_ctx.cudnn_handle, "failed to fetch cudnn handle"); CUSTOM_ENFORCE(cuda_ctx.cublas_handle, "failed to fetch cublas handle"); - CUSTOM_ENFORCE(cuda_ctx.gpu_mem_limit == std::numeric_limits::max(), ""); + CUSTOM_ENFORCE(cuda_ctx.gpu_mem_limit == std::numeric_limits::max(), "gpu_mem_limit mismatch"); void* deferred_cpu_mem = cuda_ctx.AllocDeferredCpuMem(sizeof(int32_t)); CUSTOM_ENFORCE(deferred_cpu_mem, "failed to allocate deferred cpu allocator"); cuda_ctx.FreeDeferredCpuMem(deferred_cpu_mem); From 3cad11b75be785d74107e5d4822c77948388cb6e Mon Sep 17 00:00:00 2001 From: Randy Shuai Date: Wed, 10 Jan 2024 14:39:59 -0800 Subject: [PATCH 6/8] hide options --- .../core/providers/cuda/cuda_context.h | 25 +--------------- .../core/providers/cuda/cuda_stream_handle.cc | 30 ------------------- .../custom_op_library/cuda/cuda_ops.cc | 2 +- 3 files changed, 2 insertions(+), 55 deletions(-) diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h b/include/onnxruntime/core/providers/cuda/cuda_context.h index 0b731c222f6a6..2e946cf1bd0b1 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -30,23 +30,13 @@ struct CudaContext : public CustomOpContext { OrtAllocator* deferred_cpu_allocator = {}; // below are cuda ep options int16_t device_id = 0; - bool has_user_compute_stream = false; - size_t gpu_mem_limit = 0; int32_t arena_extend_strategy = 0; int32_t cudnn_conv_algo_search = 0; - bool do_copy_in_default_stream = true; - void* gpu_external_alloc = {}; - void* gpu_external_free = {}; - void* gpu_external_empty_cache = {}; bool cudnn_conv_use_max_workspace = true; bool enable_cuda_graph = false; bool cudnn_conv1d_pad_to_nc1d = false; - bool tunable_op_enable = false; - bool tunable_op_tuning_enable = false; - int32_t tunable_op_max_tuning_duration_ms = 0; bool enable_skip_layer_norm_strict_mode = false; bool prefer_nhwc = false; - bool use_ep_level_unified_stream = false; void Init(const OrtKernelContext& kernel_ctx) { cuda_stream = FetchResource(kernel_ctx, CudaResource::cuda_stream_t); @@ -55,27 +45,14 @@ struct CudaContext : public CustomOpContext { deferred_cpu_allocator = FetchResource(kernel_ctx, CudaResource::deferred_cpu_allocator_t); device_id = FetchResource(kernel_ctx, CudaResource::device_id_t); - has_user_compute_stream = FetchResource(kernel_ctx, CudaResource::has_user_compute_stream_t); - gpu_mem_limit = FetchResource(kernel_ctx, CudaResource::gpu_mem_limit_t); arena_extend_strategy = FetchResource(kernel_ctx, CudaResource::arena_extend_strategy_t); - cudnn_conv_algo_search = FetchResource(kernel_ctx, CudaResource::cudnn_conv_algo_search_t); - do_copy_in_default_stream = FetchResource(kernel_ctx, CudaResource::do_copy_in_default_stream_t); - gpu_external_alloc = FetchResource(kernel_ctx, CudaResource::gpu_external_alloc_t); - gpu_external_free = FetchResource(kernel_ctx, CudaResource::gpu_external_free_t); - - gpu_external_empty_cache = FetchResource(kernel_ctx, CudaResource::gpu_external_empty_cache_t); cudnn_conv_use_max_workspace = FetchResource(kernel_ctx, CudaResource::cudnn_conv_use_max_workspace_t); + enable_cuda_graph = FetchResource(kernel_ctx, CudaResource::enable_cuda_graph_t); cudnn_conv1d_pad_to_nc1d = FetchResource(kernel_ctx, CudaResource::cudnn_conv1d_pad_to_nc1d_t); - - tunable_op_enable = FetchResource(kernel_ctx, CudaResource::tunable_op_enable_t); - tunable_op_tuning_enable = FetchResource(kernel_ctx, CudaResource::tunable_op_tuning_enable_t); - tunable_op_max_tuning_duration_ms = FetchResource(kernel_ctx, CudaResource::tunable_op_max_tuning_duration_ms_t); enable_skip_layer_norm_strict_mode = FetchResource(kernel_ctx, CudaResource::enable_skip_layer_norm_strict_mode_t); - prefer_nhwc = FetchResource(kernel_ctx, CudaResource::prefer_nhwc_t); - use_ep_level_unified_stream = FetchResource(kernel_ctx, CudaResource::use_ep_level_unified_stream_t); } template diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index 75917f7f27d35..2dbeee4452f33 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -190,30 +190,12 @@ void* CudaStream::GetResource(int version, int id) const { case CudaResource::device_id_t: return reinterpret_cast(ep_info_.device_id); break; - case CudaResource::has_user_compute_stream_t: - return reinterpret_cast(ep_info_.has_user_compute_stream); - break; - case CudaResource::gpu_mem_limit_t: - return reinterpret_cast(ep_info_.gpu_mem_limit); - break; case CudaResource::arena_extend_strategy_t: return reinterpret_cast(ep_info_.arena_extend_strategy); break; case CudaResource::cudnn_conv_algo_search_t: return reinterpret_cast(ep_info_.cudnn_conv_algo_search); break; - case CudaResource::do_copy_in_default_stream_t: - return reinterpret_cast(ep_info_.do_copy_in_default_stream); - break; - case CudaResource::gpu_external_alloc_t: - return reinterpret_cast(ep_info_.external_allocator_info.alloc); - break; - case CudaResource::gpu_external_free_t: - return reinterpret_cast(ep_info_.external_allocator_info.free); - break; - case CudaResource::gpu_external_empty_cache_t: - return reinterpret_cast(ep_info_.external_allocator_info.empty_cache); - break; case CudaResource::cudnn_conv_use_max_workspace_t: return reinterpret_cast(ep_info_.cudnn_conv_use_max_workspace); break; @@ -223,24 +205,12 @@ void* CudaStream::GetResource(int version, int id) const { case CudaResource::cudnn_conv1d_pad_to_nc1d_t: return reinterpret_cast(ep_info_.cudnn_conv1d_pad_to_nc1d); break; - case CudaResource::tunable_op_enable_t: - return reinterpret_cast(ep_info_.tunable_op.enable); - break; - case CudaResource::tunable_op_tuning_enable_t: - return reinterpret_cast(ep_info_.tunable_op.tuning_enable); - break; - case CudaResource::tunable_op_max_tuning_duration_ms_t: - return reinterpret_cast(static_cast(ep_info_.tunable_op.max_tuning_duration_ms)); - break; case CudaResource::enable_skip_layer_norm_strict_mode_t: return reinterpret_cast(ep_info_.enable_skip_layer_norm_strict_mode); break; case CudaResource::prefer_nhwc_t: return reinterpret_cast(ep_info_.prefer_nhwc); break; - case CudaResource::use_ep_level_unified_stream_t: - return reinterpret_cast(ep_info_.use_ep_level_unified_stream); - break; default: break; } diff --git a/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc b/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc index 05fb5147e4815..43795921f17da 100644 --- a/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc +++ b/onnxruntime/test/testdata/custom_op_library/cuda/cuda_ops.cc @@ -31,7 +31,7 @@ void KernelOne(const Ort::Custom::CudaContext& cuda_ctx, CUSTOM_ENFORCE(cuda_ctx.cuda_stream, "failed to fetch cuda stream"); CUSTOM_ENFORCE(cuda_ctx.cudnn_handle, "failed to fetch cudnn handle"); CUSTOM_ENFORCE(cuda_ctx.cublas_handle, "failed to fetch cublas handle"); - CUSTOM_ENFORCE(cuda_ctx.gpu_mem_limit == std::numeric_limits::max(), "gpu_mem_limit mismatch"); + CUSTOM_ENFORCE(cuda_ctx.arena_extend_strategy == 0, "arena_extend_strategy mismatch"); void* deferred_cpu_mem = cuda_ctx.AllocDeferredCpuMem(sizeof(int32_t)); CUSTOM_ENFORCE(deferred_cpu_mem, "failed to allocate deferred cpu allocator"); cuda_ctx.FreeDeferredCpuMem(deferred_cpu_mem); From f8459e3d7933c5943860a7b74674f2e49b6b49bb Mon Sep 17 00:00:00 2001 From: Randy Shuai Date: Wed, 10 Jan 2024 15:06:28 -0800 Subject: [PATCH 7/8] trim options --- .../onnxruntime/core/providers/cuda/cuda_context.h | 2 +- .../core/providers/cuda/cuda_resource.h | 14 ++------------ 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h b/include/onnxruntime/core/providers/cuda/cuda_context.h index 2e946cf1bd0b1..799f587d3dfeb 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -55,7 +55,7 @@ struct CudaContext : public CustomOpContext { prefer_nhwc = FetchResource(kernel_ctx, CudaResource::prefer_nhwc_t); } - template + template T FetchResource(const OrtKernelContext& kernel_ctx, CudaResource resource_type) { if (sizeof(T) > sizeof(void*)) { ORT_CXX_API_THROW("void* is not large enough to hold resource type: " + std::to_string(resource_type), OrtErrorCode::ORT_INVALID_ARGUMENT); diff --git a/include/onnxruntime/core/providers/cuda/cuda_resource.h b/include/onnxruntime/core/providers/cuda/cuda_resource.h index 1013a97682bfd..3389f1b94b4cd 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_resource.h +++ b/include/onnxruntime/core/providers/cuda/cuda_resource.h @@ -12,21 +12,11 @@ enum CudaResource : int { deferred_cpu_allocator_t, // below are cuda ep options device_id_t, // 10004 - has_user_compute_stream_t, - gpu_mem_limit_t, arena_extend_strategy_t, - cudnn_conv_algo_search_t, // 10008 - do_copy_in_default_stream_t, - gpu_external_alloc_t, - gpu_external_free_t, - gpu_external_empty_cache_t, // 10012 + cudnn_conv_algo_search_t, cudnn_conv_use_max_workspace_t, enable_cuda_graph_t, cudnn_conv1d_pad_to_nc1d_t, - tunable_op_enable_t, // 10016 - tunable_op_tuning_enable_t, - tunable_op_max_tuning_duration_ms_t, enable_skip_layer_norm_strict_mode_t, - prefer_nhwc_t, // 10020 - use_ep_level_unified_stream_t + prefer_nhwc_t, }; \ No newline at end of file From 5b7ae52169c914068373f5a957bf0aae14707277 Mon Sep 17 00:00:00 2001 From: Randy Shuai Date: Wed, 10 Jan 2024 16:12:20 -0800 Subject: [PATCH 8/8] drop enable_cuda_graph --- include/onnxruntime/core/providers/cuda/cuda_context.h | 2 -- include/onnxruntime/core/providers/cuda/cuda_resource.h | 1 - onnxruntime/core/providers/cuda/cuda_stream_handle.cc | 3 --- 3 files changed, 6 deletions(-) diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h b/include/onnxruntime/core/providers/cuda/cuda_context.h index 799f587d3dfeb..9416fad5f1448 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -33,7 +33,6 @@ struct CudaContext : public CustomOpContext { int32_t arena_extend_strategy = 0; int32_t cudnn_conv_algo_search = 0; bool cudnn_conv_use_max_workspace = true; - bool enable_cuda_graph = false; bool cudnn_conv1d_pad_to_nc1d = false; bool enable_skip_layer_norm_strict_mode = false; bool prefer_nhwc = false; @@ -49,7 +48,6 @@ struct CudaContext : public CustomOpContext { cudnn_conv_algo_search = FetchResource(kernel_ctx, CudaResource::cudnn_conv_algo_search_t); cudnn_conv_use_max_workspace = FetchResource(kernel_ctx, CudaResource::cudnn_conv_use_max_workspace_t); - enable_cuda_graph = FetchResource(kernel_ctx, CudaResource::enable_cuda_graph_t); cudnn_conv1d_pad_to_nc1d = FetchResource(kernel_ctx, CudaResource::cudnn_conv1d_pad_to_nc1d_t); enable_skip_layer_norm_strict_mode = FetchResource(kernel_ctx, CudaResource::enable_skip_layer_norm_strict_mode_t); prefer_nhwc = FetchResource(kernel_ctx, CudaResource::prefer_nhwc_t); diff --git a/include/onnxruntime/core/providers/cuda/cuda_resource.h b/include/onnxruntime/core/providers/cuda/cuda_resource.h index 3389f1b94b4cd..c0e6328f27122 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_resource.h +++ b/include/onnxruntime/core/providers/cuda/cuda_resource.h @@ -15,7 +15,6 @@ enum CudaResource : int { arena_extend_strategy_t, cudnn_conv_algo_search_t, cudnn_conv_use_max_workspace_t, - enable_cuda_graph_t, cudnn_conv1d_pad_to_nc1d_t, enable_skip_layer_norm_strict_mode_t, prefer_nhwc_t, diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index 2dbeee4452f33..7c866395ecf6e 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -199,9 +199,6 @@ void* CudaStream::GetResource(int version, int id) const { case CudaResource::cudnn_conv_use_max_workspace_t: return reinterpret_cast(ep_info_.cudnn_conv_use_max_workspace); break; - case CudaResource::enable_cuda_graph_t: - return reinterpret_cast(ep_info_.enable_cuda_graph); - break; case CudaResource::cudnn_conv1d_pad_to_nc1d_t: return reinterpret_cast(ep_info_.cudnn_conv1d_pad_to_nc1d); break;