From 168e6ffae0fe92af0b66858c6d244872a7a936f7 Mon Sep 17 00:00:00 2001 From: Julius Tischbein Date: Mon, 15 Apr 2024 09:24:17 +0200 Subject: [PATCH 1/6] Adding CUDNN Frontend to CUDA EP and use it for Convolution --- cgmanifests/generated/cgmanifest.json | 10 + cmake/deps.txt | 1 + cmake/external/cudnn_frontend.cmake | 12 + cmake/onnxruntime_providers_cuda.cmake | 3 +- cmake/onnxruntime_rocm_hipify.cmake | 2 + cmake/onnxruntime_unittests.cmake | 2 +- .../core/providers/cuda/cuda_context.h | 2 + .../providers/cuda/cuda_provider_options.h | 1 + .../core/providers/cuda/cuda_resource.h | 1 + .../contrib_ops/cuda/diffusion/nhwc_conv.cc | 2 +- onnxruntime/contrib_ops/cuda/fused_conv.cc | 419 +++++++++++-- .../core/optimizer/conv_activation_fusion.cc | 61 +- .../core/optimizer/graph_transformer_utils.cc | 7 +- onnxruntime/core/providers/common.h | 25 +- onnxruntime/core/providers/cuda/cuda_call.cc | 16 +- .../providers/cuda/cuda_execution_provider.cc | 17 + .../providers/cuda/cuda_execution_provider.h | 1 + .../cuda/cuda_execution_provider_info.cc | 4 + .../cuda/cuda_execution_provider_info.h | 4 +- .../providers/cuda/cuda_provider_factory.cc | 2 + .../core/providers/cuda/cuda_stream_handle.cc | 3 + .../core/providers/cuda/cudnn_common.cc | 110 +++- .../core/providers/cuda/cudnn_common.h | 31 +- .../core/providers/cuda/cudnn_fe_call.cc | 119 ++++ onnxruntime/core/providers/cuda/nn/conv.cc | 560 +++++++++--------- onnxruntime/core/providers/cuda/nn/conv.h | 50 +- .../providers/cuda/shared_inc/cuda_call.h | 6 +- .../providers/cuda/shared_inc/cudnn_fe_call.h | 23 + .../providers/cuda/shared_inc/fpgeneric.h | 74 +-- .../core/session/provider_bridge_ort.cc | 1 + .../test/contrib_ops/nhwc_conv_op_test.cc | 3 +- .../test/optimizer/graph_transform_test.cc | 90 +-- .../test/providers/cpu/nn/conv_op_test.cc | 30 +- 33 files changed, 1178 insertions(+), 514 deletions(-) create mode 100644 cmake/external/cudnn_frontend.cmake create mode 100644 onnxruntime/core/providers/cuda/cudnn_fe_call.cc create mode 100644 onnxruntime/core/providers/cuda/shared_inc/cudnn_fe_call.h diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 29eb7045fc299..a5a70c19675e2 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -351,6 +351,16 @@ }, "comments": "directx_headers" } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "1b0b5eac540b7f8fd19b18f1e6b8427c95503348", + "repositoryUrl": "https://github.com/NVIDIA/cudnn-frontend.git" + }, + "comments": "cudnn_frontend" + } } ] } diff --git a/cmake/deps.txt b/cmake/deps.txt index 72469603a0889..93a55e30ffc72 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -58,3 +58,4 @@ utf8_range;https://github.com/protocolbuffers/utf8_range/archive/72c943dea2b9240 extensions;https://github.com/microsoft/onnxruntime-extensions/archive/94142d8391c9791ec71c38336436319a2d4ac7a0.zip;4365ac5140338b4cb75a39944a4be276e3829b3c composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/204da9c522cebec5220bba52cd3542ebcaf99e7a.zip;1827348efd47831c13074245274d41b7cae8a557 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e +cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.3.0.zip;dfccd5ff821cb2b2fa832bef26f39154358e6e13 \ No newline at end of file diff --git a/cmake/external/cudnn_frontend.cmake b/cmake/external/cudnn_frontend.cmake new file mode 100644 index 0000000000000..7ac51deaf93bc --- /dev/null +++ b/cmake/external/cudnn_frontend.cmake @@ -0,0 +1,12 @@ +include(FetchContent) +FetchContent_Declare( + cudnn_frontend + URL ${DEP_URL_cudnn_frontend} + URL_HASH SHA1=${DEP_SHA1_cudnn_frontend} +) + +set(CUDNN_FRONTEND_BUILD_SAMPLES OFF) +set(CUDNN_FRONTEND_BUILD_UNIT_TESTS OFF) +set(CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS OFF) +set(CUDNN_PATH ${onnxruntime_CUDNN_HOME}) +FetchContent_MakeAvailable(cudnn_frontend) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 82c31ce6b6b4d..0f87331ecded9 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -197,7 +197,8 @@ target_compile_definitions(${target} PRIVATE USE_CUDA_MINIMAL) target_link_libraries(${target} PRIVATE ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface CUDA::cudart) else() - target_link_libraries(${target} PRIVATE CUDA::cublasLt CUDA::cublas cudnn CUDA::curand CUDA::cufft CUDA::cudart + include(cudnn_frontend) + target_link_libraries(${target} PRIVATE CUDA::cublasLt CUDA::cublas cudnn cudnn_frontend CUDA::curand CUDA::cufft CUDA::cudart ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) if(onnxruntime_CUDNN_HOME) target_include_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/include) diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index a8c876d30873e..1740144cf6553 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -145,6 +145,7 @@ set(provider_excluded_files "rnn/rnn_impl.cu" "rnn/rnn_impl.h" "shared_inc/cuda_call.h" + "shared_inc/cudnn_fe_call.h" "shared_inc/fpgeneric.h" "cuda_allocator.cc" "cuda_allocator.h" @@ -171,6 +172,7 @@ set(provider_excluded_files "cuda_utils.cu" "cudnn_common.cc" "cudnn_common.h" + "cudnn_fe_call.cc" "cupti_manager.cc" "cupti_manager.h" "fpgeneric.cu" diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 38ed0b1640192..53d7428257754 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -67,7 +67,7 @@ function(AddTest) if(onnxruntime_USE_CUDA) #XXX: we should not need to do this. onnxruntime_test_all.exe should not have direct dependency on CUDA DLLs, # otherwise it will impact when CUDA DLLs can be unloaded. - target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart) + target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart cudnn_frontend) endif() target_link_libraries(${_UT_TARGET} PRIVATE ${_UT_LIBS} GTest::gtest GTest::gmock ${onnxruntime_EXTERNAL_LIBRARIES}) endif() diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h b/include/onnxruntime/core/providers/cuda/cuda_context.h index 462b31bb433a5..12a19759bb979 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -40,6 +40,7 @@ struct CudaContext : public CustomOpContext { bool enable_skip_layer_norm_strict_mode = false; bool prefer_nhwc = false; bool use_tf32 = true; + bool fuse_conv_bias = true; void Init(const OrtKernelContext& kernel_ctx) { cuda_stream = FetchResource(kernel_ctx, CudaResource::cuda_stream_t); @@ -57,6 +58,7 @@ struct CudaContext : public CustomOpContext { kernel_ctx, CudaResource::enable_skip_layer_norm_strict_mode_t); prefer_nhwc = FetchResource(kernel_ctx, CudaResource::prefer_nhwc_t); use_tf32 = FetchResource(kernel_ctx, CudaResource::use_tf32_t); + fuse_conv_bias = FetchResource(kernel_ctx, CudaResource::fuse_conv_bias_t); } template diff --git a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h index 01a14de699dc4..3b7a1e99346d7 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h +++ b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h @@ -38,5 +38,6 @@ struct OrtCUDAProviderOptionsV2 { int prefer_nhwc = 0; // make the CUDA EP NHWC preferred int use_ep_level_unified_stream = 0; // flag specifying if ep level stream is used or not int use_tf32 = 1; // use TF32 + int fuse_conv_bias = 0; // Enable CUDNN Frontend kernel fusing, results in JIT compiles int sdpa_kernel = 0; // Scaled Dot Product Attention kernel option }; diff --git a/include/onnxruntime/core/providers/cuda/cuda_resource.h b/include/onnxruntime/core/providers/cuda/cuda_resource.h index 555023c442c01..b248d33035bc1 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_resource.h +++ b/include/onnxruntime/core/providers/cuda/cuda_resource.h @@ -19,4 +19,5 @@ enum CudaResource : int { enable_skip_layer_norm_strict_mode_t, prefer_nhwc_t, use_tf32_t, + fuse_conv_bias_t }; diff --git a/onnxruntime/contrib_ops/cuda/diffusion/nhwc_conv.cc b/onnxruntime/contrib_ops/cuda/diffusion/nhwc_conv.cc index 79f0a18ba515f..c1459cb9d08d2 100644 --- a/onnxruntime/contrib_ops/cuda/diffusion/nhwc_conv.cc +++ b/onnxruntime/contrib_ops/cuda/diffusion/nhwc_conv.cc @@ -21,7 +21,7 @@ namespace cuda { T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Conv); + onnxruntime::cuda::Conv); REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(MLFloat16) diff --git a/onnxruntime/contrib_ops/cuda/fused_conv.cc b/onnxruntime/contrib_ops/cuda/fused_conv.cc index e126f8bcb3d11..279df73ee3d45 100644 --- a/onnxruntime/contrib_ops/cuda/fused_conv.cc +++ b/onnxruntime/contrib_ops/cuda/fused_conv.cc @@ -3,17 +3,66 @@ #include "core/common/status.h" #include "core/providers/cuda/nn/conv.h" +#include "core/providers/cuda/tensor/slice.h" #include "core/providers/cuda/cuda_common.h" namespace onnxruntime { namespace contrib { namespace cuda { +Status SliceOutUnwantedOutputSection(cudaStream_t stream, + const void* input_data, gsl::span input_dims, + void* output_data, + const gsl::span& output_dims, + const gsl::span& starts, + const gsl::span& ends, + const gsl::span& axes, + size_t element_size) { + SliceOp::PrepareForComputeMetadata compute_metadata(input_dims); + + ORT_THROW_IF_ERROR(SliceBase::PrepareForCompute(starts, ends, axes, compute_metadata)); + + // As a sanity check, ensure that the slice operator's output shape matches with the expected output shape + ORT_ENFORCE(SpanEq(gsl::make_span(compute_metadata.output_dims_), output_dims)); + + return ::onnxruntime::cuda::SliceCuda::Impl(stream, input_data, input_dims, output_data, + compute_metadata, element_size); +} + +static cudnnStatus_t GetWorkspaceSize(cudnnHandle_t handle, + const ::onnxruntime::cuda::CudnnConvState& s, + cudnnConvolutionFwdAlgo_t algo, + size_t* sz) { + return cudnnGetConvolutionForwardWorkspaceSize(handle, s.x_tensor, s.w_desc, s.conv_desc, s.y_tensor, algo, sz); +} + +static size_t GetMaxWorkspaceSize(cudnnHandle_t handle, + const ::onnxruntime::cuda::CudnnConvState& s, + const cudnnConvolutionFwdAlgo_t* algo, int n_algo) { + // TODO: get maximum available size from memory arena + size_t free, total; + CUDA_CALL_THROW(cudaMemGetInfo(&free, &total)); + // Assuming 10% of fragmentation + free = static_cast(static_cast(free) * 0.9); + size_t max_ws_size = 0; + for (int i = 0; i < n_algo; i++) { + cudnnStatus_t err; + size_t sz; + err = GetWorkspaceSize(handle, s, algo[i], &sz); + if (CUDNN_STATUS_SUCCESS != err || sz == 0 || sz < max_ws_size || sz > free) continue; + max_ws_size = sz; + } + return max_ws_size; +} + template -class FusedConv : public onnxruntime::cuda::Conv { +class FusedConv : public onnxruntime::cuda::CudaKernel { + using CudaT = typename ::onnxruntime::cuda::ToCudaType::MappedType; + public: - using Base = onnxruntime::cuda::Conv; - FusedConv(const OpKernelInfo& info) : onnxruntime::cuda::Conv(info) { + FusedConv(const OpKernelInfo& info) : onnxruntime::cuda::CudaKernel(info), conv_attrs_(info) { + auto pads_size = conv_attrs_.pads.size(); + ORT_ENFORCE(pads_size % 2 == 0); std::string activation; ORT_THROW_IF_ERROR(info.GetAttr("activation", &activation)); ORT_THROW_IF_ERROR(MapMode(activation)); @@ -32,66 +81,331 @@ class FusedConv : public onnxruntime::cuda::Conv { } } + Status UpdateState(OpKernelContext* context, bool bias_expected) const { + // set X + const Tensor* X = context->Input(0); + const TensorShape& x_shape = X->Shape(); + const auto x_dims = x_shape.AsShapeVector(); + s_.x_data = reinterpret_cast(X->Data()); + s_.element_size = X->DataType()->Size(); + // set W + const Tensor* W = context->Input(1); + const TensorShape& w_shape = W->Shape(); + auto w_dims = w_shape.AsShapeVector(); + s_.w_data = reinterpret_cast(W->Data()); + + // set B + if (context->InputCount() >= 3) { + const Tensor* B = context->Input(2); + s_.b_data = reinterpret_cast(B->Data()); + } else { + s_.b_data = nullptr; + } + // set Z + if (context->InputCount() >= 4) { + const Tensor* Z = context->Input(3); + ORT_RETURN_IF_ERROR(s_.z_tensor.Set(Z->Shape().GetDims(), + ::onnxruntime::cuda::CudnnTensor::GetDataType())); + s_.z_data = reinterpret_cast(Z->Data()); + } else { + s_.z_data = nullptr; + } + bool input_dims_changed = (s_.last_x_dims != x_dims); + bool w_dims_changed = (s_.last_w_dims != w_dims); + if (input_dims_changed || w_dims_changed) { + if (input_dims_changed) + s_.last_x_dims = gsl::make_span(x_dims); + + if (w_dims_changed) { + s_.last_w_dims = gsl::make_span(w_dims); + s_.cached_benchmark_results.clear(); + } + + ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape())); + + TensorShapeVector kernel_shape; + ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape)); + + const size_t kernel_rank = kernel_shape.size(); + + ConvPadVector pads(conv_attrs_.pads); + if (pads.empty()) { + pads.resize(kernel_rank * 2, 0); + } + TensorShapeVector dilations(conv_attrs_.dilations); + if (dilations.empty()) { + dilations.resize(kernel_rank, 1); + } + TensorShapeVector strides(conv_attrs_.strides); + if (strides.empty()) { + strides.resize(kernel_rank, 1); + } + + TensorShapeVector y_dims; + y_dims.reserve(2 + kernel_rank); // add 2 to account for 'N' and 'C' + + const int64_t N = X->Shape()[0]; + const int64_t M = W->Shape()[0]; + y_dims.insert(y_dims.begin(), {N, M}); + + bool post_slicing_required = false; + TensorShapeVector slice_starts; + slice_starts.reserve(kernel_rank); + + TensorShapeVector slice_ends; + slice_ends.reserve(kernel_rank); + + TensorShapeVector slice_axes; + slice_axes.reserve(kernel_rank); + + constexpr size_t spatial_dim_start = 2; + const size_t spatial_dim_end = spatial_dim_start + kernel_rank; + TensorShape spatial_shape = X->Shape().Slice(spatial_dim_start, spatial_dim_end); + + TensorShapeVector y_dims_with_adjusted_pads(y_dims); + ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShapeWithAdjustedPads(spatial_shape, kernel_shape, + strides, dilations, pads, y_dims, + y_dims_with_adjusted_pads, post_slicing_required, + slice_starts, slice_ends, slice_axes)); + + ORT_ENFORCE(y_dims.size() == y_dims_with_adjusted_pads.size()); + s_.y_dims = gsl::make_span(y_dims); + s_.y_dims_with_adjusted_pads = y_dims_with_adjusted_pads; + s_.post_slicing_required = post_slicing_required; + s_.slice_starts = slice_starts; + s_.slice_ends = slice_ends; + s_.slice_axes = slice_axes; + + s_.Y = context->Output(0, TensorShape(s_.y_dims)); + if (post_slicing_required) { + // Post slicing needed. Create and fill in the Conv results in an intermediate buffer. + s_.memory_for_cudnn_conv_results = + GetScratchBuffer(TensorShape(y_dims_with_adjusted_pads).Size() * s_.element_size, + context->GetComputeStream()); + s_.y_data = reinterpret_cast(s_.memory_for_cudnn_conv_results.get()); + } else { + // No post slicing needed. Fill the output tensor's buffer directly. + s_.y_data = reinterpret_cast(s_.Y->MutableData()); + } + + const CUDAExecutionProvider* cuda_ep = + static_cast(this->Info().GetExecutionProvider()); + + TensorShapeVector x_dims_cudnn{x_dims.begin(), x_dims.end()}; + TensorShapeVector y_dims_cudnn = !post_slicing_required ? y_dims : y_dims_with_adjusted_pads; + if (kernel_rank < 2) { + // TODO: Explore padding the provided input shape [N, C, D] to [N, C, 1, D] + // especially for EXHAUSTIVE algo search which may result in a better algo selection. + // ORTModule uses different algo search options (HEURISTIC, and use max workspace size) compared to + // inference build (EXHAUSTIVE, 32M workspace size). We observed better perf when we pad input shape + // [N,C,D] to [N,C,1,D], expecially on A100, and especially for ConvGrad. + // PyTorch also pads to [N,C,1,D]. For inference build, we still pad it to [N, C, D, 1] as this seems + // to be the sweet spot for all algo search options: EXHAUSTIVE, HEURISTIC, and DEFAULT. + // See PR #7348 and #7702 for more context. + if (cuda_ep->GetCudnnConv1dPadToNc1d()) { + x_dims_cudnn.insert(x_dims_cudnn.begin() + 2, 1); + y_dims_cudnn.insert(y_dims_cudnn.begin() + 2, 1); + w_dims.insert(w_dims.begin() + 2, 1); + pads.insert(pads.begin() + kernel_rank, 0); + pads.insert(pads.begin(), 0); + kernel_shape.insert(kernel_shape.begin(), 1); + strides.insert(strides.begin(), 1); + dilations.insert(dilations.begin(), 1); + } else { + x_dims_cudnn.push_back(1); + y_dims_cudnn.push_back(1); + w_dims.push_back(1); + pads.insert(pads.begin() + kernel_rank, 0); + pads.insert(pads.end(), 0); + kernel_shape.push_back(1); + strides.push_back(1); + dilations.push_back(1); + } + } + + if (w_dims_changed) { + ORT_RETURN_IF_ERROR(s_.w_desc.Set(w_dims, ::onnxruntime::cuda::CudnnTensor::GetDataType())); + } + + // We must delay returning early until here so that the weight dims have been cached properly + if (s_.Y->Shape().Size() == 0) { + return Status::OK(); + } + + ORT_RETURN_IF_ERROR(s_.x_tensor.Set(x_dims_cudnn, ::onnxruntime::cuda::CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(s_.y_tensor.Set(y_dims_cudnn, ::onnxruntime::cuda::CudnnTensor::GetDataType())); + + ORT_RETURN_IF_ERROR(s_.conv_desc.Set(kernel_shape.size(), pads, strides, dilations, + gsl::narrow_cast(conv_attrs_.group), CUDNN_CROSS_CORRELATION, + ::onnxruntime::cuda::CudnnTensor::GetDataType(), UseTF32())); + + if (context->InputCount() >= 3) { + const Tensor* B = context->Input(2); + const auto& b_shape = B->Shape(); + ORT_RETURN_IF_NOT(b_shape.NumDimensions() == 1, "bias should be 1D"); + TensorShapeVector b_dims(2 + kernel_shape.size(), 1); + b_dims[1] = b_shape[0]; + ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, ::onnxruntime::cuda::CudnnTensor::GetDataType())); + // s_.b_data = reinterpret_cast(B->Data()); + } else if (bias_expected) { + TensorShapeVector b_dims(2 + kernel_shape.size(), 1); + b_dims[1] = w_dims[0]; + auto malloc_size = b_dims[1] * sizeof(CudaT); + ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, ::onnxruntime::cuda::CudnnTensor::GetDataType())); + if (s_.b_zero) { + CUDA_CALL_THROW(cudaFree(s_.b_zero)); + s_.b_zero = nullptr; + } + CUDA_CALL_THROW(cudaMalloc(&s_.b_zero, malloc_size)); + CUDA_CALL_THROW(cudaMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context))); + } + + if (!s_.cached_benchmark_results.contains(x_dims_cudnn)) { + // set math type to tensor core before algorithm search + if constexpr (std::is_same::value) { + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_TENSOR_OP_MATH)); + } else if constexpr (std::is_same::value) { + if (!UseTF32()) { + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_FMA_MATH)); + } + } + + cudnnConvolutionFwdAlgoPerf_t perf; + int algo_count = 1; + int cudnn_conv_algo = cuda_ep->GetCudnnConvAlgo(); + ORT_ENFORCE(cudnn_conv_algo > -1 && cudnn_conv_algo < 3, "cudnn_conv_algo should be 0, 1 or 2, but got ", + cudnn_conv_algo); + switch (cudnn_conv_algo) { + case 0: { + static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; + size_t max_ws_size = cuda_ep->GetCudnnConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetCudnnHandle(context), + s_, kAllAlgos, num_algos) + : ::onnxruntime::cuda::AlgoSearchWorkspaceSize; + // Use GetTransientScratchBuffer() so the workspace can be freed instead of cached. + // Because the benchmarking uses a huge amount of memory, e.g. a few GBs. + IAllocatorUniquePtr algo_search_workspace = GetTransientScratchBuffer(max_ws_size); + CUDNN_RETURN_IF_ERROR(cudnnFindConvolutionForwardAlgorithmEx( + GetCudnnHandle(context), + s_.x_tensor, + s_.x_data, + s_.w_desc, + s_.w_data, + s_.conv_desc, + s_.y_tensor, + s_.y_data, + 1, // requestedAlgoCount + &algo_count, // returnedAlgoCount + &perf, + algo_search_workspace.get(), + max_ws_size)); + break; + } + case 1: + CUDNN_RETURN_IF_ERROR(cudnnGetConvolutionForwardAlgorithm_v7( + GetCudnnHandle(context), + s_.x_tensor, + s_.w_desc, + s_.conv_desc, + s_.y_tensor, + 1, // requestedAlgoCount + &algo_count, // returnedAlgoCount + &perf)); + break; + + default: + perf.algo = kDefaultConvAlgo; + CUDNN_RETURN_IF_ERROR(GetWorkspaceSize(GetCudnnHandle(context), s_, perf.algo, &perf.memory)); + + if constexpr (std::is_same::value) { + perf.mathType = CUDNN_TENSOR_OP_MATH; + } else if (std::is_same::value && !UseTF32()) { + perf.mathType = CUDNN_FMA_MATH; + } else { + perf.mathType = CUDNN_DEFAULT_MATH; + } + } + s_.cached_benchmark_results.insert(x_dims_cudnn, {perf.algo, perf.memory, perf.mathType}); + } + const auto& perf = s_.cached_benchmark_results.at(x_dims_cudnn); + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, perf.mathType)); + s_.algo = perf.algo; + s_.workspace_bytes = perf.memory; + } else { + // set Y + s_.Y = context->Output(0, s_.y_dims); + if (s_.Y->Shape().Size() == 0) { + return Status::OK(); + } + if (s_.post_slicing_required) { + s_.memory_for_cudnn_conv_results = GetScratchBuffer( + TensorShape(s_.y_dims_with_adjusted_pads).Size() * s_.element_size, context->GetComputeStream()); + s_.y_data = reinterpret_cast(s_.memory_for_cudnn_conv_results.get()); + } else { + s_.y_data = reinterpret_cast(s_.Y->MutableData()); + } + } + return Status::OK(); + } + Status ComputeInternal(OpKernelContext* context) const override { - std::lock_guard lock(Base::s_.mutex); + std::lock_guard lock(s_.mutex); auto cudnnHandle = this->GetCudnnHandle(context); - ORT_RETURN_IF_ERROR(Base::UpdateState(context, true)); - if (Base::s_.Y->Shape().Size() == 0) { + ORT_RETURN_IF_ERROR(UpdateState(context, true)); + if (s_.Y->Shape().Size() == 0) { return Status::OK(); } - bool has_z = nullptr != Base::s_.z_data; - bool has_b = nullptr != Base::s_.b_data; - typedef typename onnxruntime::cuda::ToCudaType::MappedType CudaT; + bool has_z = nullptr != s_.z_data; + bool has_b = nullptr != s_.b_data; const auto alpha = onnxruntime::cuda::Consts::One; const auto beta = onnxruntime::cuda::Consts::Zero; - IAllocatorUniquePtr workspace = Base::GetWorkSpace(context->GetComputeStream()); + IAllocatorUniquePtr workspace = GetWorkSpace(context->GetComputeStream()); auto cudnn_status = cudnnConvolutionBiasActivationForward(cudnnHandle, &alpha, - Base::s_.x_tensor, - Base::s_.x_data, - Base::s_.w_desc, - Base::s_.w_data, - Base::s_.conv_desc, - Base::s_.algo, + s_.x_tensor, + s_.x_data, + s_.w_desc, + s_.w_data, + s_.conv_desc, + s_.algo, workspace.get(), - Base::s_.workspace_bytes, + s_.workspace_bytes, has_z ? &alpha : &beta, - has_z ? Base::s_.z_tensor : Base::s_.y_tensor, - has_z ? Base::s_.z_data : Base::s_.y_data, - Base::s_.b_tensor, - has_b ? Base::s_.b_data : Base::s_.b_zero, + has_z ? s_.z_tensor : s_.y_tensor, + has_z ? s_.z_data : s_.y_data, + s_.b_tensor, + has_b ? s_.b_data : s_.b_zero, activation_desc_, - Base::s_.y_tensor, - Base::s_.y_data); + s_.y_tensor, + s_.y_data); if (CUDNN_STATUS_SUCCESS != cudnn_status) { CUDNN_RETURN_IF_ERROR(cudnnConvolutionForward(cudnnHandle, &alpha, - Base::s_.x_tensor, - Base::s_.x_data, - Base::s_.w_desc, - Base::s_.w_data, - Base::s_.conv_desc, - Base::s_.algo, + s_.x_tensor, + s_.x_data, + s_.w_desc, + s_.w_data, + s_.conv_desc, + s_.algo, workspace.get(), - Base::s_.workspace_bytes, + s_.workspace_bytes, &beta, - Base::s_.y_tensor, - Base::s_.y_data)); + s_.y_tensor, + s_.y_data)); if (has_b) { - CUDNN_RETURN_IF_ERROR(cudnnAddTensor(cudnnHandle, &alpha, Base::s_.b_tensor, Base::s_.b_data, - &alpha, Base::s_.y_tensor, Base::s_.y_data)); + CUDNN_RETURN_IF_ERROR(cudnnAddTensor(cudnnHandle, &alpha, s_.b_tensor, s_.b_data, + &alpha, s_.y_tensor, s_.y_data)); } if (has_z) { - CUDNN_RETURN_IF_ERROR(cudnnAddTensor(cudnnHandle, &alpha, Base::s_.z_tensor, Base::s_.z_data, - &alpha, Base::s_.y_tensor, Base::s_.y_data)); + CUDNN_RETURN_IF_ERROR(cudnnAddTensor(cudnnHandle, &alpha, s_.z_tensor, s_.z_data, + &alpha, s_.y_tensor, s_.y_data)); } - CUDNN_RETURN_IF_ERROR(cudnnActivationForward(cudnnHandle, activation_desc_, &alpha, Base::s_.y_tensor, - Base::s_.y_data, &beta, Base::s_.y_tensor, Base::s_.y_data)); + CUDNN_RETURN_IF_ERROR(cudnnActivationForward(cudnnHandle, activation_desc_, &alpha, s_.y_tensor, + s_.y_data, &beta, s_.y_tensor, s_.y_data)); } - if (Base::s_.post_slicing_required) { - ORT_RETURN_IF_ERROR(onnxruntime::cuda::SliceOutUnwantedOutputSection( - this->Stream(context), Base::s_.y_data, Base::s_.y_dims_with_adjusted_pads, Base::s_.Y->MutableDataRaw(), - Base::s_.y_dims.GetDims(), Base::s_.slice_starts, Base::s_.slice_ends, Base::s_.slice_axes, Base::s_.element_size)); + if (s_.post_slicing_required) { + ORT_RETURN_IF_ERROR(SliceOutUnwantedOutputSection( + this->Stream(context), s_.y_data, s_.y_dims_with_adjusted_pads, s_.Y->MutableDataRaw(), + s_.y_dims.GetDims(), s_.slice_starts, s_.slice_ends, s_.slice_axes, s_.element_size)); } return Status::OK(); } @@ -107,6 +421,25 @@ class FusedConv : public onnxruntime::cuda::Conv { } return Status::OK(); } + + inline IAllocatorUniquePtr GetWorkSpace(onnxruntime::Stream* stream) const { + return GetScratchBuffer(s_.workspace_bytes, stream); + } + + ConvAttributes conv_attrs_; + mutable ::onnxruntime::cuda::CudnnConvState s_; + constexpr static auto kDefaultConvAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + constexpr static cudnnConvolutionFwdAlgo_t kAllAlgos[] = { + CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_FFT, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, + }; + cudnnActivationMode_t activation_mode_; cudnnActivationDescriptor_t activation_desc_ = nullptr; }; @@ -122,4 +455,4 @@ ONNX_OPERATOR_TYPED_KERNEL_EX( } // namespace cuda } // namespace contrib -} // namespace onnxruntime +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/optimizer/conv_activation_fusion.cc b/onnxruntime/core/optimizer/conv_activation_fusion.cc index 12746ad53123a..c7d2d95e6121d 100644 --- a/onnxruntime/core/optimizer/conv_activation_fusion.cc +++ b/onnxruntime/core/optimizer/conv_activation_fusion.cc @@ -58,16 +58,11 @@ bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) { } bool ConvFusionDataTypeCheck(const Node& conv_node) { - // TODO(hasesh): The CPU and CUDA EP only support float type for the Conv+Activation + // TODO(hasesh): The CPU EP only supports float type for the Conv+Activation // and the Conv+Add+Relu fusions. // Assess the support level for the other compatible EPs and if they also // only support float, remove the EP check altogether. const std::string_view node_ep = conv_node.GetExecutionProviderType(); - if (node_ep == kCudaExecutionProvider) { - if (!HasElementDataType(*conv_node.InputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT)) { - return false; - } - } if (node_ep == kCpuExecutionProvider) { #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED if (!HasElementDataType(*conv_node.InputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT) && @@ -120,7 +115,9 @@ class ConvActivationSelector : public NodeSelector { } // check EP type and activation - if (node_ep == kCudaExecutionProvider || node_ep == kRocmExecutionProvider) { + if (node_ep == kCudaExecutionProvider) { + return std::nullopt; + } else if (node_ep == kRocmExecutionProvider) { if (!graph_utils::IsSupportedOptypeVersionAndDomain(*next_node, "Relu", {6, 13, 14})) { return std::nullopt; } @@ -142,43 +139,6 @@ class ConvActivationSelector : public NodeSelector { } }; -class ConvAddRelu : public NodeSelector { - public: - ConvAddRelu() = default; - - std::optional Select(const GraphViewer& graph_viewer, const Node& node) const override { - const std::string_view node_ep = node.GetExecutionProviderType(); - // only for CUDA EP - if (node_ep != kCudaExecutionProvider) { - return std::nullopt; - } - - if (!ConvFusionDataTypeCheck(node)) { - return std::nullopt; - } - - const auto* add_node = GetLoneConsumerNode(graph_viewer, node); - if (!add_node || - !graph_utils::IsSupportedOptypeVersionAndDomain(*add_node, "Add", {6, 7, 13, 14}) || - add_node->GetExecutionProviderType() != node_ep) { - return std::nullopt; - } - - const auto* relu_node = GetLoneConsumerNode(graph_viewer, *add_node); - if (!relu_node || - !graph_utils::IsSupportedOptypeVersionAndDomain(*relu_node, "Relu", {6, 13, 14}) || - relu_node->GetExecutionProviderType() != node_ep) { - return std::nullopt; - } - - NodesToOptimizeIndicesBuilder builder{}; - builder.target_node = node.Index(); - builder.output_nodes = {add_node->Index(), - relu_node->Index()}; - return builder.Build(); - } -}; - } // namespace selectors #endif // !defined(ORT_MINIMAL_BUILD) @@ -304,22 +264,9 @@ void RegisterConvActivationFusionRules(SelectorActionRegistry& registry) { #endif } -void RegisterConvAddReluFusionRules(SelectorActionRegistry& registry) { - const auto name = "ConvAddRelu"; - auto action = std::make_unique(); -#if !defined(ORT_MINIMAL_BUILD) - auto selector = std::make_unique(); - registry.RegisterSelectorAndAction(name, {{"Conv", {1, 11}}}, - std::move(selector), std::move(action)); -#else - registry.RegisterAction(name, std::move(action)); -#endif -} - SelectorActionRegistry CreateSelectorActionRegistry() { SelectorActionRegistry registry{}; RegisterConvActivationFusionRules(registry); - RegisterConvAddReluFusionRules(registry); return registry; } diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 7da65f18ccacb..a5eca9c7bdaf7 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -281,6 +281,11 @@ InlinedVector> GenerateTransformers( onnxruntime::kCudaExecutionProvider, onnxruntime::kRocmExecutionProvider, onnxruntime::kDmlExecutionProvider}; + const InlinedHashSet cpu_rocm_acl_armnn_js_eps = {onnxruntime::kCpuExecutionProvider, + onnxruntime::kRocmExecutionProvider, + onnxruntime::kAclExecutionProvider, + onnxruntime::kArmNNExecutionProvider, + onnxruntime::kJsExecutionProvider}; const InlinedHashSet cpu_cuda_rocm_acl_armnn_js_eps = {onnxruntime::kCpuExecutionProvider, onnxruntime::kCudaExecutionProvider, onnxruntime::kRocmExecutionProvider, @@ -316,7 +321,7 @@ InlinedVector> GenerateTransformers( transformers.emplace_back(std::make_unique(cpu_dml_eps)); transformers.emplace_back(std::make_unique(cpu_ep)); - transformers.emplace_back(std::make_unique(cpu_cuda_rocm_acl_armnn_js_eps)); + transformers.emplace_back(std::make_unique(cpu_rocm_acl_armnn_js_eps)); transformers.emplace_back(std::make_unique(cpu_cuda_dml_rocm_eps)); transformers.emplace_back(std::make_unique(cpu_cuda_dml_rocm_eps)); diff --git a/onnxruntime/core/providers/common.h b/onnxruntime/core/providers/common.h index 564c7690c8da1..7576dfba5c85e 100644 --- a/onnxruntime/core/providers/common.h +++ b/onnxruntime/core/providers/common.h @@ -83,6 +83,14 @@ inline AutoPadType StringToAutoPadType(const std::string& str) { // helper function +constexpr inline int64_t ComputeOutputShape(const int64_t in_dim, + const int64_t stride, const int64_t kernel, const int64_t dilation, + const int64_t pad_head, const int64_t pad_tail) { + const SafeInt dkernel = SafeInt(dilation) * (kernel - 1) + 1; + int64_t dkernel_value = SafeInt(in_dim) + pad_head + pad_tail - dkernel; + return static_cast(static_cast(dkernel_value) / stride + 1); +} + inline Status ComputePad(const int64_t in_dim, const int64_t stride, const int64_t kernel, const int64_t dilation, AutoPadType pad_type, @@ -106,6 +114,15 @@ inline Status ComputePad(const int64_t in_dim, // is retained as is SafeInt legacy_target_size = (SafeInt(in_dim) + stride - 1) / stride; SafeInt pad_needed = (legacy_target_size - 1) * stride + kernel - in_dim; + // out_dim = floor((in_dim + 2p - k) / s) + 1 + // => if (in_dim + 2p - k) is not divisible by s we can remove the floor with following equation: + // out_dim + eps = ((in_dim + 2p - k) / s) + 1 ;where eps is in [0.0, 1.0] + // therefore in edge cases padding can lower calculated above than it should be + SafeInt actual_out_size = ComputeOutputShape(in_dim, stride, kernel, /*dilation*/ 1, + pad_needed, pad_needed); + if (actual_out_size < legacy_target_size) { + pad_needed += 1; + } // make sure padding is symmetric if (force_symmetric_auto_padding) { // Inlining math::roundUpPow2() from util/math.h to avoid bringing in the transitive dependencies. @@ -126,14 +143,6 @@ inline Status ComputePad(const int64_t in_dim, return Status::OK(); } -constexpr inline int64_t ComputeOutputShape(const int64_t in_dim, - const int64_t stride, const int64_t kernel, const int64_t dilation, - const int64_t pad_head, const int64_t pad_tail) { - const SafeInt dkernel = SafeInt(dilation) * (kernel - 1) + 1; - int64_t dkernel_value = SafeInt(in_dim) + pad_head + pad_tail - dkernel; - return static_cast(static_cast(dkernel_value) / stride + 1); -} - inline Status ComputePadAndOutputShape(const int64_t in_dim, const int64_t stride, const int64_t kernel, const int64_t dilation, AutoPadType pad_type, diff --git a/onnxruntime/core/providers/cuda/cuda_call.cc b/onnxruntime/core/providers/cuda/cuda_call.cc index c73b23f3762ed..511a6e2dce199 100644 --- a/onnxruntime/core/providers/cuda/cuda_call.cc +++ b/onnxruntime/core/providers/cuda/cuda_call.cc @@ -34,7 +34,6 @@ const char* CudaErrString(cudaError_t x) { template <> const char* CudaErrString(cublasStatus_t e) { cudaDeviceSynchronize(); - switch (e) { CASE_ENUM_TO_STR(CUBLAS_STATUS_SUCCESS); CASE_ENUM_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); @@ -87,9 +86,15 @@ const char* CudaErrString(ncclResult_t e) { } #endif -template +template +int GetErrorCode(ERRTYPE err) { + return static_cast(err); +} + +template std::conditional_t CudaCall( - ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode, const char* msg, const char* file, const int line) { + ERRTYPE retCode, const char* exprString, const char* libName, SUCCTYPE successCode, const char* msg, + const char* file, const int line) { if (retCode != successCode) { try { #ifdef _WIN32 @@ -108,7 +113,7 @@ std::conditional_t CudaCall( cudaGetLastError(); // clear last CUDA error static char str[1024]; snprintf(str, 1024, "%s failure %d: %s ; GPU=%d ; hostname=%s ; file=%s ; line=%d ; expr=%s; %s", - libName, (int)retCode, CudaErrString(retCode), currentCudaDevice, + libName, GetErrorCode(retCode), CudaErrString(retCode), currentCudaDevice, hostname, file, line, exprString, msg); if constexpr (THRW) { @@ -118,7 +123,8 @@ std::conditional_t CudaCall( LOGS_DEFAULT(ERROR) << str; return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, str); } - } catch (const std::exception& e) { // catch, log, and rethrow since CUDA code sometimes hangs in destruction, so we'd never get to see the error + } catch (const std::exception& e) { // catch, log, and rethrow since CUDA code sometimes hangs in destruction, + // so we'd never get to see the error if constexpr (THRW) { ORT_THROW(e.what()); } else { diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 8c03e489d298d..45d26f8b4bd19 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -180,6 +180,7 @@ CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId de CUDNN_CALL_THROW(cudnnCreate(&cudnn_handle_)); CUDNN_CALL_THROW(cudnnSetStream(cudnn_handle_, stream)); + LOGS_DEFAULT(INFO) << "cuDNN version: " << cudnnGetVersion(); #endif cuda_graph_.SetStream(stream); } @@ -2469,6 +2470,19 @@ static bool ConvTransposeNeedFallbackToCPU(const onnxruntime::Node& node, const return false; } +static bool NhwcConvNeedFallbackToCPU(const onnxruntime::Node& node, const logging::Logger& logger, + [[maybe_unused]] const GraphViewer& graph_viewer, + [[maybe_unused]] const bool prefer_nhwc) { + // NHWC implementation doesn't handle W in NHWC layout if it's not an initializer + if (!graph_viewer.IsConstantInitializer(node.InputDefs()[1]->Name(), true)) { + LOGS(logger, WARNING) << "Dropping the NhwcConv node: " << node.Name() + << " to CPU because the Cuda EP requires W as initializer for NHWC operation."; + return true; + } + + return false; +} + static bool CastNeedFallbackToCPU(const onnxruntime::Node& node) { const auto& node_attributes = node.GetAttributes(); // Check attributes @@ -2539,6 +2553,9 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, } else if ("Cast" == node.OpType()) { not_supported = CastNeedFallbackToCPU(node); // cast is not compute heavy, and may be placed outside + } else if ("NhwcConv" == node.OpType()) { + not_supported = NhwcConvNeedFallbackToCPU(node, logger, graph, IsNHWCPreferred()); + force_inside = !not_supported; } if (!force_inside && not_supported) { diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index 9c8a8712ca51c..0871f7e4d0a74 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -82,6 +82,7 @@ class CUDAExecutionProvider : public IExecutionProvider { bool GetCudnnConv1dPadToNc1d() const { return info_.cudnn_conv1d_pad_to_nc1d; } bool IsSkipLayerNormInStrictMode() const { return info_.enable_skip_layer_norm_strict_mode; } bool IsNHWCPreferred() const { return info_.prefer_nhwc; } + bool IsFuseConvBias() const { return info_.fuse_conv_bias; } bool UseTF32() const { return info_.use_tf32; } #ifndef DISABLE_CONTRIB_OPS diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc index 31cf991a34fc9..14195703d5963 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc @@ -34,6 +34,7 @@ constexpr const char* kEnableSkipLayerNormStrictMode = "enable_skip_layer_norm_s constexpr const char* kPreferNHWCMode = "prefer_nhwc"; constexpr const char* kUseEPLevelUnifiedStream = "use_ep_level_unified_stream"; constexpr const char* kUseTF32 = "use_tf32"; +constexpr const char* kFuseConvBias = "fuse_conv_bias"; constexpr const char* kSdpaKernel = "sdpa_kernel"; } // namespace provider_option_names @@ -119,6 +120,7 @@ CUDAExecutionProviderInfo CUDAExecutionProviderInfo::FromProviderOptions(const P .AddAssignmentToReference(cuda::provider_option_names::kUseEPLevelUnifiedStream, info.use_ep_level_unified_stream) .AddAssignmentToReference(cuda::provider_option_names::kUseTF32, info.use_tf32) .AddAssignmentToReference(cuda::provider_option_names::kSdpaKernel, info.sdpa_kernel) + .AddAssignmentToReference(cuda::provider_option_names::kFuseConvBias, info.fuse_conv_bias) .AddValueParser( cuda::provider_option_names::kTunableOpEnable, [&info](const std::string& value_str) -> Status { @@ -173,6 +175,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const CUDAExecution {cuda::provider_option_names::kUseEPLevelUnifiedStream, MakeStringWithClassicLocale(info.use_ep_level_unified_stream)}, {cuda::provider_option_names::kUseTF32, MakeStringWithClassicLocale(info.use_tf32)}, {cuda::provider_option_names::kSdpaKernel, MakeStringWithClassicLocale(info.sdpa_kernel)}, + {cuda::provider_option_names::kFuseConvBias, MakeStringWithClassicLocale(info.fuse_conv_bias)}, }; return options; @@ -195,6 +198,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const OrtCUDAProvid {cuda::provider_option_names::kPreferNHWCMode, MakeStringWithClassicLocale(info.prefer_nhwc)}, {cuda::provider_option_names::kUseEPLevelUnifiedStream, MakeStringWithClassicLocale(info.use_ep_level_unified_stream)}, {cuda::provider_option_names::kUseTF32, MakeStringWithClassicLocale(info.use_tf32)}, + {cuda::provider_option_names::kFuseConvBias, MakeStringWithClassicLocale(info.fuse_conv_bias)}, {cuda::provider_option_names::kSdpaKernel, MakeStringWithClassicLocale(info.sdpa_kernel)}, }; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h index 0efad80f743df..bfd50ca8d40a1 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h @@ -78,6 +78,7 @@ struct CUDAExecutionProviderInfo { // By default, enable TF32 to speed up float GEMM/MatMul or cuDNN convolution of float matrices. bool use_tf32{true}; + bool fuse_conv_bias{false}; int sdpa_kernel{0}; @@ -107,7 +108,8 @@ struct std::hash<::onnxruntime::CUDAExecutionProviderInfo> { (static_cast(info.enable_skip_layer_norm_strict_mode) << 27) ^ (static_cast(info.prefer_nhwc) << 28) ^ (static_cast(info.use_ep_level_unified_stream) << 29) ^ - (static_cast(info.use_tf32) << 30); + (static_cast(info.use_tf32) << 30) ^ + (static_cast(info.fuse_conv_bias) << 31); onnxruntime::HashCombine(data, value); onnxruntime::HashCombine(info.gpu_mem_limit, value); diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index b1d54e56ded4e..83a5d02d16c6c 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -219,6 +219,7 @@ struct CUDA_Provider : Provider { info.cudnn_conv_use_max_workspace = params->cudnn_conv_use_max_workspace != 0; info.enable_cuda_graph = params->enable_cuda_graph != 0; info.prefer_nhwc = params->prefer_nhwc; + info.fuse_conv_bias = params->fuse_conv_bias; info.cudnn_conv1d_pad_to_nc1d = params->cudnn_conv1d_pad_to_nc1d != 0; info.tunable_op.enable = params->tunable_op_enable; info.tunable_op.tuning_enable = params->tunable_op_tuning_enable; @@ -262,6 +263,7 @@ struct CUDA_Provider : Provider { cuda_options.use_ep_level_unified_stream = internal_options.use_ep_level_unified_stream; cuda_options.use_tf32 = internal_options.use_tf32; cuda_options.sdpa_kernel = internal_options.sdpa_kernel; + cuda_options.fuse_conv_bias = internal_options.fuse_conv_bias; } ProviderOptions GetProviderOptions(const void* provider_options) override { diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index 14b75d2383b58..e9b159516dad9 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -215,6 +215,9 @@ void* CudaStream::GetResource(int version, int id) const { case CudaResource::prefer_nhwc_t: return reinterpret_cast(ep_info_.prefer_nhwc); break; + case CudaResource::fuse_conv_bias_t: + return reinterpret_cast(ep_info_.fuse_conv_bias); + break; case CudaResource::use_tf32_t: return reinterpret_cast(ep_info_.use_tf32); break; diff --git a/onnxruntime/core/providers/cuda/cudnn_common.cc b/onnxruntime/core/providers/cuda/cudnn_common.cc index 31fc63a86d640..72482266a2eef 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.cc +++ b/onnxruntime/core/providers/cuda/cudnn_common.cc @@ -3,6 +3,8 @@ // Licensed under the MIT License. #include +#include +#include #include "core/providers/cuda/cudnn_common.h" #include "core/common/inlined_containers.h" @@ -60,7 +62,25 @@ Status CudnnTensor::Set(gsl::span input_dims, cudnnDataType_t dat dims[1] = gsl::narrow_cast(input_dims[rank - 1]); strides[1] = gsl::narrow_cast(pitches[rank - 1]); } - CUDNN_RETURN_IF_ERROR(cudnnSetTensorNdDescriptor(tensor_, dataType, static_cast(rank), dims.data(), strides.data())); + CUDNN_RETURN_IF_ERROR(cudnnSetTensorNdDescriptor(tensor_, dataType, static_cast(rank), + dims.data(), strides.data())); + return Status::OK(); +} + +Status CudnnTensor::Set(gsl::span input_dims, cudnnDataType_t dataType, + gsl::span input_strides) { + ORT_RETURN_IF_ERROR(CreateTensorIfNeeded()); + + int rank = gsl::narrow_cast(input_dims.size()); + InlinedVector dims(rank); + InlinedVector strides(rank); + + for (int i = 0; i < rank; i++) { + dims[i] = gsl::narrow_cast(input_dims[i]); + strides[i] = gsl::narrow_cast(input_strides[i]); + } + CUDNN_RETURN_IF_ERROR( + cudnnSetTensorNdDescriptor(tensor_, dataType, static_cast(rank), dims.data(), strides.data())); return Status::OK(); } @@ -100,7 +120,8 @@ Status CudnnDataTensor::Set(cudnnDataType_t dataType, const int32_t* seq_lengths) { ORT_RETURN_IF_ERROR(CreateTensorIfNeeded()); - // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED works with CUDNN_RNN_PADDED_IO_ENABLED, so that it will auto fill 0 for the shorter sequences + // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED works with CUDNN_RNN_PADDED_IO_ENABLED, + // so that it will auto fill 0 for the shorter sequences cudnnRNNDataLayout_t layout = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED; float padding_fill = 0.0f; CUDNN_RETURN_IF_ERROR(cudnnSetRNNDataDescriptor(tensor_, dataType, layout, @@ -238,6 +259,91 @@ const Float8E5M2 Consts::One = Float8E5M2(1.0f, true); #endif +std::vector generateStrides(const std::vector& shape, bool channels_last) { + // For INT8x4 and INT8x32 we still compute standard strides here to input + // into the cuDNN functions. We will manually scale by resizeFactor in the cpu ref. + std::vector strides(shape.size()); + int64_t nbDims = strides.size(); + if (nbDims <= 1) { + strides[0] = 1; + return strides; + } + if (channels_last) { + // Here we assume that the format is CUDNN_TENSOR_NHWC + strides[1] = 1; + strides[nbDims - 1] = strides[1] * shape[1]; + for (int64_t d = nbDims - 2; d >= 2; d--) { + strides[d] = strides[d + 1] * shape[d + 1]; + } + strides[0] = strides[2] * shape[2]; + } else { + strides[nbDims - 1] = 1; + for (int64_t d = nbDims - 2; d >= 0; d--) { + strides[d] = strides[d + 1] * shape[d + 1]; + } + } + return strides; +} + +#if !defined(__CUDACC__) +CudnnFeTensor::CudnnFeTensor(const onnxruntime::TensorShapeVector& shape, + const std::string& name, + std::optional dtype, + const bool nhwc) { + std::vector shape_vec; + if (shape.size() == 1) { + shape_vec = {1, shape[0], 1, 1}; + } else if (shape.size() >= 4) { + for (size_t i = 0; i < shape.size(); i++) { + shape_vec.push_back(shape[i]); + } + } else { + ORT_THROW("Invalid tensor shape size, tensor name: ", name, ", shape size: ", shape.size()); + } + auto strides = generateStrides(shape_vec, nhwc); + + if (dtype.has_value()) { + tensor_ = cudnn_frontend::graph::Tensor_attributes() + .set_name(name) + .set_dim(shape_vec) + .set_stride(strides) + .set_data_type(dtype.value()); + } else { + tensor_ = cudnn_frontend::graph::Tensor_attributes().set_name(name).set_dim(shape_vec).set_stride(strides); + } +} + +template +cudnn_frontend::DataType_t CudnnFeTensor::GetDataType() { + return cudnn_frontend::DataType_t::NOT_SET; +} + +template <> +cudnn_frontend::DataType_t CudnnFeTensor::GetDataType() { + return cudnn_frontend::DataType_t::FLOAT; +} + +template <> +cudnn_frontend::DataType_t CudnnFeTensor::GetDataType() { + return cudnn_frontend::DataType_t::HALF; +} + +template <> +cudnn_frontend::DataType_t CudnnFeTensor::GetDataType() { + return cudnn_frontend::DataType_t::DOUBLE; +} + +template <> +cudnn_frontend::DataType_t CudnnFeTensor::GetDataType() { + return cudnn_frontend::DataType_t::INT8; +} + +template <> +cudnn_frontend::DataType_t CudnnFeTensor::GetDataType() { + return cudnn_frontend::DataType_t::UINT8; +} +#endif + } // namespace cuda } // namespace onnxruntime #endif diff --git a/onnxruntime/core/providers/cuda/cudnn_common.h b/onnxruntime/core/providers/cuda/cudnn_common.h index 2cbeb13696270..b267ef6bed64f 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.h +++ b/onnxruntime/core/providers/cuda/cudnn_common.h @@ -5,12 +5,22 @@ #pragma once #include +#include +#include #include "core/providers/cuda/cuda_common.h" +#include "core/providers/cuda/shared_inc/cudnn_fe_call.h" + #ifndef USE_CUDA_MINIMAL +#if !defined(__CUDACC__) +#include +#endif + namespace onnxruntime { namespace cuda { +#define CUDNN_FE_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUDNN_FE_CALL(expr)) + class CudnnTensor final { public: CudnnTensor(); @@ -19,6 +29,7 @@ class CudnnTensor final { Status Set(gsl::span input_dims, cudnnDataType_t dataType, bool is_nhwc = false); Status Set(const CudnnTensor& x_desc, cudnnBatchNormMode_t mode); + Status Set(gsl::span input_dims, cudnnDataType_t dataType, gsl::span input_strides); // Set 4D tensor format (for NHWC) Status Set(cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w); @@ -139,7 +150,8 @@ struct Consts { inline double ClampCudnnBatchNormEpsilon(double epsilon) { if (epsilon < CUDNN_BN_MIN_EPSILON) { if (CUDNN_BN_MIN_EPSILON - epsilon > FLT_EPSILON) - LOGS_DEFAULT(WARNING) << "Provided epsilon is smaller than CUDNN_BN_MIN_EPSILON. Setting it to CUDNN_BN_MIN_EPSILON"; + LOGS_DEFAULT(WARNING) << "Provided epsilon is smaller than CUDNN_BN_MIN_EPSILON. " + << "Setting it to CUDNN_BN_MIN_EPSILON"; return CUDNN_BN_MIN_EPSILON; } return epsilon; @@ -258,6 +270,23 @@ SetPoolingNdDescriptorHelper(cudnnPoolingDescriptor_t poolingDesc, return cudnnSetPoolingNdDescriptor(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); } +std::vector generateStrides(const std::vector& shape, bool channels_last); + +#if !defined(__CUDACC__) +class CudnnFeTensor final { + public: + CudnnFeTensor(const onnxruntime::TensorShapeVector& shape, const std::string& name, + std::optional dtype, const bool nhwc); + + template + static cudnn_frontend::DataType_t GetDataType(); + cudnn_frontend::graph::Tensor_attributes Get() { return tensor_; } + + private: + cudnn_frontend::graph::Tensor_attributes tensor_; +}; +#endif + } // namespace cuda } // namespace onnxruntime #endif diff --git a/onnxruntime/core/providers/cuda/cudnn_fe_call.cc b/onnxruntime/core/providers/cuda/cudnn_fe_call.cc new file mode 100644 index 0000000000000..640025c248187 --- /dev/null +++ b/onnxruntime/core/providers/cuda/cudnn_fe_call.cc @@ -0,0 +1,119 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/cuda/shared_inc/cudnn_fe_call.h" +#include "core/providers/shared_library/provider_api.h" +#include +#if !defined(__CUDACC__) +#include +#endif +#ifdef _WIN32 +#else // POSIX +#include +#include +#endif + +namespace onnxruntime { + +using namespace common; + +template +const char* CudaErrString(ERRTYPE) { + ORT_NOT_IMPLEMENTED(); +} + +#if !defined(__CUDACC__) +#define CASE_ENUM_TO_STR_CUDNN_FE(x) \ + case cudnn_frontend::error_code_t::x: \ + return #x + +template <> +const char* CudaErrString(cudnn_frontend::error_t x) { + cudaDeviceSynchronize(); + LOGS_DEFAULT(ERROR) << x.get_message(); + switch (x.get_code()) { + CASE_ENUM_TO_STR_CUDNN_FE(OK); + CASE_ENUM_TO_STR_CUDNN_FE(ATTRIBUTE_NOT_SET); + CASE_ENUM_TO_STR_CUDNN_FE(SHAPE_DEDUCTION_FAILED); + CASE_ENUM_TO_STR_CUDNN_FE(INVALID_TENSOR_NAME); + CASE_ENUM_TO_STR_CUDNN_FE(INVALID_VARIANT_PACK); + CASE_ENUM_TO_STR_CUDNN_FE(GRAPH_NOT_SUPPORTED); + CASE_ENUM_TO_STR_CUDNN_FE(GRAPH_EXECUTION_PLAN_CREATION_FAILED); + CASE_ENUM_TO_STR_CUDNN_FE(GRAPH_EXECUTION_FAILED); + CASE_ENUM_TO_STR_CUDNN_FE(HEURISTIC_QUERY_FAILED); + CASE_ENUM_TO_STR_CUDNN_FE(UNSUPPORTED_GRAPH_FORMAT); + CASE_ENUM_TO_STR_CUDNN_FE(CUDA_API_FAILED); + CASE_ENUM_TO_STR_CUDNN_FE(CUDNN_BACKEND_API_FAILED); + CASE_ENUM_TO_STR_CUDNN_FE(INVALID_CUDA_DEVICE); + CASE_ENUM_TO_STR_CUDNN_FE(HANDLE_ERROR); + default: + return "Unknown CUDNN_FRONTEND error status"; + } +} + +template +int GetErrorCode(ERRTYPE err) { + return static_cast(err); +} + +template <> +int GetErrorCode(cudnn_frontend::error_t err) { + return static_cast(err.get_code()); +} + +template +std::conditional_t CudaCall( + ERRTYPE retCode, const char* exprString, const char* libName, SUCCTYPE successCode, const char* msg, + const char* file, const int line) { + if (retCode != successCode) { + try { +#ifdef _WIN32 + std::string hostname_str = GetEnvironmentVar("COMPUTERNAME"); + if (hostname_str.empty()) { + hostname_str = "?"; + } + const char* hostname = hostname_str.c_str(); +#else + char hostname[HOST_NAME_MAX]; + if (gethostname(hostname, HOST_NAME_MAX) != 0) + strcpy(hostname, "?"); +#endif + int currentCudaDevice; + cudaGetDevice(¤tCudaDevice); + cudaGetLastError(); // clear last CUDA error + static char str[1024]; + snprintf(str, 1024, "%s failure %d: %s ; GPU=%d ; hostname=%s ; file=%s ; line=%d ; expr=%s; %s", + libName, GetErrorCode(retCode), CudaErrString(retCode), currentCudaDevice, + hostname, + file, line, exprString, msg); + if constexpr (THRW) { + // throw an exception with the error info + ORT_THROW(str); + } else { + LOGS_DEFAULT(ERROR) << str; + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, str); + } + } catch (const std::exception& e) { // catch, log, and rethrow since CUDA code sometimes hangs in destruction, + // so we'd never get to see the error + if constexpr (THRW) { + ORT_THROW(e.what()); + } else { + LOGS_DEFAULT(ERROR) << e.what(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, e.what()); + } + } + } + if constexpr (!THRW) { + return Status::OK(); + } +} + +template Status CudaCall( + cudnn_frontend::error_t retCode, const char* exprString, const char* libName, + cudnn_frontend::error_code_t successCode, const char* msg, const char* file, const int line); +template void CudaCall( + cudnn_frontend::error_t retCode, const char* exprString, const char* libName, + cudnn_frontend::error_code_t successCode, const char* msg, const char* file, const int line); + +#endif +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index 764feadcf4cb3..1e216d6ee7a6d 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -3,13 +3,15 @@ // Licensed under the MIT License. #include +#include +#include +#include "core/common/status.h" #include "core/providers/cuda/nn/conv.h" #include "core/common/span_utils.h" #include "core/providers/cuda/cuda_common.h" -#include "core/providers/cuda/shared_inc/fpgeneric.h" -#include "core/providers/cuda/tensor/slice.h" #include "core/providers/cuda/tensor/transpose.h" +#include "core/providers/cuda/shared_inc/cudnn_fe_call.h" namespace onnxruntime { namespace cuda { @@ -43,58 +45,7 @@ REGISTER_KERNEL_TYPED(float, kMSInternalNHWCDomain, true) REGISTER_KERNEL_TYPED(MLFloat16, kMSInternalNHWCDomain, true) #endif -template -const cudnnConvolutionFwdAlgo_t Conv::kAllAlgos[] = { - CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, - CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, -}; - -cudnnStatus_t GetWorkspaceSize(cudnnHandle_t handle, const CudnnConvState& s, cudnnConvolutionFwdAlgo_t algo, size_t* sz) { - return cudnnGetConvolutionForwardWorkspaceSize(handle, s.x_tensor, s.w_desc, s.conv_desc, s.y_tensor, algo, sz); -} - -size_t GetMaxWorkspaceSize(cudnnHandle_t handle, const CudnnConvState& s, - const cudnnConvolutionFwdAlgo_t* algo, int n_algo) { - // TODO: get maximum available size from memory arena - size_t free, total; - CUDA_CALL_THROW(cudaMemGetInfo(&free, &total)); - // Assuming 10% of fragmentation - free = static_cast(static_cast(free) * 0.9); - size_t max_ws_size = 0; - for (int i = 0; i < n_algo; i++) { - cudnnStatus_t err; - size_t sz; - err = GetWorkspaceSize(handle, s, algo[i], &sz); - if (CUDNN_STATUS_SUCCESS != err || sz == 0 || sz < max_ws_size || sz > free) continue; - max_ws_size = sz; - } - return max_ws_size; -} - -Status SliceOutUnwantedOutputSection(cudaStream_t stream, - const void* input_data, gsl::span input_dims, - void* output_data, - const gsl::span& output_dims, - const gsl::span& starts, - const gsl::span& ends, - const gsl::span& axes, - size_t element_size) { - SliceOp::PrepareForComputeMetadata compute_metadata(input_dims); - - ORT_THROW_IF_ERROR(SliceBase::PrepareForCompute(starts, ends, axes, compute_metadata)); - - // As a sanity check, ensure that the slice operator's output shape matches with the expected output shape - ORT_ENFORCE(SpanEq(gsl::make_span(compute_metadata.output_dims_), output_dims)); - - return SliceCuda::Impl(stream, input_data, input_dims, output_data, compute_metadata, element_size); -} - +// First input (in this case X) is in case NHWC == true also in NHWC format, the other inputs in NCHW template Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, PrePackedWeights* /*prepacked_weights*/) { @@ -104,14 +55,20 @@ Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr if (is_nhwc_domain_ && input_idx == 1) { // InputTensors::IN_W // Transpose from {M, C/group, kH, kW} to {M, kH, kW, C/group} auto orig_shape = tensor.Shape(); + auto shape_size = orig_shape.GetDims().size(); - InlinedVector perm{0, 2, 3, 1}; - gsl::span permutation(perm.data(), 4); - TensorShapeVector new_dims{orig_shape[0], - orig_shape[2], - orig_shape[3], - orig_shape[1]}; - W_ = Tensor::Create(tensor.DataType(), TensorShape(new_dims), std::move(alloc)); + InlinedVector perm; + perm.push_back(0); + for (size_t i = 2; i < shape_size; i++) perm.push_back(i); + perm.push_back(1); + gsl::span permutation(perm.data(), shape_size); + + TensorShapeVector nhwc_dims; + for (size_t i = 0; i < shape_size; i++) { + nhwc_dims.push_back(orig_shape[perm[i]]); + } + + W_ = Tensor::Create(tensor.DataType(), TensorShape(nhwc_dims), std::move(alloc)); auto status = cuda::Transpose::DoTranspose(GetDeviceProp(), DefaultCudaStream(), @@ -122,6 +79,8 @@ Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr } CUDA_CALL_THROW(cudaStreamSynchronize(DefaultCudaStream())); is_packed = true; + } else { + W_already_nhwc = true; } } else { ORT_UNUSED_PARAMETER(tensor); @@ -132,45 +91,201 @@ Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr return Status::OK(); } -template -Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const { +#if !defined(__CUDACC__) + +template +Status Conv::CreateCudnnFeExecutionPlan(const onnxruntime::TensorShapeVector& x_dims, + const onnxruntime::TensorShapeVector& w_dims, + const Tensor* B, + const Tensor* Z, + const TensorShapeVector& y_dims, + cudnnContext* handle, + const cudnn_frontend::HeurMode_t heur_mode, + const std::vector& pads, + const std::vector& strides, + const std::vector& dilations, + const bool bias_expected, + const bool fuse_bias, + const bool fuse_act, + const bool w_in_nhwc, + const bool use_tf32) const { + s_.bias_fused = fuse_bias; + s_.act_fused = fuse_act; + s_.variant_pack.clear(); // clear variant pack, as stored pointers to tensors change + s_.cudnn_fe_graph = std::make_unique(); + cudnn_frontend::DataType_t data_type = CudnnFeTensor::GetDataType(); + s_.cudnn_fe_graph->set_io_data_type(data_type).set_intermediate_data_type(data_type); + if (data_type == cudnn_frontend::DataType_t::HALF) { + s_.cudnn_fe_graph->set_compute_data_type(cudnn_frontend::DataType_t::FLOAT); + } else { + s_.cudnn_fe_graph->set_compute_data_type(data_type); + } + + s_.cudnn_fe_X = s_.cudnn_fe_graph->tensor(CudnnFeTensor(x_dims, "x", data_type, Layout == LAYOUT_NHWC).Get()); + s_.cudnn_fe_W = s_.cudnn_fe_graph->tensor(CudnnFeTensor(w_dims, "w", data_type, w_in_nhwc).Get()); + + auto conv_options = cudnn_frontend::graph::Conv_fprop_attributes() + .set_pre_padding(std::vector(pads.begin(), + pads.begin() + pads.size() / 2)) + .set_post_padding(std::vector(pads.begin() + pads.size() / 2, pads.end())) + .set_stride(strides) + .set_dilation(dilations); + s_.cudnn_fe_conv_Y = s_.cudnn_fe_graph->conv_fprop(s_.cudnn_fe_X, s_.cudnn_fe_W, conv_options); + auto cudnn_fe_y_tensor = CudnnFeTensor(y_dims, "y", data_type, Layout == LAYOUT_NHWC).Get(); + + if (!bias_expected && B == nullptr) { + s_.cudnn_fe_Y = s_.cudnn_fe_conv_Y; + } else { + int64_t bias_size; + if (B != nullptr) { + bias_size = B->Shape()[0]; + } else { + bias_size = w_dims[0]; + } + + std::optional cudnn_fe_z_tensor; + if (Z) { + const auto& z_shape = Z->Shape().AsShapeVector(); + cudnn_fe_z_tensor = CudnnFeTensor(z_shape, "z", data_type, Layout == LAYOUT_NHWC).Get(); + } else if (fuse_bias && Layout == LAYOUT_NCHW) { + // Z is required for NCHW precompiled kernels in cuDNN + s_.z_data = s_.y_data; + cudnn_fe_z_tensor = cudnn_fe_y_tensor; + } + + if (fuse_bias) { + std::shared_ptr add_output; + if (cudnn_fe_z_tensor.has_value()) { + s_.cudnn_fe_Z = s_.cudnn_fe_graph->tensor(cudnn_fe_z_tensor.value()); + auto add_options = cudnn_frontend::graph::Pointwise_attributes().set_mode(cudnn_frontend::PointwiseMode_t::ADD); + add_output = s_.cudnn_fe_graph->pointwise(s_.cudnn_fe_conv_Y, s_.cudnn_fe_Z, add_options); + } else { + add_output = s_.cudnn_fe_conv_Y; + } + + onnxruntime::TensorShapeVector b_dims; + for (size_t i = 0; i < x_dims.size(); i++) { + b_dims.push_back(i == 1 ? bias_size : 1); + } + auto bias_tensor = CudnnFeTensor(b_dims, "b", data_type, Layout == LAYOUT_NHWC).Get(); + auto bias_options = cudnn_frontend::graph::Pointwise_attributes().set_mode(cudnn_frontend::PointwiseMode_t::ADD); + s_.cudnn_fe_B = s_.cudnn_fe_graph->tensor(bias_tensor); + s_.cudnn_fe_Y = s_.cudnn_fe_graph->pointwise(add_output, s_.cudnn_fe_B, bias_options); + } else { + s_.cudnn_fe_Y = s_.cudnn_fe_conv_Y; + + TensorShapeVector b_dims(y_dims.size(), 1); + TensorShapeVector b_strides(y_dims.size(), 1); + b_dims[1] = bias_size; + b_strides[0] = bias_size; + if (Z) { + ORT_RETURN_IF_ERROR(s_.z_tensor.Set(Z->Shape().AsShapeVector(), + CudnnTensor::GetDataType(), + cudnn_fe_z_tensor->get_stride())); + } + ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType(), b_strides)); + ORT_RETURN_IF_ERROR(s_.y_tensor.Set(y_dims, CudnnTensor::GetDataType(), cudnn_fe_y_tensor.get_stride())); + + /* Creating an own CUDNN Frontend graph for the bias addition. + s_.cudnn_fe_bias_graph = std::make_unique(); + s_.cudnn_fe_bias_graph->set_io_data_type(data_type) + .set_compute_data_type(data_type == cudnn_frontend::DataType_t::HALF ? + cudnn_frontend::DataType_t::FLOAT : data_type) + .set_intermediate_data_type(data_type); + s_.cudnn_fe_bias_X = s_.cudnn_fe_bias_graph->tensor(CudnnFeTensor(y_dims, "x", data_type).Get()); + + s_.cudnn_fe_B = s_.cudnn_fe_bias_graph->tensor(bias_tensor); + s_.cudnn_fe_bias_Y = s_.cudnn_fe_bias_graph->pointwise(s_.cudnn_fe_bias_X, s_.cudnn_fe_B, bias_options); + s_.cudnn_fe_bias_Y->set_output(true); + + CUDNN_FE_RETURN_IF_ERROR(s_.cudnn_fe_bias_graph->validate()); + CUDNN_FE_RETURN_IF_ERROR(s_.cudnn_fe_bias_graph->build_operation_graph(handle)); + CUDNN_FE_RETURN_IF_ERROR(s_.cudnn_fe_bias_graph->create_execution_plans({heur_mode})); + CUDNN_FE_RETURN_IF_ERROR(s_.cudnn_fe_bias_graph->check_support(handle)); + CUDNN_FE_RETURN_IF_ERROR(s_.cudnn_fe_bias_graph->build_plans(handle));*/ + } + } + if (fuse_act && s_.cudnn_fe_act_attr.has_value()) { + auto& activation_attr = s_.cudnn_fe_act_attr.value(); + s_.cudnn_fe_Y = s_.cudnn_fe_graph->pointwise(s_.cudnn_fe_Y, activation_attr); + } + + s_.cudnn_fe_Y->set_dim(cudnn_fe_y_tensor.get_dim()); + s_.cudnn_fe_Y->set_stride(cudnn_fe_y_tensor.get_stride()); + s_.cudnn_fe_Y->set_output(true); + + try { + CUDNN_FE_CALL_THROW(s_.cudnn_fe_graph->validate()); + CUDNN_FE_CALL_THROW(s_.cudnn_fe_graph->build_operation_graph(handle)); + CUDNN_FE_CALL_THROW(s_.cudnn_fe_graph->create_execution_plans({heur_mode})); + } catch (const std::exception& ex) { + LOGS_DEFAULT(ERROR) << "Failed to initialize CUDNN Frontend."; + return Status(common::StatusCategory::ONNXRUNTIME, common::StatusCode::EP_FAIL, ex.what()); + } + + if (!use_tf32) s_.cudnn_fe_graph->deselect_numeric_notes({cudnn_frontend::NumericalNote_t::TENSOR_CORE}); + + try { + CUDNN_FE_CALL_THROW(s_.cudnn_fe_graph->check_support(handle)); + CUDNN_FE_CALL_THROW(s_.cudnn_fe_graph->build_plans(handle)); + } catch (const std::exception& ex) { + if (!fuse_bias && !fuse_act && use_tf32) { + return Status(common::StatusCategory::ONNXRUNTIME, + common::StatusCode::EP_FAIL, "OP not supported by CUDNN Frontend."); + ; + } + return CreateCudnnFeExecutionPlan(x_dims, w_dims, B, Z, y_dims, handle, heur_mode, + pads, strides, dilations, bias_expected, false, false, w_in_nhwc, true); + } + + s_.workspace_bytes = s_.cudnn_fe_graph->get_workspace_size(); + return Status::OK(); +} + +#endif + +template +Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const { + constexpr bool channels_last = Layout; + // set X const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); + // X incl. x_dims is in NHWC Format iff. NHWC == true const auto x_dims = x_shape.AsShapeVector(); + s_.x_data = reinterpret_cast(X->Data()); s_.element_size = X->DataType()->Size(); // set W + bool w_in_nhwc; const Tensor* W; if (!W_) { W = context->Input(1); + w_in_nhwc = W_already_nhwc; + // Dims and memory layout are in NCHW format } else { W = W_.get(); + w_in_nhwc = true; + // W got prepacked, therfore if NHWC == true, then dims and memory layout are in NHWC } const TensorShape& w_shape = W->Shape(); - auto w_dims = w_shape.AsShapeVector(); + onnxruntime::TensorShapeVector w_dims = w_shape.AsShapeVector(); s_.w_data = reinterpret_cast(W->Data()); - // Make sure input and weight are 4D for NHWC since we set 4D descriptor for NHWC. - constexpr bool channels_last = NHWC; - if constexpr (channels_last) { - if (x_shape.NumDimensions() != 4 || w_shape.NumDimensions() != 4) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Number of dimensions of X and W should be 4 for channels_last format (NHWC)"); - } - } - // set B + // Always in NCHW format + const Tensor* B = nullptr; if (context->InputCount() >= 3) { - const Tensor* B = context->Input(2); + B = context->Input(2); s_.b_data = reinterpret_cast(B->Data()); } else { s_.b_data = nullptr; } + // set Z + const Tensor* Z = nullptr; if (context->InputCount() >= 4) { - const Tensor* Z = context->Input(3); - ORT_RETURN_IF_ERROR(s_.z_tensor.Set(Z->Shape().GetDims(), CudnnTensor::GetDataType())); + Z = context->Input(3); s_.z_data = reinterpret_cast(Z->Data()); } else { s_.z_data = nullptr; @@ -183,13 +298,12 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) if (w_dims_changed) { s_.last_w_dims = gsl::make_span(w_dims); - s_.cached_benchmark_results.clear(); } - ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last)); + ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, w_in_nhwc)); TensorShapeVector kernel_shape; - ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape, channels_last)); + ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape, w_in_nhwc)); const size_t kernel_rank = kernel_shape.size(); @@ -211,59 +325,46 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const int64_t N = X->Shape()[0]; const int64_t M = W->Shape()[0]; - if (channels_last) { + + if constexpr (channels_last) { y_dims.push_back(N); } else { y_dims.insert(y_dims.begin(), {N, M}); } - bool post_slicing_required = false; - TensorShapeVector slice_starts; - slice_starts.reserve(kernel_rank); - - TensorShapeVector slice_ends; - slice_ends.reserve(kernel_rank); - - TensorShapeVector slice_axes; - slice_axes.reserve(kernel_rank); - constexpr size_t spatial_dim_start = channels_last ? 1 : 2; const size_t spatial_dim_end = spatial_dim_start + kernel_rank; TensorShape spatial_shape = X->Shape().Slice(spatial_dim_start, spatial_dim_end); - TensorShapeVector y_dims_with_adjusted_pads(y_dims); - ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShapeWithAdjustedPads(spatial_shape, kernel_shape, - strides, dilations, pads, y_dims, y_dims_with_adjusted_pads, - post_slicing_required, slice_starts, slice_ends, slice_axes, - channels_last)); - if (channels_last) { + ORT_RETURN_IF_ERROR(conv_attrs_.InferPadsAndOutputShape(spatial_shape, kernel_shape, + strides, dilations, pads, y_dims)); + if constexpr (channels_last) { y_dims.push_back(M); - y_dims_with_adjusted_pads.push_back(M); } - ORT_ENFORCE(y_dims.size() == y_dims_with_adjusted_pads.size()); s_.y_dims = gsl::make_span(y_dims); - s_.y_dims_with_adjusted_pads = y_dims_with_adjusted_pads; - s_.post_slicing_required = post_slicing_required; - s_.slice_starts = slice_starts; - s_.slice_ends = slice_ends; - s_.slice_axes = slice_axes; - s_.Y = context->Output(0, TensorShape(s_.y_dims)); - if (post_slicing_required) { - // Post slicing needed. Create and fill in the Conv results in an intermediate buffer. - s_.memory_for_cudnn_conv_results = GetScratchBuffer(TensorShape(y_dims_with_adjusted_pads).Size() * s_.element_size, context->GetComputeStream()); - s_.y_data = reinterpret_cast(s_.memory_for_cudnn_conv_results.get()); - } else { - // No post slicing needed. Fill the output tensor's buffer directly. - s_.y_data = reinterpret_cast(s_.Y->MutableData()); - } + s_.y_data = reinterpret_cast(s_.Y->MutableData()); const CUDAExecutionProvider* cuda_ep = static_cast(this->Info().GetExecutionProvider()); TensorShapeVector x_dims_cudnn{x_dims.begin(), x_dims.end()}; - TensorShapeVector y_dims_cudnn = !post_slicing_required ? y_dims : y_dims_with_adjusted_pads; + TensorShapeVector y_dims_cudnn{y_dims.begin(), y_dims.end()}; + TensorShapeVector w_dims_cudnn{w_dims.begin(), w_dims.end()}; + + if constexpr (channels_last) { + x_dims_cudnn.insert(x_dims_cudnn.begin() + 1, *(x_dims_cudnn.end() - 1)); + y_dims_cudnn.insert(y_dims_cudnn.begin() + 1, *(y_dims_cudnn.end() - 1)); + x_dims_cudnn.erase(x_dims_cudnn.end() - 1); + y_dims_cudnn.erase(y_dims_cudnn.end() - 1); + + if (w_in_nhwc) { + w_dims_cudnn.insert(w_dims_cudnn.begin() + 1, *(w_dims_cudnn.end() - 1)); + w_dims_cudnn.erase(w_dims_cudnn.end() - 1); + } + } + if (kernel_rank < 2) { // TODO: Explore padding the provided input shape [N, C, D] to [N, C, 1, D] // especially for EXHAUSTIVE algo search which may result in a better algo selection. @@ -276,7 +377,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) if (cuda_ep->GetCudnnConv1dPadToNc1d()) { x_dims_cudnn.insert(x_dims_cudnn.begin() + 2, 1); y_dims_cudnn.insert(y_dims_cudnn.begin() + 2, 1); - w_dims.insert(w_dims.begin() + 2, 1); + w_dims_cudnn.insert(w_dims.begin() + 2, 1); pads.insert(pads.begin() + kernel_rank, 0); pads.insert(pads.begin(), 0); kernel_shape.insert(kernel_shape.begin(), 1); @@ -285,7 +386,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) } else { x_dims_cudnn.push_back(1); y_dims_cudnn.push_back(1); - w_dims.push_back(1); + w_dims_cudnn.push_back(1); pads.insert(pads.begin() + kernel_rank, 0); pads.insert(pads.end(), 0); kernel_shape.push_back(1); @@ -294,188 +395,105 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) } } - if (w_dims_changed) { - if (!channels_last) { - ORT_RETURN_IF_ERROR(s_.w_desc.Set(w_dims, CudnnTensor::GetDataType())); - } else { - ORT_RETURN_IF_ERROR(s_.w_desc.Set(CUDNN_TENSOR_NHWC, - CudnnTensor::GetDataType(), - static_cast(w_dims[0]), - static_cast(w_dims[3]), - static_cast(w_dims[1]), - static_cast(w_dims[2]))); - } - } - // We must delay returning early until here so that the weight dims have been cached properly if (s_.Y->Shape().Size() == 0) { return Status::OK(); } - if (channels_last) { - ORT_RETURN_IF_ERROR(s_.x_tensor.Set(CUDNN_TENSOR_NHWC, - CudnnTensor::GetDataType(), - static_cast(x_dims_cudnn[0]), - static_cast(x_dims_cudnn[3]), - static_cast(x_dims_cudnn[1]), - static_cast(x_dims_cudnn[2]))); - - ORT_RETURN_IF_ERROR(s_.y_tensor.Set(CUDNN_TENSOR_NHWC, - CudnnTensor::GetDataType(), - static_cast(y_dims_cudnn[0]), - static_cast(y_dims_cudnn[3]), - static_cast(y_dims_cudnn[1]), - static_cast(y_dims_cudnn[2]))); - } else { - ORT_RETURN_IF_ERROR(s_.x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType())); - ORT_RETURN_IF_ERROR(s_.y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType())); - } - - ORT_RETURN_IF_ERROR(s_.conv_desc.Set(kernel_shape.size(), pads, strides, dilations, - gsl::narrow_cast(conv_attrs_.group), - CUDNN_CROSS_CORRELATION, CudnnTensor::GetDataType(), - UseTF32())); - - if (context->InputCount() >= 3) { - const Tensor* B = context->Input(2); - const auto& b_shape = B->Shape(); - ORT_RETURN_IF_NOT(b_shape.NumDimensions() == 1, "bias should be 1D"); - TensorShapeVector b_dims(2 + kernel_shape.size(), 1); - b_dims[1] = b_shape[0]; - ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType())); - // s_.b_data = reinterpret_cast(B->Data()); - } else if (bias_expected) { - TensorShapeVector b_dims(2 + kernel_shape.size(), 1); - b_dims[1] = w_dims[0]; - auto malloc_size = b_dims[1] * sizeof(CudaT); - ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType())); - if (s_.b_zero) { - CUDA_CALL_THROW(cudaFree(s_.b_zero)); - s_.b_zero = nullptr; - } - CUDA_CALL_THROW(cudaMalloc(&s_.b_zero, malloc_size)); - CUDA_CALL_THROW(cudaMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context))); + auto handle = GetCudnnHandle(context); + + int cudnn_conv_algo = cuda_ep->GetCudnnConvAlgo(); +#if !defined(__CUDACC__) + cudnn_frontend::HeurMode_t heur_mode; + switch (cudnn_conv_algo) { + case 0: + heur_mode = cudnn_frontend::HeurMode_t::B; + break; + case 1: + heur_mode = cudnn_frontend::HeurMode_t::A; + break; + case 2: + heur_mode = cudnn_frontend::HeurMode_t::FALLBACK; + LOGS_DEFAULT(WARNING) << "OP " << CudaKernel::Node().OpType() << "(" << CudaKernel::Node().Name() + << ") running in Fallback mode. May be extremely slow."; + break; + default: + heur_mode = cudnn_frontend::HeurMode_t::A; + break; } - if (!s_.cached_benchmark_results.contains(x_dims_cudnn)) { - // set math type to tensor core before algorithm search - if constexpr (std::is_same::value) { - CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_TENSOR_OP_MATH)); - } else if constexpr (std::is_same::value) { - if (!UseTF32()) { - CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_FMA_MATH)); - } - } - - cudnnConvolutionFwdAlgoPerf_t perf; - int algo_count = 1; - int cudnn_conv_algo = cuda_ep->GetCudnnConvAlgo(); - ORT_ENFORCE(cudnn_conv_algo > -1 && cudnn_conv_algo < 3, "cudnn_conv_algo should be 0, 1 or 2, but got ", cudnn_conv_algo); - switch (cudnn_conv_algo) { - case 0: { - static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; - size_t max_ws_size = cuda_ep->GetCudnnConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetCudnnHandle(context), s_, kAllAlgos, num_algos) - : AlgoSearchWorkspaceSize; - // Use GetTransientScratchBuffer() so the workspace can be freed instead of cached. - // Because the benchmarking uses a huge amount of memory, e.g. a few GBs. - IAllocatorUniquePtr algo_search_workspace = GetTransientScratchBuffer(max_ws_size); - CUDNN_RETURN_IF_ERROR(cudnnFindConvolutionForwardAlgorithmEx( - GetCudnnHandle(context), - s_.x_tensor, - s_.x_data, - s_.w_desc, - s_.w_data, - s_.conv_desc, - s_.y_tensor, - s_.y_data, - 1, // requestedAlgoCount - &algo_count, // returnedAlgoCount - &perf, - algo_search_workspace.get(), - max_ws_size)); - break; - } - case 1: - CUDNN_RETURN_IF_ERROR(cudnnGetConvolutionForwardAlgorithm_v7( - GetCudnnHandle(context), - s_.x_tensor, - s_.w_desc, - s_.conv_desc, - s_.y_tensor, - 1, // requestedAlgoCount - &algo_count, // returnedAlgoCount - &perf)); - break; - - default: - perf.algo = kDefaultConvAlgo; - CUDNN_RETURN_IF_ERROR(GetWorkspaceSize(GetCudnnHandle(context), s_, perf.algo, &perf.memory)); - - if constexpr (std::is_same::value) { - perf.mathType = CUDNN_TENSOR_OP_MATH; - } else if (std::is_same::value && !UseTF32()) { - perf.mathType = CUDNN_FMA_MATH; - } else { - perf.mathType = CUDNN_DEFAULT_MATH; - } - } - s_.cached_benchmark_results.insert(x_dims_cudnn, {perf.algo, perf.memory, perf.mathType}); - } - const auto& perf = s_.cached_benchmark_results.at(x_dims_cudnn); - CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, perf.mathType)); - s_.algo = perf.algo; - s_.workspace_bytes = perf.memory; + const auto use_tf32 = cuda_ep->UseTF32(); + // fuse if this op is part of a FusedConv or if the EP is set to fuse ops + const auto fuse_bias = cuda_ep->IsFuseConvBias() || is_fused_node_; + const auto fuse_act = is_fused_node_; + + ORT_RETURN_IF_ERROR(CreateCudnnFeExecutionPlan(x_dims_cudnn, w_dims_cudnn, B, Z, y_dims_cudnn, handle, heur_mode, + std::vector(pads.begin(), + pads.end()), + std::vector(strides.begin(), + strides.end()), + std::vector(dilations.begin(), + dilations.end()), + bias_expected, fuse_bias, fuse_act, w_in_nhwc, use_tf32)); +#endif } else { // set Y s_.Y = context->Output(0, s_.y_dims); if (s_.Y->Shape().Size() == 0) { return Status::OK(); } - if (s_.post_slicing_required) { - s_.memory_for_cudnn_conv_results = GetScratchBuffer(TensorShape(s_.y_dims_with_adjusted_pads).Size() * s_.element_size, context->GetComputeStream()); - s_.y_data = reinterpret_cast(s_.memory_for_cudnn_conv_results.get()); - } else { - s_.y_data = reinterpret_cast(s_.Y->MutableData()); - } + s_.y_data = reinterpret_cast(s_.Y->MutableData()); } return Status::OK(); } -template -Status Conv::ComputeInternal(OpKernelContext* context) const { +template +Status Conv::ComputeInternal(OpKernelContext* context) const { std::lock_guard lock(s_.mutex); ORT_RETURN_IF_ERROR(UpdateState(context)); if (s_.Y->Shape().Size() == 0) { return Status::OK(); } - const auto alpha = Consts::One; - const auto beta = Consts::Zero; - IAllocatorUniquePtr workspace = GetWorkSpace(context->GetComputeStream()); + const auto alpha = onnxruntime::cuda::Consts::One; auto cudnn_handle = GetCudnnHandle(context); - CUDNN_RETURN_IF_ERROR(cudnnConvolutionForward(cudnn_handle, - &alpha, - s_.x_tensor, - s_.x_data, - s_.w_desc, - s_.w_data, - s_.conv_desc, - s_.algo, - workspace.get(), - s_.workspace_bytes, - &beta, - s_.y_tensor, - s_.y_data)); - if (nullptr != s_.b_data) { - CUDNN_RETURN_IF_ERROR(cudnnAddTensor(cudnn_handle, &alpha, s_.b_tensor, s_.b_data, +#if !defined(__CUDACC__) + s_.variant_pack.insert_or_assign(s_.cudnn_fe_X, const_cast(s_.x_data)); + s_.variant_pack.insert_or_assign(s_.cudnn_fe_W, const_cast(s_.w_data)); + s_.variant_pack.insert_or_assign(s_.cudnn_fe_Y, s_.y_data); + if (s_.bias_fused && s_.b_data != nullptr) { + s_.variant_pack.insert_or_assign(s_.cudnn_fe_B, const_cast(s_.b_data)); + } + if (s_.bias_fused && s_.z_data != nullptr) { + s_.variant_pack.insert_or_assign(s_.cudnn_fe_Z, const_cast(s_.z_data)); + if (Layout == LAYOUT_NCHW && s_.z_data == s_.y_data) { + // memset Z if it's required for a succesful fusion + CUDA_RETURN_IF_ERROR(cudaMemset(s_.y_data, 0, s_.Y->SizeInBytes())); + } + } + auto ws = GetWorkSpace(context->GetComputeStream()); + + CUDNN_FE_RETURN_IF_ERROR(s_.cudnn_fe_graph->execute(cudnn_handle, + s_.variant_pack, + ws.get())); + + if (!s_.bias_fused && s_.z_data != nullptr) { + CUDNN_RETURN_IF_ERROR(cudnnAddTensor(cudnn_handle, &alpha, s_.z_tensor, s_.z_data, &alpha, s_.y_tensor, s_.y_data)); } - // To deal with asymmetric padding, we may have over-padded on one or both sides of the spatial dimensions - // This may have lead to extra results that are unnecessary and hence we slice that off here - if (s_.post_slicing_required) { - ORT_RETURN_IF_ERROR(SliceOutUnwantedOutputSection(Stream(context), s_.y_data, gsl::make_span(s_.y_dims_with_adjusted_pads), - s_.Y->MutableDataRaw(), s_.y_dims.GetDims(), s_.slice_starts, - s_.slice_ends, s_.slice_axes, s_.element_size)); + if (!s_.bias_fused && s_.b_data != nullptr) { + CUDNN_RETURN_IF_ERROR(cudnnAddTensor(cudnn_handle, &alpha, s_.b_tensor, s_.b_data, + &alpha, s_.y_tensor, s_.y_data)); + + /* For the standalone bias addition graph. + s_.variant_pack_bias.insert_or_assign(s_.cudnn_fe_bias_X, s_.y_data); + s_.variant_pack_bias.insert_or_assign(s_.cudnn_fe_bias_Y, s_.y_data); + s_.variant_pack_bias.insert_or_assign(s_.cudnn_fe_B, const_cast(s_.b_data)); + CUDNN_FE_RETURN_IF_ERROR(s_.cudnn_fe_bias_graph->execute(cudnn_handle, + s_.variant_pack_bias, + GetWorkSpace(context->GetComputeStream()).get()));*/ } +#endif + return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index 3aec654224e39..7a8e9cae521c1 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -6,6 +6,12 @@ #include #include +#include +#include + +#if !defined(__CUDACC__) +#include +#endif #include "core/platform/ort_mutex.h" #include "core/providers/cuda/cuda_kernel.h" @@ -150,6 +156,24 @@ struct CudnnConvState { CudnnTensor z_tensor; const void* z_data = nullptr; CudnnConvolutionDescriptor conv_desc; + bool bias_fused = true; + bool act_fused = true; + +#if !defined(__CUDACC__) + std::unique_ptr cudnn_fe_graph; + std::unique_ptr cudnn_fe_bias_graph; + std::shared_ptr cudnn_fe_X; + std::shared_ptr cudnn_fe_W; + std::shared_ptr cudnn_fe_conv_Y; + std::shared_ptr cudnn_fe_Z; + std::shared_ptr cudnn_fe_B; + std::shared_ptr cudnn_fe_Y; + + std::optional cudnn_fe_act_attr = std::nullopt; + + std::unordered_map, void*> variant_pack; + std::unordered_map, void*> variant_pack_bias; +#endif struct PerfResultParams { decltype(AlgoPerfType().algo) algo; @@ -183,7 +207,7 @@ enum : size_t { // ONNX Conv operator uses NCHW format for input, weights and output. // NhwcConv contrib ops uses NHWC format: last dimension of input, weights and output are channels. -template +template class Conv : public CudaKernel { public: using CudaT = typename ToCudaType::MappedType; @@ -205,12 +229,32 @@ class Conv : public CudaKernel { } Status UpdateState(OpKernelContext* context, bool bias_expected = false) const; + +#if !defined(__CUDACC__) + Status CreateCudnnFeExecutionPlan(const onnxruntime::TensorShapeVector& x_dims, + const onnxruntime::TensorShapeVector& w_dims, + const Tensor* B, + const Tensor* Z, + const TensorShapeVector& y_dims, + cudnnContext* handle, + const cudnn_frontend::HeurMode_t heur_mode, + const std::vector& pads, + const std::vector& strides, + const std::vector& dilations, + const bool bias_expected, + const bool fuse_bias, + const bool fuse_act, + const bool w_in_nhwc, + const bool use_tf32) const; +#endif + ConvAttributes conv_attrs_; mutable CudnnConvState s_; constexpr static auto kDefaultConvAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - static const cudnnConvolutionFwdAlgo_t kAllAlgos[]; std::unique_ptr W_; - bool is_nhwc_domain_; // prepack is only needed for the Conv in kMSInternalNHWCDomain + bool is_nhwc_domain_; // prepack is only needed for the Conv in kMSInternalNHWCDomain + bool is_fused_node_ = false; // ensures the node is fused although the session option is not set + bool W_already_nhwc = false; // In case NHWC == true and Conv is not in kMSInternalNHWCDomain }; Status SliceOutUnwantedOutputSection(cudaStream_t stream, diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_call.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_call.h index 51a5631b930b0..2b2b726e62c79 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/cuda_call.h +++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_call.h @@ -4,16 +4,16 @@ #pragma once #include "core/common/common.h" #include "core/providers/cuda/cuda_pch.h" - namespace onnxruntime { // ----------------------------------------------------------------------- // Error handling // ----------------------------------------------------------------------- -template +template std::conditional_t CudaCall( - ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode, const char* msg, const char* file, const int line); + ERRTYPE retCode, const char* exprString, const char* libName, SUCCTYPE successCode, const char* msg, + const char* file, const int line); #define CUDA_CALL(expr) (CudaCall((expr), #expr, "CUDA", cudaSuccess, "", __FILE__, __LINE__)) #define CUBLAS_CALL(expr) (CudaCall((expr), #expr, "CUBLAS", CUBLAS_STATUS_SUCCESS, "", __FILE__, __LINE__)) diff --git a/onnxruntime/core/providers/cuda/shared_inc/cudnn_fe_call.h b/onnxruntime/core/providers/cuda/shared_inc/cudnn_fe_call.h new file mode 100644 index 0000000000000..a51d84a7efa59 --- /dev/null +++ b/onnxruntime/core/providers/cuda/shared_inc/cudnn_fe_call.h @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/common/common.h" +#include "core/providers/cuda/cuda_pch.h" +#include "core/providers/cuda/shared_inc/cuda_call.h" +#if !defined(__CUDACC__) +#include +#endif +namespace onnxruntime { + +// ----------------------------------------------------------------------- +// Error handling +// ----------------------------------------------------------------------- + +#define CUDNN_FE_CALL(expr) (CudaCall((cudnn_frontend::error_t)(expr), #expr, "CUDNN_FE", \ + cudnn_frontend::error_code_t::OK, "", __FILE__, __LINE__)) +#define CUDNN_FE_CALL_THROW(expr) (CudaCall((cudnn_frontend::error_t)(expr), #expr, "CUDNN_FE", \ + cudnn_frontend::error_code_t::OK, "", __FILE__, __LINE__)) +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h b/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h index 053c66ddcb34a..240272923a3a7 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h +++ b/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h @@ -15,9 +15,6 @@ #include "core/providers/cuda/cuda_common.h" -using namespace onnxruntime; -using namespace onnxruntime::cuda; - // Generalize library calls to be use in template functions inline cublasStatus_t cublasGemmHelper(cublasHandle_t handle, @@ -84,7 +81,7 @@ inline cublasStatus_t cublasGemmHelper(cublasHandle_t handle, half* C, int ldc, const cudaDeviceProp& prop, bool /*use_tf32*/) { - const HalfGemmOptions* half_options = HalfGemmOptions::GetInstance(); + const onnxruntime::cuda::HalfGemmOptions* half_options = onnxruntime::cuda::HalfGemmOptions::GetInstance(); onnxruntime::cuda::CublasMathModeSetter math_mode_setter(prop, handle, half_options->GetMathMode()); if (half_options->IsCompute16F()) { return cublasGemmEx(handle, @@ -127,7 +124,7 @@ inline cublasStatus_t cublasGemmHelper(cublasHandle_t handle, half* C, int ldc, const cudaDeviceProp& prop, bool /*use_tf32*/) { - const HalfGemmOptions* half_options = HalfGemmOptions::GetInstance(); + const onnxruntime::cuda::HalfGemmOptions* half_options = onnxruntime::cuda::HalfGemmOptions::GetInstance(); onnxruntime::cuda::CublasMathModeSetter math_mode_setter(prop, handle, half_options->GetMathMode()); if (half_options->IsCompute16F()) { // The alpha and beta shall have same precision as compute type. @@ -162,8 +159,8 @@ inline cublasStatus_t cublasGemmHelper(cublasHandle_t handle, #if defined(USE_CUDA) inline cublasStatus_t cublasGemmHelper( cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, - int n, int k, const BFloat16* alpha, const BFloat16* A, int lda, - const BFloat16* B, int ldb, const BFloat16* beta, BFloat16* C, int ldc, + int n, int k, const onnxruntime::BFloat16* alpha, const onnxruntime::BFloat16* A, int lda, + const onnxruntime::BFloat16* B, int ldb, const onnxruntime::BFloat16* beta, onnxruntime::BFloat16* C, int ldc, const cudaDeviceProp& /*prop*/, bool /*use_tf32*/) { float h_a = alpha->ToFloat(); float h_b = beta->ToFloat(); @@ -174,8 +171,9 @@ inline cublasStatus_t cublasGemmHelper( } #else inline cublasStatus_t cublasGemmHelper(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, - const BFloat16*, const BFloat16*, int, const BFloat16*, int, const BFloat16*, - BFloat16*, int, const cudaDeviceProp&, bool /*use_tf32*/) { + const onnxruntime::BFloat16*, const onnxruntime::BFloat16*, int, + const onnxruntime::BFloat16*, int, const onnxruntime::BFloat16*, + onnxruntime::BFloat16*, int, const cudaDeviceProp&, bool /*use_tf32*/) { return CUBLAS_STATUS_NOT_SUPPORTED; } #endif @@ -250,7 +248,7 @@ inline cublasStatus_t cublasGemmBatchedHelper(cublasHandle_t handle, int batch_count, const cudaDeviceProp& prop, bool /*use_tf32*/) { - const HalfGemmOptions* half_options = HalfGemmOptions::GetInstance(); + const onnxruntime::cuda::HalfGemmOptions* half_options = onnxruntime::cuda::HalfGemmOptions::GetInstance(); onnxruntime::cuda::CublasMathModeSetter math_mode_setter(prop, handle, half_options->GetMathMode()); if (half_options->IsCompute16F()) { return cublasGemmBatchedEx(handle, @@ -286,9 +284,9 @@ inline cublasStatus_t cublasGemmBatchedHelper(cublasHandle_t handle, #if defined(USE_CUDA) inline cublasStatus_t cublasGemmBatchedHelper( cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const BFloat16* alpha, const BFloat16* Aarray[], - int lda, const BFloat16* Barray[], int ldb, const BFloat16* beta, - BFloat16* Carray[], int ldc, int batch_count, + int m, int n, int k, const onnxruntime::BFloat16* alpha, const onnxruntime::BFloat16* Aarray[], + int lda, const onnxruntime::BFloat16* Barray[], int ldb, const onnxruntime::BFloat16* beta, + onnxruntime::BFloat16* Carray[], int ldc, int batch_count, const cudaDeviceProp& /*prop*/, bool /*use_tf32*/) { float h_a = alpha->ToFloat(); float h_b = beta->ToFloat(); @@ -300,8 +298,9 @@ inline cublasStatus_t cublasGemmBatchedHelper( } #else inline cublasStatus_t cublasGemmBatchedHelper(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, - const BFloat16*, const BFloat16*[], int, const BFloat16*[], int, - const BFloat16*, BFloat16*[], int, int, const cudaDeviceProp&, + const onnxruntime::BFloat16*, const onnxruntime::BFloat16*[], int, + const onnxruntime::BFloat16*[], int, const onnxruntime::BFloat16*, + onnxruntime::BFloat16*[], int, int, const cudaDeviceProp&, bool /*use_tf32*/) { return CUBLAS_STATUS_NOT_SUPPORTED; } @@ -314,12 +313,12 @@ inline cublasStatus_t cublasGemmStridedBatchedHelper(cublasHandle_t handle, int m, int n, int k, const float* alpha, const float* A, int lda, - long long int strideA, + int64_t strideA, const float* B, int ldb, - long long int strideB, + int64_t strideB, const float* beta, float* C, int ldc, - long long int strideC, + int64_t strideC, int batch_count, const cudaDeviceProp& prop, bool use_tf32) { @@ -349,12 +348,12 @@ inline cublasStatus_t cublasGemmStridedBatchedHelper(cublasHandle_t handle, int m, int n, int k, const double* alpha, const double* A, int lda, - long long int strideA, + int64_t strideA, const double* B, int ldb, - long long int strideB, + int64_t strideB, const double* beta, double* C, int ldc, - long long int strideC, + int64_t strideC, int batch_count, const cudaDeviceProp& /*prop*/, bool /*use_tf32*/) { @@ -376,16 +375,16 @@ inline cublasStatus_t cublasGemmStridedBatchedHelper(cublasHandle_t handle, int m, int n, int k, const __half* alpha, const __half* A, int lda, - long long int strideA, + int64_t strideA, const __half* B, int ldb, - long long int strideB, + int64_t strideB, const __half* beta, __half* C, int ldc, - long long int strideC, + int64_t strideC, int batch_count, const cudaDeviceProp& prop, bool /*use_tf32*/) { - const HalfGemmOptions* half_options = HalfGemmOptions::GetInstance(); + const onnxruntime::cuda::HalfGemmOptions* half_options = onnxruntime::cuda::HalfGemmOptions::GetInstance(); onnxruntime::cuda::CublasMathModeSetter math_mode_setter(prop, handle, half_options->GetMathMode()); if (half_options->IsCompute16F()) { return cublasGemmStridedBatchedEx(handle, @@ -425,16 +424,16 @@ inline cublasStatus_t cublasGemmStridedBatchedHelper(cublasHandle_t handle, int m, int n, int k, const float* alpha, const __half* A, int lda, - long long int strideA, + int64_t strideA, const __half* B, int ldb, - long long int strideB, + int64_t strideB, const float* beta, __half* C, int ldc, - long long int strideC, + int64_t strideC, int batch_count, const cudaDeviceProp& prop, bool /*use_tf32*/) { - const HalfGemmOptions* half_options = HalfGemmOptions::GetInstance(); + const onnxruntime::cuda::HalfGemmOptions* half_options = onnxruntime::cuda::HalfGemmOptions::GetInstance(); onnxruntime::cuda::CublasMathModeSetter math_mode_setter(prop, handle, half_options->GetMathMode()); if (half_options->IsCompute16F()) { // The alpha and beta shall have same precision as compute type. @@ -472,10 +471,10 @@ inline cublasStatus_t cublasGemmStridedBatchedHelper(cublasHandle_t handle, inline cublasStatus_t cublasGemmStridedBatchedHelper( cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const BFloat16* alpha, const BFloat16* A, int lda, - long long int strideA, const BFloat16* B, int ldb, - long long int strideB, const BFloat16* beta, BFloat16* C, int ldc, - long long int strideC, int batch_count, + const onnxruntime::BFloat16* alpha, const onnxruntime::BFloat16* A, int lda, + int64_t strideA, const onnxruntime::BFloat16* B, int ldb, + int64_t strideB, const onnxruntime::BFloat16* beta, onnxruntime::BFloat16* C, int ldc, + int64_t strideC, int batch_count, const cudaDeviceProp& /*prop*/, bool /*use_tf32*/) { float h_a = alpha->ToFloat(); float h_b = beta->ToFloat(); @@ -488,9 +487,9 @@ inline cublasStatus_t cublasGemmStridedBatchedHelper( #else inline cublasStatus_t cublasGemmStridedBatchedHelper( cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, - int, const BFloat16*, const BFloat16*, int, long long int, - const BFloat16*, int, long long int, const BFloat16*, BFloat16*, - int, long long int, int, const cudaDeviceProp&, bool /*use_tf32*/) { + int, const onnxruntime::BFloat16*, const onnxruntime::BFloat16*, int, int64_t, + const onnxruntime::BFloat16*, int, int64_t, const onnxruntime::BFloat16*, onnxruntime::BFloat16*, + int, int64_t, int, const cudaDeviceProp&, bool /*use_tf32*/) { return CUBLAS_STATUS_NOT_SUPPORTED; } #endif @@ -531,4 +530,5 @@ cublasStatus_t cublasCopyHelper( cudaStream_t stream, cublasHandle_t handle, int n, const half* x, int incx, half* y, int incy); cublasStatus_t cublasCopyHelper( - cudaStream_t stream, cublasHandle_t handle, int n, const BFloat16* x, int incx, BFloat16* y, int incy); + cudaStream_t stream, cublasHandle_t handle, int n, const onnxruntime::BFloat16* x, + int incx, onnxruntime::BFloat16* y, int incy); diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 1d21933e9cba9..a63c82b37c1a8 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1781,6 +1781,7 @@ OrtCUDAProviderOptionsV2 OrtCUDAProviderOptionsToOrtCUDAProviderOptionsV2(const cuda_options_converted.cudnn_conv_use_max_workspace = 1; cuda_options_converted.enable_cuda_graph = 0; cuda_options_converted.prefer_nhwc = 0; + cuda_options_converted.fuse_conv_bias = 0; cuda_options_converted.cudnn_conv1d_pad_to_nc1d = 0; cuda_options_converted.enable_skip_layer_norm_strict_mode = 0; cuda_options_converted.use_ep_level_unified_stream = 0; diff --git a/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc b/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc index 3f298b0a8f8e1..e780d35df08bd 100644 --- a/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc +++ b/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc @@ -30,7 +30,8 @@ void TestNhwcConvOp(const NhwcConvOpAndTestAttributes& attributes, bool use_float16, bool weight_is_initializer = false) { int min_cuda_architecture = use_float16 ? 530 : 0; - bool enable_cuda = HasCudaEnvironment(min_cuda_architecture); + // NHWC implementation doesn't handle W in NHWC layout if it's not an initializer + bool enable_cuda = HasCudaEnvironment(min_cuda_architecture) && weight_is_initializer; bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get()); bool enable_dml = (nullptr != DefaultDmlExecutionProvider().get()); diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 3e4e845440117..7ab0268c3509c 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -1993,67 +1993,22 @@ TEST_F(GraphTransformationTests, NotWhereFusion) { ASSERT_TRUE(op_to_count["Not"] == 1); // can't remove Not if it is graph output/ has consumer that's not where } -#if (defined(USE_CUDA) || defined(USE_JSEP)) && !defined(DISABLE_CONTRIB_OPS) -// Conv->Add->Relu will be transformed to FusedConv -TEST_F(GraphTransformationTests, FuseCudaConvAddRelu) { - constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/conv_add_relu.onnx"; - std::shared_ptr p_model; - ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); - Graph& graph = p_model->MainGraph(); - for (auto& node : p_model->MainGraph().Nodes()) { - node.SetExecutionProviderType(kCudaExecutionProvider); - } - std::map op_to_count = CountOpsInGraph(graph); - ASSERT_TRUE(op_to_count["Add"] == 1); - ASSERT_TRUE(op_to_count["Relu"] == 1); - onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; - ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique(), TransformerLevel::Level2)); - ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_)); - op_to_count = CountOpsInGraph(graph); - ASSERT_TRUE(op_to_count["Add"] == 0); // Add removed from graph - ASSERT_TRUE(op_to_count["Relu"] == 0); // Relu removed from graph -} - -// Currently the ConvAddRelu fusion is only backed by a float kernel for the -// the CUDA EP. - -// When we see the corresponding pattern for the fp16 data type, the fusion -// should not be triggered as there is no kernel to back the fused pattern. - -// TODO(hasesh): Limit the test to using the CUDA EP for now as the level of -// data type support in other compatible EPs is still yet to be ascertained. - -// TODO(hasesh): If at all the fp16 type is supported for the fusion, adjust/remove -// this test. -TEST_F(GraphTransformationTests, FuseCudaConvAddRelu_UnsupportedType) { - constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/conv_add_relu_fp16.onnx"; - std::shared_ptr p_model; - ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); - Graph& graph = p_model->MainGraph(); - for (auto& node : p_model->MainGraph().Nodes()) { - node.SetExecutionProviderType(kCudaExecutionProvider); - } - std::map op_to_count = CountOpsInGraph(graph); - ASSERT_EQ(op_to_count["Add"], 1); - ASSERT_EQ(op_to_count["Relu"], 1); - onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; - ASSERT_STATUS_OK(graph_transformation_mgr.Register( - std::make_unique(), TransformerLevel::Level2)); - ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_)); - op_to_count = CountOpsInGraph(graph); - ASSERT_EQ(op_to_count["Add"], 1); // Add not removed from graph (fusion not triggered) - ASSERT_EQ(op_to_count["Relu"], 1); // Relu not removed from graph (fusion not triggered) -} - +#if !defined(DISABLE_CONTRIB_OPS) // Conv->Add->Relu will be left intact since there is Identity depend on Add TEST_F(GraphTransformationTests, FuseCudaConvAddReluIdentity) { constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/conv_add_relu_identity.onnx"; std::shared_ptr p_model; ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); Graph& graph = p_model->MainGraph(); +#if defined(USE_JSEP) for (auto& node : p_model->MainGraph().Nodes()) { - node.SetExecutionProviderType(kCudaExecutionProvider); + node.SetExecutionProviderType(kJsExecutionProvider); } +#else + for (auto& node : p_model->MainGraph().Nodes()) { + node.SetExecutionProviderType(kCpuExecutionProvider); + } +#endif std::map op_to_count = CountOpsInGraph(graph); ASSERT_TRUE(op_to_count["Add"] == 1); ASSERT_TRUE(op_to_count["Relu"] == 1); @@ -2073,9 +2028,15 @@ TEST_F(GraphTransformationTests, FuseCudaConvAdd) { std::shared_ptr p_model; ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); Graph& graph = p_model->MainGraph(); +#if defined(USE_JSEP) for (auto& node : p_model->MainGraph().Nodes()) { - node.SetExecutionProviderType(kCudaExecutionProvider); + node.SetExecutionProviderType(kJsExecutionProvider); } +#else + for (auto& node : p_model->MainGraph().Nodes()) { + node.SetExecutionProviderType(kCpuExecutionProvider); + } +#endif std::map op_to_count = CountOpsInGraph(graph); ASSERT_TRUE(op_to_count["Add"] == 1); onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; @@ -2165,17 +2126,13 @@ TEST_F(GraphTransformationTests, FuseConvActivation) { std::shared_ptr p_model; ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); Graph& graph = p_model->MainGraph(); -#ifdef USE_CUDA - for (auto& node : p_model->MainGraph().Nodes()) { - node.SetExecutionProviderType(kCudaExecutionProvider); - } -#elif defined(USE_ROCM) +#if defined(USE_JSEP) for (auto& node : p_model->MainGraph().Nodes()) { - node.SetExecutionProviderType(kCudaExecutionProvider); + node.SetExecutionProviderType(kJsExecutionProvider); } -#elif defined(USE_JSEP) +#else for (auto& node : p_model->MainGraph().Nodes()) { - node.SetExecutionProviderType(kJsExecutionProvider); + node.SetExecutionProviderType(kCpuExecutionProvider); } #endif std::map op_to_count_before_fusion = CountOpsInGraph(graph); @@ -2187,14 +2144,7 @@ TEST_F(GraphTransformationTests, FuseConvActivation) { ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_)); std::map op_to_count_after_fusion = CountOpsInGraph(graph); -#if defined(USE_CUDA) || defined(USE_ROCM) - std::set cuda_rocm_supported = {"Relu"}; - if (cuda_rocm_supported.find(model.second) == cuda_rocm_supported.end()) { - ASSERT_EQ(op_to_count_before_fusion[model.second], op_to_count_after_fusion[model.second]); - } else { - ASSERT_EQ(op_to_count_after_fusion[model.second], 0); - } -#elif defined(USE_JSEP) +#if defined(USE_JSEP) std::set js_supported = {"Relu", "Clip", "Sigmoid", "Tanh", "LeakyRelu"}; if (js_supported.find(model.second) == js_supported.end()) { ASSERT_EQ(op_to_count_before_fusion[model.second], op_to_count_after_fusion[model.second]); diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index 0efa78af2795c..104b34b185791 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -25,6 +25,7 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes, const std::initializer_list& expected_output, const vector& expected_output_shape, bool weight_is_initializer = false, + optional epsilon = optional(), OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, const std::string& err_str = "", int opset = 7) { @@ -56,11 +57,13 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes, test.AddOutput("Y", expected_output_shape, expected_output); + if (epsilon.has_value()) { + test.SetOutputTolerance(*epsilon); + } + std::unordered_set excluded_providers(attributes.excluded_providers); // Disable TensorRT because weight as input is not supported excluded_providers.insert(kTensorrtExecutionProvider); - // Disable CUDA NHWC execution provider as it is currently flaky - excluded_providers.insert(kCudaNHWCExecutionProvider); // QNN SDK 2.10.0 has a bug that breaks support for dynamic bias inputs. excluded_providers.insert(kQnnExecutionProvider); @@ -189,10 +192,15 @@ TEST(ConvTest, Conv1D_Bias) { vector Y_shape = {2, 1, 4}; auto expected_vals = {0.37892162799835205f, 0.4625728130340576f, 0.4934738576412201f, 0.44801419973373413f, 0.37892162799835205f, 0.2499445676803589f, 0.31682088971138f, 0.32773756980895996f}; - TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + + // For the CUDA EP: Due to CUDNN Frontend using TF32 for FP32 operations we get a higher error than using FP32 only, + // as TF32 has a 10 bit mantissa. + float epsilon = 1.1e-5f; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, false, epsilon); // CoreML EP requires weight to be an initializer - TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true, epsilon); } // Conv47 @@ -240,7 +248,7 @@ TEST(ConvTest, Conv1D_Invalid_Input_Shape) { vector X_shape = {1, 1, 1}; vector dummy_shape = {1, 1, 2}; auto dummy_vals = {0.0f, 0.0f}; - TestConvOp(attrs, {X, dummy_vals}, {X_shape, dummy_shape}, dummy_vals, dummy_shape, false, + TestConvOp(attrs, {X, dummy_vals}, {X_shape, dummy_shape}, dummy_vals, dummy_shape, false, optional(), OpTester::ExpectResult::kExpectFailure, "Node:node1 Output:Y [ShapeInferenceError] Can't merge shape info. " "Both inferred and declared dimension have values but they differ. Inferred=0 Declared=2 Dimension=2", @@ -263,7 +271,7 @@ TEST(ConvTest, Conv2D_Invalid_Input_Shape) { vector dummy_shape = {2, 2, 1, 2}; auto dummy_vals = {-0.0f, 0.0f, -0.0f, -0.0f, -0.0f, 0.0f, -0.0f, -0.0f}; - TestConvOp(attrs, {X, dummy_vals}, {X_shape, dummy_shape}, dummy_vals, dummy_shape, false, + TestConvOp(attrs, {X, dummy_vals}, {X_shape, dummy_shape}, dummy_vals, dummy_shape, false, optional(), OpTester::ExpectResult::kExpectFailure, "Node:node1 Output:Y [ShapeInferenceError] Can't merge shape info. " "Both inferred and declared dimension have values but they differ. Inferred=1 Declared=2 Dimension=0", @@ -620,7 +628,12 @@ TEST(ConvTest, Conv3D_Bias) { -0.47542816400527954f, -0.5078460574150085f, -0.4205915927886963f, -0.5584549903869629f, -0.39770257472991943f, -0.45317384600639343f, -0.5598302483558655f, -0.2542789578437805f, -0.5359901785850525f, -0.48090484738349915f, -0.38603779673576355f, -0.4991581439971924f}; - TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + + // For the CUDA EP: Due to CUDNN Frontend using TF32 for FP32 operations we get a higher error than using FP32 only, + // as TF32 has a 10 bit mantissa. + float epsilon = 2.1e-4f; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, false, epsilon); } TEST(ConvTest, Conv2D_group) { @@ -667,7 +680,8 @@ TEST(ConvTest, ConvDimWithZero) { // not handled by ACL attrs.excluded_providers.insert(kAclExecutionProvider); - TestConvOp(attrs, {X, W}, {X_shape, W_shape}, {}, out_shape, false, OpTester::ExpectResult::kExpectSuccess, "", 10); + TestConvOp(attrs, {X, W}, {X_shape, W_shape}, {}, out_shape, false, optional(), + OpTester::ExpectResult::kExpectSuccess, "", 10); } TEST(ConvTest, Conv1D_asymmetric_padding) { From cd81c1c84c5fbeadc8e3910940b9a4a0aff4350c Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 23 Jul 2024 11:45:01 -0700 Subject: [PATCH 2/6] cudnn_frontend 1.5.2 --- cgmanifests/generated/cgmanifest.json | 2 +- cmake/deps.txt | 2 +- .../github/azure-pipelines/templates/download-deps.yml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index fe8510acc9003..c443fab916334 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -356,7 +356,7 @@ "component": { "type": "git", "git": { - "commitHash": "1b0b5eac540b7f8fd19b18f1e6b8427c95503348", + "commitHash": "98ca4e1941fe3263f128f74f10063a3ea35c7019", "repositoryUrl": "https://github.com/NVIDIA/cudnn-frontend.git" }, "comments": "cudnn_frontend" diff --git a/cmake/deps.txt b/cmake/deps.txt index 34b0dc144d556..6d1802f1558fa 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -58,4 +58,4 @@ utf8_range;https://github.com/protocolbuffers/utf8_range/archive/72c943dea2b9240 extensions;https://github.com/microsoft/onnxruntime-extensions/archive/94142d8391c9791ec71c38336436319a2d4ac7a0.zip;4365ac5140338b4cb75a39944a4be276e3829b3c composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/204da9c522cebec5220bba52cd3542ebcaf99e7a.zip;1827348efd47831c13074245274d41b7cae8a557 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e -cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.3.0.zip;dfccd5ff821cb2b2fa832bef26f39154358e6e13 \ No newline at end of file +cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.5.2.zip;11071a47594b20f00af09aad83e0d5203ccf6029 diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index bf11730c2ce28..0cc9f468c3cfa 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.167 + version: 1.0.171 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.167 + version: 1.0.171 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. From 8a7b68751c3dd8a273f7e6f1b50024c44fde7aa3 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 23 Jul 2024 20:16:41 -0700 Subject: [PATCH 3/6] fix build --- onnxruntime/core/providers/cuda/nn/conv.cc | 11 ++++++----- setup.py | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index 1e216d6ee7a6d..1e442be65e788 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -219,8 +219,8 @@ Status Conv::CreateCudnnFeExecutionPlan(const onnxruntime::TensorShap CUDNN_FE_CALL_THROW(s_.cudnn_fe_graph->build_operation_graph(handle)); CUDNN_FE_CALL_THROW(s_.cudnn_fe_graph->create_execution_plans({heur_mode})); } catch (const std::exception& ex) { - LOGS_DEFAULT(ERROR) << "Failed to initialize CUDNN Frontend."; - return Status(common::StatusCategory::ONNXRUNTIME, common::StatusCode::EP_FAIL, ex.what()); + std::string message = MakeString("Failed to initialize CUDNN Frontend", ex.what()); + return Status(common::StatusCategory::ONNXRUNTIME, common::StatusCode::EP_FAIL, message); } if (!use_tf32) s_.cudnn_fe_graph->deselect_numeric_notes({cudnn_frontend::NumericalNote_t::TENSOR_CORE}); @@ -230,10 +230,11 @@ Status Conv::CreateCudnnFeExecutionPlan(const onnxruntime::TensorShap CUDNN_FE_CALL_THROW(s_.cudnn_fe_graph->build_plans(handle)); } catch (const std::exception& ex) { if (!fuse_bias && !fuse_act && use_tf32) { - return Status(common::StatusCategory::ONNXRUNTIME, - common::StatusCode::EP_FAIL, "OP not supported by CUDNN Frontend."); - ; + std::string message = MakeString("OP not supported by CUDNN Frontend", ex.what()); + return Status(common::StatusCategory::ONNXRUNTIME, common::StatusCode::EP_FAIL, message); } + + // Try fallback. return CreateCudnnFeExecutionPlan(x_dims, w_dims, B, Z, y_dims, handle, heur_mode, pads, strides, dilations, bias_expected, false, false, w_in_nhwc, true); } diff --git a/setup.py b/setup.py index 51feedcfd3286..1fa297e22acd9 100644 --- a/setup.py +++ b/setup.py @@ -196,6 +196,7 @@ def run(self): to_preload_cann = [] cuda_dependencies = [ + "libcuda.so.1", "libcublas.so.11", "libcublas.so.12", "libcublasLt.so.11", From 77387f5f1d122b5fb80ee7429967096cb8d79a5a Mon Sep 17 00:00:00 2001 From: Maximilian Mueller Date: Fri, 26 Jul 2024 00:23:40 +0200 Subject: [PATCH 4/6] extend logging and propagate device_id --- onnxruntime/core/providers/cuda/nn/conv.cc | 6 ++++-- onnxruntime/test/onnx/main.cc | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index 1e442be65e788..71743bd75faa4 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -219,7 +219,8 @@ Status Conv::CreateCudnnFeExecutionPlan(const onnxruntime::TensorShap CUDNN_FE_CALL_THROW(s_.cudnn_fe_graph->build_operation_graph(handle)); CUDNN_FE_CALL_THROW(s_.cudnn_fe_graph->create_execution_plans({heur_mode})); } catch (const std::exception& ex) { - std::string message = MakeString("Failed to initialize CUDNN Frontend", ex.what()); + std::string message = MakeString("Failed to initialize CUDNN Frontend", ex.what(), + "with the cudnn frontend json:\n", s_.cudnn_fe_graph->print()); return Status(common::StatusCategory::ONNXRUNTIME, common::StatusCode::EP_FAIL, message); } @@ -230,7 +231,8 @@ Status Conv::CreateCudnnFeExecutionPlan(const onnxruntime::TensorShap CUDNN_FE_CALL_THROW(s_.cudnn_fe_graph->build_plans(handle)); } catch (const std::exception& ex) { if (!fuse_bias && !fuse_act && use_tf32) { - std::string message = MakeString("OP not supported by CUDNN Frontend", ex.what()); + std::string message = MakeString("OP not supported by CUDNN Frontend", ex.what(), + "with the cudnn frontend json:\n", s_.cudnn_fe_graph->print()); return Status(common::StatusCategory::ONNXRUNTIME, common::StatusCode::EP_FAIL, message); } diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 9886d98dcc6d6..343027b42c894 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -438,6 +438,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) { if (enable_cuda) { #ifdef USE_CUDA OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.device_id = device_id; cuda_options.do_copy_in_default_stream = true; cuda_options.use_tf32 = false; // TODO: Support arena configuration for users of test runner From dfe75b80c3fb0a87e987a4a8111b91d03f003895 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= Date: Wed, 31 Jul 2024 13:16:48 +0200 Subject: [PATCH 5/6] introduce CMake cuDNN finder and make cudnnn frontend a cuDNN 9+ feature --- cmake/CMakeLists.txt | 4 +- cmake/external/cuDNN.cmake | 109 ++++ .../external/onnxruntime_external_deps.cmake | 10 +- cmake/onnxruntime_framework.cmake | 2 +- cmake/onnxruntime_providers_cuda.cmake | 17 +- cmake/onnxruntime_providers_tensorrt.cmake | 5 +- cmake/onnxruntime_python.cmake | 3 - cmake/onnxruntime_session.cmake | 4 +- cmake/onnxruntime_training.cmake | 7 - cmake/onnxruntime_unittests.cmake | 6 +- onnxruntime/core/providers/cuda/nn/conv.cc | 6 + onnxruntime/core/providers/cuda/nn/conv.h | 2 +- onnxruntime/core/providers/cuda/nn/conv_8.h | 484 ++++++++++++++++++ 13 files changed, 618 insertions(+), 41 deletions(-) create mode 100644 cmake/external/cuDNN.cmake create mode 100644 onnxruntime/core/providers/cuda/nn/conv_8.h diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 5555fa692eae8..b4d5af1357129 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -729,9 +729,6 @@ set(ORT_PROVIDER_FLAGS) set(ORT_PROVIDER_CMAKE_FLAGS) if (onnxruntime_USE_CUDA) - if (onnxruntime_USE_CUDA_NHWC_OPS) - add_compile_definitions(ENABLE_CUDA_NHWC_OPS) - endif() enable_language(CUDA) message( STATUS "CMAKE_CUDA_COMPILER_VERSION: ${CMAKE_CUDA_COMPILER_VERSION}") @@ -1447,6 +1444,7 @@ if (onnxruntime_USE_CUDA) find_package(CUDAToolkit REQUIRED) if(onnxruntime_CUDNN_HOME) file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME) + set(CUDNN_PATH ${onnxruntime_CUDNN_HOME}) endif() if (NOT CMAKE_CUDA_ARCHITECTURES) if (CMAKE_LIBRARY_ARCHITECTURE STREQUAL "aarch64-linux-gnu") diff --git a/cmake/external/cuDNN.cmake b/cmake/external/cuDNN.cmake new file mode 100644 index 0000000000000..f6640ca9429ba --- /dev/null +++ b/cmake/external/cuDNN.cmake @@ -0,0 +1,109 @@ +add_library(CUDNN::cudnn_all INTERFACE IMPORTED) + +find_path( + CUDNN_INCLUDE_DIR cudnn.h + HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_INCLUDE_DIRS} + PATH_SUFFIXES include + REQUIRED +) + +file(READ "${CUDNN_INCLUDE_DIR}/cudnn_version.h" cudnn_version_header) +string(REGEX MATCH "#define CUDNN_MAJOR [1-9]+" macrodef "${cudnn_version_header}") +string(REGEX MATCH "[1-9]+" CUDNN_MAJOR_VERSION "${macrodef}") + +function(find_cudnn_library NAME) + find_library( + ${NAME}_LIBRARY ${NAME} "lib${NAME}.so.${CUDNN_MAJOR_VERSION}" + HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_LIBRARY_DIR} + PATH_SUFFIXES lib64 lib/x64 lib + REQUIRED + ) + + if(${NAME}_LIBRARY) + add_library(CUDNN::${NAME} UNKNOWN IMPORTED) + set_target_properties( + CUDNN::${NAME} PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_DIR} + IMPORTED_LOCATION ${${NAME}_LIBRARY} + ) + message(STATUS "${NAME} found at ${${NAME}_LIBRARY}.") + else() + message(STATUS "${NAME} not found.") + endif() + + +endfunction() + +find_cudnn_library(cudnn) + +include (FindPackageHandleStandardArgs) +find_package_handle_standard_args( + LIBRARY REQUIRED_VARS + CUDNN_INCLUDE_DIR cudnn_LIBRARY +) + +if(CUDNN_INCLUDE_DIR AND cudnn_LIBRARY) + + message(STATUS "cuDNN: ${cudnn_LIBRARY}") + message(STATUS "cuDNN: ${CUDNN_INCLUDE_DIR}") + + set(CUDNN_FOUND ON CACHE INTERNAL "cuDNN Library Found") + +else() + + set(CUDNN_FOUND OFF CACHE INTERNAL "cuDNN Library Not Found") + +endif() + +target_include_directories( + CUDNN::cudnn_all + INTERFACE + $ + $ +) + +target_link_libraries( + CUDNN::cudnn_all + INTERFACE + CUDNN::cudnn +) + +if(CUDNN_MAJOR_VERSION EQUAL 8) + find_cudnn_library(cudnn_adv_infer) + find_cudnn_library(cudnn_adv_train) + find_cudnn_library(cudnn_cnn_infer) + find_cudnn_library(cudnn_cnn_train) + find_cudnn_library(cudnn_ops_infer) + find_cudnn_library(cudnn_ops_train) + + target_link_libraries( + CUDNN::cudnn_all + INTERFACE + CUDNN::cudnn_adv_train + CUDNN::cudnn_ops_train + CUDNN::cudnn_cnn_train + CUDNN::cudnn_adv_infer + CUDNN::cudnn_cnn_infer + CUDNN::cudnn_ops_infer + ) +elseif(CUDNN_MAJOR_VERSION EQUAL 9) + find_cudnn_library(cudnn_cnn) + find_cudnn_library(cudnn_adv) + find_cudnn_library(cudnn_graph) + find_cudnn_library(cudnn_ops) + find_cudnn_library(cudnn_engines_runtime_compiled) + find_cudnn_library(cudnn_engines_precompiled) + find_cudnn_library(cudnn_heuristic) + + target_link_libraries( + CUDNN::cudnn_all + INTERFACE + CUDNN::cudnn_adv + CUDNN::cudnn_ops + CUDNN::cudnn_cnn + CUDNN::cudnn_graph + CUDNN::cudnn_engines_runtime_compiled + CUDNN::cudnn_engines_precompiled + CUDNN::cudnn_heuristic + ) +endif() diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 14e6ed515fd6e..1a692e98c51df 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -595,15 +595,7 @@ set(onnxruntime_LINK_DIRS ) if (onnxruntime_USE_CUDA) #TODO: combine onnxruntime_CUDNN_HOME and onnxruntime_CUDA_HOME, assume they are the same find_package(CUDAToolkit REQUIRED) - if (WIN32) - if(onnxruntime_CUDNN_HOME) - list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib/x64) - endif() - else() - if(onnxruntime_CUDNN_HOME) - list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib64) - endif() - endif() + include(cuDNN) endif() if(onnxruntime_USE_SNPE) diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake index 43d16abd8fbae..99a1e0c8b946f 100644 --- a/cmake/onnxruntime_framework.cmake +++ b/cmake/onnxruntime_framework.cmake @@ -69,7 +69,7 @@ endif() if(onnxruntime_USE_TENSORRT OR onnxruntime_USE_NCCL) # TODO: for now, core framework depends on CUDA. It should be moved to TensorRT EP # TODO: provider_bridge_ort.cc should not include nccl.h -target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${onnxruntime_CUDNN_HOME}/include PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) else() target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) endif() diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 0f87331ecded9..af92214de4f9e 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -11,7 +11,7 @@ ) # Remove pch files list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs - "${ONNXRUNTIME_ROOT}/core/providers/cuda/integer_gemm.cc" + "${ONNXRUNTIME_ROOT}/core/providrs/cuda/integer_gemm.cc" "${ONNXRUNTIME_ROOT}/core/providers/cuda/triton_kernel.h" ) else() @@ -197,13 +197,16 @@ target_compile_definitions(${target} PRIVATE USE_CUDA_MINIMAL) target_link_libraries(${target} PRIVATE ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface CUDA::cudart) else() - include(cudnn_frontend) - target_link_libraries(${target} PRIVATE CUDA::cublasLt CUDA::cublas cudnn cudnn_frontend CUDA::curand CUDA::cufft CUDA::cudart - ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) - if(onnxruntime_CUDNN_HOME) - target_include_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/include) - target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib) + include(cudnn_frontend) # also defines CUDNN::* + if (onnxruntime_USE_CUDA_NHWC_OPS) + if(CUDNN_MAJOR_VERSION GREATER 8) + add_compile_definitions(ENABLE_CUDA_NHWC_OPS) + else() + message( WARNING "To compile with NHWC ops enabled please compile against cuDNN 9 or newer." ) + endif() endif() + target_link_libraries(${target} PRIVATE CUDA::cublasLt CUDA::cublas CUDNN::cudnn_all cudnn_frontend CUDA::curand CUDA::cufft CUDA::cudart + ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) endif() if (onnxruntime_USE_TRITON_KERNEL) diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake index 3d46c139feea9..468aaa44ec4ee 100644 --- a/cmake/onnxruntime_providers_tensorrt.cmake +++ b/cmake/onnxruntime_providers_tensorrt.cmake @@ -159,7 +159,7 @@ if(onnxruntime_CUDA_MINIMAL) set(trt_link_libs ${CMAKE_DL_LIBS} ${TENSORRT_LIBRARY}) else() - set(trt_link_libs cudnn cublas ${CMAKE_DL_LIBS} ${TENSORRT_LIBRARY}) + set(trt_link_libs CUDNN::cudnn_all cublas ${CMAKE_DL_LIBS} ${TENSORRT_LIBRARY}) endif() file(GLOB_RECURSE onnxruntime_providers_tensorrt_cc_srcs CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/core/providers/tensorrt/*.h" @@ -183,9 +183,6 @@ endif() target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) - if(onnxruntime_CUDNN_HOME) - target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${onnxruntime_CUDNN_HOME}/include) - endif() # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found set_target_properties(onnxruntime_providers_tensorrt PROPERTIES LINKER_LANGUAGE CUDA) diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 07c65e7986b05..afa2ee14e6b42 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -97,9 +97,6 @@ endif() onnxruntime_add_include_to_target(onnxruntime_pybind11_state Python::Module Python::NumPy) target_include_directories(onnxruntime_pybind11_state PRIVATE ${ONNXRUNTIME_ROOT} ${pybind11_INCLUDE_DIRS}) -if(onnxruntime_USE_CUDA AND onnxruntime_CUDNN_HOME) - target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CUDNN_HOME}/include) -endif() if(onnxruntime_USE_CANN) target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CANN_HOME}/include) endif() diff --git a/cmake/onnxruntime_session.cmake b/cmake/onnxruntime_session.cmake index 79bee3bdb65d6..b51c875951135 100644 --- a/cmake/onnxruntime_session.cmake +++ b/cmake/onnxruntime_session.cmake @@ -44,9 +44,7 @@ if (onnxruntime_USE_EXTENSIONS) endif() add_dependencies(onnxruntime_session ${onnxruntime_EXTERNAL_DEPENDENCIES}) set_target_properties(onnxruntime_session PROPERTIES FOLDER "ONNXRuntime") -if (onnxruntime_USE_CUDA) - target_include_directories(onnxruntime_session PRIVATE ${onnxruntime_CUDNN_HOME}/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -endif() + if (onnxruntime_USE_ROCM) target_compile_options(onnxruntime_session PRIVATE -Wno-sign-compare -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1) target_include_directories(onnxruntime_session PRIVATE ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hipcub/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include) diff --git a/cmake/onnxruntime_training.cmake b/cmake/onnxruntime_training.cmake index 01590a431205c..b633a9c2de378 100644 --- a/cmake/onnxruntime_training.cmake +++ b/cmake/onnxruntime_training.cmake @@ -39,10 +39,6 @@ endif() target_include_directories(onnxruntime_training PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} PUBLIC ${onnxruntime_graph_header} ${MPI_CXX_INCLUDE_DIRS}) -if (onnxruntime_USE_CUDA) - target_include_directories(onnxruntime_training PRIVATE ${onnxruntime_CUDNN_HOME}/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -endif() - if (onnxruntime_USE_NCCL) target_include_directories(onnxruntime_training PRIVATE ${NCCL_INCLUDE_DIRS}) endif() @@ -81,9 +77,6 @@ if (onnxruntime_BUILD_UNIT_TESTS) target_include_directories(onnxruntime_training_runner PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT} ${ORTTRAINING_ROOT} ${eigen_INCLUDE_DIRS} PUBLIC ${onnxruntime_graph_header}) target_link_libraries(onnxruntime_training_runner PRIVATE nlohmann_json::nlohmann_json) - if (onnxruntime_USE_CUDA) - target_include_directories(onnxruntime_training_runner PUBLIC ${onnxruntime_CUDNN_HOME}/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - endif() if (onnxruntime_USE_NCCL) target_include_directories(onnxruntime_training_runner PRIVATE ${NCCL_INCLUDE_DIRS}) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 53d7428257754..89771cc906b85 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -75,7 +75,7 @@ function(AddTest) onnxruntime_add_include_to_target(${_UT_TARGET} date::date flatbuffers::flatbuffers) target_include_directories(${_UT_TARGET} PRIVATE ${TEST_INC_DIR}) if (onnxruntime_USE_CUDA) - target_include_directories(${_UT_TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${onnxruntime_CUDNN_HOME}/include) + target_include_directories(${_UT_TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) if (onnxruntime_USE_NCCL) target_include_directories(${_UT_TARGET} PRIVATE ${NCCL_INCLUDE_DIRS}) endif() @@ -392,7 +392,7 @@ if (onnxruntime_USE_CUDA AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_R ) list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_src}) - if (onnxruntime_USE_CUDA_NHWC_OPS) + if (onnxruntime_USE_CUDA_NHWC_OPS AND CUDNN_MAJOR_VERSION GREATER 8) file(GLOB onnxruntime_test_providers_cuda_nhwc_src CONFIGURE_DEPENDS "${TEST_SRC_DIR}/providers/cuda/nhwc/*.cc" ) @@ -1495,7 +1495,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") list(APPEND custom_op_src_patterns "${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/cuda_ops.cu" "${TEST_SRC_DIR}/testdata/custom_op_library/cuda/cuda_ops.*") - list(APPEND custom_op_lib_include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${onnxruntime_CUDNN_HOME}/include) + list(APPEND custom_op_lib_include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) if (HAS_QSPECTRE) list(APPEND custom_op_lib_option "$<$:SHELL:--compiler-options /Qspectre>") endif() diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index 71743bd75faa4..95ba698b707ac 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -13,6 +13,10 @@ #include "core/providers/cuda/tensor/transpose.h" #include "core/providers/cuda/shared_inc/cudnn_fe_call.h" +#if CUDNN_MAJOR < 9 +// if compiled with cuDNN 8 we want to use the legacy cuDNN API +#include "conv_8.h" +#endif namespace onnxruntime { namespace cuda { @@ -91,6 +95,7 @@ Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr return Status::OK(); } +#if CUDNN_MAJOR >= 9 #if !defined(__CUDACC__) template @@ -557,6 +562,7 @@ Status CudnnConvolutionDescriptor::Set( return Status::OK(); } +#endif #ifndef DISABLE_CONTRIB_OPS // template instantiation for NhwcConv diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index 7a8e9cae521c1..484d66081018b 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -230,7 +230,7 @@ class Conv : public CudaKernel { Status UpdateState(OpKernelContext* context, bool bias_expected = false) const; -#if !defined(__CUDACC__) +#if !defined(__CUDACC__) && CUDNN_MAJOR >= 9 Status CreateCudnnFeExecutionPlan(const onnxruntime::TensorShapeVector& x_dims, const onnxruntime::TensorShapeVector& w_dims, const Tensor* B, diff --git a/onnxruntime/core/providers/cuda/nn/conv_8.h b/onnxruntime/core/providers/cuda/nn/conv_8.h new file mode 100644 index 0000000000000..10239d09041fe --- /dev/null +++ b/onnxruntime/core/providers/cuda/nn/conv_8.h @@ -0,0 +1,484 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. + +#include "core/common/status.h" +#include "core/providers/cuda/nn/conv.h" +#include "core/common/span_utils.h" +#include "core/providers/cuda/cuda_common.h" +#include "core/providers/cuda/tensor/transpose.h" +#include "core/providers/cuda/shared_inc/fpgeneric.h" +#include "core/providers/cuda/shared_inc/cudnn_fe_call.h" +#include "core/providers/cuda/tensor/slice.h" + +namespace onnxruntime { +namespace cuda { + +static const cudnnConvolutionFwdAlgo_t kAllAlgos[] = { + CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_FFT, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, +}; + +static cudnnStatus_t GetWorkspaceSize(cudnnHandle_t handle, const CudnnConvState& s, cudnnConvolutionFwdAlgo_t algo, size_t* sz) { + return cudnnGetConvolutionForwardWorkspaceSize(handle, s.x_tensor, s.w_desc, s.conv_desc, s.y_tensor, algo, sz); +} + +size_t GetMaxWorkspaceSize(cudnnHandle_t handle, const CudnnConvState& s, + const cudnnConvolutionFwdAlgo_t* algo, int n_algo) { + // TODO: get maximum available size from memory arena + size_t free, total; + CUDA_CALL_THROW(cudaMemGetInfo(&free, &total)); + // Assuming 10% of fragmentation + free = static_cast(static_cast(free) * 0.9); + size_t max_ws_size = 0; + for (int i = 0; i < n_algo; i++) { + cudnnStatus_t err; + size_t sz; + err = GetWorkspaceSize(handle, s, algo[i], &sz); + if (CUDNN_STATUS_SUCCESS != err || sz == 0 || sz < max_ws_size || sz > free) continue; + max_ws_size = sz; + } + return max_ws_size; +} + +Status SliceOutUnwantedOutputSection(cudaStream_t stream, + const void* input_data, gsl::span input_dims, + void* output_data, + const gsl::span& output_dims, + const gsl::span& starts, + const gsl::span& ends, + const gsl::span& axes, + size_t element_size) { + SliceOp::PrepareForComputeMetadata compute_metadata(input_dims); + + ORT_THROW_IF_ERROR(SliceBase::PrepareForCompute(starts, ends, axes, compute_metadata)); + + // As a sanity check, ensure that the slice operator's output shape matches with the expected output shape + ORT_ENFORCE(SpanEq(gsl::make_span(compute_metadata.output_dims_), output_dims)); + + return SliceCuda::Impl(stream, input_data, input_dims, output_data, compute_metadata, element_size); +} + +template +Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const { + // set X + const Tensor* X = context->Input(0); + const TensorShape& x_shape = X->Shape(); + const auto x_dims = x_shape.AsShapeVector(); + s_.x_data = reinterpret_cast(X->Data()); + s_.element_size = X->DataType()->Size(); + bool w_in_nhwc; + const Tensor* W; + if (!W_) { + W = context->Input(1); + w_in_nhwc = W_already_nhwc; + // Dims and memory layout are in NCHW format + } else { + W = W_.get(); + w_in_nhwc = true; + // W got prepacked, therfore if NHWC == true, then dims and memory layout are in NHWC + } + const TensorShape& w_shape = W->Shape(); + auto w_dims = w_shape.AsShapeVector(); + s_.w_data = reinterpret_cast(W->Data()); + + // Make sure input and weight are 4D for NHWC since we set 4D descriptor for NHWC. + constexpr bool channels_last = NHWC; + if constexpr (channels_last) { + if (x_shape.NumDimensions() != 4 || w_shape.NumDimensions() != 4) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Number of dimensions of X and W should be 4 for channels_last format (NHWC)"); + } + } + + // set B + if (context->InputCount() >= 3) { + const Tensor* B = context->Input(2); + s_.b_data = reinterpret_cast(B->Data()); + } else { + s_.b_data = nullptr; + } + // set Z + if (context->InputCount() >= 4) { + const Tensor* Z = context->Input(3); + ORT_RETURN_IF_ERROR(s_.z_tensor.Set(Z->Shape().GetDims(), CudnnTensor::GetDataType())); + s_.z_data = reinterpret_cast(Z->Data()); + } else { + s_.z_data = nullptr; + } + bool input_dims_changed = (s_.last_x_dims != x_dims); + bool w_dims_changed = (s_.last_w_dims != w_dims); + if (input_dims_changed || w_dims_changed) { + if (input_dims_changed) + s_.last_x_dims = gsl::make_span(x_dims); + + if (w_dims_changed) { + s_.last_w_dims = gsl::make_span(w_dims); + s_.cached_benchmark_results.clear(); + } + + ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, w_in_nhwc)); + + TensorShapeVector kernel_shape; + ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape, w_in_nhwc)); + + const size_t kernel_rank = kernel_shape.size(); + + ConvPadVector pads(conv_attrs_.pads); + if (pads.empty()) { + pads.resize(kernel_rank * 2, 0); + } + TensorShapeVector dilations(conv_attrs_.dilations); + if (dilations.empty()) { + dilations.resize(kernel_rank, 1); + } + TensorShapeVector strides(conv_attrs_.strides); + if (strides.empty()) { + strides.resize(kernel_rank, 1); + } + + TensorShapeVector y_dims; + y_dims.reserve(2 + kernel_rank); // add 2 to account for 'N' and 'C' + + const int64_t N = X->Shape()[0]; + const int64_t M = W->Shape()[0]; + if (channels_last) { + y_dims.push_back(N); + } else { + y_dims.insert(y_dims.begin(), {N, M}); + } + + bool post_slicing_required = false; + TensorShapeVector slice_starts; + slice_starts.reserve(kernel_rank); + + TensorShapeVector slice_ends; + slice_ends.reserve(kernel_rank); + + TensorShapeVector slice_axes; + slice_axes.reserve(kernel_rank); + + constexpr size_t spatial_dim_start = channels_last ? 1 : 2; + const size_t spatial_dim_end = spatial_dim_start + kernel_rank; + TensorShape spatial_shape = X->Shape().Slice(spatial_dim_start, spatial_dim_end); + + TensorShapeVector y_dims_with_adjusted_pads(y_dims); + ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShapeWithAdjustedPads(spatial_shape, kernel_shape, + strides, dilations, pads, y_dims, y_dims_with_adjusted_pads, + post_slicing_required, slice_starts, slice_ends, slice_axes, + channels_last)); + if (channels_last) { + y_dims.push_back(M); + y_dims_with_adjusted_pads.push_back(M); + } + + ORT_ENFORCE(y_dims.size() == y_dims_with_adjusted_pads.size()); + s_.y_dims = gsl::make_span(y_dims); + s_.y_dims_with_adjusted_pads = y_dims_with_adjusted_pads; + s_.post_slicing_required = post_slicing_required; + s_.slice_starts = slice_starts; + s_.slice_ends = slice_ends; + s_.slice_axes = slice_axes; + + s_.Y = context->Output(0, TensorShape(s_.y_dims)); + if (post_slicing_required) { + // Post slicing needed. Create and fill in the Conv results in an intermediate buffer. + s_.memory_for_cudnn_conv_results = GetScratchBuffer(TensorShape(y_dims_with_adjusted_pads).Size() * s_.element_size, context->GetComputeStream()); + s_.y_data = reinterpret_cast(s_.memory_for_cudnn_conv_results.get()); + } else { + // No post slicing needed. Fill the output tensor's buffer directly. + s_.y_data = reinterpret_cast(s_.Y->MutableData()); + } + + const CUDAExecutionProvider* cuda_ep = + static_cast(this->Info().GetExecutionProvider()); + + TensorShapeVector x_dims_cudnn{x_dims.begin(), x_dims.end()}; + TensorShapeVector y_dims_cudnn = !post_slicing_required ? y_dims : y_dims_with_adjusted_pads; + if (kernel_rank < 2) { + // TODO: Explore padding the provided input shape [N, C, D] to [N, C, 1, D] + // especially for EXHAUSTIVE algo search which may result in a better algo selection. + // ORTModule uses different algo search options (HEURISTIC, and use max workspace size) compared to + // inference build (EXHAUSTIVE, 32M workspace size). We observed better perf when we pad input shape + // [N,C,D] to [N,C,1,D], especially on A100, and especially for ConvGrad. + // PyTorch also pads to [N,C,1,D]. For inference build, we still pad it to [N, C, D, 1] as this seems + // to be the sweet spot for all algo search options: EXHAUSTIVE, HEURISTIC, and DEFAULT. + // See PR #7348 and #7702 for more context. + if (cuda_ep->GetCudnnConv1dPadToNc1d()) { + x_dims_cudnn.insert(x_dims_cudnn.begin() + 2, 1); + y_dims_cudnn.insert(y_dims_cudnn.begin() + 2, 1); + w_dims.insert(w_dims.begin() + 2, 1); + pads.insert(pads.begin() + kernel_rank, 0); + pads.insert(pads.begin(), 0); + kernel_shape.insert(kernel_shape.begin(), 1); + strides.insert(strides.begin(), 1); + dilations.insert(dilations.begin(), 1); + } else { + x_dims_cudnn.push_back(1); + y_dims_cudnn.push_back(1); + w_dims.push_back(1); + pads.insert(pads.begin() + kernel_rank, 0); + pads.insert(pads.end(), 0); + kernel_shape.push_back(1); + strides.push_back(1); + dilations.push_back(1); + } + } + + if (w_dims_changed) { + if (!channels_last) { + ORT_RETURN_IF_ERROR(s_.w_desc.Set(w_dims, CudnnTensor::GetDataType())); + } else if (w_in_nhwc) { + ORT_RETURN_IF_ERROR(s_.w_desc.Set(CUDNN_TENSOR_NHWC, + CudnnTensor::GetDataType(), + static_cast(w_dims[0]), + static_cast(w_dims[3]), + static_cast(w_dims[1]), + static_cast(w_dims[2]))); + } else { + ORT_RETURN_IF_ERROR(s_.w_desc.Set(CUDNN_TENSOR_NHWC, + CudnnTensor::GetDataType(), + static_cast(w_dims[0]), + static_cast(w_dims[1]), + static_cast(w_dims[2]), + static_cast(w_dims[3]))); + } + } + + // We must delay returning early until here so that the weight dims have been cached properly + if (s_.Y->Shape().Size() == 0) { + return Status::OK(); + } + + if (channels_last) { + ORT_RETURN_IF_ERROR(s_.x_tensor.Set(CUDNN_TENSOR_NHWC, + CudnnTensor::GetDataType(), + static_cast(x_dims_cudnn[0]), + static_cast(x_dims_cudnn[3]), + static_cast(x_dims_cudnn[1]), + static_cast(x_dims_cudnn[2]))); + + ORT_RETURN_IF_ERROR(s_.y_tensor.Set(CUDNN_TENSOR_NHWC, + CudnnTensor::GetDataType(), + static_cast(y_dims_cudnn[0]), + static_cast(y_dims_cudnn[3]), + static_cast(y_dims_cudnn[1]), + static_cast(y_dims_cudnn[2]))); + } else { + ORT_RETURN_IF_ERROR(s_.x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(s_.y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType())); + } + + ORT_RETURN_IF_ERROR(s_.conv_desc.Set(kernel_shape.size(), pads, strides, dilations, + gsl::narrow_cast(conv_attrs_.group), + CUDNN_CROSS_CORRELATION, CudnnTensor::GetDataType(), + UseTF32())); + + if (context->InputCount() >= 3) { + const Tensor* B = context->Input(2); + const auto& b_shape = B->Shape(); + ORT_RETURN_IF_NOT(b_shape.NumDimensions() == 1, "bias should be 1D"); + TensorShapeVector b_dims(2 + kernel_shape.size(), 1); + b_dims[1] = b_shape[0]; + ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType())); + // s_.b_data = reinterpret_cast(B->Data()); + } else if (bias_expected) { + TensorShapeVector b_dims(2 + kernel_shape.size(), 1); + b_dims[1] = w_dims[0]; + auto malloc_size = b_dims[1] * sizeof(CudaT); + ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType())); + if (s_.b_zero) { + CUDA_CALL_THROW(cudaFree(s_.b_zero)); + s_.b_zero = nullptr; + } + CUDA_CALL_THROW(cudaMalloc(&s_.b_zero, malloc_size)); + CUDA_CALL_THROW(cudaMemsetAsync(s_.b_zero, 0, malloc_size, Stream(context))); + } + + if (!s_.cached_benchmark_results.contains(x_dims_cudnn)) { + // set math type to tensor core before algorithm search + if constexpr (std::is_same::value) { + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_TENSOR_OP_MATH)); + } else if constexpr (std::is_same::value) { + if (!UseTF32()) { + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_FMA_MATH)); + } + } + + cudnnConvolutionFwdAlgoPerf_t perf; + int algo_count = 1; + int cudnn_conv_algo = cuda_ep->GetCudnnConvAlgo(); + ORT_ENFORCE(cudnn_conv_algo > -1 && cudnn_conv_algo < 3, "cudnn_conv_algo should be 0, 1 or 2, but got ", cudnn_conv_algo); + switch (cudnn_conv_algo) { + case 0: { + static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; + size_t max_ws_size = cuda_ep->GetCudnnConvUseMaxWorkspace() ? GetMaxWorkspaceSize(GetCudnnHandle(context), s_, kAllAlgos, num_algos) + : AlgoSearchWorkspaceSize; + // Use GetTransientScratchBuffer() so the workspace can be freed instead of cached. + // Because the benchmarking uses a huge amount of memory, e.g. a few GBs. + IAllocatorUniquePtr algo_search_workspace = GetTransientScratchBuffer(max_ws_size); + CUDNN_RETURN_IF_ERROR(cudnnFindConvolutionForwardAlgorithmEx( + GetCudnnHandle(context), + s_.x_tensor, + s_.x_data, + s_.w_desc, + s_.w_data, + s_.conv_desc, + s_.y_tensor, + s_.y_data, + 1, // requestedAlgoCount + &algo_count, // returnedAlgoCount + &perf, + algo_search_workspace.get(), + max_ws_size)); + break; + } + case 1: + CUDNN_RETURN_IF_ERROR(cudnnGetConvolutionForwardAlgorithm_v7( + GetCudnnHandle(context), + s_.x_tensor, + s_.w_desc, + s_.conv_desc, + s_.y_tensor, + 1, // requestedAlgoCount + &algo_count, // returnedAlgoCount + &perf)); + break; + + default: + perf.algo = kDefaultConvAlgo; + CUDNN_RETURN_IF_ERROR(GetWorkspaceSize(GetCudnnHandle(context), s_, perf.algo, &perf.memory)); + + if constexpr (std::is_same::value) { + perf.mathType = CUDNN_TENSOR_OP_MATH; + } else if (std::is_same::value && !UseTF32()) { + perf.mathType = CUDNN_FMA_MATH; + } else { + perf.mathType = CUDNN_DEFAULT_MATH; + } + } + s_.cached_benchmark_results.insert(x_dims_cudnn, {perf.algo, perf.memory, perf.mathType}); + } + const auto& perf = s_.cached_benchmark_results.at(x_dims_cudnn); + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, perf.mathType)); + s_.algo = perf.algo; + s_.workspace_bytes = perf.memory; + } else { + // set Y + s_.Y = context->Output(0, s_.y_dims); + if (s_.Y->Shape().Size() == 0) { + return Status::OK(); + } + if (s_.post_slicing_required) { + s_.memory_for_cudnn_conv_results = GetScratchBuffer(TensorShape(s_.y_dims_with_adjusted_pads).Size() * s_.element_size, context->GetComputeStream()); + s_.y_data = reinterpret_cast(s_.memory_for_cudnn_conv_results.get()); + } else { + s_.y_data = reinterpret_cast(s_.Y->MutableData()); + } + } + return Status::OK(); +} + +template +Status Conv::ComputeInternal(OpKernelContext* context) const { + std::lock_guard lock(s_.mutex); + ORT_RETURN_IF_ERROR(UpdateState(context)); + if (s_.Y->Shape().Size() == 0) { + return Status::OK(); + } + const auto alpha = Consts::One; + const auto beta = Consts::Zero; + IAllocatorUniquePtr workspace = GetWorkSpace(context->GetComputeStream()); + auto cudnn_handle = GetCudnnHandle(context); + CUDNN_RETURN_IF_ERROR(cudnnConvolutionForward(cudnn_handle, + &alpha, + s_.x_tensor, + s_.x_data, + s_.w_desc, + s_.w_data, + s_.conv_desc, + s_.algo, + workspace.get(), + s_.workspace_bytes, + &beta, + s_.y_tensor, + s_.y_data)); + if (nullptr != s_.b_data) { + CUDNN_RETURN_IF_ERROR(cudnnAddTensor(cudnn_handle, &alpha, s_.b_tensor, s_.b_data, + &alpha, s_.y_tensor, s_.y_data)); + } + // To deal with asymmetric padding, we may have over-padded on one or both sides of the spatial dimensions + // This may have lead to extra results that are unnecessary and hence we slice that off here + if (s_.post_slicing_required) { + ORT_RETURN_IF_ERROR(SliceOutUnwantedOutputSection(Stream(context), s_.y_data, gsl::make_span(s_.y_dims_with_adjusted_pads), + s_.Y->MutableDataRaw(), s_.y_dims.GetDims(), s_.slice_starts, + s_.slice_ends, s_.slice_axes, s_.element_size)); + } + return Status::OK(); +} + +CudnnConvolutionDescriptor::CudnnConvolutionDescriptor() : desc_(nullptr) { +} + +CudnnConvolutionDescriptor::~CudnnConvolutionDescriptor() { + if (desc_ != nullptr) { + cudnnDestroyConvolutionDescriptor(desc_); + desc_ = nullptr; + } +} + +Status CudnnConvolutionDescriptor::Set( + size_t rank, + const gsl::span& pads, + const gsl::span& strides, + const gsl::span& dilations, + int groups, + cudnnConvolutionMode_t mode, + cudnnDataType_t data_type, + bool use_tf32) { + if (!desc_) + CUDNN_RETURN_IF_ERROR(cudnnCreateConvolutionDescriptor(&desc_)); + + InlinedVector pad_dims(rank); + InlinedVector stride_dims(rank); + InlinedVector dilation_dims(rank); + for (size_t i = 0; i < rank; i++) { + pad_dims[i] = gsl::narrow_cast(pads[i]); + stride_dims[i] = gsl::narrow_cast(strides[i]); + dilation_dims[i] = gsl::narrow_cast(dilations[i]); + } + + // This piece of code is copied from /pytorch/aten/src/ATen/cudnn/Descriptors.h + // Setting math_type to CUDNN_DATA_FLOAT for half input + cudnnDataType_t math_type = data_type; + if (data_type == CUDNN_DATA_HALF) math_type = CUDNN_DATA_FLOAT; + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionNdDescriptor( + desc_, + gsl::narrow_cast(rank), + pad_dims.data(), + stride_dims.data(), + dilation_dims.data(), + mode, + math_type)); + + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionGroupCount(desc_, groups)); + + // Copied from /pytorch/aten/src/ATen/cudnn/Descriptors.h + // See Note [behavior of cudnnFind and cudnnGet] at /pytorch/aten/src/ATen/native/cudnn/Conv_v7.cpp + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(desc_, CUDNN_DEFAULT_MATH)); + if (data_type == CUDNN_DATA_HALF) { + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(desc_, CUDNN_TENSOR_OP_MATH)); + } else if (data_type == CUDNN_DATA_FLOAT && !use_tf32) { + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(desc_, CUDNN_FMA_MATH)); + } + + return Status::OK(); +} +} // namespace cuda +} // namespace onnxruntime \ No newline at end of file From 6ab8399328f2b78d8b114cbefc57077ce9929c2e Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 1 Aug 2024 08:39:57 +0000 Subject: [PATCH 6/6] adjust cmake --- cmake/CMakeLists.txt | 4 ---- cmake/external/cuDNN.cmake | 8 +++++--- cmake/external/onnxruntime_external_deps.cmake | 8 ++++++-- cmake/onnxruntime_providers_cuda.cmake | 2 +- cmake/onnxruntime_python.cmake | 6 +----- cmake/onnxruntime_unittests.cmake | 4 ++-- 6 files changed, 15 insertions(+), 17 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index b4d5af1357129..2e9a50e522171 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1442,10 +1442,6 @@ if (onnxruntime_USE_CUDA) file(TO_CMAKE_PATH CUDAToolkit_ROOT ${onnxruntime_CUDA_HOME}) endif() find_package(CUDAToolkit REQUIRED) - if(onnxruntime_CUDNN_HOME) - file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME) - set(CUDNN_PATH ${onnxruntime_CUDNN_HOME}) - endif() if (NOT CMAKE_CUDA_ARCHITECTURES) if (CMAKE_LIBRARY_ARCHITECTURE STREQUAL "aarch64-linux-gnu") # Support for Jetson/Tegra ARM devices diff --git a/cmake/external/cuDNN.cmake b/cmake/external/cuDNN.cmake index f6640ca9429ba..3d05f6406a80e 100644 --- a/cmake/external/cuDNN.cmake +++ b/cmake/external/cuDNN.cmake @@ -18,7 +18,7 @@ function(find_cudnn_library NAME) PATH_SUFFIXES lib64 lib/x64 lib REQUIRED ) - + if(${NAME}_LIBRARY) add_library(CUDNN::${NAME} UNKNOWN IMPORTED) set_target_properties( @@ -46,7 +46,7 @@ if(CUDNN_INCLUDE_DIR AND cudnn_LIBRARY) message(STATUS "cuDNN: ${cudnn_LIBRARY}") message(STATUS "cuDNN: ${CUDNN_INCLUDE_DIR}") - + set(CUDNN_FOUND ON CACHE INTERNAL "cuDNN Library Found") else() @@ -65,7 +65,7 @@ target_include_directories( target_link_libraries( CUDNN::cudnn_all INTERFACE - CUDNN::cudnn + CUDNN::cudnn ) if(CUDNN_MAJOR_VERSION EQUAL 8) @@ -107,3 +107,5 @@ elseif(CUDNN_MAJOR_VERSION EQUAL 9) CUDNN::cudnn_heuristic ) endif() + +mark_as_advanced(CUDNN_INCLUDE_DIR) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index e45f10e5aaee4..4e52707474052 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -587,11 +587,15 @@ endif() message("Finished fetching external dependencies") - set(onnxruntime_LINK_DIRS ) + if (onnxruntime_USE_CUDA) - #TODO: combine onnxruntime_CUDNN_HOME and onnxruntime_CUDA_HOME, assume they are the same find_package(CUDAToolkit REQUIRED) + + if(onnxruntime_CUDNN_HOME) + file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME) + set(CUDNN_PATH ${onnxruntime_CUDNN_HOME}) + endif() include(cuDNN) endif() diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index af92214de4f9e..44bc416b6fc67 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -11,7 +11,7 @@ ) # Remove pch files list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs - "${ONNXRUNTIME_ROOT}/core/providrs/cuda/integer_gemm.cc" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/integer_gemm.cc" "${ONNXRUNTIME_ROOT}/core/providers/cuda/triton_kernel.h" ) else() diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 270139ceaff7b..372db15b108fb 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -98,11 +98,7 @@ endif() onnxruntime_add_include_to_target(onnxruntime_pybind11_state Python::Module Python::NumPy) target_include_directories(onnxruntime_pybind11_state PRIVATE ${ONNXRUNTIME_ROOT} ${pybind11_INCLUDE_DIRS}) if(onnxruntime_USE_CUDA) - target_include_directories(onnxruntime_pybind11_state PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - # cudnn_home is optional for Window when cuda and cudnn are installed in the same directory. - if(onnxruntime_CUDNN_HOME) - target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CUDNN_HOME}/include) - endif() + target_include_directories(onnxruntime_pybind11_state PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUDNN_INCLUDE_DIR}) endif() if(onnxruntime_USE_CANN) target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CANN_HOME}/include) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 0d87ffe288565..ec396fbcc6117 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -75,7 +75,7 @@ function(AddTest) onnxruntime_add_include_to_target(${_UT_TARGET} date::date flatbuffers::flatbuffers) target_include_directories(${_UT_TARGET} PRIVATE ${TEST_INC_DIR}) if (onnxruntime_USE_CUDA) - target_include_directories(${_UT_TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_include_directories(${_UT_TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUDNN_INCLUDE_DIR}) if (onnxruntime_USE_NCCL) target_include_directories(${_UT_TARGET} PRIVATE ${NCCL_INCLUDE_DIRS}) endif() @@ -1498,7 +1498,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") list(APPEND custom_op_src_patterns "${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/cuda_ops.cu" "${TEST_SRC_DIR}/testdata/custom_op_library/cuda/cuda_ops.*") - list(APPEND custom_op_lib_include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + list(APPEND custom_op_lib_include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUDNN_INCLUDE_DIR}) if (HAS_QSPECTRE) list(APPEND custom_op_lib_option "$<$:SHELL:--compiler-options /Qspectre>") endif()