diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index 265144f926f3d..d0bb2321edf0a 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -53,6 +53,8 @@ class CUDAExecutionProvider : public IExecutionProvider { } cudaStream_t ComputeStream() { + // this will return the CUDA EP level stream which can differ from the actual compute tasks stream + // the compute task stream is supplied within OpKernelContext during inference return stream_; } diff --git a/onnxruntime/core/providers/cuda/cuda_kernel.h b/onnxruntime/core/providers/cuda/cuda_kernel.h index 385219152db9b..f8b92eface52f 100644 --- a/onnxruntime/core/providers/cuda/cuda_kernel.h +++ b/onnxruntime/core/providers/cuda/cuda_kernel.h @@ -171,6 +171,8 @@ class CudaKernel : public OpKernel { } inline cudaStream_t DefaultCudaStream() const { + // this will return the CUDA EP level stream which can differ from the actual compute tasks stream + // the compute task stream is supplied within OpKernelContext during inference return provider_->ComputeStream(); } diff --git a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc index 953bb47de09ab..15c3f3f579f76 100644 --- a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc +++ b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc @@ -15,83 +15,145 @@ namespace cuda { // When adding new supported NHWC operations make sure to also integrate them into: ConvertNodeLayout // in onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 8, float, BatchNormalization); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 8, MLFloat16, BatchNormalization); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 9, 13, float, BatchNormalization); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 9, 13, MLFloat16, BatchNormalization); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, Conv); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, MLFloat16, Conv); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, ConvTranspose); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, MLFloat16, ConvTranspose); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 9, float, AveragePool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 9, MLFloat16, AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 8, float, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 8, MLFloat16, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 9, 13, float, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 9, 13, MLFloat16, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, + Conv); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, MLFloat16, + Conv); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, + ConvTranspose); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, MLFloat16, + ConvTranspose); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 9, float, + AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 9, MLFloat16, + AveragePool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, float, GlobalAveragePool); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, MLFloat16, GlobalAveragePool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 7, float, MaxPool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 7, MLFloat16, MaxPool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 8, 9, float, MaxPool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 8, 9, MLFloat16, MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, MLFloat16, + GlobalAveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 7, float, + MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 7, MLFloat16, + MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 8, 9, float, + MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 8, 9, MLFloat16, + MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, float, GlobalMaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, MLFloat16, GlobalMaxPool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, float, AveragePool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, MLFloat16, AveragePool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, float, MaxPool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, MLFloat16, MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, float, + AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, MLFloat16, + AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, float, + MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, MLFloat16, + MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, float, Conv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, MLFloat16, Conv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, float, ConvTranspose); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, MLFloat16, ConvTranspose); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, MLFloat16, + ConvTranspose); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, float, AveragePool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, MLFloat16, AveragePool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, 11, float, MaxPool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, 11, MLFloat16, MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, 11, float, + MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, 11, MLFloat16, + MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, float, MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, MLFloat16, MaxPool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, float, BatchNormalization); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, MLFloat16, BatchNormalization); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 15, float, BatchNormalization); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 15, MLFloat16, BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, float, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, MLFloat16, + BatchNormalization); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 15, float, + BatchNormalization); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 15, MLFloat16, + BatchNormalization); Status RegisterCudaNhwcKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn nhwc_function_table[] = { BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, }; for (auto& function_table_entry : nhwc_function_table) { diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index e0cdd77f6502d..e103f2e74863d 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -207,7 +207,7 @@ class Conv : public CudaKernel { constexpr static auto kDefaultConvAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; static const cudnnConvolutionFwdAlgo_t kAllAlgos[]; std::unique_ptr W_; - bool is_nhwc_domain_; // prepack is only needed for the Conv in kMSInternalNHWCDomain + bool is_nhwc_domain_; // prepack is only needed for the Conv in kMSInternalNHWCDomain }; Status SliceOutUnwantedOutputSection(cudaStream_t stream, diff --git a/onnxruntime/core/providers/cuda/nn/pool.cc b/onnxruntime/core/providers/cuda/nn/pool.cc index eb8b634d1c6e9..8bc96958693bc 100644 --- a/onnxruntime/core/providers/cuda/nn/pool.cc +++ b/onnxruntime/core/providers/cuda/nn/pool.cc @@ -14,50 +14,31 @@ namespace cuda { #define POOLING_KERNEL(op_name, data_type, pool_type, since_version, op_domain, nhwc) \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ - op_name, \ - op_domain, \ - since_version, \ - data_type, \ - kCudaExecutionProvider, \ + op_name, op_domain, since_version, data_type, kCudaExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Pool); #define POOLING_KERNEL_VERSIONED(op_name, data_type, pool_type, since_version, end_version, op_domain, nhwc) \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - op_name, \ - op_domain, \ - since_version, \ - end_version, \ - data_type, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + op_name, op_domain, since_version, end_version, data_type, kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Pool); -#define POOLING_KERNEL_WITH_INDICES(op_name, data_type, pool_type, since_version, op_domain, nhwc) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - op_name, \ - op_domain, \ - since_version, \ - data_type, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ - Pool); - -#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, data_type, pool_type, since_version, end_version, op_domain, nhwc) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - op_name, \ - op_domain, \ - since_version, \ - end_version, \ - data_type, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ - Pool); +#define POOLING_KERNEL_WITH_INDICES(op_name, data_type, pool_type, since_version, op_domain, nhwc) \ + ONNX_OPERATOR_TYPED_KERNEL_EX(op_name, op_domain, since_version, data_type, kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ + Pool); + +#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, data_type, pool_type, since_version, end_version, op_domain, \ + nhwc) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(op_name, op_domain, since_version, end_version, data_type, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ + Pool); POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9, kOnnxDomain, false) POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 7, 9, kOnnxDomain, false) @@ -123,8 +104,7 @@ POOLING_KERNEL(GlobalAveragePool, MLFloat16, AveragePool, 1, kMSInternalNHWCDoma class CudnnPoolingDescriptor final { public: - CudnnPoolingDescriptor() : desc_(nullptr) { - } + CudnnPoolingDescriptor() : desc_(nullptr) {} ~CudnnPoolingDescriptor() { if (desc_ != nullptr) { @@ -136,12 +116,9 @@ class CudnnPoolingDescriptor final { CudnnPoolingDescriptor(const CudnnPoolingDescriptor&) = delete; CudnnPoolingDescriptor& operator=(const CudnnPoolingDescriptor&) = delete; - Status Set(cudnnPoolingMode_t mode, - const gsl::span& kernel_shape, - const gsl::span& pads, - const gsl::span& strides) { - if (!desc_) - CUDNN_RETURN_IF_ERROR(cudnnCreatePoolingDescriptor(&desc_)); + Status Set(cudnnPoolingMode_t mode, const gsl::span& kernel_shape, + const gsl::span& pads, const gsl::span& strides) { + if (!desc_) CUDNN_RETURN_IF_ERROR(cudnnCreatePoolingDescriptor(&desc_)); int rank = gsl::narrow_cast(kernel_shape.size()); InlinedVector window(rank); @@ -156,14 +133,8 @@ class CudnnPoolingDescriptor final { for (int i = 0; i < rank; i++) { stride[i] = gsl::narrow_cast(strides[i]); } - CUDNN_RETURN_IF_ERROR(SetPoolingNdDescriptorHelper( - desc_, - mode, - CUDNN_PROPAGATE_NAN, - rank, - window.data(), - padding.data(), - stride.data())); + CUDNN_RETURN_IF_ERROR(SetPoolingNdDescriptorHelper(desc_, mode, CUDNN_PROPAGATE_NAN, rank, window.data(), + padding.data(), stride.data())); return Status::OK(); } @@ -199,8 +170,7 @@ Status Pool::ComputeInternal(OpKernelContext* context) const TensorShape y_shape(y_dims); Tensor* Y = context->Output(0, y_shape); // special case when there is a dim value of 0 in the shape. - if (y_shape.Size() == 0) - return Status::OK(); + if (y_shape.Size() == 0) return Status::OK(); auto x_data = reinterpret_cast(X->Data()); auto y_data = reinterpret_cast(Y->MutableData()); @@ -247,7 +217,8 @@ Status Pool::ComputeInternal(OpKernelContext* context) const IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count, context->GetComputeStream()); auto temp_Y = GetScratchBuffer(output_count, context->GetComputeStream()); Impl_Cast(Stream(context), reinterpret_cast(x_data), temp_X.get(), input_count); - CUDNN_RETURN_IF_ERROR(PoolingForwardHelper(GetCudnnHandle(context), pooling_desc, &alpha, x_tensor, temp_X.get(), &beta, y_tensor, temp_Y.get())); + CUDNN_RETURN_IF_ERROR(PoolingForwardHelper(GetCudnnHandle(context), pooling_desc, &alpha, x_tensor, temp_X.get(), + &beta, y_tensor, temp_Y.get())); Impl_Cast(Stream(context), temp_Y.get(), y_data, output_count); } else { const auto alpha = Consts::One; @@ -257,7 +228,8 @@ Status Pool::ComputeInternal(OpKernelContext* context) const ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); - CUDNN_RETURN_IF_ERROR(PoolingForwardHelper(GetCudnnHandle(context), pooling_desc, &alpha, x_tensor, x_data, &beta, y_tensor, y_data)); + CUDNN_RETURN_IF_ERROR( + PoolingForwardHelper(GetCudnnHandle(context), pooling_desc, &alpha, x_tensor, x_data, &beta, y_tensor, y_data)); } return Status::OK(); @@ -288,8 +260,7 @@ Status Pool, NHWC>::ComputeInternal(OpKernelContext* context) cons Tensor* Y = context->Output(0, TensorShape(y_dims)); // special case when there is a dim value of 0 in the shape. - if (Y->Shape().Size() == 0) - return Status::OK(); + if (Y->Shape().Size() == 0) return Status::OK(); auto x_data = reinterpret_cast(X->Data()); auto y_data = reinterpret_cast(Y->MutableData()); @@ -297,18 +268,8 @@ Status Pool, NHWC>::ComputeInternal(OpKernelContext* context) cons Tensor* I = context->Output(1, TensorShape(y_dims)); if (nullptr != I || !this->pool_attrs_.default_dilations) { auto i_data = nullptr == I ? nullptr : I->MutableData(); - MaxPoolWithIndex( - this->Stream(context), - x_shape, - TensorShape(y_dims), - kernel_shape, - strides, - pads, - this->pool_attrs_.dilations, - this->pool_attrs_.storage_order, - x_data, - y_data, - i_data); + MaxPoolWithIndex(this->Stream(context), x_shape, TensorShape(y_dims), kernel_shape, strides, pads, + this->pool_attrs_.dilations, this->pool_attrs_.storage_order, x_data, y_data, i_data); } else { ORT_RETURN_IF_ERROR((Pool, NHWC>::ComputeInternal(context))); } diff --git a/onnxruntime/test/providers/compare_provider_test_utils.h b/onnxruntime/test/providers/compare_provider_test_utils.h index 235ae3b92f49d..155016d7e69a2 100644 --- a/onnxruntime/test/providers/compare_provider_test_utils.h +++ b/onnxruntime/test/providers/compare_provider_test_utils.h @@ -3,6 +3,11 @@ #pragma once +#include +#include +#include +#include + #include "core/graph/constants.h" #include "test/common/tensor_op_test_utils.h" #include "test/providers/provider_test_utils.h" diff --git a/onnxruntime/test/providers/cuda/nhwc/conv_test.cc b/onnxruntime/test/providers/cuda/nhwc/conv_test.cc index b72cb7b1f463c..be0082f95feb8 100644 --- a/onnxruntime/test/providers/cuda/nhwc/conv_test.cc +++ b/onnxruntime/test/providers/cuda/nhwc/conv_test.cc @@ -2,7 +2,7 @@ // Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. -#include "nhwc_cuda_helper.h" +#include "test/providers/cuda/nhwc/nhwc_cuda_helper.h" namespace onnxruntime { namespace test { @@ -41,8 +41,7 @@ struct ConvOp { test->AddAttribute("pads", padding); std::vector output_dims = { - input_dims[0], - channels, + input_dims[0], channels, ComputeOutputShape(input_dims[2], strides[0], kernel_shape[0], dilations[0], padding[0], padding[1]), ComputeOutputShape(input_dims[3], strides[1], kernel_shape[1], dilations[1], padding[2], padding[3])}; std::vector output_data = FillZeros(output_dims); @@ -53,31 +52,20 @@ struct ConvOp { }; TYPED_TEST(CudaNhwcTypedTest, ConvNhwcBias) { - auto op = ConvOp{ - .input_dims = {1, 16, 64, 64}, - .kernel_shape = {3, 3}, - .channels = 16, - .bias = true}; + auto op = ConvOp{.input_dims = {1, 16, 64, 64}, .kernel_shape = {3, 3}, .channels = 16, .bias = true}; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } TYPED_TEST(CudaNhwcTypedTest, ConvNhwcGroupNoBias) { - auto op = ConvOp{ - .input_dims = {1, 16, 64, 64}, - .kernel_shape = {3, 3}, - .channels = 16, - .group = 4}; + auto op = ConvOp{.input_dims = {1, 16, 64, 64}, .kernel_shape = {3, 3}, .channels = 16, .group = 4}; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } TYPED_TEST(CudaNhwcTypedTest, ConvNhwcPadding) { - auto op = ConvOp{ - .input_dims = {2, 4, 64, 64}, - .kernel_shape = {3, 3}, - .channels = 4, - .padding = {4, 4, 4, 4}}; + auto op = + ConvOp{.input_dims = {2, 4, 64, 64}, .kernel_shape = {3, 3}, .channels = 4, .padding = {4, 4, 4, 4}}; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } diff --git a/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc b/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc index 15c761269b805..d45323190c514 100644 --- a/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc +++ b/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc @@ -2,7 +2,7 @@ // Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. -#include "nhwc_cuda_helper.h" +#include "test/providers/cuda/nhwc/nhwc_cuda_helper.h" namespace onnxruntime { namespace test { @@ -45,8 +45,7 @@ struct ConvTransposeOp { } std::vector output_dims = { - input_dims[0], - channels, + input_dims[0], channels, (kernel_shape[1] - 1) * dilations[1] + (input_dims[2] - 1) * strides[1] - (padding[1] + padding[0]) + 1, (kernel_shape[0] - 1) * dilations[0] + (input_dims[3] - 1) * strides[0] - (padding[3] + padding[2]) + 1}; std::vector output_data = FillZeros(output_dims); @@ -57,43 +56,35 @@ struct ConvTransposeOp { }; TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcGroupNoBias) { - auto op = ConvTransposeOp{ - .input_dims = {8, 8, 32, 32}, - .kernel_shape = {3, 3}, - .channels = 16, - .group = 4}; + auto op = + ConvTransposeOp{.input_dims = {8, 8, 32, 32}, .kernel_shape = {3, 3}, .channels = 16, .group = 4}; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcBias) { - auto op = ConvTransposeOp{ - .input_dims = {1, 8, 80, 80}, - .kernel_shape = {5, 5}, - .channels = 16, - .bias = true}; + auto op = + ConvTransposeOp{.input_dims = {1, 8, 80, 80}, .kernel_shape = {5, 5}, .channels = 16, .bias = true}; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcPad) { - auto op = ConvTransposeOp{ - .input_dims = {1, 16, 8, 8}, - .kernel_shape = {3, 3}, - .channels = 32, - .padding = {2, 2, 2, 2}, - .output_padding = {}}; + auto op = ConvTransposeOp{.input_dims = {1, 16, 8, 8}, + .kernel_shape = {3, 3}, + .channels = 32, + .padding = {2, 2, 2, 2}, + .output_padding = {}}; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcOutPad) { - auto op = ConvTransposeOp{ - .input_dims = {1, 32, 8, 8}, - .kernel_shape = {3, 3}, - .channels = 32, - .strides = {2, 2}, - .output_padding = {1, 1, 1, 1}}; + auto op = ConvTransposeOp{.input_dims = {1, 32, 8, 8}, + .kernel_shape = {3, 3}, + .channels = 32, + .strides = {2, 2}, + .output_padding = {1, 1, 1, 1}}; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } diff --git a/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h b/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h index 6261e6eaab183..677bc1591b57f 100644 --- a/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h +++ b/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h @@ -2,6 +2,7 @@ // Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. +#include #include "core/providers/cuda/cuda_provider_options.h" #include "core/providers/common.h" @@ -12,13 +13,11 @@ #define MAKE_PROVIDERS_EPS(eps) \ std::vector> execution_providers; \ - OrtCUDAProviderOptionsV2 nhwc = { \ - .prefer_nhwc = true}; \ + OrtCUDAProviderOptionsV2 nhwc = {.prefer_nhwc = true}; \ execution_providers.push_back(CudaExecutionProviderWithOptions(&nhwc)); \ \ double error_tolerance = eps; \ - OrtCUDAProviderOptionsV2 nchw = { \ - .prefer_nhwc = false}; \ + OrtCUDAProviderOptionsV2 nchw = {.prefer_nhwc = false}; \ auto source_ep = CudaExecutionProviderWithOptions(&nchw); \ auto test = op.get_test(); \ test->CompareEPs(std::move(source_ep), execution_providers, error_tolerance); @@ -37,8 +36,7 @@ namespace onnxruntime { namespace test { template -class CudaNhwcTypedTest : public ::testing::Test { -}; +class CudaNhwcTypedTest : public ::testing::Test {}; using CudaNhwcTestTypes = ::testing::Types; // double, TYPED_TEST_SUITE(CudaNhwcTypedTest, CudaNhwcTestTypes); diff --git a/onnxruntime/test/providers/cuda/nhwc/norm_test.cc b/onnxruntime/test/providers/cuda/nhwc/norm_test.cc index 66f2a965c4202..52da8ba557c2d 100644 --- a/onnxruntime/test/providers/cuda/nhwc/norm_test.cc +++ b/onnxruntime/test/providers/cuda/nhwc/norm_test.cc @@ -2,7 +2,7 @@ // Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. -#include "nhwc_cuda_helper.h" +#include "test/providers/cuda/nhwc/nhwc_cuda_helper.h" namespace onnxruntime { namespace test { diff --git a/onnxruntime/test/providers/cuda/nhwc/pool_test.cc b/onnxruntime/test/providers/cuda/nhwc/pool_test.cc index a2fc5ec5cfe2a..3d1f81e6bc282 100644 --- a/onnxruntime/test/providers/cuda/nhwc/pool_test.cc +++ b/onnxruntime/test/providers/cuda/nhwc/pool_test.cc @@ -2,7 +2,7 @@ // Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. -#include "nhwc_cuda_helper.h" +#include "test/providers/cuda/nhwc/nhwc_cuda_helper.h" namespace onnxruntime { namespace test { @@ -30,8 +30,7 @@ struct PoolOp { test->AddAttribute("pads", padding); std::vector output_dims = { - input_dims[0], - channels, + input_dims[0], channels, (kernel_shape[1] - 1) + (input_dims[2] - 1) * strides[1] - (padding[1] + padding[0]) + 1, (kernel_shape[0] - 1) + (input_dims[3] - 1) * strides[0] - (padding[3] + padding[2]) + 1}; std::vector output_data = FillZeros(output_dims); @@ -73,24 +72,21 @@ TYPED_TEST(CudaNhwcTypedTest, GlobalMaxPoolNhwc) { test->AddOutput("Y", output_dims, output_data); std::vector> execution_providers; - OrtCUDAProviderOptionsV2 nhwc = { - .prefer_nhwc = true}; + OrtCUDAProviderOptionsV2 nhwc = {.prefer_nhwc = true}; execution_providers.push_back(CudaExecutionProviderWithOptions(&nhwc)); double error_tolerance = 1e-3; - OrtCUDAProviderOptionsV2 nchw = { - .prefer_nhwc = false}; + OrtCUDAProviderOptionsV2 nchw = {.prefer_nhwc = false}; auto source_ep = CudaExecutionProviderWithOptions(&nchw); test->CompareEPs(std::move(source_ep), execution_providers, error_tolerance); } TYPED_TEST(CudaNhwcTypedTest, AveragePoolNhwcPad) { - auto op = PoolOp{ - .pooling_type = "AveragePool", - .input_dims = {1, 16, 64, 64}, - .kernel_shape = {3, 3}, - .channels = 16, - .padding = {2, 2, 2, 2}}; + auto op = PoolOp{.pooling_type = "AveragePool", + .input_dims = {1, 16, 64, 64}, + .kernel_shape = {3, 3}, + .channels = 16, + .padding = {2, 2, 2, 2}}; MAKE_PROVIDERS() }