diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake
index af95d0203544c..fe3e577b4fc36 100644
--- a/cmake/onnxruntime_rocm_hipify.cmake
+++ b/cmake/onnxruntime_rocm_hipify.cmake
@@ -168,6 +168,8 @@ set(provider_excluded_files
   "gpu_data_transfer.h"
   "integer_gemm.cc"
   "tunable/*"
+  "cuda_nhwc_kernels.cc"
+  "cuda_nhwc_kernels.h"
 )
 
 set(training_ops_excluded_files
diff --git a/onnxruntime/core/providers/rocm/miopen_common.cc b/onnxruntime/core/providers/rocm/miopen_common.cc
index e77bbab2d5750..6b01f02ae49b5 100644
--- a/onnxruntime/core/providers/rocm/miopen_common.cc
+++ b/onnxruntime/core/providers/rocm/miopen_common.cc
@@ -73,7 +73,12 @@ Status MiopenTensor::CreateTensorIfNeeded() {
   return Status::OK();
 }
 
-Status MiopenTensor::Set(gsl::span<const int64_t> input_dims, miopenDataType_t dataType) {
+Status MiopenTensor::Set(gsl::span<const int64_t> input_dims, miopenDataType_t dataType, bool is_nhwc) {
+  if (is_nhwc) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED,
+                           "NHWC Tensor usage is not supported in AMD builds for now");
+  }
+
   ORT_RETURN_IF_ERROR(CreateTensorIfNeeded());
 
   int rank = gsl::narrow_cast<int>(input_dims.size());
diff --git a/onnxruntime/core/providers/rocm/miopen_common.h b/onnxruntime/core/providers/rocm/miopen_common.h
index 7d9cabb0951c6..eb4eb745b3692 100644
--- a/onnxruntime/core/providers/rocm/miopen_common.h
+++ b/onnxruntime/core/providers/rocm/miopen_common.h
@@ -33,7 +33,7 @@ class MiopenTensor final {
   ~MiopenTensor();
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(MiopenTensor);
 
-  Status Set(gsl::span<const int64_t> input_dims, miopenDataType_t dataType);
+  Status Set(gsl::span<const int64_t> input_dims, miopenDataType_t dataType, bool is_nhwc = false);
   Status Set(miopenDataType_t dataType, miopenTensorLayout_t tensor_layout, int n, int c, int h, int w);
   Status Set(const MiopenTensor& x_desc, miopenBatchNormMode_t mode);
 
diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
index 475d26d2e306d..7447113fdf847 100644
--- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc
@@ -16,7 +16,7 @@ namespace rocm {
       T,                                                                        \
       kRocmExecutionProvider,                                                   \
       (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
-      ConvTranspose<T>);                                                        \
+      ConvTranspose<T, false>);                                                 \
   ONNX_OPERATOR_TYPED_KERNEL_EX(                                                \
       ConvTranspose,                                                            \
       kOnnxDomain,                                                              \
@@ -24,20 +24,20 @@ namespace rocm {
       T,                                                                        \
       kRocmExecutionProvider,                                                   \
       (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
-      ConvTranspose<T>);
+      ConvTranspose<T, false>);
 
 REGISTER_KERNEL_TYPED(float)
 // not yet supported in MIOpen
 // REGISTER_KERNEL_TYPED(double)
 REGISTER_KERNEL_TYPED(MLFloat16)
 
-template <typename T>
-Status ConvTranspose<T>::ComputeInternal(OpKernelContext* context) const {
+template <typename T, bool NHWC>
+Status ConvTranspose<T, NHWC>::ComputeInternal(OpKernelContext* context) const {
   return DoConvTranspose(context, false);
 }
 
-template <typename T>
-Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const {
+template <typename T, bool NHWC>
+Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const {
   typedef typename ToHipType<T>::MappedType HipT;
 
   const Tensor* X = context->Input<Tensor>(0);
diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.h b/onnxruntime/core/providers/rocm/nn/conv_transpose.h
index b4b80aeec9421..55a84cc59fe92 100644
--- a/onnxruntime/core/providers/rocm/nn/conv_transpose.h
+++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.h
@@ -12,10 +12,12 @@ namespace onnxruntime {
 namespace rocm {
 
-template <typename T>
+template <typename T, bool NHWC>
 class ConvTranspose : public RocmKernel {
  public:
-  ConvTranspose(const OpKernelInfo& info) : RocmKernel(info), conv_transpose_attrs_(info){};
+  ConvTranspose(const OpKernelInfo& info) : RocmKernel(info), conv_transpose_attrs_(info) {
+    static_assert(!NHWC, "AMD builds don't support usage of NHWC ops");
+  };
   Status ComputeInternal(OpKernelContext* context) const override;
   Status DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const;
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index d6a5761d54042..806e536cb4ddb 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -247,6 +247,7 @@ def convert_arg_line_to_args(self, arg_line):
         "--cudnn_home is not specified.",
     )
     parser.add_argument("--enable_cuda_line_info", action="store_true", help="Enable CUDA line info.")
+    parser.add_argument("--enable_cuda_nhwc_ops", action="store_true", help="Enable CUDA NHWC ops in build.")
 
     # Python bindings
     parser.add_argument("--enable_pybind", action="store_true", help="Enable Python Bindings.")
@@ -1025,6 +1026,7 @@ def generate_build_tree(
         "-Donnxruntime_USE_MPI=" + ("ON" if args.use_mpi else "OFF"),
         "-Donnxruntime_ENABLE_MEMORY_PROFILE=" + ("ON" if args.enable_memory_profile else "OFF"),
         "-Donnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO=" + ("ON" if args.enable_cuda_line_info else "OFF"),
+        "-Donnxruntime_USE_CUDA_NHWC_OPS=" + ("ON" if args.enable_cuda_nhwc_ops else "OFF"),
        "-Donnxruntime_BUILD_WEBASSEMBLY_STATIC_LIB=" + ("ON" if args.build_wasm_static_lib else "OFF"),
         "-Donnxruntime_ENABLE_WEBASSEMBLY_EXCEPTION_CATCHING="
         + ("OFF" if args.disable_wasm_exception_catching else "ON"),
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
index 981cbec4ef50f..71a580f348f6f 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
@@ -106,7 +106,7 @@ jobs:
                 --parallel \
                 --build_wheel \
                 --enable_onnx_tests --use_cuda --cuda_version=${{variables.common_cuda_version}} --cuda_home=/usr/local/cuda-${{variables.common_cuda_version}} --cudnn_home=/usr/local/cuda-${{variables.common_cuda_version}} \
-                --enable_cuda_profiling \
+                --enable_cuda_profiling --enable_cuda_nhwc_ops \
                --enable_pybind --build_java \
                 --use_cache \
                 --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75; \