Make CUDA a NHWC EP #17200

Merged Oct 16, 2023 (25 commits from cuda_nhwc into main)
Commits (25):
cbecb0d
benchmarking utilities
gedoensmax Aug 16, 2023
8917c74
option to make CUDA EP prefer NHWC
gedoensmax Aug 16, 2023
2cdd184
register NHWC conv
gedoensmax Aug 16, 2023
51db617
batch- and instance norm NHWC
gedoensmax Aug 16, 2023
5e0e235
pooling NHWC op
gedoensmax Aug 17, 2023
3ab4f85
resolving hacky NHWC command line parsing and enabling arbitrary options
gedoensmax Aug 22, 2023
fba5864
add copyright header
gedoensmax Aug 22, 2023
5a258ec
CUDA NHWC unit test
gedoensmax Aug 29, 2023
ea0d033
using pre pack to transpose weights
gedoensmax Aug 29, 2023
0aa7e51
Adding more unit tests
gedoensmax Aug 29, 2023
7d040a9
remove bench script
gedoensmax Aug 30, 2023
f3625c3
dropping instance norm and moving to typed tests
gedoensmax Aug 30, 2023
7b808a3
address review comments
gedoensmax Sep 20, 2023
7892929
adding compile option to omit NHWC kernels
gedoensmax Sep 27, 2023
fff1860
remove remaining fused conv and filter NHWC supported ops
gedoensmax Oct 2, 2023
d767a6d
review change cleanup
gedoensmax Oct 4, 2023
9df8fe2
adding pre pack sync
gedoensmax Oct 4, 2023
8d12c11
lint changes
gedoensmax Oct 9, 2023
3ad8e5c
support contrib NHWC conv
gedoensmax Oct 10, 2023
43cc600
update include
gedoensmax Oct 10, 2023
8b15fb8
revert documentation of prepack bool
gedoensmax Oct 10, 2023
b2402a9
formatting and adding more comments
gedoensmax Oct 11, 2023
d0daf4e
cpplint changes
gedoensmax Oct 12, 2023
1cd4672
fix error messages
gedoensmax Oct 12, 2023
33a87f8
Merge branch 'main' into cuda_nhwc
gedoensmax Oct 15, 2023
4 changes: 4 additions & 0 deletions cmake/CMakeLists.txt
@@ -78,6 +78,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
# use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead.
cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF)

option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF)
option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF)
option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
option(onnxruntime_USE_COREML "Build with CoreML support" OFF)
@@ -671,6 +672,9 @@ set(ORT_PROVIDER_FLAGS)
set(ORT_PROVIDER_CMAKE_FLAGS)

if (onnxruntime_USE_CUDA)
if (onnxruntime_USE_CUDA_NHWC_OPS)
add_compile_definitions(ENABLE_CUDA_NHWC_OPS)
endif()
enable_language(CUDA)
message( STATUS "CMAKE_CUDA_COMPILER_VERSION: ${CMAKE_CUDA_COMPILER_VERSION}")

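The NHWC kernels are opt-in at build time via the new CMake option. A minimal build sketch, assuming the usual build.py passthrough of extra CMake defines:

./build.sh --use_cuda --config Release --cmake_extra_defines onnxruntime_USE_CUDA_NHWC_OPS=ON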
13 changes: 10 additions & 3 deletions cmake/onnxruntime_unittests.cmake
@@ -374,6 +374,13 @@ if (onnxruntime_USE_CUDA AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_R
"${TEST_SRC_DIR}/providers/cuda/*"
)
list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_src})

if (onnxruntime_USE_CUDA_NHWC_OPS)
file(GLOB onnxruntime_test_providers_cuda_nhwc_src CONFIGURE_DEPENDS
"${TEST_SRC_DIR}/providers/cuda/nhwc/*.cc"
)
list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_nhwc_src})
endif()
endif()

if (onnxruntime_USE_CANN)
@@ -851,7 +858,7 @@ if (HAS_SHORTEN_64_TO_32 AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
endif()

if (UNIX AND onnxruntime_USE_TENSORRT)
# The test_main.cc includes NvInfer.h where it has many deprecated declarations
# simply ignore them for TensorRT EP build
set_property(TARGET onnxruntime_test_all APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
endif()
@@ -1294,7 +1301,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
endif()

if (UNIX AND onnxruntime_USE_TENSORRT)
# The test_main.cc includes NvInfer.h where it has many deprecated declarations
# simply ignore them for TensorRT EP build
set_property(TARGET onnxruntime_shared_lib_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
endif()
@@ -1583,7 +1590,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
endif()

if (UNIX AND onnxruntime_USE_TENSORRT)
# The test_main.cc includes NvInfer.h where it has many deprecated declarations
# simply ignore them for TensorRT EP build
set_property(TARGET onnxruntime_customopregistration_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
endif()
include/onnxruntime/core/providers/cuda/cuda_provider_options.h
@@ -1,8 +1,11 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Copyright (c) 2023 NVIDIA Corporation.
// Licensed under the MIT License.

#pragma once

#include <limits>

#include "onnxruntime_c_api.h"
#include "core/framework/arena_extend_strategy.h"

@@ -32,5 +35,6 @@
int tunable_op_max_tuning_duration_ms = 0; // Max tuning duration time limit for TunableOp.
int enable_skip_layer_norm_strict_mode = 0; // flag specifying if SkipLayerNorm is in strict mode. If true, use LayerNormalization kernel.
// The strict mode has better accuracy but lower performance.
int prefer_nhwc = 0; // make the CUDA EP NHWC preferred

int use_ep_level_unified_stream = 0; // flag specifying if ep level stream is used or not
};
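For reference, a minimal sketch of turning the new option on through the C API. This assumes the provider-option key matches the struct field name above ("prefer_nhwc"), since this PR wires the CUDA EP options through generic key/value parsing:

#include <onnxruntime_cxx_api.h>

int main() {
  const OrtApi& api = Ort::GetApi();
  OrtCUDAProviderOptionsV2* cuda_options = nullptr;
  Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda_options));

  // Assumption: the key mirrors the struct field documented above.
  const char* keys[] = {"prefer_nhwc"};
  const char* values[] = {"1"};
  Ort::ThrowOnError(api.UpdateCUDAProviderOptions(cuda_options, keys, values, 1));

  Ort::SessionOptions session_options;
  Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options));
  api.ReleaseCUDAProviderOptions(cuda_options);
  // session_options can now be used to create an Ort::Session as usual.
  return 0;
}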
onnxruntime/contrib_ops/cuda/conv_transpose_with_dynamic_pads.h
@@ -1,4 +1,5 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Copyright (c) 2023 NVIDIA Corporation.
// Licensed under the MIT License.

#pragma once
@@ -10,12 +11,12 @@
namespace cuda {

template <typename T>
-class ConvTransposeWithDynamicPads : public ::onnxruntime::cuda::ConvTranspose<T> {
+class ConvTransposeWithDynamicPads : public ::onnxruntime::cuda::ConvTranspose<T, false> {
public:
-ConvTransposeWithDynamicPads(const OpKernelInfo& info) : ::onnxruntime::cuda::ConvTranspose<T>(info) {}
+ConvTransposeWithDynamicPads(const OpKernelInfo& info) : ::onnxruntime::cuda::ConvTranspose<T, false>(info) {}

Status ComputeInternal(OpKernelContext* context) const override {
-return ::onnxruntime::cuda::ConvTranspose<T>::DoConvTranspose(context, true);
+return ::onnxruntime::cuda::ConvTranspose<T, false>::DoConvTranspose(context, true);
}
};
} // namespace cuda
@@ -30,6 +30,23 @@ CostCheckResult PostLayoutTransformCostCheck(const api::GraphRef& graph, const a
return OrtEPCostCheck(graph, node, perm, outputs_leading_to_transpose);
}

#if defined(USE_CUDA) && ENABLE_CUDA_NHWC_OPS
const std::unordered_set<std::string_view>& GetCUDALayoutSensitiveOps() {
static std::unordered_set<std::string_view> cuda_nhwc_ops = []() {
return std::unordered_set<std::string_view>{
"BatchNormalization",
"Conv",
"ConvTranspose",
"GlobalMaxPool",
"MaxPool",
"GlobalAveragePool",
"AveragePool",
};
}();
return cuda_nhwc_ops;
}
#endif

/// <summary>
/// Default function for checking if a node should have its layout changed. Allows EP specific adjustments to the
/// default set of layout sensitive operators if required.
@@ -71,11 +88,16 @@ bool ConvertNodeLayout(const api::NodeRef& node) {
}
#endif

-// #if defined(USE_CUDA)
-// if (node.GetExecutionProviderType() == kCudaExecutionProvider) {
-// Update as per https://github.com/microsoft/onnxruntime/pull/17200 with CUDA ops that support NHWC
-// }
-// #endif
+#if defined(USE_CUDA) && ENABLE_CUDA_NHWC_OPS
+if (node.GetExecutionProviderType() == kCudaExecutionProvider) {
+if (layout_sensitive_ops.count(node.OpType())) {
+const auto& cuda_nhwc_ops = GetCUDALayoutSensitiveOps();
+if (!cuda_nhwc_ops.count(node.OpType())) {
+return false;
+}
+}
+}
+#endif

return layout_sensitive_ops.count(node.OpType()) != 0;
}
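Background, not part of this diff: when ConvertNodeLayout returns true, the layout transformer wraps the converted node's inputs and outputs in Transpose nodes using the standard 4-D permutations, which is why only ops with real NHWC kernels should opt in. A reference sketch of those permutations:

#include <array>
#include <cstdint>

// Canonical 4-D layout permutations (reference sketch, not code from this PR).
constexpr std::array<int64_t, 4> kNchwToNhwc{0, 2, 3, 1};  // {N,C,H,W} -> {N,H,W,C}
constexpr std::array<int64_t, 4> kNhwcToNchw{0, 3, 1, 2};  // {N,H,W,C} -> {N,C,H,W}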
27 changes: 20 additions & 7 deletions onnxruntime/core/providers/cpu/nn/batch_norm_helper.h
@@ -1,4 +1,5 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Copyright (c) 2023 NVIDIA Corporation.
// Licensed under the MIT License.

#pragma once
@@ -22,11 +23,17 @@ class BatchNormHelper {
const Tensor* B,
const Tensor* mean,
const Tensor* var,
-bool is_spatial = true) {
+bool is_spatial = true,
+bool is_nhwc = false) {
const auto& x_dims = X->Shape().GetDims();

// If x_dims size < 2, num_channels defaults to 1.
-int64_t num_channels = x_dims.size() > 1 ? x_dims[1] : 1;
+int64_t num_channels;
+if (is_nhwc) {
+num_channels = x_dims.size() > 1 ? x_dims[x_dims.size() - 1] : 1;
+} else {
+num_channels = x_dims.size() > 1 ? x_dims[1] : 1;
+}
// the first 2 are respectively - N and C.
int num_feature_dims = x_dims.size() > 1 ? static_cast<int>(x_dims.size() - 2) : 0;

@@ -109,7 +116,7 @@ class BatchNormHelper {
return common::Status::OK();
}

-static void NormalizeDims(const TensorShape& x_shape, std::vector<int64_t>& new_dims) {
+static void NormalizeDims(const TensorShape& x_shape, std::vector<int64_t>& new_dims, bool is_nhwc = false) {
new_dims.clear();
auto orig_dims = x_shape.GetDims();
ORT_ENFORCE(orig_dims.size() < 6,
@@ -122,13 +129,19 @@

auto rank = x_shape.NumDimensions();
auto num_samples = rank > 0 ? orig_dims[0] : 1; // NCHW
-auto num_channels = rank > 1 ? orig_dims[1] : 1;
-auto height = rank > 2 ? orig_dims[2] : 1;
+const size_t channel_dim = is_nhwc ? rank - 1 : 1;
+const size_t height_dim = is_nhwc ? 1 : 2;
+auto num_channels = rank > 1 ? orig_dims[channel_dim] : 1;
+auto height = rank > 2 ? orig_dims[height_dim] : 1;
int64_t width = 1;
-new_dims = {num_samples, num_channels, height, width};
+if (is_nhwc) {
+new_dims = {num_samples, height, width, num_channels};
+} else {
+new_dims = {num_samples, num_channels, height, width};
+}
}
};
} // namespace onnxruntime
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
#endif
#endif
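The net effect of the changes above: the helper reads the channel count from the last axis when is_nhwc is set. A standalone sketch (hypothetical helper for illustration, not code from this PR):

#include <cstdint>
#include <vector>

// Hypothetical illustration of the channel lookup the helper now performs.
int64_t NumChannels(const std::vector<int64_t>& dims, bool is_nhwc) {
  if (dims.size() < 2) return 1;                     // mirrors the helper's default
  return is_nhwc ? dims[dims.size() - 1] : dims[1];  // NHWC: last axis, NCHW: axis 1
}
// {8, 3, 224, 224} as NCHW -> 3; {8, 224, 224, 3} as NHWC -> 3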
27 changes: 19 additions & 8 deletions onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h
@@ -14,6 +14,7 @@
* limitations under the License.
*/
/* Modifications Copyright (c) Microsoft. */
// Copyright (c) 2023 NVIDIA Corporation.

#pragma once

@@ -44,17 +45,19 @@
};

Status PrepareForCompute(OpKernelContext* context, bool has_bias, Prepare& p,
-bool dynamic_padding = false, const TensorShape* filter_shape = nullptr) const {
+bool dynamic_padding = false, const TensorShape* filter_shape = nullptr,
+bool is_nhwc = false) const {
const Tensor* X = context->Input<Tensor>(0);
const Tensor* F = (filter_shape != nullptr) ? nullptr : context->Input<Tensor>(1);
const TensorShape& F_Shape = (filter_shape != nullptr) ? *filter_shape : F->Shape();
const Tensor* Pads = dynamic_padding ? context->Input<Tensor>(2) : nullptr;
const Tensor* B = has_bias ? (dynamic_padding ? context->Input<Tensor>(3) : context->Input<Tensor>(2)) : nullptr;
-TensorShape input_shape = X->Shape().Slice(2);
-const int64_t num_input_channels = X->Shape()[1];
+const int rank = static_cast<int>(X->Shape().NumDimensions());
+TensorShape input_shape = X->Shape().Slice(is_nhwc ? 1 : 2, is_nhwc ? rank - 1 : rank);
+const int64_t num_input_channels = is_nhwc ? X->Shape()[rank - 1] : X->Shape()[1];
const int64_t N = X->Shape()[0];
-const int64_t num_output_channels_multiplier = F_Shape[1];
+const int64_t num_output_channels_multiplier = is_nhwc ? F_Shape[3] : F_Shape[1];
const int64_t num_output_channels = num_output_channels_multiplier * group;

// input validations
@@ -85,7 +88,7 @@
}

TensorShapeVector kernel_shape;
-ORT_RETURN_IF_ERROR(ComputeKernelShape(F_Shape, kernel_shape));
+ORT_RETURN_IF_ERROR(ComputeKernelShape(F_Shape, kernel_shape, is_nhwc));

TensorShapeVector local_output_padding(output_padding);
if (local_output_padding.empty()) {
@@ -115,7 +118,7 @@
TensorShapeVector Y_dims;

ComputePadsAndOutputShape(input_shape, num_output_channels, kernel_shape,
-local_strides, local_dilations, local_output_padding, N, &local_pads, &Y_dims);
+local_strides, local_dilations, local_output_padding, N, &local_pads, &Y_dims, is_nhwc);
TensorShape Yshape(Y_dims);
Tensor* Y = context->Output(0, Yshape);

@@ -137,9 +140,14 @@
void ComputePadsAndOutputShape(TensorShape input_shape, int64_t output_channel,
const TensorShapeVector& kernel_shape, const TensorShapeVector& p_strides,
const TensorShapeVector& p_dilations, const TensorShapeVector& p_output_padding, const int64_t N,
-ConvPadVector* p_pads, TensorShapeVector* output_shape_p) const {
+ConvPadVector* p_pads, TensorShapeVector* output_shape_p,
+bool is_nhwc = false) const {
size_t output_shape_size = output_shape.size();
-output_shape_p->insert(output_shape_p->begin(), {N, output_channel});
+if (is_nhwc) {
+output_shape_p->insert(output_shape_p->begin(), {N});
+} else {
+output_shape_p->insert(output_shape_p->begin(), {N, output_channel});
+}

size_t rank = input_shape.NumDimensions();
for (size_t dim = 0; dim < rank; ++dim) {
@@ -163,6 +171,9 @@
ORT_ENFORCE(dim_size > 0, "Invalid input shape: ", input_shape.ToString());
output_shape_p->push_back(dim_size);
}
if (is_nhwc) {
output_shape_p->push_back(output_channel);
}
}

TensorShapeVector output_padding;
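Condensed, the output-shape assembly above differs only in where the channel dimension lands. A hypothetical standalone version, for illustration only:

#include <cstdint>
#include <vector>

// Hypothetical illustration of the NCHW/NHWC output-shape assembly.
std::vector<int64_t> AssembleOutputShape(int64_t N, int64_t C_out,
                                         const std::vector<int64_t>& spatial,
                                         bool is_nhwc) {
  std::vector<int64_t> out{N};
  if (!is_nhwc) out.push_back(C_out);  // NCHW: {N, C_out, spatial...}
  out.insert(out.end(), spatial.begin(), spatial.end());
  if (is_nhwc) out.push_back(C_out);   // NHWC: {N, spatial..., C_out}
  return out;
}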
23 changes: 15 additions & 8 deletions onnxruntime/core/providers/cpu/nn/instance_norm_helper.h
@@ -1,4 +1,5 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Copyright (c) 2023 NVIDIA Corporation.
// Licensed under the MIT License.

#pragma once
@@ -8,13 +9,16 @@
#include "core/framework/tensor.h"
#endif
#include <sstream>
#include <utility>

namespace onnxruntime {

class InstanceNormHelper {
public:
-static common::Status ValidateInputs(const Tensor* input, const Tensor* scale, const Tensor* B) {
-if (input->Shape().NumDimensions() < 3) {
+static common::Status ValidateInputs(const Tensor* input, const Tensor* scale, const Tensor* B,
+bool is_nhwc = false) {
+const auto rank = input->Shape().NumDimensions();
+if (rank < 3) {
std::ostringstream ostr;
ostr << "Invalid input data: number of dimensions is less than 3: " << input->Shape().NumDimensions();
return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str());
@@ -24,10 +28,13 @@
ostr << "Invalid input scale: number of dimensions is not 1: " << scale->Shape().NumDimensions();
return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str());
}
-if (scale->Shape().Size() != input->Shape().GetDims()[1]) {
+auto in_dims = input->Shape().GetDims();
+auto in_channels = is_nhwc ? in_dims[rank - 1] : in_dims[1];
+
+if (scale->Shape().Size() != in_channels) {
std::ostringstream ostr;
-ostr << "Mismatch between input data and scale: size of scale != input channel count "
-<< scale->Shape().Size() << " vs. " << input->Shape().GetDims()[1];
+ostr << "Mismatch between input data and scale: size of scale != input channel count " << scale->Shape().Size()
+<< " vs. " << in_channels;
return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str());
}

@@ -37,10 +44,10 @@
return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str());
}

-if (B->Shape().Size() != input->Shape().GetDims()[1]) {
+if (B->Shape().Size() != in_channels) {
std::ostringstream ostr;
-ostr << "Mismatch between input data and B: size of B != input channel count "
-<< B->Shape().Size() << " vs. " << input->Shape().GetDims()[1];
+ostr << "Mismatch between input data and B: size of B != input channel count " << B->Shape().Size() << " vs. "
+<< in_channels;
return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str());
}

21 changes: 14 additions & 7 deletions onnxruntime/core/providers/cpu/nn/pool_attributes.h
@@ -1,4 +1,5 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Copyright (c) 2023 NVIDIA Corporation.
// Licensed under the MIT License.

#pragma once
@@ -98,28 +99,34 @@ struct PoolAttributes {

TensorShapeVector SetOutputSize(const TensorShape& input_shape,
int64_t output_channel,
-TensorShapeVector* actual_pads) const {
+TensorShapeVector* actual_pads,
+bool is_nhwc = false) const {
ORT_ENFORCE(input_shape.Size() > 0 || input_shape[0] == 0,
"Invalid input shape. Only N can be zero. Got:", input_shape);
TensorShapeVector output_dims;
int64_t N = input_shape[0];
-InferOutputSize(input_shape.GetDims(), &output_dims, actual_pads);
-output_dims.insert(output_dims.begin(), {N, output_channel});
+InferOutputSize(input_shape.GetDims(), &output_dims, actual_pads, is_nhwc);
+if (is_nhwc) {
+output_dims.insert(output_dims.begin(), N);
+output_dims.push_back(output_channel);
+} else {
+output_dims.insert(output_dims.begin(), {N, output_channel});
+}
return output_dims;
}

void InferOutputSize(gsl::span<const int64_t> input_dims,
TensorShapeVector* output_dims,
-TensorShapeVector* actual_pads) const {
+TensorShapeVector* actual_pads,
+bool is_nhwc = false) const {
ORT_ENFORCE(input_dims.size() >= 2);
if (global_pooling) {
output_dims->assign(input_dims.size() - 2, 1);
} else {
for (size_t dim = 0; dim < input_dims.size() - 2; ++dim) {
int64_t dim_size = 0;
-ComputeSizePadDilations(static_cast<int>(input_dims[dim + 2]),
+auto spatial_dim = is_nhwc ? input_dims[dim + 1] : input_dims[dim + 2];
+ComputeSizePadDilations(static_cast<int>(spatial_dim),
strides[dim],
kernel_shape[dim],
&actual_pads->at(dim),
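For reference, the per-dimension arithmetic delegated to ComputeSizePadDilations reduces to the usual pooling output formula. A sketch assuming floor mode (ceil mode is handled separately in the real helper):

#include <cstdint>

// Standard pooling output-size formula (floor mode, assumed for this sketch).
int64_t PoolOutSize(int64_t in, int64_t kernel, int64_t stride,
                    int64_t pad_total, int64_t dilation) {
  const int64_t effective_kernel = dilation * (kernel - 1) + 1;
  return (in + pad_total - effective_kernel) / stride + 1;
}
// e.g. in=224, kernel=3, stride=2, pad_total=2, dilation=1 -> 112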