Make CUDA a NHWC EP #17200

Merged Oct 16, 2023 (25 commits from cuda_nhwc into main)
Commits (25):
cbecb0d
benchmarking utilities
gedoensmax Aug 16, 2023
8917c74
option to make CUDA EP prefer NHWC
gedoensmax Aug 16, 2023
2cdd184
register NHWC conv
gedoensmax Aug 16, 2023
51db617
batch- and instance norm NHWC
gedoensmax Aug 16, 2023
5e0e235
pooling NHWC op
gedoensmax Aug 17, 2023
3ab4f85
resolving hacky NHWC command line parsing and enabling arbitrary options
gedoensmax Aug 22, 2023
fba5864
add copyright header
gedoensmax Aug 22, 2023
5a258ec
CUDA NHWC unit test
gedoensmax Aug 29, 2023
ea0d033
using pre pack to transpose weights
gedoensmax Aug 29, 2023
0aa7e51
Adding more unit tests
gedoensmax Aug 29, 2023
7d040a9
remove bench script
gedoensmax Aug 30, 2023
f3625c3
dropping instance norm and moving to typed tests
gedoensmax Aug 30, 2023
7b808a3
address review comments
gedoensmax Sep 20, 2023
7892929
adding compile option to omit NHWC kernels
gedoensmax Sep 27, 2023
fff1860
remove remaining fused conv and filter NHWC supported ops
gedoensmax Oct 2, 2023
d767a6d
review change cleanup
gedoensmax Oct 4, 2023
9df8fe2
adding pre pack sync
gedoensmax Oct 4, 2023
8d12c11
lint changes
gedoensmax Oct 9, 2023
3ad8e5c
support contrib NHWC conv
gedoensmax Oct 10, 2023
43cc600
update include
gedoensmax Oct 10, 2023
8b15fb8
revert documentation of prepack bool
gedoensmax Oct 10, 2023
b2402a9
formatting and adding more comments
gedoensmax Oct 11, 2023
d0daf4e
cpplint changes
gedoensmax Oct 12, 2023
1cd4672
fix error messages
gedoensmax Oct 12, 2023
33a87f8
Merge branch 'main' into cuda_nhwc
gedoensmax Oct 15, 2023
4 changes: 4 additions & 0 deletions cmake/CMakeLists.txt
@@ -78,6 +78,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
# use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead.
cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF)

option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF)
option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF)
option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
option(onnxruntime_USE_COREML "Build with CoreML support" OFF)
@@ -671,6 +672,9 @@ set(ORT_PROVIDER_FLAGS)
set(ORT_PROVIDER_CMAKE_FLAGS)

if (onnxruntime_USE_CUDA)
if (onnxruntime_USE_CUDA_NHWC_OPS)
add_compile_definitions(ENABLE_CUDA_NHWC_OPS)
endif()
enable_language(CUDA)
message( STATUS "CMAKE_CUDA_COMPILER_VERSION: ${CMAKE_CUDA_COMPILER_VERSION}")

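The NHWC kernels are opt-in at build time via the new CMake option. A minimal build sketch, assuming the usual build.py passthrough of extra CMake defines:

./build.sh --use_cuda --config Release --cmake_extra_defines onnxruntime_USE_CUDA_NHWC_OPS=ON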
13 changes: 10 additions & 3 deletions cmake/onnxruntime_unittests.cmake
@@ -374,6 +374,13 @@ if (onnxruntime_USE_CUDA AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_R
"${TEST_SRC_DIR}/providers/cuda/*"
)
list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_src})

if (onnxruntime_USE_CUDA_NHWC_OPS)
file(GLOB onnxruntime_test_providers_cuda_nhwc_src CONFIGURE_DEPENDS
"${TEST_SRC_DIR}/providers/cuda/nhwc/*.cc"
)
list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_nhwc_src})
endif()
endif()

if (onnxruntime_USE_CANN)
@@ -851,7 +858,7 @@ if (HAS_SHORTEN_64_TO_32 AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
endif()

if (UNIX AND onnxruntime_USE_TENSORRT)
# The test_main.cc includes NvInfer.h where it has many deprecated declarations
# simply ignore them for TensorRT EP build
set_property(TARGET onnxruntime_test_all APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
endif()
@@ -1294,7 +1301,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
endif()

if (UNIX AND onnxruntime_USE_TENSORRT)
# The test_main.cc includes NvInfer.h where it has many deprecated declarations
# simply ignore them for TensorRT EP build
set_property(TARGET onnxruntime_shared_lib_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
endif()
@@ -1583,7 +1590,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
endif()

if (UNIX AND onnxruntime_USE_TENSORRT)
# The test_main.cc includes NvInfer.h where it has many deprecated declarations
# simply ignore them for TensorRT EP build
set_property(TARGET onnxruntime_customopregistration_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
endif()
include/onnxruntime/core/providers/cuda/cuda_provider_options.h
@@ -1,8 +1,11 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Copyright (c) 2023 NVIDIA Corporation.
// Licensed under the MIT License.

#pragma once

#include <limits>

#include "onnxruntime_c_api.h"
#include "core/framework/arena_extend_strategy.h"

@@ -32,5 +35,6 @@
int tunable_op_max_tuning_duration_ms = 0; // Max tuning duration time limit for TunableOp.
int enable_skip_layer_norm_strict_mode = 0; // flag specifying if SkipLayerNorm is in strict mode. If true, use LayerNormalization kernel.
// The strict mode has better accuracy but lower performance.
int prefer_nhwc = 0; // make the CUDA EP NHWC preferred

int use_ep_level_unified_stream = 0; // flag specifying if ep level stream is used or not
};
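For reference, a minimal sketch of turning the new option on through the C API. This assumes the provider-option key matches the struct field name above ("prefer_nhwc"), since this PR wires the CUDA EP options through generic key/value parsing:

#include <onnxruntime_cxx_api.h>

int main() {
  const OrtApi& api = Ort::GetApi();
  OrtCUDAProviderOptionsV2* cuda_options = nullptr;
  Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda_options));

  // Assumption: the key mirrors the struct field documented above.
  const char* keys[] = {"prefer_nhwc"};
  const char* values[] = {"1"};
  Ort::ThrowOnError(api.UpdateCUDAProviderOptions(cuda_options, keys, values, 1));

  Ort::SessionOptions session_options;
  Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options));
  api.ReleaseCUDAProviderOptions(cuda_options);
  // session_options can now be used to create an Ort::Session as usual.
  return 0;
}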
onnxruntime/contrib_ops/cuda/conv_transpose_with_dynamic_pads.h
@@ -1,4 +1,5 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Copyright (c) 2023 NVIDIA Corporation.
// Licensed under the MIT License.

#pragma once
@@ -10,12 +11,12 @@
namespace cuda {

template <typename T>
-class ConvTransposeWithDynamicPads : public ::onnxruntime::cuda::ConvTranspose<T> {
+class ConvTransposeWithDynamicPads : public ::onnxruntime::cuda::ConvTranspose<T, false> {
public:
-ConvTransposeWithDynamicPads(const OpKernelInfo& info) : ::onnxruntime::cuda::ConvTranspose<T>(info) {}
+ConvTransposeWithDynamicPads(const OpKernelInfo& info) : ::onnxruntime::cuda::ConvTranspose<T, false>(info) {}

Status ComputeInternal(OpKernelContext* context) const override {
-return ::onnxruntime::cuda::ConvTranspose<T>::DoConvTranspose(context, true);
+return ::onnxruntime::cuda::ConvTranspose<T, false>::DoConvTranspose(context, true);
}
};
} // namespace cuda
@@ -30,6 +30,23 @@ CostCheckResult PostLayoutTransformCostCheck(const api::GraphRef& graph, const a
return OrtEPCostCheck(graph, node, perm, outputs_leading_to_transpose);
}

#if defined(USE_CUDA) && ENABLE_CUDA_NHWC_OPS
const std::unordered_set<std::string_view>& GetCUDALayoutSensitiveOps() {
static std::unordered_set<std::string_view> cuda_nhwc_ops = []() {
return std::unordered_set<std::string_view>{
"BatchNormalization",
"Conv",
"ConvTranspose",
"GlobalMaxPool",
"MaxPool",
"GlobalAveragePool",
"AveragePool",
};
}();
return cuda_nhwc_ops;
}
#endif

/// <summary>
/// Default function for checking if a node should have its layout changed. Allows EP specific adjustments to the
/// default set of layout sensitive operators if required.
@@ -71,11 +88,16 @@ bool ConvertNodeLayout(const api::NodeRef& node) {
}
#endif

-// #if defined(USE_CUDA)
-// if (node.GetExecutionProviderType() == kCudaExecutionProvider) {
-// Update as per https://github.com/microsoft/onnxruntime/pull/17200 with CUDA ops that support NHWC
-// }
-// #endif
+#if defined(USE_CUDA) && ENABLE_CUDA_NHWC_OPS
+if (node.GetExecutionProviderType() == kCudaExecutionProvider) {
+if (layout_sensitive_ops.count(node.OpType())) {
+const auto& cuda_nhwc_ops = GetCUDALayoutSensitiveOps();
+if (!cuda_nhwc_ops.count(node.OpType())) {
+return false;
+}
+}
+}
+#endif

return layout_sensitive_ops.count(node.OpType()) != 0;
}
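Background, not part of this diff: when ConvertNodeLayout returns true, the layout transformer wraps the converted node's inputs and outputs in Transpose nodes using the standard 4-D permutations, which is why only ops with real NHWC kernels should opt in. A reference sketch of those permutations:

#include <array>
#include <cstdint>

// Canonical 4-D layout permutations (reference sketch, not code from this PR).
constexpr std::array<int64_t, 4> kNchwToNhwc{0, 2, 3, 1};  // {N,C,H,W} -> {N,H,W,C}
constexpr std::array<int64_t, 4> kNhwcToNchw{0, 3, 1, 2};  // {N,H,W,C} -> {N,C,H,W}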
27 changes: 20 additions & 7 deletions onnxruntime/core/providers/cpu/nn/batch_norm_helper.h
@@ -1,4 +1,5 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Copyright (c) 2023 NVIDIA Corporation.
// Licensed under the MIT License.

#pragma once
@@ -22,11 +23,17 @@ class BatchNormHelper {
const Tensor* B,
const Tensor* mean,
const Tensor* var,
-bool is_spatial = true) {
+bool is_spatial = true,
+bool is_nhwc = false) {
const auto& x_dims = X->Shape().GetDims();

// If x_dims size < 2, num_channels defaults to 1.
-int64_t num_channels = x_dims.size() > 1 ? x_dims[1] : 1;
+int64_t num_channels;
+if (is_nhwc) {
+num_channels = x_dims.size() > 1 ? x_dims[x_dims.size() - 1] : 1;
+} else {
+num_channels = x_dims.size() > 1 ? x_dims[1] : 1;
+}
// the first 2 are respectively - N and C.
int num_feature_dims = x_dims.size() > 1 ? static_cast<int>(x_dims.size() - 2) : 0;

@@ -109,7 +116,7 @@ class BatchNormHelper {
return common::Status::OK();
}

-static void NormalizeDims(const TensorShape& x_shape, std::vector<int64_t>& new_dims) {
+static void NormalizeDims(const TensorShape& x_shape, std::vector<int64_t>& new_dims, bool is_nhwc = false) {
new_dims.clear();
auto orig_dims = x_shape.GetDims();
ORT_ENFORCE(orig_dims.size() < 6,
@@ -122,13 +129,19 @@

auto rank = x_shape.NumDimensions();
auto num_samples = rank > 0 ? orig_dims[0] : 1; // NCHW
-auto num_channels = rank > 1 ? orig_dims[1] : 1;
-auto height = rank > 2 ? orig_dims[2] : 1;
+const size_t channel_dim = is_nhwc ? rank - 1 : 1;
+const size_t height_dim = is_nhwc ? 1 : 2;
+auto num_channels = rank > 1 ? orig_dims[channel_dim] : 1;
+auto height = rank > 2 ? orig_dims[height_dim] : 1;
int64_t width = 1;
-new_dims = {num_samples, num_channels, height, width};
+if (is_nhwc) {
+new_dims = {num_samples, height, width, num_channels};
+} else {
+new_dims = {num_samples, num_channels, height, width};
+}
}
};
} // namespace onnxruntime
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
#endif
#endif
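The net effect of the changes above: the helper reads the channel count from the last axis when is_nhwc is set. A standalone sketch (hypothetical helper for illustration, not code from this PR):

#include <cstdint>
#include <vector>

// Hypothetical illustration of the channel lookup the helper now performs.
int64_t NumChannels(const std::vector<int64_t>& dims, bool is_nhwc) {
  if (dims.size() < 2) return 1;                     // mirrors the helper's default
  return is_nhwc ? dims[dims.size() - 1] : dims[1];  // NHWC: last axis, NCHW: axis 1
}
// {8, 3, 224, 224} as NCHW -> 3; {8, 224, 224, 3} as NHWC -> 3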
27 changes: 19 additions & 8 deletions onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h
@@ -14,6 +14,7 @@
* limitations under the License.
*/
/* Modifications Copyright (c) Microsoft. */
// Copyright (c) 2023 NVIDIA Corporation.

#pragma once

@@ -44,17 +45,19 @@
};

Status PrepareForCompute(OpKernelContext* context, bool has_bias, Prepare& p,
-bool dynamic_padding = false, const TensorShape* filter_shape = nullptr) const {
+bool dynamic_padding = false, const TensorShape* filter_shape = nullptr,
+bool is_nhwc = false) const {
const Tensor* X = context->Input<Tensor>(0);
const Tensor* F = (filter_shape != nullptr) ? nullptr : context->Input<Tensor>(1);
const TensorShape& F_Shape = (filter_shape != nullptr) ? *filter_shape : F->Shape();
const Tensor* Pads = dynamic_padding ? context->Input<Tensor>(2) : nullptr;
const Tensor* B = has_bias ? (dynamic_padding ? context->Input<Tensor>(3) : context->Input<Tensor>(2)) : nullptr;
-TensorShape input_shape = X->Shape().Slice(2);
-const int64_t num_input_channels = X->Shape()[1];
+const int rank = static_cast<int>(X->Shape().NumDimensions());
+TensorShape input_shape = X->Shape().Slice(is_nhwc ? 1 : 2, is_nhwc ? rank - 1 : rank);
+const int64_t num_input_channels = is_nhwc ? X->Shape()[rank - 1] : X->Shape()[1];
const int64_t N = X->Shape()[0];
-const int64_t num_output_channels_multiplier = F_Shape[1];
+const int64_t num_output_channels_multiplier = is_nhwc ? F_Shape[3] : F_Shape[1];
const int64_t num_output_channels = num_output_channels_multiplier * group;

// input validations
@@ -85,7 +88,7 @@
}

TensorShapeVector kernel_shape;
-ORT_RETURN_IF_ERROR(ComputeKernelShape(F_Shape, kernel_shape));
+ORT_RETURN_IF_ERROR(ComputeKernelShape(F_Shape, kernel_shape, is_nhwc));

TensorShapeVector local_output_padding(output_padding);
if (local_output_padding.empty()) {
@@ -115,7 +118,7 @@
TensorShapeVector Y_dims;

ComputePadsAndOutputShape(input_shape, num_output_channels, kernel_shape,
-local_strides, local_dilations, local_output_padding, N, &local_pads, &Y_dims);
+local_strides, local_dilations, local_output_padding, N, &local_pads, &Y_dims, is_nhwc);
TensorShape Yshape(Y_dims);
Tensor* Y = context->Output(0, Yshape);

@@ -137,9 +140,14 @@
void ComputePadsAndOutputShape(TensorShape input_shape, int64_t output_channel,
const TensorShapeVector& kernel_shape, const TensorShapeVector& p_strides,
const TensorShapeVector& p_dilations, const TensorShapeVector& p_output_padding, const int64_t N,
-ConvPadVector* p_pads, TensorShapeVector* output_shape_p) const {
+ConvPadVector* p_pads, TensorShapeVector* output_shape_p,
+bool is_nhwc = false) const {
size_t output_shape_size = output_shape.size();
-output_shape_p->insert(output_shape_p->begin(), {N, output_channel});
+if (is_nhwc) {
+output_shape_p->insert(output_shape_p->begin(), {N});
+} else {
+output_shape_p->insert(output_shape_p->begin(), {N, output_channel});
+}

size_t rank = input_shape.NumDimensions();
for (size_t dim = 0; dim < rank; ++dim) {
@@ -163,6 +171,9 @@
ORT_ENFORCE(dim_size > 0, "Invalid input shape: ", input_shape.ToString());
output_shape_p->push_back(dim_size);
}
if (is_nhwc) {
output_shape_p->push_back(output_channel);
}
}

TensorShapeVector output_padding;
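Condensed, the output-shape assembly above differs only in where the channel dimension lands. A hypothetical standalone version, for illustration only:

#include <cstdint>
#include <vector>

// Hypothetical illustration of the NCHW/NHWC output-shape assembly.
std::vector<int64_t> AssembleOutputShape(int64_t N, int64_t C_out,
                                         const std::vector<int64_t>& spatial,
                                         bool is_nhwc) {
  std::vector<int64_t> out{N};
  if (!is_nhwc) out.push_back(C_out);  // NCHW: {N, C_out, spatial...}
  out.insert(out.end(), spatial.begin(), spatial.end());
  if (is_nhwc) out.push_back(C_out);   // NHWC: {N, spatial..., C_out}
  return out;
}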
23 changes: 15 additions & 8 deletions onnxruntime/core/providers/cpu/nn/instance_norm_helper.h
@@ -1,4 +1,5 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Copyright (c) 2023 NVIDIA Corporation.
// Licensed under the MIT License.

#pragma once
@@ -8,13 +9,16 @@
#include "core/framework/tensor.h"
#endif
#include <sstream>
#include <utility>

namespace onnxruntime {

class InstanceNormHelper {
public:
-static common::Status ValidateInputs(const Tensor* input, const Tensor* scale, const Tensor* B) {
-if (input->Shape().NumDimensions() < 3) {
+static common::Status ValidateInputs(const Tensor* input, const Tensor* scale, const Tensor* B,
+bool is_nhwc = false) {
+const auto rank = input->Shape().NumDimensions();
+if (rank < 3) {
std::ostringstream ostr;
ostr << "Invalid input data: number of dimensions is less than 3: " << input->Shape().NumDimensions();
return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str());
@@ -24,10 +28,13 @@
ostr << "Invalid input scale: number of dimensions is not 1: " << scale->Shape().NumDimensions();
return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str());
}
-if (scale->Shape().Size() != input->Shape().GetDims()[1]) {
+auto in_dims = input->Shape().GetDims();
+auto in_channels = is_nhwc ? in_dims[rank - 1] : in_dims[1];
+
+if (scale->Shape().Size() != in_channels) {
std::ostringstream ostr;
-ostr << "Mismatch between input data and scale: size of scale != input channel count "
-<< scale->Shape().Size() << " vs. " << input->Shape().GetDims()[1];
+ostr << "Mismatch between input data and scale: size of scale != input channel count " << scale->Shape().Size()
+<< " vs. " << in_channels;
return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str());
}

@@ -37,10 +44,10 @@
return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str());
}

-if (B->Shape().Size() != input->Shape().GetDims()[1]) {
+if (B->Shape().Size() != in_channels) {
std::ostringstream ostr;
-ostr << "Mismatch between input data and B: size of B != input channel count "
-<< B->Shape().Size() << " vs. " << input->Shape().GetDims()[1];
+ostr << "Mismatch between input data and B: size of B != input channel count " << B->Shape().Size() << " vs. "
+<< in_channels;
return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str());
}

21 changes: 14 additions & 7 deletions onnxruntime/core/providers/cpu/nn/pool_attributes.h
@@ -1,4 +1,5 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Copyright (c) 2023 NVIDIA Corporation.
// Licensed under the MIT License.

#pragma once
@@ -98,28 +99,34 @@ struct PoolAttributes {

TensorShapeVector SetOutputSize(const TensorShape& input_shape,
int64_t output_channel,
-TensorShapeVector* actual_pads) const {
+TensorShapeVector* actual_pads,
+bool is_nhwc = false) const {
ORT_ENFORCE(input_shape.Size() > 0 || input_shape[0] == 0,
"Invalid input shape. Only N can be zero. Got:", input_shape);
TensorShapeVector output_dims;
int64_t N = input_shape[0];
-InferOutputSize(input_shape.GetDims(), &output_dims, actual_pads);
-output_dims.insert(output_dims.begin(), {N, output_channel});
+InferOutputSize(input_shape.GetDims(), &output_dims, actual_pads, is_nhwc);
+if (is_nhwc) {
+output_dims.insert(output_dims.begin(), N);
+output_dims.push_back(output_channel);
+} else {
+output_dims.insert(output_dims.begin(), {N, output_channel});
+}
return output_dims;
}

void InferOutputSize(gsl::span<const int64_t> input_dims,
TensorShapeVector* output_dims,
-TensorShapeVector* actual_pads) const {
+TensorShapeVector* actual_pads,
+bool is_nhwc = false) const {
ORT_ENFORCE(input_dims.size() >= 2);
if (global_pooling) {
output_dims->assign(input_dims.size() - 2, 1);
} else {
for (size_t dim = 0; dim < input_dims.size() - 2; ++dim) {
int64_t dim_size = 0;
-ComputeSizePadDilations(static_cast<int>(input_dims[dim + 2]),
+auto spatial_dim = is_nhwc ? input_dims[dim + 1] : input_dims[dim + 2];
+ComputeSizePadDilations(static_cast<int>(spatial_dim),
strides[dim],
kernel_shape[dim],
&actual_pads->at(dim),
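For reference, the per-dimension arithmetic delegated to ComputeSizePadDilations reduces to the usual pooling output formula. A sketch assuming floor mode (ceil mode is handled separately in the real helper):

#include <cstdint>

// Standard pooling output-size formula (floor mode, assumed for this sketch).
int64_t PoolOutSize(int64_t in, int64_t kernel, int64_t stride,
                    int64_t pad_total, int64_t dilation) {
  const int64_t effective_kernel = dilation * (kernel - 1) + 1;
  return (in + pad_total - effective_kernel) / stride + 1;
}
// e.g. in=224, kernel=3, stride=2, pad_total=2, dilation=1 -> 112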