From d0daf4e63b73df685779afc7ce13978f9a599698 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maximilian=20M=C3=BCller?=
Date: Thu, 12 Oct 2023 19:10:32 +0200
Subject: [PATCH] cpplint changes

---
 .../providers/cpu/nn/instance_norm_helper.h   |  12 +-
 .../core/providers/cuda/cuda_nhwc_kernels.cc  |   8 +-
 .../core/providers/cuda/cuda_nhwc_kernels.h   |  10 +-
 .../core/providers/cuda/cudnn_common.cc       |   4 +-
 onnxruntime/core/providers/cuda/nn/conv.cc    |   2 +
 onnxruntime/core/providers/cuda/nn/conv.h     |   4 +-
 .../core/providers/cuda/nn/conv_transpose.cc  | 112 ++++++------------
 .../core/providers/cuda/nn/conv_transpose.h   |   2 +
 onnxruntime/core/providers/cuda/nn/pool.h     |   2 +-
 onnxruntime/test/perftest/ort_test_session.cc |   3 +-
 .../providers/compare_provider_test_utils.cc  |   3 +-
 .../providers/cuda/nhwc/nhwc_cuda_helper.h    |   3 +
 12 files changed, 67 insertions(+), 98 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/nn/instance_norm_helper.h b/onnxruntime/core/providers/cpu/nn/instance_norm_helper.h
index 6734ebf62f56d..7aa51001f3b29 100644
--- a/onnxruntime/core/providers/cpu/nn/instance_norm_helper.h
+++ b/onnxruntime/core/providers/cpu/nn/instance_norm_helper.h
@@ -9,12 +9,14 @@
 #include "core/framework/tensor.h"
 #endif
 #include <sstream>
+#include <string>
 
 namespace onnxruntime {
 
 class InstanceNormHelper {
  public:
-  static common::Status ValidateInputs(const Tensor* input, const Tensor* scale, const Tensor* B, bool is_nhwc = false) {
+  static common::Status ValidateInputs(const Tensor* input, const Tensor* scale, const Tensor* B,
+                                       bool is_nhwc = false) {
     const auto rank = input->Shape().NumDimensions();
     if (rank < 3) {
       std::ostringstream ostr;
@@ -31,8 +33,8 @@ class InstanceNormHelper {
 
     if (scale->Shape().Size() != in_channels) {
       std::ostringstream ostr;
-      ostr << "Mismatch between input data and scale: size of scale != input channel count "
-           << scale->Shape().Size() << " vs. " << in_channels << "nhwc: " << is_nhwc;
+      ostr << "Mismatch between input data and scale: size of scale != input channel count " << scale->Shape().Size()
+           << " vs. " << input->Shape().GetDims()[1];
       return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str());
     }
 
@@ -44,8 +46,8 @@ class InstanceNormHelper {
 
     if (B->Shape().Size() != in_channels) {
       std::ostringstream ostr;
-      ostr << "Mismatch between input data and B: size of B != input channel count "
-           << B->Shape().Size() << " vs. " << input->Shape().GetDims()[1];
+      ostr << "Mismatch between input data and B: size of B != input channel count " << B->Shape().Size() << " vs. "
+           << input->Shape().GetDims()[1];
       return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str());
     }
 
diff --git a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc
index 15c3f3f579f76..f416caecd115f 100644
--- a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc
+++ b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc
@@ -4,13 +4,14 @@
 
 #ifdef ENABLE_CUDA_NHWC_OPS
 
+#include <utility>
+
 #include "core/providers/shared_library/provider_api.h"
 #include "core/providers/cuda/cuda_fwd.h"
 
 #include "core/providers/cuda/cuda_nhwc_kernels.h"
 
-namespace onnxruntime {
-namespace cuda {
+namespace onnxruntime::cuda {
 
 // When adding new supported NHWC operations make sure to also integrate them into: ConvertNodeLayout
 // in onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc
@@ -164,6 +165,5 @@ Status RegisterCudaNhwcKernels(KernelRegistry& kernel_registry) {
   }
   return Status::OK();
 }
-}  // namespace cuda
-}  // namespace onnxruntime
+}  // namespace onnxruntime::cuda
 #endif
diff --git a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.h b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.h
index 00afb2f0554e3..0b3a6d5cff0c7 100644
--- a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.h
+++ b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.h
@@ -6,12 +6,8 @@
 
 #include "core/common/status.h"
 
-using namespace onnxruntime::common;
+namespace onnxruntime::cuda {
 
-namespace onnxruntime {
-namespace cuda {
+onnxruntime::common::Status RegisterCudaNhwcKernels(onnxruntime::KernelRegistry& kernel_registry);
 
-Status RegisterCudaNhwcKernels(KernelRegistry& kernel_registry);
-
-}  // namespace cuda
-}  // namespace onnxruntime
+}  // namespace onnxruntime::cuda
diff --git a/onnxruntime/core/providers/cuda/cudnn_common.cc b/onnxruntime/core/providers/cuda/cudnn_common.cc
index 0b57ab559d04a..4df59a98b12e5 100644
--- a/onnxruntime/core/providers/cuda/cudnn_common.cc
+++ b/onnxruntime/core/providers/cuda/cudnn_common.cc
@@ -2,7 +2,9 @@
 // Copyright (c) 2023 NVIDIA Corporation.
 // Licensed under the MIT License.
 
-#include "cudnn_common.h"
+#include <utility>
+
+#include "core/providers/cuda/cudnn_common.h"
 #include "core/common/inlined_containers.h"
 #include "core/common/gsl.h"
 #include "shared_inc/cuda_call.h"
diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc
index de8782f89f9e0..82f3503919237 100644
--- a/onnxruntime/core/providers/cuda/nn/conv.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv.cc
@@ -2,6 +2,8 @@
 // Copyright (c) 2023 NVIDIA Corporation.
 // Licensed under the MIT License.
 
+#include <utility>
+
 #include "core/providers/cuda/nn/conv.h"
 #include "core/common/span_utils.h"
 #include "core/providers/cuda/cuda_common.h"
diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h
index e103f2e74863d..bcaa4d855b81e 100644
--- a/onnxruntime/core/providers/cuda/nn/conv.h
+++ b/onnxruntime/core/providers/cuda/nn/conv.h
@@ -4,11 +4,13 @@
 
 #pragma once
 
+#include <list>
+#include <memory>
+
 #include "core/platform/ort_mutex.h"
 #include "core/providers/cuda/cuda_kernel.h"
 #include "core/providers/cuda/cudnn_common.h"
 #include "core/providers/cpu/nn/conv_attributes.h"
-#include <list>
 
 namespace onnxruntime {
 
diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc
index a8831a09605b7..55dceaa2698e8 100644
--- a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc
@@ -2,6 +2,8 @@
 // Copyright (c) 2023 NVIDIA Corporation.
 // Licensed under the MIT License.
 
+#include <utility>
+
 #include "conv_transpose.h"
 #include "core/providers/cuda/tensor/transpose.h"
 
@@ -19,23 +21,13 @@ namespace cuda {
 
 // Op Set 11 for ConvTranspose only update document to clarify default dilations and strides value.
 // which are already covered by op set 11 cpu version, so simply add declaration.
-#define REGISTER_KERNEL_TYPED(T, DOMAIN, NHWC)                                                \
-  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                                                    \
-      ConvTranspose,                                                                          \
-      DOMAIN,                                                                                 \
-      1, 10,                                                                                  \
-      T,                                                                                      \
-      kCudaExecutionProvider,                                                                 \
-      (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()),    \
-      ConvTranspose<T, NHWC>);                                                                \
-  ONNX_OPERATOR_TYPED_KERNEL_EX(                                                              \
-      ConvTranspose,                                                                          \
-      DOMAIN,                                                                                 \
-      11,                                                                                     \
-      T,                                                                                      \
-      kCudaExecutionProvider,                                                                 \
-      (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()),    \
-      ConvTranspose<T, NHWC>);
+#define REGISTER_KERNEL_TYPED(T, DOMAIN, NHWC)                                                                      \
+  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                                                                          \
+      ConvTranspose, DOMAIN, 1, 10, T, kCudaExecutionProvider,                                                      \
+      (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), ConvTranspose<T, NHWC>); \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(ConvTranspose, DOMAIN, 11, T, kCudaExecutionProvider,                               \
+                                (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
+                                ConvTranspose<T, NHWC>);
 
 REGISTER_KERNEL_TYPED(float, kOnnxDomain, false)
 REGISTER_KERNEL_TYPED(double, kOnnxDomain, false)
@@ -52,8 +44,8 @@ Status ConvTranspose<T, NHWC>::ComputeInternal(OpKernelContext* context) const
 }
 
 template <typename T, bool NHWC>
-Status ConvTranspose<T, NHWC>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
-                                       bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) {
+Status ConvTranspose<T, NHWC>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed,
+                                       [[maybe_unused]] PrePackedWeights* prepacked_weights) {
   is_packed = false;
   // only layout of weight input is adjusted via PrePack
   if (NHWC) {  // InputTensors::IN_W
@@ -63,15 +55,10 @@ Status ConvTranspose<T, NHWC>::PrePack(const Tensor& tensor, int input_idx, All
     InlinedVector<size_t> perm{0, 2, 3, 1};
     gsl::span<size_t> permutation(perm.data(), 4);
 
-    TensorShapeVector new_dims{orig_shape[0],
-                               orig_shape[2],
-                               orig_shape[3],
-                               orig_shape[1]};
+    TensorShapeVector new_dims{orig_shape[0], orig_shape[2], orig_shape[3], orig_shape[1]};
     W_ = Tensor::Create(tensor.DataType(), TensorShape(new_dims), std::move(alloc));
 
-    auto status = cuda::Transpose::DoTranspose(GetDeviceProp(),
-                                               DefaultCudaStream(),
-                                               DefaultCublasHandle(),
+    auto status = cuda::Transpose::DoTranspose(GetDeviceProp(), DefaultCudaStream(), DefaultCublasHandle(),
                                                permutation,
                                                tensor, *W_);
     if (!status.IsOK()) {
@@ -126,8 +113,7 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
     bool input_dims_changed = (s_.last_x_dims.AsShapeVector() != x_dims);
     bool w_dims_changed = (s_.last_w_dims.AsShapeVector() != w_dims);
     if (input_dims_changed || w_dims_changed) {
-      if (input_dims_changed)
-        s_.last_x_dims = gsl::make_span(x_dims);
+      if (input_dims_changed) s_.last_x_dims = gsl::make_span(x_dims);
 
       if (w_dims_changed) {
         s_.last_w_dims = gsl::make_span(w_dims);
@@ -135,7 +121,8 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
       }
 
       ConvTransposeAttributes::Prepare p;
-      ORT_RETURN_IF_ERROR(conv_transpose_attrs_.PrepareForCompute(context, has_bias, p, dynamic_padding, &w_shape, NHWC));
+      ORT_RETURN_IF_ERROR(
+          conv_transpose_attrs_.PrepareForCompute(context, has_bias, p, dynamic_padding, &w_shape, NHWC));
 
       auto y_dims = p.Y->Shape().AsShapeVector();
       if (x_dimensions == 3) {
@@ -150,12 +137,9 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
 
       if (w_dims_changed) {
         if (NHWC) {
-          ORT_RETURN_IF_ERROR(s_.w_desc.Set(CUDNN_TENSOR_NHWC,
-                                            CudnnTensor::GetDataType<CudaT>(),
-                                            static_cast<int>(w_dims[0]),
-                                            static_cast<int>(w_dims[3]),
-                                            static_cast<int>(w_dims[1]),
-                                            static_cast<int>(w_dims[2])));
+          ORT_RETURN_IF_ERROR(s_.w_desc.Set(CUDNN_TENSOR_NHWC, CudnnTensor::GetDataType<CudaT>(),
+                                            static_cast<int>(w_dims[0]), static_cast<int>(w_dims[3]),
+                                            static_cast<int>(w_dims[1]), static_cast<int>(w_dims[2])));
         } else {
           ORT_RETURN_IF_ERROR(s_.w_desc.Set(w_dims, CudnnTensor::GetDataType<CudaT>()));
         }
@@ -170,15 +154,11 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
     }
     if (NHWC) {
       ORT_RETURN_IF_ERROR(s_.x_tensor.Set(CUDNN_TENSOR_NHWC, CudnnTensor::GetDataType<CudaT>(),
-                                          static_cast<int>(x_dims[0]),
-                                          static_cast<int>(x_dims[3]),
-                                          static_cast<int>(x_dims[1]),
-                                          static_cast<int>(x_dims[2])));
+                                          static_cast<int>(x_dims[0]), static_cast<int>(x_dims[3]),
+                                          static_cast<int>(x_dims[1]), static_cast<int>(x_dims[2])));
       ORT_RETURN_IF_ERROR(s_.y_tensor.Set(CUDNN_TENSOR_NHWC, CudnnTensor::GetDataType<CudaT>(),
-                                          static_cast<int>(y_dims[0]),
-                                          static_cast<int>(y_dims[3]),
-                                          static_cast<int>(y_dims[1]),
-                                          static_cast<int>(y_dims[2])));
+                                          static_cast<int>(y_dims[0]), static_cast<int>(y_dims[3]),
+                                          static_cast<int>(y_dims[1]), static_cast<int>(y_dims[2])));
     } else {
       ORT_RETURN_IF_ERROR(s_.x_tensor.Set(x_dims, CudnnTensor::GetDataType<CudaT>()));
       ORT_RETURN_IF_ERROR(s_.y_tensor.Set(y_dims, CudnnTensor::GetDataType<CudaT>()));
@@ -186,8 +166,8 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
 
     cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
     ORT_RETURN_IF_ERROR(s_.conv_desc.Set(p.kernel_shape.size(), p.pads, p.strides, p.dilations,
-                                         gsl::narrow_cast<int>(conv_transpose_attrs_.group),
-                                         mode, CudnnTensor::GetDataType<CudaT>()));
+                                         gsl::narrow_cast<int>(conv_transpose_attrs_.group), mode,
+                                         CudnnTensor::GetDataType<CudaT>()));
 
     if (has_bias) {
       const auto& b_shape = p.B->Shape();
@@ -195,8 +175,7 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
       TensorShapeVector b_dims(2 + p.kernel_shape.size());
       b_dims[0] = 1;                      // N
       b_dims[NHWC ? 3 : 1] = b_shape[0];  // C
-      for (size_t i = 0; i < p.kernel_shape.size(); i++)
-        b_dims[(NHWC ? 1 : 2) + i] = 1;
+      for (size_t i = 0; i < p.kernel_shape.size(); i++) b_dims[(NHWC ? 1 : 2) + i] = 1;
 
       ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType<CudaT>(), NHWC));
     }
@@ -204,7 +183,8 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
     y_data = reinterpret_cast<CudaT*>(p.Y->MutableData<T>());
 
    if (!s_.cached_benchmark_results.contains(x_dims)) {
-      IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
+      IAllocatorUniquePtr<void> algo_search_workspace =
+          GetScratchBuffer<void>(AlgoSearchWorkspaceSize, context->GetComputeStream());
 
       // set math type to tensor core before algorithm search
       if constexpr (std::is_same<T, MLFloat16>::value)
@@ -213,19 +193,8 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
 
      cudnnConvolutionBwdDataAlgoPerf_t perf;
       int algo_count = 1;
       CUDNN_RETURN_IF_ERROR(cudnnFindConvolutionBackwardDataAlgorithmEx(
-          GetCudnnHandle(context),
-          s_.w_desc,
-          w_data,
-          s_.x_tensor,
-          x_data,
-          s_.conv_desc,
-          s_.y_tensor,
-          y_data,
-          1,
-          &algo_count,
-          &perf,
-          algo_search_workspace.get(),
-          AlgoSearchWorkspaceSize));
+          GetCudnnHandle(context), s_.w_desc, w_data, s_.x_tensor, x_data, s_.conv_desc, s_.y_tensor, y_data, 1,
+          &algo_count, &perf, algo_search_workspace.get(), AlgoSearchWorkspaceSize));
       s_.cached_benchmark_results.insert(x_dims, {perf.algo, perf.memory, perf.mathType});
     }
@@ -256,26 +225,15 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
 
     IAllocatorUniquePtr<void> workspace = GetScratchBuffer<void>(s_.workspace_bytes, context->GetComputeStream());
 
-    CUDNN_RETURN_IF_ERROR(
-        cudnnConvolutionBackwardData(
-            GetCudnnHandle(context),
-            &alpha,
-            s_.w_desc,
-            w_data,
-            s_.x_tensor,
-            x_data,
-            s_.conv_desc,
-            s_.algo,
-            workspace.get(),
-            s_.workspace_bytes,
-            &beta,
-            s_.y_tensor,
-            y_data));
+    CUDNN_RETURN_IF_ERROR(cudnnConvolutionBackwardData(GetCudnnHandle(context), &alpha, s_.w_desc, w_data, s_.x_tensor,
+                                                       x_data, s_.conv_desc, s_.algo, workspace.get(),
+                                                       s_.workspace_bytes, &beta, s_.y_tensor, y_data));
 
     if (has_bias) {
      const Tensor* B = dynamic_padding ? context->Input<Tensor>(3) : context->Input<Tensor>(2);
       auto b_data = reinterpret_cast<const CudaT*>(B->Data<T>());
-      CUDNN_RETURN_IF_ERROR(cudnnAddTensor(GetCudnnHandle(context), &alpha, s_.b_tensor, b_data, &alpha, s_.y_tensor, y_data));
+      CUDNN_RETURN_IF_ERROR(
+          cudnnAddTensor(GetCudnnHandle(context), &alpha, s_.b_tensor, b_data, &alpha, s_.y_tensor, y_data));
     }
   }
 
diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.h b/onnxruntime/core/providers/cuda/nn/conv_transpose.h
index e553adffbdb2c..77c9d94162b6b 100644
--- a/onnxruntime/core/providers/cuda/nn/conv_transpose.h
+++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.h
@@ -4,6 +4,8 @@
 
 #pragma once
 
+#include <memory>
+
 #include "core/providers/cuda/cuda_common.h"
 #include "core/providers/cuda/cuda_kernel.h"
 #include "core/providers/cuda/cudnn_common.h"
diff --git a/onnxruntime/core/providers/cuda/nn/pool.h b/onnxruntime/core/providers/cuda/nn/pool.h
index 3118ff4089d3a..8b5152a1565a9 100644
--- a/onnxruntime/core/providers/cuda/nn/pool.h
+++ b/onnxruntime/core/providers/cuda/nn/pool.h
@@ -22,7 +22,7 @@ class Pool : public CudaKernel, public PoolBase {
 template <typename T, bool NHWC>
 class Pool<T, MaxPool<8>, NHWC> final : public Pool<T, MaxPool<1>, NHWC> {
  public:
-  Pool(const OpKernelInfo& info) : Pool<T, MaxPool<1>, NHWC>(info) {}
+  explicit Pool(const OpKernelInfo& info) : Pool<T, MaxPool<1>, NHWC>(info) {}
   Status ComputeInternal(OpKernelContext* context) const override;
 };
 
diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc
index 4e5a904fea5b7..1323f21b2eda6 100644
--- a/onnxruntime/test/perftest/ort_test_session.cc
+++ b/onnxruntime/test/perftest/ort_test_session.cc
@@ -144,7 +144,8 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
       }
       auto pos = token.find("|");
       if (pos == std::string::npos || pos == 0 || pos == token.length()) {
-        ORT_THROW("[ERROR] [CUDA] Use a '|' to separate the key and value for the run-time option you are trying to use.\n");
+        ORT_THROW(
+            "[ERROR] [CUDA] Use a '|' to separate the key and value for the run-time option you are trying to use.\n");
       }
 
       auto key = token.substr(0, pos);
diff --git a/onnxruntime/test/providers/compare_provider_test_utils.cc b/onnxruntime/test/providers/compare_provider_test_utils.cc
index 79a34c39f68a0..3ef74259e27b6 100644
--- a/onnxruntime/test/providers/compare_provider_test_utils.cc
+++ b/onnxruntime/test/providers/compare_provider_test_utils.cc
@@ -193,7 +193,8 @@ void CompareOpTester::CompareEPs(const std::shared_ptr<IExecutionProvider>& sour
   // compare
   ASSERT_TRUE(source_fetches.size() == target_fetches.size());
   for (size_t i = 0; i < source_fetches.size(); i++) {
-    auto ret = CompareOrtValue(target_fetches[i], source_fetches[i], per_sample_tolerance, relative_per_sample_tolerance, false);
+    auto ret = CompareOrtValue(target_fetches[i], source_fetches[i], per_sample_tolerance,
+                               relative_per_sample_tolerance, false);
     EXPECT_EQ(ret.first, COMPARE_RESULT::SUCCESS) << ret.second;
   }
 }
diff --git a/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h b/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h
index 677bc1591b57f..2c942bb790096 100644
--- a/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h
+++ b/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h
@@ -3,6 +3,9 @@
 // Licensed under the MIT License.
 
 #include <vector>
+#include <memory>
+#include <string>
+
 #include "core/providers/cuda/cuda_provider_options.h"
 #include "core/providers/common.h"