diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc
index e3bf85c66d821..60cdfb823e1fc 100644
--- a/onnxruntime/core/providers/cuda/nn/conv.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv.cc
@@ -378,6 +378,7 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
     auto handle = GetCudnnHandle(context);
 
     int cudnn_conv_algo = cuda_ep->GetCudnnConvAlgo();
+#if !defined(__CUDACC__)
     cudnn_frontend::HeurMode_t heur_mode;
     switch (cudnn_conv_algo) {
       case 0:
@@ -396,7 +397,6 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
 
     size_t kernel_shape_size = kernel_shape.size();
 
-#if !defined(__CUDACC__)
     ORT_RETURN_IF_ERROR(CreateCudnnFeExecutionPlan(X, W, B, y_dims_cudnn, handle, heur_mode,
                                                    std::vector<int64_t>(pads.begin(),
                                                                         pads.begin() + kernel_shape_size),
diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h
index 0e2bfb1560f67..f2bde8bea4b1c 100644
--- a/onnxruntime/core/providers/cuda/nn/conv.h
+++ b/onnxruntime/core/providers/cuda/nn/conv.h
@@ -7,11 +7,14 @@
 #include <list>
 #include <memory>
 
+#if !defined(__CUDACC__)
+#include <cudnn_frontend.h>
+#endif
+
 #include "core/platform/ort_mutex.h"
 #include "core/providers/cuda/cuda_kernel.h"
 #include "core/providers/cuda/cudnn_common.h"
 #include "core/providers/cpu/nn/conv_attributes.h"
-#include <cudnn_frontend.h>
 
 namespace onnxruntime {
 
@@ -223,8 +226,8 @@ class Conv : public CudaKernel {
   Status UpdateState(OpKernelContext* context, bool bias_expected = false) const;
 
 #if !defined(__CUDACC__)
-  Status CreateCudnnFeExecutionPlan(const Tensor* X, const Tensor* W, const Tensor* B, cudnnContext* handle, const cudnn_frontend::HeurMode_t heur_mode,
-                                    const std::vector<int64_t>& pads, const std::vector<int64_t>& strides, const std::vector<int64_t>& dilations, const bool bias_expected, const bool fuse_bias) const;
+  Status CreateCudnnFeExecutionPlan(const Tensor* X, const Tensor* W, const Tensor* B, const TensorShapeVector& y_dims, cudnnContext* handle, const cudnn_frontend::HeurMode_t heur_mode,
+                                    const std::vector<int64_t>& pads, const std::vector<int64_t>& strides, const std::vector<int64_t>& dilations, const bool bias_expected, const bool fuse_bias) const;
 #endif
 
   ConvAttributes conv_attrs_;
diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc
index 0efa78af2795c..dede278b7274f 100644
--- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc
@@ -59,8 +59,6 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes,
   std::unordered_set<std::string> excluded_providers(attributes.excluded_providers);
   // Disable TensorRT because weight as input is not supported
   excluded_providers.insert(kTensorrtExecutionProvider);
-  // Disable CUDA NHWC execution provider as it is currently flaky
-  excluded_providers.insert(kCudaNHWCExecutionProvider);
 
   // QNN SDK 2.10.0 has a bug that breaks support for dynamic bias inputs.
   excluded_providers.insert(kQnnExecutionProvider);