diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index b0ed68d595c42..1eaf0fb6dad76 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -734,7 +734,8 @@ Do not modify directly.*
 |||13|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br> **shape** = tensor(int64)|
 |||[5, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br> **shape** = tensor(int64)|
 |||[1, 4]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|Resize|*in* X:**T**<br> *in* scales:**tensor(float)**<br> *out* Y:**T**<br><br>or<br><br>*in* X:**T1**<br> *in* roi:**T2**<br> *in* scales:**tensor(float)**<br> *in* sizes:**tensor(int64)**<br> *out* Y:**T1**|13+|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
+|Resize|*in* X:**T**<br> *in* scales:**tensor(float)**<br> *out* Y:**T**<br><br>or<br><br>*in* X:**T1**<br> *in* roi:**T2**<br> *in* scales:**tensor(float)**<br> *in* sizes:**tensor(int64)**<br> *out* Y:**T1**|18+|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
+|||[13, 17]|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
 |||[11, 12]|**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
 |||10|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
 |ReverseSequence|*in* input:**T**<br> *in* sequence_lens:**tensor(int64)**<br>
*out* Y:**T**|10+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index 48e4617b33b4d..37e7e42150413 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -2008,8 +2008,10 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { Greater)>, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo namespace onnxruntime { // The suppressed warning is: "The type with a virtual function needs either public virtual or protected nonvirtual destructor." @@ -292,6 +294,12 @@ struct ProviderHostCPUImpl : ProviderHostCPU { Status Sampling__Compute(const contrib::transformers::Sampling* p, OpKernelContext* ctx) override { return p->contrib::transformers::Sampling::Compute(ctx); } Status Sampling__SetupSubgraphExecutionInfo(contrib::transformers::Sampling* p, const SessionState& session_state, const std::string& attribute_name, const SessionState& subgraph_session_state) override { return p->contrib::transformers::Sampling::SetupSubgraphExecutionInfo(session_state, attribute_name, subgraph_session_state); } + void UpsampleBase__AdjustOutputSizeAsPolicy(const UpsampleBase* p, TensorShapeVector& output_dims, + gsl::span input_dims, + InlinedVector& scales) const override { + p->AdjustOutputSizeAsPolicy(output_dims, input_dims, scales); + } + #ifdef ENABLE_ATEN Status ATen__Compute(const contrib::ATen* p, OpKernelContext* p_ctx) override { return p->ATen::Compute(p_ctx); } #endif diff --git a/onnxruntime/core/providers/cpu/cpu_provider_shared.h b/onnxruntime/core/providers/cpu/cpu_provider_shared.h index f33eec4b93e98..c0e674827e4d1 100644 --- a/onnxruntime/core/providers/cpu/cpu_provider_shared.h +++ b/onnxruntime/core/providers/cpu/cpu_provider_shared.h @@ -24,6 +24,7 @@ class SliceOp__PrepareForComputeMetadata; // Directly maps to SliceOp::PrepareF class UnsqueezeBase__Prepare; // Directly maps to UnsqueezeBase::Prepare class contrib__AdamWOptimizerBase__Prepare; class contrib__SGDOptimizerV2Base__Prepare; +class UpsampleBase; using PadsVector = InlinedVector; @@ -202,6 +203,10 @@ struct ProviderHostCPU { virtual Status Sampling__Compute(const contrib::transformers::Sampling* p, OpKernelContext* ctx) = 0; virtual Status Sampling__SetupSubgraphExecutionInfo(contrib::transformers::Sampling* p, const SessionState& session_state, const std::string& attribute_name, const SessionState& subgraph_session_state) = 0; + virtual void UpsampleBase__AdjustOutputSizeAsPolicy(const UpsampleBase* p, TensorShapeVector& output_dims, + gsl::span input_dims, + InlinedVector& scales) const = 0; + #ifdef ENABLE_ATEN virtual Status ATen__Compute(const contrib::ATen* p, OpKernelContext* p_ctx) = 0; #endif diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.cc b/onnxruntime/core/providers/cpu/tensor/upsample.cc index fa69e144be554..babbac0b7be17 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsample.cc +++ b/onnxruntime/core/providers/cpu/tensor/upsample.cc @@ -1,10 +1,15 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#include "core/providers/cpu/tensor/upsample.h" + +#include + +#include "core/common/inlined_containers.h" #include "core/common/safeint.h" #include "core/platform/threadpool.h" -#include "core/providers/cpu/tensor/upsample.h" #include "core/providers/cpu/tensor/upsample_antialias.h" + using namespace onnxruntime::common; using namespace std; using onnxruntime::narrow; @@ -30,6 +35,46 @@ REGISTER_VERSIONED_TYPED_KERNEL(int32_t, 9, 9); REGISTER_VERSIONED_TYPED_KERNEL(int8_t, 9, 9); REGISTER_VERSIONED_TYPED_KERNEL(uint8_t, 9, 9); +void UpsampleBase::AdjustOutputSizeAsPolicy(TensorShapeVector& output_dims, gsl::span input_dims, + InlinedVector& scales) const { + // AspectRatioPolicy::STRETCH is default policy when opset < 18 + if (keep_aspect_ratio_policy_ == AspectRatioPolicy::STRETCH) { + return; + } + + InlinedHashSet axes_set(axes_.begin(), axes_.end()); + + float scale_in_policy = 0.0f; + if (keep_aspect_ratio_policy_ == AspectRatioPolicy ::NOT_LARGER) { + scale_in_policy = std::numeric_limits::max(); + + for (size_t i = 0; i < scales.size(); i++) { + if (axes_set.empty() || axes_set.count(i) > 0) { + scale_in_policy = std::min(scale_in_policy, scales[i]); + } + } + } else if (keep_aspect_ratio_policy_ == AspectRatioPolicy ::NOT_SMALLER) { + scale_in_policy = std::numeric_limits::min(); + + for (size_t i = 0; i < scales.size(); i++) { + if (axes_set.empty() || axes_set.count(i) > 0) { + scale_in_policy = std::max(scale_in_policy, scales[i]); + } + } + } + + for (size_t i = 0; i < scales.size(); i++) { + // if axes is not specified (AKA axes_set.empty()), we apply the policy to all axes + if (axes_set.empty() || axes_set.count(i) > 0) { + scales[i] = scale_in_policy; + output_dims[i] = static_cast(std::round(scales[i] * input_dims[i])); + } else { + scales[i] = 1.0f; + output_dims[i] = input_dims[i]; + } + } +} + template void UpsampleNearest2x(int64_t batch_size, int64_t num_channels, @@ -94,8 +139,8 @@ UpsampleNearestSetupInputMappings(int64_t n_dim, const TensorShape& input_shape, const TensorShape& output_shape, const std::vector& input_dim_factor, - const vector& scales, - const vector& roi, + gsl::span scales, + gsl::span roi, bool extrapolation_enabled, const GetOriginalCoordinateFunc& get_original_coordinate, const GetNearestPixelFunc& get_nearest_pixel) { @@ -141,8 +186,8 @@ static Status UpsampleNearestImpl(const T* input, T* output, const TensorShape& input_shape, const TensorShape& output_shape, - const vector& scales, - const vector& roi, + gsl::span scales, + gsl::span roi, bool extrapolation_enabled, const T extrapolation_value, const GetOriginalCoordinateFunc& get_original_coordinate, @@ -285,8 +330,8 @@ static Status UpsampleNearest(const T* input, T* output, const TensorShape& input_shape, const TensorShape& output_shape, - const vector& scales, - const vector& roi, + gsl::span scales, + gsl::span roi, bool is_resize, bool extrapolation_enabled, T extrapolation_value, @@ -412,7 +457,7 @@ BilinearParams SetupUpsampleBilinear(const int32_t input_height, const int32_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, AllocatorPtr& alloc, const GetOriginalCoordinateFunc& get_original_coordinate, const bool is_nchw) { @@ -518,7 +563,7 @@ BilinearParamsInteger SetupUpsampleBilinearInteger(const int32_t input_height, const int32_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, AllocatorPtr& alloc, const GetOriginalCoordinateFunc& get_original_coordinate, const 
bool is_nchw) { @@ -650,7 +695,7 @@ static TrilinearParams SetupUpsampleTrilinear(int64_t input_depth, float depth_scale, float height_scale, float width_scale, - const std::vector& roi, + gsl::span roi, AllocatorPtr& alloc, const GetOriginalCoordinateFunc& get_original_coordinate) { TrilinearParams p; @@ -796,7 +841,7 @@ void UpsampleTrilinear(int64_t batch_size, float depth_scale, float height_scale, float width_scale, - const std::vector& roi, + gsl::span roi, bool use_extrapolation, float extrapolation_value, const T* XdataBase, @@ -929,7 +974,7 @@ void ResizeBiCubic(int64_t batch_size, bool use_extrapolation, float extrapolation_value, bool exclude_outside, - const std::vector& roi, + gsl::span roi, const T* Xdata, T* Ydata, const GetOriginalCoordinateFunc& get_original_coordinate) { @@ -1067,9 +1112,9 @@ void ResizeBiCubic(int64_t batch_size, template Status Upsample::BaseCompute(OpKernelContext* context, - const std::vector& roi, - const std::vector& scales, - const gsl::span& output_dims) const { + gsl::span roi, + gsl::span scales, + gsl::span output_dims) const { const auto* X = context->Input(0); auto dims = X->Shape().GetDims(); ORT_RETURN_IF_NOT(output_dims.size() == dims.size(), "Rank of input and output tensor should be same."); @@ -1327,7 +1372,7 @@ Status Upsample::Compute(OpKernelContext* context) const { // Initialize the roi array to all zeros as this will be the most common case // Roi data is needed only when coordinate transformation mode is set to tf_crop_and_resize // for all other cases we need a 0 initialized roi array - std::vector roi_array(roi_); + InlinedVector roi_array(roi_); if (!roi_cached_) { bool use_default_roi = true; @@ -1353,7 +1398,7 @@ Status Upsample::Compute(OpKernelContext* context) const { ComputeROIWithAxes(roi_array, input_dims.size()); // Get scales data - std::vector scales_array(input_dims.size()); + InlinedVector scales_array(input_dims.size()); if (OpKernel::Node().InputDefs().size() == 1) { // Compute output shape from scales and input dims diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.h b/onnxruntime/core/providers/cpu/tensor/upsample.h index 3046ee4b8260d..8ff04781f6ad0 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsample.h +++ b/onnxruntime/core/providers/cpu/tensor/upsample.h @@ -66,8 +66,8 @@ class Upsample : public UpsampleBase, public OpKernel { Status Compute(OpKernelContext* context) const override; - Status BaseCompute(OpKernelContext* context, const std::vector& roi, const std::vector& scales, - const gsl::span& output_dims) const; + Status BaseCompute(OpKernelContext* context, gsl::span roi, gsl::span scales, + gsl::span output_dims) const; }; BilinearParams SetupUpsampleBilinear(const int32_t input_height, @@ -76,7 +76,7 @@ BilinearParams SetupUpsampleBilinear(const int32_t input_height, const int32_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, AllocatorPtr& alloc, const GetOriginalCoordinateFunc& get_original_coordinate, const bool is_nchw); @@ -90,7 +90,7 @@ void UpsampleBilinear(const int32_t batch_size, const int32_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, const bool use_extrapolation, const float extrapolation_value, const T* const XdataBase, @@ -144,7 +144,7 @@ void NhwcUpsampleBilinear(const int32_t batch_size, const int32_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, const float extrapolation_value, 
const T* const XdataBase, T* const YdataBase, @@ -227,7 +227,7 @@ BilinearParamsInteger SetupUpsampleBilinearInteger(const int32_t input_height, const int32_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, AllocatorPtr& alloc, const GetOriginalCoordinateFunc& get_original_coordinate, const bool is_nchw); @@ -241,7 +241,7 @@ void NhwcUpsampleBilinearInteger(const int32_t batch_size, const int32_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, const float extrapolation_value, const T* const XdataBase, T* const YdataBase, diff --git a/onnxruntime/core/providers/cpu/tensor/upsample_antialias.h b/onnxruntime/core/providers/cpu/tensor/upsample_antialias.h index e1dcaf500a325..1e32b7e874b1a 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsample_antialias.h +++ b/onnxruntime/core/providers/cpu/tensor/upsample_antialias.h @@ -21,32 +21,6 @@ namespace onnxruntime { -namespace ConstValue { -constexpr int32_t mag_factor = 1 << (22 - 1); -} - -namespace { -const uint8_t* GetLookupTableShared() { - // initialized once - static const auto* lookup_table = []() { - // if we have already initialized the lookup table, just return - // ideally we could have a global lookup table, but that account for too much space. - /* Handles values form -640 to 639. */ - static uint8_t table[1280] = {0}; - - // taken from https://github.com/python-pillow/Pillow/blob/66add095a50d76c35c7f58643461f2edf78a3f05/src/libImaging/Resample.c#L94 - // we need to handle negative values - // it's equivalent to :x = np.clip(x, 0, 255) where x \in [-640, 639] - // we will accept a negative x for (&table[640])[x] means table +640 -x - for (int i = 0; i < 1280; ++i) { - table[i] = static_cast(std::min(std::max(i - 640, 0), 255)); - } - return table; - }(); - return lookup_table; -} -} // namespace - template struct FilterParamsBaseAntiAlias { std::vector bound; @@ -57,15 +31,15 @@ struct FilterParamsBaseAntiAlias { template struct FilterParamsAntiAlias { - float support_size = 2.0f; - float cubic_coeff_a = -0.75f; + float support_size = antialias_constants::kSupportSize; + float cubic_coeff_a = antialias_constants::kCubicCoeffA; FilterParamsBaseAntiAlias dim_x; FilterParamsBaseAntiAlias dim_y; FilterParamsBaseAntiAlias dim_z; const uint8_t* GetClip8LookupTable() const { - return GetLookupTableShared(); + return UpsampleBase::GetLookupTableShared(); } virtual ~FilterParamsAntiAlias() = default; virtual float Filter(float x) const = 0; @@ -89,7 +63,7 @@ struct BilinearParamsAntiAlias : FilterParamsAntiAlias { template struct BiCubicParamsAntiAlias : FilterParamsAntiAlias { BiCubicParamsAntiAlias() { - this->support_size = 4.0f; + this->support_size = antialias_constants::kBiCubicSupportSize; } // taken from @@ -124,27 +98,6 @@ struct TriLinearParamsAntiAlias : FilterParamsAntiAlias { } }; -template -struct AccumulateType { - using type = int32_t; - using Dtype = T; -}; - -template <> -struct AccumulateType { - using type = float; -}; - -template <> -struct AccumulateType { - using type = float; -}; - -template <> -struct AccumulateType { - using type = double; -}; - // The following method supports a 3/4/5-D input in 'Linear mode, cubic mode' // that amounts to 'Bilinear,TriLinear, Bicubic/Tricubic' Upsampling/Resizing in the sense that it assumes // A N-D tensor has @@ -156,19 +109,20 @@ struct AccumulateType { // - [N, H, W, C] and the scales are [1.0, height_scale, width_scale, 1.0] template void 
SetupUpsampleFilterAntiAlias(FilterParamsAntiAlias& p, - const gsl::span input_h_w_c, - const gsl::span output_h_w_c, - const gsl::span scale_h_w_c, - const std::vector& roi, + gsl::span input_h_w_c, + gsl::span output_h_w_c, + gsl::span scale_h_w_c, + gsl::span roi, AllocatorPtr& alloc, const GetOriginalCoordinateFunc& get_original_coordinate, bool exclude_outside, const bool is_nchw) { - auto compute_weight_coefficients = [&alloc, &roi, &get_original_coordinate, exclude_outside](const FilterParamsAntiAlias& p, - const int64_t input_size, - const int64_t output_size, - size_t rindex, - FilterParamsBaseAntiAlias& param_base, - const float rscale) -> int64_t { + auto compute_weight_coefficients = [&alloc, roi, &get_original_coordinate, exclude_outside]( + const FilterParamsAntiAlias& p, + const int64_t input_size, + const int64_t output_size, + size_t rindex, + FilterParamsBaseAntiAlias& param_base, + const float rscale) -> int64_t { param_base.bound.reserve(static_cast(output_size) * 2); param_base.out_of_bound_idx.reserve(static_cast(output_size)); @@ -245,13 +199,14 @@ void SetupUpsampleFilterAntiAlias(FilterParamsAntiAlias& p, // normalize the scale to 1 << 22 for int8/uint8 if constexpr (std::is_same::value) { - scale_buffer_int[x] = static_cast(std::round(scale_buffer[x] * ConstValue::mag_factor * 2.f)); + scale_buffer_int[x] = static_cast(std::round(scale_buffer[x] * ConstValue::mag_factor_x_2)); } } /*for (; x < window_size; x++) { scale_buffer[x] = 0; }*/ } + return window_size; }; @@ -269,9 +224,6 @@ void SetupUpsampleFilterAntiAlias(FilterParamsAntiAlias& p, } } -template -inline constexpr bool is_8bit_v = std::is_same::value || std::is_same::value; - /** * @brief To compute interpolation along with the last axis. * For brief,we assume the input tensor has 3 dimensions and we all it CHW for each character represent a dim. @@ -398,6 +350,7 @@ void ComputeInterpolationAtLevel2(int64_t num_channels, int64_t input_height, in output += *Xdata_offset * (*weight_coeff_start++); Xdata_offset += output_width; } + if constexpr (is_8bit_v) { *Ydata_offset++ = static_cast(clip8_lookups[output >> 22]); } else if constexpr (std::is_same::value) { @@ -444,6 +397,7 @@ void ComputeInterpolationAtLevel2(int64_t num_channels, int64_t input_height, in output += *Xdata_offset * (*weight_coeff_start++); Xdata_offset += output_width; } + if constexpr (is_8bit_v) { *Ydata_offset++ = static_cast(clip8_lookups[output >> 22]); } else if constexpr (std::is_same::value) { @@ -515,6 +469,7 @@ void UpsampleBaseAntiAlias(FilterParamsAntiAlias& p, narrow(input_height * num_channels * input_width)); auto ydata_span = gsl::make_span(image_temp_buffer.get(), narrow(input_height * num_channels * output_width)); + // This computes only the width direction.Thus height keeps unchanged. 
ComputeInterpolationAtLevel1(num_channels, input_height, input_width, input_height, output_width, xdata_span, ydata_span, p, p.dim_x, tp); } @@ -546,7 +501,7 @@ void UpsampleBilinearAntiAlias(const int64_t batch_size, const int64_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, const bool use_extrapolation, const float extrapolation_value, bool exclude_outside, @@ -575,7 +530,7 @@ void NhwcUpsampleBilinearAntiAlias(const int64_t batch_size, const int64_t output_width, const float height_scale, const float width_scale, - const std::vector& roi, + gsl::span roi, const bool use_extrapolation, const float extrapolation_value, bool exclude_outside, @@ -608,7 +563,7 @@ void NhwcResizeBiCubicAntiAlias(const int64_t batch_size, bool use_extrapolation, float extrapolation_value, bool exclude_outside, - const std::vector& roi, + gsl::span roi, const Tensor* X, T* Ydata_base, AllocatorPtr& alloc, @@ -688,7 +643,7 @@ void ResizeBiCubicAntiAlias(int64_t batch_size, bool use_extrapolation, float extrapolation_value, bool exclude_outside, - const std::vector& roi, + gsl::span roi, const Tensor* X, T* Ydata_base, AllocatorPtr& alloc, @@ -719,7 +674,7 @@ void UpsampleTrilinearAntiAlias(int64_t batch_size, float depth_scale, float height_scale, float width_scale, - const std::vector& roi, + gsl::span roi, bool use_extrapolation, float extrapolation_value, bool exclude_outside, diff --git a/onnxruntime/core/providers/cpu/tensor/upsamplebase.h b/onnxruntime/core/providers/cpu/tensor/upsamplebase.h index a0e7ca1084fef..b768fedd8513a 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsamplebase.h +++ b/onnxruntime/core/providers/cpu/tensor/upsamplebase.h @@ -3,11 +3,13 @@ #pragma once +#include #include #include #include #include -#include + +#include #include "core/common/status.h" #include #include @@ -58,7 +60,73 @@ enum class AspectRatioPolicy { NOT_SMALLER, }; +// Antialias types +template +struct AccumulateType { + using type = int32_t; + using Dtype = T; +}; + +template <> +struct AccumulateType { + using type = float; +}; + +template <> +struct AccumulateType { + using type = float; +}; + +template <> +struct AccumulateType { + using type = float; +}; + +template <> +struct AccumulateType { + using type = double; +}; + +namespace antialias_constants { +constexpr float kCubicCoeffA = -0.75f; +constexpr float kSupportSize = 2.0f; +constexpr float kBiCubicSupportSize = 4.0f; +} // namespace antialias_constants + +namespace ConstValue { +constexpr int32_t mag_factor = 1 << (22 - 1); +// We use to multiply by 2, let's make a constant which is twice as big +constexpr int32_t mag_factor_x_2 = 1 << 22; +} // namespace ConstValue + +template +inline constexpr bool is_8bit_v = std::is_same::value || std::is_same::value; + +template +void PrintAntiAliasBuffers(std::ostream& os, gsl::span bounds, gsl::span out_of_bounds, + gsl::span weight_coefficients) { + os << "#### Bounds: "; + std::copy(bounds.begin(), bounds.end(), std::ostream_iterator(os, " ")); + os << std::endl; + + os << "#### Out of Bounds: "; + std::copy(out_of_bounds.begin(), out_of_bounds.end(), + std::ostream_iterator(os, " ")); + os << std::endl; + + os << "#### Scale Buffer: "; + std::copy(weight_coefficients.begin(), weight_coefficients.end(), + std::ostream_iterator(os, " ")); + os << std::endl; +} + class UpsampleBase { + public: + // Make this available in other EP via provider bridge + // it works iff output_shape is specified + void AdjustOutputSizeAsPolicy(TensorShapeVector& 
output_dims, gsl::span input_dims, + InlinedVector& scales) const; + protected: explicit UpsampleBase(const OpKernelInfo& info) : scales_cached_(false), roi_cached_(false), use_extrapolation_(false) { @@ -69,23 +137,32 @@ class UpsampleBase { std::string mode; ORT_ENFORCE(info.GetAttr("mode", &mode).IsOK()); mode_ = StringToUpsampleMode(mode); - antialias_ = info.GetAttrOrDefault("antialias", 0) == 0 ? false : true; - if (antialias_) { - ORT_ENFORCE((UpsampleMode::LINEAR == mode_ || UpsampleMode::CUBIC == mode_), - "when anti-aliasing is set, Resize only supports mode `LINEAR` and `CUBIC`."); - } auto input_count = info.GetInputCount(); if (input_count == 1) { // opset < 10 - ORT_THROW_IF_ERROR(info.GetAttrs("scales", scales_)); - ORT_THROW_IF_ERROR(ScalesValidation(scales_, mode_)); + std::vector scales; + ORT_THROW_IF_ERROR(info.GetAttrs("scales", scales)); + ORT_THROW_IF_ERROR(ScalesValidation(scales, mode_)); + scales_.assign(scales.cbegin(), scales.cend()); scales_cached_ = true; } - std::string keep_aspect_ratio_policy = info.GetAttrOrDefault("keep_aspect_ratio_policy", "stretch"); - keep_aspect_ratio_policy_ = StringToKeepAspectRatioPolicy(keep_aspect_ratio_policy); + if (opset >= 18) { + antialias_ = info.GetAttrOrDefault("antialias", 0) == 0 ? false : true; + + if (antialias_) { + ORT_ENFORCE((UpsampleMode::LINEAR == mode_ || UpsampleMode::CUBIC == mode_), + "when anti-aliasing is set, Resize only supports mode `LINEAR` and `CUBIC`."); + } - axes_ = info.GetAttrsOrDefault("axes"); + // The attribute is absent in opset < 18, but the default value as if stretch. + std::string keep_aspect_ratio_policy = info.GetAttrOrDefault("keep_aspect_ratio_policy", "stretch"); + keep_aspect_ratio_policy_ = StringToKeepAspectRatioPolicy(keep_aspect_ratio_policy); + + // guard against unit tests that can add an attribute + auto axes = info.GetAttrsOrDefault("axes"); + axes_.assign(axes.cbegin(), axes.cend()); + } extrapolation_value_ = info.GetAttrOrDefault("extrapolation_value", 0.0f); @@ -112,7 +189,7 @@ class UpsampleBase { nearest_mode_ = StringToNearestMode(nearest_mode_name); get_nearest_pixel_ = GetNearestPixelFromOriginal(nearest_mode_); - cubic_coeff_a_ = info.GetAttrOrDefault("cubic_coeff_a", -0.75f); + cubic_coeff_a_ = info.GetAttrOrDefault("cubic_coeff_a", antialias_constants::kCubicCoeffA); exclude_outside_ = info.GetAttrOrDefault("exclude_outside", 0) == 0 ? 
false : true; if ((exclude_outside_ == 1 && mode_ != CUBIC) && (antialias_ == false || mode_ != LINEAR)) { @@ -166,7 +243,7 @@ class UpsampleBase { ResizeCoordinateTransformationMode coordinate_transform_mode_; GetOriginalCoordinateFunc get_original_coordinate_; ResizeNearestMode nearest_mode_; - AspectRatioPolicy keep_aspect_ratio_policy_; + AspectRatioPolicy keep_aspect_ratio_policy_{AspectRatioPolicy::STRETCH}; GetNearestPixelFunc get_nearest_pixel_; float cubic_coeff_a_; bool exclude_outside_; @@ -174,9 +251,9 @@ class UpsampleBase { float extrapolation_value_; bool use_nearest2x_optimization_ = false; - std::vector scales_; - std::vector roi_; - std::vector axes_; + InlinedVector scales_; + InlinedVector roi_; + TensorShapeVector axes_; bool scales_cached_; bool roi_cached_; @@ -335,7 +412,7 @@ class UpsampleBase { } } - [[nodiscard]] Status ScalesValidation(const std::vector& scales, const UpsampleMode mode) const { + [[nodiscard]] Status ScalesValidation(gsl::span scales, const UpsampleMode mode) const { if (!is_resize_) { for (auto& scale : scales) { ORT_RETURN_IF_NOT(scale >= 1, "Scale value should be greater than or equal to 1."); @@ -372,7 +449,7 @@ class UpsampleBase { } [[nodiscard]] Status - ParseScalesData(const Tensor* scale, std::vector& scales, int64_t rank) const { + ParseScalesData(const Tensor* scale, InlinedVector& scales, int64_t rank) const { const auto* scale_data = scale->Data(); int64_t scales_size = scale->Shape().Size(); ORT_RETURN_IF_NOT(scales_size > 0, "scales size should be greater than 0."); @@ -387,19 +464,19 @@ class UpsampleBase { // in which case the other axes is ignored and use default scale of 1 // scales_size == axes_.size() should be guaranteed if axes is not empty if (rank > 0 && (scales_size != rank || axes_.size())) { - std::vector new_scales(size_t(rank), 1.0f); + InlinedVector new_scales(size_t(rank), 1.0f); ORT_RETURN_IF_NOT(*std::max_element(axes_.begin(), axes_.end()) < rank && (int64_t(axes_.size()) == scales_size), "all values in axes should be less than rank of the data"); for (size_t i = 0; i < axes_.size(); i++) { new_scales[static_cast(axes_[i])] = scales[i]; } - scales = new_scales; + scales.swap(new_scales); } return ScalesValidation(scales, mode_); } - void ParseRoiData(const Tensor* roi, std::vector& roi_array) const { + void ParseRoiData(const Tensor* roi, InlinedVector& roi_array) const { int64_t roi_size = roi->Shape().Size(); if (roi_size > 0) { roi_array.resize(onnxruntime::narrow(roi_size)); @@ -429,52 +506,11 @@ class UpsampleBase { return Status::OK(); } - // it works iff output_shape is specified - void AdjustOutputSizeAsPolicy(TensorShapeVector& output_dims, gsl::span input_dims, - std::vector& scales) const { - std::unordered_set axes_set(axes_.begin(), axes_.end()); - - // AspectRatioPolicy::STRETCH is default policy when opset < 18 - if (keep_aspect_ratio_policy_ == AspectRatioPolicy ::STRETCH) { - return; - } - - float scale_in_policy = 0.0f; - if (keep_aspect_ratio_policy_ == AspectRatioPolicy ::NOT_LARGER) { - scale_in_policy = std::numeric_limits::max(); - - for (size_t i = 0; i < scales.size(); i++) { - if (axes_set.empty() || axes_set.count(i) > 0) { - scale_in_policy = std::min(scale_in_policy, scales[i]); - } - } - } else if (keep_aspect_ratio_policy_ == AspectRatioPolicy ::NOT_SMALLER) { - scale_in_policy = std::numeric_limits::min(); - - for (size_t i = 0; i < scales.size(); i++) { - if (axes_set.empty() || axes_set.count(i) > 0) { - scale_in_policy = std::max(scale_in_policy, scales[i]); - } - } - } - - 
for (size_t i = 0; i < scales.size(); i++) { - // if axes is not specified (AKA axes_set.empty()), we apply the policy to all axes - if (axes_set.empty() || axes_set.count(i) > 0) { - scales[i] = scale_in_policy; - output_dims[i] = static_cast(std::round(scales[i] * input_dims[i])); - } else { - scales[i] = 1.0f; - output_dims[i] = input_dims[i]; - } - } - } - // It's different in Opset 18 and before. // we will modify output_shape by sorts of policy even if it's specified [[nodiscard]] Status ParseScalesDataAndAdjustOutputSize(TensorShapeVector& output_dims, gsl::span input_dims, - std::vector& scales) const { + InlinedVector& scales) const { for (size_t i = 0, end = input_dims.size(); i < end; ++i) { // Handle corner case to avoid dividing by zero in the next step if (input_dims[i] == 0) { @@ -507,9 +543,9 @@ class UpsampleBase { // Roi is redefined in Opset-18, we have a concept of axes. // So we need to update it accordingly. - void ComputeROIWithAxes(std::vector& roi_array, size_t rank) const { + void ComputeROIWithAxes(InlinedVector& roi_array, size_t rank) const { if (axes_.size()) { - std::vector roi_tmp(rank * 2, 0); + InlinedVector roi_tmp(rank * 2, 0); for (size_t i = rank; i < rank * 2; ++i) { roi_tmp[i] = 1; } @@ -518,9 +554,32 @@ class UpsampleBase { roi_tmp[v_in_axes] = (roi_array[i]); roi_tmp[rank + v_in_axes] = (roi_array[axes_.size() + i]); } - roi_array = roi_tmp; + roi_array.swap(roi_tmp); } } + + public: + static constexpr size_t kLookupTableSize = 1280; + + static const uint8_t* GetLookupTableShared() { + // initialized once + static const auto* lookup_table = []() { + // if we have already initialized the lookup table, just return + // ideally we could have a global lookup table, but that account for too much space. + /* Handles values form -640 to 639. 
*/ + static uint8_t table[kLookupTableSize] = {0}; + + // taken from https://github.com/python-pillow/Pillow/blob/66add095a50d76c35c7f58643461f2edf78a3f05/src/libImaging/Resample.c#L94 + // we need to handle negative values + // it's equivalent to :x = np.clip(x, 0, 255) where x \in [-640, 639] + // we will accept a negative x for (&table[640])[x] means table +640 -x + for (int i = 0; i < static_cast(kLookupTableSize); ++i) { + table[i] = static_cast(std::min(std::max(i - 640, 0), 255)); + } + return table; + }(); + return lookup_table; + } }; // UpsampleBase } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh index 0d9928baa86e0..66794f88d8670 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh @@ -194,13 +194,13 @@ template <> __device__ __inline__ half _Ceil(half a) { return half(ceilf((float)a)); } template -__device__ __inline__ T _Floor(T a); +__device__ __host__ __inline__ T _Floor(T a); template <> -__device__ __inline__ float _Floor(float a) { return floorf(a); } +__device__ __host__ __inline__ float _Floor(float a) { return floorf(a); } template <> -__device__ __inline__ double _Floor(double a) { return floor(a); } +__device__ __host__ __inline__ double _Floor(double a) { return floor(a); } template <> __device__ __inline__ half _Floor(half a) { return half(floorf((float)a)); } @@ -230,13 +230,13 @@ template <> __device__ __inline__ half _Erf(half a) { return half(erff((float)a)); } template -__device__ __inline__ T _Round(T a); +__device__ __host__ __inline__ T _Round(T a); template <> -__device__ __inline__ float _Round(float a) { return rintf(a); } +__device__ __host__ __inline__ float _Round(float a) { return rintf(a); } template <> -__device__ __inline__ double _Round(double a) { return rint(a); } +__device__ __host__ __inline__ double _Round(double a) { return rint(a); } template <> __device__ __inline__ half _Round(half a) { diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 00783bcbc2665..1ce089fd93044 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1109,11 +1109,11 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, MLFloat16, ReduceSumSquare); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int64_t, GatherND); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Dropout); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int32_t, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, uint8_t, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 17, float, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 17, double, Resize); +class 
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 17, MLFloat16, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 17, int32_t, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 17, uint8_t, Resize); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 18, If); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 18, Loop); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Flatten); @@ -1277,6 +1277,11 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, double, Pad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, MLFloat16, Pad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, bool, Pad); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, float, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, double, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, MLFloat16, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, int32_t, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, uint8_t, Resize); // Opset 19 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, float, Cast); @@ -2009,11 +2014,11 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2176,6 +2181,11 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // Opset 19 BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/tensor/resize.cc b/onnxruntime/core/providers/cuda/tensor/resize.cc index 764172a8d1fac..97d4eb71e970a 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize.cc +++ b/onnxruntime/core/providers/cuda/tensor/resize.cc @@ -28,10 +28,22 @@ namespace cuda { .InputMemoryType(OrtMemTypeCPUInput, 3) \ .TypeConstraint("T1", DataTypeImpl::GetTensorType()), \ Resize); \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + Resize, \ + kOnnxDomain, \ + 13, 17, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .InputMemoryType(OrtMemTypeCPUInput, 1) \ + .InputMemoryType(OrtMemTypeCPUInput, 2) \ + .InputMemoryType(OrtMemTypeCPUInput, 3) \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()), \ + Resize); \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ Resize, \ kOnnxDomain, \ - 13, \ + 18, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()) \ diff --git a/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu new file mode 100644 index 
0000000000000..56b7c3f499303 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu @@ -0,0 +1,1179 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/cuda/cu_inc/common.cuh" +#include "core/providers/cuda/tensor/resize_impl.h" + +#define FUNC_DEF __device__ + +namespace onnxruntime { +namespace cuda { + +using onnxruntime::ResizeCoordinateTransformationMode; +using onnxruntime::UpsampleMode; + +/// +/// Compute a buffer for bilinear data for CUDA antialias resizing. +/// +static std::tuple ComputeBilinearScaleBufferSize( + int64_t output_height, int64_t output_width, + float height_rscale, float width_rscale, + float support_value, + float& scaled_support_height, float& scaled_support_width, + int32_t& window_size_height, int32_t& window_size_width) { + scaled_support_height = ComputeScaledSupportValue(support_value, height_rscale); + scaled_support_width = ComputeScaledSupportValue(support_value, width_rscale); + window_size_height = ComputeWindowSize(scaled_support_height); + window_size_width = ComputeWindowSize(scaled_support_width); + + auto height_buffer_size = ComputeWeightedCoeffBufferSize(output_height, window_size_height); + auto width_buffer_size = ComputeWeightedCoeffBufferSize(output_width, window_size_width); + + return std::make_tuple(height_buffer_size, width_buffer_size); +} + +/// +/// Compute a buffer for btrilinear data for CUDA antialias resizing. +/// +static std::tuple ComputeTrilinearScaleBufferSize( + int64_t output_depth, int64_t output_height, int64_t output_width, + float depth_rscale, float height_rscale, float width_rscale, + float support_value, + float& scaled_support_depth, float& scaled_support_height, + float& scaled_support_width, int32_t& window_size_depth, + int32_t& window_size_height, int32_t& window_size_width) { + scaled_support_depth = ComputeScaledSupportValue(support_value, depth_rscale); + window_size_depth = ComputeWindowSize(scaled_support_depth); + auto depth_buffer_size = ComputeWeightedCoeffBufferSize(output_depth, window_size_depth); + + const auto [y_buffer_size, w_buffer_size] = ComputeBilinearScaleBufferSize(output_height, + output_width, height_rscale, + width_rscale, support_value, + scaled_support_height, + scaled_support_width, + window_size_height, window_size_width); + return std::make_tuple(depth_buffer_size, y_buffer_size, w_buffer_size); +} + +// Antialiasing filters +struct BilinearFilter { + __device__ __host__ float operator()(float x, float /* cubic_coeff_a */) const { + if (x < 0.0f) { + x = -x; + } + if (x < 1.0f) { + return 1.0f - x; + } + return 0.0f; + } +}; + +struct BiCubicFilter { + __device__ __host__ float operator()(float x, float cubic_coeff_a) const { + /* https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm + */ + if (x < 0.0f) { + x = -x; + } + if (x < 1.0f) { + return ((cubic_coeff_a + 2.0f) * x - (cubic_coeff_a + 3.0f)) * x * x + 1; + } + if (x < 2.0f) { + return (((x - 5.0f) * x + 8.f) * x - 4.f) * cubic_coeff_a; + } + return 0.0f; + } +}; + +struct TriLinearFilter { + __device__ __host__ float operator()(float x, float /* cubic_coeff_a */) const { + if (x < 0.0f) { + x = -x; + } + if (x < 1.0f) { + return 1.0f - x; + } + return 0.0f; + } +}; + +template +struct AccumTypeCaster { + static __device__ __host__ AccumType* cast(AccumType* p) { + return p; + } +}; + +template <> +struct AccumTypeCaster { + static __device__ __host__ float* cast(int32_t* p) { + return 
reinterpret_cast(p); + } +}; + +template +__global__ void _ComputeInterpolationAtLevel1( + int64_t num_channels, + int64_t input_height, int64_t input_width, + int64_t output_height, int64_t output_width, + const fast_divmod div_output_width, + const fast_divmod div_output_image, + int32_t window_size, + const uint8_t* clip8_table, + const int64_t* bound_data, + std::tuple outof_bounds_buffers, + const AccumType* weight_coefficients, + const T* Xdata, T* Ydata, + const int N) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + + // No need to do scale + if (output_width == input_width) { + Ydata[id] = Xdata[id]; + return; + } + + int bxc, output_image_index; + div_output_image.divmod(id, bxc, output_image_index); + + int output_y, output_x; + div_output_width.divmod(output_image_index, output_y, output_x); + + CUDA_LONG input_index = static_cast(bxc * num_channels * input_height * input_width); + CUDA_LONG output_index = static_cast(bxc * num_channels * output_height * output_width); + + auto* Ydata_offset = Ydata + output_index + output_width * output_y + output_x; + const auto* bound = bound_data; + + AccumType output = onnxruntime::is_8bit_v ? ConstValue::mag_factor : 0; + + const auto* weight_coeff = weight_coefficients + window_size * output_x; + int64_t xmin = bound[static_cast(output_x) * 2]; + int64_t xmax = bound[static_cast(output_x) * 2 + 1]; + + // Input window + const auto* Xdata_offset = Xdata + input_index + input_width * output_y + xmin; + + for (; xmin < xmax; ++xmin) { + if constexpr (std::is_same::value) { + // This cast is needed when we deal with half + output += static_cast((*Xdata_offset++)) * (*weight_coeff++); + } else { + output += (*Xdata_offset++) * (*weight_coeff++); + } + } + + if constexpr (onnxruntime::is_8bit_v) { + const uint8_t* clip8_lookups = &clip8_table[640]; + *Ydata_offset = static_cast(clip8_lookups[output >> 22]); + } else if constexpr (std::is_same::value) { + *Ydata_offset = static_cast(std::round(output)); + } else { + *Ydata_offset = static_cast(output); + } +} + +template +__global__ void _ComputeInterpolationAtLevel2( + int64_t num_channels, + int64_t input_height, int64_t input_width, + int64_t output_height, int64_t output_width, + const fast_divmod div_output_height, + const fast_divmod div_output_width, + const fast_divmod div_output_image, + int32_t window_size, + bool use_extrapolation, float extrapolation_value, + const uint8_t* clip8_table, + const int64_t* bound_data, + std::tuple outof_bounds_buffers, + const AccumType* weight_coefficients, + const T* Xdata, T* Ydata, int N) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + + // No need to do scale + if (output_height == input_height) { + Ydata[id] = Xdata[id]; + return; + } + + int bxc, output_image_index; + div_output_image.divmod(id, bxc, output_image_index); + + int output_z, output_y, output_x, temp; + div_output_height.divmod(output_image_index, output_z, temp); + div_output_width.divmod(temp, output_y, output_x); + + CUDA_LONG input_index = static_cast(bxc * num_channels * input_height * input_width + + output_z * input_height * input_width); + CUDA_LONG output_index = static_cast(bxc * num_channels * output_height * output_width + + output_z * output_height * output_width); + + auto* Ydata_offset = Ydata + output_index + output_width * output_y + output_x; + + if (use_extrapolation) { + const auto* w_outof_bounds = std::get<1>(outof_bounds_buffers); + // Extrapolate along the w dimension + if (w_outof_bounds[static_cast(output_x)] != -1) { + *Ydata_offset = 
static_cast(extrapolation_value); + return; + } + + // Extrapolate along the y dimension + const auto* y_outof_bounds = std::get<0>(outof_bounds_buffers); + if (y_outof_bounds[static_cast(output_y)] != -1) { + *Ydata_offset = static_cast(extrapolation_value); + return; + } + } + + const auto* bound = bound_data; + + AccumType output = onnxruntime::is_8bit_v ? ConstValue::mag_factor : 0; + + const auto* weight_coeff = weight_coefficients + window_size * output_y; + int64_t ymin = bound[static_cast(output_y) * 2]; + int64_t ymax = bound[static_cast(output_y) * 2 + 1]; + + const auto* Xdata_offset = Xdata + input_index + ymin * output_width + output_x; + + for (; ymin < ymax; ++ymin) { + if constexpr (std::is_same::value) { + // We cast to AccumType to resolve ambiguous call to operator* for half in CUDA + output += static_cast((*Xdata_offset)) * (*weight_coeff++); + } else { + output += (*Xdata_offset) * (*weight_coeff++); + } + Xdata_offset += input_width; + } + + if constexpr (onnxruntime::is_8bit_v) { + const uint8_t* clip8_lookups = &clip8_table[640]; + *Ydata_offset = static_cast(clip8_lookups[output >> 22]); + } else if constexpr (std::is_same::value) { + *Ydata_offset = static_cast(std::round(output)); + } else { + *Ydata_offset = output; + } +} + +template +__global__ void _ComputeInterpolationAtLevel3( + int64_t input_depth, + int64_t input_height, int64_t input_width, + int64_t output_depth, + int64_t output_height, int64_t output_width, + const fast_divmod div_output_height, + const fast_divmod div_output_width, + const fast_divmod div_output_image, + int32_t window_size, + bool use_extrapolation, float extrapolation_value, + const uint8_t* clip8_table, + const int64_t* bound_data, + std::tuple outof_bounds_buffers, + const AccumType* weight_coefficients, + const T* Xdata, T* Ydata, int N) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + + // No need to do scale + if (input_depth == output_depth) { + Ydata[id] = Xdata[id]; + return; + } + + int bxc, output_image_index; + div_output_image.divmod(id, bxc, output_image_index); + + int output_z, output_y, output_x, temp; + div_output_height.divmod(output_image_index, output_z, temp); + div_output_width.divmod(temp, output_y, output_x); + + CUDA_LONG input_index = static_cast(bxc * input_depth * input_height * input_width); + + auto* Ydata_offset = Ydata + id; + + if (use_extrapolation) { + const auto* w_outof_bounds = std::get<2>(outof_bounds_buffers); + // Extrapolate along the w dimension + if (w_outof_bounds[static_cast(output_x)] != -1) { + *Ydata_offset = static_cast(extrapolation_value); + return; + } + + // Extrapolate along the y dimension + const auto* y_outof_bounds = std::get<1>(outof_bounds_buffers); + if (y_outof_bounds[static_cast(output_y)] != -1) { + *Ydata_offset = static_cast(extrapolation_value); + return; + } + + // Extrapolate along the z dimension + const int64_t* z_outof_bounds = std::get<0>(outof_bounds_buffers); + if (z_outof_bounds != nullptr && z_outof_bounds[static_cast(output_z)] != -1) { + *Ydata_offset = static_cast(extrapolation_value); + return; + } + } + + const auto* bound = bound_data; + + AccumType output = onnxruntime::is_8bit_v ? 
ConstValue::mag_factor : 0; + + const auto* weight_coeff = weight_coefficients + window_size * output_z; + int64_t zmin = bound[static_cast(output_z) * 2]; + int64_t zmax = bound[static_cast(output_z) * 2 + 1]; + + const auto z_step = input_height * input_width; + const auto* Xdata_offset = Xdata + input_index + zmin * z_step + output_y * output_width + output_x; + + for (; zmin < zmax; ++zmin) { + if constexpr (std::is_same::value) { + // We cast to AccumType to resolve ambiguous call to operator* for half in CUDA + output += static_cast((*Xdata_offset)) * (*weight_coeff++); + } else { + output += (*Xdata_offset) * (*weight_coeff++); + } + Xdata_offset += z_step; + } + + if constexpr (onnxruntime::is_8bit_v) { + const uint8_t* clip8_lookups = &clip8_table[640]; + *Ydata_offset = static_cast(clip8_lookups[output >> 22]); + } else if constexpr (std::is_same::value) { + *Ydata_offset = static_cast(std::round(output)); + } else { + *Ydata_offset = output; + } +} + +/// +/// This function expects the following buffers to be pre-allocated on device +/// 1. bounds: int64_t[output_size * 2] +/// 2. out_of_bounds: int64_t[output_size] +/// 3. scale_data: T[output_size * window_size] +/// +/// Template parameter AccumType +/// +template +FUNC_DEF void SetupUpsampleFilterAnitAliasImpl( + int64_t i, + int64_t input_size, int64_t output_size, + float rscale, + float roi_start, float roi_end, + float scaled_support, int32_t window_size, bool exclude_outside, + float cubic_coeff_a, + int64_t* bounds, + int64_t* out_of_bounds, + AccumType* scale_data) { + Filter filter{}; + CudaFunctionOriginalCoordinate get_original_coordinate{}; + + const auto scale = 1.f / rscale; + const float inv_scale = (scale >= 1.0f) ? 1.0f / scale : 1.0f; + + const float id = static_cast(i); + float center = 0.5f; + if (scale == 1.0f) { + center += id; + } else { + center += get_original_coordinate(id, rscale, + static_cast(output_size), + static_cast(input_size), + roi_start, roi_end); + } + + if (center - 0.5f < 0 || center - 0.5f > static_cast(input_size - 1)) { + out_of_bounds[i] = i; + } else { + out_of_bounds[i] = -1; + } + + float total_weight{0}; + + auto fmin = _Floor(center - scaled_support + 0.5f); + auto fmax = _Floor(center + scaled_support + 0.5f); + + int64_t min_real = static_cast(fmin); + int64_t max_real = static_cast(fmax); + int64_t min_cut = std::max(min_real, 0); + int64_t max_cut = std::min(max_real, input_size); + + int64_t min_val = exclude_outside ? min_cut : min_real; + int64_t max_val = exclude_outside ? max_cut : max_real; + bounds[i * 2] = min_cut; + bounds[i * 2 + 1] = max_cut; + + // This is done for int32_t case, when the final result is in int32_t, but + // we perform calculations in float. All other types as is. + auto* scale_buffer = AccumTypeCaster::cast(&scale_data[i * window_size]); + + max_val -= min_val; + for (int64_t x = 0; x < max_val; x++) { + const float arg = (x + min_val - center + 0.5f) * inv_scale; + const auto w = filter(arg, cubic_coeff_a); + scale_buffer[x] = w; + total_weight += w; + } + + if (!exclude_outside) { + int64_t neg_xsize = min_val < 0 ? -min_val : 0; + for (int64_t x = 0; x < neg_xsize; x++) { + scale_buffer[neg_xsize] += scale_buffer[x]; + } + + int64_t bound_size = + max_val + min_val > input_size ? 
max_val + min_val - input_size : 0; + for (int64_t x = max_val - bound_size; x < max_val; x++) { + scale_buffer[max_val - bound_size - 1] += + scale_buffer[x]; + } + + for (int64_t x = 0; (neg_xsize | bound_size) > 0 && x < max_cut - min_cut; x++) { + scale_buffer[x] = scale_buffer[x + neg_xsize]; + } + } + + const float total_weight_inv = (total_weight == 0) ? 1.f : (1.f / total_weight); + if constexpr (std::is_same::value) { + auto* scale_buffer_int = reinterpret_cast(scale_buffer); + for (int64_t x = 0; x < max_cut - min_cut; x++) { + scale_buffer[x] *= total_weight_inv; + // normalize the scale to 1 << 22 for int8/uint8 + scale_buffer_int[x] = static_cast(_Round(scale_buffer[x] * ConstValue::mag_factor_x_2)); + } + } else { + for (int64_t x = 0; x < max_cut - min_cut; x++) { + scale_buffer[x] *= total_weight_inv; + } + } +} + +/// This kernel computes antialias filter for bilinear or bicubic upsampling. +/// The function expects the following buffers to be pre-allocated on device +/// 1. bounds: int64_t[output_size * 2] for each of the two dimensions +/// 2. out_of_bounds: int64_t[output_size] for each of the two dimensions +/// 3. scale_data: AccumType[output_size * window_size] for each of the two dimensions +/// Buffers layout [h_data, w_data] +template +__global__ void _SetupBilinearUpsampleFilterAntiAlias( + std::tuple input_dims, // h, w + std::tuple output_dims, // h, w + std::tuple inv_scale_vals, // h, w + std::tuple roi_start_vals, // h, w + std::tuple roi_end_vals, // h, w + std::tuple dim_scaled_support, // Pre-computed scaled support values h, w + std::tuple dim_window_size, // Pre-computed windows sizes h, w + float cubic_coeff_a, + bool exclude_outside, + int64_t* bounds, + int64_t* out_of_bounds, + std::tuple weighted_coefficients // y, h buffers +) { + const auto N = std::get<0>(output_dims) + std::get<1>(output_dims); + + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + + if (id < std::get<0>(output_dims)) { + // Setup for y + int64_t input_size = std::get<0>(input_dims); + int64_t output_size = std::get<0>(output_dims); + float inv_scale = std::get<0>(inv_scale_vals); + float roi_start = std::get<0>(roi_start_vals); + float roi_end = std::get<0>(roi_end_vals); + float scaled_support = std::get<0>(dim_scaled_support); + int32_t window_size = std::get<0>(dim_window_size); + + SetupUpsampleFilterAnitAliasImpl( + id, + input_size, output_size, + inv_scale, + roi_start, roi_end, + scaled_support, window_size, + exclude_outside, + cubic_coeff_a, + bounds, + out_of_bounds, + std::get<0>(weighted_coefficients)); + + } else { + // Setup for w + // w = id - output_height + + int64_t input_size = std::get<1>(input_dims); + int64_t output_size = std::get<1>(output_dims); + float inv_scale = std::get<1>(inv_scale_vals); + float roi_start = std::get<1>(roi_start_vals); + float roi_end = std::get<1>(roi_end_vals); + + float scaled_support = std::get<1>(dim_scaled_support); + int32_t window_size = std::get<1>(dim_window_size); + + // Adjust buffer positions + const auto y_output_size = std::get<0>(output_dims); + + auto i = id - y_output_size; + bounds += (y_output_size * 2); + out_of_bounds += y_output_size; + + SetupUpsampleFilterAnitAliasImpl( + i, + input_size, output_size, + inv_scale, + roi_start, roi_end, + scaled_support, window_size, + exclude_outside, + cubic_coeff_a, + bounds, + out_of_bounds, + std::get<1>(weighted_coefficients)); + } +} + +/// +/// Compute AntiAlias filter for trilinear upsampling, all in one go +/// The function expects the following buffers to be 
pre-allocated on device +/// 1. bounds: int64_t[output_size * 2] for each of the three dimensions +/// 2. out_of_bounds: int64_t[output_size] for each of the three dimensions +/// 3. scale_data: AccumType[output_size * window_size] for each of the three dimensions +/// Each kind of buffer contains data for all 3 dims. +/// Buffers layout [d_data, h_data, w_data] +/// +template +__global__ void _SetupTrilinerarUpsampleFilterAntiAlias( + std::tuple input_dims, // d, h, w + std::tuple output_dims, // d, h, w + std::tuple inv_scale_vals, // d, h, w + std::tuple roi_start_vals, // d, h, w + std::tuple roi_end_vals, // d, h, w + std::tuple dim_scaled_support, // Pre-computed scaled support values d, h, w + std::tuple dim_window_size, // Pre-computed windows sizes d, h, w + bool exclude_outisde, + int64_t* bounds, + int64_t* out_of_bounds, + std::tuple weighted_coefficients) { + const auto N = std::get<0>(output_dims) + std::get<1>(output_dims) + std::get<2>(output_dims); + + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + + if (id < std::get<0>(output_dims)) { + // Setup for d by default (id < output_depth) + int64_t input_size = std::get<0>(input_dims); + int64_t output_size = std::get<0>(output_dims); + float inv_scale = std::get<0>(inv_scale_vals); + float roi_start = std::get<0>(roi_start_vals); + float roi_end = std::get<0>(roi_end_vals); + float scaled_support = std::get<0>(dim_scaled_support); + int32_t window_size = std::get<0>(dim_window_size); + + SetupUpsampleFilterAnitAliasImpl( + id, + input_size, output_size, + inv_scale, + roi_start, roi_end, + scaled_support, window_size, + exclude_outisde, + onnxruntime::antialias_constants::kCubicCoeffA, // Default value for trilinear + bounds, + out_of_bounds, + std::get<0>(weighted_coefficients)); + + } else if (id >= std::get<0>(output_dims) && id < (std::get<0>(output_dims) + std::get<1>(output_dims))) { + int64_t input_size = std::get<1>(input_dims); + int64_t output_size = std::get<1>(output_dims); + float inv_scale = std::get<1>(inv_scale_vals); + float roi_start = std::get<1>(roi_start_vals); + float roi_end = std::get<1>(roi_end_vals); + + float scaled_support = std::get<1>(dim_scaled_support); + int32_t window_size = std::get<1>(dim_window_size); + + // Adjust buffer positions + const auto d_output_size = std::get<0>(output_dims); + + auto i = id - d_output_size; + bounds += d_output_size * 2; + out_of_bounds += d_output_size; + + SetupUpsampleFilterAnitAliasImpl( + i, + input_size, output_size, + inv_scale, + roi_start, roi_end, + scaled_support, window_size, + exclude_outisde, + onnxruntime::antialias_constants::kCubicCoeffA, // Default value for trilinear + bounds, + out_of_bounds, + std::get<1>(weighted_coefficients)); + } else { + int64_t input_size = std::get<2>(input_dims); + int64_t output_size = std::get<2>(output_dims); + float inv_scale = std::get<2>(inv_scale_vals); + float roi_start = std::get<2>(roi_start_vals); + float roi_end = std::get<2>(roi_end_vals); + float scaled_support = std::get<2>(dim_scaled_support); + int32_t window_size = std::get<2>(dim_window_size); + + // Adjust buffer positions + const auto d_y_output_size = std::get<0>(output_dims) + std::get<1>(output_dims); + + auto i = id - d_y_output_size; + bounds += (d_y_output_size * 2); + out_of_bounds += d_y_output_size; + + SetupUpsampleFilterAnitAliasImpl( + i, + input_size, output_size, + inv_scale, + roi_start, roi_end, + scaled_support, window_size, + exclude_outisde, + onnxruntime::antialias_constants::kCubicCoeffA, // Default value for trilinear + 
bounds, + out_of_bounds, + std::get<2>(weighted_coefficients)); + } +} + +#define CASEA_COORD_ANTIALIAS(coordinate_mode, TransformCoordType, ...) \ + case coordinate_mode: { \ + using coord_t = TransformCoordType; \ + return __VA_ARGS__(); \ + break; \ + } + +#define DISPATCH_ANTIALIAS_FILTER_SETUP(coord_enum, ...) \ + [&] { \ + const auto the_type = coord_enum; \ + switch (the_type) { \ + CASEA_COORD_ANTIALIAS(ResizeCoordinateTransformationMode::HALF_PIXEL, \ + TransformCoordinate_HALF_PIXEL, __VA_ARGS__) \ + CASEA_COORD_ANTIALIAS(ResizeCoordinateTransformationMode::ASYMMETRIC, \ + TransformCoordinate_ASYMMETRIC, __VA_ARGS__) \ + CASEA_COORD_ANTIALIAS(ResizeCoordinateTransformationMode::PYTORCH_HALF_PIXEL, \ + TransformCoordinate_PYTORCH_HALF_PIXEL, __VA_ARGS__) \ + CASEA_COORD_ANTIALIAS(ResizeCoordinateTransformationMode::ALIGN_CORNERS, \ + TransformCoordinate_ALIGN_CORNERS, __VA_ARGS__) \ + CASEA_COORD_ANTIALIAS(ResizeCoordinateTransformationMode::TF_HALF_PIXEL_FOR_NN, \ + TransformCoordinate_TF_HALF_PIXEL_FOR_NN, __VA_ARGS__) \ + CASEA_COORD_ANTIALIAS(ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE, \ + TransformCoordinate_TF_CROP_AND_RESIZE, __VA_ARGS__) \ + default: \ + ORT_THROW("unknown ResizeCoordinateTransformationMode"); \ + } \ + }() + +namespace { +template +IAllocatorUniquePtr AllocateTyped( + const TempSpaceAllocateFunc& alloc, + size_t elements) { + return alloc(elements * sizeof(T)); +} + +template +T* GetTyped(IAllocatorUniquePtr& bytes) { + return reinterpret_cast(bytes.get()); +} +} // namespace + +template +void ResizeTrilinearUpsample( + cudaStream_t stream, + int rank, + const UpsampleMode upsample_mode, + ResizeCoordinateTransformationMode coordinate_transform_mode, + gsl::span input_shape, + gsl::span output_shape, + int64_t batch_size, int64_t num_channels, + std::tuple inferred_input_dims, + std::tuple inferred_output_dims, + std::tuple inferred_dim_rscales, + const TArray& output_div_pitches, + gsl::span roi_vals, + const std::optional& extrapolation, + bool exclude_outside, + const TempSpaceAllocateFunc& allocate_temp_space, + const uint8_t* clip8_lookups, + const T* input_data, + T* output_data, + const size_t N) { + using AccumType = typename onnxruntime::AccumulateType::type; + + const bool use_extrapolation = extrapolation.has_value(); + const float extrapolation_value = use_extrapolation ? 
*extrapolation : 0.f; + + int64_t input_depth, input_height, input_width; + std::tie(input_depth, input_height, input_width) = inferred_input_dims; + + int64_t output_depth, output_height, output_width; + std::tie(output_depth, output_height, output_width) = inferred_output_dims; + + int blocksPerDimsMappingGrid = + static_cast(ceil((output_depth + output_height + output_width) / 32.0)); + + int blocksPerGrid = static_cast(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); + + constexpr float support_value = antialias_constants::kSupportSize; + float z_scale, h_scale, w_scale; + std::tie(z_scale, h_scale, w_scale) = inferred_dim_rscales; + + const auto& div_output_width = output_div_pitches[rank - 2]; + + SafeInt bounds_buffer_size = (SafeInt(output_depth) + output_height + output_width) * 2; + SafeInt out_of_bounds_buffer_size = (SafeInt(output_depth) + output_height + output_width); + + auto bounds_buffer_ptr = AllocateTyped(allocate_temp_space, bounds_buffer_size); + auto out_of_bounds_buffer_ptr = AllocateTyped(allocate_temp_space, out_of_bounds_buffer_size); + + int64_t* z_bounds_buffer = GetTyped(bounds_buffer_ptr); + int64_t* y_bounds_buffer = z_bounds_buffer + output_depth * 2; + int64_t* w_bounds_buffer = y_bounds_buffer + output_height * 2; + + int64_t* z_outof_bounds_buffer = GetTyped(out_of_bounds_buffer_ptr); + int64_t* y_outof_bounds_buffer = z_outof_bounds_buffer + output_depth; + int64_t* w_outof_bounds_buffer = y_outof_bounds_buffer + output_height; + + float z_scaled_support, h_scaled_support, w_scaled_support; + int32_t z_window_size, h_window_size, w_window_size; + const auto [z_buffer_size, y_buffer_size, w_buffer_size] = ComputeTrilinearScaleBufferSize( + output_depth, output_height, output_width, + z_scale, h_scale, w_scale, support_value, + z_scaled_support, h_scaled_support, w_scaled_support, + z_window_size, h_window_size, w_window_size); + + const int64_t weighted_buffer_size = SafeInt(z_buffer_size) + y_buffer_size + w_buffer_size; + + auto weighted_buffer_ptr = AllocateTyped(allocate_temp_space, weighted_buffer_size); + AccumType* z_weighted_buffer = GetTyped(weighted_buffer_ptr); + AccumType* y_weighted_buffer = z_weighted_buffer + z_buffer_size; + AccumType* w_weighted_buffer = y_weighted_buffer + y_buffer_size; + + const auto h_w_interpolate_temp_buf_size = SafeInt(batch_size) * num_channels * + input_depth * input_height * output_width; + auto h_w_interpolate_temp_buffer_ptr = AllocateTyped(allocate_temp_space, + narrow(h_w_interpolate_temp_buf_size)); + + const auto h_w_interpolate_result_buffer_size = SafeInt(batch_size) * num_channels * + input_depth * output_height * output_width; + auto h_w_interpolate_result_buffer_ptr = AllocateTyped(allocate_temp_space, h_w_interpolate_result_buffer_size); + + // clang-format off + DISPATCH_ANTIALIAS_FILTER_SETUP(coordinate_transform_mode, [&]() { + _SetupTrilinerarUpsampleFilterAntiAlias<<>>( + inferred_input_dims, + inferred_output_dims, + inferred_dim_rscales, + std::make_tuple(roi_vals[rank - 3], roi_vals[rank - 2], roi_vals[rank - 1]), // roi starts d, h, w + std::make_tuple(roi_vals[rank - 3 + rank], roi_vals[rank - 2 + rank], // roi ends d, h, w + roi_vals[rank - 1 + rank]), + std::make_tuple(z_scaled_support, h_scaled_support, w_scaled_support), + std::make_tuple(z_window_size, h_window_size, w_window_size), + exclude_outside, + GetTyped(bounds_buffer_ptr), + GetTyped(out_of_bounds_buffer_ptr), + std::make_tuple(z_weighted_buffer, y_weighted_buffer, w_weighted_buffer)); + }); + + // clang-format on + const 
fast_divmod div_w_image(narrow(num_channels * input_depth * input_height * output_width)); + // clang-format off + _ComputeInterpolationAtLevel1<<>>( + num_channels * input_depth, input_height, input_width, input_height, output_width, + div_output_width, + div_w_image, + w_window_size, + clip8_lookups, + w_bounds_buffer, + std::make_tuple(y_outof_bounds_buffer, w_outof_bounds_buffer), + w_weighted_buffer, input_data, + GetTyped(h_w_interpolate_temp_buffer_ptr), + narrow(h_w_interpolate_temp_buf_size)); + + // clang-format on + const fast_divmod div_output_height{narrow(output_height * output_width)}; + const fast_divmod div_h_w_image(narrow(num_channels * input_depth * output_height * output_width)); + // clang-format off + _ComputeInterpolationAtLevel2<<>>( + num_channels * input_depth, input_height, output_width, output_height, output_width, + div_output_height, + div_output_width, + div_h_w_image, + h_window_size, + false, 0.f, // No extrapolation + clip8_lookups, + y_bounds_buffer, + std::make_tuple(y_outof_bounds_buffer, w_outof_bounds_buffer), + y_weighted_buffer, GetTyped(h_w_interpolate_temp_buffer_ptr), + GetTyped(h_w_interpolate_result_buffer_ptr), + narrow(h_w_interpolate_result_buffer_size)); + + // clang-format on + const fast_divmod div_z_h_w_image(narrow(input_depth * output_height * output_width)); + // clang-format off + _ComputeInterpolationAtLevel3<<>>( + input_depth, output_height, output_width, + output_depth, output_height, output_width, + div_output_height, + div_output_width, + div_z_h_w_image, + z_window_size, + use_extrapolation, extrapolation_value, + clip8_lookups, + z_bounds_buffer, + std::make_tuple(z_outof_bounds_buffer, y_outof_bounds_buffer, w_outof_bounds_buffer), + z_weighted_buffer, GetTyped(h_w_interpolate_result_buffer_ptr), + output_data, + narrow(N)); + // clang-format on +} + +template +void ResizeBiLinearUpsample(cudaStream_t stream, + int rank, + const UpsampleMode upsample_mode, + ResizeCoordinateTransformationMode coordinate_transform_mode, + gsl::span input_shape, + gsl::span output_shape, + int64_t batch_size, int64_t num_channels, + std::tuple inferred_input_dims, + std::tuple inferred_output_dims, + std::tuple inferred_dim_rscales, + const TArray& output_div_pitches, + gsl::span roi_vals, + const std::optional& extrapolation, + bool exclude_outside, + const TempSpaceAllocateFunc& allocate_temp_space, + const uint8_t* clip8_lookups, + const T* input_data, + T* output_data, + const size_t N) { + using AccumType = typename onnxruntime::AccumulateType::type; + + const bool use_extrapolation = extrapolation.has_value(); + const float extrapolation_value = use_extrapolation ? *extrapolation : 0.f; + + int64_t input_depth, input_height, input_width; + std::tie(input_depth, input_height, input_width) = inferred_input_dims; + + int64_t output_depth, output_height, output_width; + std::tie(output_depth, output_height, output_width) = inferred_output_dims; + + int blocksPerDimsMappingGrid = + narrow(CeilDiv((output_depth + output_height + output_width), 32)); + + // rank 2 or 4 + const fast_divmod div_output_image = (rank > 2) ? 
output_div_pitches[rank - 4] + : fast_divmod(gsl::narrow_cast(N)); + const fast_divmod& div_output_width = output_div_pitches[rank - 2]; + + constexpr float support_value = antialias_constants::kSupportSize; + + float h_scale, w_scale; + std::tie(std::ignore, h_scale, w_scale) = inferred_dim_rscales; + + int blocksPerGrid = narrow(CeilDiv(N, GridDim::maxThreadsPerBlock)); + + SafeInt bounds_buffer_size = (SafeInt(output_height) + output_width) * 2; + SafeInt out_of_bounds_buffer_size = (SafeInt(output_height) + output_width); + + float h_scaled_support, w_scaled_support; + int32_t h_window_size, w_window_size; + const auto [weighted_y_size, weighted_w_size] = + ComputeBilinearScaleBufferSize(output_height, output_width, + h_scale, w_scale, support_value, + h_scaled_support, w_scaled_support, h_window_size, w_window_size); + + auto bounds_buffer_ptr = AllocateTyped(allocate_temp_space, bounds_buffer_size); + auto out_of_bounds_buffer_ptr = AllocateTyped(allocate_temp_space, out_of_bounds_buffer_size); + + int64_t* y_bounds_buffer = GetTyped(bounds_buffer_ptr); + int64_t* w_bounds_buffer = y_bounds_buffer + output_height * 2; + + int64_t* y_outof_bounds_buffer = GetTyped(out_of_bounds_buffer_ptr); + int64_t* w_outof_bounds_buffer = y_outof_bounds_buffer + output_height; + + const int64_t weighted_buffer_size = SafeInt(weighted_y_size) + weighted_w_size; + auto weighted_buffer_ptr = AllocateTyped(allocate_temp_space, narrow(weighted_buffer_size)); + + AccumType* y_weighted_buffer = GetTyped(weighted_buffer_ptr); + AccumType* w_weighted_buffer = y_weighted_buffer + weighted_y_size; + + const auto temp_buf_size = num_channels * input_height * output_width; + auto image_temp_buffer = AllocateTyped(allocate_temp_space, narrow(temp_buf_size)); + + // clang-format off + DISPATCH_ANTIALIAS_FILTER_SETUP(coordinate_transform_mode, [&]() { + // Data is d, h, w in tuples + + _SetupBilinearUpsampleFilterAntiAlias<<>>( + std::make_tuple(input_height, input_width), + std::make_tuple(output_height, output_width), + std::make_tuple(h_scale, w_scale), + std::make_tuple(roi_vals[rank - 2], roi_vals[rank - 1]), // roi starts h, w + std::make_tuple(roi_vals[rank - 2 + rank], roi_vals[rank - 1 + rank]), // roi ends h, w + std::make_tuple(h_scaled_support, w_scaled_support), + std::make_tuple(h_window_size, w_window_size), + onnxruntime::antialias_constants::kCubicCoeffA, exclude_outside, + GetTyped(bounds_buffer_ptr), + GetTyped(out_of_bounds_buffer_ptr), + std::make_tuple(y_weighted_buffer, w_weighted_buffer)); + }); + + // clang-format on + const fast_divmod div_step_image{narrow(num_channels * input_height * output_width)}; + // clang-format off + _ComputeInterpolationAtLevel1<<>>( + num_channels, input_height, input_width, input_height, output_width, + div_output_width, + div_step_image, + w_window_size, + clip8_lookups, + w_bounds_buffer, + std::make_tuple(y_outof_bounds_buffer, w_outof_bounds_buffer), + w_weighted_buffer, input_data, GetTyped(image_temp_buffer), + narrow(temp_buf_size)); + + // clang-format on + const fast_divmod div_output_height{narrow(output_height * output_width)}; + // clang-format off + _ComputeInterpolationAtLevel2<<>>( + num_channels, input_height, output_width, output_height, output_width, + div_output_height, + div_output_width, + div_output_image, + h_window_size, + use_extrapolation, extrapolation_value, + clip8_lookups, + y_bounds_buffer, + std::make_tuple(y_outof_bounds_buffer, w_outof_bounds_buffer), + y_weighted_buffer, GetTyped(image_temp_buffer), output_data, + narrow(N)); 
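A minimal CPU sketch of the separable filtering that the two launches above perform on the GPU: the Level-1 kernel filters along W into a temporary image of shape [C, H_in, W_out], and the Level-2 kernel filters along H into the final [C, H_out, W_out] output (the trilinear path earlier in this file adds a Level-3 pass along D). Names and layout below are illustrative only, not the actual kernels; the uint8 GPU variant additionally uses fixed-point weights and the clip8 lookup table, whereas this sketch is float-only and assumes the per-output-pixel bounds and weights were precomputed by a filter-setup step, laid out as in the kernels (window_size weights per output index, bounds stored as [lo, hi) pairs).

```cpp
#include <cstdint>
#include <vector>

// Precomputed 1-D anti-alias filter for one resized axis (illustrative layout).
struct Filter1D {
  int64_t window_size = 0;
  std::vector<int64_t> lo, hi;   // per output index: first input tap, one past the last
  std::vector<float> weights;    // [output_size * window_size]
};

// Filter the innermost (stride-1) axis: `in` is [rows, in_len], `out` is [rows, out_len].
static void FilterInnerAxis(const float* in, int64_t rows, int64_t in_len,
                            const Filter1D& f, int64_t out_len, float* out) {
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t o = 0; o < out_len; ++o) {
      float acc = 0.f;
      for (int64_t i = f.lo[o]; i < f.hi[o]; ++i) {
        acc += in[r * in_len + i] * f.weights[o * f.window_size + (i - f.lo[o])];
      }
      out[r * out_len + o] = acc;
    }
  }
}
// The H pass is the same weighted reduction applied down each column of the
// [H_in, W_out] temporary instead of across each row.
```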
+ + // clang-format on +} + +template +void ResizeBicubicUpsample(cudaStream_t stream, + int rank, + const UpsampleMode upsample_mode, + ResizeCoordinateTransformationMode coordinate_transform_mode, + gsl::span input_shape, + gsl::span output_shape, + int64_t batch_size, int64_t num_channels, + std::tuple inferred_input_dims, + std::tuple inferred_output_dims, + std::tuple inferred_dim_rscales, + // const TArray& input_strides, + const TArray& output_div_pitches, + gsl::span roi_vals, + const std::optional& extrapolation, + bool exclude_outside, + const TempSpaceAllocateFunc& allocate_temp_space, + const uint8_t* clip8_lookups, + const T* input_data, + T* output_data, + const size_t N) { + using AccumType = typename onnxruntime::AccumulateType::type; + + const bool use_extrapolation = extrapolation.has_value(); + const float extrapolation_value = use_extrapolation ? *extrapolation : 0.f; + + int blocksPerGrid = narrow(CeilDiv(N, GridDim::maxThreadsPerBlock)); + const fast_divmod div_output_image = (rank > 2) ? output_div_pitches[rank - 4] + : fast_divmod(gsl::narrow_cast(N)); + const fast_divmod& div_output_width = output_div_pitches[rank - 2]; + + constexpr float support_value = antialias_constants::kBiCubicSupportSize; + + int64_t input_depth, input_height, input_width; + std::tie(input_depth, input_height, input_width) = inferred_input_dims; + + int64_t output_depth, output_height, output_width; + std::tie(output_depth, output_height, output_width) = inferred_output_dims; + + int blocksPerDimsMappingGrid = + narrow(CeilDiv((output_depth + output_height + output_width), 32)); + + float h_scale, w_scale; + std::tie(std::ignore, h_scale, w_scale) = inferred_dim_rscales; + + SafeInt bounds_buffer_size = (SafeInt(output_height) + output_width) * 2; + SafeInt out_of_bounds_buffer_size = (SafeInt(output_height) + output_width); + + float h_scaled_support, w_scaled_support; + int32_t h_window_size, w_window_size; + const auto [weighted_y_size, weighted_w_size] = + ComputeBilinearScaleBufferSize(output_height, output_width, + h_scale, w_scale, support_value, + h_scaled_support, w_scaled_support, h_window_size, w_window_size); + + auto bounds_buffer_ptr = AllocateTyped(allocate_temp_space, bounds_buffer_size); + auto out_of_bounds_buffer_ptr = AllocateTyped(allocate_temp_space, out_of_bounds_buffer_size); + + int64_t* y_bounds_buffer = GetTyped(bounds_buffer_ptr); + int64_t* w_bounds_buffer = y_bounds_buffer + output_height * 2; + + int64_t* y_outof_bounds_buffer = GetTyped(out_of_bounds_buffer_ptr); + int64_t* w_outof_bounds_buffer = y_outof_bounds_buffer + output_height; + + const int64_t weighted_buffer_size = SafeInt(weighted_y_size) + + weighted_w_size; + auto weighted_buffer_ptr = AllocateTyped(allocate_temp_space, weighted_buffer_size); + + AccumType* y_weighted_buffer = GetTyped(weighted_buffer_ptr); + AccumType* w_weighted_buffer = y_weighted_buffer + weighted_y_size; + + const auto temp_buf_size = SafeInt(batch_size) * num_channels * input_height * output_width; + auto image_temp_buffer = AllocateTyped(allocate_temp_space, narrow(temp_buf_size)); + + // clang-format off + DISPATCH_ANTIALIAS_FILTER_SETUP(coordinate_transform_mode, [&]() { + _SetupBilinearUpsampleFilterAntiAlias<<>>( + std::make_tuple(input_height, input_width), + std::make_tuple(output_height, output_width), + std::make_tuple(h_scale, w_scale), + std::make_tuple(roi_vals[rank - 2], roi_vals[rank - 1]), // roi starts h, w + std::make_tuple(roi_vals[rank - 2 + rank], roi_vals[rank - 1 + rank]), // roi ends h, w + 
std::make_tuple(h_scaled_support, w_scaled_support), + std::make_tuple(h_window_size, w_window_size), + onnxruntime::antialias_constants::kCubicCoeffA, exclude_outside, + GetTyped(bounds_buffer_ptr), + GetTyped(out_of_bounds_buffer_ptr), + std::make_tuple(y_weighted_buffer, w_weighted_buffer)); + }); + // clang-format on + const fast_divmod div_step_image(narrow(num_channels * input_height * output_width)); + // clang-format off + _ComputeInterpolationAtLevel1<<>>( + num_channels, input_height, input_width, input_height, output_width, + div_output_width, + div_step_image, + w_window_size, + clip8_lookups, + w_bounds_buffer, + std::make_tuple(y_outof_bounds_buffer, w_outof_bounds_buffer), + w_weighted_buffer, input_data, GetTyped(image_temp_buffer), + narrow(temp_buf_size)); + // clang-format on + + const fast_divmod div_output_height{narrow(output_height * output_width)}; + // clang-format off + _ComputeInterpolationAtLevel2<<>>( + num_channels, input_height, output_width, output_height, output_width, + div_output_height, + div_output_width, + div_output_image, + h_window_size, + use_extrapolation, extrapolation_value, + clip8_lookups, + y_bounds_buffer, + std::make_tuple(y_outof_bounds_buffer, w_outof_bounds_buffer), + y_weighted_buffer, GetTyped(image_temp_buffer), output_data, + narrow(N)); + // clang-format on +} + +template +void ResizeAntiAliasImpl( + cudaStream_t stream, + int rank, + const UpsampleMode upsample_mode, + ResizeCoordinateTransformationMode coordinate_transform_mode, + gsl::span input_shape, + gsl::span output_shape, + int64_t batch_size, int64_t num_channels, + std::tuple inferred_input_dims, + std::tuple inferred_output_dims, + std::tuple inferred_dim_rscales, + const TArray& output_div_pitches, + gsl::span roi_vals, + const std::optional& extrapolation, + bool exclude_outside, + TempSpaceAllocateFunc allocate_temp_space, + const uint8_t* clip8_lookups, + const T* input_data, + T* output_data, + const size_t N) { + // We support a special case of bilinear or bicubic if the input data is 4D with the outer 2 scales being 1.0 + // We would have validated the outer scale values by the time execution reaches this + const bool is_2D = (rank == 2 || rank == 4); + + // We support a special case of trilinear or tricubic if the input data is 5D with the outer 2 scales being 1.0 + // We would have validated the outer scale values by the time execution reaches this + const bool is_3D = (rank == 3 || rank == 5); + + // Should not hit this as we have already validated input rank/scales and we provide verbose error messages + // to the user. 
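The rank convention applied here can be spelled out with a small hypothetical helper (not part of this change): the anti-alias path only filters the trailing 2 or 3 dimensions, so batched 4-D/5-D inputs are accepted only when the leading N and C scales are exactly 1.0, as the caller in upsample.cc later in this patch verifies.

```cpp
#include <cstddef>
#include <vector>

enum class AntiAliasLayout { k2D, k3D, kUnsupported };

// Hypothetical classification mirroring the rank/scale checks used by the dispatch.
inline AntiAliasLayout ClassifyForAntiAlias(const std::vector<float>& scales) {
  const size_t rank = scales.size();
  const bool outer_unit = rank < 4 || (scales[0] == 1.0f && scales[1] == 1.0f);
  if ((rank == 2 || rank == 4) && outer_unit) return AntiAliasLayout::k2D;  // [H,W] / [N,C,H,W]
  if ((rank == 3 || rank == 5) && outer_unit) return AntiAliasLayout::k3D;  // [D,H,W] / [N,C,D,H,W]
  return AntiAliasLayout::kUnsupported;
}

// ClassifyForAntiAlias({1.f, 1.f, 0.5f, 0.5f})       -> k2D (bilinear/bicubic)
// ClassifyForAntiAlias({1.f, 1.f, 0.5f, 0.5f, 0.5f}) -> k3D (trilinear)
// ClassifyForAntiAlias({2.f, 1.f, 0.5f, 0.5f})       -> kUnsupported (N scale != 1)
```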
+ ORT_ENFORCE(is_2D || is_3D, "Only bilinear/trilinear and bicubic modes are supported in Resize anti-alias mode"); + + switch (upsample_mode) { + case UpsampleMode::LINEAR: { + if (is_2D) { + ResizeBiLinearUpsample(stream, rank, upsample_mode, coordinate_transform_mode, + input_shape, output_shape, batch_size, num_channels, + inferred_input_dims, inferred_output_dims, inferred_dim_rscales, + output_div_pitches, roi_vals, extrapolation, exclude_outside, + allocate_temp_space, clip8_lookups, input_data, output_data, N); + } else if (is_3D) { + ResizeTrilinearUpsample(stream, rank, upsample_mode, coordinate_transform_mode, + input_shape, output_shape, batch_size, num_channels, + inferred_input_dims, inferred_output_dims, inferred_dim_rscales, + output_div_pitches, roi_vals, extrapolation, exclude_outside, + allocate_temp_space, clip8_lookups, input_data, output_data, N); + } else { + ORT_NOT_IMPLEMENTED("Resize supports only 2-D or 3-D in LINEAR mode."); + } + } break; + case CUBIC: { + if (is_2D) { + ResizeBicubicUpsample(stream, rank, upsample_mode, coordinate_transform_mode, + input_shape, output_shape, batch_size, num_channels, + inferred_input_dims, inferred_output_dims, inferred_dim_rscales, + output_div_pitches, roi_vals, extrapolation, exclude_outside, + allocate_temp_space, clip8_lookups, input_data, output_data, N); + } else { + ORT_NOT_IMPLEMENTED("Resize supports only 2-D in CUBIC mode."); + } + } break; + default: + ORT_NOT_IMPLEMENTED("Only bilinear/trilinear and bicubic modes are supported in Resize anti-alias mode"); + break; + } +} + +#define SPECIALIZED_ANTIALIAS_IMPL(T) \ + template void ResizeAntiAliasImpl( \ + cudaStream_t stream, \ + int rank, \ + const UpsampleMode upsample_mode, \ + ResizeCoordinateTransformationMode coordinate_transform_mode, \ + gsl::span input_shape, \ + gsl::span output_shape, \ + int64_t batch_size, int64_t num_channels, \ + std::tuple inferred_input_dims, \ + std::tuple inferred_output_dims, \ + std::tuple inferred_dim_rscales, \ + const TArray& output_div_pitches, \ + gsl::span roi_vals, \ + const std::optional& extrapolation_value, \ + bool exclude_outside, \ + TempSpaceAllocateFunc allocate_temp_space, \ + const uint8_t* clip8_lookups, \ + const T* input_data, \ + T* output_data, \ + const size_t N); + +SPECIALIZED_ANTIALIAS_IMPL(float) +SPECIALIZED_ANTIALIAS_IMPL(double) +SPECIALIZED_ANTIALIAS_IMPL(half) +SPECIALIZED_ANTIALIAS_IMPL(int32_t) +SPECIALIZED_ANTIALIAS_IMPL(uint8_t) + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu index 1a94c7705e913..0cde0ed8e8681 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu @@ -12,7 +12,7 @@ using onnxruntime::ResizeNearestMode; using onnxruntime::UpsampleMode; struct NearestPixel_SIMPLE { - __device__ __forceinline__ int operator() (float x_original, bool is_down_sampling) const { + __device__ __forceinline__ int operator()(float x_original, bool is_down_sampling) const { if (is_down_sampling) { return static_cast(_Ceil(x_original)); } @@ -21,7 +21,7 @@ struct NearestPixel_SIMPLE { }; struct NearestPixel_ROUND_PREFER_FLOOR { - __device__ __forceinline__ int operator() (float x_original, bool) const { + __device__ __forceinline__ int operator()(float x_original, bool) const { if (x_original == static_cast(x_original) + 0.5f) { return static_cast(_Floor(x_original)); } @@ -30,62 +30,23 @@ struct 
NearestPixel_ROUND_PREFER_FLOOR { }; struct NearestPixel_ROUND_PREFER_CEIL { - __device__ __forceinline__ int operator() (float x_original, bool) const { + __device__ __forceinline__ int operator()(float x_original, bool) const { return static_cast(roundf(x_original)); } }; struct NearestPixel_FLOOR { - __device__ __forceinline__ int operator() (float x_original, bool) const { + __device__ __forceinline__ int operator()(float x_original, bool) const { return static_cast(_Floor(x_original)); } }; struct NearestPixel_CEIL { - __device__ __forceinline__ int operator() (float x_original, bool) const { + __device__ __forceinline__ int operator()(float x_original, bool) const { return static_cast(_Ceil(x_original)); } }; -struct TransformCoordinate_ASYMMETRIC { - __device__ __forceinline__ float operator() (float x_resized, float x_scale, float, float, float, float) const { - return x_resized / x_scale; - } -}; - -struct TransformCoordinate_HALF_PIXEL { - __device__ __forceinline__ float operator() (float x_resized, float x_scale, float, float, float, float) const { - return ((x_resized + 0.5f) / x_scale) - 0.5f; - } -}; - -struct TransformCoordinate_PYTORCH_HALF_PIXEL { - __device__ __forceinline__ float operator() (float x_resized, float x_scale, float length_resized, float, float, float) const { - return length_resized > 1 ? (x_resized + 0.5f) / x_scale - 0.5f : 0.0f; - } -}; - -struct TransformCoordinate_TF_HALF_PIXEL_FOR_NN { - __device__ __forceinline__ float operator() (float x_resized, float x_scale, float, float, float, float) const { - return (x_resized + 0.5f) / x_scale; - } -}; - -struct TransformCoordinate_ALIGN_CORNERS { - __device__ __forceinline__ float operator() (float x_resized, float, float length_resized, float length_original, float, float) const { - return length_resized == 1 ? 0 : x_resized * (length_original - 1) / (length_resized - 1); - } -}; - -struct TransformCoordinate_TF_CROP_AND_RESIZE { - __device__ __forceinline__ float operator() (float x_resized, float, float length_resized, float length_original, float roi_start, float roi_end) const { - auto orig = length_resized > 1 - ? roi_start * (length_original - 1) + (x_resized * (roi_end - roi_start) * (length_original - 1)) / (length_resized - 1) - : 0.5 * (roi_start + roi_end) * (length_original - 1); - return static_cast(orig); - } -}; - #define CASE_TYPE_USING_HINT(enum_type, type, HINT, ...) \ case enum_type: { \ using HINT = type; \ @@ -95,20 +56,24 @@ struct TransformCoordinate_TF_CROP_AND_RESIZE { #define CASE_TYPE_COORD(enum_type, type, ...) \ CASE_TYPE_USING_HINT(enum_type, type, coord_t, __VA_ARGS__) -#define DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(TYPE, ...) 
\ - [&] { \ - const auto& the_type = TYPE; \ - /* don't use TYPE again in case it is an expensive or side-effect op */ \ - switch (the_type) { \ - CASE_TYPE_COORD(ResizeCoordinateTransformationMode::HALF_PIXEL, TransformCoordinate_HALF_PIXEL, __VA_ARGS__) \ - CASE_TYPE_COORD(ResizeCoordinateTransformationMode::ASYMMETRIC, TransformCoordinate_ASYMMETRIC, __VA_ARGS__) \ - CASE_TYPE_COORD(ResizeCoordinateTransformationMode::PYTORCH_HALF_PIXEL, TransformCoordinate_PYTORCH_HALF_PIXEL, __VA_ARGS__) \ - CASE_TYPE_COORD(ResizeCoordinateTransformationMode::ALIGN_CORNERS, TransformCoordinate_ALIGN_CORNERS, __VA_ARGS__) \ - CASE_TYPE_COORD(ResizeCoordinateTransformationMode::TF_HALF_PIXEL_FOR_NN, TransformCoordinate_TF_HALF_PIXEL_FOR_NN, __VA_ARGS__) \ - CASE_TYPE_COORD(ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE, TransformCoordinate_TF_CROP_AND_RESIZE, __VA_ARGS__) \ - default: \ - ORT_THROW("unknown ResizeCoordinateTransformationMode"); \ - } \ +#define DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(TYPE, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + switch (the_type) { \ + CASE_TYPE_COORD(ResizeCoordinateTransformationMode::HALF_PIXEL, TransformCoordinate_HALF_PIXEL, __VA_ARGS__) \ + CASE_TYPE_COORD(ResizeCoordinateTransformationMode::ASYMMETRIC, TransformCoordinate_ASYMMETRIC, __VA_ARGS__) \ + CASE_TYPE_COORD(ResizeCoordinateTransformationMode::PYTORCH_HALF_PIXEL, \ + TransformCoordinate_PYTORCH_HALF_PIXEL, __VA_ARGS__) \ + CASE_TYPE_COORD(ResizeCoordinateTransformationMode::ALIGN_CORNERS, \ + TransformCoordinate_ALIGN_CORNERS, __VA_ARGS__) \ + CASE_TYPE_COORD(ResizeCoordinateTransformationMode::TF_HALF_PIXEL_FOR_NN, \ + TransformCoordinate_TF_HALF_PIXEL_FOR_NN, __VA_ARGS__) \ + CASE_TYPE_COORD(ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE, \ + TransformCoordinate_TF_CROP_AND_RESIZE, __VA_ARGS__) \ + default: \ + ORT_THROW("unknown ResizeCoordinateTransformationMode"); \ + } \ }() #define CASE_TYPE_NEAREST(enum_type, type, ...) 
\ @@ -119,11 +84,11 @@ struct TransformCoordinate_TF_CROP_AND_RESIZE { const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ switch (the_type) { \ - CASE_TYPE_NEAREST(ResizeNearestMode::SIMPLE, NearestPixel_SIMPLE, __VA_ARGS__) \ + CASE_TYPE_NEAREST(ResizeNearestMode::SIMPLE, NearestPixel_SIMPLE, __VA_ARGS__) \ CASE_TYPE_NEAREST(ResizeNearestMode::ROUND_PREFER_FLOOR, NearestPixel_ROUND_PREFER_FLOOR, __VA_ARGS__) \ - CASE_TYPE_NEAREST(ResizeNearestMode::ROUND_PREFER_CEIL, NearestPixel_ROUND_PREFER_CEIL, __VA_ARGS__) \ - CASE_TYPE_NEAREST(ResizeNearestMode::FLOOR, NearestPixel_FLOOR, __VA_ARGS__) \ - CASE_TYPE_NEAREST(ResizeNearestMode::CEIL, NearestPixel_CEIL, __VA_ARGS__) \ + CASE_TYPE_NEAREST(ResizeNearestMode::ROUND_PREFER_CEIL, NearestPixel_ROUND_PREFER_CEIL, __VA_ARGS__) \ + CASE_TYPE_NEAREST(ResizeNearestMode::FLOOR, NearestPixel_FLOOR, __VA_ARGS__) \ + CASE_TYPE_NEAREST(ResizeNearestMode::CEIL, NearestPixel_CEIL, __VA_ARGS__) \ default: \ ORT_THROW("unknown ResizeNearestMode"); \ } \ @@ -151,10 +116,12 @@ __global__ void _ResizeNearestMappingKernel2D( // only apply co-ordinate transformation if scale != 1.0 if (scales_height == 1.0f) { - dims_mapping[id].extrapolate_ = 0; + dims_mapping[id].extrapolate_ = 0; } else { - float orig_coord = transform_coordinate(static_cast(dim), scales_height, static_cast(output_height), - static_cast(input_height), roi_start_height, roi_end_height); + float orig_coord = transform_coordinate(static_cast(dim), scales_height, + static_cast(output_height), + static_cast(input_height), + roi_start_height, roi_end_height); dims_mapping[id].extrapolate_ = static_cast( extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast(input_height - 1))); dim = calc_nearest_pixel(orig_coord, scales_height < 1); @@ -210,9 +177,12 @@ __global__ void _ResizeNearestMappingKernel( if (scales[axis] == 1.0f) { dims_mapping[id].extrapolate_ = 0; } else { - float orig_coord = transform_coordinate(static_cast(dim), scales[axis], static_cast(output_shape[axis]), + float orig_coord = transform_coordinate(static_cast(dim), scales[axis], + static_cast(output_shape[axis]), static_cast(input_shape[axis]), roi[axis], roi[axis + rank]); - dims_mapping[id].extrapolate_ = static_cast(extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast(input_shape[axis] - 1))); + dims_mapping[id].extrapolate_ = static_cast(extrapolation_enabled && + (orig_coord < 0.f || + orig_coord > static_cast(input_shape[axis] - 1))); dim = calc_nearest_pixel(orig_coord, scales[axis] < 1); if (dim >= input_shape[axis]) dim = input_shape[axis] - 1; if (dim < 0) dim = 0; @@ -293,21 +263,27 @@ __global__ void _ResizeBilinearCoordinateMapping( LinearMappingInfo* dims_mapping) { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumHW); if (id < output_height) { // y = id - float input_y = scale_height == 1 ? static_cast(id) : - transform_coordinate(static_cast(id), scale_height, - static_cast(output_height), static_cast(input_height), - roi_height_start, roi_height_end); - dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_y < 0 || input_y > static_cast(input_height - 1))); + float input_y = scale_height == 1 ? 
static_cast(id) + : transform_coordinate(static_cast(id), scale_height, + static_cast(output_height), + static_cast(input_height), + roi_height_start, roi_height_end); + dims_mapping[id].extrapolate_ = static_cast((extrapolation_enabled && + (input_y < 0 || + input_y > static_cast(input_height - 1)))); input_y = max(0.0f, min(input_y, static_cast(input_height - 1))); int y_int = static_cast(input_y); dims_mapping[id].origin_ = y_int; dims_mapping[id].weight_ = (y_int >= input_height - 1) ? 0.5f : input_y - y_int; - } else { //x = id - output_height - float input_x = scale_width == 1 ? static_cast(id - output_height) : - transform_coordinate(static_cast(id - output_height), scale_width, - static_cast(output_width), static_cast(input_width), - roi_width_start, roi_width_end); - dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || input_x > static_cast(input_width - 1))); + } else { // x = id - output_height + float input_x = scale_width == 1 ? static_cast(id - output_height) + : transform_coordinate(static_cast(id - output_height), + scale_width, static_cast(output_width), + static_cast(input_width), roi_width_start, + roi_width_end); + dims_mapping[id].extrapolate_ = static_cast((extrapolation_enabled && + (input_x < 0 || + input_x > static_cast(input_width - 1)))); input_x = max(0.0f, min(input_x, static_cast(input_width - 1))); int x_int = static_cast(input_x); dims_mapping[id].origin_ = x_int; @@ -371,32 +347,40 @@ __global__ void _ResizeTrilinearCoordinateMapping( LinearMappingInfo* dims_mapping) { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumDHW); if (id < output_depth) { // z = id - float input_z = scale_depth == 1 ? static_cast(id) : - transform_coordinate(static_cast(id), scale_depth, - static_cast(output_depth), static_cast(input_depth), - roi_depth_start, roi_depth_end); - dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_z < 0 || input_z > static_cast(input_depth - 1))); + float input_z = scale_depth == 1 ? static_cast(id) + : transform_coordinate(static_cast(id), scale_depth, + static_cast(output_depth), + static_cast(input_depth), + roi_depth_start, roi_depth_end); + dims_mapping[id].extrapolate_ = static_cast((extrapolation_enabled && + (input_z < 0 || + input_z > static_cast(input_depth - 1)))); input_z = max(0.0f, min(input_z, static_cast(input_depth - 1))); int z_int = static_cast(input_z); dims_mapping[id].origin_ = z_int; dims_mapping[id].weight_ = (z_int >= input_depth - 1) ? 0.5f : input_z - z_int; } else if (id >= output_depth && id < (output_depth + output_height)) { // y = id - output_depth - float input_y = scale_height == 1 ? static_cast(id - output_depth) : - transform_coordinate(static_cast(id - output_depth), scale_height, - static_cast(output_height), static_cast(input_height), - roi_height_start, roi_height_end); - - dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_y < 0 || input_y > static_cast(input_height - 1))); + float input_y = scale_height == 1 ? static_cast(id - output_depth) + : transform_coordinate(static_cast(id - output_depth), + scale_height, static_cast(output_height), + static_cast(input_height), + roi_height_start, roi_height_end); + + dims_mapping[id].extrapolate_ = static_cast((extrapolation_enabled && + (input_y < 0 || + input_y > static_cast(input_height - 1)))); input_y = max(0.0f, min(input_y, static_cast(input_height - 1))); int y_int = static_cast(input_y); dims_mapping[id].origin_ = y_int; dims_mapping[id].weight_ = (y_int >= input_height - 1) ? 
0.5f : input_y - y_int; - } else { //x = id - output_depth - output_height - float input_x = scale_width == 1 ? static_cast(id - output_depth - output_height) : - transform_coordinate(static_cast(id - output_depth - output_height), scale_width, - static_cast(output_width), static_cast(input_width), - roi_width_start, roi_width_end); - dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || input_x > static_cast(input_width - 1))); + } else { // x = id - output_depth - output_height + float input_x = scale_width == 1 ? static_cast(id - output_depth - output_height) + : transform_coordinate(static_cast(id - output_depth - output_height), + scale_width, static_cast(output_width), + static_cast(input_width), + roi_width_start, roi_width_end); + dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || + input_x > static_cast(input_width - 1))); input_x = max(0.0f, min(input_x, static_cast(input_width - 1))); int x_int = static_cast(input_x); dims_mapping[id].origin_ = x_int; @@ -513,21 +497,33 @@ __global__ void _ResizeCubicCoordinateMapping( int max_input_coord = static_cast(is_y_axis ? input_height : input_width); float scale = is_y_axis ? scale_height : scale_width; - float input_coordinat = scale == 1 ? (is_y_axis ? id : id - output_height) : - transform_coordinate( - static_cast(is_y_axis ? id : id - output_height), - scale, - static_cast(is_y_axis ? output_height : output_width), - static_cast(max_input_coord), - (is_y_axis ? roi_height_start : roi_width_start), - (is_y_axis ? roi_height_end : roi_width_end)); + float input_coordinat = scale == 1 ? (is_y_axis ? id : id - output_height) + : transform_coordinate( + static_cast(is_y_axis ? id : id - output_height), + scale, + static_cast(is_y_axis ? output_height : output_width), + static_cast(max_input_coord), + (is_y_axis ? roi_height_start : roi_width_start), + (is_y_axis ? roi_height_end : roi_width_end)); int coord_int = static_cast(_Floor(input_coordinat)); float s_coord = abs(input_coordinat - coord_int); float coeff_sum = 1.0f; - float coeff_0 = static_cast(((cubic_coeff_a * (s_coord + 1) - 5 * cubic_coeff_a) * (s_coord + 1) + 8 * cubic_coeff_a) * (s_coord + 1) - 4 * cubic_coeff_a); - float coeff_1 = static_cast(((cubic_coeff_a + 2) * s_coord - (cubic_coeff_a + 3)) * s_coord * s_coord + 1); - float coeff_2 = static_cast(((cubic_coeff_a + 2) * (1 - s_coord) - (cubic_coeff_a + 3)) * (1 - s_coord) * (1 - s_coord) + 1); - float coeff_3 = static_cast(((cubic_coeff_a * (2 - s_coord) - 5 * cubic_coeff_a) * (2 - s_coord) + 8 * cubic_coeff_a) * (2 - s_coord) - 4 * cubic_coeff_a); + float coeff_0 = static_cast(((cubic_coeff_a * (s_coord + 1) - 5 * cubic_coeff_a) * + (s_coord + 1) + + 8 * cubic_coeff_a) * + (s_coord + 1) - + 4 * cubic_coeff_a); + float coeff_1 = static_cast(((cubic_coeff_a + 2) * s_coord - (cubic_coeff_a + 3)) * + s_coord * s_coord + + 1); + float coeff_2 = static_cast(((cubic_coeff_a + 2) * (1 - s_coord) - (cubic_coeff_a + 3)) * + (1 - s_coord) * (1 - s_coord) + + 1); + float coeff_3 = static_cast(((cubic_coeff_a * (2 - s_coord) - 5 * cubic_coeff_a) * + (2 - s_coord) + + 8 * cubic_coeff_a) * + (2 - s_coord) - + 4 * cubic_coeff_a); if (exclude_outside) { coeff_0 = (coord_int - 1 < 0 || coord_int - 1 >= max_input_coord) ? 0.0 : coeff_0; coeff_1 = (coord_int + 0 < 0 || coord_int + 0 >= max_input_coord) ? 
0.0 : coeff_1; @@ -540,7 +536,8 @@ __global__ void _ResizeCubicCoordinateMapping( dm.coeff1_ = coeff_1 / coeff_sum; dm.coeff2_ = coeff_2 / coeff_sum; dm.coeff3_ = coeff_3 / coeff_sum; - dm.extrapolate_ = (int)(extrapolation_enabled && (input_coordinat < 0 || input_coordinat > static_cast(max_input_coord - 1))); + dm.extrapolate_ = (int)(extrapolation_enabled && (input_coordinat < 0 || + input_coordinat > static_cast(max_input_coord - 1))); } template @@ -569,21 +566,30 @@ __global__ void _ResizeBiCubicKernel( int x_int = x_info.origin_; int y_int = y_info.origin_; const T* image = input_data + input_index; - output_data[id] = y_info.coeff0_ * CubicInterpolationRowwise(image, x_int, y_int - 1, input_height, input_width, w0, w1, w2, w3) + - y_info.coeff1_ * CubicInterpolationRowwise(image, x_int, y_int, input_height, input_width, w0, w1, w2, w3) + - y_info.coeff2_ * CubicInterpolationRowwise(image, x_int, y_int + 1, input_height, input_width, w0, w1, w2, w3) + - y_info.coeff3_ * CubicInterpolationRowwise(image, x_int, y_int + 2, input_height, input_width, w0, w1, w2, w3); + output_data[id] = y_info.coeff0_ * + CubicInterpolationRowwise(image, x_int, y_int - 1, input_height, input_width, w0, w1, w2, w3) + + y_info.coeff1_ * + CubicInterpolationRowwise(image, x_int, y_int, input_height, input_width, w0, w1, w2, w3) + + y_info.coeff2_ * + CubicInterpolationRowwise(image, x_int, y_int + 1, input_height, input_width, w0, w1, w2, w3) + + y_info.coeff3_ * + CubicInterpolationRowwise(image, x_int, y_int + 2, input_height, input_width, w0, w1, w2, w3); } size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode, const gsl::span& output_dims) { switch (upsample_mode) { case UpsampleMode::NN: - return sizeof(int64_t) * output_dims.size() + sizeof(NearestMappingInfo) * static_cast(std::accumulate(output_dims.begin(), output_dims.end(), (int64_t)0)); + return sizeof(int64_t) * output_dims.size() + + sizeof(NearestMappingInfo) * + static_cast(std::accumulate(output_dims.begin(), + output_dims.end(), (int64_t)0)); case UpsampleMode::LINEAR: - return sizeof(LinearMappingInfo) * static_cast(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0)); + return sizeof(LinearMappingInfo) * + static_cast(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0)); case UpsampleMode::CUBIC: - return sizeof(CubicMappingInfo) * static_cast(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0)); + return sizeof(CubicMappingInfo) * + static_cast(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0)); } return 0; } @@ -616,7 +622,8 @@ void ResizeNearestImpl( if (could2d) { int64_t output_height = output_shape[rank - 2]; int64_t output_width = output_shape[rank - 1]; - fast_divmod div_output_image = (rank > 2) ? output_div_pitches[rank - 3] : fast_divmod(static_cast(output_height * output_width)); + fast_divmod div_output_image = (rank > 2) ? 
output_div_pitches[rank - 3] + : fast_divmod(static_cast(output_height * output_width)); int blocksPerDimsMappingGrid = static_cast(ceil((output_height + output_width) / 32.0)); DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(transform_coordinate, [&]() { @@ -694,13 +701,6 @@ void ResizeImpl( ResizeCoordinateTransformationMode coordinate_transform_mode, ResizeNearestMode nearest_mode, void* dims_mapping) { - bool isSame = std::all_of(scales_vals.Data(), scales_vals.Data() + rank, [](float v) { return v == 1.0f; }) && - (coordinate_transform_mode != ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE); - if (isSame) { - CUDA_CALL_THROW(cudaMemcpyAsync(output_data, input_data, N * sizeof(T), cudaMemcpyDeviceToDevice, stream)); - return; - } - if (upsample_mode == UpsampleMode::NN) { ResizeNearestImpl( stream, rank, input_shape, output_shape, input_strides, output_div_pitches, @@ -761,7 +761,7 @@ void ResizeImpl( } else if (is_3D) { DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(coordinate_transform_mode, [&]() { _ResizeTrilinearCoordinateMapping<<>>( - input_shape[rank - 3] , input_shape[rank - 2], input_shape[rank - 1], + input_shape[rank - 3], input_shape[rank - 2], input_shape[rank - 1], output_depth, output_height, output_width, scales_vals[rank - 3], scales_vals[rank - 2], scales_vals[rank - 1], roi_vals[rank - 3], roi_vals[rank - 3 + rank], @@ -778,7 +778,7 @@ void ResizeImpl( reinterpret_cast(dims_mapping)); return; } - ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize"); + ORT_THROW("Resize support 2-D and 3-D dimensions in LINEAR mode."); break; case UpsampleMode::CUBIC: if (is_2D) { @@ -801,7 +801,7 @@ void ResizeImpl( reinterpret_cast(dims_mapping)); return; } - ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize"); + ORT_THROW("Resize supports only 2-D in CUBIC mode."); case UpsampleMode::NN: ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize"); } @@ -809,7 +809,7 @@ void ResizeImpl( #define SPECIALIZED_IMPL(T) \ template void ResizeImpl( \ - cudaStream_t stream, \ + cudaStream_t stream, \ const UpsampleMode upsample_mode, \ const int rank, \ TArray& input_shape, \ diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.h b/onnxruntime/core/providers/cuda/tensor/resize_impl.h index d459dbff18d3e..ad06eebb9efb1 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.h @@ -2,15 +2,69 @@ // Licensed under the MIT License. #pragma once + #include + +#include + #include "core/providers/cuda/shared_inc/cuda_utils.h" #include "core/common/common.h" #include "core/providers/cpu/tensor/upsamplebase.h" #include "core/providers/cuda/cuda_common.h" namespace onnxruntime { +template <> +struct AccumulateType { + using type = float; +}; namespace cuda { +struct TransformCoordinate_ASYMMETRIC { + __device__ __host__ __forceinline__ float operator()(float x_resized, float x_scale, + float, float, float, float) const { + return x_resized / x_scale; + } +}; + +struct TransformCoordinate_HALF_PIXEL { + __device__ __host__ __forceinline__ float operator()(float x_resized, float x_scale, + float, float, float, float) const { + return ((x_resized + 0.5f) / x_scale) - 0.5f; + } +}; + +struct TransformCoordinate_PYTORCH_HALF_PIXEL { + __device__ __host__ __forceinline__ float operator()(float x_resized, float x_scale, float length_resized, float, + float, float) const { + return length_resized > 1 ? 
(x_resized + 0.5f) / x_scale - 0.5f : 0.0f; + } +}; + +struct TransformCoordinate_TF_HALF_PIXEL_FOR_NN { + __device__ __host__ __forceinline__ float operator()(float x_resized, float x_scale, + float, float, float, float) const { + return (x_resized + 0.5f) / x_scale; + } +}; + +struct TransformCoordinate_ALIGN_CORNERS { + __device__ __host__ __forceinline__ float operator()(float x_resized, float, float length_resized, + float length_original, float, float) const { + return length_resized == 1 ? 0 : x_resized * (length_original - 1) / (length_resized - 1); + } +}; + +struct TransformCoordinate_TF_CROP_AND_RESIZE { + __device__ __host__ __forceinline__ float operator()(float x_resized, float, float length_resized, + float length_original, float roi_start, float roi_end) const { + auto orig = length_resized > 1 + ? roi_start * (length_original - 1) + + (x_resized * (roi_end - roi_start) * (length_original - 1)) / (length_resized - 1) + : 0.5 * (roi_start + roi_end) * (length_original - 1); + return static_cast(orig); + } +}; + size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode, const gsl::span& output_dims); @@ -36,5 +90,62 @@ void ResizeImpl( onnxruntime::ResizeNearestMode nearest_mode, void* dims_mapping); +using TempSpaceAllocateFunc = std::function(size_t buffer_size)>; + +template +void ResizeAntiAliasImpl( + cudaStream_t stream, + int rank, + const UpsampleMode upsample_mode, + ResizeCoordinateTransformationMode coordinate_transform_mode, + gsl::span input_shape, + gsl::span output_shape, + int64_t batch_size, int64_t num_channels, + std::tuple inferred_input_dims, + std::tuple inferred_output_dims, + std::tuple inferred_dim_rscales, + const TArray& output_div_pitches, + gsl::span roi_vals, // CPU + const std::optional& extrapolation_value, + bool exclude_outside, + TempSpaceAllocateFunc allocate_temp_space, + const uint8_t* clip8_lookups, + const T* input_data, + T* output_data, + const size_t N); + +/// +/// Compute scaled support value for a given dimension inverse scale +/// +/// Support value from parameters +/// inverse scale value comes from input/attr for +/// +inline float ComputeScaledSupportValue(float support_value, float rscale) { + const float scale = 1.0f / rscale; + float scaled_support = (scale >= 1.0f) ? (support_value * 0.5f) * scale : support_value * 0.5f; + return scaled_support; +} + +/// +/// Compute window size for a given dimension scaled support value. +/// +/// +/// +inline int32_t ComputeWindowSize(float scaled_support) { + SafeInt window_size(ceilf(scaled_support)); + return window_size * 2 + 1; +} + +/// +/// Computes scale buffer size in number of elements for allocation purposes. +/// +/// +/// +/// Number of elements to fit in the buffer +inline SafeInt ComputeWeightedCoeffBufferSize(int64_t output_size, int32_t window_size) { + SafeInt buffer_size(output_size); + return buffer_size * window_size; +} + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/upsample.cc b/onnxruntime/core/providers/cuda/tensor/upsample.cc index ae12ca328bc7c..17533eb3d9a72 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample.cc +++ b/onnxruntime/core/providers/cuda/tensor/upsample.cc @@ -2,6 +2,9 @@ // Licensed under the MIT License. 
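A worked example of the sizing helpers added to resize_impl.h above, assuming that header is included. For a 4x downscale of one axis, the per-dimension scale passed down from Upsample::BaseCompute is 0.25, so the bilinear filter support widens from 0.5 to 2.0 input pixels per side and the window grows to 5 taps; bicubic (support value 2.0) works out to 9 taps the same way.

```cpp
#include <cstdint>

void AntiAliasSizingExample() {
  const float scale = 0.25f;                                            // output / input
  const float scaled_support = ComputeScaledSupportValue(1.0f, scale);  // (1.0 * 0.5) * 4 = 2.0
  const int32_t window_size = ComputeWindowSize(scaled_support);        // ceil(2.0) * 2 + 1 = 5
  const int64_t output_size = 64;                                       // e.g. 256 -> 64 pixels
  const int64_t weight_elems = ComputeWeightedCoeffBufferSize(output_size, window_size);  // 320
  (void)weight_elems;  // one AccumType entry per (output pixel, tap) pair
}
```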
#include "upsample.h" + +#include + #include "upsample_impl.h" #include "core/providers/cuda/tensor/resize_impl.h" #include "core/providers/cpu/tensor/utils.h" @@ -37,11 +40,23 @@ REGISTER_VERSIONED_TYPED_KERNEL(MLFloat16, 9, 9); REGISTER_VERSIONED_TYPED_KERNEL(int32_t, 9, 9); REGISTER_VERSIONED_TYPED_KERNEL(uint8_t, 9, 9); +template +Upsample::Upsample(const OpKernelInfo& info) : UpsampleBase(info), CudaKernel(info) { + if (UpsampleBase::antialias_) { + // Copy the table on DEVICE + const uint8_t* lookup_table = GetLookupTableShared(); + auto alloc = info.GetAllocator(OrtMemTypeDefault); + shared_lookup_table_ondevice_ = IAllocator::MakeUniquePtr(std::move(alloc), kLookupTableSize); + CUDA_CALL_THROW(cudaMemcpyAsync(shared_lookup_table_ondevice_.get(), lookup_table, kLookupTableSize, + cudaMemcpyHostToDevice, nullptr)); + } +} + template Status Upsample::BaseCompute(OpKernelContext* context, - const std::vector& roi, - const std::vector& scales, - const gsl::span& output_dims) const { + gsl::span roi, + gsl::span scales, + gsl::span output_dims) const { const Tensor* X = context->Input(0); auto X_dims = X->Shape().GetDims(); int32_t rank = static_cast(X_dims.size()); @@ -52,7 +67,8 @@ Status Upsample::BaseCompute(OpKernelContext* context, is_resize_ ? "Resize: input tensor cannot be scalar." : "Upsample: input tensor cannot be scalar."); if (rank != static_cast(scales.size())) return Status(ONNXRUNTIME, INVALID_ARGUMENT, - is_resize_ ? "Resize: input tensor's dimension does not match the scales." : "Upsample: input tensor's dimension does not match the scales."); + is_resize_ ? "Resize: input tensor's dimension does not match the scales." + : "Upsample: input tensor's dimension does not match the scales."); if (roi.size() != 2 * X_dims.size()) return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Resize: size of roi array should be 2 * N where N is the rank of input tensor X."); @@ -79,22 +95,194 @@ Status Upsample::BaseCompute(OpKernelContext* context, size_t output_count = Y->Shape().Size(); if (is_resize_) { - TArray input_shape(X_dims); - TArray output_shape(output_dims); - TArray roi_vals(roi); - TArray scales_vals(scales); - - size_t temp_buffer_size = CalcResizeBufferSize(mode_, output_dims); - auto dims_mapping_buffer = GetScratchBuffer(temp_buffer_size, context->GetComputeStream()); - void* dims_mapping = reinterpret_cast(dims_mapping_buffer.get()); - ResizeImpl(Stream(context), mode_, (int)rank, input_shape, output_shape, - input_strides, output_div_pitches, scales_vals, roi_vals, - reinterpret_cast(X->Data()), - reinterpret_cast(Y->MutableData()), - output_count, use_extrapolation_, ToCudaType::FromFloat(extrapolation_value_), - cubic_coeff_a_, exclude_outside_, - coordinate_transform_mode_, nearest_mode_, - dims_mapping); + const bool is_same = std::all_of(scales.begin(), scales.end(), [](float v) { return v == 1.0f; }) && + (coordinate_transform_mode_ != ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE); + if (is_same) { + CUDA_CALL_THROW(cudaMemcpyAsync(Y->MutableData(), X->Data(), + output_count * sizeof(T), cudaMemcpyDeviceToDevice, Stream(context))); + return Status::OK(); + } + + if (antialias_) { + TempSpaceAllocateFunc allocate_temp_space = [&](size_t bytes_size) { + return GetScratchBuffer(bytes_size, context->GetComputeStream()); + }; + + std::optional extrapolation_value; + if (use_extrapolation_) + extrapolation_value.emplace(extrapolation_value_); + + switch (mode_) { + case UpsampleMode::LINEAR: { + if (X_dims.size() == 2 || X_dims.size() == 4) { + const bool is_2D 
= X_dims.size() == 2; + + int64_t batch_size = 1; + int64_t num_channels = 1; + + int64_t input_height; + int64_t input_width; + + int64_t output_height; + int64_t output_width; + + float height_scale; + float width_scale; + + if (is_2D) { + input_height = X_dims[0]; + input_width = X_dims[1]; + + output_height = output_dims[0]; + output_width = output_dims[1]; + + height_scale = scales[0]; + width_scale = scales[1]; + } else { + if (scales[0] == 1.0f && scales[1] == 1.0f) { + batch_size = X_dims[Channels::N]; + num_channels = X_dims[Channels::C]; + input_height = X_dims[Channels::H]; + input_width = X_dims[Channels::W]; + + output_height = output_dims[Channels::H]; + output_width = output_dims[Channels::W]; + + height_scale = scales[2]; + width_scale = scales[3]; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "Resize", ": NHWC is not supported yet"); + } + } + + ResizeAntiAliasImpl(Stream(context), + rank, + mode_, + coordinate_transform_mode_, + X_dims, output_dims, + batch_size, num_channels, + std::make_tuple(0, input_height, input_width), + std::make_tuple(0, output_height, output_width), + std::make_tuple(0.f, height_scale, width_scale), + output_div_pitches, + roi, + extrapolation_value, + exclude_outside_, + allocate_temp_space, + shared_lookup_table_ondevice_.get(), + reinterpret_cast(X->Data()), + reinterpret_cast(Y->MutableData()), + output_count); + + } else if (X_dims.size() == 3 || X_dims.size() == 5) { + const bool is_3D = X_dims.size() == 3; + + if (!is_3D) { + if (!(scales[0] == 1.0f && scales[1] == 1.0f)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "Resize", ": NDHWC is not supported yet"); + } + } + + const int64_t batch_size = is_3D ? 1 : X_dims[0]; + const int64_t num_channels = is_3D ? 1 : X_dims[1]; + const int64_t input_depth = is_3D ? X_dims[0] : X_dims[2]; + const int64_t input_height = is_3D ? X_dims[1] : X_dims[3]; + const int64_t input_width = is_3D ? X_dims[2] : X_dims[4]; + + const int64_t output_depth = is_3D ? output_dims[0] : output_dims[2]; + const int64_t output_height = is_3D ? output_dims[1] : output_dims[3]; + const int64_t output_width = is_3D ? output_dims[2] : output_dims[4]; + + const float depth_scale = is_3D ? scales[0] : scales[2]; + const float height_scale = is_3D ? scales[1] : scales[3]; + const float width_scale = is_3D ? scales[2] : scales[4]; + + ResizeAntiAliasImpl(Stream(context), + rank, + mode_, + coordinate_transform_mode_, + X_dims, output_dims, + batch_size, num_channels, + std::make_tuple(input_depth, input_height, input_width), + std::make_tuple(output_depth, output_height, output_width), + std::make_tuple(depth_scale, height_scale, width_scale), + output_div_pitches, + roi, + extrapolation_value, + exclude_outside_, + allocate_temp_space, + shared_lookup_table_ondevice_.get(), + reinterpret_cast(X->Data()), + reinterpret_cast(Y->MutableData()), + output_count); + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "Resize", + ": 'Linear' mode only support 2-D inputs or 3-D inputs ('Bilinear', 'Trilinear') " + "or 4-D inputs or 5-D inputs with the corresponding outermost 2 scale values " + "being 1."); + } + } break; + case UpsampleMode::CUBIC: { + if (X_dims.size() != 2 && X_dims.size() != 4) { + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "Resize", + ": 'Cubic' mode only support 2-D inputs ('Bicubic') or 4-D inputs " + "with the corresponding outermost 2 scale values being 1."); + } + + const bool is_2D = X_dims.size() == 2; + const bool is_nchw = is_2D ? 
true : (scales[1] == 1.0f && scales[1] == 1.0f); + + ORT_RETURN_IF_NOT(is_nchw, + "Resize 'Cubic' mode only supports NCWH layout " + " with 2-D or 4-D with leading dims equal to 1"); + + const int64_t batch_size = is_2D ? 1 : X_dims[Channels::N]; + const int64_t num_channels = is_2D ? 1 : X_dims[Channels::C]; + const int64_t input_height = is_2D ? X_dims[0] : X_dims[Channels::H]; + const int64_t input_width = is_2D ? X_dims[1] : X_dims[Channels::W]; + + const int64_t output_height = is_2D ? output_dims[0] : output_dims[Channels::H]; + const int64_t output_width = is_2D ? output_dims[1] : output_dims[Channels::W]; + const float height_scale = is_2D ? scales[0] : scales[2]; + const float width_scale = is_2D ? scales[1] : scales[3]; + + ResizeAntiAliasImpl(Stream(context), rank, mode_, coordinate_transform_mode_, + X_dims, output_dims, + batch_size, num_channels, + std::make_tuple(0, input_height, input_width), + std::make_tuple(0, output_height, output_width), + std::make_tuple(0.f, height_scale, width_scale), + output_div_pitches, + roi, + extrapolation_value, + exclude_outside_, + allocate_temp_space, + shared_lookup_table_ondevice_.get(), + reinterpret_cast(X->Data()), + reinterpret_cast(Y->MutableData()), + output_count); + } break; + default: + return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Resize: unexpected mode"); + } + } else { + TArray input_shape(X_dims); + TArray output_shape(output_dims); + TArray roi_vals(roi); + TArray scales_vals(scales); + + size_t temp_buffer_size = CalcResizeBufferSize(mode_, output_dims); + auto dims_mapping_buffer = GetScratchBuffer(temp_buffer_size, context->GetComputeStream()); + void* dims_mapping = reinterpret_cast(dims_mapping_buffer.get()); + ResizeImpl(Stream(context), mode_, rank, input_shape, output_shape, + input_strides, output_div_pitches, scales_vals, roi_vals, + reinterpret_cast(X->Data()), + reinterpret_cast(Y->MutableData()), + output_count, use_extrapolation_, ToCudaType::FromFloat(extrapolation_value_), + cubic_coeff_a_, exclude_outside_, + coordinate_transform_mode_, nearest_mode_, + dims_mapping); + } } else { TArray scales_div(rank); @@ -124,7 +312,7 @@ Status Upsample::ComputeInternal(OpKernelContext* context) const { auto input_dims = X->Shape().GetDims(); TensorShapeVector output_dims(input_dims.size()); - std::vector roi_array(input_dims.size() * 2, 0.0f); + InlinedVector roi_array(input_dims.size() * 2, 0.0f); if (!roi_cached_) { bool use_default_roi = true; if (need_roi_input_) { @@ -147,29 +335,37 @@ Status Upsample::ComputeInternal(OpKernelContext* context) const { } } - const std::vector& roi = roi_cached_ ? 
roi_ : roi_array; - std::vector scales_array = scales_; + ComputeROIWithAxes(roi_array, input_dims.size()); + InlinedVector scales_array(input_dims.size()); + // opset < 10 if (OpKernel::Node().InputDefs().size() == 1) { - // Compute output shape from scales and input dims + // Compute output shape from scales attributes and input dims + scales_array = scales_; + ComputeOutputShape(scales_array, input_dims, output_dims); - return BaseCompute(context, roi, scales_, output_dims); + return BaseCompute(context, roi_array, scales_, output_dims); } const Tensor* scales = context->Input(scales_input_idx_); const Tensor* sizes = context->Input(sizes_input_idx_); + // This is when scales are obtained and cached from a constant initializer if (scales_cached_) { - ORT_ENFORCE(sizes == nullptr, "Only one of scales or sizes must be provided as input."); + ORT_RETURN_IF_NOT(sizes == nullptr, "Only one of scales or sizes must be provided as input."); + scales_array = scales_; + // Compute output shape from scales and input dims ComputeOutputShape(scales_array, input_dims, output_dims); - return BaseCompute(context, roi, scales_, output_dims); + return BaseCompute(context, roi_array, scales_array, output_dims); } - scales_array.resize((input_dims.size())); + // Scales and sizes are input to the node if (scales != nullptr && scales->Shape().Size() != 0) { // use scales input data ORT_ENFORCE(sizes == nullptr, "Only one of scales or sizes must be provided as input."); ORT_RETURN_IF_ERROR(ParseScalesData(scales, scales_array, input_dims.size())); + + // Compute output shape from scales and input dims ComputeOutputShape(scales_array, input_dims, output_dims); } else { // When sizes input is available directly populate it into the output_dims array. @@ -179,7 +375,7 @@ Status Upsample::ComputeInternal(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(ParseScalesDataAndAdjustOutputSize(output_dims, input_dims, scales_array)); } - return BaseCompute(context, roi, scales_array, output_dims); + return BaseCompute(context, roi_array, scales_array, output_dims); } } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/tensor/upsample.h b/onnxruntime/core/providers/cuda/tensor/upsample.h index 7bf2a23ede399..50597e0fba1b9 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample.h +++ b/onnxruntime/core/providers/cuda/tensor/upsample.h @@ -13,12 +13,14 @@ namespace cuda { template class Upsample : public UpsampleBase, public CudaKernel { public: - Upsample(const OpKernelInfo& info) : UpsampleBase(info), CudaKernel(info) { - } + explicit Upsample(const OpKernelInfo& info); Status ComputeInternal(OpKernelContext* context) const override; - Status BaseCompute(OpKernelContext* context, const std::vector& roi, const std::vector& scales, - const gsl::span& output_dims) const; + Status BaseCompute(OpKernelContext* context, gsl::span roi, gsl::span scales, + gsl::span output_dims) const; + + private: + IAllocatorUniquePtr shared_lookup_table_ondevice_; }; } // namespace cuda diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index 3fd5423681b81..0265c06b9a938 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -1145,11 +1145,11 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, MLFloat16, ReduceSumSquare); class 
ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, int64_t, GatherND); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, Dropout); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, float, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, double, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, MLFloat16, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, int32_t, Resize); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, uint8_t, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 17, float, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 17, double, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 17, MLFloat16, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 17, int32_t, Resize); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 17, uint8_t, Resize); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 18, If); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, 18, Loop); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 13, Flatten); @@ -1304,6 +1304,11 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, MLFloat16, Pad); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, bool, Pad); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, ScatterElements); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, float, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, double, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, MLFloat16, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, int32_t, Resize); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, uint8_t, Resize); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 18, Split); // Opset 19 @@ -2081,11 +2086,16 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2240,6 +2250,16 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, // Opset 19 diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc 
b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index da17135878fe5..7b73ab36b3742 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -24,6 +24,7 @@ #include "core/providers/cpu/tensor/size.h" #include "core/providers/cpu/tensor/scatter_nd.h" #include "core/providers/cpu/tensor/unsqueeze.h" +#include "core/providers/cpu/tensor/upsamplebase.h" #include "core/providers/cpu/tensor/tile.h" #ifndef DISABLE_CONTRIB_OPS @@ -572,6 +573,11 @@ std::unique_ptr> EinsumTypedComputeProcessor template <> std::unique_ptr> EinsumTypedComputeProcessor::Create(OpKernelContext* context, AllocatorPtr allocator, concurrency::ThreadPool* tp, EinsumComputePreprocessor& einsum_compute_preprocessor, void* einsum_cuda_assets) { return g_host_cpu.EinsumTypedComputeProcessor_MLFloat16__Create(context, allocator, tp, einsum_compute_preprocessor, einsum_cuda_assets); } +void UpsampleBase::AdjustOutputSizeAsPolicy(TensorShapeVector& output_dims, gsl::span input_dims, + InlinedVector& scales) const { + g_host_cpu.UpsampleBase__AdjustOutputSizeAsPolicy(this, output_dims, input_dims, scales); +} + #ifndef DISABLE_CONTRIB_OPS namespace contrib { Status embed_layer_norm::CheckInputs(const OpKernelContext* context, bool quantizedVersion) { @@ -648,7 +654,6 @@ Status Sampling::SetupSubgraphExecutionInfo(const SessionState& session_state, c const SessionState& subgraph_session_state) { return g_host_cpu.Sampling__SetupSubgraphExecutionInfo(this, session_state, attribute_name, subgraph_session_state); } - } // namespace transformers #ifdef ENABLE_ATEN diff --git a/onnxruntime/core/providers/xnnpack/tensor/resize.cc b/onnxruntime/core/providers/xnnpack/tensor/resize.cc index 0c9e2e9fc17a2..09666c8039402 100644 --- a/onnxruntime/core/providers/xnnpack/tensor/resize.cc +++ b/onnxruntime/core/providers/xnnpack/tensor/resize.cc @@ -288,7 +288,7 @@ Status Resize::Compute(OpKernelContext* ctx) const { // Get scales data const auto* scales = ctx->Input(scales_input_idx_); - std::vector scales_array(X->Shape().GetDims().size()); + InlinedVector scales_array(X->Shape().GetDims().size()); if (scales != nullptr && scales->Shape().Size() != 0) { ORT_RETURN_IF_ERROR(ParseScalesData(scales, scales_array, output_shape.size())); diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc index 10f02349a24d5..1d31f3fdb4eb4 100644 --- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc @@ -11,7 +11,8 @@ namespace test { TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_tf_crop_and_resize) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 0.20000028610229492, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] " + << "is 0.20000028610229492, which exceeds threshold"; } OpTester test("Resize", 13); @@ -32,7 +33,8 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_tf_crop_and_resize) { test.AddInput("X", {H, W}, X); test.AddInput("roi", {4}, roi); - test.AddInput("", {0}, scales); // opset13 requires either 'sizes' or 'scales' must be provided, but not both of them + // opset13 requires either 'sizes' or 'scales' must be provided, but not both of them + 
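// Note on the pattern below: in opset 13 the optional 'scales' input still occupies an input slot,
// so the test binds an empty tensor to it and lets the 'sizes' input drive the output shape;
// supplying real data for both would trip the kernel's
// "Only one of scales or sizes must be provided as input" check.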
test.AddInput("", {0}, scales); test.AddInput("sizes", {2}, sizes); std::vector Y = {7.600004f, 7.9f, 8.2f, @@ -188,7 +190,9 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_e // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch // DML: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kDmlExecutionProvider}); + test.Run( + OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kRocmExecutionProvider, kDmlExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_extrapolation_int8) { @@ -317,7 +321,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_int8) { // The output size is [1,1,2,4].*[1,1,0.6,0.6]=[1,1,1,2] // NNAPI will recaluclate the scales as the output size divided by input size // scales = [1,1,1,2]./[1,1,2,4] = [1,1,0.5,0.5] -// See, https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/reference_ops.h +// See:https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/reference_ops.h // So the result of the above example will be different than CPU EP // Add the following 2 tests to test with scales valid to NNAPI TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1) { @@ -475,7 +479,8 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_align_corners_int TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_pytorch_half_pixel) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 1.5000001192092896, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: " + << " The difference between expected[i] and output[i] is 1.5000001192092896, which exceeds threshold"; } OpTester test("Resize", 13); @@ -533,7 +538,8 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixe // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch // DML: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kDmlExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kRocmExecutionProvider, kDmlExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixel_int8) { @@ -721,7 +727,8 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_align_corners) { TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_3DTrilinear_pytorch_half_pixel) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 1.5000001192092896, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: " + << "The difference between expected[i] and output[i] is 1.5000001192092896, which exceeds threshold"; } OpTester test("Resize", 13); @@ -1088,7 +1095,8 @@ TEST(ResizeOpTest, ResizeOpNearestUpSample_Floor_Align_Corners) { TEST(ResizeOpTest, ResizeOpNearest_OneToOneMappingBetweenInputAndOutputDataDims) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference 
between expected[i] and output[i] is 3, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: " + << "The difference between expected[i] and output[i] is 3, which exceeds threshold"; } OpTester test("Resize", 12); // tf_half_pixel_for_nn is deprecated since opset 13 @@ -1480,7 +1488,8 @@ TEST(ResizeOpTest, ResizeOpCubicUpSampleTest_tf_half_pixel_for_nn) { TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_Ver10) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 1.6666665077209473, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: " + << "The difference between expected[i] and output[i] is 1.6666665077209473, which exceeds threshold"; } OpTester test("Resize", 10); @@ -1505,7 +1514,8 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_Ver10) { TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_Ver10) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 1.6666665077209473, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: " + << "The difference between expected[i] and output[i] is 1.6666665077209473, which exceeds threshold "; } OpTester test("Resize", 10); @@ -1530,7 +1540,8 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_Ver10) { TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_Ver10) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 0.5, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: " + << "The difference between expected[i] and output[i] is 0.5, which exceeds threshold"; } OpTester test("Resize", 10); @@ -1565,7 +1576,8 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_Ver10) { TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_Ver10) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 0.5, which exceeds threshold"; + GTEST_SKIP() << "Skipping because of the following error: " + << "The difference between expected[i] and output[i] is 0.5, which exceeds threshold"; } OpTester test("Resize", 10); @@ -1676,7 +1688,8 @@ TEST(UpsampleOpTest, ResizeOpNearestNoScaleTest_Ver10) { TEST(ResizeOpTest, ResizeOp_MissingRoiAndMissingScalesOptionalInputs) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: MLOperatorAuthorImpl.cpp(1876): The parameter is incorrect."; + GTEST_SKIP() << "Skipping because of the following error: " + << "MLOperatorAuthorImpl.cpp(1876): The parameter is incorrect."; } OpTester test("Resize", 13); @@ -1827,7 +1840,8 @@ template void TestAntialiasing(std::map attributes, std::vector input_shape, std::vector input_data, - std::vector output_shape_or_scale, std::vector output_data) { + std::vector output_shape_or_scale, std::vector output_data, + gsl::span excluded_ep = {}) { auto parse_attr = [](const std::string& str, auto typed_v) { using Tdata = decltype(typed_v); 
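// Descriptive note (not in the original helper): parse_attr appears to strip the braces from an
// attribute string such as "{2,3,4}" and split it on ',' into a vector of Tdata, which is how
// numeric attributes like "axes" and "output_shape" passed to TestAntialiasing are decoded.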
std::vector vect; @@ -1891,13 +1905,22 @@ void TestAntialiasing(std::map attributes, } test.AddOutput("Y", output_shape, output_data); - // TensorRT 8.5 supports operators up to Opset 17. Temporarily exclude TensorRT EP due to accurarcy issue. - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + + std::unordered_set excluded_eps; + std::transform(excluded_ep.begin(), excluded_ep.end(), + std::inserter(excluded_eps, excluded_eps.end()), [](std::string_view ep) { + return std::string(ep); + }); + // TensorRT 8.5 supports operators up to Opset 17. Temporarily exclude TensorRT EP due to accuracy issue. + excluded_eps.insert(kTensorrtExecutionProvider); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_eps); } TEST(ResizeOpTest, Antialias_Bilinear_No_ExcludeOutside) { if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because dml implementation of antialias is slightly different and doesn't match in all cases."; + GTEST_SKIP() << "Skipping because dml implementation of antialias " + << "is slightly different and doesn't match in all cases."; } std::vector X(16); std::iota(X.begin(), X.end(), 1.f); @@ -1939,7 +1962,8 @@ TEST(ResizeOpTest, Antialias_Bilinear_dtype) { std::vector Y = {1, 3, 4, 6, 8, 9, 11, 13, 14}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 1, 4, 4}, X, {1, 1, 3, 3}, Y); + InlinedVector excluded_eps = {kCudaExecutionProvider}; + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 1, 4, 4}, X, {1, 1, 3, 3}, Y, excluded_eps); } { std::vector X(16); @@ -1982,17 +2006,21 @@ TEST(ResizeOpTest, Antialias_NhwcBilinear) { 33.5f, 73.5f, 113.5f, 35.074074f, 75.07407f, 115.07407f, 36.590908f, 76.59091f, 116.59091f}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 5, 8, 3}, X, {1, 4, 5, 3}, Y); + + // Nhwc is not supported by the CUDA Resize implementation + InlinedVector excluded_eps = {kCudaExecutionProvider, kRocmExecutionProvider}; + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 5, 8, 3}, X, {1, 4, 5, 3}, Y, excluded_eps); } TEST(ResizeOpTest, Antialias_NhwcBilinear_dtype) { + InlinedVector excluded_eps = {kCudaExecutionProvider, kRocmExecutionProvider}; { std::vector X(16); std::iota(X.begin(), X.end(), uint8_t(0)); std::vector Y = {1, 3, 4, 6, 8, 9, 11, 13, 14}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 4, 4, 1}, X, {1, 3, 3, 1}, Y); + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 4, 4, 1}, X, {1, 3, 3, 1}, Y, excluded_eps); } { std::vector X(16); @@ -2000,7 +2028,7 @@ TEST(ResizeOpTest, Antialias_NhwcBilinear_dtype) { std::vector Y = {1, 3, 4, 6, 8, 9, 11, 13, 14}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 4, 4, 1}, X, {1, 3, 3, 1}, Y); + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 4, 4, 1}, X, {1, 3, 3, 1}, Y, excluded_eps); } { std::vector X(16); @@ -2008,13 +2036,14 @@ TEST(ResizeOpTest, Antialias_NhwcBilinear_dtype) { std::vector Y = {1, 3, 4, 6, 8, 9, 11, 13, 14}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 4, 4, 1}, X, {1, 3, 3, 1}, Y); + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {1, 4, 4, 1}, X, {1, 3, 3, 1}, Y, excluded_eps); } } TEST(ResizeOpTest, Antialias_Trilinear_No_ExcludeOutside) { if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because dml implementation of antialias is slightly different and doesn't match in all 
cases."; + GTEST_SKIP() << "Skipping because dml implementation of " + << "antialias is slightly different and doesn't match in all cases."; } std::vector X(16 * 4); std::iota(X.begin(), X.end(), 0.f); @@ -2038,13 +2067,17 @@ TEST(ResizeOpTest, Antialias_Trilinear_ExcludeOutside) { TEST(ResizeOpTest, Antialias_Trilinear_Scale_Is_11s_and_1s1) { if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because dml implementation of antialias is slightly different and doesn't match in all cases."; + GTEST_SKIP() << "Skipping because dml implementation of antialias" + << " is slightly different and doesn't match in all cases."; } + + InlinedVector excluded_eps = {kCudaExecutionProvider}; std::vector X(16 * 4 * 4); std::iota(X.begin(), X.end(), 0.f); { std::vector Y = X; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {4, 1, 4, 4, 4}, X, {4, 1, 4, 4, 4}, Y); + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}}, {4, 1, 4, 4, 4}, X, {4, 1, 4, 4, 4}, Y, + excluded_eps); } { std::vector Y = {0.625f, 2.375f, 4.625f, 6.375f, 8.625f, 10.375f, 12.625f, @@ -2066,7 +2099,8 @@ TEST(ResizeOpTest, Antialias_Trilinear_Scale_Is_11s_and_1s1) { 224.625f, 226.375f, 228.625f, 230.375f, 232.625f, 234.375f, 236.625f, 238.375f, 240.625f, 242.375f, 244.625f, 246.375f, 248.625f, 250.375f, 252.625f, 254.375f}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "0"}}, {4, 1, 4, 4, 4}, X, {4, 1, 4, 4, 2}, Y); + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "0"}}, {4, 1, 4, 4, 4}, X, {4, 1, 4, 4, 2}, Y, + excluded_eps); } { std::vector Y = {2.5f, 3.5f, 4.5f, 5.5f, 9.5f, 10.5f, 11.5f, 12.5f, 18.5f, @@ -2084,7 +2118,8 @@ TEST(ResizeOpTest, Antialias_Trilinear_Scale_Is_11s_and_1s1) { 217.5f, 218.5f, 219.5f, 220.5f, 226.5f, 227.5f, 228.5f, 229.5f, 233.5f, 234.5f, 235.5f, 236.5f, 242.5f, 243.5f, 244.5f, 245.5f, 249.5f, 250.5f, 251.5f, 252.5f}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "0"}}, {4, 1, 4, 4, 4}, X, {4, 1, 4, 2, 4}, Y); + TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "0"}}, {4, 1, 4, 4, 4}, X, {4, 1, 4, 2, 4}, Y, + excluded_eps); } } @@ -2124,12 +2159,15 @@ TEST(ResizeOpTest, Antialias_NHWCBicubic_ExcludeOutside) { 19.576872f, 43.57687f, 21.126253f, 45.126255f, 22.606192f, 46.606194f, 19.878183f, 43.87818f, 21.358122f, 45.35812f, 22.907503f, 46.907505f, 24.387442f, 48.387444f}; - TestAntialiasing({{"mode", "cubic"}, {"exclude_outside", "0"}}, {1, 4, 6, 2}, X, {1, 8, 4, 2}, Y); + + InlinedVector excluded_eps = {kCudaExecutionProvider, kRocmExecutionProvider}; + TestAntialiasing({{"mode", "cubic"}, {"exclude_outside", "0"}}, {1, 4, 6, 2}, X, {1, 8, 4, 2}, Y, excluded_eps); } TEST(ResizeOpTest, Antialias_Linear_AlignCorners) { if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because dml implementation of antialias is slightly different and doesn't match in all cases."; + GTEST_SKIP() << "Skipping because dml implementation of antialias" + << "is slightly different and doesn't match in all cases."; } std::vector X(256); std::iota(X.begin(), X.end(), 0.0f); @@ -2145,9 +2183,40 @@ TEST(ResizeOpTest, Antialias_Linear_AlignCorners) { 187.08333f, 195.91667f, 198.41667f, 205.91667f, 208.41667f, 217.25f, 219.75f, 227.25f, 229.75f, 238.58333f, 241.08333f, 248.58333f, 251.08333f}; + InlinedVector excluded_eps = {kCudaExecutionProvider, kRocmExecutionProvider}; TestAntialiasing( {{"mode", "linear"}, {"exclude_outside", "0"}, {"coordinate_transformation_mode", "align_corners"}}, - {4, 1, 4, 
4, 4}, X, {4, 1, 3, 2, 2}, Y); + {4, 1, 4, 4, 4}, X, {4, 1, 3, 2, 2}, Y, excluded_eps); +} + +TEST(ResizeOpTest, Antialias_Linear_AlignCorners_3D) { + if (DefaultDmlExecutionProvider().get() != nullptr) { + GTEST_SKIP() << "Skipping because dml implementation of antialias is slightly " + << "different and doesn't match in all cases."; + } + std::vector X(256); + std::iota(X.begin(), X.end(), 0.0f); + std::vector Y{ + 1.25f, 3.75f, 11.25f, 13.75f, + 17.25f, 19.75f, 27.25f, 29.75f, + 33.25f, 35.75f, 43.25f, 45.75f, + 49.25f, 51.75f, 59.25f, 61.75f, + 65.25f, 67.75f, 75.25f, 77.75f, + 81.25f, 83.75f, 91.25f, 93.75f, + 97.25f, 99.75f, 107.25f, 109.75f, + 113.25f, 115.75f, 123.25f, 125.75f, + 129.25f, 131.75f, 139.25f, 141.75f, + 145.25f, 147.75f, 155.25f, 157.75f, + 161.25f, 163.75f, 171.25f, 173.75f, + 177.25f, 179.75f, 187.25f, 189.75f, + 193.25f, 195.75f, 203.25f, 205.75f, + 209.25f, 211.75f, 219.25f, 221.75f, + 225.25f, 227.75f, 235.25f, 237.75f, + 241.25f, 243.75f, 251.25f, 253.75f}; + + TestAntialiasing( + {{"mode", "linear"}, {"exclude_outside", "0"}, {"coordinate_transformation_mode", "align_corners"}}, + {16, 4, 4}, X, {16, 2, 2}, Y); } TEST(ResizeOpTest, Antialias_Bicubic_ExcludeOutside) { @@ -2166,19 +2235,23 @@ TEST(ResizeOpTest, Antialias_Bicubic_Dtype) { std::vector X(36); std::iota(X.begin(), X.end(), uint8_t(0)); std::vector Y = {4, 6, 7, 16, 18, 19, 28, 30, 31}; - TestAntialiasing({{"mode", "cubic"}, {"cubic_coeff_a", "-0.5f"}, {"exclude_outside", "1"}}, {1, 1, 6, 6}, X, {1, 1, 3, 3}, Y); + TestAntialiasing({{"mode", "cubic"}, {"cubic_coeff_a", "-0.5f"}, {"exclude_outside", "1"}}, {1, 1, 6, 6}, + X, {1, 1, 3, 3}, Y); } { std::vector X(36); std::iota(X.begin(), X.end(), int8_t(0)); std::vector Y = {4, 6, 7, 16, 18, 19, 28, 30, 31}; - TestAntialiasing({{"mode", "cubic"}, {"cubic_coeff_a", "-0.5f"}, {"exclude_outside", "1"}}, {1, 1, 6, 6}, X, {1, 1, 3, 3}, Y); + InlinedVector excluded_eps = {kCudaExecutionProvider}; + TestAntialiasing({{"mode", "cubic"}, {"cubic_coeff_a", "-0.5f"}, {"exclude_outside", "1"}}, {1, 1, 6, 6}, + X, {1, 1, 3, 3}, Y, excluded_eps); } { std::vector X(36); std::iota(X.begin(), X.end(), 0); std::vector Y = {4, 6, 7, 16, 18, 19, 28, 30, 31}; - TestAntialiasing({{"mode", "cubic"}, {"cubic_coeff_a", "-0.5f"}, {"exclude_outside", "1"}}, {1, 1, 6, 6}, X, {1, 1, 3, 3}, Y); + TestAntialiasing({{"mode", "cubic"}, {"cubic_coeff_a", "-0.5f"}, {"exclude_outside", "1"}}, {1, 1, 6, 6}, + X, {1, 1, 3, 3}, Y); } } @@ -2189,8 +2262,10 @@ TEST(ResizeOpTest, Antialias_Axes_and_Scale) { std::vector Y = {6.3f, 7.5f, 8.7f, 11.1f, 12.3f, 13.5f, 15.9f, 17.1f, 18.3f, 25.5f, 26.7f, 27.9f, 30.3f, 31.5f, 32.7f, 35.1f, 36.3f, 37.5f, 44.7f, 45.9f, 47.1f, 49.5f, 50.7f, 51.9f, 54.3f, 55.5f, 56.7f}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, {"output_shape", "{1,1,3,3,3}"}}, {1, 1, 4, 4, 4}, X, - std::vector{3 / 4.0f, 3 / 4.0f, 3 / 4.0f}, Y); + TestAntialiasing( + {{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, {"output_shape", "{1,1,3,3,3}"}}, + {1, 1, 4, 4, 4}, X, + std::vector{3 / 4.0f, 3 / 4.0f, 3 / 4.0f}, Y); } TEST(ResizeOpTest, Antialias_Axes_and_Size) { @@ -2199,8 +2274,10 @@ TEST(ResizeOpTest, Antialias_Axes_and_Size) { std::vector Y = {6.3f, 7.5f, 8.7f, 11.1f, 12.3f, 13.5f, 15.9f, 17.1f, 18.3f, 25.5f, 26.7f, 27.9f, 30.3f, 31.5f, 32.7f, 35.1f, 36.3f, 37.5f, 44.7f, 45.9f, 47.1f, 49.5f, 50.7f, 51.9f, 54.3f, 55.5f, 56.7f}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, 
{"output_shape", "{1,1,3,3,3}"}}, {1, 1, 4, 4, 4}, X, - {3, 3, 3}, Y); + TestAntialiasing( + {{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, {"output_shape", "{1,1,3,3,3}"}}, + {1, 1, 4, 4, 4}, X, + {3, 3, 3}, Y); } TEST(ResizeOpTest, Antialias_Axes_and_PolicyNoLarger) { @@ -2209,9 +2286,13 @@ TEST(ResizeOpTest, Antialias_Axes_and_PolicyNoLarger) { std::vector Y = {6.3f, 7.5f, 8.7f, 11.1f, 12.3f, 13.5f, 15.9f, 17.1f, 18.3f, 25.5f, 26.7f, 27.9f, 30.3f, 31.5f, 32.7f, 35.1f, 36.3f, 37.5f, 44.7f, 45.9f, 47.1f, 49.5f, 50.7f, 51.9f, 54.3f, 55.5f, 56.7f}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, {"output_shape", "{1,1,3,3,3}"}, {"policy", "not_larger"}}, - {1, 1, 4, 4, 4}, X, - {3, 4, 5}, Y); + // clang-format off + TestAntialiasing( + {{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, {"output_shape", "{1,1,3,3,3}"}, + {"policy", "not_larger"}}, + {1, 1, 4, 4, 4}, X, + {3, 4, 5}, Y); + // clang-format on } TEST(ResizeOpTest, Antialias_Axes_and_PolicyNoSmaller) { @@ -2220,9 +2301,13 @@ TEST(ResizeOpTest, Antialias_Axes_and_PolicyNoSmaller) { std::vector Y = {6.3f, 7.5f, 8.7f, 11.1f, 12.3f, 13.5f, 15.9f, 17.1f, 18.3f, 25.5f, 26.7f, 27.9f, 30.3f, 31.5f, 32.7f, 35.1f, 36.3f, 37.5f, 44.7f, 45.9f, 47.1f, 49.5f, 50.7f, 51.9f, 54.3f, 55.5f, 56.7f}; - TestAntialiasing({{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, {"output_shape", "{1,1,3,3,3}"}, {"policy", "not_smaller"}}, - {1, 1, 4, 4, 4}, X, - {1, 2, 3}, Y); + // clang-format off + TestAntialiasing( + {{"mode", "linear"}, {"exclude_outside", "1"}, {"axes", "{2,3,4}"}, {"output_shape", "{1,1,3,3,3}"}, + {"policy", "not_smaller"}}, + {1, 1, 4, 4, 4}, X, + {1, 2, 3}, Y); + // clang-format on } TEST(ResizeOpTest, Antialias_Use_Extrapolation) {