diff --git a/include/onnxruntime/core/framework/op_kernel_context.h b/include/onnxruntime/core/framework/op_kernel_context.h
index ac22d9130983a..fa2621440ce30 100644
--- a/include/onnxruntime/core/framework/op_kernel_context.h
+++ b/include/onnxruntime/core/framework/op_kernel_context.h
@@ -186,6 +186,10 @@ class OpKernelContext {
   */
   AllocatorPtr GetAllocator(const OrtDevice& device) const;
 
+#if defined(ENABLE_ATEN) || defined(USE_TENSORRT)
+  Status SetOutputMLValue(int index, const OrtValue& ort_value);
+#endif
+
  protected:
   OpKernelContext(concurrency::ThreadPool* threadpool, const logging::Logger& logger, Stream* stream);
 
@@ -195,10 +199,6 @@ class OpKernelContext {
   const OrtValue* GetImplicitInputMLValue(int index) const;
   OrtValue* GetOutputMLValue(int index);
 
-#ifdef ENABLE_ATEN
-  Status SetOutputMLValue(int index, const OrtValue& ort_value);
-#endif
-
   // Creates the OrtValue* based on the shape, if it does not exist
   virtual OrtValue* OutputMLValue(int index, const TensorShape& shape);
 
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index cddad732104ed..36e78d6b4d500 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -4520,6 +4520,15 @@ struct OrtApi {
    * \since Version 1.17.
    */
   ORT_API2_STATUS(ReadOpAttr, _In_ const OrtOpAttr* op_attr, _In_ OrtOpAttrType type, _Inout_ void* data, _In_ size_t len, _Out_ size_t* out);
+
+  /** \brief Used for custom operators, set an output of a kernel
+   *
+   * \see ::OrtCustomOp
+   *
+   * \since Version 1.17.
+   */
+  ORT_API2_STATUS(KernelContext_SetOutput, _Inout_ OrtKernelContext* context, _In_ size_t index,
+                  _In_ const OrtValue* ort_value);
 };
 
 /*
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index 92c25d8688b66..29ba3e1714630 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -2052,6 +2052,7 @@ struct KernelContext {
   ConstValue GetInput(size_t index) const;
   UnownedValue GetOutput(size_t index, const int64_t* dim_values, size_t dim_count) const;
   UnownedValue GetOutput(size_t index, const std::vector<int64_t>& dims) const;
+  void SetOutput(size_t index, const OrtValue& ort_value);
   void* GetGPUComputeStream() const;
   Logger GetLogger() const;
   OrtAllocator* GetAllocator(const OrtMemoryInfo& memory_info) const;
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
index 860a27fc73f79..cb2d3925f9bac 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -1634,6 +1634,10 @@ inline UnownedValue KernelContext::GetOutput(size_t index, const std::vector<int
   return UnownedValue(out);
 }
 
+inline void KernelContext::SetOutput(size_t index, const OrtValue& ort_value) {
+  Ort::ThrowOnError(GetApi().KernelContext_SetOutput(ctx_, index, &ort_value));
+}
+
 inline void* KernelContext::GetGPUComputeStream() const {
   void* out = nullptr;
   Ort::ThrowOnError(GetApi().KernelContext_GetGPUComputeStream(ctx_, &out));
diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc
index d9c49dc6bea1d..39b0a921bd1ed 100644
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@@ -50,7 +50,7 @@ IExecutionFrame::IExecutionFrame(const OrtValueNameIdxMap& ort_value_idx_map,
 
 IExecutionFrame::~IExecutionFrame() = default;
 
-#ifdef ENABLE_ATEN
+#if defined(ENABLE_ATEN) || defined(USE_TENSORRT)
 Status IExecutionFrame::SetOutputMLValue(int index, const OrtValue& ort_value) {
   int ort_value_idx = GetNodeIdxToMLValueIdx(index);
   if (ort_value_idx == NodeIndexInfo::kInvalidEntry || static_cast<size_t>(ort_value_idx) >= all_values_size_) {
diff --git a/onnxruntime/core/framework/execution_frame.h b/onnxruntime/core/framework/execution_frame.h
index 1576c16684faa..e7942934ebe30 100644
--- a/onnxruntime/core/framework/execution_frame.h
+++ b/onnxruntime/core/framework/execution_frame.h
@@ -54,7 +54,7 @@ class IExecutionFrame {
   const OrtValue* GetNodeInputOrOutputMLValue(int index) const;
   OrtValue* GetMutableNodeInputOrOutputMLValue(int index);
 
-#ifdef ENABLE_ATEN
+#if defined(ENABLE_ATEN) || defined(USE_TENSORRT)
   // Override the index-th output with ort_value
   Status SetOutputMLValue(int index, const OrtValue& ort_value);
 #endif
diff --git a/onnxruntime/core/framework/op_kernel.cc b/onnxruntime/core/framework/op_kernel.cc
index 94b6224440ed0..31b6141ab985d 100644
--- a/onnxruntime/core/framework/op_kernel.cc
+++ b/onnxruntime/core/framework/op_kernel.cc
@@ -186,7 +186,7 @@ AllocatorPtr OpKernelContext::GetAllocator(const OrtDevice& device) const {
   return execution_frame_->GetAllocator(device);
 }
 
-#ifdef ENABLE_ATEN
+#if defined(ENABLE_ATEN) || defined(USE_TENSORRT)
 Status OpKernelContext::SetOutputMLValue(int index, const OrtValue& ort_value) {
   if (index < 0 || index >= OutputCount()) {
     return Status(common::ONNXRUNTIME, common::FAIL,
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index 79f84864a5788..e75904ee0539c 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -365,15 +365,18 @@ std::unique_lock<OrtMutex> TensorrtExecutionProvider::GetApiLock() const {
   return std::unique_lock<OrtMutex>(singleton);
 }
 
+/*
+ * Get the shape of "shape tensor" input
+ */
 Status GetShapeOfShapeTensor(Ort::ConstValue& input_tensor,
                              std::vector<int32_t>& shape_values,
                              nvinfer1::ICudaEngine* trt_engine,
-                             int binding_index,
+                             const char* input_name,
                              cudaStream_t stream) {
   auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo();
   const auto tensor_shapes = tensor_info.GetShape();
   const auto tensor_type = tensor_info.GetElementType();
-  nvinfer1::Dims dims = trt_engine->getBindingDimensions(static_cast<int>(binding_index));
+  nvinfer1::Dims dims = trt_engine->getTensorShape(input_name);
   int nb_dims = dims.nbDims;
   int shape_size = nb_dims == 0 ? 1 : static_cast<int>(tensor_shapes[0]);  // The shape of the "shape tensor" is either zero dimension (scalar) or 1-dimension
   shape_values.resize(shape_size, 1);
@@ -581,7 +584,7 @@ Status ApplyProfileShapesFromInputTensorValue(std::vector<nvinfer1::IOptimizatio
     if (input->isShapeTensor()) {
       // Get shape values for shape tensor input
       const auto tensor_type = tensor_info.GetElementType();
-      int shape_size = nb_dims == 0 ? 1 : static_cast<int>(tensor_shapes[0]);
+      int shape_size = nb_dims == 0 ? 1 : static_cast<int>(tensor_shapes[0]);  // The shape of the "shape tensor" is either zero dimension (scalar) or 1-dimension
       tensor_shape_values[input_name].resize(shape_size);
       switch (tensor_type) {
         case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: {
@@ -689,6 +692,452 @@ Status ApplyProfileShapesFromInputTensorValue(std::vector<nvinfer1::IOptimizatio
   return Status::OK();
 }
 
+/*
+ * Set TensorRT execution context input.
+ *
+ * There are two types of input tensor: (1) shape tensor and (2) execution tensor.
+ * The input buffer binding needs to be handled differently.
+ *
+ */
+Status BindContextInput(Ort::KernelContext& ctx,
+                        nvinfer1::ICudaEngine* trt_engine,
+                        nvinfer1::IExecutionContext* trt_context,
+                        const char* input_name,
+                        size_t input_index,
+                        std::vector<int32_t>& shape_values,  // only for "shape tensor"
+                        std::vector<IAllocatorUniquePtr<void>>& scratch_buffers,
+                        OrtAllocator* alloc,
+                        cudaStream_t stream) {
+  auto input_tensor = ctx.GetInput(input_index);
+  auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo();
+  const auto tensor_shapes = tensor_info.GetShape();
+  const auto tensor_type = tensor_info.GetElementType();
+
+  if (trt_engine->isShapeInferenceIO(input_name)) {
+    // Get the shape value of "shape tensor"
+    if (shape_values.empty()) {
+      auto status = GetShapeOfShapeTensor(input_tensor, shape_values, trt_engine, input_name, stream);
+      if (status != Status::OK()) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
+      }
+    }
+
+    // Bind "shape tensor" input buffer
+    if (!trt_context->setTensorAddress(input_name, &shape_values[0])) {
+      std::string error_input_name = input_name;
+      ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+                                         "TensorRT EP failed to call nvinfer1::IExecutionContext::setTensorAddress() for shape input '" + error_input_name + "'"));
+    }
+  } else {
+    // Set shape for input tensor which is execution tensor
+    nvinfer1::Dims dims = trt_context->getTensorShape(input_name);
+    int nb_dims = dims.nbDims;
+    for (int j = 0, end = nb_dims; j < end; ++j) {
+      dims.d[j] = static_cast<int32_t>(tensor_shapes[j]);
+    }
+    if (!trt_context->setInputShape(input_name, dims)) {
+      std::string error_input_name = input_name;
+      ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+                                         "TensorRT EP failed to call nvinfer1::IExecutionContext::setInputShape() for input '" + error_input_name + "'"));
+    }
+    // Bind "execution tensor" input buffers
+    void* data = nullptr;
+    switch (tensor_type) {
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: {
+        auto input_tensor_ptr = input_tensor.GetTensorData<float>();
+        if (input_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(float)));
+          data = scratch_buffers.back().get();
+        } else {
+          data = const_cast<float*>(input_tensor_ptr);
+        }
+        break;
+      }
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: {
+        auto input_tensor_ptr = input_tensor.GetTensorData<uint16_t>();
+        if (input_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(uint16_t)));
+          data = scratch_buffers.back().get();
+        } else {
+          data = const_cast<uint16_t*>(input_tensor_ptr);
+        }
+        break;
+      }
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: {
+        auto input_tensor_ptr = input_tensor.GetTensorData<bool>();
+        if (input_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(bool)));
+          data = scratch_buffers.back().get();
+        } else {
+          data = const_cast<bool*>(input_tensor_ptr);
+        }
+        break;
+      }
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: {
+        auto input_tensor_ptr = input_tensor.GetTensorData<int8_t>();
+        if (input_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(int8_t)));
+          data = scratch_buffers.back().get();
+        } else {
+          data = const_cast<int8_t*>(input_tensor_ptr);
+        }
+        break;
+      }
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: {
+        auto input_tensor_ptr = input_tensor.GetTensorData<uint8_t>();
+        if (input_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(uint8_t)));
+          data = scratch_buffers.back().get();
+        } else {
+          data = const_cast<uint8_t*>(input_tensor_ptr);
+        }
+        break;
+      }
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: {
+        auto input_tensor_ptr = input_tensor.GetTensorData<int32_t>();
+        if (input_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(int32_t)));
+          data = scratch_buffers.back().get();
+        } else {
+          data = const_cast<int32_t*>(input_tensor_ptr);
+        }
+        break;
+      }
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: {
+        // Cast INT64 input to INT32 because TensorRT doesn't fully support INT64
+        auto input_tensor_ptr = input_tensor.GetTensorData<int64_t>();
+        if (input_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(int32_t)));
+          data = scratch_buffers.back().get();
+        } else {
+          SafeInt<int> input_dim_size = 1;
+          for (int j = 0, end = nb_dims; j < end; ++j) {
+            if (tensor_shapes[j] == 0) {
+              input_dim_size = 1;
+              break;
+            } else {
+              input_dim_size *= tensor_shapes[j];
+            }
+          }
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, input_dim_size * sizeof(int32_t)));
+          data = scratch_buffers.back().get();
+          cuda::Impl_Cast<int64_t, int32_t>(stream, input_tensor_ptr, reinterpret_cast<int32_t*>(data), input_dim_size);
+        }
+        break;
+      }
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: {
+        // Cast DOUBLE input to FLOAT because TensorRT doesn't fully support INT64
+        auto input_tensor_ptr = input_tensor.GetTensorData<double>();
+        if (input_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(float)));
+          data = scratch_buffers.back().get();
+        } else {
+          SafeInt<int> input_dim_size = 1;
+          for (int j = 0, end = nb_dims; j < end; ++j) {
+            if (tensor_shapes[j] == 0) {
+              input_dim_size = 1;
+              break;
+            } else {
+              input_dim_size *= tensor_shapes[j];
+            }
+          }
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, input_dim_size * sizeof(float)));
+          data = scratch_buffers.back().get();
+          cuda::Impl_Cast<double, float>(stream, input_tensor_ptr, reinterpret_cast<float*>(data), input_dim_size);
+        }
+        break;
+      }
+      default: {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+                               "TensorRT EP input onnx tensor data type: " + std::to_string(tensor_type) + " not supported.");
+      }
+    }
+    trt_context->setTensorAddress(input_name, data);
+  }
+
+  return Status::OK();
+}
+
+/*
+ * Set TensorRT execution context output.
+ *
+ * Please note that the "data-depedent shape" output needs corresponding allocator provided.
+ *
+ *
+ * param ctx - ORT kernel context
+ * param trt_context - A pointer to TensorRT Execution context object
+ * param output_name - Output tensor name
+ * param output_index - The index of the output to the ORT kernel context
+ * param output_type - Data type of the output
+ * param i - Output iteration index
+ * param output_tensors - Output iteration index to output's ORT value
+ * param output_dim_sizes - Output iteration index to the multiplocation of its shape's dimensions
+ * param dds_output_set - DDS output set
+ * param dds_output_allocator_map - DDS output to its allocator
+ * param scratch_buffer - The allocation buffer created by TRT EP
+ * param allocator - ORT allocator
+ * param buffers - It holds all the output values which are binding to TRT's execution context
+ *
+ */
+Status BindContextOutput(Ort::KernelContext& ctx,
+                         nvinfer1::IExecutionContext* trt_context,
+                         const char* output_name,
+                         size_t output_index,
+                         size_t output_type,
+                         size_t i,
+                         std::unordered_map<size_t, Ort::UnownedValue>& output_tensors,
+                         std::unordered_map<size_t, int>& output_dim_sizes,
+                         std::unordered_set<char const*>& dds_output_set,
+                         std::unordered_map<char const*, OutputAllocator*>& dds_output_allocator_map,
+                         std::vector<IAllocatorUniquePtr<void>>& scratch_buffers,
+                         OrtAllocator* alloc,
+                         std::unordered_map<char const*, void*>& buffers) {
+  // Get output shape
+  nvinfer1::Dims dims = trt_context->getTensorShape(output_name);
+  int nb_dims = dims.nbDims;
+  bool is_dds_output = false;
+  std::vector<int64_t> output_shapes(nb_dims);
+  for (int j = 0, end = nb_dims; j < end; ++j) {
+    // data-dependent shape
+    if (dims.d[j] == -1) {
+      is_dds_output = true;
+      dds_output_set.emplace(output_name);
+      break;
+    }
+    output_shapes[j] = dims.d[j];
+  }
+
+  // If the output tensor has data-dependent shape, TRT EP will provide an IOutputAllocator for enqueueV3 to dynamically allocate memory buffer.
+  // Once enqueueV3 returns, TRT EP will then bind the output allocation to ORT kernel context output.
+  // (Please note that we take strategy A mentioned in https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#dynamic-shaped-output,
+  //  which we defer allocation until the size is known and don't call IExecution::setTensorAddress)
+  //
+  // Otherwise, if the shape of the output tensor is known prior to the runtime, ORT will pre-allocate memory buffer for the output tensor for enqueueV3.
+  if (is_dds_output) {
+    if (dds_output_allocator_map.find(output_name) == dds_output_allocator_map.end()) {
+      auto allocator = new OutputAllocator(alloc);
+      trt_context->setOutputAllocator(output_name, allocator);
+      dds_output_allocator_map[output_name] = allocator;
+    }
+  } else {
+    output_tensors[i] = ctx.GetOutput(output_index, output_shapes);
+    auto& output_tensor = output_tensors[i];
+    switch (output_type) {
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: {
+        auto output_tensor_ptr = output_tensor.GetTensorMutableData<float>();
+        if (output_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(float)));
+          buffers[output_name] = scratch_buffers.back().get();
+        } else {
+          buffers[output_name] = output_tensor_ptr;
+        }
+        break;
+      }
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: {
+        auto output_tensor_ptr = output_tensor.GetTensorMutableData<uint16_t>();
+        if (output_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(uint16_t)));
+          buffers[output_name] = scratch_buffers.back().get();
+        } else {
+          buffers[output_name] = output_tensor_ptr;
+        }
+        break;
+      }
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: {
+        auto output_tensor_ptr = output_tensor.GetTensorMutableData<bool>();
+        if (output_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(bool)));
+          buffers[output_name] = scratch_buffers.back().get();
+        } else {
+          buffers[output_name] = output_tensor_ptr;
+        }
+        break;
+      }
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: {
+        auto output_tensor_ptr = output_tensor.GetTensorMutableData<int8_t>();
+        if (output_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(int8_t)));
+          buffers[output_name] = scratch_buffers.back().get();
+        } else {
+          buffers[output_name] = output_tensor_ptr;
+        }
+        break;
+      }
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: {
+        auto output_tensor_ptr = output_tensor.GetTensorMutableData<uint8_t>();
+        if (output_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(uint8_t)));
+          buffers[output_name] = scratch_buffers.back().get();
+        } else {
+          buffers[output_name] = output_tensor_ptr;
+        }
+        break;
+      }
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: {
+        auto output_tensor_ptr = output_tensor.GetTensorMutableData<int32_t>();
+        if (output_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(int32_t)));
+          buffers[output_name] = scratch_buffers.back().get();
+        } else {
+          buffers[output_name] = output_tensor_ptr;
+        }
+        break;
+      }
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: {
+        // Allocate INT32 CUDA memory for INT64 output type because TensorRT doesn't fully support INT64
+        auto output_tensor_ptr = output_tensor.GetTensorMutableData<int64_t>();
+        if (output_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(int32_t)));
+          buffers[output_name] = scratch_buffers.back().get();
+          output_dim_sizes[i] = 1;
+        } else {
+          SafeInt<int> output_dim_size(1);
+          for (int j = 0, end = nb_dims; j < end; ++j) {
+            if (dims.d[j] == 0) {
+              output_dim_size = 1;
+              break;
+            } else {
+              output_dim_size *= dims.d[j];
+            }
+          }
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, output_dim_size * sizeof(int32_t)));
+          buffers[output_name] = scratch_buffers.back().get();
+          output_dim_sizes[i] = output_dim_size;
+        }
+        break;
+      }
+      case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: {
+        // Allocate FLOAT CUDA memory for DOUBLE output type because TensorRT doesn't fully support DOUBLE
+        auto output_tensor_ptr = output_tensor.GetTensorMutableData<double>();
+        if (output_tensor_ptr == nullptr) {
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(float)));
+          buffers[output_name] = scratch_buffers.back().get();
+          output_dim_sizes[i] = 1;
+        } else {
+          SafeInt<int> output_dim_size(1);
+          for (int j = 0, end = nb_dims; j < end; ++j) {
+            if (dims.d[j] == 0) {
+              output_dim_size = 1;
+              break;
+            } else {
+              output_dim_size *= dims.d[j];
+            }
+          }
+          scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, output_dim_size * sizeof(float)));
+          buffers[output_name] = scratch_buffers.back().get();
+          output_dim_sizes[i] = output_dim_size;
+        }
+        break;
+      }
+      default: {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+                               "TensorRT EP output tensor data type: " + std::to_string(output_type) + " not supported.");
+      }
+    }
+    trt_context->setTensorAddress(output_name, buffers[output_name]);
+  }
+
+  return Status::OK();
+}
+
+/*
+ * Set ORT kernel context Output.
+ *
+ * Note: In the case of DDS (data-dependent shape) output, TRT requires a provided allocator to allocate memory during runtime.
+ * Once the output has been put in the allocation buffer, ORT calls this function to bind the allocation to ORT kernel context output.
+ */
+Status BindKernelOutput(Ort::KernelContext& ctx,
+                        OrtMemoryInfo* mem_info,
+                        DDSOutputAllocatorMap& allocator_map,
+                        char const* output_name,
+                        size_t output_index,
+                        size_t output_type,
+                        std::vector<IAllocatorUniquePtr<void>>& scratch_buffers,
+                        std::unordered_map<char const*, void*>& buffers,
+                        OrtAllocator* alloc,
+                        cudaStream_t stream) {
+  auto allocator = allocator_map[output_name];
+  auto& shape = allocator->getOutputShape();
+  OrtValue* out = nullptr;
+
+  switch (output_type) {
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: {
+      Ort::ThrowOnError(Ort::GetApi().CreateTensorWithDataAsOrtValue(mem_info, allocator->getBuffer(), allocator->getSize(),
+                                                                     shape.data(), shape.size(), Ort::TypeToTensorType<float>::type, &out));
+      break;
+    }
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: {
+      Ort::ThrowOnError(Ort::GetApi().CreateTensorWithDataAsOrtValue(mem_info, allocator->getBuffer(), allocator->getSize(),
+                                                                     shape.data(), shape.size(), Ort::TypeToTensorType<uint16_t>::type, &out));
+      break;
+    }
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: {
+      Ort::ThrowOnError(Ort::GetApi().CreateTensorWithDataAsOrtValue(mem_info, allocator->getBuffer(), allocator->getSize(),
+                                                                     shape.data(), shape.size(), Ort::TypeToTensorType<bool>::type, &out));
+      break;
+    }
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: {
+      Ort::ThrowOnError(Ort::GetApi().CreateTensorWithDataAsOrtValue(mem_info, allocator->getBuffer(), allocator->getSize(),
+                                                                     shape.data(), shape.size(), Ort::TypeToTensorType<int8_t>::type, &out));
+      break;
+    }
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: {
+      Ort::ThrowOnError(Ort::GetApi().CreateTensorWithDataAsOrtValue(mem_info, allocator->getBuffer(), allocator->getSize(),
+                                                                     shape.data(), shape.size(), Ort::TypeToTensorType<uint8_t>::type, &out));
+      break;
+    }
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: {
+      Ort::ThrowOnError(Ort::GetApi().CreateTensorWithDataAsOrtValue(mem_info, allocator->getBuffer(), allocator->getSize(),
+                                                                     shape.data(), shape.size(), Ort::TypeToTensorType<int32_t>::type, &out));
+      break;
+    }
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: {
+      // The allocation buffer holds the INT32 output data since TRT doesn't support INT64 but INT32.
+      // So, we need to cast the data from INT32 to INT64 and then set INT64 output data to kernel context.
+      SafeInt<int> output_dim_size(1);
+      for (size_t i = 0; i < shape.size(); ++i) {
+        if (shape[i] == 0) {
+          output_dim_size = 1;
+          break;
+        } else {
+          output_dim_size *= shape[i];
+        }
+      }
+      scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, output_dim_size * sizeof(int64_t)));
+      auto data = scratch_buffers.back().get();
+      cuda::Impl_Cast<int32_t, int64_t>(stream, reinterpret_cast<int32_t*>(allocator->getBuffer()), reinterpret_cast<int64_t*>(data), output_dim_size);
+      Ort::ThrowOnError(Ort::GetApi().CreateTensorWithDataAsOrtValue(mem_info, data, output_dim_size * sizeof(int64_t),
+                                                                     shape.data(), shape.size(), Ort::TypeToTensorType<int64_t>::type, &out));
+      break;
+    }
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: {
+      // The allocation buffer holds the FLOAT output data since TRT doesn't support DOUBLE but FLOAT.
+      // So, we need to cast the data from FLOAT to DOUBEL and then set DOUBLE output data to kernel context.
+      SafeInt<int> output_dim_size(1);
+      for (size_t i = 0; i < shape.size(); ++i) {
+        if (shape[i] == 0) {
+          output_dim_size = 1;
+          break;
+        } else {
+          output_dim_size *= shape[i];
+        }
+      }
+      scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, output_dim_size * sizeof(double)));
+      auto data = scratch_buffers.back().get();
+      cuda::Impl_Cast<float, double>(stream, reinterpret_cast<float*>(allocator->getBuffer()), reinterpret_cast<double*>(data), output_dim_size);
+      Ort::ThrowOnError(Ort::GetApi().CreateTensorWithDataAsOrtValue(mem_info, data, output_dim_size * sizeof(double),
+                                                                     shape.data(), shape.size(), Ort::TypeToTensorType<double>::type, &out));
+      break;
+    }
+    default: {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+                             "TensorRT EP output tensor data type: " + std::to_string(output_type) + " not supported.");
+    }
+  }
+  ctx.SetOutput(output_index, *out);
+  return Status::OK();
+}
+
 TensorrtExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, bool has_user_compute_stream, cudaStream_t stream) {
   if (has_user_compute_stream) {
     CUDA_CALL_THROW(cudaSetDevice(device_id));
@@ -1081,10 +1530,6 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
         throw std::runtime_error("Failed to create directory " + global_cache_path_);
       }
     }
-    {
-      auto lock = GetApiLock();
-      runtime_ = std::unique_ptr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(GetTensorrtLogger()));
-    }
   }
 
   if (engine_decryption_enable_) {
@@ -1151,6 +1596,11 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
     }
   }
 
+  {
+    auto lock = GetApiLock();
+    runtime_ = std::unique_ptr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(GetTensorrtLogger()));
+  }
+
   LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] TensorRT provider options: "
                         << "device_id: " << device_id_
                         << ", trt_max_partition_iterations: " << max_partition_iterations_
@@ -1209,6 +1659,13 @@ TensorrtExecutionProvider::~TensorrtExecutionProvider() {
     // We can't get api inside destructor so that's why we duplicate the code here.
     delete static_cast<OrtAllocatorImpl*>(alloc_);
   }
+
+  for (auto iter_outer = dds_output_allocator_map_.begin(); iter_outer != dds_output_allocator_map_.end(); ++iter_outer) {
+    auto inner_map = iter_outer->second;
+    for (auto iter_inner = inner_map.begin(); iter_inner != inner_map.end(); ++iter_inner) {
+      delete iter_inner->second;
+    }
+  }
 }
 
 bool TensorrtExecutionProvider::IsGraphCaptureEnabled() const {
@@ -2317,7 +2774,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
           engine_file.seekg(0, std::ios::beg);
           std::unique_ptr<char[]> engine_buf{new char[engine_size]};
           engine_file.read((char*)engine_buf.get(), engine_size);
-          trt_engine = std::unique_ptr<nvinfer1::ICudaEngine>(runtime_->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr));
+          trt_engine = std::unique_ptr<nvinfer1::ICudaEngine>(runtime_->deserializeCudaEngine(engine_buf.get(), engine_size));
           LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path;
           if (trt_engine == nullptr) {
             return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
@@ -2336,7 +2793,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
                                    "TensorRT EP could not call engine decryption function decrypt");
           }
           // Deserialize engine
-          trt_engine = std::unique_ptr<nvinfer1::ICudaEngine>(runtime_->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr));
+          trt_engine = std::unique_ptr<nvinfer1::ICudaEngine>(runtime_->deserializeCudaEngine(engine_buf.get(), engine_size));
           LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Decrypted and DeSerialized " + encrypted_engine_cache_path;
           if (trt_engine == nullptr) {
             return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
@@ -2372,10 +2829,15 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
           if (detailed_build_log_) {
             engine_build_start = std::chrono::steady_clock::now();
           }
-          trt_engine = std::unique_ptr<nvinfer1::ICudaEngine>(trt_builder->buildEngineWithConfig(*trt_network, *trt_config));
+          std::unique_ptr<nvinfer1::IHostMemory> serialized_engine{trt_builder->buildSerializedNetwork(*trt_network, *trt_config)};
+          if (serialized_engine == nullptr) {
+            return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
+                                   "TensorRT EP failed to create engine from network for fused node: " + fused_node.Name());
+          }
+          trt_engine = std::unique_ptr<nvinfer1::ICudaEngine>(runtime_->deserializeCudaEngine(serialized_engine->data(), serialized_engine->size()));
           if (trt_engine == nullptr) {
             return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
-                                   "TensorRT EP could not build engine for fused node: " + fused_node.Name());
+                                   "TensorRT EP failed to deserialize engine for fused node: " + fused_node.Name());
           }
           if (detailed_build_log_) {
             auto engine_build_stop = std::chrono::steady_clock::now();
@@ -2388,12 +2850,10 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
               LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + profile_cache_path;
             }
 
-            std::unique_ptr<nvinfer1::IHostMemory> serializedModel(trt_engine->serialize());
-            size_t engine_size = serializedModel->size();
             if (engine_decryption_enable_) {
               // Encrypt engine. The library is not always deployed with the encrypt function, so check if it is available first.
               if (engine_encryption_ != nullptr) {
-                if (!engine_encryption_(encrypted_engine_cache_path.c_str(), reinterpret_cast<char*>(serializedModel->data()), engine_size)) {
+                if (!engine_encryption_(encrypted_engine_cache_path.c_str(), reinterpret_cast<char*>(serialized_engine->data()), serialized_engine->size())) {
                   return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                                          "TensorRT EP call to engine encryption library failed");
                 }
@@ -2403,7 +2863,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
               }
             } else {
               std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out);
-              file.write(reinterpret_cast<char*>(serializedModel->data()), engine_size);
+              file.write(reinterpret_cast<char*>(serialized_engine->data()), serialized_engine->size());
               LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized engine " + engine_cache_path;
             }
           }
@@ -2487,7 +2947,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
       *p = {context->allocate_func, context->release_func, context->allocator_handle, context->node_name, builder_.get(),
             &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name],
             &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
-            input_shape_ranges_[context->node_name], sync_stream_after_enqueue_, &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
+            input_shape_ranges_[context->node_name], sync_stream_after_enqueue_, dds_output_allocator_map_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
             dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_,
             runtime_.get(), profiles_[context->node_name], context_memory_sharing_enable_, &max_ctx_mem_size_,
             dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_,
@@ -2518,6 +2978,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
       bool sync_stream_after_enqueue = trt_state->sync_stream_after_enqueue;
       auto fused_node_name = trt_state->fused_node_name;
       auto& shape_ranges = trt_state->input_shape_ranges;
+      auto& dds_output_allocator_map = trt_state->dds_output_allocator_map;
       auto trt_builder = trt_state->builder;
       auto trt_engine = trt_state->engine->get();
       auto trt_context = trt_state->context->get();
@@ -2577,7 +3038,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
           // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading
           trt_state->engine->reset();
           *(trt_state->engine) = std::unique_ptr<nvinfer1::ICudaEngine>(
-              trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr));
+              trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size));
           if (!(*(trt_state->engine))) {
             return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine.");
           }
@@ -2602,7 +3063,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
           // Note: Deserializing an engine from a TensorRT runtime is thread safe per TRT doc
           // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading
           trt_state->engine->reset();
-          *(trt_state->engine) = std::unique_ptr<nvinfer1::ICudaEngine>(trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr));
+          *(trt_state->engine) = std::unique_ptr<nvinfer1::ICudaEngine>(trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size));
           if (!(*(trt_state->engine))) {
             return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                                    "TensorRT EP could not deserialize engine from encrypted cache: " + encrypted_engine_cache_path);
@@ -2720,14 +3181,23 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
         }
 
         // Build engine
+        std::unique_ptr<nvinfer1::IHostMemory> serialized_engine;
         {
           auto lock = GetApiLock();
           std::chrono::steady_clock::time_point engine_build_start;
           if (detailed_build_log_) {
             engine_build_start = std::chrono::steady_clock::now();
           }
+          serialized_engine = std::unique_ptr<nvinfer1::IHostMemory>(
+              trt_builder->buildSerializedNetwork(*trt_state->network->get(), *trt_config));
+          if (!serialized_engine) {
+            return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to create engine from network.");
+          }
           *(trt_state->engine) = std::unique_ptr<nvinfer1::ICudaEngine>(
-              trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config));
+              trt_state->runtime->deserializeCudaEngine(serialized_engine->data(), serialized_engine->size()));
+          if (!(*(trt_state->engine))) {
+            return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to deserialize engine.");
+          }
           if (detailed_build_log_) {
             auto engine_build_stop = std::chrono::steady_clock::now();
             LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_state->trt_node_name_with_precision << " took: " << std::chrono::duration_cast<std::chrono::milliseconds>(engine_build_stop - engine_build_start).count() << "ms" << std::endl;
@@ -2743,12 +3213,10 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
           LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + profile_cache_path;
 
           // Serialize engine
-          std::unique_ptr<nvinfer1::IHostMemory> serializedModel(trt_engine->serialize());
-          size_t engine_size = serializedModel->size();
           if (trt_state->engine_decryption_enable) {
             // Encrypt engine. The library is not always deployed with the encrypt function, so check if it is available first.
             if (trt_state->engine_encryption != nullptr) {
-              if (!trt_state->engine_encryption(encrypted_engine_cache_path.c_str(), reinterpret_cast<char*>(serializedModel->data()), engine_size)) {
+              if (!trt_state->engine_encryption(encrypted_engine_cache_path.c_str(), reinterpret_cast<char*>(serialized_engine->data()), serialized_engine->size())) {
                 return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                                        "TensorRT EP could not call engine encryption function encrypt");
               }
@@ -2758,7 +3226,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
             }
           } else {
             std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out);
-            file.write(reinterpret_cast<char*>(serializedModel->data()), engine_size);
+            file.write(reinterpret_cast<char*>(serialized_engine->data()), serialized_engine->size());
             LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path;
           }
         }
@@ -2794,25 +3262,24 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
       }
 
       // Get input and output binding names
-      int total_bindings = trt_engine->getNbBindings();
-      std::vector<void*> buffers(total_bindings);
-      std::vector<std::string> input_binding_names, output_binding_names;
+      int total_bindings = trt_engine->getNbIOTensors();
+      std::vector<char const*> input_binding_names, output_binding_names;
       for (int i = 0, end = total_bindings; i < end; ++i) {
-        if (trt_engine->bindingIsInput(i)) {
-          input_binding_names.push_back(trt_engine->getBindingName(i));
+        auto const& name = trt_engine->getIOTensorName(i);
+        auto const& mode = trt_engine->getTensorIOMode(name);
+        if (mode == nvinfer1::TensorIOMode::kINPUT) {
+          input_binding_names.push_back(name);
         } else {
-          output_binding_names.push_back(trt_engine->getBindingName(i));
+          output_binding_names.push_back(name);
         }
       }
 
-      // Set input shapes and assign input buffers
+      /*
+       * Set input shapes and bind input buffers
+       */
       std::vector<IAllocatorUniquePtr<void>> scratch_buffers;
       for (size_t i = 0, end = input_binding_names.size(); i < end; ++i) {
-        const std::string& input_name = input_binding_names[i];
-        int binding_index = trt_engine->getBindingIndex(input_name.c_str());
-        if (binding_index == -1) {
-          continue;
-        }
+        char const* input_name = input_binding_names[i];
 
         size_t input_index = 0;
         const auto iter = input_indexes.find(input_name);
@@ -2823,172 +3290,38 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
         auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo();
         const auto tensor_shapes = tensor_info.GetShape();
 
-        // Set dynamic shapes
-        nvinfer1::Dims dimensions = trt_engine->getBindingDimensions(static_cast<int>(binding_index));
-        int nb_dims = dimensions.nbDims;
-        if (input_names.count(input_name) == 1) {
-          if (trt_engine->isShapeBinding(binding_index)) {
-            // Get shape of the shape tensor
-            std::vector<int32_t> shape_values;
-            if (!tensor_shape_values[input_name].empty()) {
-              shape_values = tensor_shape_values[input_name];
-            } else {
-              auto status = GetShapeOfShapeTensor(input_tensor, shape_values, trt_engine, binding_index, stream);
-              if (status != Status::OK()) {
-                return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
-              }
-            }
-            trt_context->setInputShapeBinding(binding_index, &shape_values[0]);
-          } else {
-            for (int j = 0, end = nb_dims; j < end; ++j) {
-              dimensions.d[j] = static_cast<int32_t>(tensor_shapes[j]);
-            }
-            const bool status = trt_context->setBindingDimensions(binding_index, dimensions);
-            if (!status) {
-              ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
-                                                 "TensorRT EP cannot set the dynamic dimensions of a binding"));
-            }
-          }
+        // Only use for "shape tensor" input
+        std::vector<int32_t> shape_values;
+        if (tensor_shape_values.find(input_name) != tensor_shape_values.end()) {
+          shape_values = tensor_shape_values[input_name];
         }
 
-        const auto input_type = tensor_info.GetElementType();
-        switch (input_type) {
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: {
-            auto input_tensor_ptr = input_tensor.GetTensorData<float>();
-            if (input_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(float)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              buffers[binding_index] = const_cast<float*>(input_tensor_ptr);
-            }
-            break;
-          }
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: {
-            auto input_tensor_ptr = input_tensor.GetTensorData<uint16_t>();
-            if (input_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(uint16_t)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              buffers[binding_index] = const_cast<uint16_t*>(input_tensor_ptr);
-            }
-            break;
-          }
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: {
-            auto input_tensor_ptr = input_tensor.GetTensorData<bool>();
-            if (input_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(bool)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              buffers[binding_index] = const_cast<bool*>(input_tensor_ptr);
-            }
-            break;
-          }
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: {
-            auto input_tensor_ptr = input_tensor.GetTensorData<int8_t>();
-            if (input_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(int8_t)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              buffers[binding_index] = const_cast<int8_t*>(input_tensor_ptr);
-            }
-            break;
-          }
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: {
-            auto input_tensor_ptr = input_tensor.GetTensorData<uint8_t>();
-            if (input_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(uint8_t)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              buffers[binding_index] = const_cast<uint8_t*>(input_tensor_ptr);
-            }
-            break;
-          }
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: {
-            auto input_tensor_ptr = input_tensor.GetTensorData<int32_t>();
-            if (input_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(int32_t)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              buffers[binding_index] = const_cast<int32_t*>(input_tensor_ptr);
-            }
-            break;
-          }
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: {
-            // Cast INT64 input to INT32 because TensorRT doesn't fully support INT64
-            auto input_tensor_ptr = input_tensor.GetTensorData<int64_t>();
-            if (input_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(int32_t)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              SafeInt<int> input_dim_size = 1;
-              for (int j = 0, end = nb_dims; j < end; ++j) {
-                if (tensor_shapes[j] == 0) {
-                  input_dim_size = 1;
-                  break;
-                } else {
-                  input_dim_size *= tensor_shapes[j];
-                }
-              }
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, input_dim_size * sizeof(int32_t)));
-              buffers[binding_index] = scratch_buffers.back().get();
-              cuda::Impl_Cast<int64_t, int32_t>(stream, input_tensor_ptr, reinterpret_cast<int32_t*>(buffers[binding_index]), input_dim_size);
-            }
-            break;
-          }
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: {
-            // Cast DOUBLE input to FLOAT because TensorRT doesn't fully support INT64
-            auto input_tensor_ptr = input_tensor.GetTensorData<double>();
-            if (input_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(float)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              SafeInt<int> input_dim_size = 1;
-              for (int j = 0, end = nb_dims; j < end; ++j) {
-                if (tensor_shapes[j] == 0) {
-                  input_dim_size = 1;
-                  break;
-                } else {
-                  input_dim_size *= tensor_shapes[j];
-                }
-              }
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, input_dim_size * sizeof(float)));
-              buffers[binding_index] = scratch_buffers.back().get();
-              cuda::Impl_Cast<double, float>(stream, input_tensor_ptr, reinterpret_cast<float*>(buffers[binding_index]), input_dim_size);
-            }
-            break;
-          }
-          default: {
-            return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
-                                   "TensorRT EP input onnx tensor data type: " + std::to_string(input_type) + " not supported.");
-          }
+        auto status = BindContextInput(ctx, trt_engine, trt_context, input_name, input_index, shape_values, scratch_buffers, alloc, stream);
+        if (status != Status::OK()) {
+          return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
         }
       }
 
-      // Set output shapes and assign output buffers
-      std::vector<int> output_dim_sizes(num_outputs, 1);
+      /*
+       * Set output shapes and bind output buffers
+       */
+      std::unordered_map<char const*, void*> buffers;
+      buffers.reserve(num_outputs);
       using OutputOrtValue = Ort::UnownedValue;
-      std::vector<OutputOrtValue> output_tensors;
+      std::unordered_map<size_t, OutputOrtValue> output_tensors;
       output_tensors.reserve(num_outputs);
+      std::unordered_map<size_t, int> output_dim_sizes;
+      output_dim_sizes.reserve(num_outputs);
+      std::unordered_set<char const*> dds_output_set;
+
       for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) {
-        // Set dynamic shapes
-        const std::string& output_name = output_binding_names[i];
-        int binding_index = trt_engine->getBindingIndex(output_name.c_str());
-        if (binding_index == -1) {
-          continue;
-        }
+        char const* output_name = output_binding_names[i];
 
         size_t output_index = 0;
         const auto& index_iter = output_indexes.find(output_name);
         if (index_iter != output_indexes.end()) {
           output_index = index_iter->second;
         }
-        nvinfer1::Dims dimensions = trt_context->getBindingDimensions(static_cast<int>(binding_index));
-        int nb_dims = dimensions.nbDims;
-        std::vector<int64_t> output_shapes(nb_dims);
-        for (int j = 0, end = nb_dims; j < end; ++j) {
-          output_shapes[j] = dimensions.d[j];
-        }
-        output_tensors.push_back(ctx.GetOutput(output_index, output_shapes));
 
         size_t output_type = 0;
         const auto type_iter = output_types.find(output_name);
@@ -2996,117 +3329,10 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
           output_type = type_iter->second;
         }
 
-        auto& output_tensor = output_tensors.back();
-        switch (output_type) {
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: {
-            auto output_tensor_ptr = output_tensor.GetTensorMutableData<float>();
-            if (output_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(float)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              buffers[binding_index] = output_tensor_ptr;
-            }
-            break;
-          }
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: {
-            auto output_tensor_ptr = output_tensor.GetTensorMutableData<uint16_t>();
-            if (output_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(uint16_t)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              buffers[binding_index] = output_tensor_ptr;
-            }
-            break;
-          }
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: {
-            auto output_tensor_ptr = output_tensor.GetTensorMutableData<bool>();
-            if (output_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(bool)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              buffers[binding_index] = output_tensor_ptr;
-            }
-            break;
-          }
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: {
-            auto output_tensor_ptr = output_tensor.GetTensorMutableData<int8_t>();
-            if (output_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(int8_t)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              buffers[binding_index] = output_tensor_ptr;
-            }
-            break;
-          }
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: {
-            auto output_tensor_ptr = output_tensor.GetTensorMutableData<uint8_t>();
-            if (output_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(uint8_t)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              buffers[binding_index] = output_tensor_ptr;
-            }
-            break;
-          }
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: {
-            auto output_tensor_ptr = output_tensor.GetTensorMutableData<int32_t>();
-            if (output_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(int32_t)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              buffers[binding_index] = output_tensor_ptr;
-            }
-            break;
-          }
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: {
-            // Allocate INT32 CUDA memory for INT64 output type because TensorRT doesn't fully support INT64
-            auto output_tensor_ptr = output_tensor.GetTensorMutableData<int64_t>();
-            if (output_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(int32_t)));
-              buffers[binding_index] = scratch_buffers.back().get();
-              output_dim_sizes[i] = 1;
-            } else {
-              SafeInt<int> output_dim_size(output_dim_sizes[i]);
-              for (int j = 0, end = nb_dims; j < end; ++j) {
-                if (dimensions.d[j] == 0) {
-                  output_dim_size = 1;
-                  break;
-                } else {
-                  output_dim_size *= dimensions.d[j];
-                }
-              }
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, output_dim_size * sizeof(int32_t)));
-              buffers[binding_index] = scratch_buffers.back().get();
-              output_dim_sizes[i] = output_dim_size;
-            }
-            break;
-          }
-          case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: {
-            // Allocate FLOAT CUDA memory for DOUBLE output type because TensorRT doesn't fully support DOUBLE
-            auto output_tensor_ptr = output_tensor.GetTensorMutableData<double>();
-            if (output_tensor_ptr == nullptr) {
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, sizeof(float)));
-              buffers[binding_index] = scratch_buffers.back().get();
-            } else {
-              SafeInt<int> output_dim_size(output_dim_sizes[i]);
-              for (int j = 0, end = nb_dims; j < end; ++j) {
-                if (dimensions.d[j] == 0) {
-                  output_dim_size = 1;
-                  break;
-                } else {
-                  output_dim_size *= dimensions.d[j];
-                }
-              }
-              scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, output_dim_size * sizeof(float)));
-              buffers[binding_index] = scratch_buffers.back().get();
-              output_dim_sizes[i] = output_dim_size;
-            }
-            break;
-          }
-          default: {
-            return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
-                                   "TensorRT EP output tensor data type: " + std::to_string(output_type) + " not supported.");
-          }
+        Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes,
+                                          dds_output_set, dds_output_allocator_map, scratch_buffers, alloc, buffers);
+        if (status != Status::OK()) {
+          return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
         }
       }
 
@@ -3129,33 +3355,48 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
       }
 
       // Run TRT inference
-      if (!trt_context->enqueueV2(&buffers[0], stream, nullptr)) {
+      if (!trt_context->enqueueV3(stream)) {
         return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "TensorRT EP execution context enqueue failed.");
       }
 
-      if (sync_stream_after_enqueue) {
-        cudaStreamSynchronize(stream);
+      if (sync_stream_after_enqueue || dds_output_set.size() > 0) {
+        CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream));
       }
 
-      // Cast INT64 input to INT32 because TensorRT doesn't fully support INT64
+      // Assign TRT output back to ORT output
+      // (1) Bind TRT DDS output to ORT kernel context output. (It needs to wait until enqueueV3 is finished)
+      // (2) Cast TRT INT32 output to ORT INT64 output or TRT float output to double output
       for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) {
-        const std::string& output_name = output_binding_names[i];
-        size_t binding_index = trt_engine->getBindingIndex(output_name.c_str());
+        char const* output_name = output_binding_names[i];
+
         size_t output_type = 0;
         const auto& iter = output_types.find(output_name);
         if (iter != output_types.end()) {
           output_type = iter->second;
         }
-        auto& output_tensor = output_tensors[i];
-        if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) {
-          auto output_tensor_ptr = output_tensor.GetTensorMutableData<int64_t>();
-          if (output_tensor_ptr != nullptr) {
-            cuda::Impl_Cast<int32_t, int64_t>(stream, reinterpret_cast<int32_t*>(buffers[binding_index]), output_tensor_ptr, output_dim_sizes[i]);
+
+        if (dds_output_set.find(output_name) != dds_output_set.end()) {
+          size_t output_index = 0;
+          const auto& index_iter = output_indexes.find(output_name);
+          if (index_iter != output_indexes.end()) {
+            output_index = index_iter->second;
+          }
+          auto status = BindKernelOutput(ctx, &mem_info, dds_output_allocator_map, output_name, output_index, output_type, scratch_buffers, buffers, alloc, stream);
+          if (status != Status::OK()) {
+            return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage());
           }
-        } else if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) {
-          auto output_tensor_ptr = output_tensor.GetTensorMutableData<double>();
-          if (output_tensor_ptr != nullptr) {
-            cuda::Impl_Cast<float, double>(stream, reinterpret_cast<float*>(buffers[binding_index]), output_tensor_ptr, output_dim_sizes[i]);
+        } else {
+          auto& output_tensor = output_tensors[i];
+          if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) {
+            auto output_tensor_ptr = output_tensor.GetTensorMutableData<int64_t>();
+            if (output_tensor_ptr != nullptr) {
+              cuda::Impl_Cast<int32_t, int64_t>(stream, reinterpret_cast<int32_t*>(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]);
+            }
+          } else if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) {
+            auto output_tensor_ptr = output_tensor.GetTensorMutableData<double>();
+            if (output_tensor_ptr != nullptr) {
+              cuda::Impl_Cast<float, double>(stream, reinterpret_cast<float*>(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]);
+            }
           }
         }
       }
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index a945d219088aa..269c1cde31c50 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -97,6 +97,61 @@ template <typename T>
 using unique_pointer = std::unique_ptr<T, TensorrtInferDeleter>;
 };  // namespace tensorrt_ptr
 
+template <typename T>
+inline T RoundUp(T m, T n) {
+  return ((m + n - 1) / n) * n;
+}
+
+//
+// Class to allocate memory for outputs with data-dependent shapes. The sizes of those are unknown so pre-allocation is
+// not possible.
+//
+class OutputAllocator : public nvinfer1::IOutputAllocator {
+ public:
+  OutputAllocator(OrtAllocator* alloc)
+      : allocator(alloc) {
+  }
+
+  void* reallocateOutput(
+      char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment) noexcept override {
+    // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
+    // even for empty tensors, so allocate a dummy byte.
+    size = std::max(size, static_cast<uint64_t>(1));
+    if (size > allocated_size) {
+      buffer = IAllocator::MakeUniquePtrFromOrtAllocator<void>(allocator, RoundUp(size, alignment));
+      allocated_size = size;
+    }
+    return buffer.get();
+  }
+
+  void* getBuffer() {
+    return buffer.get();
+  }
+
+  void notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept override {
+    output_shapes.reserve(dims.nbDims);
+    for (int i = 0; i < dims.nbDims; i++) {
+      output_shapes.push_back(dims.d[i]);
+    }
+  }
+
+  std::vector<int64_t>& getOutputShape() {
+    return output_shapes;
+  }
+
+  uint64_t getSize() {
+    return allocated_size;
+  }
+
+  ~OutputAllocator() override {}
+
+ private:
+  OrtAllocator* allocator = nullptr;
+  IAllocatorUniquePtr<void> buffer;
+  uint64_t allocated_size = 0;
+  std::vector<int64_t> output_shapes;
+};
+
 using ShapeRangesMap = std::unordered_map<std::string, std::unordered_map<size_t, std::vector<std::vector<int64_t>>>>;
 
 // Information to construct kernel function state.
@@ -114,6 +169,7 @@ struct TensorrtFuncState {
   std::vector<std::unordered_map<std::string, size_t>> output_info;
   std::unordered_map<std::string, std::unordered_map<size_t, std::vector<std::vector<int64_t>>>> input_shape_ranges;
   bool sync_stream_after_enqueue = false;
+  std::unordered_map<char const*, OutputAllocator*> dds_output_allocator_map;
   OrtMutex* tensorrt_mu_ptr = nullptr;
   bool fp16_enable = false;
   bool int8_enable = false;
@@ -153,6 +209,7 @@ struct SubGraphContext {
 };
 
 using SubGraphContextMap = std::unordered_map<std::string, std::unique_ptr<SubGraphContext>>;
+using DDSOutputAllocatorMap = std::unordered_map<char const*, OutputAllocator*>;
 
 // Logical device representation.
 class TensorrtExecutionProvider : public IExecutionProvider {
@@ -263,6 +320,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   std::unordered_map<std::string, std::vector<std::vector<int64_t>>> profile_opt_shapes_;
   std::unordered_map<std::string, ShapeRangesMap> input_shape_ranges_;  // The profile shape ranges that the engine is built with
   std::unordered_map<std::string, std::vector<nvinfer1::IOptimizationProfile*>> profiles_;
+  std::unordered_map<std::string, DDSOutputAllocatorMap> dds_output_allocator_map_;  // For DDS output tensor. TODO: Make DDSOutputAllocatorMap use unique_ptr
 
   // for external stream, we need to create its cudnn/cublass handle before cuda EP enable cuda graph capture
   cudnnHandle_t external_cudnn_handle_ = nullptr;
diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc
index b827c28f129b1..b3e7bd4935c1a 100644
--- a/onnxruntime/core/session/custom_ops.cc
+++ b/onnxruntime/core/session/custom_ops.cc
@@ -311,6 +311,22 @@ ORT_API_STATUS_IMPL(OrtApis::KernelContext_GetOutput, _Inout_ OrtKernelContext*
   API_IMPL_END
 };
 
+ORT_API_STATUS_IMPL(OrtApis::KernelContext_SetOutput, _Inout_ OrtKernelContext* context, _In_ size_t index, _In_ const OrtValue* ort_value) {
+  API_IMPL_BEGIN
+#if defined(ENABLE_ATEN) || defined(USE_TENSORRT)
+  auto status = reinterpret_cast<onnxruntime::OpKernelContext*>(context)->SetOutputMLValue(gsl::narrow_cast<int>(index), *ort_value);
+  if (status.IsOK())
+    return nullptr;
+  return onnxruntime::ToOrtStatus(status);
+#else
+  ORT_UNUSED_PARAMETER(context);
+  ORT_UNUSED_PARAMETER(index);
+  ORT_UNUSED_PARAMETER(ort_value);
+  return CreateStatus(ORT_FAIL, "TensorRT execution provider is not enabled in this build.");
+#endif
+  API_IMPL_END
+};
+
 ORT_API_STATUS_IMPL(OrtApis::KernelInfoGetAttribute_string, _In_ const OrtKernelInfo* info, _In_ const char* name, _Out_ char* out, _Inout_ size_t* size) {
   API_IMPL_BEGIN
   std::string value;
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
index 9f8786b727ac1..d729b2a957a84 100644
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -2721,6 +2721,7 @@ static constexpr OrtApi ort_api_1_to_17 = {
     &OrtApis::ShapeInferContext_SetOutputTypeShape,
     &OrtApis::SetSymbolicDimensions,
     &OrtApis::ReadOpAttr,
+    &OrtApis::KernelContext_SetOutput,
 };
 
 // OrtApiBase can never change as there is no way to know what version of OrtApiBase is returned by OrtGetApiBase.
diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h
index 09c83219ad2c8..2c7f501c4b8b0 100644
--- a/onnxruntime/core/session/ort_apis.h
+++ b/onnxruntime/core/session/ort_apis.h
@@ -184,6 +184,7 @@ ORT_API_STATUS_IMPL(KernelContext_GetInputCount, _In_ const OrtKernelContext* co
 ORT_API_STATUS_IMPL(KernelContext_GetOutputCount, _In_ const OrtKernelContext* context, _Out_ size_t* out);
 ORT_API_STATUS_IMPL(KernelContext_GetInput, _In_ const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out);
 ORT_API_STATUS_IMPL(KernelContext_GetOutput, _Inout_ OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Out_ OrtValue** out);
+ORT_API_STATUS_IMPL(KernelContext_SetOutput, _Inout_ OrtKernelContext* context, _In_ size_t index, _In_ const OrtValue* ort_value);
 
 // OrtTypeInfo methods
 ORT_API_STATUS_IMPL(GetDenotationFromTypeInfo, _In_ const OrtTypeInfo*, _Out_ const char** const denotation, _Out_ size_t* len);
diff --git a/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc b/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc
index 5860d3167ce67..8d7d46316381b 100644
--- a/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/dropout_op_test.cc
@@ -30,7 +30,9 @@ TEST(Dropout, WithOptionalOutputOpset10) {
   test.AddInput<float>("X", dims, {1.0f, 2.0f, 3.0f, 5.0f});
   test.AddOutput<float>("Y", dims, {1.0f, 2.0f, 3.0f, 5.0f});
   test.AddOutput<bool>("mask", dims, {false, false, false, false});
-  test.Run();
+  // The fix in onnx-tensorrt parser for dropout onnx node is not included in TRT 8.6.1 but might be included in later ORT release.
+  // Simply skip this for now.
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
 TEST(Dropout, WithOptionalOutputOpset7) {