diff --git a/onnxruntime/test/contrib_ops/attention_op_test.cc b/onnxruntime/test/contrib_ops/attention_op_test.cc
index b652e0723f5aa..5ed68e1b48133 100644
--- a/onnxruntime/test/contrib_ops/attention_op_test.cc
+++ b/onnxruntime/test/contrib_ops/attention_op_test.cc
@@ -227,6 +227,12 @@ static void RunAttentionTest(
       tester.AddOptionalInputEdge<int32_t>();
     }
 
+    if (use_float16) {
+      tester.SetOutputTolerance(0.005f, 0.005f);
+    } else {
+      tester.SetOutputTolerance(0.001f, 0.001f);
+    }
+
     if (enable_cuda) {
       std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
       execution_providers.push_back(DefaultCudaExecutionProvider());
@@ -2013,13 +2019,6 @@ TEST(AttentionTest, AttentionMaskIndexOutOfRange) {
 #if !defined(__wasm__)
 // TODO: fix in web assembly
 TEST(AttentionTest, AttentionPastState_dynamic) {
-  // ORT enables TF32 in GEMM for A100. TF32 will cause precsion loss and fail this test.
-  // Do not run this test unless TF32 is disabled explicitly.
-  if (HasCudaEnvironment(800) && ParseEnvironmentVariableWithDefault<int>("NVIDIA_TF32_OVERRIDE", 1) != 0) {
-    GTEST_SKIP() << "Skipping AttentionPastState_dynamic in A100 since TF32 is enabled";
-    return;
-  }
-
   // create rand inputs
   RandomValueGenerator random{};
 
@@ -2101,13 +2100,6 @@ static void RunModelWithRandomInput(
     std::vector<int32_t>& mask_index_data,
     std::string& onnx_model,
     bool is_float16) {
-  // ORT enables TF32 in GEMM for A100. TF32 will cause precsion loss and fail this test.
-  // Do not run this test unless TF32 is disabled explicitly.
-  if (HasCudaEnvironment(800) && ParseEnvironmentVariableWithDefault<int>("NVIDIA_TF32_OVERRIDE", 1) != 0) {
-    GTEST_SKIP() << "Skipping RunModelWithRandomInput in A100 since TF32 is enabled";
-    return;
-  }
-
   RandomValueGenerator random{234};
 
   constexpr int hidden_size = 768;
diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc b/onnxruntime/test/contrib_ops/beam_search_test.cc
index 156ed3799fc22..6ce9f5de68f11 100644
--- a/onnxruntime/test/contrib_ops/beam_search_test.cc
+++ b/onnxruntime/test/contrib_ops/beam_search_test.cc
@@ -8,6 +8,10 @@
 #include "core/session/onnxruntime_cxx_api.h"
 #include "test/common/cuda_op_test_utils.h"
 
+#ifdef USE_CUDA
+#include "core/providers/cuda/cuda_provider_options.h"
+#endif
+
 extern std::unique_ptr<Ort::Env> ort_env;
 
 namespace onnxruntime {
@@ -70,7 +74,9 @@ TEST(BeamSearchTest, GptBeamSearchFp32) {
 
   Ort::SessionOptions session_options;
 #ifdef USE_CUDA
-  Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+  OrtCUDAProviderOptionsV2 cuda_options;
+  cuda_options.use_tf32 = false;
+  session_options.AppendExecutionProvider_CUDA_V2(cuda_options);
 #endif
 
 #ifdef USE_ROCM
@@ -161,7 +167,9 @@ TEST(BeamSearchTest, GptBeamSearchFp16) {
   if (enable_cuda || enable_rocm) {
     Ort::SessionOptions session_options;
 #ifdef USE_CUDA
-    Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+    OrtCUDAProviderOptionsV2 cuda_options;
+    cuda_options.use_tf32 = false;
+    session_options.AppendExecutionProvider_CUDA_V2(cuda_options);
 #endif
 
 #ifdef USE_ROCM
@@ -254,7 +262,9 @@ TEST(BeamSearchTest, GptBeamSearchWithInitDecoderFp16) {
   if (enable_cuda || enable_rocm) {
     Ort::SessionOptions session_options;
 #ifdef USE_CUDA
-    Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+    OrtCUDAProviderOptionsV2 cuda_options;
+    cuda_options.use_tf32 = false;
+    session_options.AppendExecutionProvider_CUDA_V2(cuda_options);
 #endif
 
 #ifdef USE_ROCM
@@ -346,7 +356,9 @@ TEST(BeamSearchTest, GptBeamSearchFp16_VocabPadded) {
   if (enable_cuda || enable_rocm) {
     Ort::SessionOptions session_options;
 #ifdef USE_CUDA
-    Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+    OrtCUDAProviderOptionsV2 cuda_options;
+    cuda_options.use_tf32 = false;
+    session_options.AppendExecutionProvider_CUDA_V2(cuda_options);
 #endif
 
 #ifdef USE_ROCM
diff --git a/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc b/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc
index 88a2bdf6a4849..8a37ef921fd2b 100644
--- a/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc
+++ b/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc
@@ -31,10 +31,8 @@ static void RunAttentionTest(
     const std::vector<float>* new_value_cache = nullptr,
     const std::vector<float>* key_cache = nullptr,
     const std::vector<float>* value_cache = nullptr,
-    const std::initializer_list<bool>* key_padding_mask_data = nullptr,
-    bool use_float16 = false) {
-  int min_cuda_architecture = use_float16 ? 530 : 0;
-  bool enable_cuda = HasCudaEnvironment(min_cuda_architecture);
+    const std::initializer_list<bool>* key_padding_mask_data = nullptr) {
+  bool enable_cuda = HasCudaEnvironment(0);
   bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get());
   bool enable_cpu = false;
 
@@ -99,6 +97,7 @@ static void RunAttentionTest(
       tester.AddOutput<float>("new_key_cache", output_cache_dims, *new_key_cache);
       tester.AddOutput<float>("new_value_cache", output_cache_dims, *new_value_cache);
     }
+    tester.SetOutputTolerance(0.001f, 0.001f);
 
     std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
     if (enable_cuda) {
diff --git a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc
index acaae2dcd9712..e03e522ebbfce 100644
--- a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc
+++ b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc
@@ -754,9 +754,10 @@ TEST(DecoderMaskedSelfAttentionTest, Test_fp32) {
 
     // Output(s)
     tester.AddOutput<float>("output", input_dims, output);
-
     tester.AddOutput<float>("present", past_dims, present);
 
+    tester.SetOutputTolerance(0.001f, 0.001f);
+
     // Run - Regular kernel execution path
     {
       std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
@@ -897,9 +898,10 @@ TEST(DecoderMaskedSelfAttentionTest, Test_fp16) {
 
     // Output(s)
     tester.AddOutput<MLFloat16>("output", input_dims, output);
-
     tester.AddOutput<MLFloat16>("present", past_dims, present);
 
+    tester.SetOutputTolerance(0.005f, 0.001f);
+
     // Run - Regular kernel execution path
     {
       std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
diff --git a/onnxruntime/test/contrib_ops/fft_op_test.cc b/onnxruntime/test/contrib_ops/fft_op_test.cc
index 56a6466c760f6..be04ee0e49911 100644
--- a/onnxruntime/test/contrib_ops/fft_op_test.cc
+++ b/onnxruntime/test/contrib_ops/fft_op_test.cc
@@ -25,6 +25,7 @@ TEST(ContribOpTest, Rfft) {
   // Target values conputed using PyTorch torch.fft.rfft(X, dim=-1, norm="backward")
   test.AddInput<float>("X", {4, 4}, {0.8129f, 1.3108f, -0.8790f, -1.2046f, 0.1661f, -0.9831f, 0.5879f, 0.4918f, 1.2506f, 0.7244f, -2.6260f, -1.1268f, -1.6885f, 1.0439f, -0.2595f, 1.8780f});
   test.AddOutput<float>("Y", {4, 3, 2}, {0.0400f, 0.0000f, 1.6919f, -2.5154f, -0.1722f, 0.0000f, 0.2627f, 0.0000f, -0.4218f, 1.4748f, 1.2454f, 0.0000f, -1.7779f, 0.0000f, 3.8766f, -1.8512f, -0.9730f, 0.0000f, 0.9740f, 0.0000f, -1.4290f, 0.8341f, -4.8699f, 0.0000f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 
@@ -45,6 +46,7 @@ TEST(ContribOpTest, Irfft) {
   test.AddAttribute("normalized", static_cast<int64_t>(0));
   test.AddInput<float>("X", {4, 3, 2}, {0.0400f, 0.0000f, 1.6919f, -2.5154f, -0.1722f, 0.0000f, 0.2627f, 0.0000f, -0.4218f, 1.4748f, 1.2454f, 0.0000f, -1.7779f, 0.0000f, 3.8766f, -1.8512f, -0.9730f, 0.0000f, 0.9740f, 0.0000f, -1.4290f, 0.8341f, -4.8699f, 0.0000f});
   test.AddOutput<float>("Y", {4, 4}, {0.8129f, 1.3108f, -0.8790f, -1.2046f, 0.1661f, -0.9831f, 0.5879f, 0.4918f, 1.2506f, 0.7244f, -2.6260f, -1.1268f, -1.6885f, 1.0439f, -0.2595f, 1.8780f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 }  // namespace test
diff --git a/onnxruntime/test/contrib_ops/greedy_search_test.cc b/onnxruntime/test/contrib_ops/greedy_search_test.cc
index 1baf50c1ba616..8186529f8df45 100644
--- a/onnxruntime/test/contrib_ops/greedy_search_test.cc
+++ b/onnxruntime/test/contrib_ops/greedy_search_test.cc
@@ -8,6 +8,10 @@
 #include "core/session/onnxruntime_cxx_api.h"
 #include "test/common/cuda_op_test_utils.h"
 
+#ifdef USE_CUDA
+#include "core/providers/cuda/cuda_provider_options.h"
+#endif
+
 extern std::unique_ptr<Ort::Env> ort_env;
 
 namespace onnxruntime {
@@ -64,9 +68,13 @@ TEST(GreedySearchTest, GptGreedySearchFp16_VocabPadded) {
 
   if (is_cuda || is_rocm) {
     Ort::SessionOptions session_options;
+#ifdef USE_CUDA
     if (is_cuda) {
-      Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+      OrtCUDAProviderOptionsV2 cuda_options;
+      cuda_options.use_tf32 = false;
+      session_options.AppendExecutionProvider_CUDA_V2(cuda_options);
     }
+#endif
     if (is_rocm) {
       Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, 0));
     }
@@ -146,7 +154,9 @@ TEST(GreedySearchTest, GptGreedySearchFp32) {
   if (is_cuda || is_rocm) {
     Ort::SessionOptions session_options;
     if (is_cuda) {
-      Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+      OrtCUDAProviderOptionsV2 cuda_options;
+      cuda_options.use_tf32 = false;
+      session_options.AppendExecutionProvider_CUDA_V2(cuda_options);
     }
     if (is_rocm) {
       Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, 0));
diff --git a/onnxruntime/test/contrib_ops/gridsample_test.cc b/onnxruntime/test/contrib_ops/gridsample_test.cc
index 46ed04301a9e8..9251f1b24cbcc 100644
--- a/onnxruntime/test/contrib_ops/gridsample_test.cc
+++ b/onnxruntime/test/contrib_ops/gridsample_test.cc
@@ -126,6 +126,7 @@ TEST(GridsampleContribOpTest, gridsample_mode_bicubic) {
                         0.5000f, 0.5000f, 1.0000f, 1.0000f});
   test.AddAttribute("mode", "bicubic");
   test.AddOutput<float>("Y", {1, 1, 2, 4}, {-0.1406f, 0.3828f, 1.7556f, 2.9688f, 2.9688f, 1.7556f, 5.1445f, 1.3906f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider});
 }
 
diff --git a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc
index 98fb62e435f31..2b77825d80b0e 100644
--- a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc
+++ b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc
@@ -160,6 +160,7 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias) {
   test.AddInput<float>("gamma", {2}, {-0.6953f, 5.1824f});
   test.AddInput<float>("bias", {2}, {0.6435f, -0.3964f});
   test.AddOutput<float>("output", dims, {-0.0516f, -5.5776f, -0.0518f, -5.5788f, -0.0518f, -5.5788f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -172,6 +173,8 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias_Float16Input) {
   test.AddInput<float>("gamma", {2}, {-0.6953f, 5.1824f});
   test.AddInput<float>("bias", {2}, {0.6435f, -0.3964f});
   test.AddOutput<float>("output", dims, {-0.0516f, -5.5776f, -0.0518f, -5.5788f, -0.0518f, -5.5788f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
+
   // TRT, DNNL, OpenVINO and NNAPI, CoreML don't support this combination of datatypes
   test.Run(OpTester::ExpectResult::kExpectSuccess, "",
            {kTensorrtExecutionProvider, kDnnlExecutionProvider, kQnnExecutionProvider,
@@ -228,6 +231,9 @@ TEST(LayerNormTest, LayerNorm17_double) {
   test.AddInput<double>("x", dims, {1.0, 2.0, 3.0, 4.0, 5.0, 6.0});
   test.AddInput<double>("gamma", {3}, {1.0, 1.0, 1.0});
   test.AddOutput<double>("output", dims, {-1.2247, 0.0, 1.2247, -1.2247, 0.0, 1.2247});
+
+  test.SetOutputTolerance(0.0001f, 0.0001f);
+
   // DNNL does not support double
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kDnnlExecutionProvider});
 }
diff --git a/onnxruntime/test/contrib_ops/moe_test.cc b/onnxruntime/test/contrib_ops/moe_test.cc
index ebb0261deefa5..45fe830425a35 100644
--- a/onnxruntime/test/contrib_ops/moe_test.cc
+++ b/onnxruntime/test/contrib_ops/moe_test.cc
@@ -47,6 +47,7 @@ static void RunMoETest(
       tester.AddInput<MLFloat16>("fc1_experts_bias", fc1_experts_bias_dims, ToFloat16(fc1_experts_bias));
       tester.AddInput<MLFloat16>("fc2_experts_bias", fc2_experts_bias_dims, ToFloat16(fc2_experts_bias));
       tester.AddOutput<MLFloat16>("output", output_dims, ToFloat16(output_data));
+      tester.SetOutputTolerance(0.005f, 0.005f);
     } else {
       tester.AddInput<float>("input", input_dims, input);
       tester.AddInput<float>("router_probs", router_probs_dims, router_probs);
@@ -55,6 +56,7 @@ static void RunMoETest(
       tester.AddInput<float>("fc1_experts_bias", fc1_experts_bias_dims, fc1_experts_bias);
       tester.AddInput<float>("fc2_experts_bias", fc2_experts_bias_dims, fc2_experts_bias);
       tester.AddOutput<float>("output", output_dims, output_data);
+      tester.SetOutputTolerance(0.001f, 0.001f);
     }
 
     std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
diff --git a/onnxruntime/test/contrib_ops/packed_attention_op_test.cc b/onnxruntime/test/contrib_ops/packed_attention_op_test.cc
index 31ef62e69bb88..09baf8def05f6 100644
--- a/onnxruntime/test/contrib_ops/packed_attention_op_test.cc
+++ b/onnxruntime/test/contrib_ops/packed_attention_op_test.cc
@@ -433,8 +433,7 @@ static void RunModelWithRandomInput(
   std::vector<int64_t> token_offset_dims{batch_size, sequence_length};
   std::vector<int64_t> cum_seq_len_dims{batch_size + 1};
 
-  // TF32 in SM >= 80 is enabled by default, need larger threshold for float when TF32 is enabled.
-  float gpu_threshold = is_float16 ? 0.15f : (HasCudaEnvironment(800) ? 0.05f : 0.005f);
+  float gpu_threshold = is_float16 ? 0.15f : 0.005f;
   gpu_threshold *= sequence_length > 1024 ? 4.0f : 1.0f;  // threshold should increase with sequence length
   bool enable_cuda = HasCudaEnvironment(is_float16 ? 530 : 0);
   if (enable_cuda) {
diff --git a/onnxruntime/test/contrib_ops/packed_multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/packed_multihead_attention_op_test.cc
index 22253955566f2..9b09362db5c5f 100644
--- a/onnxruntime/test/contrib_ops/packed_multihead_attention_op_test.cc
+++ b/onnxruntime/test/contrib_ops/packed_multihead_attention_op_test.cc
@@ -107,6 +107,7 @@ static void RunPackedMultiHeadAttentionTest(
       }
 
       tester.AddOutput<MLFloat16>("output", output_dims, ToFloat16(output_data));
+      tester.SetOutputTolerance(0.005f, 0.005f);
     } else {
       if (is_packed_qkv) {
         tester.AddInput<float>("query", packed_qkv_dims, query_data);
@@ -131,6 +132,7 @@ static void RunPackedMultiHeadAttentionTest(
       }
 
       tester.AddOutput<float>("output", output_dims, output_data);
+      tester.SetOutputTolerance(0.001f, 0.001f);
     }
 
     std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
diff --git a/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc b/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc
index fd222583ac67f..72dce2628221b 100644
--- a/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc
+++ b/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc
@@ -90,11 +90,13 @@ void RunQAttention(const std::vector<float>& input_data,
     tester.AddInput<MLFloat16>("input_scale", {1}, ToFloat16({input_quant_params.scale}));
     tester.AddInput<MLFloat16>("weight_scale", {1}, ToFloat16({weight_quant_params.scale}));
     tester.AddOutput<MLFloat16>("output", output_dims, ToFloat16(output_data));
+    tester.SetOutputTolerance(0.01f, 0.01f);
   } else {
     tester.AddInput<float>("bias", bias_dims, bias_data);
     tester.AddInput<float>("input_scale", {1}, {input_quant_params.scale});
     tester.AddInput<float>("weight_scale", {1}, {weight_quant_params.scale});
     tester.AddOutput<float>("output", output_dims, output_data);
+    tester.SetOutputTolerance(0.005f, 0.005f);
   }
 
   if (mask_index_data.size() > 0) {
diff --git a/onnxruntime/test/contrib_ops/sampling_test.cc b/onnxruntime/test/contrib_ops/sampling_test.cc
index 733bc9f01fd11..d987a1cae427d 100644
--- a/onnxruntime/test/contrib_ops/sampling_test.cc
+++ b/onnxruntime/test/contrib_ops/sampling_test.cc
@@ -8,6 +8,10 @@
 #include "core/session/onnxruntime_cxx_api.h"
 #include "test/common/cuda_op_test_utils.h"
 
+#ifdef USE_CUDA
+#include "core/providers/cuda/cuda_provider_options.h"
+#endif
+
 extern std::unique_ptr<Ort::Env> ort_env;
 
 namespace onnxruntime {
@@ -65,7 +69,10 @@ TEST(SamplingTest, Gpt2Sampling_GPU) {
     LOGS_DEFAULT(WARNING) << "Hardware NOT support current architecture";
     return;
   }
-  Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+
+  OrtCUDAProviderOptionsV2 cuda_options;
+  cuda_options.use_tf32 = false;
+  session_options.AppendExecutionProvider_CUDA_V2(cuda_options);
 #else  // USE_ROCM
   OrtROCMProviderOptions rocm_options;
   // TODO - verify the default settings
diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc
index 9c2c24e3c337d..db706bf929748 100644
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@@ -25,6 +25,10 @@
 #include "core/session/onnxruntime_session_options_config_keys.h"
 #include "nlohmann/json.hpp"
 
+#ifdef USE_CUDA
+#include "core/providers/cuda/cuda_provider_options.h"
+#endif
+
 using namespace onnxruntime;
 
 namespace {
@@ -401,12 +405,13 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
 
     if (enable_tensorrt) {
 #ifdef USE_TENSORRT
-      OrtCUDAProviderOptions cuda_options;
+      OrtCUDAProviderOptionsV2 cuda_options;
       cuda_options.device_id = device_id;
       cuda_options.do_copy_in_default_stream = true;
+      cuda_options.use_tf32 = false;
       // TODO: Support arena configuration for users of test runner
       Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, device_id));
-      sf.AppendExecutionProvider_CUDA(cuda_options);
+      sf.AppendExecutionProvider_CUDA_V2(cuda_options);
 #else
       fprintf(stderr, "TensorRT is not supported in this build");
       return -1;
@@ -424,10 +429,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
     }
     if (enable_cuda) {
 #ifdef USE_CUDA
-      OrtCUDAProviderOptions cuda_options;
+      OrtCUDAProviderOptionsV2 cuda_options;
       cuda_options.do_copy_in_default_stream = true;
+      cuda_options.use_tf32 = false;
       // TODO: Support arena configuration for users of test runner
-      sf.AppendExecutionProvider_CUDA(cuda_options);
+      sf.AppendExecutionProvider_CUDA_V2(cuda_options);
 #else
       fprintf(stderr, "CUDA is not supported in this build");
       return -1;
diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc
index e94f8c2673be3..4a62edf64bb29 100644
--- a/onnxruntime/test/providers/base_tester.cc
+++ b/onnxruntime/test/providers/base_tester.cc
@@ -120,6 +120,15 @@ void BaseTester::SetOutputRelErr(const char* name, float v) {
   it->validation_params.relative_error = optional<float>(v);
 }
 
+void BaseTester::SetOutputTolerance(float abs_error, float rel_error) {
+  for (auto& output : output_data_) {
+    if (output.def.Exists()) {
+      output.validation_params.absolute_error = optional<float>(abs_error);
+      output.validation_params.relative_error = optional<float>(rel_error);
+    }
+  }
+}
+
 std::vector<int64_t> BaseTester::GetDimsForProto(gsl::span<const int64_t> dims) {
   std::vector<int64_t> dims_for_proto{dims.begin(), dims.end()};
   if (add_symbolic_dim_to_tensor_data_ >= 0 &&
diff --git a/onnxruntime/test/providers/base_tester.h b/onnxruntime/test/providers/base_tester.h
index 5607e58315a12..4a5df0be044b8 100644
--- a/onnxruntime/test/providers/base_tester.h
+++ b/onnxruntime/test/providers/base_tester.h
@@ -522,6 +522,9 @@ class BaseTester {
   void SetOutputAbsErr(const char* name, float v);
   void SetOutputRelErr(const char* name, float v);
 
+  // Set absolute and relative error for added outputs.
+  void SetOutputTolerance(float abs_error, float rel_error);
+
   // Number of times to call InferenceSession::Run. The same feeds are used each time.
   // e.g. used to verify the generator ops behave as expected
   void SetNumRunCalls(int n) {
diff --git a/onnxruntime/test/providers/checkers.cc b/onnxruntime/test/providers/checkers.cc
index c97e6d9de4911..a32e792374cf8 100644
--- a/onnxruntime/test/providers/checkers.cc
+++ b/onnxruntime/test/providers/checkers.cc
@@ -20,46 +20,87 @@ struct DefaultTolerance;
 
 template <>
 struct DefaultTolerance<double> {
-  static constexpr float absolute = 1e-6f;
+  static constexpr float absolute = 1e-5f;
   static constexpr float relative = 1e-5f;
+
+  // Allow to have different default absolute tolerance for different providers.
+  static float get_absolute(const std::string& /*provider_type*/) {
+    return absolute;
+  }
 };
 
 template <>
 struct DefaultTolerance<float> {
+#if defined(ENABLE_TRAINING)
+  static constexpr float absolute = 1e-4f;
+#else
   static constexpr float absolute = 1e-5f;
+#endif
+
   static constexpr float relative = 1e-4f;
+
+  static float get_absolute(const std::string& /*provider_type*/) {
+    return absolute;
+  }
 };
 
 template <>
 struct DefaultTolerance<MLFloat16> {
-  // The thresholds are estimated with PyTorch script like the following:
+  // The thresholds for inference are estimated with PyTorch script like the following:
   //    x = torch.rand(1000, 1000)
   //    absolute = ((x + 1e-6).to(torch.float16) - x).abs().max() * 10
   //    x[abs(x) < absolute] = absolute
   //    relative = ((x - x.to(torch.float16)) / x).abs().max() * 2
+#if defined(ENABLE_TRAINING)
+  static constexpr float absolute = 0.005f;
+#else
   static constexpr float absolute = 0.0025f;
+#endif
+
   static constexpr float relative = 0.001f;
+
+  static float get_absolute(const std::string& provider_type) {
+    if (provider_type == kDmlExecutionProvider) {
+      return 0.005f;
+    }
+    return absolute;
+  }
 };
 
 template <>
 struct DefaultTolerance<BFloat16> {
+  // The thresholds for inference are estimated with PyTorch script like the following:
+  //    x = torch.rand(1000, 1000)
+  //    absolute = ((x + 1e-6).to(torch.bfloat16) - x).abs().max() * 10
+  //    x[abs(x) < absolute] = absolute
+  //    relative = ((x - x.to(torch.bfloat16)) / x).abs().max() * 2
   static constexpr float absolute = 0.02f;
   static constexpr float relative = 0.01f;
+
+  static float get_absolute(const std::string& /*provider_type*/) {
+    return absolute;
+  }
+};
+
+struct ToleranceParams {
+  float absolute;
+  float relative;
 };
 
 template <typename T>
-T get_tolerance(float absolute, float relative, T expected_value) {
+ToleranceParams get_tolerance_params(const ValidateOutputParams& params, const std::string& provider_type) {
+  ToleranceParams new_params;
+  new_params.absolute = params.absolute_error.has_value() ? *(params.absolute_error) : DefaultTolerance<T>::get_absolute(provider_type);
+  new_params.relative = params.relative_error.has_value() ? *(params.relative_error) : DefaultTolerance<T>::relative;
+  return new_params;
+}
+
+template <typename T>
+T get_tolerance(const ToleranceParams& params, T expected_value) {
   static_assert(std::is_floating_point<T>::value, "T must be a floating point type");
 
   // The formula is similar to numpy.isclose: https://numpy.org/doc/stable/reference/generated/numpy.isclose.html
-  return static_cast<T>(absolute) + static_cast<T>(relative) * std::abs(expected_value);
-}
-
-template <typename T, typename D>  // D is the original data type
-T get_tolerance(const ValidateOutputParams& params, T expected_value) {
-  float absolute = (params.absolute_error.has_value() ? *(params.absolute_error) : DefaultTolerance<D>::absolute);
-  float relative = (params.relative_error.has_value() ? *(params.relative_error) : DefaultTolerance<D>::relative);
-  return get_tolerance<T>(absolute, relative, expected_value);
+  return static_cast<T>(params.absolute) + static_cast<T>(params.relative) * std::abs(expected_value);
 }
 
 template <typename T>
@@ -221,11 +262,9 @@ struct TensorCheck<double> {
   void operator()(const Tensor& expected,
                   const Tensor& actual,
                   const ValidateOutputParams& params,
-                  const std::string& /*provider_type*/) const {
+                  const std::string& provider_type) const {
     auto size = actual.Shape().Size();
 
-    const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value();
-
     // deal with rare cases in which order of output data from a kernel MAY be
     // undefined
     Tensor expected_sorted, actual_sorted;
@@ -240,10 +279,7 @@ struct TensorCheck<double> {
       cur_actual = actual.Data<double>();
     }
 
-    double threshold = 0.001;
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML)
-    threshold = 0.005;
-#endif
+    auto tolerance_params = get_tolerance_params<double>(params, provider_type);
 
     for (int64_t i = 0; i < size; ++i) {
       // NOTE: Check isnan first to work around MSVC linker bug when /LTCG:incremental is specified.
@@ -253,7 +289,7 @@ struct TensorCheck<double> {
       } else if (std::isinf(cur_expected[i])) {  // Test infinity for equality
         EXPECT_EQ(cur_expected[i], cur_actual[i]) << "Expected infinity. i:" << i;
       } else {
-        double tolerance = has_tolerance ? get_tolerance<double, double>(params, cur_expected[i]) : threshold;
+        double tolerance = get_tolerance<double>(tolerance_params, cur_expected[i]);
         EXPECT_NEAR(cur_expected[i], cur_actual[i], tolerance) << "i:" << i;
       }
     }
@@ -264,9 +300,7 @@ template <typename T>
 void InternalNumericalCheck(const Tensor& expected,
                             const Tensor& actual,
                             const ValidateOutputParams& params,
-                            const std::string& /*provider_type*/) {
-  const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value();
-
+                            const std::string& provider_type) {
   // deal with rare cases in which order of output data from a kernel MAY be
   // undefined
   Tensor expected_sorted, actual_sorted;
@@ -282,11 +316,7 @@ void InternalNumericalCheck(const Tensor& expected,
     cur_actual = actual.Data<T>();
   }
 
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML)
-  constexpr float threshold = 0.005f;
-#else
-  constexpr float threshold = 0.0001f;
-#endif
+  auto tolerance_params = get_tolerance_params<T>(params, provider_type);
 
   for (int64_t i = 0; i < size; ++i) {
     // NOTE: Check isnan first to work around MSVC linker bug when /LTCG:incremental is specified.
@@ -296,7 +326,7 @@ void InternalNumericalCheck(const Tensor& expected,
     } else if (std::isinf(cur_expected[i])) {  // Test infinity for equality
       EXPECT_EQ(cur_expected[i], cur_actual[i]) << "Expected infinity. i:" << i;
     } else {
-      T tolerance = has_tolerance ? get_tolerance<T, T>(params, cur_expected[i]) : threshold;
+      T tolerance = get_tolerance<T>(tolerance_params, cur_expected[i]);
       EXPECT_NEAR(cur_expected[i], cur_actual[i], tolerance) << "i:" << i;
     }
   }
@@ -317,7 +347,7 @@ struct TensorCheck<MLFloat16> {
   void operator()(const Tensor& expected,
                   const Tensor& actual,
                   const ValidateOutputParams& params,
-                  const std::string& /*provider_type*/) const {
+                  const std::string& provider_type) const {
     auto* cur_expected = expected.Data<MLFloat16>();
     auto* cur_actual = actual.Data<MLFloat16>();
     auto size = actual.Shape().Size();
@@ -333,21 +363,15 @@ struct TensorCheck<MLFloat16> {
       sort_expected_and_actual_buffers<float>(f_expected, f_actual);
     }
 
-    const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value();
+    auto tolerance_params = get_tolerance_params<MLFloat16>(params, provider_type);
 
-    float threshold = 0.001f;
-#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING_CORE) || defined(USE_CUDA) || defined(USE_ROCM)
-    threshold = 0.005f;
-#elif defined(USE_DML)
-    threshold = 0.02f;
-#endif
     for (int64_t i = 0; i < size; ++i) {
       if (std::isnan(f_expected[i])) {
         EXPECT_TRUE(std::isnan(f_expected[i])) << "Expected NaN. i:" << i;
       } else if (std::isinf(f_expected[i])) {  // Test infinity for equality
         EXPECT_EQ(f_expected[i], f_actual[i]) << "Expected infinity. i:" << i;
       } else {
-        float tolerance = has_tolerance ? get_tolerance<float, MLFloat16>(params, f_expected[i]) : threshold;
+        float tolerance = get_tolerance<float>(tolerance_params, f_expected[i]);
         EXPECT_NEAR(f_expected[i], f_actual[i], tolerance) << "i:" << i;
       }
     }
@@ -359,7 +383,7 @@ struct TensorCheck<BFloat16> {
   void operator()(const Tensor& expected,
                   const Tensor& actual,
                   const ValidateOutputParams& params,
-                  const std::string& /*provider_type*/) const {
+                  const std::string& provider_type) const {
     auto* cur_expected = expected.Data<BFloat16>();
     auto* cur_actual = actual.Data<BFloat16>();
     auto size = actual.Shape().Size();
@@ -375,13 +399,7 @@ struct TensorCheck<BFloat16> {
       sort_expected_and_actual_buffers<float>(f_expected, f_actual);
     }
 
-    const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value();
-
-    float abs_threshold = 0.0001f;
-    float rel_threshold = 0.001f;
-#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING_CORE) || defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) || defined(USE_DNNL)
-    rel_threshold = 0.05f;  // expect at least 95% close
-#endif
+    auto tolerance_params = get_tolerance_params<BFloat16>(params, provider_type);
 
     for (int64_t i = 0; i < size; ++i) {
       if (std::isnan(f_expected[i])) {
@@ -389,9 +407,7 @@ struct TensorCheck<BFloat16> {
       } else if (std::isinf(f_expected[i])) {  // Test infinity for equality
         EXPECT_EQ(f_expected[i], f_actual[i]) << "Expected infinity. i:" << i;
       } else {
-        float tolerance = has_tolerance
-                              ? get_tolerance<float, BFloat16>(params, f_expected[i])
-                              : get_tolerance<float>(abs_threshold, rel_threshold, f_expected[i]);
+        float tolerance = get_tolerance<float>(tolerance_params, f_expected[i]);
         EXPECT_NEAR(f_expected[i], f_actual[i], tolerance) << "i:" << i;
       }
     }
diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.h b/onnxruntime/test/providers/cpu/activation/activation_op_test.h
index 984b8f4437a3b..a0a0c28a0e26a 100644
--- a/onnxruntime/test/providers/cpu/activation/activation_op_test.h
+++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.h
@@ -69,6 +69,11 @@ inline void TestActivationOp(const char* szOp, const std::vector<std::vector<T>>
       test.SetOutputRelErr("Y", .000001f);
     }
 #endif
+
+    if (strcmp(szOp, "QuickGelu") == 0) {
+      test.SetOutputTolerance(0.0001f, 0.0001f);
+    }
+
     test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_providers);
   }
 }
diff --git a/onnxruntime/test/providers/cpu/math/einsum_test.cc b/onnxruntime/test/providers/cpu/math/einsum_test.cc
index 4e968d3de6b8a..d9f7ab6ea50fc 100644
--- a/onnxruntime/test/providers/cpu/math/einsum_test.cc
+++ b/onnxruntime/test/providers/cpu/math/einsum_test.cc
@@ -10,6 +10,11 @@
 namespace onnxruntime {
 namespace test {
 
+// Exclude TRT EP in many tests due to Segmentation fault in A100
+static void run_excluding_trt(OpTester& test) {
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+}
+
 // Tests are split up "theme-wise" (i.e.) each kind of operation Einsum can be used for
 // Within each theme we test "explicit" and "implicit" versions of the Einsum equation (wherever possible)
 // Some operations are not possible with implicit notation (reordering, reduction, etc.)
@@ -50,7 +55,7 @@ TEST(Einsum, ExplicitEinsumAsTransposeOp_2D_input_With_Broadcasting) {
   test.AddAttribute<std::string>("equation", "...i->i...");
   test.AddInput<float>("x", {2, 2}, {1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("y", {2, 2}, {1.f, 3.f, 2.f, 4.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsBatchedTransposeOp_3D_input) {
@@ -58,7 +63,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedTransposeOp_3D_input) {
   test.AddAttribute<std::string>("equation", "...ji->...ij");
   test.AddInput<float>("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("y", {2, 2, 2}, {1.f, 3.f, 2.f, 4.f, 1.f, 3.f, 2.f, 4.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 // Implicit
@@ -75,7 +80,7 @@ TEST(Einsum, ImplicitEinsumAsBatchedTransposeOp_3D_input) {
   test.AddAttribute<std::string>("equation", "...ji");
   test.AddInput<float>("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("y", {2, 2, 2}, {1.f, 3.f, 2.f, 4.f, 1.f, 3.f, 2.f, 4.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 // Theme: Axis/Axes reduction
@@ -102,7 +107,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedReduceOp_3D_input_0) {
   test.AddAttribute<std::string>("equation", "...ji->...j");
   test.AddInput<float>("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("y", {2, 2}, {3.f, 7.f, 3.f, 7.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsBatchedReduceOp_3D_input_1) {
@@ -110,7 +115,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedReduceOp_3D_input_1) {
   test.AddAttribute<std::string>("equation", "...ji->...");
   test.AddInput<float>("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("y", {2}, {10.f, 10.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 // Implicit
@@ -144,7 +149,7 @@ TEST(Einsum, ExplicitEinsumAsOuterProductWithTransposeOp_Multi_Input) {
   test.AddInput<float>("y", {2}, {3.f, 4.f});
   test.AddInput<float>("z", {2}, {5.f, 6.f});
   test.AddOutput<float>("o", {2, 2, 2}, {15.f, 18.f, 30.f, 36.f, 20.f, 24.f, 40.f, 48.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 // Implicit
@@ -155,7 +160,7 @@ TEST(Einsum, ImplicitEinsumAsOuterProductOp_2D_input) {
   test.AddInput<float>("y", {2}, {3.f, 4.f});
   test.AddInput<float>("z", {2}, {5.f, 6.f});
   test.AddOutput<float>("o", {2, 2, 2}, {15.f, 18.f, 20.f, 24.f, 30.f, 36.f, 40.f, 48.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ImplicitEinsumAsOuterProductOp_Multi_Input) {
@@ -165,7 +170,7 @@ TEST(Einsum, ImplicitEinsumAsOuterProductOp_Multi_Input) {
   test.AddInput<float>("y", {2}, {3.f, 4.f});
   test.AddInput<float>("z", {2}, {5.f, 6.f});
   test.AddOutput<float>("o", {2, 2, 2}, {15.f, 18.f, 20.f, 24.f, 30.f, 36.f, 40.f, 48.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 // Theme: MatMul
 
@@ -233,7 +238,7 @@ TEST(Einsum, ExplicitEinsumAsMatmul_Multi_Input) {
   test.AddInput<float>("y", {2, 2}, {1.f, 2.f, 3.f, 4.f});
   test.AddInput<float>("z", {2, 2}, {1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2, 2}, {37.f, 81.f, 54.f, 118.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsBatchedMatmul) {
@@ -251,7 +256,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedMatmulWithBroadcasting_0) {
   test.AddInput<float>("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddInput<float>("y", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2, 2, 2}, {7.f, 10.f, 15.f, 22.f, 7.f, 10.f, 15.f, 22.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsBatchedMatmulWithBroadcasting_1) {
@@ -260,7 +265,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedMatmulWithBroadcasting_1) {
   test.AddInput<float>("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddInput<float>("y", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2, 2, 2}, {14.f, 20.f, 30.f, 44.f, 14.f, 20.f, 30.f, 44.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsMatmul_OutputTransposed) {
@@ -303,7 +308,7 @@ TEST(Einsum, ImplicitEinsumAsMatmul_Multi_Input) {
   test.AddInput<float>("y", {2, 2}, {1.f, 2.f, 3.f, 4.f});
   test.AddInput<float>("z", {2, 2}, {1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2, 2}, {37.f, 54.f, 81.f, 118.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 TEST(Einsum, ImplicitEinsumAsBatchedMatmul) {
   OpTester test("Einsum", 12, onnxruntime::kOnnxDomain);
@@ -320,7 +325,7 @@ TEST(Einsum, ImplicitEinsumAsBatchedMatmulWithBroadcasting_0) {
   test.AddInput<float>("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddInput<float>("y", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2, 2, 2}, {7.f, 10.f, 15.f, 22.f, 7.f, 10.f, 15.f, 22.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ImplicitEinsumAsMatmul_2) {
@@ -343,7 +348,7 @@ TEST(Einsum, DiagonalWithMatmul) {
   test.AddInput<float>("x", {2, 2, 3}, {1.f, 2.f, 3.f, 1.f, 2.f, 3.f, 1.f, 2.f, 3.f, 1.f, 2.f, 3.f});
   test.AddInput<float>("y", {3, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f});
   test.AddOutput<float>("o", {3}, {60.f, 72.f, 84.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 // Theme: Diagonal parsing
@@ -354,7 +359,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOp) {
   test.AddAttribute<std::string>("equation", "ii->i");
   test.AddInput<float>("x", {2, 2}, {1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2}, {1.f, 4.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsDiagonalOp_1) {
@@ -362,7 +367,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOp_1) {
   test.AddAttribute<std::string>("equation", "iii->i");
   test.AddInput<float>("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2}, {1.f, 4.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsDiagonalOpWithAxisReduced) {
@@ -370,7 +375,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithAxisReduced) {
   test.AddAttribute<std::string>("equation", "iji->j");
   test.AddInput<float>("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2}, {3.f, 7.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsDiagonalOpWithAxisPreserved) {
@@ -378,7 +383,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithAxisPreserved) {
   test.AddAttribute<std::string>("equation", "iji->ij");
   test.AddInput<float>("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2, 2}, {1.f, 3.f, 2.f, 4.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose) {
@@ -386,7 +391,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose) {
   test.AddAttribute<std::string>("equation", "iji->ji");
   test.AddInput<float>("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2, 2}, {1.f, 2.f, 3.f, 4.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 // ROCm doesn't support double
@@ -396,7 +401,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose_double) {
   test.AddAttribute<std::string>("equation", "iji->ji");
   test.AddInput<double>("x", {2, 2, 2}, {1., 2., 3., 4., 1., 2., 3., 4.});
   test.AddOutput<double>("o", {2, 2}, {1., 2., 3., 4.});
-  test.Run();
+  run_excluding_trt(test);
 }
 #endif
 
@@ -405,7 +410,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose_int32) {
   test.AddAttribute<std::string>("equation", "iji->ji");
   test.AddInput<int32_t>("x", {2, 2, 2}, {1, 2, 3, 4, 1, 2, 3, 4});
   test.AddOutput<int32_t>("o", {2, 2}, {1, 2, 3, 4});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose_int64) {
@@ -413,14 +418,14 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose_int64) {
   test.AddAttribute<std::string>("equation", "iji->ji");
   test.AddInput<int64_t>("x", {2, 2, 2}, {1, 2, 3, 4, 1, 2, 3, 4});
   test.AddOutput<int64_t>("o", {2, 2}, {1, 2, 3, 4});
-  test.Run();
+  run_excluding_trt(test);
 }
 TEST(Einsum, ExplicitEinsumAsBatchedDiagonalOp) {
   OpTester test("Einsum", 12, onnxruntime::kOnnxDomain);
   test.AddAttribute<std::string>("equation", "...ii->...i");
   test.AddInput<float>("x", {3, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {3, 2}, {1.f, 4.f, 1.f, 4.f, 1.f, 4.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsBatchedDiagonalOp_1) {
@@ -428,7 +433,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedDiagonalOp_1) {
   test.AddAttribute<std::string>("equation", "...iij->...j");
   test.AddInput<float>("x", {2, 2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2, 2}, {4.f, 6.f, 4.f, 6.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 // Implicit (Implicit diagonal ops will sum up diagonal values)
@@ -442,7 +447,7 @@ TEST(Einsum, ImplicitEinsumAsDiagonalOp) {
   test.AddAttribute<std::string>("equation", "ii");
   test.AddInput<float>("x", {2, 2}, {1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {}, {5.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ImplicitEinsumAsDiagonalOp_1) {
@@ -455,7 +460,7 @@ TEST(Einsum, ImplicitEinsumAsDiagonalOp_1) {
   test.AddAttribute<std::string>("equation", "iii");
   test.AddInput<float>("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {}, {5.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ImplicitEinsumAsDiagonalOpWithAxisReduced) {
@@ -463,7 +468,7 @@ TEST(Einsum, ImplicitEinsumAsDiagonalOpWithAxisReduced) {
   test.AddAttribute<std::string>("equation", "iji");
   test.AddInput<float>("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2}, {3.f, 7.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ImplicitEinsumAsBatchedDiagonalOp) {
@@ -471,7 +476,7 @@ TEST(Einsum, ImplicitEinsumAsBatchedDiagonalOp) {
   test.AddAttribute<std::string>("equation", "...ii");
   test.AddInput<float>("x", {2, 1, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2, 1}, {5.f, 5.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ImplicitEinsumAsBatchedDiagonalOp_1) {
@@ -479,7 +484,7 @@ TEST(Einsum, ImplicitEinsumAsBatchedDiagonalOp_1) {
   test.AddAttribute<std::string>("equation", "...iij");
   test.AddInput<float>("x", {2, 2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2, 2}, {4.f, 6.f, 4.f, 6.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 // Theme: Scalar inputs and outputs
@@ -491,7 +496,7 @@ TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithOneScalar) {
   test.AddInput<float>("x", {}, {10.f});
   test.AddInput<float>("y", {2, 2}, {1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2, 2}, {10.f, 20.f, 30.f, 40.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithTwoScalars_Multi_Input) {
@@ -501,7 +506,7 @@ TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithTwoScalars_Multi_Input) {
   test.AddInput<float>("y", {2, 2}, {1.f, 2.f, 3.f, 4.f});
   test.AddInput<float>("z", {}, {10.f});
   test.AddOutput<float>("o", {2, 2}, {100.f, 200.f, 300.f, 400.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithAllScalars) {
   OpTester test("Einsum", 12, onnxruntime::kOnnxDomain);
@@ -527,7 +532,7 @@ TEST(Einsum, ImplicitEinsumAsElementwiseMulOpWithOneScalar) {
   test.AddInput<float>("x", {}, {10.f});
   test.AddInput<float>("y", {2, 2}, {1.f, 2.f, 3.f, 4.f});
   test.AddOutput<float>("o", {2, 2}, {10.f, 20.f, 30.f, 40.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ImplicitEinsumAsElementwiseMulOpWithThreeScalars_Multi_Input) {
@@ -538,7 +543,7 @@ TEST(Einsum, ImplicitEinsumAsElementwiseMulOpWithThreeScalars_Multi_Input) {
   test.AddInput<float>("c", {}, {10.f});
   test.AddInput<float>("d", {}, {10.f});
   test.AddOutput<float>("o", {2, 2}, {1000.f, 2000.f, 3000.f, 4000.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 TEST(Einsum, ImplicitEinsumAsElementwiseMulOpWithAllScalars) {
   OpTester test("Einsum", 12, onnxruntime::kOnnxDomain);
@@ -568,7 +573,7 @@ TEST(Einsum, ExplicitEinsumAsTensorContractionReshapeFinal) {
   test.AddInput<float>("y", {2, 2}, {1.f, 2.f, -6.f, 2.f});
   test.AddInput<float>("z", {2, 2}, {3.f, 4.f, 5.f, 6.f});
   test.AddOutput<float>("o", {2, 2, 2}, {63.f, -132.f, 63.f, -132.f, 63.f, -132.f, 63.f, -132.f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsTensorContractionReshapeLeft) {
@@ -720,7 +725,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOp_Half) {
   ConvertFloatToMLFloat16(output_f.data(), output.data(), 2);
   test.AddInput<MLFloat16>("x", {2, 2}, input_x);
   test.AddOutput<MLFloat16>("o", {2}, output);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithOneScalar_Half) {
@@ -741,7 +746,7 @@ TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithOneScalar_Half) {
   test.AddInput<MLFloat16>("x", {}, input_x);
   test.AddInput<MLFloat16>("y", {2, 2}, input_y);
   test.AddOutput<MLFloat16>("o", {2, 2}, output);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(Einsum, ExplicitEinsumAsTensorContraction_Half) {
@@ -2093,7 +2098,7 @@ TEST_P(EinsumTransposeMatMulThreeInputsTest, EinsumTransposeMatMulThreeInputsTes
   std::vector<int64_t> v1(tst.shape.begin(), tst.shape.end());
   std::vector<float> v2(tst.expected.begin(), tst.expected.end());
   test.AddOutput<float>("o", v1, v2);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 INSTANTIATE_TEST_SUITE_P(EinsumTransposeMatMulThreeInputsTests, EinsumTransposeMatMulThreeInputsTest, testing::ValuesIn(case1));
diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
index d35e5c78cfd69..090022d9e68c9 100644
--- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
@@ -1370,7 +1370,8 @@ static void TestSumMultipleInputsNoBroadcasting(size_t num_inputs, const TensorS
 
   test.AddOutput<element_type>("sum", dims, expected_output_data);
 
-  test.Run();
+  // TensorRT EP Segmentation fault: https://github.com/microsoft/onnxruntime/issues/19530
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
 TEST(MathOpTest, SumMultipleInputsNoBroadcasting) {
@@ -2630,7 +2631,7 @@ TEST(MathOpTest, Mean_8) {
 #endif
 
 template <float (&op)(float value) MATH_NO_EXCEPT>
-void TrigFloatTest(OpTester& test, std::initializer_list<float> input) {
+void TrigFloatTest(OpTester& test, std::initializer_list<float> input, float tolerance = -1.0f) {
   std::vector<int64_t> dims{static_cast<int64_t>(input.size())};
 
   std::vector<float> output;
@@ -2639,6 +2640,12 @@ void TrigFloatTest(OpTester& test, std::initializer_list<float> input) {
 
   test.AddInput<float>("X", dims, input);
   test.AddOutput<float>("Y", dims, output);
+
+  if (tolerance > 0.0f) {
+    test.SetOutputAbsErr("Y", tolerance);
+    test.SetOutputRelErr("Y", tolerance);
+  }
+
   test.Run();
 }
 
@@ -2708,6 +2715,7 @@ TEST(MathOpTest, CosFloat16) {
     TrigFloat16Test<::cosf>(test, {1.1f, -1.1f, 2.2f, -2.2f});
   }
 }
+
 TEST(MathOpTest, Tan) {
   OpTester test("Tan");
   TrigFloatTest<::tanf>(test, {-100.0f, -50.0f, 0.0f, 50.0f, 100.0f});
@@ -2715,7 +2723,8 @@ TEST(MathOpTest, Tan) {
 
 TEST(MathOpTest, Asin) {
   OpTester test("Asin");
-  TrigFloatTest<::asinf>(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f});
+  float tolerance = DefaultDmlExecutionProvider().get() != nullptr ? 0.0001f : -1.0f;
+  TrigFloatTest<::asinf>(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}, tolerance);
 }
 
 TEST(MathOpTest, Acos) {
diff --git a/onnxruntime/test/providers/cpu/math/logsoftmax_test.cc b/onnxruntime/test/providers/cpu/math/logsoftmax_test.cc
index 273503e7bf6af..d7befe7820407 100644
--- a/onnxruntime/test/providers/cpu/math/logsoftmax_test.cc
+++ b/onnxruntime/test/providers/cpu/math/logsoftmax_test.cc
@@ -15,7 +15,8 @@ static void RunTest(const std::vector<float>& x_vals,
                     int64_t axis = 1,
                     bool is_tensorrt_supported = true,
                     OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess,
-                    const std::string& error_msg = "") {
+                    const std::string& error_msg = "",
+                    float tolerance = 0.0f) {
   OpTester tester("LogSoftmax", opset);
 
   if (opset < 13) {
@@ -31,6 +32,11 @@ static void RunTest(const std::vector<float>& x_vals,
   tester.AddInput("X", dimensions, x_vals);
   tester.AddOutput("Y", dimensions, expected_vals);
 
+  if (tolerance != 0.0f) {
+    tester.SetOutputAbsErr("Y", tolerance);
+    tester.SetOutputRelErr("Y", tolerance);
+  }
+
   std::unordered_set<std::string> excluded_providers;
   if (!is_tensorrt_supported) {
     excluded_providers.insert(kTensorrtExecutionProvider);
@@ -62,7 +68,7 @@ TEST(LogSoftmaxOperator, LargeNumber) {
                                       -3.4401896f, -2.4401896f, -1.44018972f, -0.44018969f};
   std::vector<int64_t> dimensions = {2, 4};
 
-  RunTest(x_vals, expected_vals, dimensions);
+  RunTest(x_vals, expected_vals, dimensions, 7, 1, true, OpTester::ExpectResult::kExpectSuccess, "", 0.0005f);
 }
 
 // np.random.seed(123)   # Use a seed so we can replicate the input and expected values here and in python
diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc
index af71fe5cf79ae..58df7763786b2 100644
--- a/onnxruntime/test/providers/cpu/model_tests.cc
+++ b/onnxruntime/test/providers/cpu/model_tests.cc
@@ -42,6 +42,10 @@
 #include "core/providers/armnn/armnn_provider_factory.h"
 #endif
 
+#ifdef USE_CUDA
+#include "core/providers/cuda/cuda_provider_options.h"
+#endif
+
 #include "test/common/cuda_op_test_utils.h"
 
 // test infrastructure
@@ -98,21 +102,6 @@ TEST_P(ModelTest, Run) {
 
   std::unique_ptr<OnnxModelInfo> model_info = std::make_unique<OnnxModelInfo>(model_path.c_str());
 
-#if defined(__linux__)
-  // ORT enables TF32 in GEMM for A100. TF32 will cause precsion loss and fail this test.
-  if (HasCudaEnvironment(800) && provider_name == "cuda") {
-    per_sample_tolerance = 1e-1;
-    if (model_path.find(ORT_TSTR("SSD")) > 0 ||
-        model_path.find(ORT_TSTR("ssd")) > 0 ||
-        model_path.find(ORT_TSTR("yolov3")) > 0 ||
-        model_path.find(ORT_TSTR("mask_rcnn")) > 0 ||
-        model_path.find(ORT_TSTR("FNS")) > 0) {
-      SkipTest("Skipping SSD test for big tolearance failure or other errors");
-      return;
-    }
-  }
-#endif
-
   if (model_info->HasDomain(ONNX_NAMESPACE::AI_ONNX_TRAINING_DOMAIN) ||
       model_info->HasDomain(ONNX_NAMESPACE::AI_ONNX_PREVIEW_TRAINING_DOMAIN)) {
     SkipTest("it has the training domain. No pipeline should need to run these tests.");
@@ -198,6 +187,7 @@ TEST_P(ModelTest, Run) {
         std::string device_id = Env::Default().GetEnvironmentVar("ONNXRUNTIME_TEST_GPU_DEVICE_ID");
         values.push_back(device_id.empty() ? "0" : device_id.c_str());
         ASSERT_ORT_STATUS_OK(OrtApis::UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), 1));
+        cuda_options->use_tf32 = false;
         ortso.AppendExecutionProvider_CUDA_V2(*cuda_options);
       } else if (provider_name == "rocm") {
         OrtROCMProviderOptions ep_options;
@@ -229,6 +219,7 @@ TEST_P(ModelTest, Run) {
         ASSERT_ORT_STATUS_OK(OrtApis::CreateCUDAProviderOptions(&cuda_options));
         std::unique_ptr<OrtCUDAProviderOptionsV2, decltype(&OrtApis::ReleaseCUDAProviderOptions)> rel_cuda_options(
             cuda_options, &OrtApis::ReleaseCUDAProviderOptions);
+        cuda_options->use_tf32 = false;
         ortso.AppendExecutionProvider_CUDA_V2(*cuda_options);
       } else if (provider_name == "migraphx") {
         OrtMIGraphXProviderOptions ep_options;
diff --git a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc
index 3d30fc62a945d..76df28399564c 100644
--- a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc
@@ -905,14 +905,16 @@ TEST(BatchNormTest, ForwardTrainingTestWithSavedOutputsOpset9) {
   test.AddInput<float>("var", channel_dims, {1.0f, 2.0f});
 
   test.AddOutput<float>("Y", input_output_dims, {0.0131f, 0.5210f, 1.7244f, 0.1387f, -0.2708f, -0.1191f, 1.2089f, -0.0922f, -0.9548f, -1.5203f, 0.9077f, -0.8298f, 0.5796f, -0.4501f, -2.0921f, 1.2358f});
-
   test.AddOutput<float>("running_mean", channel_dims, {-0.1754f, 0.303106f});
   test.AddOutput<float>("running_var", channel_dims, {0.696052f, 1.41316f});
+
   // mean and variance of X across channel dimension
   // With Opset9 we output saved_inv_std instead of saved_var to match CUDA EP
   test.AddOutput<float>("saved_mean", channel_dims, {-0.306f, 0.114562f});
   test.AddOutput<float>("saved_inv_std", channel_dims, {1.2288f, 0.861317f});
 
+  test.SetOutputTolerance(0.0001f, 0.0001f);
+
   // exclude CUDA Execution Provider due to flakiness
   // exclude TRT and OpenVINO for same reasons as seen in TestBatchNorm()
   test.Run(OpTester::ExpectResult::kExpectSuccess, "",
@@ -938,10 +940,11 @@ TEST(BatchNormTest, ForwardTrainingTestOpset14) {
   test.AddInput<float>("var", channel_dims, {1.0f, 2.0f});
 
   test.AddOutput<float>("Y", input_output_dims, {0.0131f, 0.5210f, 1.7244f, 0.1387f, -0.2708f, -0.1191f, 1.2089f, -0.0922f, -0.9548f, -1.5203f, 0.9077f, -0.8298f, 0.5796f, -0.4501f, -2.0921f, 1.2358f});
-
   test.AddOutput<float>("running_mean", channel_dims, {-0.1754f, 0.303106f});
   test.AddOutput<float>("running_var", channel_dims, {0.696052f, 1.41316f});
 
+  test.SetOutputTolerance(0.0001f, 0.0001f);
+
   // exclude CUDA Execution Provider due to flakiness
   // exclude TRT and OpenVINO for same reasons as seen in TestBatchNorm()
   test.Run(OpTester::ExpectResult::kExpectSuccess, "",
@@ -970,6 +973,8 @@ TEST(BatchNormTest, ForwardTrainingTestOpset15) {
   test.AddOutput<float>("running_mean", channel_dims, {-0.1754f, 0.303106f});
   test.AddOutput<float>("running_var", channel_dims, {0.696052f, 1.41316f});
 
+  test.SetOutputTolerance(0.0001f, 0.0001f);
+
   // Same exclusions as the opset 14 test
   test.Run(OpTester::ExpectResult::kExpectSuccess, "",
            {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider,
diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
index e24cda17166ed..b59d4492064a4 100644
--- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
@@ -888,6 +888,7 @@ TEST(PoolTest, AveragePool_IncludePadPixel) {
 
   test.AddInput<float>("X", x_dims, x_vals);
   test.AddOutput<float>("Y", expected_dims, expected_vals);
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider});
 }
 
diff --git a/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc b/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc
index 2f97f6e71e92b..12976c040cdf7 100644
--- a/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc
+++ b/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc
@@ -463,6 +463,7 @@ static void BasicTest() {
                                            0.3661f,
                                            0.2349f,
                                        });
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -713,7 +714,8 @@ TEST(RoiAlignTest, AvgModeNegativeInvalidMode) {
   test.AddInput<int64_t>("batch_indices", {5}, {0, 0, 0, 0, 0});
   test.AddOutput<float>("Y", {5, 3, 3, 4}, {2.95833f, 3.20833f, 3.45833f, 3.70833f, 4.625f, 4.875f, 5.125f, 5.375f, 6.29167f, 6.54167f, 6.79167f, 7.04167f, 27.9583f, 28.2083f, 28.4583f, 28.7083f, 29.625f, 29.875f, 30.125f, 30.375f, 31.2917f, 31.5417f, 31.7917f, 32.0417f, 52.9583f, 53.2083f, 53.4583f, 53.7083f, 54.625f, 54.875f, 55.125f, 55.375f, 56.2917f, 56.5417f, 56.7917f, 57.0417f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 7.39583f, 7.39583f, 7.42708f, 7.64583f, 9.0625f, 9.0625f, 9.09375f, 9.3125f, 10.7292f, 10.7292f, 10.7604f, 10.9792f, 32.3958f, 32.3958f, 32.4271f, 32.6458f, 34.0625f, 34.0625f, 34.0938f, 34.3125f, 35.7292f, 35.7292f, 35.7604f, 35.9792f, 57.3958f, 57.3958f, 57.4271f, 57.6458f, 59.0625f, 59.0625f, 59.0938f, 59.3125f, 60.7292f, 60.7292f, 60.7604f, 60.9792f, 4.27083f, 4.52083f, 4.77083f, 5.02083f, 5.9375f, 6.1875f, 6.4375f, 6.6875f, 7.60417f, 7.85417f, 8.10417f, 8.35417f, 29.2708f, 29.5208f, 29.7708f, 30.0208f, 30.9375f, 31.1875f, 31.4375f, 31.6875f, 32.6042f, 32.8542f, 33.1042f, 33.3542f, 54.2708f, 54.5208f, 54.7708f, 55.0208f, 55.9375f, 56.1875f, 56.4375f, 56.6875f, 57.6042f, 57.8542f, 58.1042f, 58.3542f, 6.77083f, 6.77083f, 6.77083f, 6.80208f, 8.4375f, 8.4375f, 8.4375f, 8.46875f, 10.1042f, 10.1042f, 10.1042f, 10.1354f, 31.7708f, 31.7708f, 31.7708f, 31.8021f, 33.4375f, 33.4375f, 33.4375f, 33.4688f, 35.1042f, 35.1042f, 35.1042f, 35.1354f, 56.7708f, 56.7708f, 56.7708f, 56.8021f, 58.4375f, 58.4375f, 58.4375f, 58.4688f, 60.1042f, 60.1042f, 60.1042f, 60.1354f});
 
-  test.Run(OpTester::ExpectResult::kExpectFailure, "Invalid mode");
+  // Exclude TRT EP due to Segmentation fault in A100
+  test.Run(OpTester::ExpectResult::kExpectFailure, "Invalid mode", {kTensorrtExecutionProvider});
 }
 
 TEST(RoiAlignTest, AvgModeNegativeSamplingRatio) {
@@ -738,7 +740,8 @@ TEST(RoiAlignTest, AvgModeNegativeSamplingRatio) {
   test.AddInput<int64_t>("batch_indices", {5}, {0, 0, 0, 0, 0});
   test.AddOutput<float>("Y", {5, 3, 3, 4}, {2.95833f, 3.20833f, 3.45833f, 3.70833f, 4.625f, 4.875f, 5.125f, 5.375f, 6.29167f, 6.54167f, 6.79167f, 7.04167f, 27.9583f, 28.2083f, 28.4583f, 28.7083f, 29.625f, 29.875f, 30.125f, 30.375f, 31.2917f, 31.5417f, 31.7917f, 32.0417f, 52.9583f, 53.2083f, 53.4583f, 53.7083f, 54.625f, 54.875f, 55.125f, 55.375f, 56.2917f, 56.5417f, 56.7917f, 57.0417f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 7.39583f, 7.39583f, 7.42708f, 7.64583f, 9.0625f, 9.0625f, 9.09375f, 9.3125f, 10.7292f, 10.7292f, 10.7604f, 10.9792f, 32.3958f, 32.3958f, 32.4271f, 32.6458f, 34.0625f, 34.0625f, 34.0938f, 34.3125f, 35.7292f, 35.7292f, 35.7604f, 35.9792f, 57.3958f, 57.3958f, 57.4271f, 57.6458f, 59.0625f, 59.0625f, 59.0938f, 59.3125f, 60.7292f, 60.7292f, 60.7604f, 60.9792f, 4.27083f, 4.52083f, 4.77083f, 5.02083f, 5.9375f, 6.1875f, 6.4375f, 6.6875f, 7.60417f, 7.85417f, 8.10417f, 8.35417f, 29.2708f, 29.5208f, 29.7708f, 30.0208f, 30.9375f, 31.1875f, 31.4375f, 31.6875f, 32.6042f, 32.8542f, 33.1042f, 33.3542f, 54.2708f, 54.5208f, 54.7708f, 55.0208f, 55.9375f, 56.1875f, 56.4375f, 56.6875f, 57.6042f, 57.8542f, 58.1042f, 58.3542f, 6.77083f, 6.77083f, 6.77083f, 6.80208f, 8.4375f, 8.4375f, 8.4375f, 8.46875f, 10.1042f, 10.1042f, 10.1042f, 10.1354f, 31.7708f, 31.7708f, 31.7708f, 31.8021f, 33.4375f, 33.4375f, 33.4375f, 33.4688f, 35.1042f, 35.1042f, 35.1042f, 35.1354f, 56.7708f, 56.7708f, 56.7708f, 56.8021f, 58.4375f, 58.4375f, 58.4375f, 58.4688f, 60.1042f, 60.1042f, 60.1042f, 60.1354f});
 
-  test.Run(OpTester::ExpectResult::kExpectFailure, "Sampling ratio should be >=0");
+  // Exclude TRT EP due to Segmentation fault in A100
+  test.Run(OpTester::ExpectResult::kExpectFailure, "Sampling ratio should be >=0", {kTensorrtExecutionProvider});
 }
 
 TEST(RoiAlignTest, AvgModeNegativeInvalidNumRoiDims) {
diff --git a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc
index 7e81fc80ddf85..ac7bd4ddb17d3 100644
--- a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc
+++ b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc
@@ -143,6 +143,8 @@ static void RunLstmTest(const std::vector<float>& X_data,
     test.AddOptionalOutputEdge<float>();
   }
 
+  test.SetOutputTolerance(0.0001f, 0.0001f);
+
   // TensorRT failed on LSTM tests
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
diff --git a/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc b/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc
index e37e784f28930..98f7eb6133822 100644
--- a/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc
@@ -13,6 +13,7 @@ TEST(AffineGridTest, 2d) {
   test.AddInput<int64_t>("size", {4}, {1, 1, 2, 3});
   test.AddOutput<float>("grid", {1, 2, 3, 2},
                         {-0.6667f, -0.5000f, 0.0000f, -0.5000f, 0.6667f, -0.5000f, -0.6667f, 0.5000f, 0.0000f, 0.5000f, 0.6667f, 0.5000f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -24,6 +25,7 @@ TEST(AffineGridTest, test_2d_0) {
   test.AddInput<float>("theta", {1, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f});
   test.AddInput<int64_t>("size", {4}, {1, 1, 3, 2});
   test.AddOutput<float>("grid", {1, 3, 2, 2}, {-0.3228f, -0.9151f, 1.1544f, -0.7414f, -0.4386f, -0.5868f, 1.0386f, -0.4132f, -0.5544f, -0.2586f, 0.9228f, -0.0849f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -33,6 +35,7 @@ TEST(AffineGridTest, test_2d_1) {
   test.AddInput<float>("theta", {2, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f, 1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f});
   test.AddInput<int64_t>("size", {4}, {2, 10, 2, 3});
   test.AddOutput<float>("grid", {2, 2, 3, 2}, {-0.5980f, -0.8620f, 0.3868f, -0.7462f, 1.3716f, -0.6304f, -0.7716f, -0.3696f, 0.2132f, -0.2538f, 1.1980f, -0.1380f, -0.5980f, -0.8620f, 0.3868f, -0.7462f, 1.3716f, -0.6304f, -0.7716f, -0.3696f, 0.2132f, -0.2538f, 1.1980f, -0.1380f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -42,6 +45,7 @@ TEST(AffineGridTest, test_2d_2) {
   test.AddInput<float>("theta", {1, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f});
   test.AddInput<int64_t>("size", {4}, {1, 1, 3, 2});
   test.AddOutput<float>("grid", {1, 3, 2, 2}, {-0.6726f, -2.7663f, 0.8274f, -1.9003f, -1.2500f, -0.9330f, 0.2500f, -0.0670f, -1.8274f, 0.9003f, -0.3274f, 1.7663f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -51,6 +55,7 @@ TEST(AffineGridTest, test_2d_3) {
   test.AddInput<float>("theta", {2, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f, 1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f});
   test.AddInput<int64_t>("size", {4}, {2, 10, 2, 3});
   test.AddOutput<float>("grid", {2, 2, 3, 2}, {-1.0670f, -2.4524f, -0.0670f, -1.8750f, 0.9330f, -1.2976f, -1.9330f, 0.2976f, -0.9330f, 0.8750f, 0.0670f, 1.4524f, -1.0670f, -2.4524f, -0.0670f, -1.8750f, 0.9330f, -1.2976f, -1.9330f, 0.2976f, -0.9330f, 0.8750f, 0.0670f, 1.4524f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -60,6 +65,7 @@ TEST(AffineGridTest, test_2d_4) {
   test.AddInput<float>("theta", {1, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f});
   test.AddInput<int64_t>("size", {4}, {1, 1, 3, 2});
   test.AddOutput<float>("grid", {1, 3, 2, 2}, {-1.0036f, -1.1661f, 1.9509f, -0.8188f, -1.1772f, -0.6736f, 1.7772f, -0.3264f, -1.3509f, -0.1812f, 1.6036f, 0.1661f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -69,6 +75,7 @@ TEST(AffineGridTest, test_2d_5) {
   test.AddInput<float>("theta", {2, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f, 1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f});
   test.AddInput<int64_t>("size", {4}, {2, 10, 2, 3});
   test.AddOutput<float>("grid", {2, 2, 3, 2}, {-1.0036f, -1.1661f, 0.4736f, -0.9924f, 1.9509f, -0.8188f, -1.3509f, -0.1812f, 0.1264f, -0.0076f, 1.6036f, 0.1661f, -1.0036f, -1.1661f, 0.4736f, -0.9924f, 1.9509f, -0.8188f, -1.3509f, -0.1812f, 0.1264f, -0.0076f, 1.6036f, 0.1661f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -78,6 +85,7 @@ TEST(AffineGridTest, test_2d_6) {
   test.AddInput<float>("theta", {1, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f});
   test.AddInput<int64_t>("size", {4}, {1, 1, 3, 2});
   test.AddOutput<float>("grid", {1, 3, 2, 2}, {-1.1340f, -4.1160f, 1.8660f, -2.3840f, -2.0000f, -1.3660f, 1.0000f, 0.3660f, -2.8660f, 1.3840f, 0.1340f, 3.1160f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -87,6 +95,7 @@ TEST(AffineGridTest, test_2d_7) {
   test.AddInput<float>("theta", {2, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f, 1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f});
   test.AddInput<int64_t>("size", {4}, {2, 10, 2, 3});
   test.AddOutput<float>("grid", {2, 2, 3, 2}, {-1.1340f, -4.1160f, 0.3660f, -3.2500f, 1.8660f, -2.3840f, -2.8660f, 1.3840f, -1.3660f, 2.2500f, 0.1340f, 3.1160f, -1.1340f, -4.1160f, 0.3660f, -3.2500f, 1.8660f, -2.3840f, -2.8660f, 1.3840f, -1.3660f, 2.2500f, 0.1340f, 3.1160f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -96,6 +105,7 @@ TEST(AffineGridTest, test_3d_0) {
   test.AddInput<float>("theta", {1, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f});
   test.AddInput<int64_t>("size", {5}, {1, 1, 3, 2, 2});
   test.AddOutput<float>("grid", {1, 3, 2, 2, 3}, {-0.7468f, -1.3266f, 1.5323f, 0.6627f, -1.2078f, 1.3639f, -0.7468f, 0.6430f, 1.6191f, 0.6627f, 0.7618f, 1.4507f, -0.4048f, -1.5442f, 1.8408f, 1.0048f, -1.4254f, 1.6724f, -0.4048f, 0.4254f, 1.9276f, 1.0048f, 0.5442f, 1.7592f, -0.0627f, -1.7618f, 2.1493f, 1.3468f, -1.6430f, 1.9809f, -0.0627f, 0.2078f, 2.2361f, 1.3468f, 0.3266f, 2.0677f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -105,6 +115,7 @@ TEST(AffineGridTest, test_3d_1) {
   test.AddInput<float>("theta", {2, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f, 1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f});
   test.AddInput<int64_t>("size", {5}, {2, 10, 2, 2, 3});
   test.AddOutput<float>("grid", {2, 2, 2, 3, 3}, {-0.8962f, -1.4008f, 1.6375f, 0.0435f, -1.3216f, 1.5252f, 0.9832f, -1.2424f, 1.4130f, -0.8962f, 0.5688f, 1.7243f, 0.0435f, 0.6480f, 1.6121f, 0.9832f, 0.7272f, 1.4998f, -0.3832f, -1.7272f, 2.1002f, 0.5565f, -1.6480f, 1.9879f, 1.4962f, -1.5688f, 1.8757f, -0.3832f, 0.2424f, 2.1870f, 0.5565f, 0.3216f, 2.0748f, 1.4962f, 0.4008f, 1.9625f, -0.8962f, -1.4008f, 1.6375f, 0.0435f, -1.3216f, 1.5252f, 0.9832f, -1.2424f, 1.4130f, -0.8962f, 0.5688f, 1.7243f, 0.0435f, 0.6480f, 1.6121f, 0.9832f, 0.7272f, 1.4998f, -0.3832f, -1.7272f, 2.1002f, 0.5565f, -1.6480f, 1.9879f, 1.4962f, -1.5688f, 1.8757f, -0.3832f, 0.2424f, 2.1870f, 0.5565f, 0.3216f, 2.0748f, 1.4962f, 0.4008f, 1.9625f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -114,6 +125,7 @@ TEST(AffineGridTest, test_3d_2) {
   test.AddInput<float>("theta", {1, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f});
   test.AddInput<int64_t>("size", {5}, {1, 1, 3, 2, 2});
   test.AddOutput<float>("grid", {1, 3, 2, 2, 3}, {-0.5299f, 0.8995f, -4.3568f, -0.2701f, -0.3995f, -2.9818f, -0.5299f, 2.3995f, 0.4064f, -0.2701f, 1.1005f, 1.7814f, -0.6299f, -0.6005f, -2.7691f, -0.3701f, -1.8995f, -1.3941f, -0.6299f, 0.8995f, 1.9941f, -0.3701f, -0.3995f, 3.3691f, -0.7299f, -2.1005f, -1.1814f, -0.4701f, -3.3995f, 0.1936f, -0.7299f, -0.6005f, 3.5818f, -0.4701f, -1.8995f, 4.9568f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -123,6 +135,7 @@ TEST(AffineGridTest, test_3d_3) {
   test.AddInput<float>("theta", {2, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f, 0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f});
   test.AddInput<int64_t>("size", {5}, {2, 10, 2, 2, 3});
   test.AddOutput<float>("grid", {2, 2, 2, 3, 3}, {-0.5982f, 0.7410f, -4.1890f, -0.4250f, -0.1250f, -3.2724f, -0.2518f, -0.9910f, -2.3557f, -0.5982f, 2.2410f, 0.5741f, -0.4250f, 1.3750f, 1.4908f, -0.2518f, 0.5090f, 2.4075f, -0.7482f, -1.5090f, -1.8075f, -0.5750f, -2.3750f, -0.8908f, -0.4018f, -3.2410f, 0.0259f, -0.7482f, -0.0090f, 2.9557f, -0.5750f, -0.8750f, 3.8724f, -0.4018f, -1.7410f, 4.7890f, -0.5982f, 0.7410f, -4.1890f, -0.4250f, -0.1250f, -3.2724f, -0.2518f, -0.9910f, -2.3557f, -0.5982f, 2.2410f, 0.5741f, -0.4250f, 1.3750f, 1.4908f, -0.2518f, 0.5090f, 2.4075f, -0.7482f, -1.5090f, -1.8075f, -0.5750f, -2.3750f, -0.8908f, -0.4018f, -3.2410f, 0.0259f, -0.7482f, -0.0090f, 2.9557f, -0.5750f, -0.8750f, 3.8724f, -0.4018f, -1.7410f, 4.7890f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -132,6 +145,7 @@ TEST(AffineGridTest, test_3d_4) {
   test.AddInput<float>("theta", {1, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f});
   test.AddInput<int64_t>("size", {5}, {1, 1, 3, 2, 2});
   test.AddOutput<float>("grid", {1, 3, 2, 2, 3}, {-1.6226f, -2.2620f, 1.4189f, 1.1965f, -2.0245f, 1.0821f, -1.6226f, 1.6772f, 1.5925f, 1.1965f, 1.9147f, 1.2557f, -1.1095f, -2.5884f, 1.8816f, 1.7095f, -2.3508f, 1.5448f, -1.1095f, 1.3508f, 2.0552f, 1.7095f, 1.5884f, 1.7184f, -0.5965f, -2.9147f, 2.3443f, 2.2226f, -2.6772f, 2.0075f, -0.5965f, 1.0245f, 2.5179f, 2.2226f, 1.2620f, 2.1811f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -141,6 +155,7 @@ TEST(AffineGridTest, test_3d_5) {
   test.AddInput<float>("theta", {2, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f, 1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f});
   test.AddInput<int64_t>("size", {5}, {2, 10, 2, 2, 3});
   test.AddOutput<float>("grid", {2, 2, 2, 3, 3}, {-1.6226f, -2.2620f, 1.4189f, -0.2130f, -2.1433f, 1.2505f, 1.1965f, -2.0245f, 1.0821f, -1.6226f, 1.6772f, 1.5925f, -0.2130f, 1.7960f, 1.4241f, 1.1965f, 1.9147f, 1.2557f, -0.5965f, -2.9147f, 2.3443f, 0.8130f, -2.7960f, 2.1759f, 2.2226f, -2.6772f, 2.0075f, -0.5965f, 1.0245f, 2.5179f, 0.8130f, 1.1433f, 2.3495f, 2.2226f, 1.2620f, 2.1811f, -1.6226f, -2.2620f, 1.4189f, -0.2130f, -2.1433f, 1.2505f, 1.1965f, -2.0245f, 1.0821f, -1.6226f, 1.6772f, 1.5925f, -0.2130f, 1.7960f, 1.4241f, 1.1965f, 1.9147f, 1.2557f, -0.5965f, -2.9147f, 2.3443f, 0.8130f, -2.7960f, 2.1759f, 2.2226f, -2.6772f, 2.0075f, -0.5965f, 1.0245f, 2.5179f, 0.8130f, 1.1433f, 2.3495f, 2.2226f, 1.2620f, 2.1811f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -150,6 +165,7 @@ TEST(AffineGridTest, test_3d_6) {
   test.AddInput<float>("theta", {1, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f});
   test.AddInput<int64_t>("size", {5}, {1, 1, 3, 2, 2});
   test.AddOutput<float>("grid", {1, 3, 2, 2, 3}, {-0.6098f, 1.5490f, -8.2197f, -0.0902f, -1.0490f, -5.4697f, -0.6098f, 4.5490f, 1.3066f, -0.0902f, 1.9510f, 4.0566f, -0.7598f, -0.7010f, -5.8381f, -0.2402f, -3.2990f, -3.0881f, -0.7598f, 2.2990f, 3.6881f, -0.2402f, -0.2990f, 6.4381f, -0.9098f, -2.9510f, -3.4566f, -0.3902f, -5.5490f, -0.7066f, -0.9098f, 0.0490f, 6.0697f, -0.3902f, -2.5490f, 8.8197f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 
@@ -159,6 +175,7 @@ TEST(AffineGridTest, test_3d_7) {
   test.AddInput<float>("theta", {2, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f, 0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f});
   test.AddInput<int64_t>("size", {5}, {2, 10, 2, 2, 3});
   test.AddOutput<float>("grid", {2, 2, 2, 3, 3}, {-0.6098f, 1.5490f, -8.2197f, -0.3500f, 0.2500f, -6.8447f, -0.0902f, -1.0490f, -5.4697f, -0.6098f, 4.5490f, 1.3066f, -0.3500f, 3.2500f, 2.6816f, -0.0902f, 1.9510f, 4.0566f, -0.9098f, -2.9510f, -3.4566f, -0.6500f, -4.2500f, -2.0816f, -0.3902f, -5.5490f, -0.7066f, -0.9098f, 0.0490f, 6.0697f, -0.6500f, -1.2500f, 7.4447f, -0.3902f, -2.5490f, 8.8197f, -0.6098f, 1.5490f, -8.2197f, -0.3500f, 0.2500f, -6.8447f, -0.0902f, -1.0490f, -5.4697f, -0.6098f, 4.5490f, 1.3066f, -0.3500f, 3.2500f, 2.6816f, -0.0902f, 1.9510f, 4.0566f, -0.9098f, -2.9510f, -3.4566f, -0.6500f, -4.2500f, -2.0816f, -0.3902f, -5.5490f, -0.7066f, -0.9098f, 0.0490f, 6.0697f, -0.6500f, -1.2500f, 7.4447f, -0.3902f, -2.5490f, 8.8197f});
+  test.SetOutputTolerance(0.0001f, 0.0001f);
   test.Run();
 }
 }  // namespace test
diff --git a/onnxruntime/test/providers/cpu/tensor/mean_variance_normalization_test.cc b/onnxruntime/test/providers/cpu/tensor/mean_variance_normalization_test.cc
index b6720ae2a9a7d..be212515376fb 100644
--- a/onnxruntime/test/providers/cpu/tensor/mean_variance_normalization_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/mean_variance_normalization_test.cc
@@ -5,6 +5,7 @@
 
 #include "test/common/tensor_op_test_utils.h"
 #include "test/providers/provider_test_utils.h"
+#include "test/util/include/default_providers.h"
 
 namespace onnxruntime::test {
 
@@ -155,6 +156,10 @@ TEST(MeanVarianceNormalizationTest, AxesSubsets5D) {
     test.AddInput<float>("input", shape, X.data(), X.size());
     test.AddOutput<float>("output", shape, Y.data(), Y.size());
 
+    if (DefaultDmlExecutionProvider().get() != nullptr) {
+      test.SetOutputTolerance(0.001f, 0.001f);
+    }
+
     test.Run();
   };
 
diff --git a/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc b/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc
index a2ffbdcc0bdf1..452094b484d4a 100644
--- a/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc
@@ -9,6 +9,11 @@ using namespace std;
 namespace onnxruntime {
 namespace test {
 
+// Exclude TRT EP in tests due to Segmentation fault in A100
+static void run_excluding_trt(OpTester& test) {
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+}
+
 TEST(OneHotOpTest, DefaultAxis) {
   OpTester test("OneHot", 11);
   test.AddInput<int64_t>("indices", {2, 3}, {1, 9, 8, 2, 4, 6});
@@ -36,7 +41,7 @@ TEST(OneHotOpTest, DefaultAxis_float_float_float /*indices, output, depth*/) {
                          0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
                          0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
                          0., 0., 0., 0., 0., 0., 1., 0., 0., 0.});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(OneHotOpTest, DefaultAxis_int64_int32_float /*indices, output, depth*/) {
@@ -51,7 +56,7 @@ TEST(OneHotOpTest, DefaultAxis_int64_int32_float /*indices, output, depth*/) {
                            0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 1, 0, 0, 0});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(OneHotOpTest, DefaultAxis_int64_float_int64 /*indices, output, depth*/) {
@@ -81,7 +86,7 @@ TEST(OneHotOpTest, DefaultAxis_int32_float_float /*indices, output, depth*/) {
                          0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                          0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                          0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(OneHotOpTest, DefaultAxis_int32_float_int32 /*indices, output, depth*/) {
@@ -231,7 +236,7 @@ TEST(OneHotOpTest, DefaultAxis_float_float_float_NonZeroOffValue /*indices, outp
                          2., 2., 3., 2., 2., 2., 2., 2., 2., 2.,
                          2., 2., 2., 2., 3., 2., 2., 2., 2., 2.,
                          2., 2., 2., 2., 2., 2., 3., 2., 2., 2.});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(OneHotOpTest, DefaultAxis_int64_int32_float_NonZeroOffValue /*indices, output, depth*/) {
@@ -246,7 +251,7 @@ TEST(OneHotOpTest, DefaultAxis_int64_int32_float_NonZeroOffValue /*indices, outp
                            2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
                            2, 2, 2, 2, 3, 2, 2, 2, 2, 2,
                            2, 2, 2, 2, 2, 2, 3, 2, 2, 2});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(OneHotOpTest, DefaultAxis_int64_float_int64_NonZeroOffValue /*indices, output, depth*/) {
@@ -276,7 +281,7 @@ TEST(OneHotOpTest, DefaultAxis_int32_float_float_NonZeroOffValue /*indices, outp
                          2.0f, 2.0f, 3.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f,
                          2.0f, 2.0f, 2.0f, 2.0f, 3.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f,
                          2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 3.0f, 2.0f, 2.0f, 2.0f});
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(OneHotOpTest, DefaultAxis_int32_float_int32_NonZeroOffValue /*indices, output, depth*/) {
diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
index 062f25b989a70..f8467fb50ed2d 100644
--- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
@@ -8,6 +8,12 @@
 
 namespace onnxruntime {
 namespace test {
+
+// Exclude TRT EP in tests due to Segmentation fault in A100
+static void run_excluding_trt(OpTester& test) {
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+}
+
 TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_tf_crop_and_resize) {
   // TODO: Unskip when fixed #41968513
   if (DefaultDmlExecutionProvider().get() != nullptr) {
@@ -243,7 +249,9 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear) {
   std::vector<float> Y = {2.66666651f, 4.3333331f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider});  // QNN: result diff
+  // QNN: result diff
+  // TRT: Segmentation fault in A100
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider, kTensorrtExecutionProvider});
 }
 
 TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear) {
@@ -267,8 +275,9 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear) {
   test.AddOutput<float>("Y", {N, static_cast<int64_t>(H * scales[1]), static_cast<int64_t>(W * scales[2]), C}, Y);
   // CUDA: result mismatch due to not implementing NHWC support
   // ROCm: results mismatch
+  // TRT: Segmentation fault in A100
   test.Run(OpTester::ExpectResult::kExpectSuccess, "",
-           {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider});
+           {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider, kTensorrtExecutionProvider});
 }
 
 TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_uint8) {
@@ -315,7 +324,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_int8) {
   std::vector<int8_t> Y = {0, 0};
 
   test.AddOutput<int8_t>("Y", {N, static_cast<int64_t>(H * scales[1]), static_cast<int64_t>(W * scales[2]), C}, Y);
-  test.Run();
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
 // Since NNAPI(TFLite) only using the scale calculate using the input/output size
@@ -347,7 +356,7 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1) {
     std::vector<float> Y = {3.5f, 5.5f};
 
     test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-    test.Run();
+    run_excluding_trt(test);
   };
 
   run_test(false);
@@ -405,7 +414,7 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_align_corners) {
     std::vector<float> Y = {1.0f, 4.0f};
 
     test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-    test.Run();
+    run_excluding_trt(test);
   };
 
   run_test(false);
@@ -608,7 +617,7 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_asymmetric_scales) {
         7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 11.0f, 11.0f, 11.0f};
 
     test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-    test.Run();
+    run_excluding_trt(test);
   };
 
   run_test(false);
@@ -725,7 +734,7 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_align_corners) {
       4.0f, 4.5714290f, 5.142857f, 5.714286f, 6.285714f, 6.8571430f, 7.428571f, 8.0f};
 
   test.AddOutput<float>("Y", {static_cast<int64_t>(H * scales[0]), static_cast<int64_t>(W * scales[1])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_3DTrilinear_pytorch_half_pixel) {
@@ -819,7 +828,7 @@ TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest) {
                             7.0f, 11.0f};
 
     test.AddOutput<float>("Y", {N, C, H, W}, Y);
-    test.Run();
+    run_excluding_trt(test);
   };
 
   run_test(false);
@@ -845,7 +854,7 @@ TEST(ResizeOpTest, ResizeOpNearestDownSampleTest) {
   std::vector<float> Y = {1.0f, 3.0f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_Opset12) {
@@ -867,7 +876,7 @@ TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_Opset12) {
   std::vector<float> Y = {1.0f, 3.0f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_WithSizes) {
@@ -920,7 +929,7 @@ TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_tf_half_pixel) {
                           14.0f, 16.0f};
 
   test.AddOutput<float>("Y", {N, C, sizes[2], sizes[3]}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_tf_crop_and_resize_with_extrapolation) {
@@ -1000,7 +1009,7 @@ TEST(ResizeOpTest, ResizeOpNearestUpSampleTest) {
                           3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpNearestUpSampleTest_WithSizes_CeilMode) {
@@ -1093,7 +1102,7 @@ TEST(ResizeOpTest, ResizeOpNearestUpSample_Floor_Align_Corners) {
                           13.0f, 13.0f, 13.0f, 14.0f, 14.0f, 15.0f, 15.0f, 16.0f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpNearest_OneToOneMappingBetweenInputAndOutputDataDims) {
@@ -1197,7 +1206,7 @@ TEST(ResizeOpTest, ResizeOpNearestUpSample_Nearest2xOptimization_Scales) {
                             3.0f, 3.0f, 4.0f, 4.0f};
 
     test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-    test.Run();
+    run_excluding_trt(test);
   };
 
   run_test(false);
@@ -1262,7 +1271,7 @@ TEST(ResizeOpTest, ResizeOpCubicDownSampleTest) {
                           11.9165f, 13.2266f, 14.5278f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_exclude_outside) {
@@ -1292,7 +1301,7 @@ TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_exclude_outside) {
                           11.949f, 13.2503f, 14.5942f};
 
   test.AddOutput<float>("Y", {static_cast<int64_t>(H * scales[0]), static_cast<int64_t>(W * scales[1])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_coeff) {
@@ -1319,7 +1328,7 @@ TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_coeff) {
                           11.8701f, 13.168f, 14.4912f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_with_roi) {
@@ -1373,7 +1382,7 @@ TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_asymmetric) {
                           11.375f, 12.6719f, 13.9688f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpCubicUpSampleTest) {
@@ -1405,7 +1414,7 @@ TEST(ResizeOpTest, ResizeOpCubicUpSampleTest) {
                           13.375f, 13.7813f, 14.375f, 14.875f, 15.375f, 15.9688f, 16.375f, 16.4688f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpCubicUpSampleTest_MultiChannel) {
@@ -1486,7 +1495,7 @@ TEST(ResizeOpTest, ResizeOpCubicUpSampleTest_tf_half_pixel_for_nn) {
                           13.332f, 13.8086f, 14.4375f, 14.8438f, 15.4727f, 15.9492f, 16.2461f, 16.1758f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_Ver10) {
@@ -1512,7 +1521,9 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_Ver10) {
   std::vector<float> Y = {1.0f, 2.66666651f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider});  // QNN: result diff
+  // QNN: result diff
+  // TRT: segmentation fault in A100
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider, kTensorrtExecutionProvider});
 }
 
 TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_Ver10) {
@@ -1538,7 +1549,7 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_Ver10) {
   std::vector<float> Y = {1.0f, 2.66666651f};
 
   test.AddOutput<float>("Y", {static_cast<int64_t>(H * scales[0]), static_cast<int64_t>(W * scales[1])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_Ver10) {
@@ -1574,7 +1585,9 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_Ver10) {
       7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 11.0f, 11.0f, 11.0f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider});  // QNN: result diff
+  // QNN: result diff
+  // TRT: segmentation fault in A100
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider, kTensorrtExecutionProvider});
 }
 
 TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_Ver10) {
@@ -1602,7 +1615,7 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_Ver10) {
       4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 8.0f, 8.0f, 8.0f};
 
   test.AddOutput<float>("Y", {static_cast<int64_t>(H * scales[0]), static_cast<int64_t>(W * scales[1])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest_Ver10) {
@@ -1627,7 +1640,7 @@ TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest_Ver10) {
                           7.0f, 11.0f};
 
   test.AddOutput<float>("Y", {N, C, H, W}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_Ver10) {
@@ -1647,7 +1660,7 @@ TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_Ver10) {
   std::vector<float> Y = {1.0f, 3.0f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpNearestUpSampleTest_Ver10) {
@@ -1668,10 +1681,10 @@ TEST(ResizeOpTest, ResizeOpNearestUpSampleTest_Ver10) {
                           3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
-TEST(UpsampleOpTest, ResizeOpNearestNoScaleTest_Ver10) {
+TEST(ResizeOpTest, ResizeOpNearestNoScaleTest_Ver10) {
   OpTester test("Resize", 10);
   std::vector<float> scales{1.0f, 1.0f, 1.0f, 1.0f};
 
@@ -1686,7 +1699,7 @@ TEST(UpsampleOpTest, ResizeOpNearestNoScaleTest_Ver10) {
   std::vector<float> Y = {1.0f, 2.0f, 3.0f, 4.0f};
 
   test.AddOutput<float>("Y", {N, C, H, W}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOp_MissingRoiAndMissingScalesOptionalInputs) {
@@ -1737,7 +1750,7 @@ void ResizeOpTypeCheck_Ver_10() {
                       3, 3, 3, 4, 4, 4};
 
   test.AddOutput<T>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpTypeCheck_Ver_10) {
@@ -1768,7 +1781,7 @@ void ResizeOpTypeCheck_Ver_11_13_18(int opset_version) {
                       3, 3, 3, 4, 4, 4};
 
   test.AddOutput<T>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run();
+  run_excluding_trt(test);
 }
 
 TEST(ResizeOpTest, ResizeOpTypeCheck_Ver11) {
diff --git a/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc b/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc
index 188532cfa350a..825bdc8c8103d 100644
--- a/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc
@@ -939,7 +939,9 @@ TEST(UpsampleOpTest, UpsampleOpNearest2XTest_opset9) {
       7, 7, 9, 9};
 
   test.AddOutput<int32_t>("Y", {N, C, (int64_t)(H * scales[2]), (int64_t)(W * scales[3])}, Y);
-  test.Run();
+
+  // TRT: segmentation fault in A100
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
 TEST(UpsampleOpTest, NhwcUpsampleOpNearest2XTest_opset9) {
diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc
index c12a52c4356aa..6ad2d41edb562 100644
--- a/onnxruntime/test/util/default_providers.cc
+++ b/onnxruntime/test/util/default_providers.cc
@@ -8,7 +8,7 @@
 #ifdef USE_COREML
 #include "core/providers/coreml/coreml_provider_factory.h"
 #endif
-#if defined(ENABLE_CUDA_NHWC_OPS)
+#ifdef USE_CUDA
 #include <core/providers/cuda/cuda_provider_options.h>
 #endif
 #include "core/session/onnxruntime_cxx_api.h"
@@ -113,8 +113,9 @@ std::unique_ptr<IExecutionProvider> DefaultOpenVINOExecutionProvider() {
 
 std::unique_ptr<IExecutionProvider> DefaultCudaExecutionProvider() {
 #ifdef USE_CUDA
-  OrtCUDAProviderOptions provider_options{};
+  OrtCUDAProviderOptionsV2 provider_options{};
   provider_options.do_copy_in_default_stream = true;
+  provider_options.use_tf32 = false;
   if (auto factory = CudaProviderFactoryCreator::Create(&provider_options))
     return factory->CreateProvider();
 #endif
@@ -126,6 +127,7 @@ std::unique_ptr<IExecutionProvider> DefaultCudaNHWCExecutionProvider() {
 #if defined(USE_CUDA)
   OrtCUDAProviderOptionsV2 provider_options{};
   provider_options.do_copy_in_default_stream = true;
+  provider_options.use_tf32 = false;
   provider_options.prefer_nhwc = true;
   if (auto factory = CudaProviderFactoryCreator::Create(&provider_options))
     return factory->CreateProvider();
diff --git a/orttraining/orttraining/test/gradient/optimizer_ops_test.cc b/orttraining/orttraining/test/gradient/optimizer_ops_test.cc
index bfb59f1525e47..84de4b5ef7ff0 100644
--- a/orttraining/orttraining/test/gradient/optimizer_ops_test.cc
+++ b/orttraining/orttraining/test/gradient/optimizer_ops_test.cc
@@ -144,6 +144,8 @@ TEST(OptimizerTest, AdamBiasCorrection) {
   test.AddOutput<float>("Moment_2_Out", {3}, {1.7400e-04f, 8.9966e-04f, 1.5102e-03f});
   test.AddOutput<float>("W_Out", {3}, {-1.4634f, -0.6416f, -1.2121f});
 
+  test.SetOutputTolerance(0.0001f, 0.0001f);
+
   test.AddAttribute("do_bias_correction", static_cast<int64_t>(1));
   test.AddAttribute("weight_decay_mode", static_cast<int64_t>(0));
 
@@ -167,6 +169,8 @@ TEST(OptimizerTest, AdamWeightDecayMode0NoBiasCorrection) {
   test.AddOutput<float>("W_Out", {3}, {-3.6210f, -2.8075f, -3.3723f});
   test.AddOutput<float>("G_Out", {3}, {-3.1576f, -3.1658f, -3.1601f});
 
+  test.SetOutputTolerance(0.0001f, 0.0001f);
+
   test.AddAttribute("do_bias_correction", static_cast<int64_t>(0));
   test.AddAttribute("lambda", 0.01f);
   test.AddAttribute("weight_decay_mode", static_cast<int64_t>(0));
@@ -191,6 +195,8 @@ TEST(OptimizerTest, AdamWeightDecayMode0WithBiasCorrection) {
   test.AddOutput<float>("W_Out", {3}, {-1.4587f, -0.6452f, -1.2099f});
   test.AddOutput<float>("G_Out", {3}, {-0.9954f, -1.0036f, -0.9979f});
 
+  test.SetOutputTolerance(0.0001f, 0.0001f);
+
   test.AddAttribute("do_bias_correction", static_cast<int64_t>(1));
   test.AddAttribute("lambda", 0.01f);
   test.AddAttribute("weight_decay_mode", static_cast<int64_t>(0));
@@ -214,6 +220,8 @@ TEST(OptimizerTest, AdamWeightDecayMode1NoBiasCorrection) {
   test.AddOutput<float>("Moment_2_Out", {3}, {1.7400e-04f, 8.9966e-04f, 1.5102e-03f});
   test.AddOutput<float>("W_Out", {3}, {-3.5894f, -2.7758f, -3.3406f});
 
+  test.SetOutputTolerance(0.0001f, 0.0001f);
+
   test.AddAttribute("do_bias_correction", static_cast<int64_t>(0));
   test.AddAttribute("lambda", 0.01f);
   test.AddAttribute("weight_decay_mode", static_cast<int64_t>(1));
@@ -237,6 +245,8 @@ TEST(OptimizerTest, AdamWeightDecayMode1WithBiasCorrection) {
   test.AddOutput<float>("Moment_2_Out", {3}, {1.7400e-04f, 8.9966e-04f, 1.5102e-03f});
   test.AddOutput<float>("W_Out", {3}, {-1.4488f, -0.6352f, -1.1999f});
 
+  test.SetOutputTolerance(0.0001f, 0.0001f);
+
   test.AddAttribute("do_bias_correction", static_cast<int64_t>(1));
   test.AddAttribute("lambda", 0.01f);
   test.AddAttribute("weight_decay_mode", static_cast<int64_t>(1));
@@ -368,6 +378,11 @@ TEST(OptimizerTest, AdamOptimizerMixPrecision_FP16Weight_ClipNorm_Test) {
   test.AddOptionalOutputEdge<MLFloat16>();
   test.AddOutput<MLFloat16>("FP16_W_Out", {3}, w_new_half);
 
+  test.SetOutputAbsErr("Moment_1_Out", 0.005f);
+  test.SetOutputAbsErr("Moment_2_Out", 0.005f);
+  test.SetOutputAbsErr("W_Out", 0.001f);
+  test.SetOutputAbsErr("FP16_W_Out", 0.005f);
+
   test.AddAttribute("do_bias_correction", static_cast<int64_t>(0));
   test.AddAttribute("weight_decay_mode", static_cast<int64_t>(0));
   test.AddAttribute("max_norm_clip", 0.001f);
diff --git a/orttraining/orttraining/test/training_ops/cpu/nn/batchnorm_internal_test.cc b/orttraining/orttraining/test/training_ops/cpu/nn/batchnorm_internal_test.cc
index e9795a24681cb..3e0d35ab17c9b 100644
--- a/orttraining/orttraining/test/training_ops/cpu/nn/batchnorm_internal_test.cc
+++ b/orttraining/orttraining/test/training_ops/cpu/nn/batchnorm_internal_test.cc
@@ -37,6 +37,8 @@ TEST(BatchNormInternalTest, ForwardTrainingTest) {
   test.AddOutput<float>("saved_mean", channel_dims, {-0.306f, 0.114562f});
   test.AddOutput<float>("saved_inv_std", channel_dims, {1.2288f, 0.861317f});
 
+  test.SetOutputTolerance(0.0001f, 0.0001f);
+
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
   execution_providers.emplace_back(DefaultCpuExecutionProvider());