diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h
index e961bab399326..d46d9597a758f 100644
--- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h
+++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h
@@ -98,7 +98,7 @@ inline __device__ void compute_attn_1rowblock(const Params& params, const int bi
     for (int m = 0; m < size<1>(tOgO); ++m) {
       const int row = get<0>(tOcO(0, m, 0));
       if (row < binfo.actual_seqlen_q - m_block * kBlockM && get<1>(tOcO(0, m, 0)) == 0) {
-        gLSE(row) = INFINITY;
+        gLSE(row) = std::numeric_limits<float>::infinity();
       }
     }
     return;
@@ -499,7 +499,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params& params, cons
     for (int m = 0; m < size<1>(tOgOaccum); ++m) {
       const int row = get<0>(tOcO(0, m, 0));
       if (row < binfo.actual_seqlen_q - m_block * kBlockM && get<1>(tOcO(0, m, 0)) == 0) {
-        gLSEaccum(row) = Split ? -INFINITY : INFINITY;
+        gLSEaccum(row) = Split ? -std::numeric_limits<float>::infinity() : std::numeric_limits<float>::infinity();
       }
     }
     return;
@@ -1061,7 +1061,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
   for (int l = 0; l < kNLsePerThread; ++l) {
     const int row = l * kRowsPerLoadLSE + tidx / kBlockM;
     const int col = tidx % kBlockM;
-    ElementAccum lse = (row < params.num_splits && col < params.b * params.h * params.seqlen_q - bidx * kBlockM) ? gLSEaccum(row, col) : -INFINITY;
+    ElementAccum lse = (row < params.num_splits && col < params.b * params.h * params.seqlen_q - bidx * kBlockM) ? gLSEaccum(row, col) : -std::numeric_limits<float>::infinity();
     if (row < kMaxSplits) {
       sLSE[row][col] = lse;
     }
@@ -1082,7 +1082,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
   for (int l = 0; l < kNLsePerThread; ++l) {
     const int row = l * kRowsPerLoadTranspose + tidx % kRowsPerLoadTranspose;
     const int col = tidx / kRowsPerLoadTranspose;
-    lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE[row][col] : -INFINITY;
+    lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE[row][col] : -std::numeric_limits<float>::infinity();
     // if (bidx == 0 && tidx < 32) { printf("tidx = %d, row = %d, col = %d, lse = %f\n", tidx, row, col, lse_accum(l)); }
   }
 
@@ -1094,7 +1094,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
   }
   MaxOp<float> max_op;
   lse_max = Allreduce<kRowsPerLoadTranspose>::run(lse_max, max_op);
-  lse_max = lse_max == -INFINITY ? 0.0f : lse_max;  // In case all local LSEs are -inf
+  lse_max = lse_max == -std::numeric_limits<float>::infinity() ? 0.0f : lse_max;  // In case all local LSEs are -inf
   float lse_sum = expf(lse_accum(0) - lse_max);
 #pragma unroll
   for (int l = 1; l < kNLsePerThread; ++l) {
@@ -1104,7 +1104,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
   lse_sum = Allreduce<kRowsPerLoadTranspose>::run(lse_sum, sum_op);
   // For the case where all local lse == -INFINITY, we want to set lse_logsum to INFINITY. Otherwise
   // lse_logsum is log(0.0) = -INFINITY and we get NaN when we do lse_accum(l) - lse_logsum.
-  ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? INFINITY : logf(lse_sum) + lse_max;
+  ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? std::numeric_limits<float>::infinity() : logf(lse_sum) + lse_max;
   // if (bidx == 0 && tidx < 32) { printf("tidx = %d, lse = %f, lse_max = %f, lse_logsum = %f\n", tidx, lse_accum(0), lse_max, lse_logsum); }
   if (tidx % kRowsPerLoadTranspose == 0 && tidx / kRowsPerLoadTranspose < kBlockM) {
     gLSE(tidx / kRowsPerLoadTranspose) = lse_logsum;
diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h
index 0998155eba635..71434002f8df1 100644
--- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h
+++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <limits>
 #include <cute/tensor.hpp>
 
 namespace onnxruntime {
@@ -28,7 +29,7 @@ __forceinline__ __device__ void apply_mask(Tensor& tensor, const
         // Without the "make_coord" we get wrong results
 #pragma unroll
         for (int mi = 0; mi < size<0>(tensor); ++mi) {
-          tensor(mi, make_coord(j, nj)) = -INFINITY;
+          tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
         }
       }
     }
@@ -59,7 +60,7 @@ __forceinline__ __device__ void apply_mask_local(Tensor& tensor,
       for (int j = 0; j < size<1, 0>(tensor); ++j) {
         const int col_idx = col_idx_base + j;
         if (col_idx >= col_idx_limit_right || (HasWSLeft && col_idx < col_idx_limit_left)) {
-          tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+          tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
         }
       }
     }
@@ -96,7 +97,7 @@ __forceinline__ __device__ void apply_mask_causal_w_idx(
 #pragma unroll
     for (int ni = 0; ni < size<1, 1>(tensor); ++ni) {
       if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) {
-        tensor(mi, ni) = -INFINITY;
+        tensor(mi, ni) = -std::numeric_limits<float>::infinity();
       }
     }
     // if (cute::thread0()) {
@@ -151,7 +152,7 @@ struct Mask {
               }
               if constexpr (!Is_even_MN) {
                 if (col_idx >= max_seqlen_k) {
-                  tensor(mi, make_coord(j, nj)) = -INFINITY;
+                  tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
             }
@@ -181,18 +182,18 @@ struct Mask {
               }
               if constexpr (Causal_mask) {
                 if (col_idx >= col_idx_limit_right) {
-                  tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                  tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
               if constexpr (Is_local) {
                 if (col_idx >= col_idx_limit_right || col_idx < col_idx_limit_left) {
-                  tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                  tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
               if constexpr (!Causal_mask && !Is_local && !Is_even_MN) {
                 // Causal and Local already handles MN masking
                 if (col_idx >= max_seqlen_k) {
-                  tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                  tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
             }
diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h
index 7e0095cb39bd9..7fe506e01a9b9 100644
--- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h
+++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include <cmath>
+#include <limits>
 
 #include <cute/tensor.hpp>
 
@@ -71,7 +72,9 @@ __forceinline__ __device__ void scale_apply_exp2(Tensor& tenso
     // If max is -inf, then all elements must have been -inf (possibly due to masking).
     // We don't want (-inf - (-inf)) since that would give NaN.
     // If we don't have float around M_LOG2E the multiplication is done in fp64.
-    const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * (Scale_max ? scale : float(M_LOG2E));
+    const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity()
+                                 ? 0.f
+                                 : max(mi) * (Scale_max ? scale : float(M_LOG2E));
 #pragma unroll
     for (int ni = 0; ni < size<1>(tensor); ++ni) {
       // Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
@@ -99,7 +102,7 @@ __forceinline__ __device__ void max_scale_exp2_sum(Tensor& ten
     max(mi) = Allreduce<4>::run(max(mi), max_op);
     // If max is -inf, then all elements must have been -inf (possibly due to masking).
     // We don't want (-inf - (-inf)) since that would give NaN.
-    const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * scale;
+    const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity() ? 0.f : max(mi) * scale;
     sum(mi) = 0;
 #pragma unroll
     for (int ni = 0; ni < size<1>(tensor); ++ni) {
@@ -143,7 +146,7 @@ struct Softmax {
     for (int mi = 0; mi < size(row_max); ++mi) {
       float scores_max_cur = !Check_inf
                                 ? row_max(mi)
-                                : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi));
+                                : (row_max(mi) == -std::numeric_limits<float>::infinity() ? 0.0f : row_max(mi));
       float scores_scale = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2);
       row_sum(mi) *= scores_scale;
 #pragma unroll
@@ -169,7 +172,9 @@ struct Softmax {
     for (int mi = 0; mi < size<0>(acc_o_rowcol); ++mi) {
       float sum = smooth_softmax ? row_sum(mi) + expf(-row_max(mi) * softmax_scale) : row_sum(mi);
       float inv_sum = (sum == 0.f || sum != sum) ? 1.f : 1.f / sum;
-      lse(mi) = (sum == 0.f || sum != sum) ? (Split ? -INFINITY : INFINITY) : row_max(mi) * softmax_scale + __logf(sum);
+      lse(mi) = (sum == 0.f || sum != sum)
+                    ? (Split ? -std::numeric_limits<float>::infinity() : std::numeric_limits<float>::infinity())
+                    : row_max(mi) * softmax_scale + __logf(sum);
       float scale = inv_sum;
 #pragma unroll
       for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) {
diff --git a/onnxruntime/contrib_ops/cuda/bert/lean_attention/lean_fwd_kernel.h b/onnxruntime/contrib_ops/cuda/bert/lean_attention/lean_fwd_kernel.h
index 5be69ea0af55c..bd54b404420e5 100644
--- a/onnxruntime/contrib_ops/cuda/bert/lean_attention/lean_fwd_kernel.h
+++ b/onnxruntime/contrib_ops/cuda/bert/lean_attention/lean_fwd_kernel.h
@@ -825,7 +825,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
     const int row = l * kRowsPerLoadLSE + tidx / kBlockM;
     const int col = tidx % kBlockM;
     // We skip the first row = 0, as we already populated it in shared memory.
-    ElementAccum lse = (row > 0 && row < total_splits && col < params.b * params.h * (index_t)params.seqlen_q - row_offset_lseaccum) ? gLSEaccumRead(row, col) : -INFINITY;
+    ElementAccum lse = (row > 0 && row < total_splits && col < params.b * params.h * (index_t)params.seqlen_q - row_offset_lseaccum) ? gLSEaccumRead(row, col) : -std::numeric_limits<float>::infinity();
 
     if (row > 0 && row < kMaxSplits) {
       sLSE(row, col) = lse;
@@ -857,7 +857,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
   for (int l = 0; l < kNLsePerThread; ++l) {
     const int row = l * kRowsPerLoadTranspose + tidx % kRowsPerLoadTranspose;
     const int col = tidx / kRowsPerLoadTranspose;
-    lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE(row, col) : -INFINITY;
+    lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE(row, col) : -std::numeric_limits<float>::infinity();
 
 #if defined(DEBUG_LEAN_ATTENTION)
     if (threadIdx.x == 0 && blockIdx.z == tracing_block) {
@@ -874,7 +874,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
   }
   MaxOp<float> max_op;
   lse_max = Allreduce<kRowsPerLoadTranspose>::run(lse_max, max_op);
-  lse_max = lse_max == -INFINITY ? 0.0f : lse_max;  // In case all local LSEs are -inf
+  lse_max = lse_max == -std::numeric_limits<float>::infinity() ? 0.0f : lse_max;  // In case all local LSEs are -inf
   float lse_sum = expf(lse_accum(0) - lse_max);
 #pragma unroll
   for (int l = 1; l < kNLsePerThread; ++l) {
@@ -884,7 +884,9 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
   lse_sum = Allreduce<kRowsPerLoadTranspose>::run(lse_sum, sum_op);
   // For the case where all local lse == -INFINITY, we want to set lse_logsum to INFINITY. Otherwise
   // lse_logsum is log(0.0) = -INFINITY and we get NaN when we do lse_accum(l) - lse_logsum.
-  ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? INFINITY : logf(lse_sum) + lse_max;
+  ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum)
+                                ? std::numeric_limits<float>::infinity()
+                                : logf(lse_sum) + lse_max;
   // if (tidx % kRowsPerLoadTranspose == 0 && tidx / kRowsPerLoadTranspose < kBlockM) { gLSE(tidx / kRowsPerLoadTranspose) = lse_logsum; }
   // Store the scales exp(lse - lse_logsum) in shared memory.
 #pragma unroll
diff --git a/onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h b/onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h
index d63c80b012de6..2d33418d69667 100644
--- a/onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h
+++ b/onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h
@@ -3,7 +3,7 @@
  ******************************************************************************/
 
 #pragma once
-
+#include <limits>
 #include <cute/tensor.hpp>
 
 namespace onnxruntime {
@@ -28,7 +28,7 @@ __forceinline__ __device__ void apply_mask(Tensor& tensor, const
         // Without the "make_coord" we get wrong results
 #pragma unroll
         for (int mi = 0; mi < size<0>(tensor); ++mi) {
-          tensor(mi, make_coord(j, nj)) = -INFINITY;
+          tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
         }
       }
     }
@@ -59,7 +59,7 @@ __forceinline__ __device__ void apply_mask_local(Tensor& tensor,
      for (int j = 0; j < size<1, 0>(tensor); ++j) {
        const int col_idx = col_idx_base + j;
        if (col_idx >= col_idx_limit_right || (HasWSLeft && col_idx < col_idx_limit_left)) {
-         tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+         tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
        }
      }
    }
@@ -96,7 +96,7 @@ __forceinline__ __device__ void apply_mask_causal_w_idx(
 #pragma unroll
     for (int ni = 0; ni < size<1, 1>(tensor); ++ni) {
       if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) {
-        tensor(mi, ni) = -INFINITY;
+        tensor(mi, ni) = -std::numeric_limits<float>::infinity();
       }
     }
     // if (cute::thread0()) {
@@ -152,7 +152,7 @@ struct Mask {
               }
               if constexpr (!Is_even_MN) {
                 if (col_idx >= max_seqlen_k) {
-                  tensor(mi, make_coord(j, nj)) = -INFINITY;
+                  tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
             }
@@ -182,18 +182,18 @@ struct Mask {
               }
               if constexpr (Causal_mask) {
                 if (col_idx >= col_idx_limit_right) {
-                  tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                  tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
               if constexpr (Is_local) {
                 if (col_idx >= col_idx_limit_right || col_idx < col_idx_limit_left) {
-                  tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                  tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
               if constexpr (!Causal_mask && !Is_local && !Is_even_MN) {
                 // Causal and Local already handles MN masking
                 if (col_idx >= max_seqlen_k) {
-                  tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                  tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
             }
diff --git a/onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h b/onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h
index ad66389848e6e..0b6ffb3f1985a 100644
--- a/onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h
+++ b/onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h
@@ -3,7 +3,7 @@
  ******************************************************************************/
 
 #pragma once
-
+#include <limits>
 #include <cmath>
 
 #include <cute/tensor.hpp>
@@ -72,7 +72,9 @@ __forceinline__ __device__ void scale_apply_exp2(Tensor& tenso
     // If max is -inf, then all elements must have been -inf (possibly due to masking).
     // We don't want (-inf - (-inf)) since that would give NaN.
     // If we don't have float around M_LOG2E the multiplication is done in fp64.
-    const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * (Scale_max ? scale : float(M_LOG2E));
+    const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity()
+                                 ? 0.f
+                                 : max(mi) * (Scale_max ? scale : float(M_LOG2E));
 #pragma unroll
     for (int ni = 0; ni < size<1>(tensor); ++ni) {
       // Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
@@ -107,7 +109,7 @@ __forceinline__ __device__ void max_scale_exp2_sum(Tensor& ten
     max(mi) = Allreduce<4>::run(max(mi), max_op);
     // If max is -inf, then all elements must have been -inf (possibly due to masking).
     // We don't want (-inf - (-inf)) since that would give NaN.
-    const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * scale;
+    const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity() ? 0.f : max(mi) * scale;
     sum(mi) = 0;
 #pragma unroll
     for (int ni = 0; ni < size<1>(tensor); ++ni) {
@@ -151,7 +153,7 @@ struct Softmax {
     for (int mi = 0; mi < size(row_max); ++mi) {
       float scores_max_cur = !Check_inf
                                 ? row_max(mi)
-                                : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi));
+                                : (row_max(mi) == -std::numeric_limits<float>::infinity() ? 0.0f : row_max(mi));
       float scores_scale = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2);
       row_sum(mi) *= scores_scale;
 #pragma unroll
@@ -181,7 +183,9 @@ struct Softmax {
       // printf("sum: %f, inv_sum: %f\n", sum, inv_sum);
       // printf("mi %d row_max %f softmax_scale %f\n", mi, row_max(mi), softmax_scale);
       // }
-      lse(mi) = (sum == 0.f || sum != sum) ? (Split ? -INFINITY : INFINITY) : row_max(mi) * softmax_scale + __logf(sum);
+      lse(mi) = (sum == 0.f || sum != sum)
+                    ? (Split ? -std::numeric_limits<float>::infinity() : std::numeric_limits<float>::infinity())
+                    : row_max(mi) * softmax_scale + __logf(sum);
       float scale = !Is_dropout ? inv_sum : inv_sum * rp_dropout;
 #pragma unroll
       for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) {
diff --git a/onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu b/onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu
index 8a04ede231a27..ab809d12a89ad 100644
--- a/onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu
@@ -6,7 +6,7 @@ Licensed under the MIT License.
 /*
 Kernel implementation for blocking repeated n-grams.
 */
-
+#include <limits>
 #include "core/providers/cuda/cu_inc/common.cuh"
 #include "contrib_ops/cuda/bert/ngram_repeat_block_impl.h"
 
@@ -48,7 +48,7 @@ __global__ void banRepeatedTokens(const int64_t* __restrict__ tokens,
   }
   if (is_banned == true) {
     auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1];
-    lprobs[lprob_start + token_to_be_banned] = -INFINITY;
+    lprobs[lprob_start + token_to_be_banned] = -std::numeric_limits<float>::infinity();
   }
 }
 
diff --git a/onnxruntime/core/optimizer/attention_fusion_helper.h b/onnxruntime/core/optimizer/attention_fusion_helper.h
index 267a82b72670c..935114c40d1a7 100644
--- a/onnxruntime/core/optimizer/attention_fusion_helper.h
+++ b/onnxruntime/core/optimizer/attention_fusion_helper.h
@@ -1,5 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
+#include <limits>
 #include "onnx/defs/shape_inference.h"
 #include "onnx/defs/tensor_proto_util.h"
 #include "core/framework/tensorprotoutils.h"
@@ -767,7 +768,8 @@ bool MatchInputMaskSubgraph(const Graph& graph, const Node& layer_norm, const No
   }
 
   // check where has X=-Infinity
-  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(where.InputDefs()[1]), -INFINITY, true)) {
+  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(where.InputDefs()[1]),
+                                                       -std::numeric_limits<float>::infinity(), true)) {
     DEBUG_LOG("where const not matched.");
     return false;
   }
diff --git a/onnxruntime/core/providers/xnnpack/detail/utils.cc b/onnxruntime/core/providers/xnnpack/detail/utils.cc
index 4eef14dddecd3..2adf8339b4b66 100644
--- a/onnxruntime/core/providers/xnnpack/detail/utils.cc
+++ b/onnxruntime/core/providers/xnnpack/detail/utils.cc
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include <limits>
 
 #include "core/common/common.h"
 #include "core/common/safeint.h"
@@ -239,8 +240,8 @@ std::unique_ptr FuseActivation(const NodeUnit& node_un
   def.attributes = node_unit.GetNode().GetAttributes();
 
   // use infinity as the default as that's what xnnpack uses if min/max are not set
-  float min = -INFINITY;
-  float max = INFINITY;
+  float min = -std::numeric_limits<float>::infinity();
+  float max = std::numeric_limits<float>::infinity();
 
   const auto& activation_type = activation.OpType();
   if (activation_type == "Clip") {
diff --git a/onnxruntime/core/providers/xnnpack/math/gemm.cc b/onnxruntime/core/providers/xnnpack/math/gemm.cc
index 35a06cb7eb89f..a3ff3b585ae45 100644
--- a/onnxruntime/core/providers/xnnpack/math/gemm.cc
+++ b/onnxruntime/core/providers/xnnpack/math/gemm.cc
@@ -2,6 +2,9 @@
 // Licensed under the MIT License.
 
 #include "gemm.h"
+
+#include <limits>
+
 #include "core/framework/transpose_helper.h"
 #include "core/providers/utils.h"
 #include "core/providers/xnnpack/xnnpack_init.h"
@@ -140,8 +143,8 @@ Status Gemm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr,
   auto weights_cache = GetWeightsCache();
   xnn_status status = xnn_status::xnn_status_uninitialized;
   struct xnn_operator* p = nullptr;
-  float foutput_min = clip_min_max_ ? clip_min_max_->first : -INFINITY;
-  float foutput_max = clip_min_max_ ? clip_min_max_->second : INFINITY;
+  float foutput_min = clip_min_max_ ? clip_min_max_->first : -std::numeric_limits<float>::infinity();
+  float foutput_max = clip_min_max_ ? clip_min_max_->second : std::numeric_limits<float>::infinity();
   if (op_compute_type_ == OpComputeType::op_compute_type_fp32) {
     const float* bias_data = nullptr;
     if (C_matrix_exists_) {
diff --git a/onnxruntime/core/providers/xnnpack/math/matmul.cc b/onnxruntime/core/providers/xnnpack/math/matmul.cc
index 44a6fb4ee835a..f574238195ffd 100644
--- a/onnxruntime/core/providers/xnnpack/math/matmul.cc
+++ b/onnxruntime/core/providers/xnnpack/math/matmul.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "matmul.h"
+#include <limits>
 
 #include "core/providers/cpu/math/matmul_helper.h"
 #include "core/providers/xnnpack/xnnpack_init.h"
@@ -109,8 +110,8 @@ Status MatMul::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
   xnn_weights_cache_t weight_cache = nullptr;
 #endif
 
-  float foutput_min = -INFINITY;
-  float foutput_max = INFINITY;
+  float foutput_min = -std::numeric_limits<float>::infinity();
+  float foutput_max = std::numeric_limits<float>::infinity();
   if (op_type_ == OpComputeType::op_compute_type_fp32) {
     status = xnn_create_fully_connected_nc_f32(
         shape_broadcast[0],  // size_t input_channels,
diff --git a/onnxruntime/core/providers/xnnpack/nn/average_pool.cc b/onnxruntime/core/providers/xnnpack/nn/average_pool.cc
index 1c8ed556e90d7..1fc941d9f52f6 100644
--- a/onnxruntime/core/providers/xnnpack/nn/average_pool.cc
+++ b/onnxruntime/core/providers/xnnpack/nn/average_pool.cc
@@ -33,8 +33,8 @@ Status CreateXnnpackKernel(const PoolAttributes& pool_attrs,
   if (pool_attrs.auto_pad == AutoPadType::SAME_UPPER) {
     flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING;
   }
-  float foutput_min = clip_min_max ? clip_min_max->first : -INFINITY;
-  float foutput_max = clip_min_max ? clip_min_max->second : INFINITY;
+  float foutput_min = clip_min_max ? clip_min_max->first : -std::numeric_limits<float>::infinity();
+  float foutput_max = clip_min_max ? clip_min_max->second : std::numeric_limits<float>::infinity();
   xnn_status status = xnn_status_unsupported_parameter;
   if (avgpool_type == OpComputeType::op_compute_type_fp32) {
     status = xnn_create_average_pooling2d_nhwc_f32(input_padding_top, input_padding_right,
diff --git a/onnxruntime/core/providers/xnnpack/nn/conv_base.cc b/onnxruntime/core/providers/xnnpack/nn/conv_base.cc
index e0723c0e7690e..458e6000c8d70 100644
--- a/onnxruntime/core/providers/xnnpack/nn/conv_base.cc
+++ b/onnxruntime/core/providers/xnnpack/nn/conv_base.cc
@@ -54,8 +54,8 @@ Status CreateXnnpackKernel(const ConvAttributes& conv_attrs,
   xnn_status status = xnn_status::xnn_status_uninitialized;
   p = nullptr;
 
-  float foutput_min = clip_min_max ? clip_min_max->first : -INFINITY;
-  float foutput_max = clip_min_max ? clip_min_max->second : INFINITY;
+  float foutput_min = clip_min_max ? clip_min_max->first : -std::numeric_limits<float>::infinity();
+  float foutput_max = clip_min_max ? clip_min_max->second : std::numeric_limits<float>::infinity();
   // with the following IC and OC number, we can cover depthwise and regular conv at the same time
   // the equation 'IC (group_input_channels) == C ' set up when group_count==1 (regular convolution)
   // and OC (group_output_channels) follows the same rule.
diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
index 6742e51e55082..c828ae9400174 100644
--- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
+++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
@@ -3,6 +3,8 @@
 
 #include "max_pool.h"
 
+#include <limits>
+
 #include "core/graph/graph.h"
 #include "core/providers/utils.h"
 #include "core/providers/xnnpack/xnnpack_init.h"
@@ -168,8 +170,8 @@ MaxPool::MaxPool(const OpKernelInfo& info)
   auto input_dtype = X_arg.TypeAsProto()->tensor_type().elem_type();
   xnn_status status = xnn_status_invalid_state;
   struct xnn_operator* p = nullptr;
-  float foutput_min = clip_min_max_ ? clip_min_max_->first : -INFINITY;
-  float foutput_max = clip_min_max_ ? clip_min_max_->second : INFINITY;
+  float foutput_min = clip_min_max_ ? clip_min_max_->first : -std::numeric_limits<float>::infinity();
+  float foutput_max = clip_min_max_ ? clip_min_max_->second : std::numeric_limits<float>::infinity();
   if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
     maxpool_type_ = OpComputeType::op_compute_type_fp32;
     status = xnn_create_max_pooling2d_nhwc_f32(input_padding_top, input_padding_right,