diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h
index e961bab399326..d46d9597a758f 100644
--- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h
+++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h
@@ -98,7 +98,7 @@ inline __device__ void compute_attn_1rowblock(const Params& params, const int bi
     for (int m = 0; m < size<1>(tOgO); ++m) {
       const int row = get<0>(tOcO(0, m, 0));
       if (row < binfo.actual_seqlen_q - m_block * kBlockM && get<1>(tOcO(0, m, 0)) == 0) {
-        gLSE(row) = INFINITY;
+        gLSE(row) = std::numeric_limits<float>::infinity();
       }
     }
     return;
@@ -499,7 +499,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params& params, cons
     for (int m = 0; m < size<1>(tOgOaccum); ++m) {
       const int row = get<0>(tOcO(0, m, 0));
       if (row < binfo.actual_seqlen_q - m_block * kBlockM && get<1>(tOcO(0, m, 0)) == 0) {
-        gLSEaccum(row) = Split ? -INFINITY : INFINITY;
+        gLSEaccum(row) = Split ? -std::numeric_limits<float>::infinity() : std::numeric_limits<float>::infinity();
       }
     }
     return;
@@ -1061,7 +1061,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
   for (int l = 0; l < kNLsePerThread; ++l) {
     const int row = l * kRowsPerLoadLSE + tidx / kBlockM;
     const int col = tidx % kBlockM;
-    ElementAccum lse = (row < params.num_splits && col < params.b * params.h * params.seqlen_q - bidx * kBlockM) ? gLSEaccum(row, col) : -INFINITY;
+    ElementAccum lse = (row < params.num_splits && col < params.b * params.h * params.seqlen_q - bidx * kBlockM) ? gLSEaccum(row, col) : -std::numeric_limits<float>::infinity();
     if (row < kMaxSplits) {
       sLSE[row][col] = lse;
     }
@@ -1082,7 +1082,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
   for (int l = 0; l < kNLsePerThread; ++l) {
     const int row = l * kRowsPerLoadTranspose + tidx % kRowsPerLoadTranspose;
     const int col = tidx / kRowsPerLoadTranspose;
-    lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE[row][col] : -INFINITY;
+    lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE[row][col] : -std::numeric_limits<float>::infinity();
     // if (bidx == 0 && tidx < 32) { printf("tidx = %d, row = %d, col = %d, lse = %f\n", tidx, row, col, lse_accum(l)); }
   }
 
@@ -1094,7 +1094,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
   }
   MaxOp<float> max_op;
   lse_max = Allreduce<kRowsPerLoadTranspose>::run(lse_max, max_op);
-  lse_max = lse_max == -INFINITY ? 0.0f : lse_max;  // In case all local LSEs are -inf
+  lse_max = lse_max == -std::numeric_limits<float>::infinity() ? 0.0f : lse_max;  // In case all local LSEs are -inf
   float lse_sum = expf(lse_accum(0) - lse_max);
 #pragma unroll
   for (int l = 1; l < kNLsePerThread; ++l) {
@@ -1104,7 +1104,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
   lse_sum = Allreduce<kRowsPerLoadTranspose>::run(lse_sum, sum_op);
   // For the case where all local lse == -INFINITY, we want to set lse_logsum to INFINITY. Otherwise
   // lse_logsum is log(0.0) = -INFINITY and we get NaN when we do lse_accum(l) - lse_logsum.
-  ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? INFINITY : logf(lse_sum) + lse_max;
+  ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? std::numeric_limits<float>::infinity() : logf(lse_sum) + lse_max;
   // if (bidx == 0 && tidx < 32) { printf("tidx = %d, lse = %f, lse_max = %f, lse_logsum = %f\n", tidx, lse_accum(0), lse_max, lse_logsum); }
   if (tidx % kRowsPerLoadTranspose == 0 && tidx / kRowsPerLoadTranspose < kBlockM) {
     gLSE(tidx / kRowsPerLoadTranspose) = lse_logsum;
diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h
index 0998155eba635..71434002f8df1 100644
--- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h
+++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <limits>
 #include <cute/tensor.hpp>
 
 namespace onnxruntime {
@@ -28,7 +29,7 @@ __forceinline__ __device__ void apply_mask(Tensor& tensor, const
         // Without the "make_coord" we get wrong results
 #pragma unroll
         for (int mi = 0; mi < size<0>(tensor); ++mi) {
-          tensor(mi, make_coord(j, nj)) = -INFINITY;
+          tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
         }
       }
     }
@@ -59,7 +60,7 @@ __forceinline__ __device__ void apply_mask_local(Tensor& tensor,
       for (int j = 0; j < size<1, 0>(tensor); ++j) {
         const int col_idx = col_idx_base + j;
         if (col_idx >= col_idx_limit_right || (HasWSLeft && col_idx < col_idx_limit_left)) {
-          tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+          tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
         }
       }
     }
@@ -96,7 +97,7 @@ __forceinline__ __device__ void apply_mask_causal_w_idx(
 #pragma unroll
     for (int ni = 0; ni < size<1, 1>(tensor); ++ni) {
       if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) {
-        tensor(mi, ni) = -INFINITY;
+        tensor(mi, ni) = -std::numeric_limits<float>::infinity();
       }
     }
     // if (cute::thread0()) {
@@ -151,7 +152,7 @@ struct Mask {
               }
               if constexpr (!Is_even_MN) {
                 if (col_idx >= max_seqlen_k) {
-                  tensor(mi, make_coord(j, nj)) = -INFINITY;
+                  tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
             }
@@ -181,18 +182,18 @@ struct Mask {
               }
               if constexpr (Causal_mask) {
                 if (col_idx >= col_idx_limit_right) {
-                  tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                  tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
               if constexpr (Is_local) {
                 if (col_idx >= col_idx_limit_right || col_idx < col_idx_limit_left) {
-                  tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                  tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
               if constexpr (!Causal_mask && !Is_local && !Is_even_MN) {
                 // Causal and Local already handles MN masking
                 if (col_idx >= max_seqlen_k) {
-                  tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                  tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
             }
diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h
index 7e0095cb39bd9..7fe506e01a9b9 100644
--- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h
+++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include <cmath>
+#include <limits>
 
 #include <cute/tensor.hpp>
 
@@ -71,7 +72,9 @@ __forceinline__ __device__ void scale_apply_exp2(Tensor& tenso
     // If max is -inf, then all elements must have been -inf (possibly due to masking).
     // We don't want (-inf - (-inf)) since that would give NaN.
     // If we don't have float around M_LOG2E the multiplication is done in fp64.
-    const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * (Scale_max ? scale : float(M_LOG2E));
+    const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity()
+                                 ? 0.f
+                                 : max(mi) * (Scale_max ? scale : float(M_LOG2E));
 #pragma unroll
     for (int ni = 0; ni < size<1>(tensor); ++ni) {
       // Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
@@ -99,7 +102,7 @@ __forceinline__ __device__ void max_scale_exp2_sum(Tensor& ten
     max(mi) = Allreduce<4>::run(max(mi), max_op);
     // If max is -inf, then all elements must have been -inf (possibly due to masking).
     // We don't want (-inf - (-inf)) since that would give NaN.
-    const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * scale;
+    const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity() ? 0.f : max(mi) * scale;
     sum(mi) = 0;
 #pragma unroll
     for (int ni = 0; ni < size<1>(tensor); ++ni) {
@@ -143,7 +146,7 @@ struct Softmax {
     for (int mi = 0; mi < size(row_max); ++mi) {
       float scores_max_cur = !Check_inf
                                 ? row_max(mi)
-                                : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi));
+                                : (row_max(mi) == -std::numeric_limits<float>::infinity() ? 0.0f : row_max(mi));
       float scores_scale = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2);
       row_sum(mi) *= scores_scale;
 #pragma unroll
@@ -169,7 +172,9 @@ struct Softmax {
     for (int mi = 0; mi < size<0>(acc_o_rowcol); ++mi) {
       float sum = smooth_softmax ? row_sum(mi) + expf(-row_max(mi) * softmax_scale) : row_sum(mi);
       float inv_sum = (sum == 0.f || sum != sum) ? 1.f : 1.f / sum;
-      lse(mi) = (sum == 0.f || sum != sum) ? (Split ? -INFINITY : INFINITY) : row_max(mi) * softmax_scale + __logf(sum);
+      lse(mi) = (sum == 0.f || sum != sum)
+                    ? (Split ? -std::numeric_limits<float>::infinity() : std::numeric_limits<float>::infinity())
+                    : row_max(mi) * softmax_scale + __logf(sum);
       float scale = inv_sum;
 #pragma unroll
       for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) {
diff --git a/onnxruntime/contrib_ops/cuda/bert/lean_attention/lean_fwd_kernel.h b/onnxruntime/contrib_ops/cuda/bert/lean_attention/lean_fwd_kernel.h
index 5be69ea0af55c..bd54b404420e5 100644
--- a/onnxruntime/contrib_ops/cuda/bert/lean_attention/lean_fwd_kernel.h
+++ b/onnxruntime/contrib_ops/cuda/bert/lean_attention/lean_fwd_kernel.h
@@ -825,7 +825,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
     const int row = l * kRowsPerLoadLSE + tidx / kBlockM;
     const int col = tidx % kBlockM;
     // We skip the first row = 0, as we already populated it in shared memory.
-    ElementAccum lse = (row > 0 && row < total_splits && col < params.b * params.h * (index_t)params.seqlen_q - row_offset_lseaccum) ? gLSEaccumRead(row, col) : -INFINITY;
+    ElementAccum lse = (row > 0 && row < total_splits && col < params.b * params.h * (index_t)params.seqlen_q - row_offset_lseaccum) ? gLSEaccumRead(row, col) : -std::numeric_limits<float>::infinity();
 
     if (row > 0 && row < kMaxSplits) {
       sLSE(row, col) = lse;
@@ -857,7 +857,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
   for (int l = 0; l < kNLsePerThread; ++l) {
     const int row = l * kRowsPerLoadTranspose + tidx % kRowsPerLoadTranspose;
     const int col = tidx / kRowsPerLoadTranspose;
-    lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE(row, col) : -INFINITY;
+    lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE(row, col) : -std::numeric_limits<float>::infinity();
 
 #if defined(DEBUG_LEAN_ATTENTION)
     if (threadIdx.x == 0 && blockIdx.z == tracing_block) {
@@ -874,7 +874,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
   }
   MaxOp<float> max_op;
   lse_max = Allreduce<kRowsPerLoadTranspose>::run(lse_max, max_op);
-  lse_max = lse_max == -INFINITY ? 0.0f : lse_max;  // In case all local LSEs are -inf
+  lse_max = lse_max == -std::numeric_limits<float>::infinity() ? 0.0f : lse_max;  // In case all local LSEs are -inf
   float lse_sum = expf(lse_accum(0) - lse_max);
 #pragma unroll
   for (int l = 1; l < kNLsePerThread; ++l) {
@@ -884,7 +884,9 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
   lse_sum = Allreduce<kRowsPerLoadTranspose>::run(lse_sum, sum_op);
   // For the case where all local lse == -INFINITY, we want to set lse_logsum to INFINITY. Otherwise
   // lse_logsum is log(0.0) = -INFINITY and we get NaN when we do lse_accum(l) - lse_logsum.
-  ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? INFINITY : logf(lse_sum) + lse_max;
+  ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum)
+                                ? std::numeric_limits<float>::infinity()
+                                : logf(lse_sum) + lse_max;
   // if (tidx % kRowsPerLoadTranspose == 0 && tidx / kRowsPerLoadTranspose < kBlockM) { gLSE(tidx / kRowsPerLoadTranspose) = lse_logsum; }
   // Store the scales exp(lse - lse_logsum) in shared memory.
 #pragma unroll
diff --git a/onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h b/onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h
index d63c80b012de6..2d33418d69667 100644
--- a/onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h
+++ b/onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h
@@ -3,7 +3,7 @@
  ******************************************************************************/
 
 #pragma once
-
+#include <limits>
 #include <cute/tensor.hpp>
 
 namespace onnxruntime {
@@ -28,7 +28,7 @@ __forceinline__ __device__ void apply_mask(Tensor& tensor, const
         // Without the "make_coord" we get wrong results
 #pragma unroll
         for (int mi = 0; mi < size<0>(tensor); ++mi) {
-          tensor(mi, make_coord(j, nj)) = -INFINITY;
+          tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
         }
       }
     }
@@ -59,7 +59,7 @@ __forceinline__ __device__ void apply_mask_local(Tensor& tensor,
      for (int j = 0; j < size<1, 0>(tensor); ++j) {
        const int col_idx = col_idx_base + j;
        if (col_idx >= col_idx_limit_right || (HasWSLeft && col_idx < col_idx_limit_left)) {
-         tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+         tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
        }
      }
    }
@@ -96,7 +96,7 @@ __forceinline__ __device__ void apply_mask_causal_w_idx(
 #pragma unroll
     for (int ni = 0; ni < size<1, 1>(tensor); ++ni) {
       if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) {
-        tensor(mi, ni) = -INFINITY;
+        tensor(mi, ni) = -std::numeric_limits<float>::infinity();
       }
     }
     // if (cute::thread0()) {
@@ -152,7 +152,7 @@ struct Mask {
               }
               if constexpr (!Is_even_MN) {
                 if (col_idx >= max_seqlen_k) {
-                  tensor(mi, make_coord(j, nj)) = -INFINITY;
+                  tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
             }
@@ -182,18 +182,18 @@ struct Mask {
               }
               if constexpr (Causal_mask) {
                 if (col_idx >= col_idx_limit_right) {
-                  tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                  tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
               if constexpr (Is_local) {
                 if (col_idx >= col_idx_limit_right || col_idx < col_idx_limit_left) {
-                  tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                  tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
               if constexpr (!Causal_mask && !Is_local && !Is_even_MN) {
                 // Causal and Local already handles MN masking
                 if (col_idx >= max_seqlen_k) {
-                  tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
+                  tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
                 }
               }
             }
diff --git a/onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h b/onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h
index ad66389848e6e..0b6ffb3f1985a 100644
--- a/onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h
+++ b/onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h
@@ -3,7 +3,7 @@
  ******************************************************************************/
 
 #pragma once
-
+#include <limits>
 #include <cmath>
 
 #include <cute/tensor.hpp>
@@ -72,7 +72,9 @@ __forceinline__ __device__ void scale_apply_exp2(Tensor& tenso
     // If max is -inf, then all elements must have been -inf (possibly due to masking).
     // We don't want (-inf - (-inf)) since that would give NaN.
     // If we don't have float around M_LOG2E the multiplication is done in fp64.
-    const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * (Scale_max ? scale : float(M_LOG2E));
+    const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity()
+                                 ? 0.f
+                                 : max(mi) * (Scale_max ? scale : float(M_LOG2E));
 #pragma unroll
     for (int ni = 0; ni < size<1>(tensor); ++ni) {
       // Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
@@ -107,7 +109,7 @@ __forceinline__ __device__ void max_scale_exp2_sum(Tensor& ten
     max(mi) = Allreduce<4>::run(max(mi), max_op);
     // If max is -inf, then all elements must have been -inf (possibly due to masking).
     // We don't want (-inf - (-inf)) since that would give NaN.
-    const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * scale;
+    const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity() ? 0.f : max(mi) * scale;
     sum(mi) = 0;
 #pragma unroll
     for (int ni = 0; ni < size<1>(tensor); ++ni) {
@@ -151,7 +153,7 @@ struct Softmax {
     for (int mi = 0; mi < size(row_max); ++mi) {
       float scores_max_cur = !Check_inf
                                 ? row_max(mi)
-                                : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi));
+                                : (row_max(mi) == -std::numeric_limits<float>::infinity() ? 0.0f : row_max(mi));
       float scores_scale = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2);
       row_sum(mi) *= scores_scale;
 #pragma unroll
@@ -181,7 +183,9 @@ struct Softmax {
       // printf("sum: %f, inv_sum: %f\n", sum, inv_sum);
       // printf("mi %d row_max %f softmax_scale %f\n", mi, row_max(mi), softmax_scale);
       // }
-      lse(mi) = (sum == 0.f || sum != sum) ? (Split ? -INFINITY : INFINITY) : row_max(mi) * softmax_scale + __logf(sum);
+      lse(mi) = (sum == 0.f || sum != sum)
+                    ? (Split ? -std::numeric_limits<float>::infinity() : std::numeric_limits<float>::infinity())
+                    : row_max(mi) * softmax_scale + __logf(sum);
       float scale = !Is_dropout ? inv_sum : inv_sum * rp_dropout;
 #pragma unroll
       for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) {
diff --git a/onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu b/onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu
index 8a04ede231a27..ab809d12a89ad 100644
--- a/onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu
@@ -6,7 +6,7 @@ Licensed under the MIT License.
 /*
 Kernel implementation for blocking repeated n-grams.
 */
-
+#include <limits>
 #include "core/providers/cuda/cu_inc/common.cuh"
 #include "contrib_ops/cuda/bert/ngram_repeat_block_impl.h"
 
@@ -48,7 +48,7 @@ __global__ void banRepeatedTokens(const int64_t* __restrict__ tokens,
   }
   if (is_banned == true) {
     auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1];
-    lprobs[lprob_start + token_to_be_banned] = -INFINITY;
+    lprobs[lprob_start + token_to_be_banned] = -std::numeric_limits<float>::infinity();
   }
 }
 
diff --git a/onnxruntime/core/optimizer/attention_fusion_helper.h b/onnxruntime/core/optimizer/attention_fusion_helper.h
index 267a82b72670c..935114c40d1a7 100644
--- a/onnxruntime/core/optimizer/attention_fusion_helper.h
+++ b/onnxruntime/core/optimizer/attention_fusion_helper.h
@@ -1,5 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
+#include <limits>
 #include "onnx/defs/shape_inference.h"
 #include "onnx/defs/tensor_proto_util.h"
 #include "core/framework/tensorprotoutils.h"
@@ -767,7 +768,8 @@ bool MatchInputMaskSubgraph(const Graph& graph, const Node& layer_norm, const No
   }
 
   // check where has X=-Infinity
-  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(where.InputDefs()[1]), -INFINITY, true)) {
+  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(where.InputDefs()[1]),
+                                                       -std::numeric_limits<float>::infinity(), true)) {
     DEBUG_LOG("where const not matched.");
     return false;
   }
diff --git a/onnxruntime/core/providers/xnnpack/detail/utils.cc b/onnxruntime/core/providers/xnnpack/detail/utils.cc
index 4eef14dddecd3..2adf8339b4b66 100644
--- a/onnxruntime/core/providers/xnnpack/detail/utils.cc
+++ b/onnxruntime/core/providers/xnnpack/detail/utils.cc
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include <limits>
 
 #include "core/common/common.h"
 #include "core/common/safeint.h"
@@ -239,8 +240,8 @@ std::unique_ptr FuseActivation(const NodeUnit& node_un
   def.attributes = node_unit.GetNode().GetAttributes();
 
   // use infinity as the default as that's what xnnpack uses if min/max are not set
-  float min = -INFINITY;
-  float max = INFINITY;
+  float min = -std::numeric_limits<float>::infinity();
+  float max = std::numeric_limits<float>::infinity();
 
   const auto& activation_type = activation.OpType();
   if (activation_type == "Clip") {
diff --git a/onnxruntime/core/providers/xnnpack/math/gemm.cc b/onnxruntime/core/providers/xnnpack/math/gemm.cc
index 35a06cb7eb89f..a3ff3b585ae45 100644
--- a/onnxruntime/core/providers/xnnpack/math/gemm.cc
+++ b/onnxruntime/core/providers/xnnpack/math/gemm.cc
@@ -2,6 +2,9 @@
 // Licensed under the MIT License.
 
 #include "gemm.h"
+
+#include <limits>
+
 #include "core/framework/transpose_helper.h"
 #include "core/providers/utils.h"
 #include "core/providers/xnnpack/xnnpack_init.h"
@@ -140,8 +143,8 @@ Status Gemm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr,
   auto weights_cache = GetWeightsCache();
   xnn_status status = xnn_status::xnn_status_uninitialized;
   struct xnn_operator* p = nullptr;
-  float foutput_min = clip_min_max_ ? clip_min_max_->first : -INFINITY;
-  float foutput_max = clip_min_max_ ? clip_min_max_->second : INFINITY;
+  float foutput_min = clip_min_max_ ? clip_min_max_->first : -std::numeric_limits<float>::infinity();
+  float foutput_max = clip_min_max_ ? clip_min_max_->second : std::numeric_limits<float>::infinity();
   if (op_compute_type_ == OpComputeType::op_compute_type_fp32) {
     const float* bias_data = nullptr;
     if (C_matrix_exists_) {
diff --git a/onnxruntime/core/providers/xnnpack/math/matmul.cc b/onnxruntime/core/providers/xnnpack/math/matmul.cc
index 44a6fb4ee835a..f574238195ffd 100644
--- a/onnxruntime/core/providers/xnnpack/math/matmul.cc
+++ b/onnxruntime/core/providers/xnnpack/math/matmul.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "matmul.h"
+#include <limits>
 
 #include "core/providers/cpu/math/matmul_helper.h"
 #include "core/providers/xnnpack/xnnpack_init.h"
@@ -109,8 +110,8 @@ Status MatMul::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
   xnn_weights_cache_t weight_cache = nullptr;
 #endif
 
-  float foutput_min = -INFINITY;
-  float foutput_max = INFINITY;
+  float foutput_min = -std::numeric_limits<float>::infinity();
+  float foutput_max = std::numeric_limits<float>::infinity();
   if (op_type_ == OpComputeType::op_compute_type_fp32) {
     status = xnn_create_fully_connected_nc_f32(
         shape_broadcast[0],  // size_t input_channels,
diff --git a/onnxruntime/core/providers/xnnpack/nn/average_pool.cc b/onnxruntime/core/providers/xnnpack/nn/average_pool.cc
index 1c8ed556e90d7..1fc941d9f52f6 100644
--- a/onnxruntime/core/providers/xnnpack/nn/average_pool.cc
+++ b/onnxruntime/core/providers/xnnpack/nn/average_pool.cc
@@ -33,8 +33,8 @@ Status CreateXnnpackKernel(const PoolAttributes& pool_attrs,
   if (pool_attrs.auto_pad == AutoPadType::SAME_UPPER) {
     flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING;
   }
-  float foutput_min = clip_min_max ? clip_min_max->first : -INFINITY;
-  float foutput_max = clip_min_max ? clip_min_max->second : INFINITY;
+  float foutput_min = clip_min_max ? clip_min_max->first : -std::numeric_limits<float>::infinity();
+  float foutput_max = clip_min_max ? clip_min_max->second : std::numeric_limits<float>::infinity();
   xnn_status status = xnn_status_unsupported_parameter;
   if (avgpool_type == OpComputeType::op_compute_type_fp32) {
     status = xnn_create_average_pooling2d_nhwc_f32(input_padding_top, input_padding_right,
diff --git a/onnxruntime/core/providers/xnnpack/nn/conv_base.cc b/onnxruntime/core/providers/xnnpack/nn/conv_base.cc
index e0723c0e7690e..458e6000c8d70 100644
--- a/onnxruntime/core/providers/xnnpack/nn/conv_base.cc
+++ b/onnxruntime/core/providers/xnnpack/nn/conv_base.cc
@@ -54,8 +54,8 @@ Status CreateXnnpackKernel(const ConvAttributes& conv_attrs,
   xnn_status status = xnn_status::xnn_status_uninitialized;
   p = nullptr;
 
-  float foutput_min = clip_min_max ? clip_min_max->first : -INFINITY;
-  float foutput_max = clip_min_max ? clip_min_max->second : INFINITY;
+  float foutput_min = clip_min_max ? clip_min_max->first : -std::numeric_limits<float>::infinity();
+  float foutput_max = clip_min_max ? clip_min_max->second : std::numeric_limits<float>::infinity();
   // with the following IC and OC number, we can cover depthwise and regular conv at the same time
   // the equation 'IC (group_input_channels) == C ' set up when group_count==1 (regular convolution)
   // and OC (group_output_channels) follows the same rule.
diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
index 6742e51e55082..c828ae9400174 100644
--- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
+++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
@@ -3,6 +3,8 @@
 
 #include "max_pool.h"
 
+#include <limits>
+
 #include "core/graph/graph.h"
 #include "core/providers/utils.h"
 #include "core/providers/xnnpack/xnnpack_init.h"
@@ -168,8 +170,8 @@ MaxPool::MaxPool(const OpKernelInfo& info)
   auto input_dtype = X_arg.TypeAsProto()->tensor_type().elem_type();
   xnn_status status = xnn_status_invalid_state;
   struct xnn_operator* p = nullptr;
-  float foutput_min = clip_min_max_ ? clip_min_max_->first : -INFINITY;
-  float foutput_max = clip_min_max_ ? clip_min_max_->second : INFINITY;
+  float foutput_min = clip_min_max_ ? clip_min_max_->first : -std::numeric_limits<float>::infinity();
+  float foutput_max = clip_min_max_ ? clip_min_max_->second : std::numeric_limits<float>::infinity();
   if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
     maxpool_type_ = OpComputeType::op_compute_type_fp32;
     status = xnn_create_max_pooling2d_nhwc_f32(input_padding_top, input_padding_right,