Skip to content

Commit

Permalink
Replace INFINITY by std::numeric_limits<float>::infinity() (microsoft…
Browse files Browse the repository at this point in the history
…#22868)

Replace INFINITY by `std::numeric_limits<float>::infinity()` to avoid
build errors with Visual Studio 2022 v17.12 Preview 5

### Motivation and Context
microsoft#22728
  • Loading branch information
tianleiwu authored and ankitm3k committed Dec 11, 2024
1 parent fb4f5bf commit 6dd8736
Show file tree
Hide file tree
Showing 14 changed files with 70 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ inline __device__ void compute_attn_1rowblock(const Params& params, const int bi
for (int m = 0; m < size<1>(tOgO); ++m) {
const int row = get<0>(tOcO(0, m, 0));
if (row < binfo.actual_seqlen_q - m_block * kBlockM && get<1>(tOcO(0, m, 0)) == 0) {
gLSE(row) = INFINITY;
gLSE(row) = std::numeric_limits<ElementAccum>::infinity();
}
}
return;
Expand Down Expand Up @@ -499,7 +499,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params& params, cons
for (int m = 0; m < size<1>(tOgOaccum); ++m) {
const int row = get<0>(tOcO(0, m, 0));
if (row < binfo.actual_seqlen_q - m_block * kBlockM && get<1>(tOcO(0, m, 0)) == 0) {
gLSEaccum(row) = Split ? -INFINITY : INFINITY;
gLSEaccum(row) = Split ? -std::numeric_limits<ElementAccum>::infinity() : std::numeric_limits<ElementAccum>::infinity();
}
}
return;
Expand Down Expand Up @@ -1061,7 +1061,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
for (int l = 0; l < kNLsePerThread; ++l) {
const int row = l * kRowsPerLoadLSE + tidx / kBlockM;
const int col = tidx % kBlockM;
ElementAccum lse = (row < params.num_splits && col < params.b * params.h * params.seqlen_q - bidx * kBlockM) ? gLSEaccum(row, col) : -INFINITY;
ElementAccum lse = (row < params.num_splits && col < params.b * params.h * params.seqlen_q - bidx * kBlockM) ? gLSEaccum(row, col) : -std::numeric_limits<ElementAccum>::infinity();
if (row < kMaxSplits) {
sLSE[row][col] = lse;
}
Expand All @@ -1082,7 +1082,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
for (int l = 0; l < kNLsePerThread; ++l) {
const int row = l * kRowsPerLoadTranspose + tidx % kRowsPerLoadTranspose;
const int col = tidx / kRowsPerLoadTranspose;
lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE[row][col] : -INFINITY;
lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE[row][col] : -std::numeric_limits<ElementAccum>::infinity();
// if (bidx == 0 && tidx < 32) { printf("tidx = %d, row = %d, col = %d, lse = %f\n", tidx, row, col, lse_accum(l)); }
}

Expand All @@ -1094,7 +1094,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
}
MaxOp<float> max_op;
lse_max = Allreduce<kRowsPerLoadTranspose>::run(lse_max, max_op);
lse_max = lse_max == -INFINITY ? 0.0f : lse_max; // In case all local LSEs are -inf
lse_max = lse_max == -std::numeric_limits<ElementAccum>::infinity() ? 0.0f : lse_max; // In case all local LSEs are -inf
float lse_sum = expf(lse_accum(0) - lse_max);
#pragma unroll
for (int l = 1; l < kNLsePerThread; ++l) {
Expand All @@ -1104,7 +1104,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) {
lse_sum = Allreduce<kRowsPerLoadTranspose>::run(lse_sum, sum_op);
// For the case where all local lse == -INFINITY, we want to set lse_logsum to INFINITY. Otherwise
// lse_logsum is log(0.0) = -INFINITY and we get NaN when we do lse_accum(l) - lse_logsum.
ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? INFINITY : logf(lse_sum) + lse_max;
ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? std::numeric_limits<ElementAccum>::infinity() : logf(lse_sum) + lse_max;
// if (bidx == 0 && tidx < 32) { printf("tidx = %d, lse = %f, lse_max = %f, lse_logsum = %f\n", tidx, lse_accum(0), lse_max, lse_logsum); }
if (tidx % kRowsPerLoadTranspose == 0 && tidx / kRowsPerLoadTranspose < kBlockM) {
gLSE(tidx / kRowsPerLoadTranspose) = lse_logsum;
Expand Down
15 changes: 8 additions & 7 deletions onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#pragma once

#include <limits>
#include <cute/tensor.hpp>

namespace onnxruntime {
Expand All @@ -28,7 +29,7 @@ __forceinline__ __device__ void apply_mask(Tensor<Engine, Layout>& tensor, const
// Without the "make_coord" we get wrong results
#pragma unroll
for (int mi = 0; mi < size<0>(tensor); ++mi) {
tensor(mi, make_coord(j, nj)) = -INFINITY;
tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
}
}
}
Expand Down Expand Up @@ -59,7 +60,7 @@ __forceinline__ __device__ void apply_mask_local(Tensor<Engine, Layout>& tensor,
for (int j = 0; j < size<1, 0>(tensor); ++j) {
const int col_idx = col_idx_base + j;
if (col_idx >= col_idx_limit_right || (HasWSLeft && col_idx < col_idx_limit_left)) {
tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
}
}
}
Expand Down Expand Up @@ -96,7 +97,7 @@ __forceinline__ __device__ void apply_mask_causal_w_idx(
#pragma unroll
for (int ni = 0; ni < size<1, 1>(tensor); ++ni) {
if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) {
tensor(mi, ni) = -INFINITY;
tensor(mi, ni) = -std::numeric_limits<float>::infinity();
}
}
// if (cute::thread0()) {
Expand Down Expand Up @@ -151,7 +152,7 @@ struct Mask {
}
if constexpr (!Is_even_MN) {
if (col_idx >= max_seqlen_k) {
tensor(mi, make_coord(j, nj)) = -INFINITY;
tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
}
}
}
Expand Down Expand Up @@ -181,18 +182,18 @@ struct Mask {
}
if constexpr (Causal_mask) {
if (col_idx >= col_idx_limit_right) {
tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
}
}
if constexpr (Is_local) {
if (col_idx >= col_idx_limit_right || col_idx < col_idx_limit_left) {
tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
}
}
if constexpr (!Causal_mask && !Is_local && !Is_even_MN) {
// Causal and Local already handles MN masking
if (col_idx >= max_seqlen_k) {
tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
}
}
}
Expand Down
13 changes: 9 additions & 4 deletions onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#pragma once

#include <cmath>
#include <limits>

#include <cute/tensor.hpp>

Expand Down Expand Up @@ -71,7 +72,9 @@ __forceinline__ __device__ void scale_apply_exp2(Tensor<Engine0, Layout0>& tenso
// If max is -inf, then all elements must have been -inf (possibly due to masking).
// We don't want (-inf - (-inf)) since that would give NaN.
// If we don't have float around M_LOG2E the multiplication is done in fp64.
const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * (Scale_max ? scale : float(M_LOG2E));
const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity()
? 0.f
: max(mi) * (Scale_max ? scale : float(M_LOG2E));
#pragma unroll
for (int ni = 0; ni < size<1>(tensor); ++ni) {
// Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
Expand Down Expand Up @@ -99,7 +102,7 @@ __forceinline__ __device__ void max_scale_exp2_sum(Tensor<Engine0, Layout0>& ten
max(mi) = Allreduce<4>::run(max(mi), max_op);
// If max is -inf, then all elements must have been -inf (possibly due to masking).
// We don't want (-inf - (-inf)) since that would give NaN.
const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * scale;
const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity() ? 0.f : max(mi) * scale;
sum(mi) = 0;
#pragma unroll
for (int ni = 0; ni < size<1>(tensor); ++ni) {
Expand Down Expand Up @@ -143,7 +146,7 @@ struct Softmax {
for (int mi = 0; mi < size(row_max); ++mi) {
float scores_max_cur = !Check_inf
? row_max(mi)
: (row_max(mi) == -INFINITY ? 0.0f : row_max(mi));
: (row_max(mi) == -std::numeric_limits<float>::infinity() ? 0.0f : row_max(mi));
float scores_scale = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2);
row_sum(mi) *= scores_scale;
#pragma unroll
Expand All @@ -169,7 +172,9 @@ struct Softmax {
for (int mi = 0; mi < size<0>(acc_o_rowcol); ++mi) {
float sum = smooth_softmax ? row_sum(mi) + expf(-row_max(mi) * softmax_scale) : row_sum(mi);
float inv_sum = (sum == 0.f || sum != sum) ? 1.f : 1.f / sum;
lse(mi) = (sum == 0.f || sum != sum) ? (Split ? -INFINITY : INFINITY) : row_max(mi) * softmax_scale + __logf(sum);
lse(mi) = (sum == 0.f || sum != sum)
? (Split ? -std::numeric_limits<float>::infinity() : std::numeric_limits<float>::infinity())
: row_max(mi) * softmax_scale + __logf(sum);
float scale = inv_sum;
#pragma unroll
for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -825,7 +825,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
const int row = l * kRowsPerLoadLSE + tidx / kBlockM;
const int col = tidx % kBlockM;
// We skip the first row = 0, as we already populated it in shared memory.
ElementAccum lse = (row > 0 && row < total_splits && col < params.b * params.h * (index_t)params.seqlen_q - row_offset_lseaccum) ? gLSEaccumRead(row, col) : -INFINITY;
ElementAccum lse = (row > 0 && row < total_splits && col < params.b * params.h * (index_t)params.seqlen_q - row_offset_lseaccum) ? gLSEaccumRead(row, col) : -std::numeric_limits<ElementAccum>::infinity();
if (row > 0 && row < kMaxSplits) {
sLSE(row, col) = lse;

Expand Down Expand Up @@ -857,7 +857,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
for (int l = 0; l < kNLsePerThread; ++l) {
const int row = l * kRowsPerLoadTranspose + tidx % kRowsPerLoadTranspose;
const int col = tidx / kRowsPerLoadTranspose;
lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE(row, col) : -INFINITY;
lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE(row, col) : -std::numeric_limits<ElementAccum>::infinity();

#if defined(DEBUG_LEAN_ATTENTION)
if (threadIdx.x == 0 && blockIdx.z == tracing_block) {
Expand All @@ -874,7 +874,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
}
MaxOp<float> max_op;
lse_max = Allreduce<kRowsPerLoadTranspose>::run(lse_max, max_op);
lse_max = lse_max == -INFINITY ? 0.0f : lse_max; // In case all local LSEs are -inf
lse_max = lse_max == -std::numeric_limits<ElementAccum>::infinity() ? 0.0f : lse_max; // In case all local LSEs are -inf
float lse_sum = expf(lse_accum(0) - lse_max);
#pragma unroll
for (int l = 1; l < kNLsePerThread; ++l) {
Expand All @@ -884,7 +884,9 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i
lse_sum = Allreduce<kRowsPerLoadTranspose>::run(lse_sum, sum_op);
// For the case where all local lse == -INFINITY, we want to set lse_logsum to INFINITY. Otherwise
// lse_logsum is log(0.0) = -INFINITY and we get NaN when we do lse_accum(l) - lse_logsum.
ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? INFINITY : logf(lse_sum) + lse_max;
ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum)
? std::numeric_limits<ElementAccum>::infinity()
: logf(lse_sum) + lse_max;
// if (tidx % kRowsPerLoadTranspose == 0 && tidx / kRowsPerLoadTranspose < kBlockM) { gLSE(tidx / kRowsPerLoadTranspose) = lse_logsum; }
// Store the scales exp(lse - lse_logsum) in shared memory.
#pragma unroll
Expand Down
16 changes: 8 additions & 8 deletions onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
******************************************************************************/

#pragma once

#include <limits>
#include <cute/tensor.hpp>

namespace onnxruntime {
Expand All @@ -28,7 +28,7 @@ __forceinline__ __device__ void apply_mask(Tensor<Engine, Layout>& tensor, const
// Without the "make_coord" we get wrong results
#pragma unroll
for (int mi = 0; mi < size<0>(tensor); ++mi) {
tensor(mi, make_coord(j, nj)) = -INFINITY;
tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
}
}
}
Expand Down Expand Up @@ -59,7 +59,7 @@ __forceinline__ __device__ void apply_mask_local(Tensor<Engine, Layout>& tensor,
for (int j = 0; j < size<1, 0>(tensor); ++j) {
const int col_idx = col_idx_base + j;
if (col_idx >= col_idx_limit_right || (HasWSLeft && col_idx < col_idx_limit_left)) {
tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
}
}
}
Expand Down Expand Up @@ -96,7 +96,7 @@ __forceinline__ __device__ void apply_mask_causal_w_idx(
#pragma unroll
for (int ni = 0; ni < size<1, 1>(tensor); ++ni) {
if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) {
tensor(mi, ni) = -INFINITY;
tensor(mi, ni) = -std::numeric_limits<float>::infinity();
}
}
// if (cute::thread0()) {
Expand Down Expand Up @@ -152,7 +152,7 @@ struct Mask {
}
if constexpr (!Is_even_MN) {
if (col_idx >= max_seqlen_k) {
tensor(mi, make_coord(j, nj)) = -INFINITY;
tensor(mi, make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
}
}
}
Expand Down Expand Up @@ -182,18 +182,18 @@ struct Mask {
}
if constexpr (Causal_mask) {
if (col_idx >= col_idx_limit_right) {
tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
}
}
if constexpr (Is_local) {
if (col_idx >= col_idx_limit_right || col_idx < col_idx_limit_left) {
tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
}
}
if constexpr (!Causal_mask && !Is_local && !Is_even_MN) {
// Causal and Local already handles MN masking
if (col_idx >= max_seqlen_k) {
tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY;
tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits<float>::infinity();
}
}
}
Expand Down
14 changes: 9 additions & 5 deletions onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
******************************************************************************/

#pragma once

#include <limits>
#include <cmath>

#include <cute/tensor.hpp>
Expand Down Expand Up @@ -72,7 +72,9 @@ __forceinline__ __device__ void scale_apply_exp2(Tensor<Engine0, Layout0>& tenso
// If max is -inf, then all elements must have been -inf (possibly due to masking).
// We don't want (-inf - (-inf)) since that would give NaN.
// If we don't have float around M_LOG2E the multiplication is done in fp64.
const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * (Scale_max ? scale : float(M_LOG2E));
const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity()
? 0.f
: max(mi) * (Scale_max ? scale : float(M_LOG2E));
#pragma unroll
for (int ni = 0; ni < size<1>(tensor); ++ni) {
// Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
Expand Down Expand Up @@ -107,7 +109,7 @@ __forceinline__ __device__ void max_scale_exp2_sum(Tensor<Engine0, Layout0>& ten
max(mi) = Allreduce<4>::run(max(mi), max_op);
// If max is -inf, then all elements must have been -inf (possibly due to masking).
// We don't want (-inf - (-inf)) since that would give NaN.
const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * scale;
const float max_scaled = max(mi) == -std::numeric_limits<float>::infinity() ? 0.f : max(mi) * scale;
sum(mi) = 0;
#pragma unroll
for (int ni = 0; ni < size<1>(tensor); ++ni) {
Expand Down Expand Up @@ -151,7 +153,7 @@ struct Softmax {
for (int mi = 0; mi < size(row_max); ++mi) {
float scores_max_cur = !Check_inf
? row_max(mi)
: (row_max(mi) == -INFINITY ? 0.0f : row_max(mi));
: (row_max(mi) == -std::numeric_limits<float>::infinity() ? 0.0f : row_max(mi));
float scores_scale = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2);
row_sum(mi) *= scores_scale;
#pragma unroll
Expand Down Expand Up @@ -181,7 +183,9 @@ struct Softmax {
// printf("sum: %f, inv_sum: %f\n", sum, inv_sum);
// printf("mi %d row_max %f softmax_scale %f\n", mi, row_max(mi), softmax_scale);
// }
lse(mi) = (sum == 0.f || sum != sum) ? (Split ? -INFINITY : INFINITY) : row_max(mi) * softmax_scale + __logf(sum);
lse(mi) = (sum == 0.f || sum != sum)
? (Split ? -std::numeric_limits<float>::infinity() : std::numeric_limits<float>::infinity())
: row_max(mi) * softmax_scale + __logf(sum);
float scale = !Is_dropout ? inv_sum : inv_sum * rp_dropout;
#pragma unroll
for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) {
Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Licensed under the MIT License.
/*
Kernel implementation for blocking repeated n-grams.
*/

#include <limits>
#include "core/providers/cuda/cu_inc/common.cuh"
#include "contrib_ops/cuda/bert/ngram_repeat_block_impl.h"

Expand Down Expand Up @@ -48,7 +48,7 @@ __global__ void banRepeatedTokens(const int64_t* __restrict__ tokens,
}
if (is_banned == true) {
auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1];
lprobs[lprob_start + token_to_be_banned] = -INFINITY;
lprobs[lprob_start + token_to_be_banned] = -std::numeric_limits<float>::infinity();
}
}

Expand Down
4 changes: 3 additions & 1 deletion onnxruntime/core/optimizer/attention_fusion_helper.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include <limits>
#include "onnx/defs/shape_inference.h"
#include "onnx/defs/tensor_proto_util.h"
#include "core/framework/tensorprotoutils.h"
Expand Down Expand Up @@ -767,7 +768,8 @@ bool MatchInputMaskSubgraph(const Graph& graph, const Node& layer_norm, const No
}

// check where has X=-Infinity
if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(where.InputDefs()[1]), -INFINITY, true)) {
if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(where.InputDefs()[1]),
-std::numeric_limits<float>::infinity(), true)) {
DEBUG_LOG("where const not matched.");
return false;
}
Expand Down
5 changes: 3 additions & 2 deletions onnxruntime/core/providers/xnnpack/detail/utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <limits>

#include "core/common/common.h"
#include "core/common/safeint.h"
Expand Down Expand Up @@ -239,8 +240,8 @@ std::unique_ptr<IndexedSubGraph::MetaDef> FuseActivation(const NodeUnit& node_un
def.attributes = node_unit.GetNode().GetAttributes();

// use infinity as the default as that's what xnnpack uses if min/max are not set
float min = -INFINITY;
float max = INFINITY;
float min = -std::numeric_limits<float>::infinity();
float max = std::numeric_limits<float>::infinity();

const auto& activation_type = activation.OpType();
if (activation_type == "Clip") {
Expand Down
Loading

0 comments on commit 6dd8736

Please sign in to comment.