From e2eadab7f36e6f60f8a0f7e5e09fe6936d264707 Mon Sep 17 00:00:00 2001
From: aciddelgado
Date: Sat, 4 Nov 2023 10:43:53 -0700
Subject: [PATCH] fix warning and lint

---
 .../contrib_ops/cpu/bert/attention_common.h  |  4 ++--
 .../cpu/transformers/logits_processor.h      |  6 ++---
 .../cuda/bert/group_query_attention.cc       | 24 +++++++++----------
 .../cuda/bert/group_query_attention.h        |  2 +-
 .../cuda/bert/group_query_attention_helper.h | 14 +++++------
 .../cuda/bert/group_query_attention_impl.cu  |  4 ++--
 6 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_common.h b/onnxruntime/contrib_ops/cpu/bert/attention_common.h
index ef57b2a11d7a9..9be826c4506fa 100644
--- a/onnxruntime/contrib_ops/cpu/bert/attention_common.h
+++ b/onnxruntime/contrib_ops/cpu/bert/attention_common.h
@@ -97,11 +97,11 @@ struct GroupQueryAttentionParameters {
   int head_size;
   int kv_hidden_size;
   int kv_num_heads;
-  int num_splits; // number of splits for splitkv
+  int num_splits;  // number of splits for splitkv
   bool has_mask;
   bool is_unidirectional;  // causal
   bool kv_share_buffer;
-  bool is_prompt; // determines if seqlens_k is past or kv sequence length tensor
+  bool is_prompt;  // determines if seqlens_k is past or kv sequence length tensor
   float scale;
   AttentionQkvFormat qkv_format;
   AttentionQkvFormat past_kv_format;
diff --git a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h
index d870f5b9a2e1c..c893ba7c22d99 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h
@@ -266,9 +266,9 @@ class TimestampLogitsProcessor : public ILogitsProcessor<T> {
       }
     }
 
-// #ifdef DEBUG_GENERATION
-// DumpScores("TimestampLogitsProcessor", next_token_scores);
-// #endif
+    // #ifdef DEBUG_GENERATION
+    // DumpScores("TimestampLogitsProcessor", next_token_scores);
+    // #endif
   }
 
  private:
diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc
index d07e6c5e3132e..e5e82a5671f88 100644
--- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc
@@ -17,18 +17,18 @@ namespace onnxruntime {
 namespace contrib {
 namespace cuda {
 
-#define REGISTER_KERNEL_TYPED(T)                                 \
-  ONNX_OPERATOR_TYPED_KERNEL_EX(                                 \
-      GroupQueryAttention,                                       \
-      kMSDomain,                                                 \
-      1,                                                         \
-      T,                                                         \
-      kCudaExecutionProvider,                                    \
-      (*KernelDefBuilder::Create())                              \
-          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()) \
+#define REGISTER_KERNEL_TYPED(T)                                          \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(                                          \
+      GroupQueryAttention,                                                \
+      kMSDomain,                                                          \
+      1,                                                                  \
+      T,                                                                  \
+      kCudaExecutionProvider,                                             \
+      (*KernelDefBuilder::Create())                                       \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>())          \
           .TypeConstraint("M", {DataTypeImpl::GetTensorType<int32_t>()}) \
-          .MayInplace(3, 1)                                      \
-          .MayInplace(4, 2),                                     \
+          .MayInplace(3, 1)                                               \
+          .MayInplace(4, 2),                                              \
       GroupQueryAttention<T>);
 
 // REGISTER_KERNEL_TYPED(float)
@@ -131,7 +131,7 @@ Status GroupQueryAttention<T>::ComputeInternal(OpKernelContext* context) const {
   auto out_accum_buffer = GetScratchBuffer<void>(0, context->GetComputeStream());  // nullptr
 #endif
 
-ORT_ENFORCE(use_flash_attention);
+  ORT_ENFORCE(use_flash_attention);
 
 #if USE_MEMORY_EFFICIENT_ATTENTION
   int sm = (device_prop.major * 10) + device_prop.minor;
diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h
index 04491b6efc9ec..a7d582432de44 100644
--- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h
+++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h
@@ -24,7 +24,7 @@ class GroupQueryAttention final : public CudaKernel {
   int kv_num_heads_;  // different for k and v for group query attention
   int past_sequence_length_;
   bool is_unidirectional_;  // causal
-  bool kv_share_buffer_; // kv-cache
+  bool kv_share_buffer_;  // kv-cache
   bool is_past_bsnh_;
   float scale_;
   bool disable_flash_attention_;
diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h
index d7feaaa6285ad..3a1c09cbaaf79 100644
--- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h
+++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h
@@ -185,7 +185,7 @@ Status CheckInputs(const Tensor* query,
     const auto& attention_mask_shape = attention_mask->Shape().GetDims();
     if (attention_mask_shape[0] != batch_size) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                            "attention_mask dim 0 must be batch_size.");
+                             "attention_mask dim 0 must be batch_size.");
     }
     if (attention_mask_shape[1] == kv_sequence_length) {
       is_prompt = true;
@@ -197,7 +197,7 @@ Status CheckInputs(const Tensor* query,
   if (kv_share_buffer) {
     if (attention_mask == nullptr) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                            "attention_mask tensor must be present when kv-share buffer is on.");
+                             "attention_mask tensor must be present when kv-share buffer is on.");
     }
     present_sequence_length = max_sequence_length;
   } else {
@@ -208,11 +208,11 @@ Status CheckInputs(const Tensor* query,
   if (parameters != nullptr) {
     GroupQueryAttentionParameters* output_parameters = reinterpret_cast<GroupQueryAttentionParameters*>(parameters);
     output_parameters->batch_size = batch_size;
-    output_parameters->sequence_length = sequence_length; // sequence length of Q
-    output_parameters->past_sequence_length = past_sequence_length; // max sequence length of past kv tensors
-    output_parameters->kv_sequence_length = kv_sequence_length; // max sequence length of new kv tensors
-    output_parameters->present_sequence_length = present_sequence_length; // max sequence length of present kv tensors
-    output_parameters->max_sequence_length = max_sequence_length; // max sequence length of kv buffer tensors TODO(aciddelgado): always same as present, remove
+    output_parameters->sequence_length = sequence_length;                  // sequence length of Q
+    output_parameters->past_sequence_length = past_sequence_length;        // max sequence length of past kv tensors
+    output_parameters->kv_sequence_length = kv_sequence_length;            // max sequence length of new kv tensors
+    output_parameters->present_sequence_length = present_sequence_length;  // max sequence length of present kv tensors
+    output_parameters->max_sequence_length = max_sequence_length;          // max sequence length of kv buffer tensors TODO(aciddelgado): always same as present, remove
     output_parameters->mask_sequence_length = mask_sequence_length;
     output_parameters->hidden_size = q_hidden_size;
     output_parameters->num_heads = num_heads;
diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu
index 6742c58bf3a09..6eb794ceadb26 100644
--- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu
@@ -583,7 +583,7 @@ Status FlashAttention(
   } else {
     // Launch kernel to copy seqlen
     int thr_per_blk = 256;
-    int blk_in_grid = ceil(float(batch_size) / thr_per_blk);
+    int blk_in_grid = int(ceil(float(batch_size) / thr_per_blk));
     repeat_seqlen<<<blk_in_grid, thr_per_blk, 0, stream>>>(data.seqlens_k, parameters.past_sequence_length, batch_size);
   }
 
@@ -693,7 +693,7 @@ Status EfficientAttention(
   if (!parameters.has_mask) {
     // Launch kernel to copy seqlen
     int thr_per_blk = 256;
-    int blk_in_grid = ceil(float(batch_size) / thr_per_blk);
+    int blk_in_grid = int(ceil(float(batch_size) / thr_per_blk));
     repeat_seqlen<<<blk_in_grid, thr_per_blk, 0, stream>>>(data.seqlens_k, parameters.past_sequence_length, batch_size);
   } else {
     ORT_RETURN_IF_ERROR(LaunchGetCacheSeqlens(parameters, data.attention_mask, data.seqlens_k, parameters.is_prompt, stream, 256));
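Note on the warning fix in group_query_attention_impl.cu: ceil() returns a floating-point value, so assigning it directly to an int draws an implicit float-to-int conversion warning on some compilers; the patch makes the narrowing explicit with a functional-style int(...) cast. Below is a minimal standalone sketch (not part of the patch; the helper names are illustrative only) of the patched expression next to the common integer-only round-up that avoids the float round trip:

#include <cmath>

// Grid size as computed by the patched lines: the explicit int(...) cast
// silences the float-to-int conversion warning.
int grid_size_with_cast(int batch_size, int thr_per_blk) {
  return int(std::ceil(float(batch_size) / thr_per_blk));
}

// Equivalent round-up using integer arithmetic only (valid for positive sizes).
int grid_size_integer_only(int batch_size, int thr_per_blk) {
  return (batch_size + thr_per_blk - 1) / thr_per_blk;
}

Either form gives the same block count for positive inputs, e.g. batch_size = 1000 with thr_per_blk = 256 yields 4 in both cases.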