From e2eadab7f36e6f60f8a0f7e5e09fe6936d264707 Mon Sep 17 00:00:00 2001
From: aciddelgado
Date: Sat, 4 Nov 2023 10:43:53 -0700
Subject: [PATCH] fix warning and lint

---
 .../contrib_ops/cpu/bert/attention_common.h  |  4 ++--
 .../cpu/transformers/logits_processor.h      |  6 ++---
 .../cuda/bert/group_query_attention.cc       | 24 +++++++++----------
 .../cuda/bert/group_query_attention.h        |  2 +-
 .../cuda/bert/group_query_attention_helper.h | 14 +++++------
 .../cuda/bert/group_query_attention_impl.cu  |  4 ++--
 6 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_common.h b/onnxruntime/contrib_ops/cpu/bert/attention_common.h
index ef57b2a11d7a9..9be826c4506fa 100644
--- a/onnxruntime/contrib_ops/cpu/bert/attention_common.h
+++ b/onnxruntime/contrib_ops/cpu/bert/attention_common.h
@@ -97,11 +97,11 @@ struct GroupQueryAttentionParameters {
   int head_size;
   int kv_hidden_size;
   int kv_num_heads;
-  int num_splits; // number of splits for splitkv
+  int num_splits;  // number of splits for splitkv
   bool has_mask;
   bool is_unidirectional;  // causal
   bool kv_share_buffer;
-  bool is_prompt; // determines if seqlens_k is past or kv sequence length tensor
+  bool is_prompt;  // determines if seqlens_k is past or kv sequence length tensor
   float scale;
   AttentionQkvFormat qkv_format;
   AttentionQkvFormat past_kv_format;
diff --git a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h
index d870f5b9a2e1c..c893ba7c22d99 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h
+++ b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h
@@ -266,9 +266,9 @@ class TimestampLogitsProcessor : public ILogitsProcessor<T> {
       }
     }
 
-// #ifdef DEBUG_GENERATION
-// DumpScores("TimestampLogitsProcessor", next_token_scores);
-// #endif
+    // #ifdef DEBUG_GENERATION
+    // DumpScores("TimestampLogitsProcessor", next_token_scores);
+    // #endif
   }
 
  private:
diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc
index d07e6c5e3132e..e5e82a5671f88 100644
--- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc
@@ -17,18 +17,18 @@ namespace onnxruntime {
 namespace contrib {
 namespace cuda {
 
-#define REGISTER_KERNEL_TYPED(T)                                 \
-  ONNX_OPERATOR_TYPED_KERNEL_EX(                                 \
-      GroupQueryAttention,                                       \
-      kMSDomain,                                                 \
-      1,                                                         \
-      T,                                                         \
-      kCudaExecutionProvider,                                    \
-      (*KernelDefBuilder::Create())                              \
-          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()) \
+#define REGISTER_KERNEL_TYPED(T)                                          \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(                                          \
+      GroupQueryAttention,                                                \
+      kMSDomain,                                                          \
+      1,                                                                  \
+      T,                                                                  \
+      kCudaExecutionProvider,                                             \
+      (*KernelDefBuilder::Create())                                       \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>())          \
           .TypeConstraint("M", {DataTypeImpl::GetTensorType<int32_t>()}) \
-          .MayInplace(3, 1)                                      \
-          .MayInplace(4, 2),                                     \
+          .MayInplace(3, 1)                                               \
+          .MayInplace(4, 2),                                              \
       GroupQueryAttention<T>);
 
 // REGISTER_KERNEL_TYPED(float)
@@ -131,7 +131,7 @@ Status GroupQueryAttention<T>::ComputeInternal(OpKernelContext* context) const {
   auto out_accum_buffer = GetScratchBuffer<void>(0, context->GetComputeStream());  // nullptr
 #endif
 
-ORT_ENFORCE(use_flash_attention);
+  ORT_ENFORCE(use_flash_attention);
 
 #if USE_MEMORY_EFFICIENT_ATTENTION
   int sm = (device_prop.major * 10) + device_prop.minor;
diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h
index 04491b6efc9ec..a7d582432de44 100644
--- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h
+++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h
@@ -24,7 +24,7 @@ class GroupQueryAttention final : public CudaKernel {
   int kv_num_heads_;  // different for k and v for group query attention
   int past_sequence_length_;
   bool is_unidirectional_;  // causal
-  bool kv_share_buffer_; // kv-cache
+  bool kv_share_buffer_;  // kv-cache
   bool is_past_bsnh_;
   float scale_;
   bool disable_flash_attention_;
diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h
index d7feaaa6285ad..3a1c09cbaaf79 100644
--- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h
+++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_helper.h
@@ -185,7 +185,7 @@ Status CheckInputs(const Tensor* query,
     const auto& attention_mask_shape = attention_mask->Shape().GetDims();
     if (attention_mask_shape[0] != batch_size) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                            "attention_mask dim 0 must be batch_size.");
+                             "attention_mask dim 0 must be batch_size.");
     }
     if (attention_mask_shape[1] == kv_sequence_length) {
       is_prompt = true;
@@ -197,7 +197,7 @@ Status CheckInputs(const Tensor* query,
   if (kv_share_buffer) {
     if (attention_mask == nullptr) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                            "attention_mask tensor must be present when kv-share buffer is on.");
+                             "attention_mask tensor must be present when kv-share buffer is on.");
     }
     present_sequence_length = max_sequence_length;
   } else {
@@ -208,11 +208,11 @@ Status CheckInputs(const Tensor* query,
   if (parameters != nullptr) {
     GroupQueryAttentionParameters* output_parameters = reinterpret_cast<GroupQueryAttentionParameters*>(parameters);
     output_parameters->batch_size = batch_size;
-    output_parameters->sequence_length = sequence_length; // sequence length of Q
-    output_parameters->past_sequence_length = past_sequence_length; // max sequence length of past kv tensors
-    output_parameters->kv_sequence_length = kv_sequence_length; // max sequence length of new kv tensors
-    output_parameters->present_sequence_length = present_sequence_length; // max sequence length of present kv tensors
-    output_parameters->max_sequence_length = max_sequence_length; // max sequence length of kv buffer tensors TODO(aciddelgado): always same as present, remove
+    output_parameters->sequence_length = sequence_length;                  // sequence length of Q
+    output_parameters->past_sequence_length = past_sequence_length;        // max sequence length of past kv tensors
+    output_parameters->kv_sequence_length = kv_sequence_length;            // max sequence length of new kv tensors
+    output_parameters->present_sequence_length = present_sequence_length;  // max sequence length of present kv tensors
+    output_parameters->max_sequence_length = max_sequence_length;          // max sequence length of kv buffer tensors TODO(aciddelgado): always same as present, remove
     output_parameters->mask_sequence_length = mask_sequence_length;
     output_parameters->hidden_size = q_hidden_size;
     output_parameters->num_heads = num_heads;
diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu
index 6742c58bf3a09..6eb794ceadb26 100644
--- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu
@@ -583,7 +583,7 @@ Status FlashAttention(
   } else {
     // Launch kernel to copy seqlen
     int thr_per_blk = 256;
-    int blk_in_grid = ceil(float(batch_size) / thr_per_blk);
+    int blk_in_grid = int(ceil(float(batch_size) / thr_per_blk));
     repeat_seqlen<<<blk_in_grid, thr_per_blk, 0, stream>>>(data.seqlens_k, parameters.past_sequence_length, batch_size);
   }
 
@@ -693,7 +693,7 @@ Status EfficientAttention(
   if (!parameters.has_mask) {
     // Launch kernel to copy seqlen
     int thr_per_blk = 256;
-    int blk_in_grid = ceil(float(batch_size) / thr_per_blk);
+    int blk_in_grid = int(ceil(float(batch_size) / thr_per_blk));
     repeat_seqlen<<<blk_in_grid, thr_per_blk, 0, stream>>>(data.seqlens_k, parameters.past_sequence_length, batch_size);
   } else {
     ORT_RETURN_IF_ERROR(LaunchGetCacheSeqlens(parameters, data.attention_mask, data.seqlens_k, parameters.is_prompt, stream, 256));
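Note on the warning fix in group_query_attention_impl.cu: ceil() returns a floating-point value, so assigning it directly to an int draws an implicit float-to-int conversion warning on some compilers; the patch makes the narrowing explicit with a functional-style int(...) cast. Below is a minimal standalone sketch (not part of the patch; the helper names are illustrative only) of the patched expression next to the common integer-only round-up that avoids the float round trip:

#include <cmath>

// Grid size as computed by the patched lines: the explicit int(...) cast
// silences the float-to-int conversion warning.
int grid_size_with_cast(int batch_size, int thr_per_blk) {
  return int(std::ceil(float(batch_size) / thr_per_blk));
}

// Equivalent round-up using integer arithmetic only (valid for positive sizes).
int grid_size_integer_only(int batch_size, int thr_per_blk) {
  return (batch_size + thr_per_blk - 1) / thr_per_blk;
}

Either form gives the same block count for positive inputs, e.g. batch_size = 1000 with thr_per_blk = 256 yields 4 in both cases.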