diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index 45306c852a906..ed9e2a0567d2f 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -5646,7 +5646,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 #### Type Constraints
 
 <dl>
-<dt><tt>T</tt> : tensor(float16), tensor(bfloat16)</dt>
+<dt><tt>T</tt> : tensor(float), tensor(float16), tensor(bfloat16)</dt>
 <dd>Constrain input and output to float tensors.</dd>
 <dt><tt>M</tt> : tensor(int32)</dt>
 <dd>Constrain integer type.</dd>
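The widened `T` constraint reflects the new float32 CPU kernel registered later in this patch: `SparseAttention` can now be built and run with `tensor(float)` inputs on the CPU execution provider. Below is a minimal, hypothetical sketch (not part of the patch) of constructing such a node with `onnx.helper`; the symbolic shapes follow the comments in `sparse_attention_helper.h`, while the concrete attribute values and tensor names are illustrative assumptions.

```python
# Hypothetical sketch: a float32 SparseAttention node for the CPU kernel.
# Attribute values, names and opset versions below are assumptions for illustration.
from onnx import TensorProto, helper

B, S, M = "batch", "seq", "max_cache"          # symbolic dims
num_heads, kv_num_heads, head_size = 4, 2, 8

node = helper.make_node(
    "SparseAttention",
    inputs=["query", "key", "value", "past_key", "past_value",
            "block_row_indices", "block_col_indices",
            "total_sequence_length", "key_total_sequence_lengths"],
    outputs=["output", "present_key", "present_value"],
    domain="com.microsoft",
    num_heads=num_heads, kv_num_heads=kv_num_heads,
    sparse_block_size=4, do_rotary=0,
)

def fp32(name, shape):
    return helper.make_tensor_value_info(name, TensorProto.FLOAT, shape)

def i32(name, shape):
    return helper.make_tensor_value_info(name, TensorProto.INT32, shape)

graph = helper.make_graph(
    [node], "sparse_attention_fp32",
    inputs=[fp32("query", [B, S, num_heads * head_size]),
            fp32("key", [B, S, kv_num_heads * head_size]),
            fp32("value", [B, S, kv_num_heads * head_size]),
            fp32("past_key", [B, kv_num_heads, M, head_size]),
            fp32("past_value", [B, kv_num_heads, M, head_size]),
            i32("block_row_indices", ["num_layout", "max_blocks_plus_1"]),
            i32("block_col_indices", ["num_layout", "max_nnz"]),
            i32("total_sequence_length", [1]),
            i32("key_total_sequence_lengths", [B])],
    outputs=[fp32("output", [B, S, num_heads * head_size]),
             fp32("present_key", [B, kv_num_heads, M, head_size]),
             fp32("present_value", [B, kv_num_heads, M, head_size])],
)

model = helper.make_model(
    graph,
    opset_imports=[helper.make_opsetid("com.microsoft", 1),
                   helper.make_opsetid("", 17)],
)
```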
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 5f19c16cba616..df5897529baae 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -512,6 +512,7 @@ Do not modify directly.*
 |Sampling|*in* input_ids:**I**<br> *in* max_length:**I**<br> *in* min_length:**I**<br> *in* repetition_penalty:**T**<br> *in* vocab_mask:**I**<br> *in* prefix_vocab_mask:**I**<br> *in* attention_mask:**I**<br> *in* presence_mask:**I**<br> *in* seed:**I**<br> *out* sequences:**I**<br> *out* filtered_logits:**T**|1+|**T** = tensor(float)|
 |SkipLayerNormalization|*in* input:**T**<br> *in* skip:**T**<br> *in* gamma:**T**<br> *in* beta:**T**<br> *in* bias:**T**<br> *out* output:**T**<br> *out* mean:**U**<br> *out* inv_std_var:**U**<br> *out* input_skip_bias_sum:**T**|1+|**T** = tensor(double), tensor(float)|
 |SkipSimplifiedLayerNormalization|*in* input:**T**<br> *in* skip:**T**<br> *in* gamma:**T**<br> *in* bias:**T**<br> *out* output:**T**<br> *out* mean:**U**<br> *out* inv_std_var:**U**<br> *out* input_skip_bias_sum:**T**|1+|**T** = tensor(double), tensor(float)|
+|SparseAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *in* block_row_indices:**M**<br> *in* block_col_indices:**M**<br> *in* total_sequence_length:**M**<br> *in* key_total_sequence_lengths:**M**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**|1+|**M** = tensor(int32)<br> **T** = tensor(float)|
 |SparseToDenseMatMul|*in* A:**T**<br> *in* B:**T1**<br> *out* Y:**T1**|1+|**T** = sparse_tensor(double), sparse_tensor(float), sparse_tensor(int32), sparse_tensor(int64), sparse_tensor(uint32), sparse_tensor(uint64)<br> **T1** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
 |Tokenizer|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(string)|
 |TransposeMatMul|*in* A:**T**<br> *in* B:**T**<br>
*out* Y:**T**|1+|**T** = tensor(float)| diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_base.h index af902a713eaa2..a6782daa58f1a 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_base.h @@ -68,7 +68,6 @@ class AttentionBase { const Tensor* past_seq_len = nullptr) const; int num_heads_; // number of attention heads - int kv_num_heads_; // different for k and v for group query attention bool is_unidirectional_; // whether every token can only attend to previous tokens. std::vector qkv_hidden_sizes_; // Q, K, V hidden sizes parsed from the qkv_hidden_sizes attribute. bool require_same_hidden_size_; // whether the implementation supports different hidden sizes of Q/K/V. diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h index fc4905cd31819..dd52001c2ac6b 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h @@ -3,9 +3,8 @@ #pragma once -#include "attention_base.h" -#include "attention_helper.h" - +#include "contrib_ops/cpu/bert/attention_base.h" +#include "contrib_ops/cpu/bert/attention_helper.h" #include "core/common/common.h" #include "core/common/safeint.h" #include "core/framework/op_kernel.h" diff --git a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h index 6b0c5f395cab0..137612a4bf902 100644 --- a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h @@ -3,8 +3,8 @@ #pragma once -#include "attention_base.h" -#include "attention_helper.h" +#include "contrib_ops/cpu/bert/attention_base.h" +#include "contrib_ops/cpu/bert/attention_helper.h" #include "core/common/common.h" #include "contrib_ops/cpu/bert/attention_common.h" @@ -14,14 +14,31 @@ namespace onnxruntime { namespace contrib { -class GQAAttentionBase : public AttentionBase { +class GQAAttentionBase { protected: - GQAAttentionBase(const OpKernelInfo& info, bool require_same_hidden_size) - : AttentionBase(info, require_same_hidden_size) {} + GQAAttentionBase(const OpKernelInfo& info, bool has_local) { + int64_t num_heads = 0; + ORT_ENFORCE(info.GetAttr("num_heads", &num_heads).IsOK() && num_heads > 0); + num_heads_ = static_cast(num_heads); - int local_window_size_; - bool do_rotary_; + int64_t kv_num_heads = 0; + ORT_ENFORCE(info.GetAttr("kv_num_heads", &kv_num_heads).IsOK() && kv_num_heads > 0); + kv_num_heads_ = static_cast(kv_num_heads); + + scale_ = info.GetAttrOrDefault("scale", 0.0f); + + do_rotary_ = info.GetAttrOrDefault("do_rotary", 0) == 1; + rotary_interleaved_ = info.GetAttrOrDefault("rotary_interleaved", 0) == 1; + + local_window_size_ = has_local ? 
static_cast(info.GetAttrOrDefault("local_window_size", -1)) : -1; + } + + int num_heads_; // number of attention heads of Q + int kv_num_heads_; // number of attention heads of K or V + float scale_; // the scaling factor applied before softmax + bool do_rotary_; // whether or not to use rotary embeddings bool rotary_interleaved_; + int local_window_size_; template Status ApplyAttention(const T* Q, // Q data with shape BxNxSxH diff --git a/onnxruntime/contrib_ops/cpu/bert/group_query_attention.cc b/onnxruntime/contrib_ops/cpu/bert/group_query_attention.cc index cad9274e68149..97388a9d6bce8 100644 --- a/onnxruntime/contrib_ops/cpu/bert/group_query_attention.cc +++ b/onnxruntime/contrib_ops/cpu/bert/group_query_attention.cc @@ -1,11 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "group_query_attention.h" -#include "group_query_attention_helper.h" -#include "attention_utils.h" -#include "rotary_embedding.h" -#include "rotary_embedding_helper.h" +#include "contrib_ops/cpu/bert/group_query_attention.h" +#include "contrib_ops/cpu/bert/group_query_attention_helper.h" +#include "contrib_ops/cpu/bert/rotary_helper.h" +#include "contrib_ops/cpu/bert/attention_utils.h" +#include "contrib_ops/cpu/bert/rotary_embedding.h" +#include "contrib_ops/cpu/bert/rotary_embedding_helper.h" #include "core/framework/tensorprotoutils.h" #include "core/graph/onnx_protobuf.h" @@ -33,19 +34,8 @@ ONNX_OPERATOR_TYPED_KERNEL_EX( GroupQueryAttention); template -GroupQueryAttention::GroupQueryAttention(const OpKernelInfo& info) : OpKernel(info), GQAAttentionBase(info, false) { - int64_t num_heads = 0; - int64_t kv_num_heads = 0; - ORT_ENFORCE(info.GetAttr("num_heads", &num_heads).IsOK() && num_heads > 0); - ORT_ENFORCE(info.GetAttr("kv_num_heads", &kv_num_heads).IsOK() && kv_num_heads > 0); - num_heads_ = static_cast(num_heads); - kv_num_heads_ = static_cast(kv_num_heads); - - mask_filter_value_ = info.GetAttrOrDefault("mask_filter_value", -10000.0f); - local_window_size_ = static_cast(info.GetAttrOrDefault("local_window_size", -1)); - do_rotary_ = info.GetAttrOrDefault("do_rotary", 0) == 1; - rotary_interleaved_ = info.GetAttrOrDefault("rotary_interleaved", 0) == 1; -} +GroupQueryAttention::GroupQueryAttention(const OpKernelInfo& info) + : OpKernel(info), GQAAttentionBase(info, true) {} template Status GroupQueryAttention::Compute(OpKernelContext* context) const { @@ -174,14 +164,14 @@ Status GroupQueryAttention::Compute(OpKernelContext* context) const { if (packed_qkv) { const T* v_input = k_input + kv_num_heads_ * sequence_length * head_size; T* v_rotary = k_rotary + kv_num_heads_ * sequence_length * head_size; - ORT_RETURN_IF_ERROR(group_query_attention_helper::PackVIntoRotaryQKV(tp, - parameters.batch_size, - parameters.sequence_length, - parameters.num_heads, - parameters.kv_num_heads, - parameters.head_size, - v_input, - v_rotary)); + ORT_RETURN_IF_ERROR(rotary_helper::PackVIntoRotaryQKV(tp, + parameters.batch_size, + parameters.sequence_length, + parameters.num_heads, + parameters.kv_num_heads, + parameters.head_size, + v_input, + v_rotary)); } } diff --git a/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h b/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h index a7de02452aa58..7ffb72fe55d25 100644 --- a/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h +++ b/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h @@ -263,38 +263,6 @@ Status CheckInputs(const Tensor* query, return 
CheckInputs(query, key, value, past_key, past_value, cos_cache, sin_cache, parameters, num_heads, kv_num_heads, seqlens_k, total_seqlen, scale); } - -template -Status PackVIntoRotaryQKV(concurrency::ThreadPool* tp, - int batch_size, - int sequence_length, - int num_heads, - int kv_num_heads, - int head_size, - const T* input, - T* output) { - int seq_stride = head_size; - int head_stride = sequence_length * seq_stride; - int batch_stride = (num_heads + 2 * kv_num_heads) * head_stride; - - const int loop_len = batch_size * sequence_length * kv_num_heads; - const double cost = static_cast(head_size); - ThreadPool::TryParallelFor(tp, loop_len, cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { - for (std::ptrdiff_t ptr = begin; ptr != end; ++ptr) { - const int b = static_cast((ptr / kv_num_heads) / sequence_length); - const int s = static_cast((ptr / kv_num_heads) % sequence_length); - const int n = static_cast(ptr % kv_num_heads); - const int block_offset = b * batch_stride + s * seq_stride + n * head_stride; - const T* input_data = input + block_offset; - T* output_data = output + block_offset; - for (int i = 0; i < head_size; i++) { - output_data[i] = input_data[i]; - } - } - }); - return Status::OK(); -} - } // namespace group_query_attention_helper } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/bert/rotary_helper.h b/onnxruntime/contrib_ops/cpu/bert/rotary_helper.h new file mode 100644 index 0000000000000..714d962dfb34e --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/bert/rotary_helper.h @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/common.h" +#include "core/providers/common.h" +#include "contrib_ops/cpu/bert/attention_common.h" + +namespace onnxruntime { +namespace contrib { +namespace rotary_helper { + +template +Status PackVIntoRotaryQKV(concurrency::ThreadPool* tp, + int batch_size, + int sequence_length, + int num_heads, + int kv_num_heads, + int head_size, + const T* input, + T* output) { + int seq_stride = head_size; + int head_stride = sequence_length * seq_stride; + int batch_stride = (num_heads + 2 * kv_num_heads) * head_stride; + + const int loop_len = batch_size * sequence_length * kv_num_heads; + const double cost = static_cast(head_size); + ThreadPool::TryParallelFor(tp, loop_len, cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { + for (std::ptrdiff_t ptr = begin; ptr != end; ++ptr) { + const int b = static_cast((ptr / kv_num_heads) / sequence_length); + const int s = static_cast((ptr / kv_num_heads) % sequence_length); + const int n = static_cast(ptr % kv_num_heads); + const int block_offset = b * batch_stride + s * seq_stride + n * head_stride; + const T* input_data = input + block_offset; + T* output_data = output + block_offset; + for (int i = 0; i < head_size; i++) { + output_data[i] = input_data[i]; + } + } + }); + return Status::OK(); +} + +} // namespace rotary_helper +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index e8ca4370135cc..90a51fda0b188 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -21,6 +21,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GreedySearch); class 
ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, MultiHeadAttention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GroupQueryAttention); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SparseAttention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, RotaryEmbedding); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, Sampling); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, AttnLSTM); @@ -281,6 +282,7 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc new file mode 100644 index 0000000000000..e337f41a8688d --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.cc @@ -0,0 +1,226 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "contrib_ops/cpu/sparse/sparse_attention.h" +#include "contrib_ops/cpu/sparse/sparse_attention_helper.h" +#include "contrib_ops/cpu/bert/rotary_helper.h" +#include "contrib_ops/cpu/bert/attention_utils.h" +#include "contrib_ops/cpu/bert/rotary_embedding.h" +#include "contrib_ops/cpu/bert/rotary_embedding_helper.h" + +#include "core/framework/tensorprotoutils.h" +#include "core/graph/onnx_protobuf.h" +#include "core/common/safeint.h" +#include "core/platform/threadpool.h" + +#include +#include + +using onnxruntime::concurrency::ThreadPool; + +namespace onnxruntime { +namespace contrib { + +ONNX_OPERATOR_TYPED_KERNEL_EX( + SparseAttention, + kMSDomain, + 1, + float, + kCpuExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .TypeConstraint("M", DataTypeImpl::GetTensorType()), + SparseAttention); + +template +SparseAttention::SparseAttention(const OpKernelInfo& info) : OpKernel(info), SparseAttentionBase(info) { +} + +template +Status SparseAttention::Compute(OpKernelContext* context) const { + const Tensor* query = context->Input(0); + const Tensor* key = context->Input(1); + const Tensor* value = context->Input(2); + const Tensor* past_key = context->Input(3); + const Tensor* past_value = context->Input(4); + const Tensor* block_row_indices = context->Input(5); + const Tensor* block_col_indices = context->Input(6); + const Tensor* total_seq_len = context->Input(7); + const Tensor* total_key_lengths = context->Input(8); + const Tensor* cos_cache = context->Input(9); + const Tensor* sin_cache = context->Input(10); + + SparseAttentionParameters parameters = {}; + + // Parameters from node attribute shall be set before calling CheckInputs + parameters.sparse_block_size = sparse_block_size_; + parameters.num_heads = num_heads_; + parameters.kv_num_heads = kv_num_heads_; + parameters.scale = scale_; + parameters.do_rotary = do_rotary_; + parameters.rotary_interleaved = rotary_interleaved_; + ORT_RETURN_IF_ERROR(sparse_attention_helper::CheckInputs(¶meters, + query, + key, + value, + past_key, + past_value, + cos_cache, + sin_cache, + block_row_indices, + block_col_indices, + total_key_lengths, + total_seq_len)); + + const int batch_size = parameters.batch_size; + const int sequence_length = parameters.sequence_length; + int 
q_hidden_size = parameters.hidden_size; + + std::vector output_shape(3); + output_shape[0] = static_cast(batch_size); + output_shape[1] = static_cast(sequence_length); + output_shape[2] = static_cast(q_hidden_size); + Tensor* output = context->Output(0, output_shape); + + constexpr bool past_present_share_buffer = true; // Only supports share buffer for past and present for now. + parameters.past_present_share_buffer = past_present_share_buffer; + + int head_size = parameters.head_size; + const int cache_length = past_present_share_buffer + ? parameters.max_cache_sequence_length + : parameters.total_sequence_length; + std::vector present_k_shape({static_cast(batch_size), + static_cast(kv_num_heads_), + static_cast(cache_length), + static_cast(head_size)}); + std::vector present_v_shape({static_cast(batch_size), + static_cast(kv_num_heads_), + static_cast(cache_length), + static_cast(head_size)}); + Tensor* present_key = context->Output(1, present_k_shape); + Tensor* present_value = context->Output(2, present_v_shape); + + // Check past and present share buffer. + if (past_present_share_buffer) { + ORT_ENFORCE(past_key->DataRaw() == present_key->DataRaw() && past_value->DataRaw() == present_value->DataRaw()); + } + + AllocatorPtr allocator; + ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator)); + + auto element_type = DataTypeImpl::GetType(); + OrtValue Q; + OrtValue K; + OrtValue V; + + const bool packed_qkv = parameters.is_packed_qkv; + if (packed_qkv) { + ORT_RETURN_IF_ERROR(MaybeTransposeToBNSH( + allocator, batch_size, num_heads_ + 2 * kv_num_heads_, sequence_length, head_size, query, Q)); + } else { + ORT_RETURN_IF_ERROR(MaybeTransposeToBNSH( + allocator, batch_size, num_heads_, sequence_length, head_size, query, Q)); + ORT_RETURN_IF_ERROR(MaybeTransposeToBNSH( + allocator, batch_size, kv_num_heads_, sequence_length, head_size, key, K)); + ORT_RETURN_IF_ERROR(MaybeTransposeToBNSH( + allocator, batch_size, kv_num_heads_, sequence_length, head_size, value, V)); + } + + if (do_rotary_) { + rotary_embedding_helper::RotaryParameters rotary_params = {}; + rotary_params.batch_size = batch_size; + rotary_params.sequence_length = sequence_length; + rotary_params.hidden_size = q_hidden_size; + rotary_params.head_size = head_size; + rotary_params.rotary_embedding_dim = parameters.rotary_dim; + rotary_params.num_heads = num_heads_; + rotary_params.max_sequence_length = sequence_length; // unused + rotary_params.seq_stride = head_size; + rotary_params.head_stride = sequence_length * rotary_params.seq_stride; + rotary_params.batch_stride = (packed_qkv ? (num_heads_ + 2 * kv_num_heads_) : num_heads_) * + rotary_params.head_stride; + rotary_params.position_ids_format = sequence_length == 1 ? 1 : 0; + rotary_params.transposed = true; + auto* tp = context->GetOperatorThreadPool(); + + const bool is_prompt = parameters.total_sequence_length == parameters.sequence_length; + std::vector pos_ids(is_prompt ? 1 : batch_size * sequence_length); + if (is_prompt) { + pos_ids[0] = static_cast(0); + } else if (sequence_length == 1) { + for (int b = 0; b < batch_size; b++) { + pos_ids[b] = static_cast(total_key_lengths->Data()[b]) - 1; + } + } else { + // This supports a rare case that sequence_length > 1 when it is not prompt. 
+ for (int b = 0; b < batch_size; b++) { + for (int s = 0; s < sequence_length; s++) { + pos_ids[b * sequence_length + s] = static_cast(total_key_lengths->Data()[b]) - + (sequence_length - s); + } + } + } + + const T* q_input; + const T* k_input; + T* q_rotary; + T* k_rotary; + if (packed_qkv) { + OrtValue RotaryQKV; + TensorShape qkv_shape({batch_size, num_heads_ + 2 * kv_num_heads_, sequence_length, head_size}); + Tensor::InitOrtValue(element_type, qkv_shape, allocator, RotaryQKV); + q_input = Q.Get().Data(); + k_input = q_input + num_heads_ * sequence_length * head_size; + q_rotary = RotaryQKV.GetMutable()->MutableData(); + k_rotary = q_rotary + num_heads_ * sequence_length * head_size; + Q = RotaryQKV; + } else { + OrtValue RotaryQ; + TensorShape q_shape({batch_size, num_heads_, sequence_length, head_size}); + Tensor::InitOrtValue(element_type, q_shape, allocator, RotaryQ); + OrtValue RotaryK; + TensorShape k_shape({batch_size, kv_num_heads_, sequence_length, head_size}); + Tensor::InitOrtValue(element_type, k_shape, allocator, RotaryK); + q_input = Q.Get().Data(); + k_input = K.Get().Data(); + q_rotary = RotaryQ.GetMutable()->MutableData(); + k_rotary = RotaryK.GetMutable()->MutableData(); + Q = RotaryQ; + K = RotaryK; + } + + ORT_RETURN_IF_ERROR(RunRotaryEmbedding(tp, rotary_params, q_input, + pos_ids.data(), cos_cache->Data(), + sin_cache->Data(), q_rotary, rotary_interleaved_)); + + rotary_params.num_heads = kv_num_heads_; + rotary_params.hidden_size = parameters.kv_hidden_size; + if (!packed_qkv) { + rotary_params.batch_stride = kv_num_heads_ * rotary_params.head_stride; + } + ORT_RETURN_IF_ERROR(RunRotaryEmbedding(tp, rotary_params, k_input, + pos_ids.data(), cos_cache->Data(), + sin_cache->Data(), k_rotary, rotary_interleaved_)); + if (packed_qkv) { + const T* v_input = k_input + kv_num_heads_ * sequence_length * head_size; + T* v_rotary = k_rotary + kv_num_heads_ * sequence_length * head_size; + ORT_RETURN_IF_ERROR(rotary_helper::PackVIntoRotaryQKV(tp, + parameters.batch_size, + parameters.sequence_length, + parameters.num_heads, + parameters.kv_num_heads, + parameters.head_size, + v_input, + v_rotary)); + } + } + + ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator)); + // Compute the attention score and apply the score to V + return ApplyAttention(Q.Get().Data(), packed_qkv ? nullptr : K.Get().Data(), + packed_qkv ? nullptr : V.Get().Data(), past_key, past_value, + output, present_key, present_value, + total_key_lengths, block_row_indices, block_col_indices, parameters, allocator, context); +} +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.h b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.h new file mode 100644 index 0000000000000..4267d85c0e35d --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention.h @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/common/common.h" +#include "core/framework/op_kernel.h" +#include "contrib_ops/cpu/sparse/sparse_attention_base.h" + +namespace onnxruntime { +namespace contrib { + +template +class SparseAttention final : public OpKernel, public SparseAttentionBase { + public: + SparseAttention(const OpKernelInfo& info); + Status Compute(OpKernelContext* context) const override; +}; + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/sparse/sparse_attention_base.h b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention_base.h new file mode 100644 index 0000000000000..cf66bd8407126 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention_base.h @@ -0,0 +1,390 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "contrib_ops/cpu/bert/attention_helper.h" + +#include "core/common/common.h" +#include "contrib_ops/cpu/bert/attention_common.h" +#include "core/common/safeint.h" +#include "core/framework/op_kernel.h" +#include "contrib_ops/cpu/utils/dump_tensor.h" + +namespace onnxruntime { +namespace contrib { + +class SparseAttentionBase { + protected: + SparseAttentionBase(const OpKernelInfo& info) { + int64_t num_heads = 0; + ORT_ENFORCE(info.GetAttr("num_heads", &num_heads).IsOK() && num_heads > 0); + num_heads_ = static_cast(num_heads); + + int64_t kv_num_heads = 0; + ORT_ENFORCE(info.GetAttr("kv_num_heads", &kv_num_heads).IsOK() && kv_num_heads > 0); + kv_num_heads_ = static_cast(kv_num_heads); + + scale_ = info.GetAttrOrDefault("scale", 0.0f); + + do_rotary_ = info.GetAttrOrDefault("do_rotary", 0) == 1; + rotary_interleaved_ = info.GetAttrOrDefault("rotary_interleaved", 0) == 1; + + int64_t sparse_block_size = 0; + ORT_ENFORCE(info.GetAttr("sparse_block_size", &sparse_block_size).IsOK()); + sparse_block_size_ = static_cast(sparse_block_size); + } + + int num_heads_; // number of attention heads of Q + int kv_num_heads_; // number of attention heads of K or V + float scale_; // the scaling factor applied before softmax + bool do_rotary_; // whether or not to use rotary embeddings + bool rotary_interleaved_; + int sparse_block_size_; + + template + Status ApplyAttention(const T* Q, // Q data with shape BxNxSxH + const T* K, // K data with shape BxN_kvxSxH + const T* V, // V data with shape BxN_kvxSxH + const Tensor* past_key, // past K input tensor + const Tensor* past_value, // past V input tensor + Tensor* output, // output tensor + Tensor* present_key, // present K output tensor + Tensor* present_value, // present V output tensor + const Tensor* total_key_lengths, // total key lengths tensor + const Tensor* block_row_indices, // block row indices + const Tensor* block_col_indices, // block column indices + SparseAttentionParameters& parameters, // attention parameters + AllocatorPtr allocator, // allocator for temporary tensors + OpKernelContext* context) const { + const int batch_size = parameters.batch_size; + const int sequence_length = parameters.sequence_length; + const int head_size = parameters.head_size; + const bool packed_qkv = parameters.is_packed_qkv; + + int past_buffer_sequence_length = static_cast(past_key->Shape().GetDims()[2]); + int present_buffer_sequence_length = static_cast(present_key->Shape().GetDims()[2]); + + // Allocate a buffer to store Softmax(QK) + size_t bytes = SafeInt(batch_size) * num_heads_ * sequence_length * parameters.total_sequence_length * sizeof(T); + auto attention_probs = allocator->Alloc(bytes); + 
BufferUniquePtr scratch_buffer(attention_probs, BufferDeleter(allocator)); + + bool past_present_share_buffer = parameters.past_present_share_buffer; + assert(past_present_share_buffer); + + auto* tp = context->GetOperatorThreadPool(); + + const T* k = packed_qkv ? Q + num_heads_ * sequence_length * head_size : K; + ComputeAttentionProbs( + static_cast(attention_probs), Q, k, total_key_lengths->Data(), + batch_size, sequence_length, parameters.total_sequence_length, + past_buffer_sequence_length, present_buffer_sequence_length, head_size, + past_key->Data(), present_key->MutableData(), past_present_share_buffer, packed_qkv, + block_row_indices->Data(), block_col_indices->Data(), parameters, tp); + + // Compute the attentionScore * Value: out(B, N, S, H_v) = attention_probs(B, N, S, T) x V(B, N, T, H_v) + const T* v = packed_qkv ? Q + (num_heads_ + kv_num_heads_) * sequence_length * head_size : V; + ComputeVxAttentionScore( + output->MutableData(), static_cast(attention_probs), v, + total_key_lengths->Data(), + batch_size, sequence_length, parameters.total_sequence_length, + past_buffer_sequence_length, present_buffer_sequence_length, head_size, parameters.hidden_size, + past_value->Data(), present_value->MutableData(), past_present_share_buffer, packed_qkv, tp); + + return Status::OK(); + } + + private: + // Helper function to compute the attention probs. It does 2 things: + // attention_probs(B, N, S, T) = 1/sqrt(H) x Q(B, N, S, H) x K'(B, N, T, H -> B, N, H, T) + // attention_probs(B, N, S, T) = Softmax(attention_probs) + template + void ComputeAttentionProbs( + T* attention_probs, // output buffer with size BxNxSxT + const T* Q, // query start pointer + const T* K, // key start pointer + const int32_t* total_key_lengths, // total key sequence lengths (past + new) + int batch_size, // batch size + int sequence_length, // sequence length of query or new key + int total_sequence_length, // maximum past_sequence_length + sequence_length + int past_buffer_sequence_length, // sequence length of past_key or past_value + int present_buffer_sequence_length, // sequence length of present_key or present_value + int head_size, // head size of query + const T* past_key, // past key + T* present_key, // present key + bool past_present_share_buffer, // whether past_key and present_key share the buffer + bool packed_qkv, // whether Q, K, V are packed + const int32_t* block_row_indices, // block row indices + const int32_t* block_col_indices, // block column indices + SparseAttentionParameters& parameters, // parameters + ThreadPool* tp) const { // thread pool + const bool is_prompt = (total_sequence_length == sequence_length); + const ptrdiff_t packed_batch_stride = + packed_qkv ? SafeInt(num_heads_ + 2 * kv_num_heads_) * sequence_length * head_size + : SafeInt(0); + const int kv_num_heads_factor = num_heads_ / kv_num_heads_; + const size_t q_input_chunk_length = static_cast(sequence_length) * head_size; // S x H + const size_t kv_input_chunk_length = q_input_chunk_length; + const size_t past_buff_chunk_length = static_cast(past_buffer_sequence_length) * head_size; + const size_t present_buff_chunk_length = static_cast(present_buffer_sequence_length) * head_size; + + const int loop_len = batch_size * num_heads_; + const float alpha = scale_ == 0.0f ? 
1.0f / sqrt(static_cast(head_size)) : scale_; + + TensorOpCost unit_cost; + const ptrdiff_t probs_matrix_bytes = + SafeInt(sequence_length) * total_sequence_length * sizeof(T); + unit_cost.compute_cycles = + static_cast(SafeInt(2) * sequence_length * head_size * total_sequence_length); + unit_cost.bytes_loaded = + static_cast((sequence_length + total_sequence_length) * head_size * sizeof(T)); + unit_cost.bytes_stored = static_cast(probs_matrix_bytes); + + unit_cost.bytes_loaded += static_cast(probs_matrix_bytes); + unit_cost.bytes_stored += static_cast(probs_matrix_bytes); + + // Cost to concatenate current key to cache (assume past and present share buffer). + double bytes_to_copy_key = static_cast(sizeof(T) * sequence_length * head_size); + unit_cost.bytes_loaded += bytes_to_copy_key; + unit_cost.bytes_stored += bytes_to_copy_key; + + DUMP_CPU_TENSOR_INIT(); + DUMP_CPU_TENSOR("block_row_indices", block_row_indices, parameters.num_sparse_layout, parameters.stride_row_indices); + DUMP_CPU_TENSOR("block_col_indices", block_col_indices, parameters.num_sparse_layout, parameters.stride_col_indices); + + // Check whether each layout has sparse (has zero in lower triangular) + std::vector layout_has_sparse(parameters.num_sparse_layout); + for (int layout_index = 0; layout_index < parameters.num_sparse_layout; layout_index++) { + int nonzero_elements = block_row_indices[(layout_index + 1) * parameters.stride_row_indices - 1]; + int dense_nonzero = (parameters.stride_row_indices * (parameters.stride_row_indices - 1)) / 2; + layout_has_sparse[layout_index] = nonzero_elements < dense_nonzero; + DUMP_STRING("layout_has_sparse[", layout_index, "]=", layout_has_sparse[layout_index]); + } + + ThreadPool::TryParallelFor(tp, loop_len, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { + DUMP_STRING("batch_size=", batch_size, ",num_heads=", num_heads_, ",loop_len=", loop_len, ",begin=", begin, ",end=", end); + for (std::ptrdiff_t i = begin; i != end; ++i) { + const int batch_index = static_cast(i) / num_heads_; + const int head_index = static_cast(i) % num_heads_; + const int past_seq_len = is_prompt ? 0 : (static_cast(total_key_lengths[batch_index]) - sequence_length); + const size_t past_chunk_length = static_cast(past_seq_len) * head_size; + const int total_seq_len = total_key_lengths[batch_index]; + + const ptrdiff_t output_offset = SafeInt(i) * sequence_length * total_sequence_length; + T* output = attention_probs + output_offset; + + const T* k; + if (packed_qkv) { + k = K + packed_batch_stride * batch_index + kv_input_chunk_length * (head_index / kv_num_heads_factor); + } else { + k = K + kv_input_chunk_length * (i / kv_num_heads_factor); + } + + // Concatenate past_k + k -> present_k + // TODO: avoid copying mutiple times for a group. 
+ k = ConcatStateChunkGQA(past_key, k, present_key, present_buff_chunk_length, past_buff_chunk_length, + past_chunk_length, kv_input_chunk_length, is_prompt, past_present_share_buffer, + i / kv_num_heads_factor); + + // Compute Q*K' + AttentionMask + // original transposed each iteration + // A: Q (B x N x) S x H (B x N x) S x H S x H + // B: K' (B x N x) T x H (B x N x) H x T H x T + // C: attention_probs (B x N x) S x T (B x N x) S x T S x T + const T* q; + if (packed_qkv) { + q = Q + packed_batch_stride * batch_index + q_input_chunk_length * head_index; + } else { + q = Q + q_input_chunk_length * i; + } + + DUMP_STRING("i=", i, ",batch_index=", batch_index, ",head_index=", head_index, + ",past_seq_len=", past_seq_len, ",total_seq_len=", total_seq_len, ",packed_qkv=", packed_qkv); + DUMP_CPU_TENSOR("Q", q, sequence_length, head_size); + DUMP_CPU_TENSOR("K", k, total_seq_len, head_size); + + math::GemmEx(CblasNoTrans, CblasTrans, sequence_length, total_seq_len, head_size, alpha, q, + head_size, k, head_size, 0.0f /*bata*/, output, total_seq_len, + nullptr); + + DUMP_CPU_TENSOR("QK", output, sequence_length, total_seq_len); + + // Compute Softmax for causal and output result in place. + T* output_softmax = output; + + int layout_id = head_index % parameters.num_sparse_layout; + bool is_sparse_layout = layout_has_sparse[layout_id]; + + DUMP_STRING("layout_id=", layout_id, ",is_sparse_layout=", is_sparse_layout); + + if (!is_sparse_layout) { // dense + for (int q_id = 0; q_id < sequence_length; q_id++) { + int causal_length = past_seq_len + q_id + 1; + ComputeAttentionSoftmaxInplace(output_softmax, 1, causal_length, nullptr); + for (int remain_seq_id = causal_length; remain_seq_id < total_seq_len; remain_seq_id++) { + output_softmax[remain_seq_id] = 0.f; + } + output_softmax += total_seq_len; + } + } else { // sparse + int q_id = 0; + bool has_sparse = false; + std::vector mask(parameters.max_sequence_length); + + const int32_t* layout_row_indices = block_row_indices + layout_id * parameters.stride_row_indices; + const int32_t* layout_col_indices = block_col_indices + layout_id * parameters.stride_col_indices; + do { + int q_abs_position = past_seq_len + q_id; + int causal_length = q_abs_position + 1; + + // Update mask when query token is the first or at the boundary of sparse block. 
+ if (q_id == 0 || q_abs_position % parameters.sparse_block_size == 0) { + int row_in_sparse_layout = q_abs_position / parameters.sparse_block_size; + int start_in_col_indices = layout_row_indices[row_in_sparse_layout]; + int end_in_col_indices = layout_row_indices[row_in_sparse_layout + 1]; + int nonzero_blocks = end_in_col_indices - start_in_col_indices; + has_sparse = (nonzero_blocks != row_in_sparse_layout + 1); + + DUMP_STRING("q_id=", q_id, + ",q_abs_position=", q_abs_position, + ",sparse_block_size=", parameters.sparse_block_size, + ",row_in_sparse_layout=", row_in_sparse_layout, + ",start_in_col_indices=", start_in_col_indices, + ",end_in_col_indices=", end_in_col_indices, + ",nonzero_blocks=", nonzero_blocks, + ",has_sparse=", has_sparse); + + // Expand attention mask for current row of q_id + if (has_sparse) { + int block_aligned_length = q_abs_position / parameters.sparse_block_size * parameters.sparse_block_size + parameters.sparse_block_size; + DUMP_STRING("block_aligned_length=", block_aligned_length); + + std::fill_n(mask.begin(), block_aligned_length, 0); + for (int j = start_in_col_indices; j < end_in_col_indices; j++) { + int col_in_sparse_layout = layout_col_indices[j]; + + int offset = col_in_sparse_layout * parameters.sparse_block_size; + for (int s = 0; s < parameters.sparse_block_size; s++, offset++) { + mask[offset] = 1; + } + } + + DUMP_CPU_TENSOR("mask", mask, block_aligned_length); + } + } + + // Update inline according to attention mask. + if (has_sparse) { + for (int s = 0; s < causal_length; s++) { + if (mask[s] == 0) + output_softmax[s] = std::numeric_limits::lowest(); + } + } + ComputeAttentionSoftmaxInplace(output_softmax, 1, causal_length, nullptr); + + for (int remain_seq_id = causal_length; remain_seq_id < total_seq_len; remain_seq_id++) { + output_softmax[remain_seq_id] = 0.f; + } + + output_softmax += total_seq_len; + q_id++; + + } while (q_id < sequence_length); + } + + DUMP_CPU_TENSOR("softmax", output, sequence_length, total_seq_len); + } + }); + } + + template + void ComputeVxAttentionScore(T* output, // buffer for the result with size BxSxNxH + const T* attention_probs, // Softmax of Q*K' with size BxNxSxT + const T* V, // v value with size BxN_kvxSxH + const int32_t* total_key_lengths, // total sequence lengths + int batch_size, // batch size + int sequence_length, // sequence length + int total_sequence_length, // maximum past_sequence_length + sequence_length + int past_buffer_sequence_length, // sequence length in past state + int present_buffer_sequence_length, // sequence length in past state + int head_size, // head size of Q, K, V + int hidden_size, // hidden size of Output + const T* past_value, // past value only + T* present_value, // present value only + bool past_present_share_buffer, // whether past_key and present_key share the buffer + bool packed_qkv, // whether Q, K, V are packed + ThreadPool* tp) const { + const bool is_prompt = sequence_length == total_sequence_length; + const ptrdiff_t packed_batch_stride = + packed_qkv ? SafeInt(num_heads_ + 2 * kv_num_heads_) * sequence_length * head_size + : SafeInt(0); + const int kv_num_heads_factor = num_heads_ / kv_num_heads_; + + const int kv_input_chunk_length = sequence_length * head_size; // S x H + const size_t past_buff_chunk_length = static_cast(past_buffer_sequence_length) * head_size; + const size_t present_buff_chunk_length = static_cast(present_buffer_sequence_length) * head_size; + + // The cost of Gemm. 
+ TensorOpCost unit_cost; + // Here we use total_sequence_length to estimate total_key_lengths[batch_index] used in GEMM. + unit_cost.compute_cycles = + static_cast(SafeInt(2) * sequence_length * head_size * total_sequence_length); + unit_cost.bytes_loaded = static_cast(SafeInt(sequence_length + head_size) * + total_sequence_length * sizeof(T)); + unit_cost.bytes_stored = static_cast(sequence_length * head_size * sizeof(T)); + + if (present_value) { + double bytes_to_copy_value = static_cast(sizeof(T) * sequence_length * head_size); + unit_cost.bytes_loaded += bytes_to_copy_value; + unit_cost.bytes_stored += bytes_to_copy_value; + } + + DUMP_CPU_TENSOR_INIT(); + + ThreadPool::TryParallelFor( + tp, SafeInt(batch_size) * num_heads_, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { + DUMP_STRING("batch_size=", batch_size, ",num_heads=", num_heads_, ",begin=", begin, ",end=", end); + + for (std::ptrdiff_t i = begin; i != end; ++i) { + const int batch_index = static_cast(i / num_heads_); + const int head_index = static_cast(i % num_heads_); + const int past_seq_len = is_prompt ? 0 : (static_cast(total_key_lengths[batch_index]) - sequence_length); + const size_t past_chunk_length = static_cast(past_seq_len) * head_size; + const int total_seq_len = total_key_lengths[batch_index]; + + DUMP_STRING("i=", i, ",batch_index=", batch_index, ",head_index=", head_index, + ",past_seq_len=", past_seq_len, ",total_seq_len=", total_seq_len, ",packed_qkv=", packed_qkv); + + const T* v; + if (packed_qkv) { + v = V + packed_batch_stride * batch_index + kv_input_chunk_length * (head_index / kv_num_heads_factor); + } else { + v = V + kv_input_chunk_length * (i / kv_num_heads_factor); + } + + // Concatenate past_v + v -> present_v + v = ConcatStateChunkGQA(past_value, v, present_value, present_buff_chunk_length, past_buff_chunk_length, + past_chunk_length, kv_input_chunk_length, is_prompt, past_present_share_buffer, + i / kv_num_heads_factor); + + DUMP_CPU_TENSOR("present_value", v, total_seq_len, head_size); + + T* output_current = output + (batch_index * sequence_length * num_heads_ + head_index) * head_size; + ptrdiff_t attention_probs_offset = SafeInt(sequence_length) * total_seq_len * i; + + DUMP_CPU_TENSOR("attention_probs", attention_probs + attention_probs_offset, sequence_length, total_seq_len); + + math::GemmEx(CblasNoTrans, CblasNoTrans, sequence_length, head_size, total_seq_len, + 1.f, /*alpha*/ + attention_probs + attention_probs_offset, total_seq_len, v, + head_size, 0.0f /*beta*/, output_current, hidden_size, nullptr); + + DUMP_CPU_TENSOR("out", attention_probs + attention_probs_offset, sequence_length, head_size); + } + }); + } +}; + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/sparse/sparse_attention_helper.h b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention_helper.h similarity index 98% rename from onnxruntime/contrib_ops/cuda/sparse/sparse_attention_helper.h rename to onnxruntime/contrib_ops/cpu/sparse/sparse_attention_helper.h index a5f1d50e618af..ca69370b4ce17 100644 --- a/onnxruntime/contrib_ops/cuda/sparse/sparse_attention_helper.h +++ b/onnxruntime/contrib_ops/cpu/sparse/sparse_attention_helper.h @@ -21,7 +21,7 @@ Status CheckInputs(void* params, const Tensor* sin_cache, const Tensor* block_row_indices, const Tensor* block_col_indices, - const Tensor* seqlens_k_total, + const Tensor* total_key_lengths, const Tensor* total_seq_len) { // No packing for q/k/v: // query (batch_size, sequence_length, num_heads * head_size) @@ -36,7 
+36,7 @@ Status CheckInputs(void* params, // past_value (batch_size, kv_num_heads, max_cache_sequence_length, head_size) // block_row_indices (num_layout, max_blocks + 1), where max_blocks = max_sequence_length / sparse_block_size // block_col_indices (num_layout, max_nnz) - // seqlens_k_total (batch_size) when do_rotary is True, optional otherwise + // total_key_lengths (batch_size) // total_seq_len (1) // cos_cache (max_rotary_sequence_length, rotary_dim / 2) when do_rotary is true. // sin_cache (max_rotary_sequence_length, rotary_dim / 2) when do_rotary is true. @@ -197,7 +197,7 @@ Status CheckInputs(void* params, } // Check the shape of total_key_sequence_lengths. We do not check the values here. - const auto& k_len_dim = seqlens_k_total->Shape().GetDims(); + const auto& k_len_dim = total_key_lengths->Shape().GetDims(); if (k_len_dim.size() != 1 && k_len_dim[0] != batch_size) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "key_total_sequence_lengths must have shape (batch_size)."); diff --git a/onnxruntime/contrib_ops/cuda/sparse/sparse_attention.cc b/onnxruntime/contrib_ops/cuda/sparse/sparse_attention.cc index 7d3f6eb9295d8..865a1dc29ce47 100644 --- a/onnxruntime/contrib_ops/cuda/sparse/sparse_attention.cc +++ b/onnxruntime/contrib_ops/cuda/sparse/sparse_attention.cc @@ -3,7 +3,7 @@ #include "contrib_ops/cuda/sparse/sparse_attention_impl.h" #include "contrib_ops/cuda/sparse/sparse_attention.h" -#include "contrib_ops/cuda/sparse/sparse_attention_helper.h" +#include "contrib_ops/cpu/sparse/sparse_attention_helper.h" #include "contrib_ops/cuda/sparse/sparse_attention_v1/sparse_attention_v1_api.h" #include "contrib_ops/cuda/sparse/sparse_attention_v2/sparse_attention_v2_api.h" #include "core/platform/env_var_utils.h" diff --git a/onnxruntime/core/graph/contrib_ops/bert_defs.cc b/onnxruntime/core/graph/contrib_ops/bert_defs.cc index 2a14ba1db4bb7..7272a949f7218 100644 --- a/onnxruntime/core/graph/contrib_ops/bert_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/bert_defs.cc @@ -1254,7 +1254,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( "present_value", "Updated value cache with shape (batch_size, kv_num_heads, max_cache_sequence_length, head_size).", "T") - .TypeConstraint("T", {"tensor(float16)", "tensor(bfloat16)"}, "Constrain input and output to float tensors.") + .TypeConstraint("T", {"tensor(float)", "tensor(float16)", "tensor(bfloat16)"}, "Constrain input and output to float tensors.") .TypeConstraint("M", {"tensor(int32)"}, "Constrain integer type.") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { SparseAttentionTypeAndShapeInference(ctx, 3); diff --git a/onnxruntime/test/python/transformers/test_sparse_attention.py b/onnxruntime/test/python/transformers/test_sparse_attention.py index f33a56ee4e1f9..f18bcdba65579 100644 --- a/onnxruntime/test/python/transformers/test_sparse_attention.py +++ b/onnxruntime/test/python/transformers/test_sparse_attention.py @@ -8,14 +8,16 @@ """ import math import unittest -from typing import Optional +from typing import Optional, Union import torch +from benchmark_mha import InputFormats from onnx import TensorProto, helper +from parameterized import parameterized from torch import Tensor -from onnxruntime import InferenceSession, SessionOptions -from onnxruntime.transformers.io_binding_helper import CudaSession, GpuBindingManager +from onnxruntime import InferenceSession, SessionOptions, get_available_providers +from onnxruntime.transformers.io_binding_helper import CudaSession ENABLE_DEBUG = False @@ -34,6 +36,7 @@ def 
__init__( softmax_scale: Optional[float], do_rotary: bool, rotary_interleaved: bool, + provider: str = "CUDAExecutionProvider", device="cuda", dtype=torch.float16, share_buffer: bool = True, @@ -62,11 +65,13 @@ def __init__( self.do_rotary = do_rotary self.rotary_interleaved = rotary_interleaved + + self.provider = provider self.device = device + self.dtype = dtype self.share_buffer = share_buffer self.is_packed_qkv = is_packed_qkv - self.dtype = dtype def shape_dict(self): shapes = { @@ -106,7 +111,7 @@ def get_cos_sin_cache(self, dtype): def random_inputs(self): device = self.device # Since bfloat16 is not supported in ORT python I/O binding API, we always use float16 as model inputs. - dtype = torch.float16 + dtype = torch.float16 if self.dtype == torch.bfloat16 else self.dtype # Always use non-packed qkv to generate same inputs for Torch and ORT. packed = self.is_packed_qkv # Save the original value. @@ -153,7 +158,9 @@ def __init__( softmax_scale=None, do_rotary: bool = False, rotary_interleaved: bool = False, + provider: str = "CUDAExecutionProvider", device="cuda", + dtype=torch.float16, local_window_size: int = -1, attention_mask=None, is_packed_qkv=False, @@ -162,17 +169,19 @@ def __init__( ): super().__init__( "GroupQueryAttention", - batch_size, - sequence_length, - max_sequence_length, - past_sequence_length, - num_heads, - kv_num_heads, - head_size, - softmax_scale, - do_rotary, - rotary_interleaved, - device, + batch_size=batch_size, + sequence_length=sequence_length, + max_sequence_length=max_sequence_length, + past_sequence_length=past_sequence_length, + num_heads=num_heads, + kv_num_heads=kv_num_heads, + head_size=head_size, + softmax_scale=softmax_scale, + do_rotary=do_rotary, + rotary_interleaved=rotary_interleaved, + provider=provider, + device=device, + dtype=dtype, is_packed_qkv=is_packed_qkv, max_cache_sequence_length=max_cache_sequence_length, max_rotary_sequence_length=max_rotary_sequence_length, @@ -220,24 +229,28 @@ def __init__( softmax_scale=None, do_rotary: bool = False, rotary_interleaved: bool = False, + provider: str = "CUDAExecutionProvider", device="cuda", + dtype=torch.float16, is_packed_qkv=False, max_cache_sequence_length=None, max_rotary_sequence_length=None, ): super().__init__( "SparseAttention", - batch_size, - sequence_length, - max_sequence_length, - past_sequence_length, - num_heads, - kv_num_heads, - head_size, - softmax_scale, - do_rotary, - rotary_interleaved, - device, + batch_size=batch_size, + sequence_length=sequence_length, + max_sequence_length=max_sequence_length, + past_sequence_length=past_sequence_length, + num_heads=num_heads, + kv_num_heads=kv_num_heads, + head_size=head_size, + softmax_scale=softmax_scale, + do_rotary=do_rotary, + rotary_interleaved=rotary_interleaved, + provider=provider, + device=device, + dtype=dtype, is_packed_qkv=is_packed_qkv, max_cache_sequence_length=max_cache_sequence_length, max_rotary_sequence_length=max_rotary_sequence_length, @@ -288,17 +301,19 @@ def random_inputs(self): def get_comparable_ort_gqa_config(self, use_local=False) -> GroupQueryAttentionConfig: return GroupQueryAttentionConfig( - self.batch_size, - self.sequence_length, - self.max_sequence_length, - self.past_sequence_length, - self.num_heads, - self.kv_num_heads, - self.head_size, - self.softmax_scale, - self.do_rotary, - self.rotary_interleaved, - self.device, + batch_size=self.batch_size, + sequence_length=self.sequence_length, + max_sequence_length=self.max_sequence_length, + past_sequence_length=self.past_sequence_length, + 
num_heads=self.num_heads, + kv_num_heads=self.kv_num_heads, + head_size=self.head_size, + softmax_scale=self.softmax_scale, + do_rotary=self.do_rotary, + rotary_interleaved=self.rotary_interleaved, + provider=self.provider, + device=self.device, + dtype=self.dtype, local_window_size=self.local_blocks * self.sparse_block_size if use_local else -1, is_packed_qkv=self.is_packed_qkv, max_cache_sequence_length=self.max_cache_sequence_length, @@ -314,17 +329,19 @@ def get_comparable_torch_gqa_config(self, use_sparse=False) -> GroupQueryAttenti attention_mask = attention_mask[:, :, -self.sequence_length :, :] return GroupQueryAttentionConfig( - self.batch_size, - self.sequence_length, - self.max_sequence_length, - self.past_sequence_length, - self.num_heads, - self.kv_num_heads, - self.head_size, - self.softmax_scale, - self.do_rotary, - self.rotary_interleaved, - self.device, + batch_size=self.batch_size, + sequence_length=self.sequence_length, + max_sequence_length=self.max_sequence_length, + past_sequence_length=self.past_sequence_length, + num_heads=self.num_heads, + kv_num_heads=self.kv_num_heads, + head_size=self.head_size, + softmax_scale=self.softmax_scale, + do_rotary=self.do_rotary, + rotary_interleaved=self.rotary_interleaved, + provider=self.provider, + device=self.device, + dtype=self.dtype, attention_mask=attention_mask, is_packed_qkv=False, # torch reference implementation does not support packed qkv. max_cache_sequence_length=self.max_cache_sequence_length, @@ -375,7 +392,7 @@ def get_dense_mask(block_mask, total_seq_len, query_seq_len, block_size): def create_sparse_attention_onnx_model(config: SparseAttentionConfig): # ORT Python I/O binding API does not support bf16, so always use fp16 as graph inputs/outputs. - io_float_type = TensorProto.FLOAT16 + io_float_type = TensorProto.FLOAT if config.dtype == torch.float32 else TensorProto.FLOAT16 suffix = "_bf16" if config.dtype == torch.bfloat16 else "" nodes = [ @@ -487,9 +504,9 @@ def create_sparse_attention_onnx_model(config: SparseAttentionConfig): def create_group_query_attention_onnx_model(config: GroupQueryAttentionConfig): - assert config.dtype == torch.float16 + assert config.dtype in [torch.float16, torch.float32] - float_type = TensorProto.FLOAT16 + float_type = TensorProto.FLOAT16 if config.dtype in [torch.float16] else TensorProto.FLOAT nodes = [ helper.make_node( "GroupQueryAttention", @@ -599,7 +616,10 @@ def group_query_attention_reference( attn_output = torch.einsum("bhmn,bhnd->bhmd", attn.type_as(value), value) result = attn_output.transpose(1, 2).contiguous() - torch.cuda.synchronize() + + if torch.cuda.is_available(): + torch.cuda.synchronize() + return result @@ -671,25 +691,42 @@ def infer(self): ) +def create_ort_session( + config: Union[SparseAttentionConfig, GroupQueryAttentionConfig], session_options=None, enable_cuda_graph=False +) -> CudaSession: + if isinstance(config, SparseAttentionConfig): + onnx_model_str = create_sparse_attention_onnx_model(config) + else: + onnx_model_str = create_group_query_attention_onnx_model(config) + + if config.provider == "CUDAExecutionProvider": + device_id = torch.cuda.current_device() if isinstance(config.device, str) else config.device.index + provider_options = CudaSession.get_cuda_provider_options( + device_id, enable_cuda_graph=enable_cuda_graph, stream=torch.cuda.current_stream().cuda_stream + ) + providers = [(config.provider, provider_options), "CPUExecutionProvider"] + else: + providers = ["CPUExecutionProvider"] + + ort_session = InferenceSession(onnx_model_str, 
session_options, providers=providers) + # Note that CudaSession could work with both CUDA and CPU providers. + cuda_session = CudaSession(ort_session, config.device, enable_cuda_graph=enable_cuda_graph) + shape_dict = config.shape_dict() + cuda_session.allocate_buffers(shape_dict) + + buffer_sharing = {"past_key": "present_key", "past_value": "present_value"} + for input_name, output_name in buffer_sharing.items(): + cuda_session.set_buffer_sharing(input_name, output_name) + + return cuda_session + + class OrtGroupQueryAttention: """A wrapper of ORT GroupQueryAttention to test relevance and performance.""" def __init__(self, config: GroupQueryAttentionConfig): - cuda_provider_options = CudaSession.get_cuda_provider_options( - torch.cuda.current_device(), enable_cuda_graph=False, stream=torch.cuda.current_stream().cuda_stream - ) - onnx_model_str = create_group_query_attention_onnx_model(config) - self.ort_session = create_session(onnx_model_str, cuda_provider_options=cuda_provider_options) - self.gpu_binding_manager = GpuBindingManager( - ort_session=self.ort_session, - device=config.device, - stream=torch.cuda.current_stream().cuda_stream, - max_cuda_graphs=2, - ) - buffer_sharing = {"past_key": "present_key", "past_value": "present_value"} - self.gpu_binding = self.gpu_binding_manager.get_binding( - config.shape_dict(), use_cuda_graph=False, buffer_sharing=buffer_sharing - ) + self.session = create_ort_session(config) + self.feed_dict = config.random_inputs() if ENABLE_DEBUG and not config.is_packed_qkv: @@ -709,28 +746,14 @@ def __init__(self, config: GroupQueryAttentionConfig): print("seqlens_k (BSNH, GQA)", self.feed_dict["seqlens_k"]) def infer(self): - return self.gpu_binding.infer(self.feed_dict) + return self.session.infer(self.feed_dict) class OrtSparseAttention: """A wrapper of ORT SparseAttention to test relevance and performance.""" def __init__(self, config: SparseAttentionConfig): - cuda_provider_options = CudaSession.get_cuda_provider_options( - torch.cuda.current_device(), enable_cuda_graph=False, stream=torch.cuda.current_stream().cuda_stream - ) - onnx_model_str = create_sparse_attention_onnx_model(config) - self.ort_session = create_session(onnx_model_str, cuda_provider_options=cuda_provider_options) - self.gpu_binding_manager = GpuBindingManager( - ort_session=self.ort_session, - device=config.device, - stream=torch.cuda.current_stream().cuda_stream, - max_cuda_graphs=2, - ) - buffer_sharing = {"past_key": "present_key", "past_value": "present_value"} - self.gpu_binding = self.gpu_binding_manager.get_binding( - config.shape_dict(), use_cuda_graph=False, buffer_sharing=buffer_sharing - ) + self.session = create_ort_session(config) self.feed_dict = config.random_inputs() if ENABLE_DEBUG and not config.is_packed_qkv: @@ -753,19 +776,196 @@ def __init__(self, config: SparseAttentionConfig): print("key_total_sequence_lengths", self.feed_dict["key_total_sequence_lengths"]) def infer(self): - return self.gpu_binding.infer(self.feed_dict) + return self.session.infer(self.feed_dict) + + +def get_provider_support_info(provider: str, use_kv_cache: bool): + if provider == "CUDAExecutionProvider": + formats = [InputFormats.Q_K_V_BSNH_BSNH_BSNH, InputFormats.QKV_BSN3H] + device_id = torch.cuda.current_device() + device = torch.device("cuda", device_id) + dtype = torch.float16 + else: + assert provider == "CPUExecutionProvider" + formats = [InputFormats.Q_K_V_BSNH_BSNH_BSNH, InputFormats.QKV_BSN3H] + device = torch.device("cpu") + dtype = torch.float + return device, dtype, formats + 
+ +def has_cuda_support(): + if torch.cuda.is_available() and "CUDAExecutionProvider" in get_available_providers(): + major, minor = torch.cuda.get_device_capability() + sm = major * 10 + minor + return sm in [75, 80, 86, 89, 90] + + return False + + +def get_simple_test_case(provider: str, has_past_kv: bool): + """A simple test case for debugging purpose.""" + device, dtype, _formats = get_provider_support_info(provider, False) + if provider == "CPUExecutionProvider": + # A simple case for debugging purpose. + max_sequence_length = 16 + sequence_length = 15 + packed_qkv = False + config = SparseAttentionConfig( + batch_size=1, + sequence_length=1 if has_past_kv else sequence_length, + max_sequence_length=max_sequence_length, + past_sequence_length=sequence_length if has_past_kv else 0, + num_heads=4, + kv_num_heads=2, + head_size=8, + sparse_block_size=4, + num_layout=2, + local_blocks=2, + vert_stride=2, + softmax_scale=0.0, + provider=provider, + device=device, + dtype=dtype, + is_packed_qkv=packed_qkv, + max_cache_sequence_length=max_sequence_length, + ) + yield config + + +def get_test_cases(provider: str, has_past_kv: bool, comprehensive: bool, do_rotary=False): + if provider == "CUDAExecutionProvider" and not has_cuda_support(): + return + yield + + device, dtype, formats = get_provider_support_info(provider, False) + batch_sizes = [1, 2, 3] + sequence_lengths = [1, 64, 127, 128, 192, 256] + heads = [4, 8, 16] + + # SparseAttention CUDA kernel only supports head size 128 + head_sizes = [128] if provider == "CUDAExecutionProvider" else [128, 256] + + if comprehensive: + for batch_size in batch_sizes: + for sequence_length in sequence_lengths: + for num_heads in heads: + for head_size in head_sizes: + for format in formats: + packed_qkv = format == InputFormats.QKV_BSN3H + + non_prompt_len = 1 + if provider == "CPUExecutionProvider" and sequence_length > 128 and not do_rotary: + # Generate case of sequence_length > 1 when it is not prompt for CPU provider. + non_prompt_len = batch_size + + query_sequence_length = non_prompt_len if has_past_kv else sequence_length + config = SparseAttentionConfig( + batch_size=batch_size, + sequence_length=query_sequence_length, + max_sequence_length=256, + past_sequence_length=( + min(256 - query_sequence_length, sequence_length) if has_past_kv else 0 + ), + num_heads=num_heads, + kv_num_heads=num_heads // 2, + head_size=head_size, + sparse_block_size=64, + num_layout=2, + local_blocks=2, + vert_stride=2, + softmax_scale=1.8 / (128**0.5), + provider=provider, + device=device, + dtype=dtype, + is_packed_qkv=packed_qkv, + do_rotary=do_rotary, + rotary_interleaved=sequence_length <= 128, + max_cache_sequence_length=None if sequence_length >= 128 else 128, + ) + yield config + else: + test_cases = max(len(batch_sizes), len(sequence_lengths), len(heads), len(head_sizes)) + for i in range(test_cases): + batch_size = batch_sizes[i % len(batch_sizes)] + sequence_length = sequence_lengths[i % len(sequence_lengths)] + num_heads = heads[i % len(heads)] + head_size = head_sizes[i % len(head_sizes)] + format = formats[i % len(formats)] + packed_qkv = format == InputFormats.QKV_BSN3H + + non_prompt_len = 1 + if provider == "CPUExecutionProvider" and sequence_length > 128 and not do_rotary: + # Generate case of sequence_length > 1 when it is not prompt for CPU provider. 
+ non_prompt_len = batch_size + + query_sequence_length = non_prompt_len if has_past_kv else sequence_length + + config = SparseAttentionConfig( + batch_size=batch_size, + sequence_length=query_sequence_length, + max_sequence_length=256, + past_sequence_length=min(256 - query_sequence_length, sequence_length) if has_past_kv else 0, + num_heads=num_heads, + kv_num_heads=num_heads // 2, + head_size=head_size, + sparse_block_size=64, + num_layout=2, + local_blocks=2, + vert_stride=2, + softmax_scale=1.8 / (128**0.5), + provider=provider, + device=device, + dtype=dtype, + is_packed_qkv=packed_qkv, + do_rotary=do_rotary, + rotary_interleaved=sequence_length <= 128, + max_cache_sequence_length=None if sequence_length >= 128 else 128, # test smaller kv cache buffer. + ) + yield config + + +# Do not run too many tests in CI pipeline. Change it to True to run all combinations in dev machine. +comprehensive_mode = False class TestSparseAttention(unittest.TestCase): - @unittest.skipUnless(torch.cuda.is_available(), "cuda not available") + @unittest.skipUnless(has_cuda_support(), "cuda not available") def test_sparse_attention(self): major, minor = torch.cuda.get_device_capability() sm = major * 10 + minor + self.run_relevance_test(sm) - if sm not in [75, 80, 86, 89, 90]: - self.skipTest("SparseAttention is not supported on this GPU") + @parameterized.expand(get_simple_test_case("CPUExecutionProvider", True), skip_on_empty=True) + def test_simple_token_cpu(self, config: SparseAttentionConfig): + self.run_one_relevance_test(config) - self.run_relevance_test(sm) + @parameterized.expand(get_simple_test_case("CPUExecutionProvider", False), skip_on_empty=True) + def test_simple_prompt_cpu(self, config: SparseAttentionConfig): + self.run_one_relevance_test(config) + + @parameterized.expand( + get_test_cases("CPUExecutionProvider", True, comprehensive_mode, do_rotary=True), skip_on_empty=True + ) + def test_sparse_att_token_cpu_rotary(self, config: SparseAttentionConfig): + # When there is rotary, we use ORT GQA as reference: ORT GQA does not support mask so here we use dense. + if config.sparse_block_size * config.local_blocks > config.total_sequence_length: + self.run_one_relevance_test(config) + + @parameterized.expand(get_test_cases("CUDAExecutionProvider", True, comprehensive_mode), skip_on_empty=True) + def test_sparse_att_token_gpu(self, config): + self.run_one_relevance_test(config) + + @parameterized.expand(get_test_cases("CPUExecutionProvider", True, comprehensive_mode), skip_on_empty=True) + def test_sparse_att_token_cpu(self, config): + self.run_one_relevance_test(config) + + @parameterized.expand(get_test_cases("CPUExecutionProvider", False, comprehensive_mode), skip_on_empty=True) + def test_sparse_att_prompt_cpu(self, config): + self.run_one_relevance_test(config) + + @parameterized.expand(get_test_cases("CUDAExecutionProvider", False, comprehensive_mode), skip_on_empty=True) + def test_sparse_att_prompt_gpu(self, config): + self.run_one_relevance_test(config) def run_one_relevance_test(self, config: SparseAttentionConfig): if (not config.do_rotary) and config.total_sequence_length <= 2048: @@ -774,6 +974,10 @@ def run_one_relevance_test(self, config: SparseAttentionConfig): obj = TorchGroupQueryAttention(gqa_config) expected_out = obj.infer() else: + if config.dtype == torch.bfloat16: + # Skip test since create_group_query_attention_onnx_model does not support bfloat16 right now. + return + # Run QGA by ORT (support packed QKV, rotary and very long sequence, but no mask so dense only). 
gqa_config: GroupQueryAttentionConfig = config.get_comparable_ort_gqa_config(use_local=False) obj = OrtGroupQueryAttention(gqa_config) @@ -881,6 +1085,8 @@ def run_relevance_no_past_128k(self, sm: int, device): local_blocks=2048, # use dense to compare with GQA vert_stride=8, softmax_scale=None, + provider="CUDAExecutionProvider", + dtype=torch.float16, device=device, is_packed_qkv=packed_qkv, ) @@ -907,6 +1113,8 @@ def run_relevance_past_128k(self, sm: int, device): local_blocks=2048, # use dense to compare with GQA vert_stride=8, softmax_scale=None, + provider="CUDAExecutionProvider", + dtype=torch.float16, device=device, is_packed_qkv=packed_qkv, ) @@ -921,7 +1129,8 @@ def run_relevance_test(self, sm: int): device = torch.device("cuda", device_id) with torch.no_grad(): # Test long sequence when GPU memory is enough (need about 12 GB for 128K sequence length) - if torch.cuda.get_device_properties(device_id).total_memory > 13 * 1024 * 1024 * 1024: + # The 128k tests fails randomly in T4 GPU, increase memory threshold for now. + if torch.cuda.get_device_properties(device_id).total_memory > 20 * 1024 * 1024 * 1024: self.run_relevance_no_past_128k(sm, device) self.run_relevance_past_128k(sm, device) self.run_relevance_no_past(sm, device)
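As a reading aid for the CSR block layout consumed by the new CPU kernel (`block_row_indices` of shape (num_layout, max_blocks + 1) and `block_col_indices` of shape (num_layout, max_nnz), per the comments in sparse_attention_helper.h), here is a small illustrative sketch of how one layout expands into a dense causal mask, which is the semantics `ComputeAttentionProbs` applies block by block and what `get_dense_mask` in the test compares against. `expand_block_layout` is a hypothetical helper written for this explanation, not code from the patch, and the example indices are made-up values in a local-plus-vertical-stride style.

```python
# Hypothetical helper: expand one CSR block layout into a dense 0/1 attention mask.
import numpy as np

def expand_block_layout(block_row_indices, block_col_indices, total_seq_len, block_size):
    num_blocks = (total_seq_len + block_size - 1) // block_size
    mask = np.zeros((num_blocks * block_size, num_blocks * block_size), dtype=np.int32)
    for row in range(num_blocks):
        # block_row_indices[row] .. block_row_indices[row + 1] delimits the
        # non-zero block columns of this block row (standard CSR convention).
        start, end = block_row_indices[row], block_row_indices[row + 1]
        for col in block_col_indices[start:end]:
            mask[row * block_size:(row + 1) * block_size,
                 col * block_size:(col + 1) * block_size] = 1
    # The kernel is causal, so only the lower triangle of the cropped mask matters.
    mask = mask[:total_seq_len, :total_seq_len]
    return mask * np.tril(np.ones_like(mask))

# Made-up layout over 4 block rows of size 4 (16 tokens):
# row 0 -> {0}, row 1 -> {0, 1}, row 2 -> {1, 2}, row 3 -> {0, 2, 3}.
row_indices = np.array([0, 1, 3, 5, 8], dtype=np.int32)
col_indices = np.array([0, 0, 1, 1, 2, 0, 2, 3], dtype=np.int32)
print(expand_block_layout(row_indices, col_indices, total_seq_len=16, block_size=4))
```

A block row whose expanded mask covers its full causal range is treated as dense by the kernel (the `layout_has_sparse` check), in which case the masking step is skipped and only the causal softmax is applied.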