From e01abbc224b47cf9951225f067d825941122d7ee Mon Sep 17 00:00:00 2001
From: Yufeng Li
Date: Tue, 16 Apr 2024 17:01:46 -0700
Subject: [PATCH 1/3] enable GQA on CPU

---
 src/models/kv_cache.cpp         | 3 ++-
 src/python/py/models/builder.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp
index 17515355f..a80c84330 100644
--- a/src/models/kv_cache.cpp
+++ b/src/models/kv_cache.cpp
@@ -117,7 +117,8 @@ KV_Cache::KV_Cache(const Model& model, State& state)
     : model_{model},
       state_{state},
       layer_count_{model_.config_->model.decoder.num_hidden_layers},
-      past_present_share_buffer_{state_.params_->search.past_present_share_buffer && state_.params_->search.num_beams == 1 && model_.device_type_ == DeviceType::CUDA},
+      past_present_share_buffer_{state_.params_->search.past_present_share_buffer && state_.params_->search.num_beams == 1 &&
+                                 (model_.device_type_ == DeviceType::CUDA || model_.device_type_ == DeviceType::CPU)},
       shape_{state_.params_->BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 0, model.config_->model.decoder.head_size} {
   pasts_.resize(layer_count_ * 2);
   presents_.reserve(layer_count_ * 2);
diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index e0c6d28aa..2728adbc1 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -159,7 +159,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
             "use_rotemb_in_attn": False,   # Use rotary embeddings within attention op (instead of a separate RotaryEmbedding op)
             "use_packed_matmul": False,    # Use packed MatMul (instead of 3 separate MatMuls for Q/K/V)
         }
-        if self.ep == "cuda" and self.io_dtype == TensorProto.FLOAT16:
+        if (self.ep == "cuda" and self.io_dtype == TensorProto.FLOAT16) or (self.ep == "cpu" and self.io_dtype == TensorProto.FLOAT):
             # Change model settings for GroupQueryAttention
             self.attention_attrs["op_type"] = "GroupQueryAttention"
             print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 CUDA and FP16 CUDA.")

From 7b89975366ace026ac0393a25b3380a85b65f620 Mon Sep 17 00:00:00 2001
From: Yufeng Li
Date: Tue, 30 Apr 2024 11:15:30 -0700
Subject: [PATCH 2/3] update modelbuilder

---
 src/python/py/models/builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 371c6731a..b503b5ef3 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -176,7 +176,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
             self.attention_attrs["use_packed_matmul"] = self.ep != "dml" and self.num_attn_heads == self.num_kv_heads

             # GQA + Rot.Emb. does not require `position ids` as input
-            if self.ep == "cuda":
+            if self.ep == "cuda" or self.ep == "cpu":
                 self.attention_attrs["use_rotemb_in_attn"] = True
                 self.input_names.remove("position_ids")
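Taken together, patches 1 and 2 change two things in the model builder: the attention op becomes GroupQueryAttention when the execution provider and I/O dtype allow it (FP16 on CUDA, and now FP32 on CPU), and the rotary embedding is fused into the attention op on those providers, so the exported graph no longer carries a position_ids input. The following is a minimal Python sketch that restates the resulting selection logic for illustration; the default op type and attribute names are assumed to follow builder.py and are not themselves part of these patches:

    from onnx import TensorProto

    def pick_attention_op(ep: str, io_dtype: int) -> dict:
        # Restatement of the builder's attention-op choice after patches 1-2.
        attrs = {
            "op_type": "MultiHeadAttention",  # assumed builder.py default
            "use_rotemb_in_attn": False,
        }
        # Patch 1: GQA for FP16 on CUDA, and now also for FP32 on CPU.
        if (ep == "cuda" and io_dtype == TensorProto.FLOAT16) or \
           (ep == "cpu" and io_dtype == TensorProto.FLOAT):
            attrs["op_type"] = "GroupQueryAttention"
            # Patch 2: fuse rotary embeddings into the attention op, so
            # `position_ids` is dropped from the model inputs.
            if ep in {"cuda", "cpu"}:
                attrs["use_rotemb_in_attn"] = True
        return attrs

    # FP32 on CPU now selects GQA with fused rotary embeddings:
    print(pick_attention_op("cpu", TensorProto.FLOAT))

The kv_cache.cpp change is the runtime half of this: with GQA, the past and present KV tensors can share one pre-allocated buffer, which patch 1 now permits on CPU as well as CUDA (single-beam search only).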
From 7dc9de112ade750b25b90c920cbd6f34fa9a609e Mon Sep 17 00:00:00 2001
From: Yufeng Li
Date: Wed, 1 May 2024 15:11:52 -0700
Subject: [PATCH 3/3] add an option to control GQA on CPU

---
 src/python/py/models/builder.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 46b917856..fd9c90d53 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -167,7 +167,8 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
             "use_rotemb_in_attn": False,   # Use rotary embeddings within attention op (instead of a separate RotaryEmbedding op)
             "use_packed_matmul": False,    # Use packed MatMul (instead of 3 separate MatMuls for Q/K/V)
         }
-        if (self.ep in {"cuda", "dml"} and self.io_dtype == TensorProto.FLOAT16) or (self.ep == "cpu" and self.io_dtype == TensorProto.FLOAT):
+        enable_GQA_on_CPU = True if "enable_GQA_on_CPU" in extra_options and extra_options["enable_GQA_on_CPU"] == "1" else False
+        if (self.ep in {"cuda", "dml"} and self.io_dtype == TensorProto.FLOAT16) or (enable_GQA_on_CPU and self.ep == "cpu" and self.io_dtype == TensorProto.FLOAT):
             # Change model settings for GroupQueryAttention
             self.attention_attrs["op_type"] = "GroupQueryAttention"
             print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 and FP16 on the CUDA and DML execution providers.")
@@ -176,7 +177,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
             self.attention_attrs["use_packed_matmul"] = self.ep != "dml" and self.num_attn_heads == self.num_kv_heads

             # GQA + Rot.Emb. does not require `position ids` as input
-            if self.ep == "cuda" or self.ep == "cpu":
+            if self.ep in {"cuda", "cpu"}:
                 self.attention_attrs["use_rotemb_in_attn"] = True
                 self.input_names.remove("position_ids")
@@ -1979,6 +1980,7 @@ def get_args():
         enable_cuda_graph = 1 : The model can use CUDA graph capture for CUDA execution provider. If enabled, all nodes being placed on the CUDA EP
                                 is the prerequisite for the CUDA graph to be used correctly. It is not guaranteed that cuda graph be enabled as it
                                 depends on the model and the graph structure.
+        enable_GQA_on_CPU = 1 : Enable GroupQueryAttention (GQA) on CPU.
         """),
     )
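With patch 3, GQA on CPU becomes opt-in rather than automatic: the builder only switches to GroupQueryAttention for FP32 on CPU when extra_options carries enable_GQA_on_CPU=1; otherwise the attention op stays at its default. A usage sketch, assuming the builder's documented command line (-m, -o, -p, -e, plus --extra_options key=value pairs); the model name and output path below are placeholders:

    python3 -m onnxruntime_genai.models.builder \
        -m microsoft/phi-2 -o ./phi2-cpu-gqa \
        -p fp32 -e cpu \
        --extra_options enable_GQA_on_CPU=1

Note that the option is compared against the string "1" in the patched code, so a value like enable_GQA_on_CPU=true would leave GQA disabled on CPU.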