From 356d000797c596e7d2395feb1ef870d25f50f1b0 Mon Sep 17 00:00:00 2001
From: Yufeng Li
Date: Fri, 3 May 2024 10:43:18 -0700
Subject: [PATCH] update GQA message (#396)

---
 src/python/py/models/builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index c6a3e2574..3865de4bc 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -171,7 +171,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         if (self.ep in {"cuda", "dml"} and self.io_dtype == TensorProto.FLOAT16) or (enable_GQA_on_CPU and self.ep == "cpu" and self.io_dtype == TensorProto.FLOAT):
             # Change model settings for GroupQueryAttention
             self.attention_attrs["op_type"] = "GroupQueryAttention"
-            print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 and FP16 on the CUDA and DML execution providers.")
+            print("GroupQueryAttention (GQA) is used in this model.")
 
             # DML doesn't support packed Q/K/V for GQA yet
             self.attention_attrs["use_packed_matmul"] = self.ep != "dml" and self.num_attn_heads == self.num_kv_heads
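
Note: for context on the guard this patch touches, below is a minimal standalone sketch of the attention-op selection logic visible in the hunk. The choose_attention_op helper, its enable_gqa_on_cpu parameter name, and the "MultiHeadAttention" fallback string are illustrative assumptions restated from the diff context, not an exact copy of builder.py.

from onnx import TensorProto

def choose_attention_op(ep: str, io_dtype: int, enable_gqa_on_cpu: bool = False) -> str:
    """Sketch of the condition in the hunk above: GQA is selected for the
    CUDA/DML execution providers with FP16 I/O, or for CPU with FP32 I/O
    when explicitly enabled; otherwise the builder is assumed to keep its
    default MultiHeadAttention op type."""
    use_gqa = (
        (ep in {"cuda", "dml"} and io_dtype == TensorProto.FLOAT16)
        or (enable_gqa_on_cpu and ep == "cpu" and io_dtype == TensorProto.FLOAT)
    )
    return "GroupQueryAttention" if use_gqa else "MultiHeadAttention"

# Example: CUDA + FP16 selects GQA; CPU + FP32 does not unless opted in.
assert choose_attention_op("cuda", TensorProto.FLOAT16) == "GroupQueryAttention"
assert choose_attention_op("cpu", TensorProto.FLOAT) == "MultiHeadAttention"
assert choose_attention_op("cpu", TensorProto.FLOAT, enable_gqa_on_cpu=True) == "GroupQueryAttention"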