diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index c6a3e2574..3865de4bc 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -171,7 +171,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): if (self.ep in {"cuda", "dml"} and self.io_dtype == TensorProto.FLOAT16) or (enable_GQA_on_CPU and self.ep == "cpu" and self.io_dtype == TensorProto.FLOAT): # Change model settings for GroupQueryAttention self.attention_attrs["op_type"] = "GroupQueryAttention" - print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 and FP16 on the CUDA and DML execution providers.") + print("GroupQueryAttention (GQA) is used in this model.") # DML doesn't support packed Q/K/V for GQA yet self.attention_attrs["use_packed_matmul"] = self.ep != "dml" and self.num_attn_heads == self.num_kv_heads