diff --git a/onnxruntime/python/tools/transformers/fusion_conformer_attention.py b/onnxruntime/python/tools/transformers/fusion_conformer_attention.py
index e30c35bf156ba..5f098b33ce39c 100644
--- a/onnxruntime/python/tools/transformers/fusion_conformer_attention.py
+++ b/onnxruntime/python/tools/transformers/fusion_conformer_attention.py
@@ -59,14 +59,28 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
             logger.debug("fuse_conformer_attention: failed to match v path")
             return
 
-        qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Add", "MatMul"], [0, 0, 0])
+        qk_nodes = self.model.match_parent_path(
+            matmul_qkv,
+            ["Softmax", "Add", "Add", "MatMul"],
+            [0, 0, 0, 0])
 
         if qk_nodes is not None:
-            _, add_qk, matmul_qk = qk_nodes
+            _, add_mask_qk, add_embd_qk, matmul_qk = qk_nodes
         else:
             logger.debug("fuse_conformer_attention: failed to match qk path")
             return
 
+        mask_nodes = self.model.match_parent_path(
+            add_mask_qk,
+            ["Cast", "Reshape", "Where", "Equal", "Cast", "Cast"],
+            [1, 0, 0, 0, 0, 0],
+        )
+        if mask_nodes is not None:
+            _, _, where_mask, _, _, cast_mask = mask_nodes
+        else:
+            logger.debug("fuse_conformer_attention: failed to match mask path")
+            return
+
         q_nodes = self.model.match_parent_path(
             matmul_qk,
             ["Mul", "Transpose", "Reshape", "Add", "MatMul"],
@@ -78,6 +92,8 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
             logger.debug("fuse_conformer_attention: failed to match q path")
             return
 
+
+
         k_nodes = self.model.match_parent_path(
             matmul_qk,
             ["Transpose", "Concat", "Transpose", "Reshape", "Add", "MatMul"],
@@ -111,7 +127,8 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
             num_heads,
             hidden_size,
             attention_last_node.output[0],
-            add_qk=add_qk.input[1],
+            key_padding_mask=cast_mask.output[0],
+            add_qk=add_embd_qk.input[1],
             past_k=past_k,
             past_v=past_v,
             present_k=present_k,
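
For context on what the new pattern matches: the qk path previously expected a single Add between Softmax and MatMul, while the conformer graph has two, one adding the attention mask and one adding the relative-position embedding. The patch matches both so the mask can be traced through its Cast subgraph and passed to the fused attention node as key_padding_mask, while the embedding Add continues to feed add_qk. The sketch below is a minimal toy re-implementation of the parent-path walk this relies on; Node and match_parent_path here are simplified stand-ins written for illustration, not onnxruntime's actual OnnxModel API.

```python
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Node:
    op_type: str
    name: str
    # parents[i] is the node feeding input i, or None for a graph input.
    parents: List[Optional["Node"]] = field(default_factory=list)


def match_parent_path(node, op_types, input_indices):
    """Walk upward from `node`: at step i, follow input input_indices[i] and
    require the parent's op_type to equal op_types[i]. Return the matched
    parents in order, or None on the first mismatch."""
    matched = []
    current = node
    for op_type, idx in zip(op_types, input_indices):
        if idx >= len(current.parents):
            return None
        parent = current.parents[idx]
        if parent is None or parent.op_type != op_type:
            return None
        matched.append(parent)
        current = parent
    return matched


# Toy QK subgraph in the shape the patch now expects:
# MatMul(Q, K^T) -> Add (rel-pos embedding) -> Add (mask) -> Softmax -> MatMul(., V)
matmul_qk = Node("MatMul", "qk")
add_embd = Node("Add", "add_embd", [matmul_qk, None])
add_mask = Node("Add", "add_mask", [add_embd, None])
softmax = Node("Softmax", "softmax", [add_mask])
matmul_qkv = Node("MatMul", "qkv", [softmax, None])

# The updated pattern: two Add nodes between Softmax and MatMul, so unpacking
# yields both add_mask_qk (nearest the Softmax) and add_embd_qk.
qk_nodes = match_parent_path(matmul_qkv, ["Softmax", "Add", "Add", "MatMul"], [0, 0, 0, 0])
assert qk_nodes is not None
print([n.name for n in qk_nodes])  # ['softmax', 'add_mask', 'add_embd', 'qk']
```

The mask match in the patch works the same way: starting from add_mask_qk and following indices [1, 0, 0, 0, 0, 0], it first steps into input 1 (the mask side of the Add) and then walks the Cast, Reshape, Where, Equal, Cast, Cast chain to recover cast_mask.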