DMLEP QAttention update causal #19533

Merged (6 commits, Mar 4, 2024)
Changes from 5 commits
@@ -361,25 +361,30 @@ class DmlOperatorQAttention : public DmlOperator
 const DML_OPERATOR_DESC pastKeySlicedDesc = { DML_OPERATOR_SLICE1, &pastKeySlicedOperatorDesc};
 const DML_OPERATOR_DESC pastValueSlicedDesc = { DML_OPERATOR_SLICE1, &pastValueSlicedOperatorDesc};

-// Causal Mask: [pastSequenceLength, pastSequenceLength + 1 ... pastSequenceLength + batchSize -1]
+// Causal Mask: Upper Triangular Boolean Matrix
+// Example: [[1, 0, 0, 0, 0],
+//           [1, 1, 0, 0, 0],
+//           [1, 1, 1, 0, 0],
+//           [1, 1, 1, 1, 0]]
+// DML adds maskFilterValue to the "off" bits in the mask and sets the "on" bits to 0
 // passed to MHA as maskIndex Tensor when unidirectional == 1
-std::array<uint32_t, 2> causalMaskOutputShape = {1, batchSize};
+std::array<uint32_t, 4> causalMaskOutputShape = {1, 1, sequenceLength, pastSequenceLength + sequenceLength};
 TensorDesc causalMaskTensorDesc;
-DML_FILL_VALUE_SEQUENCE_OPERATOR_DESC causalMaskOperatorDesc = {};
+DML_DIAGONAL_MATRIX1_OPERATOR_DESC causalMaskOperatorDesc = {};
 DML_TENSOR_DESC namedcausalMaskTensorDesc;

 if (unidirectional && !hasMask)
 {
     causalMaskTensorDesc = TensorDesc::ConstructDefaultTensorDesc(MLOperatorTensorDataType::Int32, causalMaskOutputShape);
     namedcausalMaskTensorDesc = causalMaskTensorDesc.GetDmlDesc();
     causalMaskOperatorDesc.ValueDataType = DML_TENSOR_DATA_TYPE_INT32;
-    causalMaskOperatorDesc.ValueStart.Int32 = pastSequenceLength;
-    causalMaskOperatorDesc.ValueDelta.Int32 = 1;
+    causalMaskOperatorDesc.DiagonalFillBegin = INT32_MIN;
+    causalMaskOperatorDesc.DiagonalFillEnd = pastSequenceLength + 1;
+    causalMaskOperatorDesc.Value.Int32 = 1;
     causalMaskOperatorDesc.OutputTensor = &namedcausalMaskTensorDesc;

-    maskType = DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_LENGTH;
+    maskType = DML_MULTIHEAD_ATTENTION_MASK_TYPE_BOOLEAN;
 }
-DML_OPERATOR_DESC causalMaskDesc = { DML_OPERATOR_FILL_VALUE_SEQUENCE, &causalMaskOperatorDesc };
+DML_OPERATOR_DESC causalMaskDesc = { DML_OPERATOR_DIAGONAL_MATRIX1, &causalMaskOperatorDesc };

 DML_MULTIHEAD_ATTENTION_OPERATOR_DESC mhaOperatorDesc = {};
 std::array<uint32_t, 5> presentKeyOutputShape = {1, batchSize, numHeads, pastSequenceLength + sequenceLength, headSize};
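[Editorial note] The new mask construction can be emulated on the CPU. The sketch below is not the DML runtime; it assumes DML_DIAGONAL_MATRIX1 writes `Value` on every diagonal k = col - row with DiagonalFillBegin <= k < DiagonalFillEnd and 0 elsewhere, so that DiagonalFillBegin = INT32_MIN and DiagonalFillEnd = pastSequenceLength + 1 let query row i see only key columns j <= i + pastSequenceLength:

```cpp
#include <cassert>
#include <cstdint>
#include <limits>
#include <vector>

// CPU sketch of the causal mask built via DML_DIAGONAL_MATRIX1 above,
// under the assumption that the operator fills diagonals k = col - row
// in [fillBegin, fillEnd) with `value` and writes 0 everywhere else.
std::vector<int32_t> MakeCausalMask(
    uint32_t sequenceLength,
    uint32_t pastSequenceLength,
    int64_t fillBegin,
    int64_t fillEnd,
    int32_t value)
{
    const uint32_t totalLength = pastSequenceLength + sequenceLength;
    std::vector<int32_t> mask(sequenceLength * totalLength, 0);
    for (uint32_t row = 0; row < sequenceLength; ++row)
    {
        for (uint32_t col = 0; col < totalLength; ++col)
        {
            const int64_t diagonal = static_cast<int64_t>(col) - static_cast<int64_t>(row);
            if (diagonal >= fillBegin && diagonal < fillEnd)
            {
                mask[row * totalLength + col] = value; // "on" bit: position is attendable
            }
        }
    }
    return mask;
}
```

With sequenceLength = 3 and pastSequenceLength = 1 this yields rows [1,1,0,0], [1,1,1,0], [1,1,1,1]: each new query attends to all past keys plus its own prefix, matching the staircase pattern in the diff comment.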
@@ -393,7 +398,11 @@ class DmlOperatorQAttention : public DmlOperator

 if (unidirectional && !hasMask)
 {
-    mhaOperatorDesc.MaskTensor = &namedcausalMaskTensorDesc;
+    // Broadcast to MHA MaskTensor Shape
+    std::array<uint32_t, 4> mhaMaskTensorShape = {batchSize, numHeads, sequenceLength, pastSequenceLength + sequenceLength};
+    TensorDesc broadcastedcausalMaskTensorDesc = TensorDesc::ConstructBroadcastedTensorDesc(MLOperatorTensorDataType::Int32, mhaMaskTensorShape, causalMaskOutputShape);
+    const DML_TENSOR_DESC namedbroadcastedcausalMaskTensorDesc = broadcastedcausalMaskTensorDesc.GetDmlDesc();
+    mhaOperatorDesc.MaskTensor = &namedbroadcastedcausalMaskTensorDesc;
 }
 else if (hasMaxSequenceMask)
 {
@@ -407,7 +416,7 @@ class DmlOperatorQAttention : public DmlOperator
 mhaOperatorDesc.RelativePositionBiasTensor = nullptr;
 mhaOperatorDesc.OutputTensor = &outputDescs[outputIndex];
 mhaOperatorDesc.Scale = kernelCreationContext.GetOptionalAttribute<float>(AttrName::Scale, gsl::narrow_cast<float>(1.0f / std::sqrt(headSize)));
-mhaOperatorDesc.MaskFilterValue = kernelCreationContext.GetOptionalAttribute<float>(AttrName::MaskFilterValue, -10'000.0f);
+mhaOperatorDesc.MaskFilterValue = std::numeric_limits<float>::lowest();
Contributor commented:
I think we should still give precedence to the user-provided value, and we should just change the default value to lowest():
mhaOperatorDesc.MaskFilterValue = kernelCreationContext.GetOptionalAttribute<float>(AttrName::MaskFilterValue, std::numeric_limits<float>::lowest());

Contributor commented:
Did the CPU and CUDA EPs change to lowest() for the default? The contrib ops page still mentions -10000 as the default: https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#com.microsoft.QAttention

raoanag (Contributor, Author) commented Feb 29, 2024:
For unidirectional, that does not seem to be the case, per the reference: https://github.com/microsoft/onnxruntime/blob/d5606cd7ee394ba9444ef509021720ebe63c9856/onnxruntime/contrib_ops/cpu/bert/attention_helper.h#L142C1-L149C6

I will add std::numeric_limits<float>::lowest() only when unidirectional is set.
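[Editorial note] The resolution discussed in this thread can be sketched as a small helper. This is hypothetical plumbing, not the actual kernel API: a user-supplied MaskFilterValue attribute wins, and only the unidirectional (causal) path falls back to lowest(), keeping the documented -10000 default otherwise:

```cpp
#include <cassert>
#include <limits>
#include <optional>

// Hypothetical helper (illustrative name and optional-based attribute
// plumbing): give precedence to a user-provided MaskFilterValue, and
// choose the fallback default based on the unidirectional flag.
float ResolveMaskFilterValue(std::optional<float> userValue, bool unidirectional)
{
    if (userValue.has_value())
    {
        return *userValue; // explicit attribute always wins
    }
    // Causal masking wants true -inf-like filtering so masked logits
    // cannot leak through softmax; other paths keep the documented default.
    return unidirectional ? std::numeric_limits<float>::lowest() : -10'000.0f;
}
```

The design point from the thread: lowest() matters for the causal path because -10000 can still contribute non-negligible probability mass after softmax at large sequence lengths, while non-causal paths keep backward-compatible behavior.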

 mhaOperatorDesc.HeadCount = numHeads;
 mhaOperatorDesc.MaskType = maskType;
 if (hasPast)