microsoft · mindest · Nov 2, 2024 · Oct 21, 2024 · Oct 21, 2024 · Oct 22, 2024
diff --git a/onnxruntime/contrib_ops/cpu/bert/decoder_masked_multihead_attention.cc b/onnxruntime/contrib_ops/cpu/bert/decoder_masked_multihead_attention.cc
@@ -339,6 +339,7 @@ void DecoderMaskedMultiHeadAttention<T>::ComputeAttentionProbsWithBeams(
         T* attention_probs_ptr = reinterpret_cast<T*>(attention_probs) + last_offset;
         math::Dot<float, CPUMathUtil>(head_size, q_vec, K + i * head_size, attention_probs_ptr, nullptr);
 
+        *attention_probs_ptr *= scale;
         // Apply the attention bias and mask
         if (attn_bias_data != nullptr) {
           *attention_probs_ptr += attn_bias_data[attn_bias_base_offset + past_sequence_length];
@@ -348,7 +349,6 @@ void DecoderMaskedMultiHeadAttention<T>::ComputeAttentionProbsWithBeams(
         if (is_masked) {
           *attention_probs_ptr += mask_filter_value_;
         }
-        *attention_probs_ptr *= scale;
       }
 
       {
@@ -362,6 +362,8 @@ void DecoderMaskedMultiHeadAttention<T>::ComputeAttentionProbsWithBeams(
           const T* past_k_vec = past_key_data + beam_batch_offset + beam_offset + j * head_size;
           T* output = reinterpret_cast<T*>(attention_probs) + j + i * probs_matrix_size;
           math::Dot<float, CPUMathUtil>(head_size, q_vec, past_k_vec, output, nullptr);
+
+          *output *= scale;
           // Apply the attention bias and mask
           if (attn_bias_data != nullptr) {
             *output += attn_bias_data[attn_bias_base_offset + j];
@@ -371,11 +373,11 @@ void DecoderMaskedMultiHeadAttention<T>::ComputeAttentionProbsWithBeams(
           if (is_masked) {
             *output += mask_filter_value_;
           }
-          *output *= scale;
         }
       }
       // Append current key to present key (past_present_share_buffer_ is true)
-      memcpy(present_key_data + i * max_sequence_length * head_size, K + i * head_size, head_size * sizeof(T));
+      memcpy(present_key_data + (i * max_sequence_length + past_sequence_length) * head_size,
+             K + i * head_size, head_size * sizeof(T));
     }
   });
 
@@ -460,7 +462,7 @@ void DecoderMaskedMultiHeadAttention<T>::ComputeVxAttentionScoreWithBeams(
             }
           }
           // Append current value to present value (past_present_share_buffer_ is true)
-          memcpy(present_value_data + i * max_sequence_length * v_head_size,
+          memcpy(present_value_data + (i * max_sequence_length + past_sequence_length) * v_head_size,
                  V + i * v_head_size,
                  v_head_size * sizeof(T));
         }

diff --git a/onnxruntime/contrib_ops/cpu/bert/decoder_masked_multihead_attention.h b/onnxruntime/contrib_ops/cpu/bert/decoder_masked_multihead_attention.h
@@ -33,7 +33,7 @@ class DecoderMaskedMultiHeadAttention final : public OpKernel, public AttentionC
                                  const Tensor* cache_indir,
                                  OpKernelContext* context,
                                  int beam_width,
-                                 Tensor* scaled_qk = nullptr) const;
+                                 Tensor* output_qk = nullptr) const;
   void ComputeAttentionProbsWithBeams(T* attention_probs,
                                       const T* Q,
                                       const T* K,
@@ -50,7 +50,7 @@ class DecoderMaskedMultiHeadAttention final : public OpKernel, public AttentionC
                                       bool broadcast_attn_bias_dim_1,
                                       const int32_t* cache_indir_data,
                                       int beam_width,
-                                      T* scaled_qk_data = nullptr) const;
+                                      T* output_qk_data = nullptr) const;
   void ComputeVxAttentionScoreWithBeams(T* output,
                                         T* tmp_buffer,
                                         const T* attention_probs,

diff --git a/onnxruntime/core/graph/contrib_ops/bert_defs.cc b/onnxruntime/core/graph/contrib_ops/bert_defs.cc
@@ -908,7 +908,6 @@ ONNX_MS_OPERATOR_SET_SCHEMA(
                OpSchema::Optional)
         .Input(9,
                "cache_indirection",
-               // This input is useful for CUDA EP only.
                "A buffer of shape [batch_size, beam_width, max_output_length] where an `[i, j, k]` entry specifies "
                "which beam the `k`-th token came from for the `j`-th beam for batch `i` in the current iteration",
                "M",