fix mha for the case that present kv is not consumed #21777

Closed · 5 commits

34 changes: 25 additions & 9 deletions js/web/lib/wasm/jsep/webgpu/ops/attention.ts
@@ -668,11 +668,27 @@ export const applyAttention = (
parameters: AttentionParameters,
attributes: AttentionAttrs,
) => {
- const pastSequenceLength =
- parameters.kvNumHeads !== undefined || context.outputCount > 1 ? parameters.pastSequenceLength : 0;

+ // context.outputCount comes from KernelOp and is the number of outputs the op has.
+ // If the present outputs are not consumed, we need to make sure the shaders don't
+ // generate them, since there is no buffer for them.
+ // We check by requesting the output; if it is not there, we adjust context.outputCount.
+ const presentKeyShape = [
+ parameters.batchSize,
Review comment from @tianleiwu (Contributor), Aug 16, 2024:

This shape only works for MHA and GQA. For the Attention op, output 1 has shape [2, B, N, T, H] instead of [B, N, T, H], since it concatenates present_key and present_value into a single present output.

I think here need extra code like

if (attention op) { // can we get operator name from context? Maybe we can use context.outputCount === 2 since MHA and GQA has 3 outputs if present_key are needed.
    // insert 2 at the beginning of present shape.
}

Review comment from @tianleiwu (Contributor), Aug 17, 2024:

We also need to consider another special case for GQA, where past and present share the same buffer. In that case, the length is the max sequence length. [A hedged sketch of this case appears after the attention.ts diff.]

+ parameters.kvNumHeads === undefined ? parameters.numHeads : parameters.kvNumHeads,
+ parameters.totalSequenceLength,
+ parameters.headSize,
+ ];
+ const output1 = context.output(1, presentKeyShape);
+ if (output1 === 0) {
+ context.outputCount = 1;
+ }
+ const outputCount = context.outputCount;
+ const pastSequenceLength = parameters.kvNumHeads !== undefined || outputCount > 1 ? parameters.pastSequenceLength : 0;
const totalSequenceLength = pastSequenceLength + parameters.kvSequenceLength;
+ const outputPresent = outputCount > 1;

- const inputsK = parameters.kvNumHeads === undefined && outputCount > 1 && pastKey ? [q, k, pastKey] : [q, k];
+ const inputsK = parameters.kvNumHeads === undefined && outputPresent && pastKey ? [q, k, pastKey] : [q, k];
if (attentionBias) {
inputsK.push(attentionBias);
}
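
A minimal sketch of the adjustment suggested in the Aug 16 review comment above, assuming the op can be distinguished by its output count as the reviewer proposes. The helper name is made up for illustration and none of this is code from the PR:

    // Sketch only, not PR code: the Attention op concatenates present_key and present_value
    // into a single output of shape [2, B, N, T, H], so the probed shape needs a leading 2.
    // Detecting Attention via outputCount === 2 is the reviewer's suggested heuristic.
    const adjustPresentShapeForAttention = (outputCount: number, presentShape: readonly number[]): number[] =>
      outputCount === 2 ? [2, ...presentShape] : [...presentShape];

Under that assumption, the hunk's context.output(1, presentKeyShape) probe would be passed adjustPresentShapeForAttention(context.outputCount, presentKeyShape) instead.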
@@ -683,13 +699,13 @@ export const applyAttention = (
context,
q,
k,
- outputCount > 1 ? pastKey : undefined,
+ outputPresent ? pastKey : undefined,
attentionBias,
parameters,
attributes,
pastSequenceLength,
),
- { inputs: inputsK, outputs: parameters.kvNumHeads === undefined && outputCount > 1 ? [-1, 1] : [-1] },
+ { inputs: inputsK, outputs: parameters.kvNumHeads === undefined && outputPresent ? [-1, 1] : [-1] },
)[0];

// Run Softmax
@@ -698,24 +714,24 @@ export const applyAttention = (
context,
probs,
parameters.batchSize * parameters.numHeads * parameters.sequenceLength,
- totalSequenceLength,
+ parameters.totalSequenceLength,
),
{ inputs: [probs], outputs: [] },
);

// Run AttentionScore
const inputsV =
- parameters.kvNumHeads === undefined && outputCount > 1 && pastValue ? [probs, v, pastValue] : [probs, v];
+ parameters.kvNumHeads === undefined && outputPresent && pastValue ? [probs, v, pastValue] : [probs, v];
context.compute(
createVxAttentionScoreProgramInfo(
context,
probs,
v,
- outputCount > 1 && pastValue ? pastValue : undefined,
+ outputPresent && pastValue ? pastValue : undefined,
parameters,
pastSequenceLength,
),
- { inputs: inputsV, outputs: parameters.kvNumHeads === undefined && outputCount > 1 ? [0, 2] : [0] },
+ { inputs: inputsV, outputs: parameters.kvNumHeads === undefined && outputPresent ? [0, 2] : [0] },
);
};
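
A minimal sketch of the GQA shared-buffer case raised in the Aug 17 review comment, assuming hypothetical pastPresentShareBuffer and maxSequenceLength fields on AttentionParameters; it is illustrative only, not code from the PR:

    // Sketch only, not PR code: when GQA shares the past/present KV buffer, the present output
    // is sized by the buffer's maximum sequence length rather than by totalSequenceLength.
    // The two parameter fields used for that decision are assumptions.
    const presentSequenceLength =
      parameters.kvNumHeads !== undefined && parameters.pastPresentShareBuffer
        ? parameters.maxSequenceLength
        : parameters.totalSequenceLength;
    const presentKeyShape = [
      parameters.batchSize,
      parameters.kvNumHeads === undefined ? parameters.numHeads : parameters.kvNumHeads,
      presentSequenceLength,
      parameters.headSize,
    ];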

2 changes: 1 addition & 1 deletion js/web/lib/wasm/jsep/webgpu/types.ts
@@ -183,7 +183,7 @@ export interface ComputeContext {
/**
* a number of outputs for the node
*/
- readonly outputCount: number;
+ outputCount: number;

compute(program: ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping): TensorView[];
output(index: number, dims: readonly number[]): number;
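
Dropping readonly lets a kernel shrink its effective output count at compute time. A condensed sketch of the usage pattern from the attention.ts hunk above, relying only on what this PR shows, namely that context.output(index, dims) returns 0 when no buffer exists for that output:

    // With outputCount mutable, a kernel can probe an optional output and downgrade itself.
    const output1 = context.output(1, presentKeyShape); // 0 means the output was not consumed
    if (output1 === 0) {
      context.outputCount = 1; // the shaders will then skip generating present key/value
    }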