From b572e90c131dab20ae5c135f0f87ab78c9e26836 Mon Sep 17 00:00:00 2001 From: Qin Jiajia Date: Tue, 30 Jan 2024 17:32:27 +0800 Subject: [PATCH 1/7] Optimize conv1d by conv2d --- .../webgpu/ops/3rd-party/conv2d_mm_webgpu.ts | 8 +- .../ops/3rd-party/matmul_packed_webgpu.ts | 9 +- .../lib/wasm/jsep/webgpu/ops/conv-grouped.ts | 8 +- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 246 +++++++++--------- js/web/lib/wasm/jsep/webgpu/ops/matmul.ts | 9 +- js/web/test/data/ops/conv.jsonc | 67 +++++ 6 files changed, 215 insertions(+), 132 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts index 24006d393592a..1e44111cf61b4 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts @@ -157,7 +157,8 @@ const conv2dCommonSnippet = export const createConv2DMatMulProgramInfo = (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[], dimAOuter: number, - dimBOuter: number, dimInner: number, hasBias: boolean, sequentialAccessByThreads: boolean): ProgramInfo => { + dimBOuter: number, dimInner: number, hasBias: boolean, sequentialAccessByThreads: boolean, + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[]): ProgramInfo => { const isChannelsLast = attributes.format === 'NHWC'; const inChannels = isChannelsLast ? inputs[0].dims[3] : inputs[0].dims[1]; const batchSize = outputShape[0]; @@ -262,7 +263,10 @@ export const createConv2DMatMulProgramInfo = inputDependencies }, getRunData: () => ({ - outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + outputs: [{ + dims: squeezeOutputShapeFunction ? squeezeOutputShapeFunction(outputShape) : outputShape, + dataType: inputs[0].dataType + }], dispatchGroup: {x: dispatch[0], y: dispatch[1], z: dispatch[2]}, programUniforms, }), diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts index 9b37247167bab..21bcc5b5a925c 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts @@ -413,8 +413,8 @@ const matMulReadWriteFnSource = export const createMatmulProgramInfo = (inputs: readonly TensorView[], activationAttributes: InternalActivationAttributes, outputShape: readonly number[], - reshapedOutputShape?: readonly number[], - isChannelsLast = false /* only used for conv2dByMatMul*/): ProgramInfo => { + reshapedOutputShape?: readonly number[], isChannelsLast = false /* only used for conv2dByMatMul*/, + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[]): ProgramInfo => { const aShape = inputs[0].dims; const bShape = inputs[1].dims; const outerDimsA = aShape.slice(0, -2); @@ -494,7 +494,10 @@ export const createMatmulProgramInfo = inputDependencies }, getRunData: () => ({ - outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + outputs: [{ + dims: squeezeOutputShapeFunction ? 
squeezeOutputShapeFunction(outputShape) : outputShape, + dataType: inputs[0].dataType + }], dispatchGroup: {x: dispatch[0], y: dispatch[1], z: dispatch[2]}, programUniforms }), diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts index 924030125c420..aac1fcb5f4fff 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts @@ -118,7 +118,8 @@ export const createGroupedConvProgramInfo = }; export const createGroupedConvVectorizeProgramInfo = - (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[]): ProgramInfo => { + (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[], + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[]): ProgramInfo => { const hasBias = inputs.length > 2; const components = getMaxComponents(outputShape[3]); const outputNumber = getMaxComponents(outputShape[2]); @@ -207,7 +208,10 @@ export const createGroupedConvVectorizeProgramInfo = inputDependencies: hasBias ? ['rank', 'rank', 'type'] : ['rank', 'rank'] }, getRunData: () => ({ - outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + outputs: [{ + dims: squeezeOutputShapeFunction ? squeezeOutputShapeFunction(outputShape) : outputShape, + dataType: inputs[0].dataType + }], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, programUniforms }), diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 52bd69130e617..34c93f50dc158 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -140,68 +140,118 @@ export const parseConvAttributes = (attributes: Record): ConvAt }; }; -const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attributes: ConvAttributes): void => { - const adjustedAttributes = getAdjustedConvAttributes(attributes, inputs); +const conv2d = + (context: ComputeContext, inputs: readonly TensorView[], attributes: ConvAttributes, + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[]): void => { + // check attributes + + // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ + const isChannelsLast = attributes.format === 'NHWC'; + if (attributes.group !== 1) { + // Temporarily disable createGroupedConvVectorizeProgramInfo path due to bots failures with below two cases: + // [webgpu]Conv - conv - vectorize group - B + // [webgpu]Conv - conv - vectorize group - D + const disableGroupedConvVectorize = false; + if (!disableGroupedConvVectorize && isChannelsLast && inputs[1].dims[0] === attributes.group && + inputs[1].dims[1] === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1) { + const outputShape = calculateOutputShape( + inputs[0].dims, inputs[1].dims, attributes.dilations, attributes.pads, attributes.strides, + isChannelsLast); + const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? + context.compute( + createTransposeProgramInfo(inputs[1], weightTransposeAttribute), + {inputs: [1], outputs: [attributes.wIsConst ? 
-2 : -1]})[0]; + if (attributes.wIsConst && !context.kernelCustomData.wT) { + context.kernelCustomData.wT = transposedWeight; + } + const convInputs = [inputs[0], transposedWeight]; + if (inputs.length === 3) { + convInputs.push(inputs[2]); + } + context.compute( + createGroupedConvVectorizeProgramInfo(convInputs, attributes, outputShape, squeezeOutputShapeFunction), + {inputs: convInputs}); + } else { + context.compute(createGroupedConvProgramInfo(inputs, attributes, squeezeOutputShapeFunction)); + } + return; + } - // check attributes + const hasBias = inputs.length === 3; + const inputHeight = inputs[0].dims[isChannelsLast ? 1 : 2]; + const inputWidth = inputs[0].dims[isChannelsLast ? 2 : 3]; + const inputChannels = inputs[0].dims[isChannelsLast ? 3 : 1]; + const weightHeight = inputs[1].dims[2]; + const weightWidth = inputs[1].dims[3]; - // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ - const isChannelsLast = attributes.format === 'NHWC'; - if (attributes.group !== 1) { - // NVIDIA GPU with ampere architecture fails with below 2 cases, but we couldn't repro them with any other - // GPUs. So just disable vectorize on NVIDIA ampere to ensure always correct outputs. - // [webgpu]Conv - conv - vectorize group - B - // [webgpu]Conv - conv - vectorize group - D - const enableGroupedConvVectorize = !context.adapterInfo.isArchitecture('ampere'); - if (enableGroupedConvVectorize && isChannelsLast && inputs[1].dims[0] === attributes.group && - inputs[1].dims[1] === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1) { const outputShape = calculateOutputShape( - inputs[0].dims, inputs[1].dims, attributes.dilations, adjustedAttributes.pads, attributes.strides, - isChannelsLast); - const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? + inputs[0].dims, inputs[1].dims, attributes.dilations, attributes.pads, attributes.strides, isChannelsLast); + const outHeight = outputShape[isChannelsLast ? 1 : 2]; + const outWidth = outputShape[isChannelsLast ? 2 : 3]; + const outChannels = outputShape[isChannelsLast ? 3 : 1]; + + const sameSize = isChannelsLast && weightHeight === inputHeight && weightWidth === inputWidth && + attributes.pads[0] === 0 && attributes.pads[1] === 0; + if (sameSize || + (weightHeight === 1 && weightWidth === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1 && + attributes.strides[0] === 1 && attributes.strides[1] === 1 && attributes.pads[0] === 0 && + attributes.pads[1] === 0)) { + // conv2dByMatMul + const batch = outputShape[0]; + let xReshaped, wReshaped, matmulOutputShape; + const matmulInputs = []; + if (isChannelsLast) { + const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? + context.compute( + createTransposeProgramInfo(inputs[1], weightTransposeAttribute), + {inputs: [1], outputs: [attributes.wIsConst ? 
-2 : -1]})[0]; + if (attributes.wIsConst && !context.kernelCustomData.wT) { + context.kernelCustomData.wT = transposedWeight; + } + if (sameSize) { + const sharedDim = inputHeight * inputWidth * inputChannels; + xReshaped = inputs[0].reshape([1, batch, sharedDim]); + wReshaped = transposedWeight.reshape([1, sharedDim, outChannels]); + matmulOutputShape = [1, batch, outChannels]; + } else { + xReshaped = inputs[0].reshape([batch, inputHeight * inputWidth, inputChannels]); + wReshaped = transposedWeight.reshape([1, inputChannels, outChannels]); + matmulOutputShape = [batch, outHeight * outWidth, outChannels]; + } + matmulInputs.push(xReshaped); + matmulInputs.push(wReshaped); + } else { + xReshaped = inputs[0].reshape([batch, inputChannels, inputHeight * inputWidth]); + wReshaped = inputs[1].reshape([1, outChannels, inputChannels]); + matmulOutputShape = [batch, outChannels, outHeight * outWidth]; + matmulInputs.push(wReshaped); + matmulInputs.push(xReshaped); + } + if (hasBias) { + matmulInputs.push(inputs[2]); + } + const N = matmulOutputShape[2]; + const K = matmulInputs[0].dims[matmulInputs[0].dims.length - 1]; + // Tune the threshold. + if (N < 8 && K < 8) { context.compute( - createTransposeProgramInfo(inputs[1], weightTransposeAttribute), - {inputs: [1], outputs: [attributes.wIsConst ? -2 : -1]})[0]; - if (attributes.wIsConst && !context.kernelCustomData.wT) { - context.kernelCustomData.wT = transposedWeight; - } - const convInputs = [inputs[0], transposedWeight]; - if (inputs.length === 3) { - convInputs.push(inputs[2]); + createNaiveMatmulProgramInfo( + matmulInputs, attributes, outputShape, matmulOutputShape, isChannelsLast, squeezeOutputShapeFunction), + {inputs: matmulInputs}); + } else { + context.compute( + createMatmulProgramInfo( + matmulInputs, attributes, outputShape, matmulOutputShape, isChannelsLast, squeezeOutputShapeFunction), + {inputs: matmulInputs}); + } + return; } - context.compute( - createGroupedConvVectorizeProgramInfo(convInputs, adjustedAttributes, outputShape), {inputs: convInputs}); - } else { - context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes)); - } - return; - } - const hasBias = inputs.length === 3; - const inputHeight = inputs[0].dims[isChannelsLast ? 1 : 2]; - const inputWidth = inputs[0].dims[isChannelsLast ? 2 : 3]; - const inputChannels = inputs[0].dims[isChannelsLast ? 3 : 1]; - const weightHeight = inputs[1].dims[2]; - const weightWidth = inputs[1].dims[3]; + // TODO: implement conv2dWithIm2Col() - const outputShape = calculateOutputShape( - inputs[0].dims, inputs[1].dims, attributes.dilations, adjustedAttributes.pads, attributes.strides, - isChannelsLast); - const outHeight = outputShape[isChannelsLast ? 1 : 2]; - const outWidth = outputShape[isChannelsLast ? 2 : 3]; - const outChannels = outputShape[isChannelsLast ? 
3 : 1]; + const sequentialAccessByThreads = /* backend.adapterInfo.isIntel() */ true; - const sameSize = isChannelsLast && weightHeight === inputHeight && weightWidth === inputWidth && - attributes.pads[0] === 0 && attributes.pads[1] === 0; - if (sameSize || - (weightHeight === 1 && weightWidth === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1 && - attributes.strides[0] === 1 && attributes.strides[1] === 1 && attributes.pads[0] === 0 && - attributes.pads[1] === 0)) { - // conv2dByMatMul - const batch = outputShape[0]; - let xReshaped, wReshaped, matmulOutputShape; - const matmulInputs = []; - if (isChannelsLast) { + // STEP.1: transpose weight const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? context.compute( createTransposeProgramInfo(inputs[1], weightTransposeAttribute), @@ -209,73 +259,23 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut if (attributes.wIsConst && !context.kernelCustomData.wT) { context.kernelCustomData.wT = transposedWeight; } - if (sameSize) { - const sharedDim = inputHeight * inputWidth * inputChannels; - xReshaped = inputs[0].reshape([1, batch, sharedDim]); - wReshaped = transposedWeight.reshape([1, sharedDim, outChannels]); - matmulOutputShape = [1, batch, outChannels]; - } else { - xReshaped = inputs[0].reshape([batch, inputHeight * inputWidth, inputChannels]); - wReshaped = transposedWeight.reshape([1, inputChannels, outChannels]); - matmulOutputShape = [batch, outHeight * outWidth, outChannels]; - } - matmulInputs.push(xReshaped); - matmulInputs.push(wReshaped); - } else { - xReshaped = inputs[0].reshape([batch, inputChannels, inputHeight * inputWidth]); - wReshaped = inputs[1].reshape([1, outChannels, inputChannels]); - matmulOutputShape = [batch, outChannels, outHeight * outWidth]; - matmulInputs.push(wReshaped); - matmulInputs.push(xReshaped); - } - if (hasBias) { - matmulInputs.push(inputs[2]); - } - const N = matmulOutputShape[2]; - const K = matmulInputs[0].dims[matmulInputs[0].dims.length - 1]; - // Tune the threshold. - if (N < 8 && K < 8) { - context.compute( - createNaiveMatmulProgramInfo( - matmulInputs, adjustedAttributes, outputShape, matmulOutputShape, isChannelsLast), - {inputs: matmulInputs}); - } else { - context.compute( - createMatmulProgramInfo(matmulInputs, adjustedAttributes, outputShape, matmulOutputShape, isChannelsLast), - {inputs: matmulInputs}); - } - return; - } - // TODO: implement conv2dWithIm2Col() - - const sequentialAccessByThreads = /* backend.adapterInfo.isIntel() */ true; + // STEP.2: prepare reshaped inputs + const convInputs = [inputs[0], transposedWeight]; + if (hasBias) { + convInputs.push(inputs[2]); + } - // STEP.1: transpose weight - const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? + // STEP.3: compute matmul + const dimAOuter = isChannelsLast ? outHeight * outWidth : outChannels; + const dimBOuter = isChannelsLast ? outChannels : outHeight * outWidth; + const dimInner = weightHeight * weightWidth * inputChannels; context.compute( - createTransposeProgramInfo(inputs[1], weightTransposeAttribute), - {inputs: [1], outputs: [attributes.wIsConst ? -2 : -1]})[0]; - if (attributes.wIsConst && !context.kernelCustomData.wT) { - context.kernelCustomData.wT = transposedWeight; - } - - // STEP.2: prepare reshaped inputs - const convInputs = [inputs[0], transposedWeight]; - if (hasBias) { - convInputs.push(inputs[2]); - } - - // STEP.3: compute matmul - const dimAOuter = isChannelsLast ? 
outHeight * outWidth : outChannels; - const dimBOuter = isChannelsLast ? outChannels : outHeight * outWidth; - const dimInner = weightHeight * weightWidth * inputChannels; - context.compute( - createConv2DMatMulProgramInfo( - convInputs, adjustedAttributes, outputShape, dimAOuter, dimBOuter, dimInner, hasBias, - sequentialAccessByThreads), - {inputs: convInputs}); -}; + createConv2DMatMulProgramInfo( + convInputs, attributes, outputShape, dimAOuter, dimBOuter, dimInner, hasBias, sequentialAccessByThreads, + squeezeOutputShapeFunction), + {inputs: convInputs}); + }; const conv1d = (context: ComputeContext, attributes: ConvAttributes): void => { // extend the input to 2D by adding H dimension @@ -298,9 +298,10 @@ const conv1d = (context: ComputeContext, attributes: ConvAttributes): void => { const dilations = [1].concat(attributes.dilations); const kernelShape = [1].concat(attributes.kernelShape); const adjustedAttributes = getAdjustedConvAttributes({...attributes, pads, strides, dilations, kernelShape}, inputs); - context.compute(createGroupedConvProgramInfo( - inputs, adjustedAttributes, - outputShape => isChannelLast ? [outputShape[0], outputShape[2], outputShape[3]] : [])); + conv2d( + context, inputs, adjustedAttributes, + outputShape => isChannelLast ? [outputShape[0], outputShape[2], outputShape[3]] : + [outputShape[0], outputShape[1], outputShape[3]]); }; const conv3d = (context: ComputeContext, inputs: readonly TensorView[], attributes: ConvAttributes): void => { @@ -325,6 +326,7 @@ export const conv = (context: ComputeContext, attributes: ConvAttributes): void } else if (context.inputs[0].dims.length === 5) { conv3d(context, context.inputs, attributes); } else { - conv2d(context, context.inputs, attributes); + const adjustedAttributes = getAdjustedConvAttributes(attributes, context.inputs); + conv2d(context, context.inputs, adjustedAttributes); } }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts index 1a92d861002fb..2f90c731b4a44 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts @@ -12,8 +12,8 @@ import {appendActivationUniforms, appendActivationUniformsData, getActivationSni export const createNaiveMatmulProgramInfo = (inputs: readonly TensorView[], activationAttributes: InternalActivationAttributes, outputShape: readonly number[], - reshapedOutputShape?: readonly number[], - isChannelsLast = false /* only used for conv2dByMatMul*/): ProgramInfo => { + reshapedOutputShape?: readonly number[], isChannelsLast = false /* only used for conv2dByMatMul*/, + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[]): ProgramInfo => { const aShape = inputs[0].dims; const bShape = inputs[1].dims; @@ -143,7 +143,10 @@ export const createNaiveMatmulProgramInfo = inputDependencies: hasBias ? ['rank', 'rank', 'rank'] : ['rank', 'rank'] }, getRunData: () => ({ - outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + outputs: [{ + dims: squeezeOutputShapeFunction ? 
squeezeOutputShapeFunction(outputShape) : outputShape, + dataType: inputs[0].dataType + }], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, programUniforms }), diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc index cc10df5864233..fb8a2e0d8ac91 100644 --- a/js/web/test/data/ops/conv.jsonc +++ b/js/web/test/data/ops/conv.jsonc @@ -485,5 +485,72 @@ ] } ] + }, + { + "name": "conv 1D without bias addition A", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [{ "name": "kernel_shape", "data": [2], "type": "ints" }], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, 30], + "dims": [1, 1, 3], + "type": "float32" + }, + { + "data": [1, 2], + "dims": [1, 1, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [50, 80], + "dims": [1, 1, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Conv 1D with bias addition A", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [{ "name": "kernel_shape", "data": [2], "type": "ints" }], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, 30, 40], + "dims": [1, 2, 2], + "type": "float32" + }, + { + "data": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "dims": [4, 2, 2], + "type": "float32" + }, + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [100.1, 100.2, 100.3, 100.4], + "dims": [1, 4, 1], + "type": "float32" + } + ] + } + ] } ] From d4afd6a549410fcd40ae5f859260b4e60187dd9d Mon Sep 17 00:00:00 2001 From: Qin Jiajia Date: Fri, 9 Aug 2024 14:43:41 +0800 Subject: [PATCH 2/7] fix uniform variable mismatch error --- js/web/lib/wasm/jsep/webgpu/ops/transpose.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts index 8496173b1e8f8..62918c8e16326 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts @@ -83,13 +83,13 @@ export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: nu return { name: 'Transpose', shaderCache: {hint: `${permAttr}`, inputDependencies: ['rank']}, - getRunData: (inputs) => { + getRunData: () => { const outputSize = ShapeUtil.size(outputShape); return { - outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + outputs: [{dims: outputShape, dataType: inputTensor.dataType}], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, programUniforms: - [{type: DataType.uint32, data: outputSize}, ...createTensorShapeVariables(inputs[0].dims, outputShape)], + [{type: DataType.uint32, data: outputSize}, ...createTensorShapeVariables(inputTensor.dims, outputShape)], }; }, getShaderSource, From f9e00f98df219f1f3d37cc153e5b2a43d7bb58e8 Mon Sep 17 00:00:00 2001 From: Qin Jiajia Date: Mon, 12 Aug 2024 14:03:39 +0800 Subject: [PATCH 3/7] Split conv1d tests to separate file --- js/web/test/data/ops/conv.jsonc | 67 ------------------------------ js/web/test/data/ops/conv1d.jsonc | 69 +++++++++++++++++++++++++++++++ js/web/test/suite-test-list.jsonc | 1 + 3 files changed, 70 insertions(+), 67 deletions(-) create mode 100644 js/web/test/data/ops/conv1d.jsonc diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc index fb8a2e0d8ac91..cc10df5864233 100644 --- a/js/web/test/data/ops/conv.jsonc +++ 
b/js/web/test/data/ops/conv.jsonc @@ -485,72 +485,5 @@ ] } ] - }, - { - "name": "conv 1D without bias addition A", - "operator": "Conv", - "inputShapeDefinitions": "rankOnly", - "opset": { "domain": "", "version": 17 }, - "attributes": [{ "name": "kernel_shape", "data": [2], "type": "ints" }], - "cases": [ - { - "name": "T[0]", - "inputs": [ - { - "data": [10, 20, 30], - "dims": [1, 1, 3], - "type": "float32" - }, - { - "data": [1, 2], - "dims": [1, 1, 2], - "type": "float32" - } - ], - "outputs": [ - { - "data": [50, 80], - "dims": [1, 1, 2], - "type": "float32" - } - ] - } - ] - }, - { - "name": "Conv 1D with bias addition A", - "operator": "Conv", - "inputShapeDefinitions": "rankOnly", - "opset": { "domain": "", "version": 17 }, - "attributes": [{ "name": "kernel_shape", "data": [2], "type": "ints" }], - "cases": [ - { - "name": "T[0]", - "inputs": [ - { - "data": [10, 20, 30, 40], - "dims": [1, 2, 2], - "type": "float32" - }, - { - "data": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - "dims": [4, 2, 2], - "type": "float32" - }, - { - "data": [0.1, 0.2, 0.3, 0.4], - "dims": [4], - "type": "float32" - } - ], - "outputs": [ - { - "data": [100.1, 100.2, 100.3, 100.4], - "dims": [1, 4, 1], - "type": "float32" - } - ] - } - ] } ] diff --git a/js/web/test/data/ops/conv1d.jsonc b/js/web/test/data/ops/conv1d.jsonc new file mode 100644 index 0000000000000..a387f0de324a6 --- /dev/null +++ b/js/web/test/data/ops/conv1d.jsonc @@ -0,0 +1,69 @@ +[ + { + "name": "conv 1D without bias addition A", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [{ "name": "kernel_shape", "data": [2], "type": "ints" }], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, 30], + "dims": [1, 1, 3], + "type": "float32" + }, + { + "data": [1, 2], + "dims": [1, 1, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [50, 80], + "dims": [1, 1, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv 1D with bias addition A", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [{ "name": "kernel_shape", "data": [2], "type": "ints" }], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, 30, 40], + "dims": [1, 2, 2], + "type": "float32" + }, + { + "data": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "dims": [4, 2, 2], + "type": "float32" + }, + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [100.1, 100.2, 100.3, 100.4], + "dims": [1, 4, 1], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 4aaf9d16b2b0e..3071b9a4eb9af 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -1347,6 +1347,7 @@ "concat_zero-sized.jsonc", "cast.jsonc", "conv.jsonc", + "conv1d.jsonc", "conv3dncdhw.jsonc", "cos.jsonc", "div.jsonc", From 5ec24f58ab380e4cc3855b3f741b176867207f11 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 15 Aug 2024 13:58:50 -0700 Subject: [PATCH 4/7] format --- .../webgpu/ops/3rd-party/conv2d_mm_webgpu.ts | 16 ++++---- .../ops/3rd-party/matmul_packed_webgpu.ts | 24 ++++++------ .../lib/wasm/jsep/webgpu/ops/conv-grouped.ts | 6 +-- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 38 +++++++++---------- 4 files changed, 42 insertions(+), 42 deletions(-) diff --git 
a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts index c21eee78286b3..bfed99cc4f51d 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts @@ -48,7 +48,7 @@ const conv2dCommonSnippet = ( innerElementSizeX = 4, innerElementSizeW = 4, innerElementSize = 4, - dataType = 'f32' + dataType = 'f32', ): string => { const getXSnippet = (innerElementSize: number) => { switch (innerElementSize) { @@ -133,10 +133,10 @@ const conv2dCommonSnippet = ( } return ${typeSnippet(innerElementSizeX, dataType)}(0.0);` : fitInner && fitBOuter - ? ` + ? ` let col = colIn * ${innerElementSizeX}; ${readXSnippet}` - : ` + : ` let col = colIn * ${innerElementSizeX}; if (row < uniforms.dim_inner && col < uniforms.dim_b_outer) { ${readXSnippet} @@ -182,7 +182,7 @@ export const createConv2DMatMulProgramInfo = ( dimInner: number, hasBias: boolean, sequentialAccessByThreads: boolean, - squeezeOutputShapeFunction?: (shape: readonly number[]) => number[] + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): ProgramInfo => { const isChannelsLast = attributes.format === 'NHWC'; const inChannels = isChannelsLast ? inputs[0].dims[3] : inputs[0].dims[1]; @@ -258,7 +258,7 @@ export const createConv2DMatMulProgramInfo = ( 'x', inputs[0].dataType, inputs[0].dims.length, - innerElementSize === 3 ? 1 : innerElementSize + innerElementSize === 3 ? 1 : innerElementSize, ); const w = inputVariable('w', inputs[1].dataType, inputs[1].dims.length, components); const inputVariables = [x, w]; @@ -289,7 +289,7 @@ export const createConv2DMatMulProgramInfo = ( elementsSize[0], elementsSize[1], elementsSize[2], - t + t, )} ${conv2dCommonSnippet( isChannelsLast, @@ -301,7 +301,7 @@ export const createConv2DMatMulProgramInfo = ( elementsSize[0], elementsSize[1], elementsSize[2], - t + t, )} ${ isVec4 @@ -315,7 +315,7 @@ export const createConv2DMatMulProgramInfo = ( tileInner, false, undefined, - sequentialAccessByThreads + sequentialAccessByThreads, ) }`; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts index 7a0c8ce6b1a2f..f0287529ca08b 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts @@ -92,7 +92,7 @@ export const makeMatMulPackedVec4Source = ( transposeA = false, tileInner = 32, splitK = false, - splitedDimInner = 32 + splitedDimInner = 32, ): string => { const tileAOuter = workgroupSize[1] * workPerThread[1]; const tileBOuter = workgroupSize[0] * workPerThread[0]; @@ -212,7 +212,7 @@ export const makeMatMulPackedSource = ( tileInner = 32, splitK = false, splitedDimInner = 32, - sequentialAccessByThreads = false + sequentialAccessByThreads = false, ): string => { const tileAOuter = workPerThread[1] * workgroupSize[1]; const tileBOuter = workPerThread[0] * workgroupSize[0]; @@ -223,7 +223,7 @@ export const makeMatMulPackedSource = ( !(tileAHight % workgroupSize[1] === 0 && tileAWidth % workgroupSize[0] === 0 && tileInner % workgroupSize[1] === 0) ) { throw new Error( - `tileAHight ${tileAHight} must be divisible by workgroupSize[1]${workgroupSize[1]}, tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${workgroupSize[0]}, tileInner ${tileInner} must be divisible by workgroupSize[1]${workgroupSize[1]}` + `tileAHight ${tileAHight} must be divisible by 
workgroupSize[1]${workgroupSize[1]}, tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${workgroupSize[0]}, tileInner ${tileInner} must be divisible by workgroupSize[1]${workgroupSize[1]}`, ); } const rowPerThreadA = tileAHight / workgroupSize[1]; @@ -374,7 +374,7 @@ const matMulReadWriteFnSource = ( applyActivation: string, variables: IndicesHelper[], batchShapes: Array, - isChannelsLast = false + isChannelsLast = false, ): string => { const [batchAShape, batchBShape, batchShape] = batchShapes; const [batchVariable, aVariable, bVariable, outputVariable] = variables; @@ -411,9 +411,9 @@ const matMulReadWriteFnSource = ( }; const source = ` fn mm_readA(batch: i32, row: i32, colIn: i32, batchIndices: ${batchVariable.type.indices}) -> ${typeSnippet( - component, - dataType - )} { + component, + dataType, + )} { var value = ${typeSnippet(component, dataType)}(0.0); let col = colIn * ${component}; if(row < uniforms.dim_a_outer && col < uniforms.dim_inner) @@ -425,9 +425,9 @@ const matMulReadWriteFnSource = ( } fn mm_readB(batch: i32, row: i32, colIn: i32, batchIndices: ${batchVariable.type.indices}) -> ${typeSnippet( - component, - dataType - )} { + component, + dataType, + )} { var value = ${typeSnippet(component, dataType)}(0.0); let col = colIn * ${component}; if(row < uniforms.dim_inner && col < uniforms.dim_b_outer) @@ -462,7 +462,7 @@ export const createMatmulProgramInfo = ( outputShape: readonly number[], reshapedOutputShape?: readonly number[], isChannelsLast = false /* only used for conv2dByMatMul*/, - squeezeOutputShapeFunction?: (shape: readonly number[]) => number[] + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): ProgramInfo => { const aShape = inputs[0].dims; const bShape = inputs[1].dims; @@ -533,7 +533,7 @@ export const createMatmulProgramInfo = ( applyActivation, [batchDims, A, B, output], [outerDimsA, outerDimsB, outerDims], - isChannelsLast + isChannelsLast, ); return ` ${shaderHelper diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts index 17fbf71760ad6..1ad4149b01e08 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts @@ -25,7 +25,7 @@ import { appendActivationUniforms, appendActivationUniformsData, getActivationSn export const createGroupedConvProgramInfo = ( inputs: readonly TensorView[], attributes: ConvAttributes, - squeezeOutputShapeFunction?: (shape: readonly number[]) => number[] + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): ProgramInfo => { const hasBias = inputs.length > 2; const processBias = hasBias ? 
'value += b[output_channel];' : ''; @@ -40,7 +40,7 @@ export const createGroupedConvProgramInfo = ( attributes.dilations, attributes.pads, attributes.strides, - isChannelLast + isChannelLast, ); const outputSize = ShapeUtil.size(outputShape); @@ -145,7 +145,7 @@ export const createGroupedConvVectorizeProgramInfo = ( inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[], - squeezeOutputShapeFunction?: (shape: readonly number[]) => number[] + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): ProgramInfo => { const hasBias = inputs.length > 2; const components = getMaxComponents(outputShape[3]); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 5e8f6e245c93a..e64d253620186 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -20,7 +20,7 @@ export const calculateOutputShape = ( dilations: readonly number[], adjustPads: readonly number[], strides: readonly number[], - isChannelLast: boolean + isChannelLast: boolean, ): number[] => { const batchSize = inputShape[0]; const inputSpatialShape = inputShape.slice(isChannelLast ? 1 : 2, isChannelLast ? 3 : 4); @@ -30,7 +30,7 @@ export const calculateOutputShape = ( const dilatedKernelShape = kernelSpatialShape.map((v, i) => v + (v - 1) * (dilations[i] - 1)); const inputSpatialShapeWithPad = inputSpatialShape.map((v, i) => v + adjustPads[i] + adjustPads[i + spatialRank]); const outputShape = inputSpatialShapeWithPad.map((v, i) => - Math.floor((v - dilatedKernelShape[i] + strides[i]) / strides[i]) + Math.floor((v - dilatedKernelShape[i] + strides[i]) / strides[i]), ); outputShape.splice(0, 0, batchSize); outputShape.splice(isChannelLast ? 3 : 1, 0, outChannels); @@ -117,7 +117,7 @@ const getAdjustedConvAttributes = (attributes: T, inpu kernelShape, pads, attributes.format === 'NHWC', - attributes.autoPad + attributes.autoPad, ); // always return a new object so does not modify the original attributes @@ -156,7 +156,7 @@ const conv2d = ( context: ComputeContext, inputs: readonly TensorView[], attributes: ConvAttributes, - squeezeOutputShapeFunction?: (shape: readonly number[]) => number[] + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): void => { // check attributes @@ -181,7 +181,7 @@ const conv2d = ( attributes.dilations, attributes.pads, attributes.strides, - isChannelsLast + isChannelsLast, ); const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? @@ -198,7 +198,7 @@ const conv2d = ( } context.compute( createGroupedConvVectorizeProgramInfo(convInputs, attributes, outputShape, squeezeOutputShapeFunction), - { inputs: convInputs } + { inputs: convInputs }, ); } else { context.compute(createGroupedConvProgramInfo(inputs, attributes, squeezeOutputShapeFunction)); @@ -219,7 +219,7 @@ const conv2d = ( attributes.dilations, attributes.pads, attributes.strides, - isChannelsLast + isChannelsLast, ); const outHeight = outputShape[isChannelsLast ? 1 : 2]; const outWidth = outputShape[isChannelsLast ? 
2 : 3]; @@ -289,9 +289,9 @@ const conv2d = ( outputShape, matmulOutputShape, isChannelsLast, - squeezeOutputShapeFunction + squeezeOutputShapeFunction, ), - { inputs: matmulInputs } + { inputs: matmulInputs }, ); } else { context.compute( @@ -301,9 +301,9 @@ const conv2d = ( outputShape, matmulOutputShape, isChannelsLast, - squeezeOutputShapeFunction + squeezeOutputShapeFunction, ), - { inputs: matmulInputs } + { inputs: matmulInputs }, ); } return; @@ -344,9 +344,9 @@ const conv2d = ( dimInner, hasBias, sequentialAccessByThreads, - squeezeOutputShapeFunction + squeezeOutputShapeFunction, ), - { inputs: convInputs } + { inputs: convInputs }, ); }; @@ -359,7 +359,7 @@ const conv1d = (context: ComputeContext, attributes: ConvAttributes): void => { ? // [N, W, C] -> [N, H=1, W, C] [context.inputs[0].dims[0], 1, context.inputs[0].dims[1], context.inputs[0].dims[2]] : // [N, C, W] -> [N, C, H=1, W] - [context.inputs[0].dims[0], context.inputs[0].dims[1], 1, context.inputs[0].dims[2]] + [context.inputs[0].dims[0], context.inputs[0].dims[1], 1, context.inputs[0].dims[2]], ), //[FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, kW] -> [FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, kH=1, kW] context.inputs[1].reshape([context.inputs[1].dims[0], context.inputs[1].dims[1], 1, context.inputs[1].dims[2]]), @@ -373,10 +373,10 @@ const conv1d = (context: ComputeContext, attributes: ConvAttributes): void => { const kernelShape = [1].concat(attributes.kernelShape); const adjustedAttributes = getAdjustedConvAttributes( { ...attributes, pads, strides, dilations, kernelShape }, - inputs + inputs, ); conv2d(context, inputs, adjustedAttributes, (outputShape) => - isChannelLast ? [outputShape[0], outputShape[2], outputShape[3]] : [outputShape[0], outputShape[1], outputShape[3]] + isChannelLast ? 
[outputShape[0], outputShape[2], outputShape[3]] : [outputShape[0], outputShape[1], outputShape[3]], ); }; @@ -391,7 +391,7 @@ const conv3d = (context: ComputeContext, inputs: readonly TensorView[], attribut attributes.dilations as number | [number, number, number], pads as string | number[], false, - format + format, ); context.compute( createConv3DNaiveProgramInfo( @@ -400,8 +400,8 @@ const conv3d = (context: ComputeContext, inputs: readonly TensorView[], attribut convInfo.outShape, [convInfo.filterDepth, convInfo.filterHeight, convInfo.filterWidth], [convInfo.padInfo.front, convInfo.padInfo.top, convInfo.padInfo.left], - format - ) + format, + ), ); }; From a147ae0babea67f0a42ae070b0cbbd9812a776c9 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 15 Aug 2024 19:42:05 -0700 Subject: [PATCH 5/7] more formatting --- js/web/lib/wasm/jsep/webgpu/ops/matmul.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts index d2622361440d7..bd37474f38f71 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts @@ -32,7 +32,7 @@ export const createNaiveMatmulProgramInfo = ( outputShape: readonly number[], reshapedOutputShape?: readonly number[], isChannelsLast = false /* only used for conv2dByMatMul*/, - squeezeOutputShapeFunction?: (shape: readonly number[]) => number[] + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): ProgramInfo => { const aShape = inputs[0].dims; const bShape = inputs[1].dims; @@ -122,8 +122,8 @@ export const createNaiveMatmulProgramInfo = ( for (let j = 0; j < aComponents; j++) { calcStr += ` values[${i}] = fma(${b.type.value}(a_data${ - aComponents === 1 ? '' : `[${j}]` - }), b_data${j}, values[${i}]);\n`; + aComponents === 1 ? '' : `[${j}]` + }), b_data${j}, values[${i}]);\n`; } } return calcStr; From 549a8d38eb6084f9307333d2fbfdec5cc1cc35c4 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 15 Aug 2024 22:10:20 -0700 Subject: [PATCH 6/7] fix error caused by resolve --- .../jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts | 12 ------------ js/web/lib/wasm/jsep/webgpu/ops/matmul.ts | 4 +--- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts index bfed99cc4f51d..3ef5c943d5624 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts @@ -291,18 +291,6 @@ export const createConv2DMatMulProgramInfo = ( elementsSize[2], t, )} - ${conv2dCommonSnippet( - isChannelsLast, - fitAOuter, - fitBOuter, - fitInner, - hasBias, - attributes, - elementsSize[0], - elementsSize[1], - elementsSize[2], - t, - )} ${ isVec4 ? makeMatMulPackedVec4Source(elementsPerThread, workGroupSize, t, undefined, !isChannelsLast, tileInner) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts index bd37474f38f71..7605e67c972b9 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts @@ -121,9 +121,7 @@ export const createNaiveMatmulProgramInfo = ( for (let j = 0; j < aComponents; j++) { calcStr += ` - values[${i}] = fma(${b.type.value}(a_data${ - aComponents === 1 ? 
'' : `[${j}]` - }), b_data${j}, values[${i}]);\n`; + values[${i}] = fma(${b.type.value}(a_data${aComponents === 1 ? '' : `[${j}]`}), b_data${j}, values[${i}]);\n`; } } return calcStr; From b0e353931a44ddff442c16615599525ce0b66513 Mon Sep 17 00:00:00 2001 From: Qin Jiajia Date: Mon, 19 Aug 2024 16:05:28 +0800 Subject: [PATCH 7/7] revert some changes due to incorrect rebase --- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index e64d253620186..241aae8c46603 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -163,12 +163,13 @@ const conv2d = ( // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ const isChannelsLast = attributes.format === 'NHWC'; if (attributes.group !== 1) { - // Temporarily disable createGroupedConvVectorizeProgramInfo path due to bots failures with below two cases: + // NVIDIA GPU with ampere architecture fails with below 2 cases, but we couldn't repro them with any other + // GPUs. So just disable vectorize on NVIDIA ampere to ensure always correct outputs. // [webgpu]Conv - conv - vectorize group - B // [webgpu]Conv - conv - vectorize group - D - const disableGroupedConvVectorize = false; + const enableGroupedConvVectorize = !context.adapterInfo.isArchitecture('ampere'); if ( - !disableGroupedConvVectorize && + enableGroupedConvVectorize && isChannelsLast && inputs[1].dims[0] === attributes.group && inputs[1].dims[1] === 1 &&
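
Note on the approach in [PATCH 1/7]: conv1d is lifted to conv2d by inserting a unit H dimension. The input [N, C, W] becomes [N, C, 1, W] (or [N, W, C] becomes [N, 1, W, C] for NHWC), the weight [outC, inC, kW] becomes [outC, inC, 1, kW], pads become [0, pads[0], 0, pads[1]], and strides, dilations, and kernelShape each gain a leading 1. The unit dimension is then squeezed back out of the output via squeezeOutputShapeFunction, which is why that parameter is threaded through every program-info factory in the series. Routing conv1d through conv2d lets it reuse the vectorized grouped-conv and matmul kernels rather than always taking createGroupedConvProgramInfo. Below is a minimal shape-level TypeScript sketch of the lift; the conv1dAsConv2dShapes helper is illustrative only, not part of the onnxruntime-web API.

// Shape-only sketch of the conv1d -> conv2d lift in patch 1.
// Hypothetical helper for illustration; the real logic lives in conv.ts above.
type Dims = readonly number[];

function conv1dAsConv2dShapes(
  inputDims: Dims,   // NCW: [N, C, W]; for NHWC the 1-D layout is [N, W, C]
  weightDims: Dims,  // [outChannels, inChannels, kW]
  isChannelLast: boolean,
): { input2d: number[]; weight2d: number[]; squeeze: (o: Dims) => number[] } {
  const input2d = isChannelLast
    ? [inputDims[0], 1, inputDims[1], inputDims[2]]   // [N, W, C] -> [N, H=1, W, C]
    : [inputDims[0], inputDims[1], 1, inputDims[2]];  // [N, C, W] -> [N, C, H=1, W]
  const weight2d = [weightDims[0], weightDims[1], 1, weightDims[2]]; // kH = 1
  // After conv2d runs, drop the unit H dimension from the output shape,
  // mirroring the squeezeOutputShapeFunction that conv1d passes down:
  const squeeze = (o: Dims) => (isChannelLast ? [o[0], o[2], o[3]] : [o[0], o[1], o[3]]);
  return { input2d, weight2d, squeeze };
}

// First case in conv1d.jsonc: x [1, 1, 3], w [1, 1, 2], default NCW layout.
const { input2d, weight2d, squeeze } = conv1dAsConv2dShapes([1, 1, 3], [1, 1, 2], false);
console.log(input2d);               // [1, 1, 1, 3]
console.log(weight2d);              // [1, 1, 1, 2]
console.log(squeeze([1, 1, 1, 2])); // [1, 1, 2]; matches the expected output dims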