From b572e90c131dab20ae5c135f0f87ab78c9e26836 Mon Sep 17 00:00:00 2001 From: Qin Jiajia Date: Tue, 30 Jan 2024 17:32:27 +0800 Subject: [PATCH 1/7] Optimize conv1d by conv2d --- .../webgpu/ops/3rd-party/conv2d_mm_webgpu.ts | 8 +- .../ops/3rd-party/matmul_packed_webgpu.ts | 9 +- .../lib/wasm/jsep/webgpu/ops/conv-grouped.ts | 8 +- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 246 +++++++++--------- js/web/lib/wasm/jsep/webgpu/ops/matmul.ts | 9 +- js/web/test/data/ops/conv.jsonc | 67 +++++ 6 files changed, 215 insertions(+), 132 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts index 24006d393592a..1e44111cf61b4 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts @@ -157,7 +157,8 @@ const conv2dCommonSnippet = export const createConv2DMatMulProgramInfo = (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[], dimAOuter: number, - dimBOuter: number, dimInner: number, hasBias: boolean, sequentialAccessByThreads: boolean): ProgramInfo => { + dimBOuter: number, dimInner: number, hasBias: boolean, sequentialAccessByThreads: boolean, + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[]): ProgramInfo => { const isChannelsLast = attributes.format === 'NHWC'; const inChannels = isChannelsLast ? inputs[0].dims[3] : inputs[0].dims[1]; const batchSize = outputShape[0]; @@ -262,7 +263,10 @@ export const createConv2DMatMulProgramInfo = inputDependencies }, getRunData: () => ({ - outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + outputs: [{ + dims: squeezeOutputShapeFunction ? squeezeOutputShapeFunction(outputShape) : outputShape, + dataType: inputs[0].dataType + }], dispatchGroup: {x: dispatch[0], y: dispatch[1], z: dispatch[2]}, programUniforms, }), diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts index 9b37247167bab..21bcc5b5a925c 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts @@ -413,8 +413,8 @@ const matMulReadWriteFnSource = export const createMatmulProgramInfo = (inputs: readonly TensorView[], activationAttributes: InternalActivationAttributes, outputShape: readonly number[], - reshapedOutputShape?: readonly number[], - isChannelsLast = false /* only used for conv2dByMatMul*/): ProgramInfo => { + reshapedOutputShape?: readonly number[], isChannelsLast = false /* only used for conv2dByMatMul*/, + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[]): ProgramInfo => { const aShape = inputs[0].dims; const bShape = inputs[1].dims; const outerDimsA = aShape.slice(0, -2); @@ -494,7 +494,10 @@ export const createMatmulProgramInfo = inputDependencies }, getRunData: () => ({ - outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + outputs: [{ + dims: squeezeOutputShapeFunction ? 
squeezeOutputShapeFunction(outputShape) : outputShape, + dataType: inputs[0].dataType + }], dispatchGroup: {x: dispatch[0], y: dispatch[1], z: dispatch[2]}, programUniforms }), diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts index 924030125c420..aac1fcb5f4fff 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts @@ -118,7 +118,8 @@ export const createGroupedConvProgramInfo = }; export const createGroupedConvVectorizeProgramInfo = - (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[]): ProgramInfo => { + (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[], + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[]): ProgramInfo => { const hasBias = inputs.length > 2; const components = getMaxComponents(outputShape[3]); const outputNumber = getMaxComponents(outputShape[2]); @@ -207,7 +208,10 @@ export const createGroupedConvVectorizeProgramInfo = inputDependencies: hasBias ? ['rank', 'rank', 'type'] : ['rank', 'rank'] }, getRunData: () => ({ - outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + outputs: [{ + dims: squeezeOutputShapeFunction ? squeezeOutputShapeFunction(outputShape) : outputShape, + dataType: inputs[0].dataType + }], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, programUniforms }), diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 52bd69130e617..34c93f50dc158 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -140,68 +140,118 @@ export const parseConvAttributes = (attributes: Record): ConvAt }; }; -const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attributes: ConvAttributes): void => { - const adjustedAttributes = getAdjustedConvAttributes(attributes, inputs); +const conv2d = + (context: ComputeContext, inputs: readonly TensorView[], attributes: ConvAttributes, + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[]): void => { + // check attributes + + // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ + const isChannelsLast = attributes.format === 'NHWC'; + if (attributes.group !== 1) { + // Temporarily disable createGroupedConvVectorizeProgramInfo path due to bots failures with below two cases: + // [webgpu]Conv - conv - vectorize group - B + // [webgpu]Conv - conv - vectorize group - D + const disableGroupedConvVectorize = false; + if (!disableGroupedConvVectorize && isChannelsLast && inputs[1].dims[0] === attributes.group && + inputs[1].dims[1] === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1) { + const outputShape = calculateOutputShape( + inputs[0].dims, inputs[1].dims, attributes.dilations, attributes.pads, attributes.strides, + isChannelsLast); + const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? + context.compute( + createTransposeProgramInfo(inputs[1], weightTransposeAttribute), + {inputs: [1], outputs: [attributes.wIsConst ? 
-2 : -1]})[0]; + if (attributes.wIsConst && !context.kernelCustomData.wT) { + context.kernelCustomData.wT = transposedWeight; + } + const convInputs = [inputs[0], transposedWeight]; + if (inputs.length === 3) { + convInputs.push(inputs[2]); + } + context.compute( + createGroupedConvVectorizeProgramInfo(convInputs, attributes, outputShape, squeezeOutputShapeFunction), + {inputs: convInputs}); + } else { + context.compute(createGroupedConvProgramInfo(inputs, attributes, squeezeOutputShapeFunction)); + } + return; + } - // check attributes + const hasBias = inputs.length === 3; + const inputHeight = inputs[0].dims[isChannelsLast ? 1 : 2]; + const inputWidth = inputs[0].dims[isChannelsLast ? 2 : 3]; + const inputChannels = inputs[0].dims[isChannelsLast ? 3 : 1]; + const weightHeight = inputs[1].dims[2]; + const weightWidth = inputs[1].dims[3]; - // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ - const isChannelsLast = attributes.format === 'NHWC'; - if (attributes.group !== 1) { - // NVIDIA GPU with ampere architecture fails with below 2 cases, but we couldn't repro them with any other - // GPUs. So just disable vectorize on NVIDIA ampere to ensure always correct outputs. - // [webgpu]Conv - conv - vectorize group - B - // [webgpu]Conv - conv - vectorize group - D - const enableGroupedConvVectorize = !context.adapterInfo.isArchitecture('ampere'); - if (enableGroupedConvVectorize && isChannelsLast && inputs[1].dims[0] === attributes.group && - inputs[1].dims[1] === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1) { const outputShape = calculateOutputShape( - inputs[0].dims, inputs[1].dims, attributes.dilations, adjustedAttributes.pads, attributes.strides, - isChannelsLast); - const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? + inputs[0].dims, inputs[1].dims, attributes.dilations, attributes.pads, attributes.strides, isChannelsLast); + const outHeight = outputShape[isChannelsLast ? 1 : 2]; + const outWidth = outputShape[isChannelsLast ? 2 : 3]; + const outChannels = outputShape[isChannelsLast ? 3 : 1]; + + const sameSize = isChannelsLast && weightHeight === inputHeight && weightWidth === inputWidth && + attributes.pads[0] === 0 && attributes.pads[1] === 0; + if (sameSize || + (weightHeight === 1 && weightWidth === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1 && + attributes.strides[0] === 1 && attributes.strides[1] === 1 && attributes.pads[0] === 0 && + attributes.pads[1] === 0)) { + // conv2dByMatMul + const batch = outputShape[0]; + let xReshaped, wReshaped, matmulOutputShape; + const matmulInputs = []; + if (isChannelsLast) { + const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? + context.compute( + createTransposeProgramInfo(inputs[1], weightTransposeAttribute), + {inputs: [1], outputs: [attributes.wIsConst ? 
-2 : -1]})[0]; + if (attributes.wIsConst && !context.kernelCustomData.wT) { + context.kernelCustomData.wT = transposedWeight; + } + if (sameSize) { + const sharedDim = inputHeight * inputWidth * inputChannels; + xReshaped = inputs[0].reshape([1, batch, sharedDim]); + wReshaped = transposedWeight.reshape([1, sharedDim, outChannels]); + matmulOutputShape = [1, batch, outChannels]; + } else { + xReshaped = inputs[0].reshape([batch, inputHeight * inputWidth, inputChannels]); + wReshaped = transposedWeight.reshape([1, inputChannels, outChannels]); + matmulOutputShape = [batch, outHeight * outWidth, outChannels]; + } + matmulInputs.push(xReshaped); + matmulInputs.push(wReshaped); + } else { + xReshaped = inputs[0].reshape([batch, inputChannels, inputHeight * inputWidth]); + wReshaped = inputs[1].reshape([1, outChannels, inputChannels]); + matmulOutputShape = [batch, outChannels, outHeight * outWidth]; + matmulInputs.push(wReshaped); + matmulInputs.push(xReshaped); + } + if (hasBias) { + matmulInputs.push(inputs[2]); + } + const N = matmulOutputShape[2]; + const K = matmulInputs[0].dims[matmulInputs[0].dims.length - 1]; + // Tune the threshold. + if (N < 8 && K < 8) { context.compute( - createTransposeProgramInfo(inputs[1], weightTransposeAttribute), - {inputs: [1], outputs: [attributes.wIsConst ? -2 : -1]})[0]; - if (attributes.wIsConst && !context.kernelCustomData.wT) { - context.kernelCustomData.wT = transposedWeight; - } - const convInputs = [inputs[0], transposedWeight]; - if (inputs.length === 3) { - convInputs.push(inputs[2]); + createNaiveMatmulProgramInfo( + matmulInputs, attributes, outputShape, matmulOutputShape, isChannelsLast, squeezeOutputShapeFunction), + {inputs: matmulInputs}); + } else { + context.compute( + createMatmulProgramInfo( + matmulInputs, attributes, outputShape, matmulOutputShape, isChannelsLast, squeezeOutputShapeFunction), + {inputs: matmulInputs}); + } + return; } - context.compute( - createGroupedConvVectorizeProgramInfo(convInputs, adjustedAttributes, outputShape), {inputs: convInputs}); - } else { - context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes)); - } - return; - } - const hasBias = inputs.length === 3; - const inputHeight = inputs[0].dims[isChannelsLast ? 1 : 2]; - const inputWidth = inputs[0].dims[isChannelsLast ? 2 : 3]; - const inputChannels = inputs[0].dims[isChannelsLast ? 3 : 1]; - const weightHeight = inputs[1].dims[2]; - const weightWidth = inputs[1].dims[3]; + // TODO: implement conv2dWithIm2Col() - const outputShape = calculateOutputShape( - inputs[0].dims, inputs[1].dims, attributes.dilations, adjustedAttributes.pads, attributes.strides, - isChannelsLast); - const outHeight = outputShape[isChannelsLast ? 1 : 2]; - const outWidth = outputShape[isChannelsLast ? 2 : 3]; - const outChannels = outputShape[isChannelsLast ? 
3 : 1]; + const sequentialAccessByThreads = /* backend.adapterInfo.isIntel() */ true; - const sameSize = isChannelsLast && weightHeight === inputHeight && weightWidth === inputWidth && - attributes.pads[0] === 0 && attributes.pads[1] === 0; - if (sameSize || - (weightHeight === 1 && weightWidth === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1 && - attributes.strides[0] === 1 && attributes.strides[1] === 1 && attributes.pads[0] === 0 && - attributes.pads[1] === 0)) { - // conv2dByMatMul - const batch = outputShape[0]; - let xReshaped, wReshaped, matmulOutputShape; - const matmulInputs = []; - if (isChannelsLast) { + // STEP.1: transpose weight const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? context.compute( createTransposeProgramInfo(inputs[1], weightTransposeAttribute), @@ -209,73 +259,23 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut if (attributes.wIsConst && !context.kernelCustomData.wT) { context.kernelCustomData.wT = transposedWeight; } - if (sameSize) { - const sharedDim = inputHeight * inputWidth * inputChannels; - xReshaped = inputs[0].reshape([1, batch, sharedDim]); - wReshaped = transposedWeight.reshape([1, sharedDim, outChannels]); - matmulOutputShape = [1, batch, outChannels]; - } else { - xReshaped = inputs[0].reshape([batch, inputHeight * inputWidth, inputChannels]); - wReshaped = transposedWeight.reshape([1, inputChannels, outChannels]); - matmulOutputShape = [batch, outHeight * outWidth, outChannels]; - } - matmulInputs.push(xReshaped); - matmulInputs.push(wReshaped); - } else { - xReshaped = inputs[0].reshape([batch, inputChannels, inputHeight * inputWidth]); - wReshaped = inputs[1].reshape([1, outChannels, inputChannels]); - matmulOutputShape = [batch, outChannels, outHeight * outWidth]; - matmulInputs.push(wReshaped); - matmulInputs.push(xReshaped); - } - if (hasBias) { - matmulInputs.push(inputs[2]); - } - const N = matmulOutputShape[2]; - const K = matmulInputs[0].dims[matmulInputs[0].dims.length - 1]; - // Tune the threshold. - if (N < 8 && K < 8) { - context.compute( - createNaiveMatmulProgramInfo( - matmulInputs, adjustedAttributes, outputShape, matmulOutputShape, isChannelsLast), - {inputs: matmulInputs}); - } else { - context.compute( - createMatmulProgramInfo(matmulInputs, adjustedAttributes, outputShape, matmulOutputShape, isChannelsLast), - {inputs: matmulInputs}); - } - return; - } - // TODO: implement conv2dWithIm2Col() - - const sequentialAccessByThreads = /* backend.adapterInfo.isIntel() */ true; + // STEP.2: prepare reshaped inputs + const convInputs = [inputs[0], transposedWeight]; + if (hasBias) { + convInputs.push(inputs[2]); + } - // STEP.1: transpose weight - const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? + // STEP.3: compute matmul + const dimAOuter = isChannelsLast ? outHeight * outWidth : outChannels; + const dimBOuter = isChannelsLast ? outChannels : outHeight * outWidth; + const dimInner = weightHeight * weightWidth * inputChannels; context.compute( - createTransposeProgramInfo(inputs[1], weightTransposeAttribute), - {inputs: [1], outputs: [attributes.wIsConst ? -2 : -1]})[0]; - if (attributes.wIsConst && !context.kernelCustomData.wT) { - context.kernelCustomData.wT = transposedWeight; - } - - // STEP.2: prepare reshaped inputs - const convInputs = [inputs[0], transposedWeight]; - if (hasBias) { - convInputs.push(inputs[2]); - } - - // STEP.3: compute matmul - const dimAOuter = isChannelsLast ? 
outHeight * outWidth : outChannels; - const dimBOuter = isChannelsLast ? outChannels : outHeight * outWidth; - const dimInner = weightHeight * weightWidth * inputChannels; - context.compute( - createConv2DMatMulProgramInfo( - convInputs, adjustedAttributes, outputShape, dimAOuter, dimBOuter, dimInner, hasBias, - sequentialAccessByThreads), - {inputs: convInputs}); -}; + createConv2DMatMulProgramInfo( + convInputs, attributes, outputShape, dimAOuter, dimBOuter, dimInner, hasBias, sequentialAccessByThreads, + squeezeOutputShapeFunction), + {inputs: convInputs}); + }; const conv1d = (context: ComputeContext, attributes: ConvAttributes): void => { // extend the input to 2D by adding H dimension @@ -298,9 +298,10 @@ const conv1d = (context: ComputeContext, attributes: ConvAttributes): void => { const dilations = [1].concat(attributes.dilations); const kernelShape = [1].concat(attributes.kernelShape); const adjustedAttributes = getAdjustedConvAttributes({...attributes, pads, strides, dilations, kernelShape}, inputs); - context.compute(createGroupedConvProgramInfo( - inputs, adjustedAttributes, - outputShape => isChannelLast ? [outputShape[0], outputShape[2], outputShape[3]] : [])); + conv2d( + context, inputs, adjustedAttributes, + outputShape => isChannelLast ? [outputShape[0], outputShape[2], outputShape[3]] : + [outputShape[0], outputShape[1], outputShape[3]]); }; const conv3d = (context: ComputeContext, inputs: readonly TensorView[], attributes: ConvAttributes): void => { @@ -325,6 +326,7 @@ export const conv = (context: ComputeContext, attributes: ConvAttributes): void } else if (context.inputs[0].dims.length === 5) { conv3d(context, context.inputs, attributes); } else { - conv2d(context, context.inputs, attributes); + const adjustedAttributes = getAdjustedConvAttributes(attributes, context.inputs); + conv2d(context, context.inputs, adjustedAttributes); } }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts index 1a92d861002fb..2f90c731b4a44 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts @@ -12,8 +12,8 @@ import {appendActivationUniforms, appendActivationUniformsData, getActivationSni export const createNaiveMatmulProgramInfo = (inputs: readonly TensorView[], activationAttributes: InternalActivationAttributes, outputShape: readonly number[], - reshapedOutputShape?: readonly number[], - isChannelsLast = false /* only used for conv2dByMatMul*/): ProgramInfo => { + reshapedOutputShape?: readonly number[], isChannelsLast = false /* only used for conv2dByMatMul*/, + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[]): ProgramInfo => { const aShape = inputs[0].dims; const bShape = inputs[1].dims; @@ -143,7 +143,10 @@ export const createNaiveMatmulProgramInfo = inputDependencies: hasBias ? ['rank', 'rank', 'rank'] : ['rank', 'rank'] }, getRunData: () => ({ - outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + outputs: [{ + dims: squeezeOutputShapeFunction ? 
squeezeOutputShapeFunction(outputShape) : outputShape, + dataType: inputs[0].dataType + }], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, programUniforms }), diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc index cc10df5864233..fb8a2e0d8ac91 100644 --- a/js/web/test/data/ops/conv.jsonc +++ b/js/web/test/data/ops/conv.jsonc @@ -485,5 +485,72 @@ ] } ] + }, + { + "name": "conv 1D without bias addition A", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [{ "name": "kernel_shape", "data": [2], "type": "ints" }], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, 30], + "dims": [1, 1, 3], + "type": "float32" + }, + { + "data": [1, 2], + "dims": [1, 1, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [50, 80], + "dims": [1, 1, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Conv 1D with bias addition A", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [{ "name": "kernel_shape", "data": [2], "type": "ints" }], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, 30, 40], + "dims": [1, 2, 2], + "type": "float32" + }, + { + "data": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "dims": [4, 2, 2], + "type": "float32" + }, + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [100.1, 100.2, 100.3, 100.4], + "dims": [1, 4, 1], + "type": "float32" + } + ] + } + ] } ] From d4afd6a549410fcd40ae5f859260b4e60187dd9d Mon Sep 17 00:00:00 2001 From: Qin Jiajia Date: Fri, 9 Aug 2024 14:43:41 +0800 Subject: [PATCH 2/7] fix uniform variable mismatch error --- js/web/lib/wasm/jsep/webgpu/ops/transpose.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts index 8496173b1e8f8..62918c8e16326 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts @@ -83,13 +83,13 @@ export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: nu return { name: 'Transpose', shaderCache: {hint: `${permAttr}`, inputDependencies: ['rank']}, - getRunData: (inputs) => { + getRunData: () => { const outputSize = ShapeUtil.size(outputShape); return { - outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + outputs: [{dims: outputShape, dataType: inputTensor.dataType}], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, programUniforms: - [{type: DataType.uint32, data: outputSize}, ...createTensorShapeVariables(inputs[0].dims, outputShape)], + [{type: DataType.uint32, data: outputSize}, ...createTensorShapeVariables(inputTensor.dims, outputShape)], }; }, getShaderSource, From f9e00f98df219f1f3d37cc153e5b2a43d7bb58e8 Mon Sep 17 00:00:00 2001 From: Qin Jiajia Date: Mon, 12 Aug 2024 14:03:39 +0800 Subject: [PATCH 3/7] Split conv1d tests to separate file --- js/web/test/data/ops/conv.jsonc | 67 ------------------------------ js/web/test/data/ops/conv1d.jsonc | 69 +++++++++++++++++++++++++++++++ js/web/test/suite-test-list.jsonc | 1 + 3 files changed, 70 insertions(+), 67 deletions(-) create mode 100644 js/web/test/data/ops/conv1d.jsonc diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc index fb8a2e0d8ac91..cc10df5864233 100644 --- a/js/web/test/data/ops/conv.jsonc +++ 
b/js/web/test/data/ops/conv.jsonc @@ -485,72 +485,5 @@ ] } ] - }, - { - "name": "conv 1D without bias addition A", - "operator": "Conv", - "inputShapeDefinitions": "rankOnly", - "opset": { "domain": "", "version": 17 }, - "attributes": [{ "name": "kernel_shape", "data": [2], "type": "ints" }], - "cases": [ - { - "name": "T[0]", - "inputs": [ - { - "data": [10, 20, 30], - "dims": [1, 1, 3], - "type": "float32" - }, - { - "data": [1, 2], - "dims": [1, 1, 2], - "type": "float32" - } - ], - "outputs": [ - { - "data": [50, 80], - "dims": [1, 1, 2], - "type": "float32" - } - ] - } - ] - }, - { - "name": "Conv 1D with bias addition A", - "operator": "Conv", - "inputShapeDefinitions": "rankOnly", - "opset": { "domain": "", "version": 17 }, - "attributes": [{ "name": "kernel_shape", "data": [2], "type": "ints" }], - "cases": [ - { - "name": "T[0]", - "inputs": [ - { - "data": [10, 20, 30, 40], - "dims": [1, 2, 2], - "type": "float32" - }, - { - "data": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - "dims": [4, 2, 2], - "type": "float32" - }, - { - "data": [0.1, 0.2, 0.3, 0.4], - "dims": [4], - "type": "float32" - } - ], - "outputs": [ - { - "data": [100.1, 100.2, 100.3, 100.4], - "dims": [1, 4, 1], - "type": "float32" - } - ] - } - ] } ] diff --git a/js/web/test/data/ops/conv1d.jsonc b/js/web/test/data/ops/conv1d.jsonc new file mode 100644 index 0000000000000..a387f0de324a6 --- /dev/null +++ b/js/web/test/data/ops/conv1d.jsonc @@ -0,0 +1,69 @@ +[ + { + "name": "conv 1D without bias addition A", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [{ "name": "kernel_shape", "data": [2], "type": "ints" }], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, 30], + "dims": [1, 1, 3], + "type": "float32" + }, + { + "data": [1, 2], + "dims": [1, 1, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [50, 80], + "dims": [1, 1, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv 1D with bias addition A", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [{ "name": "kernel_shape", "data": [2], "type": "ints" }], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, 30, 40], + "dims": [1, 2, 2], + "type": "float32" + }, + { + "data": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "dims": [4, 2, 2], + "type": "float32" + }, + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [100.1, 100.2, 100.3, 100.4], + "dims": [1, 4, 1], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 4aaf9d16b2b0e..3071b9a4eb9af 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -1347,6 +1347,7 @@ "concat_zero-sized.jsonc", "cast.jsonc", "conv.jsonc", + "conv1d.jsonc", "conv3dncdhw.jsonc", "cos.jsonc", "div.jsonc", From 5ec24f58ab380e4cc3855b3f741b176867207f11 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 15 Aug 2024 13:58:50 -0700 Subject: [PATCH 4/7] format --- .../webgpu/ops/3rd-party/conv2d_mm_webgpu.ts | 16 ++++---- .../ops/3rd-party/matmul_packed_webgpu.ts | 24 ++++++------ .../lib/wasm/jsep/webgpu/ops/conv-grouped.ts | 6 +-- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 38 +++++++++---------- 4 files changed, 42 insertions(+), 42 deletions(-) diff --git 
a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts index c21eee78286b3..bfed99cc4f51d 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts @@ -48,7 +48,7 @@ const conv2dCommonSnippet = ( innerElementSizeX = 4, innerElementSizeW = 4, innerElementSize = 4, - dataType = 'f32' + dataType = 'f32', ): string => { const getXSnippet = (innerElementSize: number) => { switch (innerElementSize) { @@ -133,10 +133,10 @@ const conv2dCommonSnippet = ( } return ${typeSnippet(innerElementSizeX, dataType)}(0.0);` : fitInner && fitBOuter - ? ` + ? ` let col = colIn * ${innerElementSizeX}; ${readXSnippet}` - : ` + : ` let col = colIn * ${innerElementSizeX}; if (row < uniforms.dim_inner && col < uniforms.dim_b_outer) { ${readXSnippet} @@ -182,7 +182,7 @@ export const createConv2DMatMulProgramInfo = ( dimInner: number, hasBias: boolean, sequentialAccessByThreads: boolean, - squeezeOutputShapeFunction?: (shape: readonly number[]) => number[] + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): ProgramInfo => { const isChannelsLast = attributes.format === 'NHWC'; const inChannels = isChannelsLast ? inputs[0].dims[3] : inputs[0].dims[1]; @@ -258,7 +258,7 @@ export const createConv2DMatMulProgramInfo = ( 'x', inputs[0].dataType, inputs[0].dims.length, - innerElementSize === 3 ? 1 : innerElementSize + innerElementSize === 3 ? 1 : innerElementSize, ); const w = inputVariable('w', inputs[1].dataType, inputs[1].dims.length, components); const inputVariables = [x, w]; @@ -289,7 +289,7 @@ export const createConv2DMatMulProgramInfo = ( elementsSize[0], elementsSize[1], elementsSize[2], - t + t, )} ${conv2dCommonSnippet( isChannelsLast, @@ -301,7 +301,7 @@ export const createConv2DMatMulProgramInfo = ( elementsSize[0], elementsSize[1], elementsSize[2], - t + t, )} ${ isVec4 @@ -315,7 +315,7 @@ export const createConv2DMatMulProgramInfo = ( tileInner, false, undefined, - sequentialAccessByThreads + sequentialAccessByThreads, ) }`; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts index 7a0c8ce6b1a2f..f0287529ca08b 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts @@ -92,7 +92,7 @@ export const makeMatMulPackedVec4Source = ( transposeA = false, tileInner = 32, splitK = false, - splitedDimInner = 32 + splitedDimInner = 32, ): string => { const tileAOuter = workgroupSize[1] * workPerThread[1]; const tileBOuter = workgroupSize[0] * workPerThread[0]; @@ -212,7 +212,7 @@ export const makeMatMulPackedSource = ( tileInner = 32, splitK = false, splitedDimInner = 32, - sequentialAccessByThreads = false + sequentialAccessByThreads = false, ): string => { const tileAOuter = workPerThread[1] * workgroupSize[1]; const tileBOuter = workPerThread[0] * workgroupSize[0]; @@ -223,7 +223,7 @@ export const makeMatMulPackedSource = ( !(tileAHight % workgroupSize[1] === 0 && tileAWidth % workgroupSize[0] === 0 && tileInner % workgroupSize[1] === 0) ) { throw new Error( - `tileAHight ${tileAHight} must be divisible by workgroupSize[1]${workgroupSize[1]}, tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${workgroupSize[0]}, tileInner ${tileInner} must be divisible by workgroupSize[1]${workgroupSize[1]}` + `tileAHight ${tileAHight} must be divisible by 
workgroupSize[1]${workgroupSize[1]}, tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${workgroupSize[0]}, tileInner ${tileInner} must be divisible by workgroupSize[1]${workgroupSize[1]}`, ); } const rowPerThreadA = tileAHight / workgroupSize[1]; @@ -374,7 +374,7 @@ const matMulReadWriteFnSource = ( applyActivation: string, variables: IndicesHelper[], batchShapes: Array, - isChannelsLast = false + isChannelsLast = false, ): string => { const [batchAShape, batchBShape, batchShape] = batchShapes; const [batchVariable, aVariable, bVariable, outputVariable] = variables; @@ -411,9 +411,9 @@ const matMulReadWriteFnSource = ( }; const source = ` fn mm_readA(batch: i32, row: i32, colIn: i32, batchIndices: ${batchVariable.type.indices}) -> ${typeSnippet( - component, - dataType - )} { + component, + dataType, + )} { var value = ${typeSnippet(component, dataType)}(0.0); let col = colIn * ${component}; if(row < uniforms.dim_a_outer && col < uniforms.dim_inner) @@ -425,9 +425,9 @@ const matMulReadWriteFnSource = ( } fn mm_readB(batch: i32, row: i32, colIn: i32, batchIndices: ${batchVariable.type.indices}) -> ${typeSnippet( - component, - dataType - )} { + component, + dataType, + )} { var value = ${typeSnippet(component, dataType)}(0.0); let col = colIn * ${component}; if(row < uniforms.dim_inner && col < uniforms.dim_b_outer) @@ -462,7 +462,7 @@ export const createMatmulProgramInfo = ( outputShape: readonly number[], reshapedOutputShape?: readonly number[], isChannelsLast = false /* only used for conv2dByMatMul*/, - squeezeOutputShapeFunction?: (shape: readonly number[]) => number[] + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): ProgramInfo => { const aShape = inputs[0].dims; const bShape = inputs[1].dims; @@ -533,7 +533,7 @@ export const createMatmulProgramInfo = ( applyActivation, [batchDims, A, B, output], [outerDimsA, outerDimsB, outerDims], - isChannelsLast + isChannelsLast, ); return ` ${shaderHelper diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts index 17fbf71760ad6..1ad4149b01e08 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts @@ -25,7 +25,7 @@ import { appendActivationUniforms, appendActivationUniformsData, getActivationSn export const createGroupedConvProgramInfo = ( inputs: readonly TensorView[], attributes: ConvAttributes, - squeezeOutputShapeFunction?: (shape: readonly number[]) => number[] + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): ProgramInfo => { const hasBias = inputs.length > 2; const processBias = hasBias ? 
'value += b[output_channel];' : ''; @@ -40,7 +40,7 @@ export const createGroupedConvProgramInfo = ( attributes.dilations, attributes.pads, attributes.strides, - isChannelLast + isChannelLast, ); const outputSize = ShapeUtil.size(outputShape); @@ -145,7 +145,7 @@ export const createGroupedConvVectorizeProgramInfo = ( inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[], - squeezeOutputShapeFunction?: (shape: readonly number[]) => number[] + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): ProgramInfo => { const hasBias = inputs.length > 2; const components = getMaxComponents(outputShape[3]); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 5e8f6e245c93a..e64d253620186 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -20,7 +20,7 @@ export const calculateOutputShape = ( dilations: readonly number[], adjustPads: readonly number[], strides: readonly number[], - isChannelLast: boolean + isChannelLast: boolean, ): number[] => { const batchSize = inputShape[0]; const inputSpatialShape = inputShape.slice(isChannelLast ? 1 : 2, isChannelLast ? 3 : 4); @@ -30,7 +30,7 @@ export const calculateOutputShape = ( const dilatedKernelShape = kernelSpatialShape.map((v, i) => v + (v - 1) * (dilations[i] - 1)); const inputSpatialShapeWithPad = inputSpatialShape.map((v, i) => v + adjustPads[i] + adjustPads[i + spatialRank]); const outputShape = inputSpatialShapeWithPad.map((v, i) => - Math.floor((v - dilatedKernelShape[i] + strides[i]) / strides[i]) + Math.floor((v - dilatedKernelShape[i] + strides[i]) / strides[i]), ); outputShape.splice(0, 0, batchSize); outputShape.splice(isChannelLast ? 3 : 1, 0, outChannels); @@ -117,7 +117,7 @@ const getAdjustedConvAttributes = (attributes: T, inpu kernelShape, pads, attributes.format === 'NHWC', - attributes.autoPad + attributes.autoPad, ); // always return a new object so does not modify the original attributes @@ -156,7 +156,7 @@ const conv2d = ( context: ComputeContext, inputs: readonly TensorView[], attributes: ConvAttributes, - squeezeOutputShapeFunction?: (shape: readonly number[]) => number[] + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): void => { // check attributes @@ -181,7 +181,7 @@ const conv2d = ( attributes.dilations, attributes.pads, attributes.strides, - isChannelsLast + isChannelsLast, ); const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? @@ -198,7 +198,7 @@ const conv2d = ( } context.compute( createGroupedConvVectorizeProgramInfo(convInputs, attributes, outputShape, squeezeOutputShapeFunction), - { inputs: convInputs } + { inputs: convInputs }, ); } else { context.compute(createGroupedConvProgramInfo(inputs, attributes, squeezeOutputShapeFunction)); @@ -219,7 +219,7 @@ const conv2d = ( attributes.dilations, attributes.pads, attributes.strides, - isChannelsLast + isChannelsLast, ); const outHeight = outputShape[isChannelsLast ? 1 : 2]; const outWidth = outputShape[isChannelsLast ? 
2 : 3]; @@ -289,9 +289,9 @@ const conv2d = ( outputShape, matmulOutputShape, isChannelsLast, - squeezeOutputShapeFunction + squeezeOutputShapeFunction, ), - { inputs: matmulInputs } + { inputs: matmulInputs }, ); } else { context.compute( @@ -301,9 +301,9 @@ const conv2d = ( outputShape, matmulOutputShape, isChannelsLast, - squeezeOutputShapeFunction + squeezeOutputShapeFunction, ), - { inputs: matmulInputs } + { inputs: matmulInputs }, ); } return; @@ -344,9 +344,9 @@ const conv2d = ( dimInner, hasBias, sequentialAccessByThreads, - squeezeOutputShapeFunction + squeezeOutputShapeFunction, ), - { inputs: convInputs } + { inputs: convInputs }, ); }; @@ -359,7 +359,7 @@ const conv1d = (context: ComputeContext, attributes: ConvAttributes): void => { ? // [N, W, C] -> [N, H=1, W, C] [context.inputs[0].dims[0], 1, context.inputs[0].dims[1], context.inputs[0].dims[2]] : // [N, C, W] -> [N, C, H=1, W] - [context.inputs[0].dims[0], context.inputs[0].dims[1], 1, context.inputs[0].dims[2]] + [context.inputs[0].dims[0], context.inputs[0].dims[1], 1, context.inputs[0].dims[2]], ), //[FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, kW] -> [FILTER_OUT_CHANNEL, FILTER_IN_CHANNEL, kH=1, kW] context.inputs[1].reshape([context.inputs[1].dims[0], context.inputs[1].dims[1], 1, context.inputs[1].dims[2]]), @@ -373,10 +373,10 @@ const conv1d = (context: ComputeContext, attributes: ConvAttributes): void => { const kernelShape = [1].concat(attributes.kernelShape); const adjustedAttributes = getAdjustedConvAttributes( { ...attributes, pads, strides, dilations, kernelShape }, - inputs + inputs, ); conv2d(context, inputs, adjustedAttributes, (outputShape) => - isChannelLast ? [outputShape[0], outputShape[2], outputShape[3]] : [outputShape[0], outputShape[1], outputShape[3]] + isChannelLast ? 
[outputShape[0], outputShape[2], outputShape[3]] : [outputShape[0], outputShape[1], outputShape[3]], ); }; @@ -391,7 +391,7 @@ const conv3d = (context: ComputeContext, inputs: readonly TensorView[], attribut attributes.dilations as number | [number, number, number], pads as string | number[], false, - format + format, ); context.compute( createConv3DNaiveProgramInfo( @@ -400,8 +400,8 @@ const conv3d = (context: ComputeContext, inputs: readonly TensorView[], attribut convInfo.outShape, [convInfo.filterDepth, convInfo.filterHeight, convInfo.filterWidth], [convInfo.padInfo.front, convInfo.padInfo.top, convInfo.padInfo.left], - format - ) + format, + ), ); }; From a147ae0babea67f0a42ae070b0cbbd9812a776c9 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 15 Aug 2024 19:42:05 -0700 Subject: [PATCH 5/7] more formatting --- js/web/lib/wasm/jsep/webgpu/ops/matmul.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts index d2622361440d7..bd37474f38f71 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts @@ -32,7 +32,7 @@ export const createNaiveMatmulProgramInfo = ( outputShape: readonly number[], reshapedOutputShape?: readonly number[], isChannelsLast = false /* only used for conv2dByMatMul*/, - squeezeOutputShapeFunction?: (shape: readonly number[]) => number[] + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): ProgramInfo => { const aShape = inputs[0].dims; const bShape = inputs[1].dims; @@ -122,8 +122,8 @@ export const createNaiveMatmulProgramInfo = ( for (let j = 0; j < aComponents; j++) { calcStr += ` values[${i}] = fma(${b.type.value}(a_data${ - aComponents === 1 ? '' : `[${j}]` - }), b_data${j}, values[${i}]);\n`; + aComponents === 1 ? '' : `[${j}]` + }), b_data${j}, values[${i}]);\n`; } } return calcStr; From 549a8d38eb6084f9307333d2fbfdec5cc1cc35c4 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 15 Aug 2024 22:10:20 -0700 Subject: [PATCH 6/7] fix error caused by resolve --- .../jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts | 12 ------------ js/web/lib/wasm/jsep/webgpu/ops/matmul.ts | 4 +--- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts index bfed99cc4f51d..3ef5c943d5624 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts @@ -291,18 +291,6 @@ export const createConv2DMatMulProgramInfo = ( elementsSize[2], t, )} - ${conv2dCommonSnippet( - isChannelsLast, - fitAOuter, - fitBOuter, - fitInner, - hasBias, - attributes, - elementsSize[0], - elementsSize[1], - elementsSize[2], - t, - )} ${ isVec4 ? makeMatMulPackedVec4Source(elementsPerThread, workGroupSize, t, undefined, !isChannelsLast, tileInner) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts index bd37474f38f71..7605e67c972b9 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts @@ -121,9 +121,7 @@ export const createNaiveMatmulProgramInfo = ( for (let j = 0; j < aComponents; j++) { calcStr += ` - values[${i}] = fma(${b.type.value}(a_data${ - aComponents === 1 ? 
'' : `[${j}]` - }), b_data${j}, values[${i}]);\n`; + values[${i}] = fma(${b.type.value}(a_data${aComponents === 1 ? '' : `[${j}]`}), b_data${j}, values[${i}]);\n`; } } return calcStr; From b0e353931a44ddff442c16615599525ce0b66513 Mon Sep 17 00:00:00 2001 From: Qin Jiajia Date: Mon, 19 Aug 2024 16:05:28 +0800 Subject: [PATCH 7/7] revert some changes due to incorrect rebase --- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index e64d253620186..241aae8c46603 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -163,12 +163,13 @@ const conv2d = ( // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ const isChannelsLast = attributes.format === 'NHWC'; if (attributes.group !== 1) { - // Temporarily disable createGroupedConvVectorizeProgramInfo path due to bots failures with below two cases: + // NVIDIA GPU with ampere architecture fails with below 2 cases, but we couldn't repro them with any other + // GPUs. So just disable vectorize on NVIDIA ampere to ensure always correct outputs. // [webgpu]Conv - conv - vectorize group - B // [webgpu]Conv - conv - vectorize group - D - const disableGroupedConvVectorize = false; + const enableGroupedConvVectorize = !context.adapterInfo.isArchitecture('ampere'); if ( - !disableGroupedConvVectorize && + enableGroupedConvVectorize && isChannelsLast && inputs[1].dims[0] === attributes.group && inputs[1].dims[1] === 1 &&
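
Note on the approach in [PATCH 1/7]: conv1d is lifted to conv2d by inserting a unit H dimension. The input [N, C, W] becomes [N, C, 1, W] (or [N, W, C] becomes [N, 1, W, C] for NHWC), the weight [outC, inC, kW] becomes [outC, inC, 1, kW], pads become [0, pads[0], 0, pads[1]], and strides, dilations, and kernelShape each gain a leading 1. The unit dimension is then squeezed back out of the output via squeezeOutputShapeFunction, which is why that parameter is threaded through every program-info factory in the series. Routing conv1d through conv2d lets it reuse the vectorized grouped-conv and matmul kernels rather than always taking createGroupedConvProgramInfo. Below is a minimal shape-level TypeScript sketch of the lift; the conv1dAsConv2dShapes helper is illustrative only, not part of the onnxruntime-web API.

// Shape-only sketch of the conv1d -> conv2d lift in patch 1.
// Hypothetical helper for illustration; the real logic lives in conv.ts above.
type Dims = readonly number[];

function conv1dAsConv2dShapes(
  inputDims: Dims,   // NCW: [N, C, W]; for NHWC the 1-D layout is [N, W, C]
  weightDims: Dims,  // [outChannels, inChannels, kW]
  isChannelLast: boolean,
): { input2d: number[]; weight2d: number[]; squeeze: (o: Dims) => number[] } {
  const input2d = isChannelLast
    ? [inputDims[0], 1, inputDims[1], inputDims[2]]   // [N, W, C] -> [N, H=1, W, C]
    : [inputDims[0], inputDims[1], 1, inputDims[2]];  // [N, C, W] -> [N, C, H=1, W]
  const weight2d = [weightDims[0], weightDims[1], 1, weightDims[2]]; // kH = 1
  // After conv2d runs, drop the unit H dimension from the output shape,
  // mirroring the squeezeOutputShapeFunction that conv1d passes down:
  const squeeze = (o: Dims) => (isChannelLast ? [o[0], o[2], o[3]] : [o[0], o[1], o[3]]);
  return { input2d, weight2d, squeeze };
}

// First case in conv1d.jsonc: x [1, 1, 3], w [1, 1, 2], default NCW layout.
const { input2d, weight2d, squeeze } = conv1dAsConv2dShapes([1, 1, 3], [1, 1, 2], false);
console.log(input2d);               // [1, 1, 1, 3]
console.log(weight2d);              // [1, 1, 1, 2]
console.log(squeeze([1, 1, 1, 2])); // [1, 1, 2]; matches the expected output dims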