diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts index f3845e3110905..c054da51a3098 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts @@ -592,7 +592,8 @@ class ShaderHelperImpl implements ShaderHelper { const workgroupSizeZ = typeof workgroupSize === 'number' ? 1 : workgroupSize[2]; const is1DimensionDispatch = this.normalizedDispatchGroup[1] === 1 && this.normalizedDispatchGroup[2] === 1; - const paramList = is1DimensionDispatch ? '@builtin(global_invocation_id) global_id : vec3' : + const paramList = is1DimensionDispatch ? `@builtin(global_invocation_id) global_id : vec3, + @builtin(local_invocation_id) local_id : vec3` : `@builtin(local_invocation_index) local_index : u32, @builtin(workgroup_id) workgroup_id : vec3`; const globalIdxDefinition = is1DimensionDispatch ? diff --git a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts index f62c766aa9ed0..449073a133295 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts @@ -1,83 +1,97 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; -import {ShaderHelper, tensorTypeToWsglStorageType} from './common'; +import {inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from './common'; export interface InstanceNormAttributes extends AttributeWithCacheKey { epsilon: number; format: 'NHWC'|'NCHW'; } -const validateInputs = (inputs: readonly TensorView[]): void => { - if (!inputs || inputs.length !== 3) { - throw new Error('instanceNorm requires 3 inputs.'); - } - - if (inputs[0].dataType !== DataType.float || inputs[1].dataType !== DataType.float) { - throw new Error('inputs should be float type'); - } -}; - const createInstanceNormProgramInfo = (metadata: ProgramMetadata, inputs: readonly TensorView[], attributes: InstanceNormAttributes): ProgramInfo => { const xShape = inputs[0].dims; - const scale = inputs[1]; - const bias = inputs[2]; const outputShape = xShape; - const outputSize = ShapeUtil.size(outputShape); const axis = 2; const normCount = ShapeUtil.sizeToDimension(xShape, axis); const normSize = ShapeUtil.sizeFromDimension(xShape, axis); const C = xShape[1]; - - const scaleSize = ShapeUtil.size(scale.dims); - const biasSize = bias ? ShapeUtil.size(bias.dims) : 0; - if (scaleSize !== normSize || (bias && biasSize !== normSize)) { - throw new Error(`Size of X.shape()[axis:] == ${normSize}. - Size of scale and bias (if provided) must match this. - Got scale size of ${scaleSize} and bias size of ${biasSize}`); - } - - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); - + const x = inputVariable('x', inputs[0].dataType, [xShape[0], xShape[1], normSize]); + const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims); + const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims); + const output = outputVariable('output', inputs[0].dataType, [xShape[0], xShape[1], normSize]); + const variables = [x, scale, bias, output]; + const dataType = x.type.value; + const workgroupSize = 64; const getShaderSource = (shaderHelper: ShaderHelper) => ` + const C: u32 = ${C}; const normSize: u32 = ${normSize}; - const normSizeTyped: ${dataType} = ${normSize}; const epsilon: f32 = ${attributes.epsilon}; + var meanShared : ${dataType}; + var squaredNormShared : ${dataType}; + var workgroupShared : array<${dataType}, ${workgroupSize}>; + const workgroupSize = ${workgroupSize}u; + ${shaderHelper.declareVariables(...variables)} + ${shaderHelper.mainStart(workgroupSize)} + let norm = global_idx / workgroupSize; + let batch = norm / C; + let channel = norm % C; + let localIndex = local_id.x; + + // initialize workgroup memory + var initial: ${dataType} = 0; + for (var h = localIndex; h < normSize; h += workgroupSize) { + initial = initial + ${x.get('batch', 'channel', 'h')}; + } + workgroupShared[localIndex] = initial; + workgroupBarrier(); - @group(0) @binding(0) var x : array<${dataType}>; - @group(0) @binding(1) var scale : array<${dataType}>; - @group(0) @binding(2) var bias : array<${dataType}>; - @group(0) @binding(3) var output : array<${dataType}>; - - ${shaderHelper.mainStart()} - let offset = global_idx * normSize; - if (offset + normSize >= ${outputSize}) { return; } - var mean: ${dataType} = 0; + // Calculate the mean of current channel data. + for (var currSize = workgroupSize >> 1; currSize > 0; currSize = currSize >> 1) { + if (localIndex < currSize) { + workgroupShared[localIndex] = workgroupShared[localIndex] + workgroupShared[localIndex + currSize]; + } + workgroupBarrier(); + } + if (localIndex == 0) { + meanShared = workgroupShared[0] / ${dataType}(normSize); + } + workgroupBarrier(); - for (var h: u32 = 0u; h < normSize; h++) { - mean = mean + x[h + offset]; + // reinitialize workgroup memory. + initial = 0; + for (var h = localIndex; h < normSize; h += workgroupSize) { + let deviation = ${x.get('batch', 'channel', 'h')} - meanShared; + initial = initial + deviation * deviation; } - mean = mean / normSizeTyped; + workgroupShared[localIndex] = initial; + workgroupBarrier(); - var squaredNorm: ${dataType} = 0; - for (var h: u32 = 0u; h < normSize; h++) { - let deviation: f32 = x[h + offset] - mean; - squaredNorm = squaredNorm + deviation * deviation; + // Calculate the sum of square of deviation of current channel data. + for (var currSize = workgroupSize >> 1; currSize > 0; currSize = currSize >> 1) { + if (localIndex < currSize) { + workgroupShared[localIndex] = workgroupShared[localIndex] + workgroupShared[localIndex + currSize]; + } + workgroupBarrier(); } - let invStdDev = 1 / sqrt(squaredNorm / normSizeTyped + epsilon); - let channelScale = invStdDev * scale[global_idx % C]; - let channelShift = bias[global_idx % C] - mean * channelScale; - for (var j: u32 = 0; j < normSize; j++) { - output[j + offset] = x[j + offset] * channelScale + channelShift; + if (localIndex == 0) { + squaredNormShared = workgroupShared[0]; + } + workgroupBarrier(); + + let invStdDev = 1 / sqrt(squaredNormShared / ${dataType}(normSize) + epsilon); + let channelScale = invStdDev * ${scale.getByOffset('channel')}; + let channelShift = ${bias.getByOffset('channel')} - meanShared * channelScale; + for (var h = localIndex; h < normSize; h += workgroupSize) { + let value = ${x.get('batch', 'channel', 'h')} * channelScale + channelShift; + ${output.set('batch', 'channel', 'h', 'value')}; } }`; return { @@ -86,7 +100,7 @@ const createInstanceNormProgramInfo = {dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}, ], getShaderSource, - dispatchGroup: () => ({x: Math.ceil(normCount / 64 /* workgroup size */)}) + dispatchGroup: () => ({x: normCount}) }; }; @@ -118,7 +132,7 @@ const createInstanceNormNHWCProgramInfo = ${shaderHelper.mainStart()} let currentImageNumber = global_idx / C; let currentChannelNumber = global_idx % C; - + // offset is channel num * N let offset = currentImageNumber * imageSize; if (offset >= ${outputSize}) { return; } @@ -156,8 +170,6 @@ export const parseInstanceNormAttributes = (attributes: InstanceNormAttributes): createAttributeWithCacheKey({epsilon: attributes.epsilon, format: attributes.format}); export const instanceNorm = (context: ComputeContext, attributes: InstanceNormAttributes): void => { - validateInputs(context.inputs); - const metadata = { name: 'InstanceNormalization', inputTypes: [GpuDataType.default, GpuDataType.default, GpuDataType.default], diff --git a/js/web/test/data/ops/instance-norm.jsonc b/js/web/test/data/ops/instance-norm.jsonc new file mode 100644 index 0000000000000..6a4e6912405ee --- /dev/null +++ b/js/web/test/data/ops/instance-norm.jsonc @@ -0,0 +1,79 @@ +[ + { + "name": "Simple test with NHWC", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4], + "dims": [1, 4, 2, 2], + "type": "float32" + }, + { + "data": [1, 2, 3, 4], + "dims": [4], + "type": "float32" + }, + { + "data": [4, 5, 6, 7], + "dims": [4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.6583645343780518, 3.552788257598877, 4.447211742401123, 5.341635704040527, 2.3167295455932617, + 4.105576515197754, 5.8944244384765625, 7.683271408081055, 6, 10.242595672607422, 6, 1.7574005126953125, + 12.36654281616211, 8.788846969604492, 5.211153030395508, 1.633458137512207 + ], + "dims": [1, 4, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Simple test with NCHW", + "operator": "InstanceNormalization", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4], + "dims": [1, 4, 2, 2], + "type": "float32" + }, + { + "data": [1, 2, 3, 4], + "dims": [4], + "type": "float32" + }, + { + "data": [4, 5, 6, 7], + "dims": [4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.6583645343780518, 3.552788257598877, 4.447211742401123, 5.341635704040527, 2.3167295455932617, + 4.105576515197754, 5.8944244384765625, 7.683271408081055, 6, 10.242595672607422, 6, 1.7574005126953125, + 12.36654281616211, 8.788846969604492, 5.211153030395508, 1.633458137512207 + ], + "dims": [1, 4, 2, 2], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index e580259071968..94592884ccad6 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -257,6 +257,7 @@ "greater.jsonc", //"identity.jsonc", "image-scaler.jsonc", + "instance-norm.jsonc", "less.jsonc", "log.jsonc", "matmul.jsonc", @@ -1347,6 +1348,7 @@ "gemm.jsonc", "global-average-pool.jsonc", "greater.jsonc", + "instance-norm.jsonc", "less.jsonc", "log.jsonc", "matmul.jsonc",