[js/webgpu] Optimize NCHW layout for InstanceNormalization (#18123)
### Description
The changes in this PR include:
1) Fix f16 errors in InstanceNormalization with NCHW format.
2) Use vec types to further optimize the original algorithm (see the sketch after this list).
3) (Removed) Don't do layout conversion for InstanceNormalization in JSEP, since
InstanceNormalization itself is suitable for the NCHW layout and has better
performance in our current implementation.
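
Regarding change 2, here is a rough TypeScript sketch of the vectorization idea. This is an assumed simplification, not the exact `getMaxComponents` helper in `ops/common.ts`: pick the widest vector width that evenly divides the per-instance element count, so each thread loops over packed elements instead of scalars.

```ts
// Assumed, simplified version of the component-selection logic: use the widest
// vector width that evenly divides the per-instance element count
// normSize = H * W.
function pickComponents(normSize: number): 1 | 2 | 4 {
  if (normSize % 4 === 0) return 4;
  if (normSize % 2 === 0) return 2;
  return 1;
}

// Example: for a 512x512 feature map, normSize = 262144, so components = 4 and
// each workgroup thread iterates over normSize / 4 packed (vec4) loads instead
// of normSize scalar loads.
```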

Tested on sd-vae-decoder-f16.onnx, the time drops from 314 ms to 285 ms. The
aggregate GPU profiling data is shown below (note: the data was collected with
change 3 applied):
Before:
Kernel | Time (ms) | Percentage (%)
-- | -- | --
Conv | 201.55 | 69.56
InstanceNormalization | 42.49 | 14.67
Transpose | 28.95 | 9.99
Mul | 5.69 | 1.96
Add | 3.82 | 1.32
MatMul | 3.27 | 1.13
Sigmoid | 2.24 | 0.77
Resize | 1.16 | 0.40
Softmax | 0.34 | 0.12
Cast | 0.24 | 0.08
Sum | 289.75 |

After:

Kernel | Time (ms) | Percentage (%)
-- | -- | --
Conv | 205.44 | 79.43
InstanceNormalization | 18.24 | 7.05
Transpose | 17.64 | 6.82
Mul | 5.69 | 2.20
Add | 3.81 | 1.47
MatMul | 3.56 | 1.38
Sigmoid | 2.24 | 0.86
Resize | 1.19 | 0.46
Softmax | 0.59 | 0.23
Cast | 0.24 | 0.09
Sum | 258.65 |  


From the tables above, we can see that the time of two ops is greatly reduced:
InstanceNormalization and Transpose. The Transpose time is reduced because each
InstanceNormalization is surrounded by two Reshape ops in sd-vae-decoder-f16.onnx.
Since JSEP prefers the NHWC layout and InstanceNormalization is a layout-sensitive
op, two extra Transpose ops were inserted dynamically when executing this model.
After this change, those inserted Transpose ops are no longer needed, so the
overall Transpose time is reduced.
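
To illustrate why InstanceNormalization fits NCHW well: each (n, c) pair normalizes over a contiguous block of H * W elements, so no layout shuffle is required. Below is a minimal CPU reference in TypeScript (plain arrays, a hypothetical helper for illustration only, not the JSEP/WebGPU kernel):

```ts
// Minimal reference for InstanceNormalization over an NCHW tensor.
// Each (n, c) pair reduces over a contiguous run of H * W elements,
// which is why the NCHW layout needs no transposes around this op.
function instanceNormNCHW(
    x: Float32Array, scale: Float32Array, bias: Float32Array,
    n: number, c: number, h: number, w: number, epsilon = 1e-5): Float32Array {
  const normSize = h * w;
  const out = new Float32Array(x.length);
  for (let ni = 0; ni < n; ni++) {
    for (let ci = 0; ci < c; ci++) {
      const base = (ni * c + ci) * normSize;  // contiguous block in NCHW
      let mean = 0;
      for (let i = 0; i < normSize; i++) mean += x[base + i];
      mean /= normSize;
      let sqSum = 0;
      for (let i = 0; i < normSize; i++) {
        const d = x[base + i] - mean;
        sqSum += d * d;
      }
      const invStdDev = 1 / Math.sqrt(sqSum / normSize + epsilon);
      const channelScale = invStdDev * scale[ci];
      const channelShift = bias[ci] - mean * channelScale;
      for (let i = 0; i < normSize; i++) {
        out[base + i] = x[base + i] * channelScale + channelShift;
      }
    }
  }
  return out;
}
```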
qjia7 authored Dec 15, 2023
1 parent 4bbed4c commit 8f7b89b
Showing 1 changed file with 23 additions and 19 deletions.
42 changes: 23 additions & 19 deletions js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
@@ -7,7 +7,7 @@ import {ShapeUtil} from '../../util';
import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
import {ComputeContext, ProgramInfo} from '../types';

-import {fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from './common';
+import {fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType} from './common';

export interface InstanceNormAttributes extends AttributeWithCacheKey {
epsilon: number;
@@ -26,22 +26,25 @@ const createInstanceNormProgramInfo =
const axis = 2;
const normCount = ShapeUtil.sizeToDimension(xShape, axis);
const normSize = ShapeUtil.sizeFromDimension(xShape, axis);
+const components = getMaxComponents(normSize);
+const normPackedSize = normSize / components;
const C = xShape[1];
-const x = inputVariable('x', inputs[0].dataType, [xShape[0], xShape[1], normSize]);
+const x = inputVariable('x', inputs[0].dataType, [xShape[0], xShape[1], normPackedSize], components);
const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims);
const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims);
-const output = outputVariable('output', inputs[0].dataType, [xShape[0], xShape[1], normSize]);
+const output = outputVariable('output', inputs[0].dataType, [xShape[0], xShape[1], normPackedSize], components);
const variables = [x, scale, bias, output];
const dataType = x.type.value;
+const f32Type = components === 1 ? 'f32' : `vec${components}<f32>`;
const workgroupSize = 64;
const getShaderSource = (shaderHelper: ShaderHelper) => `
const C: u32 = ${C};
const normSize: u32 = ${normSize};
const epsilon: f32 = ${attributes.epsilon};
-var<workgroup> meanShared : ${dataType};
-var<workgroup> squaredNormShared : ${dataType};
-var<workgroup> workgroupShared : array<${dataType}, ${workgroupSize}>;
+var<workgroup> meanShared : f32;
+var<workgroup> squaredNormShared : f32;
+var<workgroup> workgroupShared : array<${f32Type}, ${workgroupSize}>;
const workgroupSize = ${workgroupSize}u;
${shaderHelper.declareVariables(...variables)}
${shaderHelper.mainStart(workgroupSize)}
@@ -51,9 +54,9 @@ const createInstanceNormProgramInfo =
let localIndex = local_id.x;
// initialize workgroup memory
-var initial: ${dataType} = 0;
-for (var h = localIndex; h < normSize; h += workgroupSize) {
-initial = initial + ${x.get('batch', 'channel', 'h')};
+var initial = ${f32Type}(0);
+for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) {
+initial = initial + ${f32Type}(${x.get('batch', 'channel', 'h')});
}
workgroupShared[localIndex] = initial;
workgroupBarrier();
@@ -66,14 +69,14 @@ const createInstanceNormProgramInfo =
workgroupBarrier();
}
if (localIndex == 0) {
-meanShared = workgroupShared[0] / ${dataType}(normSize);
+meanShared = ${sumVector('workgroupShared[0]', components)} / f32(normSize);
}
workgroupBarrier();
// reinitialize workgroup memory.
-initial = 0;
-for (var h = localIndex; h < normSize; h += workgroupSize) {
-let deviation = ${x.get('batch', 'channel', 'h')} - meanShared;
+initial = ${f32Type}(0);
+for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) {
+let deviation = ${f32Type}(${x.get('batch', 'channel', 'h')}) - ${f32Type}(meanShared);
initial = initial + deviation * deviation;
}
workgroupShared[localIndex] = initial;
@@ -87,15 +90,16 @@ const createInstanceNormProgramInfo =
workgroupBarrier();
}
if (localIndex == 0) {
-squaredNormShared = workgroupShared[0];
+squaredNormShared = ${sumVector('workgroupShared[0]', components)};
}
workgroupBarrier();
-let invStdDev = 1 / sqrt(squaredNormShared / ${dataType}(normSize) + epsilon);
-let channelScale = invStdDev * ${scale.getByOffset('channel')};
-let channelShift = ${bias.getByOffset('channel')} - meanShared * channelScale;
-for (var h = localIndex; h < normSize; h += workgroupSize) {
-let value = ${x.get('batch', 'channel', 'h')} * channelScale + channelShift;
+let invStdDev = 1 / sqrt(squaredNormShared / f32(normSize) + epsilon);
+let channelScale = invStdDev * f32(${scale.getByOffset('channel')});
+let channelShift = f32(${bias.getByOffset('channel')}) - meanShared * channelScale;
+for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) {
+let value = ${x.get('batch', 'channel', 'h')} * ${dataType}(${f32Type}(channelScale)) + ${dataType}(${
+    f32Type}(channelShift));
${output.set('batch', 'channel', 'h', 'value')};
}
}`;
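For context on changes 1 and 2 in the diff above, here is a minimal CPU-side sketch (an assumption for illustration, not part of the PR) of the accumulation pattern the shader now uses: partial sums are kept in f32 even when the input is f16, held in a vec-sized accumulator, and the vector lanes are folded into a scalar at the end, which is the role of `sumVector` in the generated WGSL.

```ts
// Assumed, simplified CPU model of the vectorized reduction for one
// (batch, channel) pair. Accumulation happens in f32 (Float32Array) even when
// the source tensor is f16, mirroring the f32Type accumulator in the shader.
// Assumes values.length is divisible by components.
function meanWithVectorAccumulator(values: Float32Array, components: number): number {
  const acc = new Float32Array(components);  // mirrors the vec-sized accumulator
  for (let i = 0; i < values.length; i += components) {
    for (let c = 0; c < components; c++) {
      acc[c] += values[i + c];
    }
  }
  // Fold the vector lanes into a scalar, the job sumVector does in WGSL.
  let sum = 0;
  for (let c = 0; c < components; c++) {
    sum += acc[c];
  }
  return sum / values.length;
}
```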
