Skip to content

Commit

Permalink
Merge branch 'main' into adrianl/qnn-support-gatherelems-gathernd
Browse files Browse the repository at this point in the history
  • Loading branch information
adrianlizarraga committed Aug 12, 2024
2 parents 1d327fd + 154084e commit d5264c6
Show file tree
Hide file tree
Showing 28 changed files with 878 additions and 51 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ struct OrtTensorRTProviderOptionsV2 {
// can be updated using: UpdateTensorRTProviderOptionsWithValue
int trt_max_partition_iterations{1000}; // maximum iterations for TensorRT parser to get capability
int trt_min_subgraph_size{1}; // minimum size of TensorRT subgraphs
size_t trt_max_workspace_size{1 << 30}; // maximum workspace size for TensorRT.
size_t trt_max_workspace_size{0}; // maximum workspace size for TensorRT. Default is 0 means max device memory size
int trt_fp16_enable{0}; // enable TensorRT FP16 precision. Default 0 = false, nonzero = true
int trt_int8_enable{0}; // enable TensorRT INT8 precision. Default 0 = false, nonzero = true
const char* trt_int8_calibration_table_name{nullptr}; // TensorRT INT8 calibration table name.
Expand Down
1 change: 1 addition & 0 deletions js/web/docs/webgpu-operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ Do not modify directly.*
| Cosh | ai.onnx(9+) | |
| CumSum | ai.onnx(11-13,14+) | |
| DepthToSpace | ai.onnx(11-12,13+); com.ms.internal.nhwc(11-12,13+) | |
| DequantizeLinear | ai.onnx(10-12,13-18,19-20,21+) | |
| Div | ai.onnx(7-12,13,14+) | |
| Einsum | ai.onnx(12+) | |
| Elu | ai.onnx(6+) | |
Expand Down
2 changes: 2 additions & 0 deletions js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import {matMulNBits, parseMatMulNBitsAttributes} from './ops/matmulnbits';
import {multiHeadAttention, parseMultiHeadAttentionAttributes} from './ops/multihead-attention';
import {pad} from './ops/pad';
import * as pool from './ops/pool';
import {dequantizeLinear, parseDequantizeLinearAttributes} from './ops/quantize-linear';
import {range} from './ops/range';
import {reduceL1, reduceL2, reduceLogSum, reduceLogSumExp, reduceMax, reduceMean, reduceMin, reduceProd, reduceSum, reduceSumSquare} from './ops/reduce';
import {parseResizeAttributes, resize} from './ops/resize';
Expand Down Expand Up @@ -71,6 +72,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new
['Cosh', [unaryOps.cosh]],
['CumSum', [cumsum, parseCumSumAttributes]],
['DepthToSpace', [depthToSpace, parseDepthToSpaceAttributes]],
['DequantizeLinear', [dequantizeLinear, parseDequantizeLinearAttributes]],
['Div', [binaryOps.div]],
['Einsum', [einsum, parseEinsumAttributes]],
['Elu', [unaryOps.elu, unaryOps.parseAlphaAttributes]],
Expand Down
219 changes: 219 additions & 0 deletions js/web/lib/wasm/jsep/webgpu/ops/quantize-linear.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

import {DataType} from '../../../wasm-common';
import {TensorView} from '../../tensor-view';
import {ShapeUtil} from '../../util';
import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
import {ComputeContext, ProgramInfo, ProgramUniform} from '../types';

import {createTensorShapeVariables, getMaxComponents, inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from './common';

export interface DequantizeLinerAttributes extends AttributeWithCacheKey {
axis: number;
blockSize: number;
}

const validateInputs = (inputs: readonly TensorView[], attributes: DequantizeLinerAttributes): void => {
if (inputs.length < 2 || inputs.length > 3) {
throw new Error('DequantizeLinear requires 2 or 3 inputs.');
}
if (inputs.length === 3 && inputs[1].dims === inputs[2].dims) {
throw new Error('x-scale and x-zero-point must have the same shape.');
}
if (inputs.length === 3 && inputs[0].dataType !== inputs[2].dataType) {
throw new Error('x and x-zero-point must have the same data type.');
}
if (inputs[0].dataType === DataType.int32 && inputs.length > 2) {
throw new Error('In the case of dequantizing int32 there is no zero point.');
}
if (inputs[1].dims.length !== 0 && inputs[1].dims.length !== 1 && inputs[1].dims.length !== inputs[0].dims.length) {
throw new Error('scale input must be a scalar, a 1D tensor, or have the same rank as the input tensor.');
}
// validate scale and zero-point input shapes
if (inputs.length > 2) {
// zero-point input type should be the same as input data type.
if (inputs[0].dataType !== inputs[2].dataType) {
throw new Error('x and x-zero-point must have the same data type.');
}
// Scale and zero-point inputs must have the same shape
if (inputs[1].dims.length !== inputs[2].dims.length) {
throw new Error('scale and zero-point inputs must have the same rank.');
}
if (!inputs[1].dims.map((d, i) => d === inputs[2].dims[i]).reduce((a, b) => a && b, true)) {
throw new Error('scale and zero-point inputs must have the same shape.');
}
}
// Validate blockSize
if (attributes.blockSize > 0) {
// Block qunatization
if (inputs[1].dims.length === 0 || (inputs[1].dims.length === 1 && inputs[1].dims[0] === 1)) {
throw new Error('blockSize must be set only for block quantization.');
}
if (!inputs[1]
.dims.map((d, i) => i === attributes.axis || d === inputs[0].dims[i])
.reduce((a, b) => a && b, true)) {
throw new Error('For block qunatization, scale input shape to match the input shape except for the axis');
}
// Scale input rank should be same as the input rank
if (inputs[1].dims.length !== inputs[0].dims.length) {
throw new Error('For block qunatization the scale input rank must be the same as the x rank.');
}
const dI = inputs[0].dims[attributes.axis];
const si = inputs[1].dims[attributes.axis];
if (attributes.blockSize < Math.ceil(dI / si) || attributes.blockSize > Math.ceil(dI / (si - 1) - 1)) {
throw new Error('blockSize must be with in the range [ceil(dI / Si), ceil(dI / (Si - 1) - 1)].');
}
}
};

const createDequantizeLinearProgramInfo =
(inputs: readonly TensorView[], attributes: DequantizeLinerAttributes): ProgramInfo => {
const axis = ShapeUtil.normalizeAxis(attributes.axis, inputs[0].dims.length);
const inputType = inputs[0].dataType;
const isSigned = inputType === DataType.int8;
const outputShape = inputs[0].dims; // output shape is same as the input shape
const dataType = inputs[1].dataType; // output type is same as the the scale input type
const outputSize = ShapeUtil.size(outputShape);
const isPacked = inputType === DataType.int8 || inputType === DataType.uint8;
const inputShape = isPacked ? [Math.ceil(ShapeUtil.size(inputs[0].dims) / 4)] : inputs[0].dims;
const scaleShape = inputs[1].dims;
const zeroPointInput = inputs.length > 2 ? inputs[2] : undefined;
const zeroPointShape = zeroPointInput ?
(isPacked ? [Math.ceil(ShapeUtil.size(zeroPointInput.dims) / 4)] : zeroPointInput.dims) :
undefined;
// Scales input is a scaler for per-tensor/per-layer quantization, 1-D tensor for per-axis quantization
// or tensor with same rank as input for blocked quantization.
const perLayerQuantization = scaleShape.length === 0 || (scaleShape.length === 1 && scaleShape[0] === 1);
const perAxisQuantization = perLayerQuantization === false && scaleShape.length === 1;
// Left unnecessary commented-out assignment for documentation
// const blockQuantization = perLayerQuantization === false && perAxisQuantization === false;
const maxComponents = getMaxComponents(outputSize);
const useComponents = perLayerQuantization && (!isPacked || maxComponents === 4);
const components = useComponents ? maxComponents : 1;
const inputComponent = (useComponents && !isPacked) ? maxComponents : 1;
const input = inputVariable('input', isPacked ? DataType.uint32 : inputType, inputShape.length, inputComponent);
const scale = inputVariable('scale', dataType, scaleShape.length);
const zeroPoint = zeroPointInput ?
inputVariable('zero_point', isPacked ? DataType.uint32 : inputType, zeroPointShape!.length) :
undefined;
const output = outputVariable('output', dataType, outputShape.length, components);
const inputVariables = [input, scale];
if (zeroPoint) {
inputVariables.push(zeroPoint);
}
const inputShapes = [inputShape, scaleShape];
if (zeroPointInput) {
inputShapes.push(zeroPointShape!);
}
const programUniforms: ProgramUniform[] = [
{type: DataType.uint32, data: outputSize / components}, {type: DataType.uint32, data: axis},
{type: DataType.uint32, data: attributes.blockSize}, ...createTensorShapeVariables(...inputShapes, outputShape)
];
const getShaderSource = (shaderHelper: ShaderHelper) => {
const uniforms: UniformsArrayType =
[{name: 'output_size', type: 'u32'}, {name: 'axis', type: 'u32'}, {name: 'block_size', type: 'u32'}];
return `
${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)}
${shaderHelper.mainStart()}
${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
let output_indices = ${output.offsetToIndices('global_idx')};
// Set input x
${(() => {
if (isPacked) {
return `
let input = ${input.getByOffset('global_idx / 4')};
let x_vec = ${isSigned ? 'unpack4xI8(input)' : 'unpack4xU8(input)'};
let x_value = ${components === 1 ? 'x_vec[global_idx % 4]' : 'x_vec'};`;
} else {
return `let x_value = ${input.getByOffset('global_idx')};`;
}
})()};
// Set scale input
${(() => {
if (perLayerQuantization) {
// scale input is a scalar ()
return `let scale_value= ${scale.getByOffset('0')}`;
} else if (perAxisQuantization) {
// scale input is a 1D tensor
return `
let scale_index = ${output.indicesGet('output_indices', 'uniforms.axis')};
let scale_value= ${scale.getByOffset('scale_index')};`;
} else {
// Block quantization. Scale input rank is same as input/output rank.
return `
var scale_indices: ${scale.type.indices} = output_indices;
let index = ${scale.indicesGet('scale_indices', 'uniforms.axis')} / uniforms.block_size;
${scale.indicesSet('scale_indices', 'uniforms.axis', 'index')};
let scale_value= ${scale.getByIndices('scale_indices')};`;
}
})()};
// Set zero-point input
${(() => {
if (zeroPoint) {
if (perLayerQuantization) {
// zero-point input is a scalar
if (isPacked) {
return `
let zero_point_input = ${zeroPoint.getByOffset('0')};
let zero_point_vec = ${isSigned ? 'unpack4xI8(zero_point_input)' : 'unpack4xU8(zero_point_input)'};
let zero_point_value= zero_point_vec[0]`;
} else {
return `let zero_point_value = ${zeroPoint.getByOffset('0')}`;
}
} else if (perAxisQuantization) {
// zero-point input is a 1D tensor
if (isPacked) {
return `
let zero_point_index = ${output.indicesGet('output_indices', 'uniforms.axis')};
let zero_point_input = ${zeroPoint.getByOffset('zero_point_index / 4')};
let zero_point_vec = ${isSigned ? 'unpack4xI8(zero_point_input)' : 'unpack4xU8(zero_point_input)'};
let zero_point_value = zero_point_vec[zero_point_index % 4]`;
} else {
return `
let zero_point_index = ${output.indicesGet('output_indices', 'uniforms.axis')};
let zero_point_value = ${zeroPoint.getByOffset('zero_point_index')};`;
}
} else {
// BlockedQuantization. The zero-point input shape is same as the input shape except along axis.
if (isPacked) {
return `
let zero_point_offset = ${scale.indicesToOffset('scale_indices')};
let zero_point_input = ${zeroPoint.getByOffset('zero_point_offset / 4')};
let zero_point_vec = ${isSigned ? 'unpack4xI8(zero_point_input)' : 'unpack4xU8(zero_point_input)'};
let zero_point_value = zero_point_vec[zero_point_offset % 4];`;
} else {
return `let zero_point_value = ${zeroPoint.getByIndices('scale_indices')};`;
}
}
} else {
return `let zero_point_value = ${isPacked ? (isSigned ? 'i32' : 'u32') : input.type.value}(0);`;
}
})()};
// Compute and write output
${output.setByOffset('global_idx', `${output.type.value}(x_value - zero_point_value) * scale_value`)};
}`;
};
return {
name: 'DequantizeLinear',
shaderCache:
{hint: attributes.cacheKey, inputDependencies: zeroPoint ? ['rank', 'rank', 'rank'] : ['rank', 'rank']},
getShaderSource,
getRunData: () => ({
outputs: [{dims: outputShape, dataType}],
dispatchGroup: {x: Math.ceil(outputSize / components / 64), y: 1, z: 1},
programUniforms
})
};
};

export const dequantizeLinear = (context: ComputeContext, attributes: DequantizeLinerAttributes): void => {
validateInputs(context.inputs, attributes);
context.compute(createDequantizeLinearProgramInfo(context.inputs, attributes));
};

export const parseDequantizeLinearAttributes = (attributes: Record<string, unknown>): DequantizeLinerAttributes =>
createAttributeWithCacheKey({axis: attributes.axis as number, blockSize: attributes.blockSize as number});
Loading

0 comments on commit d5264c6

Please sign in to comment.