Merge branch 'main' into adrianl/qnn-support-gatherelems-gathernd

microsoft · Aug 12, 2024 · d5264c6 · d5264c6
2 parents 1d327fd + 154084e
commit d5264c6
Show file tree

Hide file tree

Showing 28 changed files with 878 additions and 51 deletions.
diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
@@ -19,7 +19,7 @@ struct OrtTensorRTProviderOptionsV2 {
                                                          // can be updated using: UpdateTensorRTProviderOptionsWithValue
   int trt_max_partition_iterations{1000};                // maximum iterations for TensorRT parser to get capability
   int trt_min_subgraph_size{1};                          // minimum size of TensorRT subgraphs
-  size_t trt_max_workspace_size{1 << 30};                // maximum workspace size for TensorRT.
+  size_t trt_max_workspace_size{0};                      // maximum workspace size for TensorRT. Default is 0 means max device memory size
   int trt_fp16_enable{0};                                // enable TensorRT FP16 precision. Default 0 = false, nonzero = true
   int trt_int8_enable{0};                                // enable TensorRT INT8 precision. Default 0 = false, nonzero = true
   const char* trt_int8_calibration_table_name{nullptr};  // TensorRT INT8 calibration table name.

diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md
@@ -35,6 +35,7 @@ Do not modify directly.*
 | Cosh | ai.onnx(9+) |  |
 | CumSum | ai.onnx(11-13,14+) |  |
 | DepthToSpace | ai.onnx(11-12,13+); com.ms.internal.nhwc(11-12,13+) |  |
+| DequantizeLinear | ai.onnx(10-12,13-18,19-20,21+) |  |
 | Div | ai.onnx(7-12,13,14+) |  |
 | Einsum | ai.onnx(12+) |  |
 | Elu | ai.onnx(6+) |  |

diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -26,6 +26,7 @@ import {matMulNBits, parseMatMulNBitsAttributes} from './ops/matmulnbits';
 import {multiHeadAttention, parseMultiHeadAttentionAttributes} from './ops/multihead-attention';
 import {pad} from './ops/pad';
 import * as pool from './ops/pool';
+import {dequantizeLinear, parseDequantizeLinearAttributes} from './ops/quantize-linear';
 import {range} from './ops/range';
 import {reduceL1, reduceL2, reduceLogSum, reduceLogSumExp, reduceMax, reduceMean, reduceMin, reduceProd, reduceSum, reduceSumSquare} from './ops/reduce';
 import {parseResizeAttributes, resize} from './ops/resize';
@@ -71,6 +72,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new
   ['Cosh', [unaryOps.cosh]],
   ['CumSum', [cumsum, parseCumSumAttributes]],
   ['DepthToSpace', [depthToSpace, parseDepthToSpaceAttributes]],
+  ['DequantizeLinear', [dequantizeLinear, parseDequantizeLinearAttributes]],
   ['Div', [binaryOps.div]],
   ['Einsum', [einsum, parseEinsumAttributes]],
   ['Elu', [unaryOps.elu, unaryOps.parseAlphaAttributes]],

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/quantize-linear.ts b/js/web/lib/wasm/jsep/webgpu/ops/quantize-linear.ts
@@ -0,0 +1,219 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {DataType} from '../../../wasm-common';
+import {TensorView} from '../../tensor-view';
+import {ShapeUtil} from '../../util';
+import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
+import {ComputeContext, ProgramInfo, ProgramUniform} from '../types';
+
+import {createTensorShapeVariables, getMaxComponents, inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from './common';
+
+/**
+ * Attributes consumed by the DequantizeLinear kernel in this file.
+ */
+export interface DequantizeLinerAttributes extends AttributeWithCacheKey {
+  // Axis of the input tensor that per-axis / block quantization is applied along. It is normalized against the
+  // input rank before being handed to the shader as the `axis` uniform.
+  axis: number;
+  // Block size along `axis`. A value greater than 0 triggers the block quantization validation; the value is passed
+  // to the shader as the `block_size` uniform.
+  blockSize: number;
+}
+
+/**
+ * Validates the inputs of the DequantizeLinear operator and throws on the first violation found.
+ *
+ * The expected inputs are `[x, x-scale]` or `[x, x-scale, x-zero-point]`.
+ *
+ * @param inputs - the operator inputs; must contain 2 or 3 tensors.
+ * @param attributes - the parsed attributes. `blockSize > 0` enables the block quantization checks and `axis`
+ *     selects the blocked dimension (it is normalized against the rank of x, so negative values are accepted).
+ * @throws Error when the input count, data types, shapes or blockSize are inconsistent.
+ */
+const validateInputs = (inputs: readonly TensorView[], attributes: DequantizeLinerAttributes): void => {
+  if (inputs.length < 2 || inputs.length > 3) {
+    throw new Error('DequantizeLinear requires 2 or 3 inputs.');
+  }
+  const [x, scale] = inputs;
+  const zeroPoint = inputs.length === 3 ? inputs[2] : undefined;
+
+  // The zero-point, when present, must have the same data type as x.
+  if (zeroPoint && x.dataType !== zeroPoint.dataType) {
+    throw new Error('x and x-zero-point must have the same data type.');
+  }
+  if (x.dataType === DataType.int32 && zeroPoint) {
+    throw new Error('In the case of dequantizing int32 there is no zero point.');
+  }
+  if (scale.dims.length !== 0 && scale.dims.length !== 1 && scale.dims.length !== x.dims.length) {
+    throw new Error('scale input must be a scalar, a 1D tensor, or have the same rank as the input tensor.');
+  }
+
+  // Scale and zero-point must have identical shapes. The dims are compared element-wise because two distinct
+  // arrays are never equal by reference.
+  if (zeroPoint) {
+    if (scale.dims.length !== zeroPoint.dims.length) {
+      throw new Error('scale and zero-point inputs must have the same rank.');
+    }
+    if (!scale.dims.every((d, i) => d === zeroPoint.dims[i])) {
+      throw new Error('scale and zero-point inputs must have the same shape.');
+    }
+  }
+
+  // Validate blockSize
+  if (attributes.blockSize > 0) {
+    // Block quantization
+    if (scale.dims.length === 0 || (scale.dims.length === 1 && scale.dims[0] === 1)) {
+      throw new Error('blockSize must be set only for block quantization.');
+    }
+    // Check the rank before comparing individual dimensions so the comparison below never indexes past the end
+    // of the x shape.
+    if (scale.dims.length !== x.dims.length) {
+      throw new Error('For block qunatization the scale input rank must be the same as the x rank.');
+    }
+    // Normalize the axis the same way the program builder does, so a negative axis addresses the intended dimension.
+    const axis = ShapeUtil.normalizeAxis(attributes.axis, x.dims.length);
+    if (!scale.dims.every((d, i) => i === axis || d === x.dims[i])) {
+      throw new Error('For block qunatization, scale input shape to match the input shape except for the axis');
+    }
+    const dI = x.dims[axis];
+    const si = scale.dims[axis];
+    // When si is 1 the upper bound evaluates to Infinity, i.e. only the lower bound applies.
+    if (attributes.blockSize < Math.ceil(dI / si) || attributes.blockSize > Math.ceil(dI / (si - 1) - 1)) {
+      throw new Error('blockSize must be with in the range [ceil(dI / Si), ceil(dI / (Si - 1) - 1)].');
+    }
+  }
+};
+
+/**
+ * Builds the WebGPU program description (shader source generator, uniforms, output description and dispatch size)
+ * for DequantizeLinear.
+ *
+ * The generated shader computes `output = (x - zero_point) * scale` per output element. The scale (and zero-point)
+ * lookup depends on the scale shape: a scalar / single-element tensor is read at offset 0, a 1-D tensor is indexed
+ * by the output index along `axis`, and any other shape is indexed with the output indices whose `axis` component
+ * is divided by `block_size`.
+ *
+ * @param inputs - `[x, scale]` or `[x, scale, zero_point]`.
+ * @param attributes - operator attributes; `axis` is normalized against the rank of x, `blockSize` is forwarded
+ *     to the shader as a uniform and `cacheKey` is used as the shader cache hint.
+ * @returns the ProgramInfo to be passed to `context.compute`.
+ */
+const createDequantizeLinearProgramInfo =
+    (inputs: readonly TensorView[], attributes: DequantizeLinerAttributes): ProgramInfo => {
+      const axis = ShapeUtil.normalizeAxis(attributes.axis, inputs[0].dims.length);
+      const inputType = inputs[0].dataType;
+      const isSigned = inputType === DataType.int8;
+      const outputShape = inputs[0].dims;   // output shape is the same as the input shape
+      const dataType = inputs[1].dataType;  // output type is the same as the scale input type
+      const outputSize = ShapeUtil.size(outputShape);
+      // 8-bit inputs are declared to the shader as a flat array of u32 words holding four values each; they are
+      // expanded in the shader with unpack4xI8 / unpack4xU8.
+      const isPacked = inputType === DataType.int8 || inputType === DataType.uint8;
+      const inputShape = isPacked ? [Math.ceil(ShapeUtil.size(inputs[0].dims) / 4)] : inputs[0].dims;
+      const scaleShape = inputs[1].dims;
+      const zeroPointInput = inputs.length > 2 ? inputs[2] : undefined;
+      const zeroPointShape = zeroPointInput ?
+          (isPacked ? [Math.ceil(ShapeUtil.size(zeroPointInput.dims) / 4)] : zeroPointInput.dims) :
+          undefined;
+      // Scales input is a scalar for per-tensor/per-layer quantization, 1-D tensor for per-axis quantization
+      // or tensor with same rank as input for blocked quantization.
+      const perLayerQuantization = scaleShape.length === 0 || (scaleShape.length === 1 && scaleShape[0] === 1);
+      const perAxisQuantization = perLayerQuantization === false && scaleShape.length === 1;
+      // Left unnecessary commented-out assignment for documentation
+      // const blockQuantization = perLayerQuantization === false && perAxisQuantization === false;
+      const maxComponents = getMaxComponents(outputSize);
+      // Vectorized output is only used for per-layer quantization, and for packed inputs only when maxComponents
+      // is exactly 4 (presumably so that one unpacked u32 word maps onto one output vector).
+      const useComponents = perLayerQuantization && (!isPacked || maxComponents === 4);
+      const components = useComponents ? maxComponents : 1;
+      const inputComponent = (useComponents && !isPacked) ? maxComponents : 1;
+      const input = inputVariable('input', isPacked ? DataType.uint32 : inputType, inputShape.length, inputComponent);
+      const scale = inputVariable('scale', dataType, scaleShape.length);
+      const zeroPoint = zeroPointInput ?
+          inputVariable('zero_point', isPacked ? DataType.uint32 : inputType, zeroPointShape!.length) :
+          undefined;
+      const output = outputVariable('output', dataType, outputShape.length, components);
+      const inputVariables = [input, scale];
+      if (zeroPoint) {
+        inputVariables.push(zeroPoint);
+      }
+      const inputShapes = [inputShape, scaleShape];
+      if (zeroPointInput) {
+        inputShapes.push(zeroPointShape!);
+      }
+      // The first three uniforms correspond, in order, to the output_size / axis / block_size uniforms registered
+      // in getShaderSource; the tensor shape variables follow.
+      const programUniforms: ProgramUniform[] = [
+        {type: DataType.uint32, data: outputSize / components}, {type: DataType.uint32, data: axis},
+        {type: DataType.uint32, data: attributes.blockSize}, ...createTensorShapeVariables(...inputShapes, outputShape)
+      ];
+      // Generates the WGSL source. Each `${(() => {...})()}` section below emits the statements that define
+      // x_value, scale_value and zero_point_value respectively for the selected quantization mode.
+      const getShaderSource = (shaderHelper: ShaderHelper) => {
+        const uniforms: UniformsArrayType =
+            [{name: 'output_size', type: 'u32'}, {name: 'axis', type: 'u32'}, {name: 'block_size', type: 'u32'}];
+        return `
+      ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)}
+      ${shaderHelper.mainStart()}
+          ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+          let output_indices = ${output.offsetToIndices('global_idx')};
+
+          // Set input x
+          ${(() => {
+          if (isPacked) {
+            // NOTE(review): when components === 4, output_size is outputSize / 4, so global_idx already counts
+            // whole output vectors; reading the packed word at 'global_idx / 4' then looks like it revisits the
+            // same word for four consecutive invocations. Confirm whether the offset should be 'global_idx' in
+            // that case.
+            return `
+            let input = ${input.getByOffset('global_idx / 4')};
+            let x_vec = ${isSigned ? 'unpack4xI8(input)' : 'unpack4xU8(input)'};
+            let x_value = ${components === 1 ? 'x_vec[global_idx % 4]' : 'x_vec'};`;
+          } else {
+            return `let x_value = ${input.getByOffset('global_idx')};`;
+          }
+        })()};
+
+          // Set scale input
+          ${(() => {
+          if (perLayerQuantization) {
+            // scale input is a scalar ()
+            return `let scale_value= ${scale.getByOffset('0')}`;
+          } else if (perAxisQuantization) {
+            // scale input is a 1D tensor
+            return `
+            let scale_index = ${output.indicesGet('output_indices', 'uniforms.axis')};
+            let scale_value= ${scale.getByOffset('scale_index')};`;
+          } else {
+            // Block quantization. Scale input rank is same as input/output rank.
+            return `
+            var scale_indices: ${scale.type.indices} = output_indices;
+            let index = ${scale.indicesGet('scale_indices', 'uniforms.axis')} / uniforms.block_size;
+            ${scale.indicesSet('scale_indices', 'uniforms.axis', 'index')};
+            let scale_value= ${scale.getByIndices('scale_indices')};`;
+          }
+        })()};
+
+          // Set zero-point input
+          ${(() => {
+          if (zeroPoint) {
+            if (perLayerQuantization) {
+              // zero-point input is a scalar
+              if (isPacked) {
+                return `
+                let zero_point_input = ${zeroPoint.getByOffset('0')};
+                let zero_point_vec =  ${isSigned ? 'unpack4xI8(zero_point_input)' : 'unpack4xU8(zero_point_input)'};
+                let zero_point_value= zero_point_vec[0]`;
+              } else {
+                return `let zero_point_value = ${zeroPoint.getByOffset('0')}`;
+              }
+            } else if (perAxisQuantization) {
+              // zero-point input is a 1D tensor
+              if (isPacked) {
+                return `
+                let zero_point_index = ${output.indicesGet('output_indices', 'uniforms.axis')};
+                let zero_point_input = ${zeroPoint.getByOffset('zero_point_index / 4')};
+                let zero_point_vec =  ${isSigned ? 'unpack4xI8(zero_point_input)' : 'unpack4xU8(zero_point_input)'};
+                let zero_point_value = zero_point_vec[zero_point_index % 4]`;
+              } else {
+                return `
+                let zero_point_index = ${output.indicesGet('output_indices', 'uniforms.axis')};
+                let zero_point_value = ${zeroPoint.getByOffset('zero_point_index')};`;
+              }
+            } else {
+              // BlockedQuantization. The zero-point input shape is same as the input shape except along axis.
+              // The scale_indices variable declared by the scale section above is reused here.
+              if (isPacked) {
+                return `
+                let zero_point_offset = ${scale.indicesToOffset('scale_indices')};
+                let zero_point_input = ${zeroPoint.getByOffset('zero_point_offset / 4')};
+                let zero_point_vec = ${isSigned ? 'unpack4xI8(zero_point_input)' : 'unpack4xU8(zero_point_input)'};
+                let zero_point_value = zero_point_vec[zero_point_offset % 4];`;
+              } else {
+                return `let zero_point_value = ${zeroPoint.getByIndices('scale_indices')};`;
+              }
+            }
+          } else {
+            // No zero-point input: use a zero of the matching integer type.
+            return `let zero_point_value = ${isPacked ? (isSigned ? 'i32' : 'u32') : input.type.value}(0);`;
+          }
+        })()};
+      // Compute and write output
+      ${output.setByOffset('global_idx', `${output.type.value}(x_value - zero_point_value) * scale_value`)};
+      }`;
+      };
+      return {
+        name: 'DequantizeLinear',
+        shaderCache:
+            {hint: attributes.cacheKey, inputDependencies: zeroPoint ? ['rank', 'rank', 'rank'] : ['rank', 'rank']},
+        getShaderSource,
+        getRunData: () => ({
+          outputs: [{dims: outputShape, dataType}],
+          // One workgroup per 64 output elements/vectors (64 is presumably the default workgroup size - confirm).
+          dispatchGroup: {x: Math.ceil(outputSize / components / 64), y: 1, z: 1},
+          programUniforms
+        })
+      };
+    };
+
+/**
+ * Entry point of the DequantizeLinear kernel: checks the inputs of the compute context and, if they are valid,
+ * runs the generated program on them.
+ *
+ * @param context - compute context providing the operator inputs and the `compute` dispatcher.
+ * @param attributes - parsed DequantizeLinear attributes.
+ * @throws Error when input validation fails.
+ */
+export const dequantizeLinear = (context: ComputeContext, attributes: DequantizeLinerAttributes): void => {
+  const {inputs} = context;
+  validateInputs(inputs, attributes);
+  const programInfo = createDequantizeLinearProgramInfo(inputs, attributes);
+  context.compute(programInfo);
+};
+
+/**
+ * Converts the raw attribute record into DequantizeLinear attributes with a cache key.
+ *
+ * `axis` and `blockSize` are read from the record and cast to number without further validation.
+ *
+ * @param attributes - raw attribute record of the operator node.
+ * @returns the attributes object produced by `createAttributeWithCacheKey`.
+ */
+export const parseDequantizeLinearAttributes = (attributes: Record<string, unknown>): DequantizeLinerAttributes => {
+  const axis = attributes.axis as number;
+  const blockSize = attributes.blockSize as number;
+  return createAttributeWithCacheKey({axis, blockSize});
+};