diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h index 816eaaf9bc71a..ec9be80a63574 100644 --- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h +++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h @@ -19,7 +19,7 @@ struct OrtTensorRTProviderOptionsV2 { // can be updated using: UpdateTensorRTProviderOptionsWithValue int trt_max_partition_iterations{1000}; // maximum iterations for TensorRT parser to get capability int trt_min_subgraph_size{1}; // minimum size of TensorRT subgraphs - size_t trt_max_workspace_size{1 << 30}; // maximum workspace size for TensorRT. + size_t trt_max_workspace_size{0}; // maximum workspace size for TensorRT. Default is 0 means max device memory size int trt_fp16_enable{0}; // enable TensorRT FP16 precision. Default 0 = false, nonzero = true int trt_int8_enable{0}; // enable TensorRT INT8 precision. Default 0 = false, nonzero = true const char* trt_int8_calibration_table_name{nullptr}; // TensorRT INT8 calibration table name. diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md index 3ee9441eeb981..fe46165ffbd50 100644 --- a/js/web/docs/webgpu-operators.md +++ b/js/web/docs/webgpu-operators.md @@ -35,6 +35,7 @@ Do not modify directly.* | Cosh | ai.onnx(9+) | | | CumSum | ai.onnx(11-13,14+) | | | DepthToSpace | ai.onnx(11-12,13+); com.ms.internal.nhwc(11-12,13+) | | +| DequantizeLinear | ai.onnx(10-12,13-18,19-20,21+) | | | Div | ai.onnx(7-12,13,14+) | | | Einsum | ai.onnx(12+) | | | Elu | ai.onnx(6+) | | diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index ce5b4455fde60..e0288eebbe604 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -26,6 +26,7 @@ import {matMulNBits, parseMatMulNBitsAttributes} from './ops/matmulnbits'; import {multiHeadAttention, parseMultiHeadAttentionAttributes} from './ops/multihead-attention'; import {pad} from './ops/pad'; import * as pool from './ops/pool'; +import {dequantizeLinear, parseDequantizeLinearAttributes} from './ops/quantize-linear'; import {range} from './ops/range'; import {reduceL1, reduceL2, reduceLogSum, reduceLogSumExp, reduceMax, reduceMean, reduceMin, reduceProd, reduceSum, reduceSumSquare} from './ops/reduce'; import {parseResizeAttributes, resize} from './ops/resize'; @@ -71,6 +72,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Cosh', [unaryOps.cosh]], ['CumSum', [cumsum, parseCumSumAttributes]], ['DepthToSpace', [depthToSpace, parseDepthToSpaceAttributes]], + ['DequantizeLinear', [dequantizeLinear, parseDequantizeLinearAttributes]], ['Div', [binaryOps.div]], ['Einsum', [einsum, parseEinsumAttributes]], ['Elu', [unaryOps.elu, unaryOps.parseAlphaAttributes]], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/quantize-linear.ts b/js/web/lib/wasm/jsep/webgpu/ops/quantize-linear.ts new file mode 100644 index 0000000000000..0d7c7ab408b3a --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/quantize-linear.ts @@ -0,0 +1,219 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
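A note on the trt_max_workspace_size change at the top of this diff: with the default now 0, the TensorRT EP no longer caps the builder workspace at 1 GiB and instead lets TensorRT size the pool from the available device memory. Callers who want the old cap can still set the option explicitly; a minimal sketch using the Python provider-options API (the model path is a placeholder):

```python
import onnxruntime as ort

# Restore the previous 1 GiB workspace cap explicitly; leaving the option unset
# now lets TensorRT use up to the maximum device memory for its workspace pool.
trt_options = {"trt_max_workspace_size": 1 << 30}  # bytes

session = ort.InferenceSession(
    "model.onnx",  # placeholder path
    providers=[("TensorrtExecutionProvider", trt_options), "CUDAExecutionProvider"],
)
```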
+ +import {DataType} from '../../../wasm-common'; +import {TensorView} from '../../tensor-view'; +import {ShapeUtil} from '../../util'; +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext, ProgramInfo, ProgramUniform} from '../types'; + +import {createTensorShapeVariables, getMaxComponents, inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from './common'; + +export interface DequantizeLinerAttributes extends AttributeWithCacheKey { + axis: number; + blockSize: number; +} + +const validateInputs = (inputs: readonly TensorView[], attributes: DequantizeLinerAttributes): void => { + if (inputs.length < 2 || inputs.length > 3) { + throw new Error('DequantizeLinear requires 2 or 3 inputs.'); + } + if (inputs.length === 3 && inputs[1].dims === inputs[2].dims) { + throw new Error('x-scale and x-zero-point must have the same shape.'); + } + if (inputs.length === 3 && inputs[0].dataType !== inputs[2].dataType) { + throw new Error('x and x-zero-point must have the same data type.'); + } + if (inputs[0].dataType === DataType.int32 && inputs.length > 2) { + throw new Error('In the case of dequantizing int32 there is no zero point.'); + } + if (inputs[1].dims.length !== 0 && inputs[1].dims.length !== 1 && inputs[1].dims.length !== inputs[0].dims.length) { + throw new Error('scale input must be a scalar, a 1D tensor, or have the same rank as the input tensor.'); + } + // validate scale and zero-point input shapes + if (inputs.length > 2) { + // zero-point input type should be the same as input data type. + if (inputs[0].dataType !== inputs[2].dataType) { + throw new Error('x and x-zero-point must have the same data type.'); + } + // Scale and zero-point inputs must have the same shape + if (inputs[1].dims.length !== inputs[2].dims.length) { + throw new Error('scale and zero-point inputs must have the same rank.'); + } + if (!inputs[1].dims.map((d, i) => d === inputs[2].dims[i]).reduce((a, b) => a && b, true)) { + throw new Error('scale and zero-point inputs must have the same shape.'); + } + } + // Validate blockSize + if (attributes.blockSize > 0) { + // Block qunatization + if (inputs[1].dims.length === 0 || (inputs[1].dims.length === 1 && inputs[1].dims[0] === 1)) { + throw new Error('blockSize must be set only for block quantization.'); + } + if (!inputs[1] + .dims.map((d, i) => i === attributes.axis || d === inputs[0].dims[i]) + .reduce((a, b) => a && b, true)) { + throw new Error('For block qunatization, scale input shape to match the input shape except for the axis'); + } + // Scale input rank should be same as the input rank + if (inputs[1].dims.length !== inputs[0].dims.length) { + throw new Error('For block qunatization the scale input rank must be the same as the x rank.'); + } + const dI = inputs[0].dims[attributes.axis]; + const si = inputs[1].dims[attributes.axis]; + if (attributes.blockSize < Math.ceil(dI / si) || attributes.blockSize > Math.ceil(dI / (si - 1) - 1)) { + throw new Error('blockSize must be with in the range [ceil(dI / Si), ceil(dI / (Si - 1) - 1)].'); + } + } +}; + +const createDequantizeLinearProgramInfo = + (inputs: readonly TensorView[], attributes: DequantizeLinerAttributes): ProgramInfo => { + const axis = ShapeUtil.normalizeAxis(attributes.axis, inputs[0].dims.length); + const inputType = inputs[0].dataType; + const isSigned = inputType === DataType.int8; + const outputShape = inputs[0].dims; // output shape is same as the input shape + const dataType = inputs[1].dataType; // output 
type is same as the the scale input type + const outputSize = ShapeUtil.size(outputShape); + const isPacked = inputType === DataType.int8 || inputType === DataType.uint8; + const inputShape = isPacked ? [Math.ceil(ShapeUtil.size(inputs[0].dims) / 4)] : inputs[0].dims; + const scaleShape = inputs[1].dims; + const zeroPointInput = inputs.length > 2 ? inputs[2] : undefined; + const zeroPointShape = zeroPointInput ? + (isPacked ? [Math.ceil(ShapeUtil.size(zeroPointInput.dims) / 4)] : zeroPointInput.dims) : + undefined; + // Scales input is a scaler for per-tensor/per-layer quantization, 1-D tensor for per-axis quantization + // or tensor with same rank as input for blocked quantization. + const perLayerQuantization = scaleShape.length === 0 || (scaleShape.length === 1 && scaleShape[0] === 1); + const perAxisQuantization = perLayerQuantization === false && scaleShape.length === 1; + // Left unnecessary commented-out assignment for documentation + // const blockQuantization = perLayerQuantization === false && perAxisQuantization === false; + const maxComponents = getMaxComponents(outputSize); + const useComponents = perLayerQuantization && (!isPacked || maxComponents === 4); + const components = useComponents ? maxComponents : 1; + const inputComponent = (useComponents && !isPacked) ? maxComponents : 1; + const input = inputVariable('input', isPacked ? DataType.uint32 : inputType, inputShape.length, inputComponent); + const scale = inputVariable('scale', dataType, scaleShape.length); + const zeroPoint = zeroPointInput ? + inputVariable('zero_point', isPacked ? DataType.uint32 : inputType, zeroPointShape!.length) : + undefined; + const output = outputVariable('output', dataType, outputShape.length, components); + const inputVariables = [input, scale]; + if (zeroPoint) { + inputVariables.push(zeroPoint); + } + const inputShapes = [inputShape, scaleShape]; + if (zeroPointInput) { + inputShapes.push(zeroPointShape!); + } + const programUniforms: ProgramUniform[] = [ + {type: DataType.uint32, data: outputSize / components}, {type: DataType.uint32, data: axis}, + {type: DataType.uint32, data: attributes.blockSize}, ...createTensorShapeVariables(...inputShapes, outputShape) + ]; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const uniforms: UniformsArrayType = + [{name: 'output_size', type: 'u32'}, {name: 'axis', type: 'u32'}, {name: 'block_size', type: 'u32'}]; + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)} + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + let output_indices = ${output.offsetToIndices('global_idx')}; + + // Set input x + ${(() => { + if (isPacked) { + return ` + let input = ${input.getByOffset('global_idx / 4')}; + let x_vec = ${isSigned ? 'unpack4xI8(input)' : 'unpack4xU8(input)'}; + let x_value = ${components === 1 ? 'x_vec[global_idx % 4]' : 'x_vec'};`; + } else { + return `let x_value = ${input.getByOffset('global_idx')};`; + } + })()}; + + // Set scale input + ${(() => { + if (perLayerQuantization) { + // scale input is a scalar () + return `let scale_value= ${scale.getByOffset('0')}`; + } else if (perAxisQuantization) { + // scale input is a 1D tensor + return ` + let scale_index = ${output.indicesGet('output_indices', 'uniforms.axis')}; + let scale_value= ${scale.getByOffset('scale_index')};`; + } else { + // Block quantization. Scale input rank is same as input/output rank. 
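For reference, the expression the generated WGSL evaluates per element is y = (x - x_zero_point) * x_scale, with the scale (and optional zero point) broadcast per tensor, per axis, or per block as selected above. A NumPy sketch of the three cases (reference semantics only, not the kernel itself):

```python
import numpy as np

def dequantize_ref(x, scale, zero_point=None, axis=1, block_size=0):
    # y = (x - zero_point) * scale, broadcast per tensor, per axis, or per block.
    x = x.astype(np.float32)
    zp = np.zeros_like(scale) if zero_point is None else zero_point.astype(np.float32)
    if scale.ndim == 0 or scale.size == 1:               # per-tensor / per-layer
        return (x - zp.reshape(-1)[0]) * scale.reshape(-1)[0]
    if scale.ndim == 1:                                   # per-axis
        shape = [1] * x.ndim
        shape[axis] = -1
        return (x - zp.reshape(shape)) * scale.reshape(shape)
    # Blocked: scale has the same rank as x; the index along `axis` is divided by block_size.
    idx = np.arange(x.shape[axis]) // block_size
    return (x - np.take(zp, idx, axis=axis)) * np.take(scale, idx, axis=axis)

# Matches the blocked test case T[9] in the new dequantizelinear.jsonc tests below.
x = np.arange(1, 9).reshape(2, 2, 2)
s = np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32).reshape(2, 1, 2)
print(dequantize_ref(x, s, axis=1, block_size=2))  # 0.1, 0.4, 0.3, 0.8, 1.5, 2.4, 2.1, 3.2
```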
+ return ` + var scale_indices: ${scale.type.indices} = output_indices; + let index = ${scale.indicesGet('scale_indices', 'uniforms.axis')} / uniforms.block_size; + ${scale.indicesSet('scale_indices', 'uniforms.axis', 'index')}; + let scale_value= ${scale.getByIndices('scale_indices')};`; + } + })()}; + + // Set zero-point input + ${(() => { + if (zeroPoint) { + if (perLayerQuantization) { + // zero-point input is a scalar + if (isPacked) { + return ` + let zero_point_input = ${zeroPoint.getByOffset('0')}; + let zero_point_vec = ${isSigned ? 'unpack4xI8(zero_point_input)' : 'unpack4xU8(zero_point_input)'}; + let zero_point_value= zero_point_vec[0]`; + } else { + return `let zero_point_value = ${zeroPoint.getByOffset('0')}`; + } + } else if (perAxisQuantization) { + // zero-point input is a 1D tensor + if (isPacked) { + return ` + let zero_point_index = ${output.indicesGet('output_indices', 'uniforms.axis')}; + let zero_point_input = ${zeroPoint.getByOffset('zero_point_index / 4')}; + let zero_point_vec = ${isSigned ? 'unpack4xI8(zero_point_input)' : 'unpack4xU8(zero_point_input)'}; + let zero_point_value = zero_point_vec[zero_point_index % 4]`; + } else { + return ` + let zero_point_index = ${output.indicesGet('output_indices', 'uniforms.axis')}; + let zero_point_value = ${zeroPoint.getByOffset('zero_point_index')};`; + } + } else { + // BlockedQuantization. The zero-point input shape is same as the input shape except along axis. + if (isPacked) { + return ` + let zero_point_offset = ${scale.indicesToOffset('scale_indices')}; + let zero_point_input = ${zeroPoint.getByOffset('zero_point_offset / 4')}; + let zero_point_vec = ${isSigned ? 'unpack4xI8(zero_point_input)' : 'unpack4xU8(zero_point_input)'}; + let zero_point_value = zero_point_vec[zero_point_offset % 4];`; + } else { + return `let zero_point_value = ${zeroPoint.getByIndices('scale_indices')};`; + } + } + } else { + return `let zero_point_value = ${isPacked ? (isSigned ? 'i32' : 'u32') : input.type.value}(0);`; + } + })()}; + // Compute and write output + ${output.setByOffset('global_idx', `${output.type.value}(x_value - zero_point_value) * scale_value`)}; + }`; + }; + return { + name: 'DequantizeLinear', + shaderCache: + {hint: attributes.cacheKey, inputDependencies: zeroPoint ? 
['rank', 'rank', 'rank'] : ['rank', 'rank']}, + getShaderSource, + getRunData: () => ({ + outputs: [{dims: outputShape, dataType}], + dispatchGroup: {x: Math.ceil(outputSize / components / 64), y: 1, z: 1}, + programUniforms + }) + }; + }; + +export const dequantizeLinear = (context: ComputeContext, attributes: DequantizeLinerAttributes): void => { + validateInputs(context.inputs, attributes); + context.compute(createDequantizeLinearProgramInfo(context.inputs, attributes)); +}; + +export const parseDequantizeLinearAttributes = (attributes: Record): DequantizeLinerAttributes => + createAttributeWithCacheKey({axis: attributes.axis as number, blockSize: attributes.blockSize as number}); diff --git a/js/web/test/data/ops/dequantizelinear.jsonc b/js/web/test/data/ops/dequantizelinear.jsonc new file mode 100644 index 0000000000000..2dc04d11f2889 --- /dev/null +++ b/js/web/test/data/ops/dequantizelinear.jsonc @@ -0,0 +1,385 @@ +[ + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 10 }, + "attributes": [], + "cases": [ + { + "name": "T[1]", + "inputs": [ + { + "data": [1, 2, 3, 4], + "dims": [4], + "type": "uint8" + }, + { + "data": [0.1], + "dims": [1], + "type": "float32" + }, + { + "data": [0], + "dims": [1], + "type": "uint8" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [4], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 10 }, + "attributes": [], + "cases": [ + { + "name": "T[2]", + "inputs": [ + { + "data": [1, 2, 3, 4], + "dims": [4], + "type": "int32" + }, + { + "data": [0.1], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [4], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 13 }, + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + } + ], + "cases": [ + { + "name": "T[3]", + "inputs": [ + { + "data": [1, 2, 3, 4], + "dims": [2, 2], + "type": "uint8" + }, + { + "data": [0.1], + "dims": [1], + "type": "float32" + }, + { + "data": [0], + "dims": [1], + "type": "uint8" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 13 }, + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + } + ], + "cases": [ + { + "name": "T[4]", + "inputs": [ + { + "data": [1, 2, 3, 4], + "dims": [2, 2], + "type": "int32" + }, + { + "data": [0.1], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 13 }, + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + } + ], + "cases": [ + { + "name": "T[5]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 2, 2], + "type": "uint8" + }, + { + "data": [0.1, 0.1], + "dims": [2], + "type": "float32" + }, + { + "data": [0, 0], + "dims": [2], + "type": "uint8" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], + "dims": [2, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 13 }, + 
"attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + } + ], + "cases": [ + { + "name": "T[6]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 2, 2], + "type": "uint8" + }, + { + "data": [0.1, 0.2], + "dims": [2], + "type": "float32" + }, + { + "data": [0, 0], + "dims": [2], + "type": "uint8" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.6, 0.8, 0.5, 0.6, 1.4, 1.6], + "dims": [2, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 13 }, + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + } + ], + "cases": [ + { + "name": "T[7]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 2, 2], + "type": "int32" + }, + { + "data": [0.1], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], + "dims": [2, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 21 }, + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + }, + { + "name": "block_size", + "data": 2, + "type": "int" + } + ], + "cases": [ + { + "name": "T[8]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 2, 2], + "type": "uint8" + }, + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [2, 1, 2], + "type": "float32" + }, + { + "data": [1, 2, 3, 4], + "dims": [2, 1, 2], + "type": "uint8" + } + ], + "outputs": [ + { + "data": [0.0, 0.0, 0.2, 0.4, 0.6, 0.8, 1.2, 1.6], + "dims": [2, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear block dequantization", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 21 }, + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + }, + { + "name": "block_size", + "data": 2, + "type": "int" + } + ], + "cases": [ + { + "name": "T[9]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 2, 2], + "type": "int32" + }, + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [2, 1, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.1, 0.4, 0.3, 0.8, 1.5, 2.4, 2.1, 3.2], + "dims": [2, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "dequantizelinear", + "operator": "DequantizeLinear", + "opset": { "domain": "", "version": 13 }, + "attributes": [ + { + "name": "axis", + "data": 1, + "type": "int" + } + ], + "cases": [ + { + "name": "T[3]", + "inputs": [ + { + "data": [1, 2, 3, 4], + "dims": [2, 2], + "type": "uint8" + }, + { + "data": [0.1], + "dims": [1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [2, 2], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 4aaf9d16b2b0e..ede89f7557dd8 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -477,8 +477,8 @@ "test_depthtospace_dcr_mode", "test_depthtospace_example", "test_depthtospace", - // // "test_dequantizelinear_axis", - // // "test_dequantizelinear", + "test_dequantizelinear_axis", + "test_dequantizelinear", // // "test_det_2d", // // "test_det_nd", // // "test_dft_axis", @@ -1352,6 +1352,7 @@ "div.jsonc", "div_int32.jsonc", "depth-to-space.jsonc", + "dequantizelinear.jsonc", "equal.jsonc", "exp.jsonc", "expand.jsonc", diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 
cbd53298ab2ad..42f491825462c 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -1358,6 +1358,7 @@ common::Status ConstantNodeProtoToTensorProto(const ONNX_NAMESPACE::NodeProto& n common::Status ConstantNodeProtoToTensorProto(const ONNX_NAMESPACE::NodeProto& node, const std::filesystem::path& model_path, ONNX_NAMESPACE::TensorProto& tensor) { + ORT_ENFORCE(node.output_size() == 1, "NodeProto for Constant should have 1 output. Got:", node.output_size()); return ConstantNodeProtoToTensorProto(node, model_path, tensor, node.output(0)); } diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc index ee4d9f9154971..d38c1ace7d7a8 100644 --- a/onnxruntime/core/graph/model.cc +++ b/onnxruntime/core/graph/model.cc @@ -646,7 +646,7 @@ Status Model::SaveWithExternalInitializers(Model& model, const std::filesystem:: return SaveModelWithExternalInitializers(model, file_path, external_file_name, initializer_size_threshold); } -Status Model::LoadFromBytes(int count, void* p_bytes, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) { +Status Model::LoadFromBytes(int count, const void* p_bytes, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) { const bool result = model_proto.ParseFromArray(p_bytes, count); if (!result) { return Status(ONNXRUNTIME, INVALID_PROTOBUF, "Protobuf parsing failed."); diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h index 728af727ac83b..ea34dba889277 100644 --- a/onnxruntime/core/graph/model.h +++ b/onnxruntime/core/graph/model.h @@ -234,7 +234,7 @@ class Model { const ModelOptions& options = {}); // 'int' rather than 'size_t' because of a protobuf design choice; let callers handle type checks - static common::Status LoadFromBytes(int count, void* pBytes, + static common::Status LoadFromBytes(int count, const void* pBytes, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto); // 'int' rather than 'size_t' because of a protobuf design choice; let callers handle type checks diff --git a/onnxruntime/core/optimizer/unsqueeze_elimination.cc b/onnxruntime/core/optimizer/unsqueeze_elimination.cc index 4efc8018f0217..d52cc82af02bb 100644 --- a/onnxruntime/core/optimizer/unsqueeze_elimination.cc +++ b/onnxruntime/core/optimizer/unsqueeze_elimination.cc @@ -40,6 +40,10 @@ Status UnsqueezeElimination::Apply(Graph& graph, Node& node, RewriteRuleEffect& // Generate new dims. InlinedVector new_dims(output_rank, 0); for (int64_t axis : axes) { + if (static_cast(axis) >= new_dims.size()) { + LOGS(logger, WARNING) << "UnsqueezeElimination cannot remove node due to invalid axes" << node.Name(); + return Status::OK(); + } new_dims[static_cast(axis)] = 1; } diff --git a/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc b/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc index 21a256eee6f14..7797cbe678bd4 100644 --- a/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc +++ b/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc @@ -380,8 +380,8 @@ Status QLinearConv::PrePack(const Tensor& tensor, int input_idx, Alloca const int64_t M = shape[0]; const int64_t C = shape[1]; - // Verify that the total number of output channels is a multiple of the group count. - if (M % conv_attrs_.group != 0) { + // Verify that conv_attrs_.group is not 0 and the total number of output channels is a multiple of the group count. 
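On the unsqueeze_elimination.cc change above: the optimizer now skips the rewrite (with a warning) instead of indexing out of bounds when a fused Unsqueeze carries an axis outside [0, output_rank). An illustrative sketch of the bound being enforced (not the C++ implementation itself):

```python
def unsqueezed_dims(input_dims, axes):
    # Output rank is input rank plus the number of inserted axes; every
    # (already non-negative) axis must index into that output shape.
    output_rank = len(input_dims) + len(axes)
    new_dims = [0] * output_rank
    for axis in axes:
        if axis >= output_rank:      # the condition the new guard checks
            return None              # leave the node untouched
        new_dims[axis] = 1
    it = iter(input_dims)
    return [1 if marked else next(it) for marked in new_dims]

print(unsqueezed_dims([3, 4], [0, 3]))  # [1, 3, 4, 1]
print(unsqueezed_dims([3, 4], [5]))     # None: axis 5 exceeds output rank 3
```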
+ if (conv_attrs_.group == 0 || M % conv_attrs_.group != 0) { return Status::OK(); } diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 0ad62b87d33b5..e51b53686fafc 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -370,6 +370,19 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomai class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 13, CumSum); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 14, CumSum); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 12, uint8_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 12, int8_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 12, int32_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 18, uint8_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 18, int8_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 18, int32_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 19, 20, uint8_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 19, 20, int8_t, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 19, 20, int32_t, DequantizeLinear); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 21, uint8_t, DequantizeLinear); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 21, int8_t, DequantizeLinear); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 21, int32_t, DequantizeLinear); + std::unique_ptr RegisterKernels() { auto kernel_registry = std::make_unique(); @@ -670,6 +683,18 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/core/providers/js/operators/quantize_linear.cc b/onnxruntime/core/providers/js/operators/quantize_linear.cc new file mode 100644 index 0000000000000..a3dd635f1fb13 --- /dev/null +++ b/onnxruntime/core/providers/js/operators/quantize_linear.cc @@ -0,0 +1,54 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
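The js_execution_provider.cc registrations above add DequantizeLinear kernels for uint8, int8 and int32 inputs over opset ranges 10-12, 13-18, 19-20 and 21+, and the opset-21 kernels pick up the new block_size attribute. A sketch of a blocked model those kernels would serve, built with onnx.helper (names are arbitrary; assumes an onnx package recent enough to know opset 21):

```python
import numpy as np
import onnx
from onnx import TensorProto, helper, numpy_helper

node = helper.make_node(
    "DequantizeLinear", ["x", "x_scale", "x_zero_point"], ["y"],
    axis=1, block_size=2,  # block_size is only defined from opset 21 on
)
graph = helper.make_graph(
    [node], "dql_blocked",
    [helper.make_tensor_value_info("x", TensorProto.UINT8, [2, 2, 2])],
    [helper.make_tensor_value_info("y", TensorProto.FLOAT, [2, 2, 2])],
    initializer=[
        # For blocked quantization the scale/zero-point shape matches the input
        # except along `axis`, where it is ceil(dim / block_size).
        numpy_helper.from_array(np.full((2, 1, 2), 0.1, dtype=np.float32), "x_scale"),
        numpy_helper.from_array(np.zeros((2, 1, 2), dtype=np.uint8), "x_zero_point"),
    ],
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 21)])
onnx.checker.check_model(model)
```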
+ +#include "quantize_linear.h" + +namespace onnxruntime { +namespace js { +#define REGISTER_DEQUANTIZED_LINEAR_VERSIONED_TYPED_KERNEL(T, sinceVersion, endVerion) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + DequantizeLinear, \ + kOnnxDomain, \ + sinceVersion, endVerion, \ + T, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", JsepSupportedFloatTypes()), \ + DequantizeLinear); + +#define REGISTER_DEQUANTIZED_LINEAR_TYPED_KERNEL(T, sinceVersion) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + DequantizeLinear, \ + kOnnxDomain, \ + sinceVersion, \ + T, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", JsepSupportedFloatTypes()), \ + DequantizeLinear); + +#define REGISTER_DEQUANTIZED_LINEAR_VERSIONED_TYPED_KERNEL_PRE_19(T, sinceVersion, endVerion) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + DequantizeLinear, \ + kOnnxDomain, \ + sinceVersion, endVerion, \ + T, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + DequantizeLinear); + +#define REGISTER_DEQUANTIZED_LINEAR_KERNEL_TYPED(T) \ + REGISTER_DEQUANTIZED_LINEAR_VERSIONED_TYPED_KERNEL_PRE_19(T, 10, 12) \ + REGISTER_DEQUANTIZED_LINEAR_VERSIONED_TYPED_KERNEL_PRE_19(T, 13, 18) \ + REGISTER_DEQUANTIZED_LINEAR_VERSIONED_TYPED_KERNEL(T, 19, 20) \ + REGISTER_DEQUANTIZED_LINEAR_TYPED_KERNEL(T, 21) + +REGISTER_DEQUANTIZED_LINEAR_KERNEL_TYPED(int8_t) +REGISTER_DEQUANTIZED_LINEAR_KERNEL_TYPED(uint8_t) +REGISTER_DEQUANTIZED_LINEAR_KERNEL_TYPED(int32_t) + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/quantize_linear.h b/onnxruntime/core/providers/js/operators/quantize_linear.h new file mode 100644 index 0000000000000..e15942aaf1a41 --- /dev/null +++ b/onnxruntime/core/providers/js/operators/quantize_linear.h @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
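The PRE_19 macro above keeps the single "T" type constraint of the older schema, while the opset-19+ macros split the quantized input (T1) from the scale/output type (T2), which is why JsepSupportedFloatTypes() appears only in the newer registrations. From opset 19 on, a float16 scale is therefore legal and yields a float16 output; a minimal sketch (illustrative only):

```python
import numpy as np
from onnx import TensorProto, helper, numpy_helper

node = helper.make_node("DequantizeLinear", ["x", "x_scale"], ["y"])
graph = helper.make_graph(
    [node], "dql_fp16",
    [helper.make_tensor_value_info("x", TensorProto.INT8, [4])],
    [helper.make_tensor_value_info("y", TensorProto.FLOAT16, [4])],
    initializer=[numpy_helper.from_array(np.array(0.5, dtype=np.float16), "x_scale")],
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 19)])
```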
+ +#pragma once + +#include "core/providers/js/js_kernel.h" + +namespace onnxruntime { +namespace js { + +class DequantizeLinear : public JsKernel { + public: + DequantizeLinear(const OpKernelInfo& info) : JsKernel(info) { + int64_t axis; + int64_t block_size; + if (!info.GetAttr("axis", &axis).IsOK()) { + axis = 1; + } + if (!info.GetAttr("block_size", &block_size).IsOK()) { + block_size = 0; + } + JSEP_INIT_KERNEL_ATTRIBUTE(DequantizeLinear, ({ + "axis" : $1, + "blockSize" : $2 + }), + static_cast(axis), static_cast(block_size)); + } +}; + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index a2b3ed068235b..f1df1abf4c49a 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -583,22 +583,23 @@ static void AddQDQNodeUnit(onnxruntime::Graph& dst_graph, // Handle Qs in the NodeUnit if (!node_unit.GetQNodes().empty()) { - ORT_ENFORCE(node_unit.GetQNodes().size() == 1); - const auto& q_node = node_unit.GetQNodes().at(0); - - SkipReason reason; - - bool keep_q = CheckQRuleSet(node_unit, q_node, src_graph, reason); - - if (keep_q) { - AddNode(initializers_to_keep, src_graph, dst_graph, *q_node); - // if keep_q, then output defs of the target node doesn't change - output_args.push_back(&dst_graph.GetOrCreateNodeArg(target_node.OutputDefs().at(0)->Name(), - target_node.OutputDefs().at(0)->TypeAsProto())); - } else { - // convert this Q to float - output_args.push_back(&ProcessNodeUnitIO(dst_graph, src_graph, initializers_to_keep, - node_unit_outputs.at(0))); + for (size_t i = 0; i < node_unit.GetQNodes().size(); i++) { + const auto& q_node = node_unit.GetQNodes().at(i); + + SkipReason reason; + + bool keep_q = CheckQRuleSet(node_unit, q_node, src_graph, reason); + + if (keep_q) { + AddNode(initializers_to_keep, src_graph, dst_graph, *q_node); + // if keep_q, then output defs of the target node doesn't change + output_args.push_back(&dst_graph.GetOrCreateNodeArg(target_node.OutputDefs().at(i)->Name(), + target_node.OutputDefs().at(i)->TypeAsProto())); + } else { + // convert this Q to float + output_args.push_back(&ProcessNodeUnitIO(dst_graph, src_graph, initializers_to_keep, + node_unit_outputs.at(i))); + } } } else { for (const auto& node_unit_output : node_unit_outputs) { diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index cdbb7bb2a8094..a7daa98902afb 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1583,10 +1583,6 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_min_subgraph_size must be a positive integer value. Set it to 1"; min_subgraph_size_ = 1; } - if (max_workspace_size_ <= 0) { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_max_workspace_size must be a positive integer value. Set it to 1073741824 (1GB)"; - max_workspace_size_ = 1 << 30; - } if (dla_core_ < 0) { LOGS_DEFAULT(WARNING) << "[TensorRT EP] TensorRT option trt_dla_core must be a non-negative integer value. 
Set it to 0"; dla_core_ = 0; @@ -2756,7 +2752,9 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); auto trt_parser = tensorrt_ptr::unique_pointer(nvonnxparser::createParser(*trt_network, trt_logger)); trt_parser->parse(string_buf.data(), string_buf.size(), model_path_); - trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); + if (max_workspace_size_ > 0) { + trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); + } // Force Pow + Reduce ops in layer norm to run in FP32 to avoid overflow if (fp16_enable_ && layer_norm_fp32_fallback_) { @@ -3363,7 +3361,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name], &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_, - dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, + dla_enable_, dla_core_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), profiles_[context->node_name], context_memory_sharing_enable_, &max_ctx_mem_size_, dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_, global_cache_path_, force_timing_cache_match_, @@ -3538,7 +3536,9 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView trt_state->context->reset(); trt_state->engine->reset(); auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); - trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, *(trt_state->max_workspace_size_ptr)); + if (max_workspace_size_ > 0) { + trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); + } for (auto trt_profile : trt_profiles) { trt_config->addOptimizationProfile(trt_profile); } @@ -3752,6 +3752,11 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView trt_context = trt_state->context->get(); } + // Check before using trt_engine + if (trt_engine == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "No engine is found."); + } + // Get input and output binding names int total_bindings = trt_engine->getNbIOTensors(); std::vector input_binding_names, output_binding_names; @@ -4075,6 +4080,11 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream)); cudaStream_t stream = static_cast(cuda_stream); + // Check before using trt_engine + if (trt_engine == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "No engine is found."); + } + // Get input and output binding names int total_bindings = trt_engine->getNbIOTensors(); std::vector input_binding_names, output_binding_names; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 3f20314438564..97c9367b0bb61 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -175,7 +175,6 @@ struct TensorrtFuncState { bool int8_calibration_cache_available = false; bool dla_enable = false; int dla_core = 0; - size_t* 
max_workspace_size_ptr = nullptr; std::string trt_node_name_with_precision; bool engine_cache_enable = false; std::string engine_cache_path; @@ -290,7 +289,7 @@ class TensorrtExecutionProvider : public IExecutionProvider { cudaStream_t stream_ = nullptr; int max_partition_iterations_ = 1000; size_t min_subgraph_size_ = 1; - size_t max_workspace_size_ = 1 << 30; // 1GB + size_t max_workspace_size_ = 0; bool fp16_enable_ = false; bool int8_enable_ = false; bool dla_enable_ = false; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h index 50b934fd5fcbc..fa1bbd6d3d7e6 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h @@ -22,7 +22,7 @@ struct TensorrtExecutionProviderInfo { bool has_trt_options{false}; int max_partition_iterations{1000}; int min_subgraph_size{1}; - size_t max_workspace_size{1 << 30}; + size_t max_workspace_size{0}; bool fp16_enable{false}; bool int8_enable{false}; std::string int8_calibration_table_name{""}; diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py index aab04485246d6..d48964203ce76 100644 --- a/onnxruntime/python/tools/quantization/base_quantizer.py +++ b/onnxruntime/python/tools/quantization/base_quantizer.py @@ -515,8 +515,6 @@ def adjust_tensor_ranges(self): for node in self.model.nodes(): # adjust tensor_ranges for input of Clip and Relu node if node.op_type in ["Clip", "Relu"]: - if self.is_activation_symmetric: - continue if not self.should_quantize_node(node): continue if len(self.model.input_name_to_nodes()[node.input[0]]) != 1: diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index cc8bd622df9b1..c0cc4f038cd3b 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -712,14 +712,20 @@ def process(self): if self.algo_config.algorithm in ["HQQ", "DEFAULT"]: # use a stack to keep track of sub-graphs graph_stack = [self.model.graph()] - opset_import = self.model.opset_import() - - has_ms_domain = False - for opset in opset_import: - if opset.domain == "com.microsoft": - has_ms_domain = True - if not has_ms_domain: - opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)]) + + # Update domain opset + if self.algo_config.quant_format == QuantFormat.QOperator: + self.model.set_opset_import("com.microsoft", 1) + else: + opset_import = self.model.opset_import() + for opset in opset_import: + if opset.domain in [None, "ai.onnx", ""] and opset.version < 21: + logger.warning( + "The opset of the input model is under 21 and doesn't support int4 data type. " + "Force to update it to opset 21, but the generated model may not be a valid model." 
+ ) + self.model.set_opset_import(opset.domain, 21) + self._process_subgraph(graph_stack) self.model.clean_initializers() else: diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index 60bf90c243db0..b71f332252850 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -989,8 +989,7 @@ def is_tensor_per_channel( per_chan_overrides = self.tensor_quant_overrides.get_per_channel_overrides(tensor_name) axis = per_chan_overrides[0]["axis"] # Prefer axis from user-specified tensor-level overrides if available - weight_nparray = tensor_proto_to_array(weight_initializer) - weight_rank = len(weight_nparray.shape) + weight_rank = len(weight_initializer.dims) axis_valid, axis = normalize_axis(axis, weight_rank) if not axis_valid: logging.warning(f"Axis {axis} is out-of-range for weight '{tensor_name}' with rank {weight_rank}") diff --git a/onnxruntime/python/tools/quantization/tensor_quant_overrides.py b/onnxruntime/python/tools/quantization/tensor_quant_overrides.py index 6050bd2e05ec5..219d929d22fce 100644 --- a/onnxruntime/python/tools/quantization/tensor_quant_overrides.py +++ b/onnxruntime/python/tools/quantization/tensor_quant_overrides.py @@ -12,7 +12,7 @@ import onnx -from .quant_utils import QuantType, tensor_proto_to_array +from .quant_utils import QuantType @dataclass @@ -235,7 +235,7 @@ def _is_valid_per_channel( "the first channel dictionary.", ) - weight_shape = tensor_proto_to_array(initializers[tensor_name]).shape + weight_shape = list(initializers[tensor_name].dims) weight_rank = len(weight_shape) norm_axis = axis if norm_axis < 0: diff --git a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py index 4cc8a0c151d14..0438d93227524 100644 --- a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py +++ b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py @@ -156,6 +156,9 @@ def quant_test( } ) check_qtype_by_node_type(self, model_int4_path, dqnode_io_qtypes) + for op in quant.model.opset_import(): + if op.domain in [None, "", "ai.onnx"] and op.version < 21: + self.fail(f"In QDQ format {op.domain} opset should be >= 21") data_reader.rewind() diff --git a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py index 8691471b040a7..21a772c5f56c7 100644 --- a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py +++ b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py @@ -5,7 +5,9 @@ # license information. # -------------------------------------------------------------------------- +import os import struct +import tempfile import unittest import numpy as np @@ -1150,6 +1152,48 @@ def test_get_qnn_qdq_config_ext_data(self): self.assertEqual(set(qnn_config.op_types_to_quantize), {"Add"}) self.assertTrue(qnn_config.use_external_data_format) + def test_get_qnn_qdq_config_ext_data_separate_dir(self): + """ + Test that get_qnn_qdq_config() can validate per-channel quantization overrides for a model with external data + that is in a separate directory not in the cwd. + """ + + # Create model with a weight large enough (> 1024 bytes) to be stored externally. 
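The qdq_quantizer.py and tensor_quant_overrides.py changes above take the weight's rank and shape from the initializer's dims field instead of materializing the array with tensor_proto_to_array, so validating a per-channel axis no longer pulls in external weight data; that is exactly what the new test_get_qnn_qdq_config_ext_data_separate_dir test exercises. A hedged sketch of the idea:

```python
import onnx

def normalize_axis(axis: int, rank: int):
    # Same contract as quant_utils.normalize_axis: fold negative axes into range.
    norm = axis + rank if axis < 0 else axis
    return (0 <= norm < rank), norm

def per_channel_axis_is_valid(initializer: onnx.TensorProto, axis: int) -> bool:
    # initializer.dims is populated even when the raw data lives in an external
    # file, so this check never needs to load the weight payload itself.
    valid, _ = normalize_axis(axis, len(initializer.dims))
    return valid
```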
+ large_weight = onnx.numpy_helper.from_array(np.random.random((1, 2, 32, 32)).astype(np.float32), "weight") + graph = onnx.helper.make_graph( + [onnx.helper.make_node("Conv", ["input", "weight"], ["output"])], + "conv_ext_data", + [onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, (1, 2, 64, 64))], + [onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, None)], + initializer=[large_weight], + ) + model = onnx.helper.make_model( + graph, + opset_imports=[onnx.helper.make_opsetid("", 21)], + ) + + # Make a separate directory in which to save model and its external data. + model_dir_path = tempfile.mkdtemp(prefix="model_ext_data") + model_name = "conv_ext_data.onnx" + model_path = os.path.join(model_dir_path, model_name) + + onnx.save_model( + model, + str(model_path), + save_as_external_data=True, + ) + + # Use tensor quantization overrides to quantize Conv's weight input to 4 bits on axis 0. + init_overrides = {"weight": [{"quant_type": QuantType.QInt4, "axis": 0, "symmetric": True}]} + + # get_qnn_qdq_config() should be able to validate the per-channel axis without having to load + # the external weight data. + qnn_config = get_qnn_qdq_config( + str(model_path), DummyDataReader([]), init_overrides=init_overrides # Dummy data reader does nothing + ) + self.assertEqual(set(qnn_config.op_types_to_quantize), {"Conv"}) + self.assertTrue(qnn_config.use_external_data_format) + if __name__ == "__main__": t = TestTensorQuantOverridesOption() diff --git a/orttraining/orttraining/test/training_api/core/training_capi_tests.cc b/orttraining/orttraining/test/training_api/core/training_capi_tests.cc index 8f25e1e4c92b8..cff060134e679 100644 --- a/orttraining/orttraining/test/training_api/core/training_capi_tests.cc +++ b/orttraining/orttraining/test/training_api/core/training_capi_tests.cc @@ -265,6 +265,41 @@ TEST(TrainingCApiTest, LoadONNXModelsFromBuffer) { train_model_data); } +TEST(TrainingCApiTest, LoadONNXModelsFromBufferThenExport) { + auto model_path = MODEL_FOLDER "training_model.onnx"; + size_t model_data_len = 0; + ASSERT_STATUS_OK(Env::Default().GetFileLength(model_path, model_data_len)); + std::vector train_model_data(model_data_len); + std::ifstream bytes_stream(model_path, std::ifstream::in | std::ifstream::binary); + bytes_stream.read(reinterpret_cast(train_model_data.data()), model_data_len); + ASSERT_TRUE(train_model_data.size() == model_data_len); + + auto eval_model_path = MODEL_FOLDER "eval_model.onnx"; + size_t eval_model_data_len = 0; + ASSERT_STATUS_OK(Env::Default().GetFileLength(eval_model_path, eval_model_data_len)); + std::vector eval_model_data(eval_model_data_len); + std::ifstream eval_bytes_stream(eval_model_path, std::ifstream::in | std::ifstream::binary); + eval_bytes_stream.read(reinterpret_cast(eval_model_data.data()), eval_model_data_len); + ASSERT_TRUE(eval_model_data.size() == eval_model_data_len); + + Ort::Env env; + Ort::CheckpointState checkpoint_state = Ort::CheckpointState::LoadCheckpoint(MODEL_FOLDER "checkpoint.ckpt"); + Ort::TrainingSession training_session = Ort::TrainingSession(env, + Ort::SessionOptions(), + checkpoint_state, + train_model_data, + eval_model_data); + + // randomly selected output name + std::vector graph_output_names({"onnx::loss::21273"}); + training_session.ExportModelForInferencing(MODEL_FOLDER "inference_model.onnx", graph_output_names); + + // Check that the model is a valid inference model by loading into an InferenceSession + std::unique_ptr environment; + 
ASSERT_STATUS_OK(Environment::Create(nullptr, environment)); + InferenceSession inference_session = InferenceSession(SessionOptions(), *environment, MODEL_FOLDER "inference_model.onnx"); +} + TEST(TrainingCApiTest, LoadORTFormatModelsFromBuffer) { auto train_model_path = ORT_FORMAT_MODEL_FOLDER "training_model.ort"; auto eval_model_path = ORT_FORMAT_MODEL_FOLDER "eval_model.ort"; diff --git a/orttraining/orttraining/training_api/module.cc b/orttraining/orttraining/training_api/module.cc index dc724fbae48eb..939e1de334e52 100644 --- a/orttraining/orttraining/training_api/module.cc +++ b/orttraining/orttraining/training_api/module.cc @@ -412,11 +412,12 @@ Module::Module(const ModelIdentifiers& model_identifiers, eval_user_input_count_ = eval_user_input_names.size(); eval_input_names_.insert(eval_input_names_.end(), eval_param_input_names.begin(), eval_param_input_names.end()); - // Keep a copy of the eval model path to be able to later export the model for inferencing. + // Keep a copy of the eval model path or buffer to be able to later export the model for inferencing. // The inference model will be reconstructed from the eval model. - // TODO(askhade): Find a fix to export model for inference when the eval model is loaded from a buffer. if (std::holds_alternative>(model_identifiers.eval_model)) { eval_model_path_ = std::get>(model_identifiers.eval_model); + } else if (std::holds_alternative>(model_identifiers.eval_model)) { + eval_model_buffer_ = std::get>(model_identifiers.eval_model); } } @@ -658,11 +659,17 @@ Status Module::ExportModelForInferencing(const std::string& inference_model_path gsl::span graph_output_names) const { ORT_RETURN_IF(state_->module_checkpoint_state.is_nominal_state, "Cannot export the model with a nominal state. Please load the model parameters first."); - ORT_RETURN_IF(!eval_sess_ || !eval_model_path_.has_value(), + ORT_RETURN_IF(!eval_sess_ || (!eval_model_path_.has_value() && !eval_model_buffer_.has_value()), "Eval model was not provided. Cannot export a model for inferencing."); ONNX_NAMESPACE::ModelProto eval_model; - ORT_THROW_IF_ERROR(Model::Load(ToPathString(eval_model_path_.value()), eval_model)); + if (eval_model_path_.has_value()) { + ORT_THROW_IF_ERROR(Model::Load(ToPathString(eval_model_path_.value()), eval_model)); + } else if (eval_model_buffer_.has_value()) { + int eval_model_buffer_size = static_cast(eval_model_buffer_.value().size()); + const void* eval_model_buffer_ptr = static_cast(eval_model_buffer_.value().data()); + ORT_THROW_IF_ERROR(Model::LoadFromBytes(eval_model_buffer_size, eval_model_buffer_ptr, eval_model)); + } // Clone the eval mode into an inference onnxruntime::Model. 
std::shared_ptr<Model> inference_model; diff --git a/orttraining/orttraining/training_api/module.h b/orttraining/orttraining/training_api/module.h index 917887404217f..f4d894f33516a 100644 --- a/orttraining/orttraining/training_api/module.h +++ b/orttraining/orttraining/training_api/module.h @@ -198,6 +198,7 @@ struct Module { bool accumulate_gradient_ = false; std::optional<std::string> eval_model_path_; + std::optional<gsl::span<const uint8_t>> eval_model_buffer_; size_t eval_user_input_count_{0U}; }; diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml index 3e8366b11f4aa..436d914c426ad 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml @@ -8,6 +8,7 @@ jobs: pool: vmImage: windows-2019 timeoutInMinutes: 60 + continueOnError: true workspace: clean: all steps:
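Tying together the matmul_4bits_quantizer.py change and the new test_op_matmul_4bits assertion above: in QOperator format the quantizer now registers the com.microsoft domain, while in QDQ format it bumps the ai.onnx opset to at least 21, since the int4/uint4 tensor types only exist from opset 21 onward. A sketch of the check the test performs (placeholder path):

```python
import onnx

model = onnx.load("model_int4.onnx")  # placeholder for the quantized output
ai_onnx_version = next(
    (op.version for op in model.opset_import if op.domain in ("", "ai.onnx")), None
)
# QDQ-format 4-bit models need opset 21+ for the int4/uint4 element types.
assert ai_onnx_version is not None and ai_onnx_version >= 21
```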