diff --git a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts index 13d3a91bb339e..9c05080f7e118 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts @@ -62,14 +62,24 @@ const createBinaryOpProgramShader = let assignment: string; if (vectorize) { if (doBroadcast) { - assignment = ` + const isAOneElement = ShapeUtil.size(dimsA) === 1; + const isBOneElement = ShapeUtil.size(dimsB) === 1; + if (isAOneElement || isBOneElement) { + assignment = output.setByOffset( + 'global_idx', + expressionVector( + isAOneElement ? `${a.type.value}(${a.getByOffset('0')}.x)` : a.getByOffset('global_idx'), + isBOneElement ? `${b.type.value}(${b.getByOffset('0')}.x)` : b.getByOffset('global_idx'))); + } else { + assignment = ` let outputIndices = ${output.offsetToIndices('global_idx * 4u')}; let offsetA = calcOffsetA(outputIndices); let offsetB = calcOffsetB(outputIndices); ${ - output.setByOffset( - 'global_idx', expressionVector(a.getByOffset('offsetA / 4u'), b.getByOffset('offsetB / 4u')))} + output.setByOffset( + 'global_idx', expressionVector(a.getByOffset('offsetA / 4u'), b.getByOffset('offsetB / 4u')))} `; + } } else { assignment = output.setByOffset( 'global_idx', expressionVector(a.getByOffset('global_idx'), b.getByOffset('global_idx'))); @@ -141,6 +151,8 @@ const createBinaryOpProgramInfo = } outputShape = calculatedShape; outputSize = ShapeUtil.size(outputShape); + const isAOneElement = ShapeUtil.size(a.dims) === 1; + const isBOneElement = ShapeUtil.size(b.dims) === 1; // check whether vectorize can be enabled let sharedDimension = 1; @@ -153,7 +165,7 @@ const createBinaryOpProgramInfo = break; } } - if (sharedDimension % 4 === 0) { + if (sharedDimension % 4 === 0 || isAOneElement || isBOneElement) { vectorize = true; } } else { @@ -167,8 +179,7 @@ const createBinaryOpProgramInfo = shaderHelper, a.dims, b.dims, outputShape, vectorize, isBroadcast, funcCall, a.dataType, b.dataType, outputDataType, additionalImplementation), outputs: [{dims: outputShape, dataType: outputDataType, gpuDataType: GpuDataType.default}], - dispatchGroup: () => - ({x: Math.ceil(outputSize / 64 /* workgroup size */ / (vectorize ? 4 : 1) /* vec size */)}) + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */ / 4 /* component size */)}) }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts index 7b82dc769535f..2df59f8ae467d 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts @@ -366,7 +366,7 @@ const createIndicesHelper = const getByIndicesImplementation = rank < 2 ? '' : ` fn get_${name}ByIndices(indices: ${type.indices}) -> ${valueType} { - return ${name}[i2o_${name}(indices)]; + return ${getByOffset(`i2o_${name}(indices)`)}; }`; const getImplementation = rank < 2 ? '' : (() => { diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts index 0db060dbec54a..47aae13d6799d 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts @@ -1,13 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; -import {ShaderHelper} from './common'; +import {inputVariable, outputVariable, ShaderHelper} from './common'; export interface GatherAttributes extends AttributeWithCacheKey { axis: number; @@ -30,63 +29,55 @@ const createGatherProgramInfo = const outputShape = inputShape.slice(0); outputShape.splice(axis, 1, ...indicesShape); - const inputDataType = inputs[0].dataType; - const block = ShapeUtil.sizeFromDimension(inputShape, axis + 1); - const elementSize = [DataType.int64, DataType.uint64, DataType.double].includes(inputDataType) ? 2 : 1; - const indicesElementSize = inputs[1].dataType === DataType.int64 ? 2 : 1; - const blockSize = elementSize * block; - const M = ShapeUtil.sizeToDimension(inputShape, axis); - const N = ShapeUtil.size(indicesShape); - const dataBatchElements = ShapeUtil.sizeFromDimension(inputShape, axis) * elementSize; - const gatheredBatchElements = N * block * elementSize; const axisDimLimit = inputShape[axis]; + const outputSize = ShapeUtil.size(outputShape); + + const data = inputVariable('data', inputs[0].dataType, inputs[0].dims); + const indices = inputVariable('inputIndices', inputs[1].dataType, inputs[1].dims); + const output = outputVariable('output', inputs[0].dataType, outputShape); + const calcDataIndices = (): string => { + const indicesRank = indicesShape.length; + let calcStr = `var indicesIndices = ${indices.type.indices}(0);`; + for (let i = 0; i < indicesRank; i++) { + calcStr += `${indicesRank > 1 ? `indicesIndices[${i}]` : 'indicesIndices'} = ${ + outputShape.length > 1 ? `outputIndices[${axis + i}]` : 'outputIndices'};`; + } + calcStr += ` + var idx = ${indices.getByIndices('indicesIndices')}; + if (idx < 0) { + idx = idx + ${axisDimLimit}; + } + var dataIndices = ${data.type.indices}(0); + `; + for (let i = 0, j = 0; i < inputRank; i++) { + if (i === axis) { + calcStr += `${inputRank > 1 ? `dataIndices[${i}]` : 'dataIndices'} = u32(idx);`; + j += indicesRank; + } else { + calcStr += `${inputRank > 1 ? `dataIndices[${i}]` : 'dataIndices'} = ${ + outputShape.length > 1 ? `outputIndices[${j}]` : 'outputIndices'};`; + j++; + } + } + return calcStr; + }; - const inputSize = ShapeUtil.size(inputShape) * elementSize; - const outputSize = ShapeUtil.size(outputShape) * elementSize; - - const totalGathers = M * N; - // int64 indices would be treated as little endian i32 with assumption they fall in i32 limits - // That assumption is safe as it's not possible to allocate >2gb buffer for input tensor - // Input data will be treated as u32 or two u32 for 8-byte tensors const getShaderSource = (shaderHelper: ShaderHelper) => ` - const N: u32 = ${N}; - const elementSize: u32 = ${elementSize}; - const indicesElementSize: u32 = ${indicesElementSize}; - - @group(0) @binding(0) var input : array; - @group(0) @binding(1) var inputIndices : array; - @group(0) @binding(2) var output: array; - - ${shaderHelper.mainStart()} - let batch: u32 = global_idx / N; - let i: u32 = global_idx % N; - - let srcOffsetBatch: u32 = batch * ${dataBatchElements}; - let dstOffsetBatch: u32 = batch * ${gatheredBatchElements}; - var idx = inputIndices[i * indicesElementSize]; - if (idx < 0) { - idx = idx + ${axisDimLimit}; - } - - let srcOffset = srcOffsetBatch + u32(idx) * ${blockSize}; - let dstOffset = dstOffsetBatch + i * ${blockSize}; - if (srcOffset >= ${inputSize}) { - return; - } - if (dstOffset >= ${outputSize}) { - return; - } - for (var j: u32 = 0; j < ${blockSize}; j++) { - output[dstOffset + j] = input[srcOffset + j]; - } - }`; + ${shaderHelper.declareVariables(data, indices, output)} + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} + let outputIndices = ${output.offsetToIndices('global_idx')}; + ${calcDataIndices()}; + let value = ${data.getByIndices('dataIndices')}; + ${output.setByOffset('global_idx', 'value')}; + }`; return { ...metadata, outputs: [ {dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}, ], getShaderSource, - dispatchGroup: () => ({x: Math.ceil(totalGathers / 64 /* workgroup size */)}) + dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)}) }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts index 8c8c12fc54ddb..120a0e9de5490 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {PoolConvUtil, ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; @@ -22,9 +21,6 @@ const validateInputs = (inputs: readonly TensorView[]): void => { if (inputs[0].dims.length !== 4) { throw new Error('Pool ops supports 2-D inputs only for now.'); } - if (inputs[0].dataType !== DataType.float) { - throw new Error('Invalid input type.'); - } }; const getAdjustedPoolAttributesAndOutputShape = ( @@ -248,7 +244,7 @@ const createAveragePoolProgramInfo = const kernelSize = ShapeUtil.size(adjustedAttributes.kernelShape); const x = inputVariable('x', input.dataType, input.dims); - const dataType = 'f32'; + const dataType = x.type.value; const op1 = 'value += x_val;'; let op2 = ''; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts b/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts index 0b8d03ea73b6b..598b1db033c61 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts @@ -17,10 +17,6 @@ const validateInputs = (inputs: readonly TensorView[]): void => { if (inputs.length === 2 && inputs[1].dims.length !== 1) { throw new Error('Invalid axes input dims.'); } - - if (inputs[0].dataType !== DataType.float) { - throw new Error('Invalid input type.'); - } }; export interface ReduceAttributes extends AttributeWithCacheKey { @@ -161,7 +157,7 @@ export const reduceL1 = (context: ComputeContext, attributes: ReduceAttributes): export const reduceL2 = (context: ComputeContext, attributes: ReduceAttributes): void => { validateInputs(context.inputs); const reduceOp: ReduceOp = (input, output) => - [`var t = f32(0); var value = ${output.type.storage}(0);`, + [`var t = ${output.type.value}(0); var value = ${output.type.value}(0);`, '', `t = ${input.getByOffset('inputOffset')}; value += (t * t);`, 'value = sqrt(value);', @@ -212,10 +208,10 @@ export const reduceMean = (context: ComputeContext, attributes: ReduceAttributes } return [ - `var value = ${output.type.storage}(0);`, + 'var sum = f32(0);', '', - `value += ${input.getByOffset('inputOffset')};`, - `value = value / ${size}.;`, + `sum += f32(${input.getByOffset('inputOffset')});`, + `let value = ${output.type.value}(sum / ${size});`, ]; }; context.compute(createReduceProgramInfoLoader(context.inputs, 'ReduceMean', attributes, reduceOp), {inputs: [0]}); @@ -266,7 +262,7 @@ export const reduceSum = (context: ComputeContext, attributes: ReduceAttributes) export const reduceSumSquare = (context: ComputeContext, attributes: ReduceAttributes): void => { validateInputs(context.inputs); const reduceOp: ReduceOp = (input, output) => - [`var t = f32(0); var value = ${output.type.storage}(0);`, + [`var t = ${output.type.value}(0); var value = ${output.type.value}(0);`, '', `t = ${input.getByOffset('inputOffset')}; value += t * t;`, '', diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc index a23409292bb74..6a82b3fcc734d 100644 --- a/onnxruntime/core/common/cpuid_info.cc +++ b/onnxruntime/core/common/cpuid_info.cc @@ -135,38 +135,34 @@ void CPUIDInfo::ArmLinuxInit() { LOGS_DEFAULT(WARNING) << "Failed to init pytorch cpuinfo library, may cause CPU EP performance degradation due to undetected CPU features."; return; } + is_hybrid_ = cpuinfo_get_uarchs_count() > 1; + has_arm_neon_dot_ = cpuinfo_has_arm_neon_dot(); + has_fp16_ = cpuinfo_has_arm_neon_fp16_arith(); + const uint32_t core_cnt = cpuinfo_get_cores_count(); + core_uarchs_.resize(core_cnt, cpuinfo_uarch_unknown); + is_armv8_narrow_ld_.resize(core_cnt, false); + for (uint32_t c = 0; c < core_cnt; c++) { + const struct cpuinfo_processor* proc = cpuinfo_get_processor(c); + if (proc == nullptr) { + continue; + } + const struct cpuinfo_core* corep = proc->core; + if (corep == nullptr) { + continue; + } + auto coreid = proc->linux_id; + auto uarch = corep->uarch; + core_uarchs_[coreid] = uarch; + if (uarch == cpuinfo_uarch_cortex_a53 || uarch == cpuinfo_uarch_cortex_a55r0 || + uarch == cpuinfo_uarch_cortex_a55) { + is_armv8_narrow_ld_[coreid] = true; + } + } #else pytorch_cpuinfo_init_ = false; + has_arm_neon_dot_ = ((getauxval(AT_HWCAP) & HWCAP_ASIMDDP) != 0); + has_fp16_ |= has_arm_neon_dot_; #endif - - if (pytorch_cpuinfo_init_) { - is_hybrid_ = cpuinfo_get_uarchs_count() > 1; - has_arm_neon_dot_ = cpuinfo_has_arm_neon_dot(); - has_fp16_ = cpuinfo_has_arm_neon_fp16_arith(); - const uint32_t core_cnt = cpuinfo_get_cores_count(); - core_uarchs_.resize(core_cnt, cpuinfo_uarch_unknown); - is_armv8_narrow_ld_.resize(core_cnt, false); - for (uint32_t c = 0; c < core_cnt; c++) { - const struct cpuinfo_processor* proc = cpuinfo_get_processor(c); - if (proc == nullptr) { - continue; - } - const struct cpuinfo_core* corep = proc->core; - if (corep == nullptr) { - continue; - } - auto coreid = proc->linux_id; - auto uarch = corep->uarch; - core_uarchs_[coreid] = uarch; - if (uarch == cpuinfo_uarch_cortex_a53 || uarch == cpuinfo_uarch_cortex_a55r0 || - uarch == cpuinfo_uarch_cortex_a55) { - is_armv8_narrow_ld_[coreid] = true; - } - } - } else { - has_arm_neon_dot_ = ((getauxval(AT_HWCAP) & HWCAP_ASIMDDP) != 0); - has_fp16_ |= has_arm_neon_dot_; - } } #elif defined(_WIN32) diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index 7e2b117d6f249..96bc1d8010bed 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -112,6 +112,14 @@ MLAS_INTERNAL_DATA MLAS_DECLSPEC_ALIGN(const int16_t MlasOpmask16BitTableAvx512[ #define _XCR_XFEATURE_ENABLED_MASK 0 #endif +#if !defined(XFEATURE_MASK_XTILE) +#define XFEATURE_XTILECFG 17 +#define XFEATURE_XTILEDATA 18 +#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG) +#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) +#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) +#endif + inline uint64_t MlasReadExtendedControlRegister( @@ -142,11 +150,6 @@ bool MlasInitAMX() { #if defined(__linux__) -#define XFEATURE_XTILECFG 17 -#define XFEATURE_XTILEDATA 18 -#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG) -#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) -#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) #define ARCH_GET_XCOMP_PERM 0x1022 #define ARCH_REQ_XCOMP_PERM 0x1023 @@ -417,7 +420,9 @@ Return Value: // Check if the processor supports AMX-TILE and AMX-INT8 // features. // - if ((Cpuid7[3] & 0b1 << 24) != 0 && (Cpuid7[3] & 0b1 << 25) != 0) { + if ((Cpuid7[3] & 0b1 << 24) != 0 && + (Cpuid7[3] & 0b1 << 25) != 0 && + (xcr0 & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE) { if (MlasInitAMX()) { this->GemmU8U8Dispatch = &MlasGemmU8S8DispatchAmx; this->GemmU8S8Dispatch = &MlasGemmU8S8DispatchAmx; diff --git a/onnxruntime/core/optimizer/layer_norm_fusion.cc b/onnxruntime/core/optimizer/layer_norm_fusion.cc index bf36f11521be2..159e3b23d1ab0 100644 --- a/onnxruntime/core/optimizer/layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/layer_norm_fusion.cc @@ -414,20 +414,20 @@ Status LayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, NodeArg* scale = nullptr; NodeArg* bias = nullptr; for (size_t i = 0; i < mul_node.MutableInputDefs().size(); i++) { - if (graph_utils::NodeArgIsConstant(graph, *(mul_node.MutableInputDefs()[i])) || - graph_utils::IsGraphInput(graph, mul_node.MutableInputDefs()[i])) { - if (mul_node.MutableInputDefs()[i]->Shape()->dim_size() == static_cast(axes_values.size())) { - scale = mul_node.MutableInputDefs()[i]; - } + if (mul_node.MutableInputDefs()[i]->Shape() == nullptr) { + continue; + } + if (mul_node.MutableInputDefs()[i]->Shape()->dim_size() == static_cast(axes_values.size())) { + scale = mul_node.MutableInputDefs()[i]; } } for (size_t i = 0; i < last_add_node.MutableInputDefs().size(); i++) { - if (graph_utils::NodeArgIsConstant(graph, *(last_add_node.MutableInputDefs()[i])) || - graph_utils::IsGraphInput(graph, last_add_node.MutableInputDefs()[i])) { - if (last_add_node.MutableInputDefs()[i]->Shape()->dim_size() == static_cast(axes_values.size())) { - bias = last_add_node.MutableInputDefs()[i]; - } + if (last_add_node.MutableInputDefs()[i]->Shape() == nullptr) { + continue; + } + if (last_add_node.MutableInputDefs()[i]->Shape()->dim_size() == static_cast(axes_values.size())) { + bias = last_add_node.MutableInputDefs()[i]; } } if (scale == nullptr || bias == nullptr) { @@ -667,20 +667,20 @@ Status SimplifiedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int gr // because SkipLayerNorm kernel, for example, has dependency on single dim size NodeArg* scale = nullptr; for (size_t i = 0; i < mul_node.MutableInputDefs().size(); i++) { - if (graph_utils::NodeArgIsConstant(graph, *(mul_node.MutableInputDefs()[i])) || - graph_utils::IsGraphInput(graph, mul_node.MutableInputDefs()[i])) { + if (mul_node.MutableInputDefs()[i]->Shape() == nullptr) { + continue; + } #ifdef ENABLE_TRAINING_CORE - if (axes_values.empty() || - mul_node.MutableInputDefs()[i]->Shape()->dim_size() == static_cast(axes_values.size())) { - scale = mul_node.MutableInputDefs()[i]; - } + if (axes_values.empty() || + mul_node.MutableInputDefs()[i]->Shape()->dim_size() == static_cast(axes_values.size())) { + scale = mul_node.MutableInputDefs()[i]; + } #else - // Scale must be 1d. - if (mul_node.MutableInputDefs()[i]->Shape()->dim_size() == 1) { - scale = mul_node.MutableInputDefs()[i]; - } -#endif + // Scale must be 1d. + if (mul_node.MutableInputDefs()[i]->Shape()->dim_size() == 1) { + scale = mul_node.MutableInputDefs()[i]; } +#endif } if (scale == nullptr) { diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 7f6e065001488..fa462323192ef 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -129,56 +129,56 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 14, Rel class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, 15, LeakyRelu); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 16, LeakyRelu); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceMax); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, float, ReduceMax); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 12, float, ReduceMax); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceMax); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceMax); - -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceMean); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceMean); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceMean); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceMean); - -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceMin); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, float, ReduceMin); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 12, float, ReduceMin); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceMin); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceMin); - -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceProd); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceProd); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceProd); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceProd); - -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceSum); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceSum); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, float, ReduceSum); - -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceL1); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceL1); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceL1); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceL1); - -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceL2); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceL2); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceL2); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceL2); - -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceLogSum); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceLogSum); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceLogSum); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceLogSum); - -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceSumSquare); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceSumSquare); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceSumSquare); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceSumSquare); - -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceLogSumExp); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceLogSumExp); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceLogSumExp); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceLogSumExp); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceMax); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, ReduceMax); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 12, ReduceMax); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceMax); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceMax); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceMean); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceMean); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceMean); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceMean); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceMin); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, ReduceMin); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 12, ReduceMin); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceMin); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceMin); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceProd); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceProd); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceProd); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceProd); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceSum); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceSum); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, ReduceSum); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceL1); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceL1); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceL1); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceL1); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceL2); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceL2); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceL2); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceL2); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceLogSum); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceLogSum); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceLogSum); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceLogSum); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceSumSquare); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceSumSquare); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceSumSquare); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceSumSquare); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceLogSumExp); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceLogSumExp); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceLogSumExp); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceLogSumExp); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, ThresholdedRelu); @@ -237,11 +237,11 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Tra class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, Conv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, ConvTranspose); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 11, float, MaxPool); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 12, float, MaxPool); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, AveragePool); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, float, GlobalAveragePool); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, float, GlobalMaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, AveragePool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, GlobalAveragePool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, GlobalMaxPool); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, Conv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, Conv); @@ -254,16 +254,16 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Gem class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, MatMul); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, MatMul); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, float, AveragePool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, AveragePool); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, float, GlobalAveragePool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 7, float, MaxPool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 9, float, MaxPool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, float, MaxPool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, float, MaxPool); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, float, MaxPool); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, float, GlobalMaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, AveragePool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, AveragePool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, AveragePool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, GlobalAveragePool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 7, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 9, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, MaxPool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, MaxPool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, GlobalMaxPool); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ArgMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ArgMax); @@ -441,71 +441,71 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -518,16 +518,16 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/js/operators/pool.cc b/onnxruntime/core/providers/js/operators/pool.cc index 03e6caef7e5b8..7fdb4e5d114ea 100644 --- a/onnxruntime/core/providers/js/operators/pool.cc +++ b/onnxruntime/core/providers/js/operators/pool.cc @@ -8,69 +8,65 @@ namespace onnxruntime { namespace js { -#define POOLING_KERNEL(op_name, domain, is_channels_last, data_type, pool_type, since_version) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - op_name, \ - domain, \ - since_version, \ - data_type, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Pool); +#define POOLING_KERNEL(op_name, domain, is_channels_last, pool_type, since_version) \ + ONNX_OPERATOR_KERNEL_EX( \ + op_name, \ + domain, \ + since_version, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), \ + Pool); -#define POOLING_KERNEL_VERSIONED(op_name, domain, is_channels_last, data_type, pool_type, since_version, end_version) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - op_name, \ - domain, \ - since_version, \ - end_version, \ - data_type, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Pool); +#define POOLING_KERNEL_VERSIONED(op_name, domain, is_channels_last, pool_type, since_version, end_version) \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ + op_name, \ + domain, \ + since_version, \ + end_version, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", JsepSupportedFloatTypes()), \ + Pool); -#define POOLING_KERNEL_WITH_INDICES(op_name, domain, is_channels_last, data_type, pool_type, since_version) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - op_name, \ - domain, \ - since_version, \ - data_type, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ - Pool); +#define POOLING_KERNEL_WITH_INDICES(op_name, domain, is_channels_last, pool_type, since_version) \ + ONNX_OPERATOR_KERNEL_EX( \ + op_name, \ + domain, \ + since_version, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", JsepSupportedFloatTypes()) \ + .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ + Pool); -#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, domain, is_channels_last, data_type, pool_type, since_version, end_version) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - op_name, \ - domain, \ - since_version, \ - end_version, \ - data_type, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ - Pool); +#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, domain, is_channels_last, pool_type, since_version, end_version) \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ + op_name, \ + domain, \ + since_version, \ + end_version, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", JsepSupportedFloatTypes()) \ + .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ + Pool); -POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, float, AveragePool, 7, 9) -POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, float, AveragePool, 10, 10) -POOLING_KERNEL(AveragePool, kOnnxDomain, false, float, AveragePool, 11) -POOLING_KERNEL(AveragePool, kMSInternalNHWCDomain, true, float, AveragePool, 11) -POOLING_KERNEL(GlobalAveragePool, kOnnxDomain, false, float, AveragePool, 1) -POOLING_KERNEL(GlobalAveragePool, kMSInternalNHWCDomain, true, float, AveragePool, 1) +POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, AveragePool, 7, 9) +POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, AveragePool, 10, 10) +POOLING_KERNEL(AveragePool, kOnnxDomain, false, AveragePool, 11) +POOLING_KERNEL(AveragePool, kMSInternalNHWCDomain, true, AveragePool, 11) +POOLING_KERNEL(GlobalAveragePool, kOnnxDomain, false, AveragePool, 1) +POOLING_KERNEL(GlobalAveragePool, kMSInternalNHWCDomain, true, AveragePool, 1) -POOLING_KERNEL_VERSIONED(MaxPool, kOnnxDomain, false, float, MaxPool<1>, 1, 7) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 8, 9) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 10, 10) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 11, 11) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, float, MaxPool<8>, 11, 11) -POOLING_KERNEL_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 12) -POOLING_KERNEL_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, float, MaxPool<8>, 12) -POOLING_KERNEL(GlobalMaxPool, kOnnxDomain, false, float, MaxPool<1>, 1) -POOLING_KERNEL(GlobalMaxPool, kMSInternalNHWCDomain, true, float, MaxPool<1>, 1) +POOLING_KERNEL_VERSIONED(MaxPool, kOnnxDomain, false, MaxPool<1>, 1, 7) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 8, 9) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 10, 10) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 11, 11) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, MaxPool<8>, 11, 11) +POOLING_KERNEL_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 12) +POOLING_KERNEL_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, MaxPool<8>, 12) +POOLING_KERNEL(GlobalMaxPool, kOnnxDomain, false, MaxPool<1>, 1) +POOLING_KERNEL(GlobalMaxPool, kMSInternalNHWCDomain, true, MaxPool<1>, 1) } // namespace js } // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/pool.h b/onnxruntime/core/providers/js/operators/pool.h index 5dbe5d0b8881d..5723123c0c3b8 100644 --- a/onnxruntime/core/providers/js/operators/pool.h +++ b/onnxruntime/core/providers/js/operators/pool.h @@ -41,7 +41,7 @@ namespace js { #define GLOBAL_POOL_ATTRIBUTES_JS_OBJ_MAPPING ({"format" : $1 ? "NHWC" : "NCHW"}) #define GLOBAL_POOL_ATTRIBUTES_PARAM_LIST static_cast(is_channels_last) -template +template class Pool : public JsKernel, public PoolBase { public: Pool(const OpKernelInfo& info) : JsKernel(info), PoolBase(info) { @@ -65,10 +65,10 @@ class Pool : public JsKernel, public PoolBase { } }; -template -class Pool, is_channels_last> final : public Pool, is_channels_last> { +template +class Pool, is_channels_last> final : public Pool, is_channels_last> { public: - Pool(const OpKernelInfo& info) : Pool, is_channels_last>(info) {} + Pool(const OpKernelInfo& info) : Pool, is_channels_last>(info) {} }; } // namespace js diff --git a/onnxruntime/core/providers/js/operators/reduce.cc b/onnxruntime/core/providers/js/operators/reduce.cc index 21854fccc37ca..2679cfed86124 100644 --- a/onnxruntime/core/providers/js/operators/reduce.cc +++ b/onnxruntime/core/providers/js/operators/reduce.cc @@ -7,32 +7,30 @@ namespace onnxruntime { namespace js { #define REGISTER_REDUCE_ELEMENTWISE_VERSIONED_KERNEL(ReduceOp, sinceVersion, endVersion) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ ReduceOp, \ kOnnxDomain, \ sinceVersion, endVersion, \ - float, \ kJsExecutionProvider, \ (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - ReduceOp); + .TypeConstraint("T", JsepSupportedFloatTypes()), \ + ReduceOp); // macro REGISTER_REDUCE_ELEMENTWISE_VERSIONED_KERNEL does not set .InputMemoryType(OrtMemTypeCPU, 1), so in future if // a new opset version update applies to Reduce* operators, we may need to add another macro like // REGISTER_REDUCE_ELEMENTWISE_VERSIONED_KERNEL_WITH_AXIS_IN_INPUT to set input memory type. // i.e. we cannot use REGISTER_REDUCE_ELEMENTWISE_VERSIONED_KERNEL to version 18 when the opset version is increased. -#define REGISTER_REDUCE_ELEMENTWISE_KERNEL(ReduceOp, sinceVersion) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - ReduceOp, \ - kOnnxDomain, \ - sinceVersion, \ - float, \ - kJsExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .InputMemoryType(OrtMemTypeCPU, 1), \ - ReduceOp); +#define REGISTER_REDUCE_ELEMENTWISE_KERNEL(ReduceOp, sinceVersion) \ + ONNX_OPERATOR_KERNEL_EX( \ + ReduceOp, \ + kOnnxDomain, \ + sinceVersion, \ + kJsExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", JsepSupportedFloatTypes()) \ + .InputMemoryType(OrtMemTypeCPU, 1), \ + ReduceOp); REGISTER_REDUCE_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 1, 10); REGISTER_REDUCE_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 11, 12); diff --git a/onnxruntime/core/providers/js/operators/reduce.h b/onnxruntime/core/providers/js/operators/reduce.h index 19a6d298c7696..a5a4aa834c2ca 100644 --- a/onnxruntime/core/providers/js/operators/reduce.h +++ b/onnxruntime/core/providers/js/operators/reduce.h @@ -9,7 +9,7 @@ namespace onnxruntime { namespace js { #define JSEP_DEFINE_REDUCE_KERNEL(ReduceKernel) \ - template \ + template \ class ReduceKernel : public JsKernel, public ReduceKernelBase { \ public: \ using ReduceKernelBase::axes_; \ diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc index 511f2a5149f2e..4039c4fbf8d70 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc @@ -2,7 +2,8 @@ // Licensed under the MIT License. #include -#include +#include +#include #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" @@ -42,76 +43,6 @@ class ResizeOpBuilder : public BaseOpBuilder { bool do_op_validation) const override ORT_MUST_USE_RESULT; private: - /** - * Returns the QNN integer value that corresponds to the given ONNX mode (string). - * - * /param onnx_modes Array of ONNX modes supported by QNN. The index of each mode corresponds to the QNN value. - * /param onnx_mode The ONNX mode for which to get the corresponding QNN value. - * /param onnx_model_label Mode label to print out in case of error (e.g., "nearest_mode"). - * /param qnn_mode Output parameter that is set to the appropriate QNN value from the given ONNX mode. - * - * /returns A status indicating failure or success. - */ - template - Status GetQnnModeFromString(const std::array& onnx_modes, std::string_view onnx_mode, - const char* onnx_mode_label, QnnValType& qnn_mode) const ORT_MUST_USE_RESULT; - - /** - * Called by IsOpSupported to validate the op for non-quantized models. - * - * /param qnn_model_wrapper The QNN model wrapper instance. - * /param node_unit The node unit containing metadata for the ONNX Resize operator. - * - * /returns A status indicating failure or success. - */ - Status ValidateOp(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const ORT_MUST_USE_RESULT; - - /** - * Called by IsOpSupported to validate the op for quantized models. - * - * /param qnn_model_wrapper The QNN model wrapper instance. - * /param node_unit The node unit containing metadata for the ONNX Resize operator and its Q/DQ nodes. - * - * /returns A status indicating failure or success. - */ - Status ValidateQDQOp(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const ORT_MUST_USE_RESULT; - - /** - * Called by ProcessAttributesAndOutputs to process the op's attributes and outputs - * for non-quantized models. - * - * /param qnn_model_wrapper The QNN model wrapper instance. - * /param node_unit The node unit containing metadata for the ONNX Resize operator. - * /param input_names The operator's input names. - * /param logger A logger. - * /param do_op_validation Set to true if the op should be validated using QNN's validation API. - * - * /returns A status indicating failure or success. - */ - Status ProcessOpAttrsAndOutputs(QnnModelWrapper& qnn_model_wrapper, - const NodeUnit& node_unit, - std::vector&& input_names, - const logging::Logger& logger, - bool do_op_validation) const ORT_MUST_USE_RESULT; - - /** - * Called by ProcessAttributesAndOutputs to process the op's attributes and outputs - * for quantized models. - * - * /param qnn_model_wrapper The QNN model wrapper instance. - * /param node_unit The node unit containing metadata for the ONNX Resize operator and its Q/DQ nodes. - * /param input_names The operator's input names. - * /param logger A logger. - * /param do_op_validation Set to true if the op should be validated using QNN's validation API. - * - * /returns A status indicating failure or success. - */ - Status ProcessQDQOpAttrsAndOutputs(QnnModelWrapper& qnn_model_wrapper, - const NodeUnit& node_unit, - std::vector&& input_names, - const logging::Logger& logger, - bool do_op_validation) const ORT_MUST_USE_RESULT; - // Info for each ONNX attribute of interest (attribute name + default value) static const OnnxAttrInfo onnx_mode_attr; static const OnnxAttrInfo onnx_coord_transf_mode_attr; @@ -119,21 +50,29 @@ class ResizeOpBuilder : public BaseOpBuilder { static const OnnxAttrInfo onnx_antialias_attr; static const OnnxAttrInfo onnx_exclude_outside_attr; - // Arrays of supported QNN modes for QNN's Resize op. The index of each mode is used as the corresponding - // QNN parameter value. Ex: The "nearest" mode is represented as the value 0 in QNN. Note, that - // not all modes are supported by every QNN backend. + // Tables that map an ONNX attribute value (string) to the corresponding integer (enum) QNN parameter value. + // Ex: The "half_pixel" coordinate_transformation_mode is represented as the value 0 in QNN. + // Only the modes supported by QNN Resize are mapped by these tables. + static const std::unordered_map supported_modes; + static const std::unordered_map supported_coord_transf_modes; + static const std::unordered_map supported_nearest_modes; +}; - // QNN values: NEAREST = 0, LINEAR = 1 - static constexpr std::array supported_modes = {"nearest", "linear"}; +const std::unordered_map ResizeOpBuilder::supported_modes = { + {"nearest", QNN_OP_RESIZE_INTERPOLATION_MODE_NEAREST}, + {"linear", QNN_OP_RESIZE_INTERPOLATION_MODE_LINEAR}}; - // QNN values: HALF_PIXEL = 0, PYTORCH_HALF_PIXEL = 1, ALIGN_CORNERS = 2, ASYMMETRIC = 3 - static constexpr std::array supported_coord_transf_modes = {"half_pixel", "pytorch_half_pixel", - "align_corners", "asymmetric"}; +const std::unordered_map ResizeOpBuilder::supported_coord_transf_modes = { + {"half_pixel", QNN_OP_RESIZE_TRANSFORMATION_MODE_HALF_PIXEL}, + {"pytorch_half_pixel", QNN_OP_RESIZE_TRANSFORMATION_MODE_PYTORCH_HALF_PIXEL}, + {"align_corners", QNN_OP_RESIZE_TRANSFORMATION_MODE_ALIGN_CORNERS}, + {"asymmetric", QNN_OP_RESIZE_TRANSFORMATION_MODE_ASYMMETRIC}}; - // QNN values: ROUND_PREFER_FLOOR = 0, ROUND_PREFER_CEIL = 1, FLOOR = 2, CEIL = 3 - static constexpr std::array supported_nearest_modes = {"round_prefer_floor", "round_prefer_ceil", - "floor", "ceil"}; -}; +const std::unordered_map ResizeOpBuilder::supported_nearest_modes = { + {"round_prefer_floor", QNN_OP_RESIZE_NEAREST_MODE_ROUND_PREFER_FLOOR}, + {"round_prefer_ceil", QNN_OP_RESIZE_NEAREST_MODE_ROUND_PREFER_CEIL}, + {"floor", QNN_OP_RESIZE_NEAREST_MODE_FLOOR}, + {"ceil", QNN_OP_RESIZE_NEAREST_MODE_CEIL}}; const OnnxAttrInfo ResizeOpBuilder::onnx_mode_attr = {"mode", "nearest"}; const OnnxAttrInfo ResizeOpBuilder::onnx_coord_transf_mode_attr = {"coordinate_transformation_mode", @@ -143,19 +82,26 @@ const OnnxAttrInfo ResizeOpBuilder::onnx_nearest_mode_attr = {"near const OnnxAttrInfo ResizeOpBuilder::onnx_antialias_attr = {"antialias", 0}; const OnnxAttrInfo ResizeOpBuilder::onnx_exclude_outside_attr = {"exclude_outside", 0}; -template -Status ResizeOpBuilder::GetQnnModeFromString(const std::array& onnx_modes, - std::string_view onnx_mode, const char* onnx_mode_label, - QnnValType& qnn_mode) const { - for (size_t i = 0; i < onnx_modes.size(); ++i) { - if (onnx_modes[i] == onnx_mode) { - qnn_mode = SafeInt(i); - return Status::OK(); - } +// Returns the QNN parameter integer value that corresponds to the given ONNX attribute mode string value. +static Status GetQnnModeValFromOnnxString(const std::unordered_map& supported_qnn_modes, + const std::string& onnx_attr_value, + const char* onnx_attr_name, + uint32_t& qnn_mode_value) { + auto it = supported_qnn_modes.find(onnx_attr_value); + if (it != supported_qnn_modes.end()) { + qnn_mode_value = it->second; + return Status::OK(); } - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Resize operator does not support ", onnx_mode_label, - " ", std::string(onnx_mode)); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Resize operator does not support ", onnx_attr_name, + " ", std::string(onnx_attr_value)); +} + +// Returns true if the given ONNX attribute mode value is generally supported on QNN. Note that +// different QNN backends may support a smaller subset of modes. +static bool IsOnnxAttrModeSupported(const std::unordered_map& supported_qnn_modes, + const std::string& onnx_attr_value) { + return supported_qnn_modes.find(onnx_attr_value) != supported_qnn_modes.end(); } // Resize ops are sensitive with data layout, no special validation so far @@ -169,118 +115,95 @@ Status ResizeOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, true); } + const bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType()); + NodeAttrHelper node_helper(node_unit); + // QNN doesn't support anti-aliasing (added in opset 18) if (node_unit.SinceVersion() >= 18) { - NodeAttrHelper node_helper(node_unit); const bool antialias = GetOnnxAttr(node_helper, onnx_antialias_attr) != 0; ORT_RETURN_IF(antialias, "QNN EP: Resize doesn't support anti-aliasing."); } - // The QNN Resize op does not currently work with the QNN cpu backend, but works with the HTP backend. Therefore, we - // currently use QNN's Resize op for quantized models and either ResizeBilinear or ResizeNearestNeighbor for - // non-quantized models. This requires separate validation for quantized models. - // TODO: Use only Resize once QNN's Resize op works in the QNN cpu backend. - bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType()); - return is_npu_backend ? ValidateQDQOp(qnn_model_wrapper, node_unit) : ValidateOp(qnn_model_wrapper, node_unit); -} - -Status ResizeOpBuilder::ValidateOp(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const { - NodeAttrHelper node_helper(node_unit); - const std::string resize_mode = GetOnnxAttr(node_helper, onnx_mode_attr); - ORT_RETURN_IF((resize_mode != "nearest") && (resize_mode != "linear"), - "QNN EP: Resize doesn't support mode '", resize_mode.c_str(), "'.", - "Only 'nearest' and 'linear' are supported."); - - const std::string coordinate_mode = GetOnnxAttr(node_helper, onnx_coord_transf_mode_attr); - ORT_RETURN_IF((coordinate_mode != "half_pixel") && (coordinate_mode != "align_corners"), - "QNN EP: coordinate transformation mode '", coordinate_mode.c_str(), "' not supported for Resize op.", - "Only 'align_corners' and 'half_pixel' are supported."); - - // Check for a valid "nearest_mode" if the mode is "nearest". - if (resize_mode == "nearest") { - // NOTE: QNN's ResizeNearestNeighbor operator does not have a way to specify rounding (i.e., "nearest_mode"). - // The output of the QNN ResizeNearestNeighbor operator is not always equivalent to ONNX's Resize - // operator with any single specific "nearest_mode". - // - // For some input/output shapes, QNN's ResizeNearestNeighbor is equivalent to ONNX's Resize with "round_prefer_floor". - // For other shapes, QNN's ResizeNearestNeighbor is equivalent to ONNX Resize with "round_prefer_ceil". - // - // From unit tests, I've found a relationship between input/output shapes and the equivalent ONNX "nearest_mode". - // If the new and old spatial dimensions are evenly divisible, the "nearest_mode" is "round_prefer_floor". - // Otherwise, the "nearest_mode" is "round_prefer_ceil". - // - // This relationship is probably incomplete/wrong. - // - // TODO: Ask Qualcomm what the correct "nearest_mode" should be, - // OR use QNN's own Resize operator once it works on QnnCpu. - const std::string& nearest_mode = GetOnnxAttr(node_helper, onnx_nearest_mode_attr); - ORT_RETURN_IF_NOT("floor" == nearest_mode, "QNN Resize only supports nearest_mode: floor!"); // This is wrong! - } - - auto& input_0 = node_unit.Inputs()[0]; - std::vector input_shape; - ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(input_0.node_arg, input_shape), - "QNN EP: Cannot get input shape for Resize op"); - - const auto& output_0 = node_unit.Outputs()[0]; - std::vector output_shape; - ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(output_0.node_arg, output_shape), - "QNN EP: Cannot get output shape for Resize op"); - - ORT_RETURN_IF(input_shape.size() != 4 || output_shape.size() != 4, "QNN Resize only supports 4D!"); - - ONNX_NAMESPACE::DataType input_data_type = input_0.node_arg.Type(); - ORT_RETURN_IF(input_data_type != ONNX_NAMESPACE::Utils::DataTypeUtils::ToType("float"), - "QNN EP: Data type ", input_data_type->c_str(), - " is not supported for Resize operator in CPU backend."); - - return Status::OK(); -} - -Status ResizeOpBuilder::ValidateQDQOp(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const { - NodeAttrHelper node_helper(node_unit); - - using namespace onnxruntime::qnn::utils; // Check mode const std::string interp_mode = GetOnnxAttr(node_helper, onnx_mode_attr); - ORT_RETURN_IF_NOT(ArrayHasString(supported_modes, interp_mode), "QNN EP: Resize does not support mode ", + ORT_RETURN_IF_NOT(IsOnnxAttrModeSupported(supported_modes, interp_mode), "QNN EP: Resize does not support mode ", interp_mode.c_str()); // Check coordinate transformation mode const std::string transformation_mode = GetOnnxAttr(node_helper, onnx_coord_transf_mode_attr); - ORT_RETURN_IF_NOT(ArrayHasString(supported_coord_transf_modes, transformation_mode), + ORT_RETURN_IF_NOT(IsOnnxAttrModeSupported(supported_coord_transf_modes, transformation_mode), "QNN EP: Resize does not support coordinate_transformation_mode ", transformation_mode.c_str()); - // Check nearest mode + const auto& input_0 = node_unit.Inputs()[0]; + std::vector input_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(input_0.node_arg, input_shape), + "QNN EP: Cannot get shape for Resize input"); + const size_t input_rank = input_shape.size(); + + // Validate Resize w/ "nearest" mode. + // Translation matrix of ONNX Resize w/ "nearest" mode on HTP backend. + // Table entries correspond to the QNN operator used for the given configuration + // (Resize = QNN Resize op, RNN = QNN ResizeNearestNeighbor op, X = Unsupported). + // + // nearest_mode: + // coordinate_transformation_mode: | round_prefer_floor round_prefer_ceil floor ceil + // ----------------------------------------------------------------------------------------- + // half_pixel | Resize X RNN X + // pytorch_half_pixel | Resize X X X + // align_corners | Resize X RNN X + // asymmetric | Resize X RNN X + if (interp_mode == "nearest") { const std::string nearest_mode = GetOnnxAttr(node_helper, onnx_nearest_mode_attr); - ORT_RETURN_IF_NOT(ArrayHasString(supported_nearest_modes, nearest_mode), + ORT_RETURN_IF_NOT(IsOnnxAttrModeSupported(supported_nearest_modes, nearest_mode), "QNN EP: Resize does not support nearest_mode ", nearest_mode.c_str()); - // TODO: Support 'asymmetric' transformation mode with nearest_mode != 'floor'. - // - // QNN's ONNX converter tool translates 'nearest' + 'asymmetric' (regardless of rounding mode) - // to QNN's ResizeNearestNeighbor with {align_corners: 0, half_pixel: 0}. - // This is only accurate if the rounding mode is "floor". Need to investigate how to handle - // other rounding modes with Qualcomm. Ideally, we would use QNN's Resize operator, but it doesn't support - // the "asymmetric" coordinate transformation mode on HTP. - ORT_RETURN_IF(transformation_mode == "asymmetric" && nearest_mode != "floor", - "QNN EP: Resize with coordinate_transformation_mode 'asymmetric' and nearest_mode '", nearest_mode, - "' is not currently supported on the HTP backend."); + if (is_npu_backend) { + // QNN only supports the following nearest_mode values on HTP: + // - "round_prefer_floor" via QNN's Resize operator + // - "floor" via QNN's ResizeNearestNeighbor operator + // + // QNN validation does not throw an error if unsupported nearest_mode values are used, so we have to + // catch them here. Otherwise, accuracy is significantly degraded. + ORT_RETURN_IF_NOT(nearest_mode == "round_prefer_floor" || nearest_mode == "floor", + "QNN EP: Resize on the NPU does not support nearest_mode ", nearest_mode.c_str()); + + const bool use_resize_nn_op = nearest_mode == "floor"; + + // If HTP uses ResizeNearestNeighbor ("floor"), then the "pytorch_half_pixel" coordinate_transformation_mode + // is not supported. + ORT_RETURN_IF(use_resize_nn_op && transformation_mode == "pytorch_half_pixel", + "QNN EP: Resize on the NPU does not support the combination of nearest_mode == 'floor' ", + " and coordinate_transformation_mode == 'pytorch_half_pixel'."); + + // QNN's ResizeNearestNeighbor requires rank 4 inputs. + ORT_RETURN_IF(use_resize_nn_op && input_rank != 4, + "QNN EP: Resize on the NPU with nearest_mode == 'floor' requires an input with rank 4."); + } } - // Check that input shape has at least a rank of 3. - const auto& input_0 = node_unit.Inputs()[0]; - std::vector input_shape; - ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(input_0.node_arg, input_shape), - "QNN EP: Cannot get shape for Resize input"); - ORT_RETURN_IF(input_shape.size() < 3, "QNN EP: Resize input must have a rank >= 3."); + // Check that the input shape has at least a rank of 3 (and a max of 5 on HTP). + ORT_RETURN_IF(input_rank < 3 || (is_npu_backend && input_rank > 5), + "QNN EP: Resize input must have a rank >= 3. The maximum rank is 5 on the NPU."); const auto& output_0 = node_unit.Outputs()[0]; std::vector output_shape; ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(output_0.node_arg, output_shape), "QNN EP: Cannot get shape for Resize output"); - ORT_RETURN_IF(output_shape.size() < 3, "QNN EP: Resize output must have a rank >= 3."); + + // Check that only the spatial dimensions (width, height) are resized. The batch_size (N) and channels (C) should + // be untouched. This code runs before layout transformation, so we know that the current layout is "channel first" + // (e.g., N, C, S1, S2, ..., SN), and that the minimum rank is 3. + assert(node_unit.Domain() != kMSInternalNHWCDomain); + ORT_RETURN_IF_NOT(input_shape[0] == output_shape[0] && input_shape[1] == output_shape[1], + "QNN EP: Resize may only change the spatial dimensions."); + + if (!is_npu_backend) { + ONNX_NAMESPACE::DataType input_data_type = input_0.node_arg.Type(); + ORT_RETURN_IF(input_data_type != ONNX_NAMESPACE::Utils::DataTypeUtils::ToType("float"), + "QNN EP: Data type ", input_data_type->c_str(), + " is not supported for Resize operator in CPU backend."); + } return Status::OK(); } @@ -305,92 +228,34 @@ Status ResizeOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w std::vector&& input_names, const logging::Logger& logger, bool do_op_validation) const { - // The QNN Resize op does not currently work with the QNN cpu backend, but works with the HTP backend. Therefore, we - // currently use QNN's Resize op for quantized models and either ResizeBilinear or ResizeNearestNeighbor for - // non-quantized models. This requires separate handling for quantized models. - // TODO: Use only Resize once QNN's Resize op works in the QNN cpu backend. - bool is_quantized_node = NodeUnit::Type::QDQGroup == node_unit.UnitType(); - return is_quantized_node ? ProcessQDQOpAttrsAndOutputs(qnn_model_wrapper, node_unit, std::move(input_names), logger, do_op_validation) : ProcessOpAttrsAndOutputs(qnn_model_wrapper, node_unit, std::move(input_names), logger, do_op_validation); -} - -Status ResizeOpBuilder::ProcessOpAttrsAndOutputs(QnnModelWrapper& qnn_model_wrapper, - const NodeUnit& node_unit, - std::vector&& input_names, - const logging::Logger& logger, - bool do_op_validation) const { - ORT_UNUSED_PARAMETER(logger); - NodeAttrHelper node_helper(node_unit); - const std::string resize_mode = GetOnnxAttr(node_helper, onnx_mode_attr); - std::string qnn_node_type = "ResizeNearestNeighbor"; - if ("linear" == resize_mode) { - qnn_node_type = "ResizeBilinear"; - } - - const std::string coordinate_mode = GetOnnxAttr(node_helper, onnx_coord_transf_mode_attr); - - Qnn_Scalar_t qnn_align_corners = QNN_SCALAR_INIT; - qnn_align_corners.dataType = QNN_DATATYPE_BOOL_8; - qnn_align_corners.bool8Value = static_cast(0); - - Qnn_Scalar_t qnn_half_pixel = QNN_SCALAR_INIT; - qnn_half_pixel.dataType = QNN_DATATYPE_BOOL_8; - qnn_half_pixel.bool8Value = static_cast(0); - - if ("align_corners" == coordinate_mode) { - qnn_align_corners.bool8Value = static_cast(1); - } else if ("half_pixel" == coordinate_mode) { - qnn_half_pixel.bool8Value = static_cast(1); - } - QnnParamWrapper qnn_align_corners_param(node_unit.Index(), node_unit.Name(), - QNN_OP_RESIZE_BILINEAR_PARAM_ALIGN_CORNERS, qnn_align_corners); - QnnParamWrapper qnn_half_pixel_param(node_unit.Index(), node_unit.Name(), - QNN_OP_RESIZE_BILINEAR_PARAM_HALF_PIXEL_CENTERS, qnn_half_pixel); - - std::vector param_tensor_names; - param_tensor_names.push_back(qnn_align_corners_param.GetParamTensorName()); - qnn_model_wrapper.AddParamWrapper(std::move(qnn_align_corners_param)); - param_tensor_names.push_back(qnn_half_pixel_param.GetParamTensorName()); - qnn_model_wrapper.AddParamWrapper(std::move(qnn_half_pixel_param)); - - return ProcessOutputs(qnn_model_wrapper, node_unit, std::move(input_names), std::move(param_tensor_names), - logger, do_op_validation, qnn_node_type); -} - -Status ResizeOpBuilder::ProcessQDQOpAttrsAndOutputs(QnnModelWrapper& qnn_model_wrapper, - const NodeUnit& node_unit, - std::vector&& input_names, - const logging::Logger& logger, - bool do_op_validation) const { std::vector param_tensor_names; NodeAttrHelper node_helper(node_unit); const std::string interp_mode = GetOnnxAttr(node_helper, onnx_mode_attr); const std::string transformation_mode = GetOnnxAttr(node_helper, onnx_coord_transf_mode_attr); + const std::string nearest_mode = GetOnnxAttr(node_helper, onnx_nearest_mode_attr); + const bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType()); std::string qnn_op_type = "Resize"; - // Handle Resize with {mode: "nearest", coordinate_transformation_mode: "asymmetric"} uniquely. - // QNN's ONNX converter tool translates this configuration (regardless of rounding mode) - // to QNN's ResizeNearestNeighbor with {align_corners: 0, half_pixel: 0}. - // - // NOTE: This is only accurate if the rounding mode is "floor". Need to investigate how to handle - // other rounding modes with Qualcomm. Ideally, we would use QNN's Resize operator, but it doesn't support - // the "asymmetric" coordinate transformation mode on HTP. - if (interp_mode == "nearest" && transformation_mode == "asymmetric") { + // Translate Resize with {mode: "nearest", nearest_mode: "floor", coordinate_transformation_mode: XXX} to + // QNN's ResizeNearestNeighbor operator on the HTP backend. This combination of parameters is not supported on HTP + // via QNN's Resize operator. Note that QNN's ResizeNearestNeighbor operator always uses "floor" rounding. + if (is_npu_backend && interp_mode == "nearest" && nearest_mode == "floor") { qnn_op_type = "ResizeNearestNeighbor"; - // Set parameter 'align_corners' to 0 + // Parameter 'align_corners' Qnn_Scalar_t qnn_align_corners = QNN_SCALAR_INIT; qnn_align_corners.dataType = QNN_DATATYPE_BOOL_8; - qnn_align_corners.bool8Value = static_cast(0); + qnn_align_corners.bool8Value = static_cast(transformation_mode == "align_corners"); QnnParamWrapper qnn_align_corners_param(node_unit.Index(), node_unit.Name(), QNN_OP_RESIZE_BILINEAR_PARAM_ALIGN_CORNERS, qnn_align_corners); param_tensor_names.push_back(qnn_align_corners_param.GetParamTensorName()); qnn_model_wrapper.AddParamWrapper(std::move(qnn_align_corners_param)); - // Set parameter 'half_pixel_centers' to 0 + // Parameter 'half_pixel_centers' Qnn_Scalar_t qnn_half_pixel = QNN_SCALAR_INIT; qnn_half_pixel.dataType = QNN_DATATYPE_BOOL_8; - qnn_half_pixel.bool8Value = static_cast(0); + qnn_half_pixel.bool8Value = static_cast(transformation_mode == "half_pixel"); QnnParamWrapper qnn_half_pixel_param(node_unit.Index(), node_unit.Name(), QNN_OP_RESIZE_BILINEAR_PARAM_HALF_PIXEL_CENTERS, qnn_half_pixel); param_tensor_names.push_back(qnn_half_pixel_param.GetParamTensorName()); @@ -399,11 +264,12 @@ Status ResizeOpBuilder::ProcessQDQOpAttrsAndOutputs(QnnModelWrapper& qnn_model_w // Parameter 'transformation_mode' Qnn_Scalar_t qnn_transformation_mode = QNN_SCALAR_INIT; qnn_transformation_mode.dataType = QNN_DATATYPE_UINT_32; - ORT_RETURN_IF_ERROR(GetQnnModeFromString(supported_coord_transf_modes, transformation_mode, - "coordinate_transformation_mode", qnn_transformation_mode.uint32Value)); + ORT_RETURN_IF_ERROR(GetQnnModeValFromOnnxString(supported_coord_transf_modes, transformation_mode, + "coordinate_transformation_mode", + qnn_transformation_mode.uint32Value)); - QnnParamWrapper qnn_transformation_mode_param(node_unit.Index(), node_unit.Name(), QNN_OP_RESIZE_PARAM_TRANSFORMATION_MODE, - qnn_transformation_mode); + QnnParamWrapper qnn_transformation_mode_param(node_unit.Index(), node_unit.Name(), + QNN_OP_RESIZE_PARAM_TRANSFORMATION_MODE, qnn_transformation_mode); param_tensor_names.push_back(qnn_transformation_mode_param.GetParamTensorName()); qnn_model_wrapper.AddParamWrapper(std::move(qnn_transformation_mode_param)); @@ -420,7 +286,7 @@ Status ResizeOpBuilder::ProcessQDQOpAttrsAndOutputs(QnnModelWrapper& qnn_model_w // Parameter 'interpolation_mode' Qnn_Scalar_t qnn_interp_mode = QNN_SCALAR_INIT; qnn_interp_mode.dataType = QNN_DATATYPE_UINT_32; - ORT_RETURN_IF_ERROR(GetQnnModeFromString(supported_modes, interp_mode, "mode", qnn_interp_mode.uint32Value)); + ORT_RETURN_IF_ERROR(GetQnnModeValFromOnnxString(supported_modes, interp_mode, "mode", qnn_interp_mode.uint32Value)); QnnParamWrapper qnn_interp_mode_param(node_unit.Index(), node_unit.Name(), QNN_OP_RESIZE_PARAM_INTERPOLATION_MODE, qnn_interp_mode); @@ -429,11 +295,10 @@ Status ResizeOpBuilder::ProcessQDQOpAttrsAndOutputs(QnnModelWrapper& qnn_model_w // Parameter 'nearest_mode'. Processed only when 'interpolation_mode' is NEAREST(0). if (qnn_interp_mode.uint32Value == 0) { - const std::string nearest_mode = GetOnnxAttr(node_helper, onnx_nearest_mode_attr); Qnn_Scalar_t qnn_nearest_mode = QNN_SCALAR_INIT; qnn_nearest_mode.dataType = QNN_DATATYPE_UINT_32; - ORT_RETURN_IF_ERROR(GetQnnModeFromString(supported_nearest_modes, nearest_mode, "nearest_mode", - qnn_nearest_mode.uint32Value)); + ORT_RETURN_IF_ERROR(GetQnnModeValFromOnnxString(supported_nearest_modes, nearest_mode, "nearest_mode", + qnn_nearest_mode.uint32Value)); QnnParamWrapper qnn_nearest_mode_param(node_unit.Index(), node_unit.Name(), QNN_OP_RESIZE_PARAM_NEAREST_MODE, qnn_nearest_mode); diff --git a/onnxruntime/core/providers/vitisai/README.md b/onnxruntime/core/providers/vitisai/README.md index 15e0c804489c5..6ddb58b8d96ae 100644 --- a/onnxruntime/core/providers/vitisai/README.md +++ b/onnxruntime/core/providers/vitisai/README.md @@ -1,4 +1,4 @@ -VitsAI Execution Prividers +VitisAI Execution Provider ============================ diff --git a/onnxruntime/core/providers/vitisai/imp/register_xir_ops.cc b/onnxruntime/core/providers/vitisai/imp/register_xir_ops.cc index 544e18350635d..ee8dfc6d03d12 100644 --- a/onnxruntime/core/providers/vitisai/imp/register_xir_ops.cc +++ b/onnxruntime/core/providers/vitisai/imp/register_xir_ops.cc @@ -34,9 +34,12 @@ static void xir_shape_infer(ONNX_NAMESPACE::InferenceContext& ctx) { updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::INT64); } else if (data_type->s() == "int1") { updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL); + } else if (data_type->s() == "bfloat16") { + updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BFLOAT16); + } else if (data_type->s() == "float16") { + updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::FLOAT16); } else { - std::cerr << "not supported data_type " << data_type->s(); - abort(); + vai_assert(false, ", not supported data_type: " + data_type->s()); } if (shape != nullptr) { for (auto i = 0; i < shape->ints_size(); ++i) { diff --git a/onnxruntime/core/providers/webnn/builders/helper.cc b/onnxruntime/core/providers/webnn/builders/helper.cc index 31453e005272e..774df067fe347 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.cc +++ b/onnxruntime/core/providers/webnn/builders/helper.cc @@ -53,9 +53,12 @@ bool IsInputSupported(const NodeArg& input, const std::string& parent_name, cons } for (const auto& dim : shape_proto->dim()) { - // For now we workaround dynamic shape support by assuming 1. + // WebNN doesn't support dynamic shape - use sessionOptions.freeDimensionOverrides to fix the shape. if (!dim.has_dim_value()) { - LOGS(logger, VERBOSE) << "Dynamic shape is not supported for now, assume to be 1, for input:" << input_name; + LOGS(logger, VERBOSE) << "Dynamic shape is not supported, " + << "use sessionOptions.FreeDimensionOverrides to set a fixed shape for input: " + << input_name; + return false; } } diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.cc b/onnxruntime/core/providers/webnn/builders/model_builder.cc index 14ca4f1a1e674..2eae8cebbbd66 100644 --- a/onnxruntime/core/providers/webnn/builders/model_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/model_builder.cc @@ -218,12 +218,9 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i } else { dims.reserve(shape.size()); for (const auto& dim : shape) { - if (!dim.has_dim_value()) { - // FIXME: support dyanmic shape. - dims.push_back(1); - } else { - dims.push_back(SafeInt(dim.dim_value())); - } + // dim_param free dimensions should have already been excluded by IsInputSupported(). + assert(dim.has_dim_value()); + dims.push_back(SafeInt(dim.dim_value())); } } } diff --git a/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc b/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc index 1f671e90090ba..a55238396cea3 100755 --- a/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc +++ b/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc @@ -429,6 +429,40 @@ TEST_F(GraphTransformationTests, SimplifiedLayerNormFusionTest) { } } +// It tests the scenario when scale or bias are not Graph Inputs and not initialized in Graph +// To test this added a Identity node after Scale and Bias terms to ensure LayerNormFusion works properly +TEST_F(GraphTransformationTests, LayerNormScaleBiasTest) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/layer_norm_fusion_scale_bias.onnx"; + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique(), TransformerLevel::Level2)); + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["ReduceMean"], 0); + ASSERT_EQ(op_to_count["Sub"], 0); + ASSERT_EQ(op_to_count["Cast"], 0); + ASSERT_EQ(op_to_count["Pow"], 0); + ASSERT_EQ(op_to_count["Add"], 0); + ASSERT_EQ(op_to_count["Sqrt"], 0); + ASSERT_EQ(op_to_count["Div"], 0); + ASSERT_EQ(op_to_count["Mul"], 0); + ASSERT_EQ(op_to_count["LayerNormalization"], 1); + + for (const Node& node : graph.Nodes()) { + if (node.OpType() == "LayerNormalization") { + // LayerNormalization should have three inputs. + EXPECT_EQ(node.InputDefs().size(), 3u) << "LayerNormalization number of inputs does not equal to 3. Got:" << node.InputDefs().size(); + // LayerNormalization input "scale" and "bias" should have the same dimension. + const TensorShapeProto* scale_shape = node.InputDefs()[1]->Shape(); + EXPECT_EQ(scale_shape->dim_size(), 1) << "LayerNormalization scale should be 1D. Got: " << scale_shape->dim_size(); + } + } +} + // If EP is non-GPU EP or unknown, the sub-graph will be not fused because CPU impl for SimplifiedLayerNormalization // doesn't support input and scale having different data types. TEST_F(GraphTransformationTests, SimplifiedLayerNormWithCastsFusionTest) { diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index ef2d7e31654ba..9b41ba8c0d2ba 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -1133,11 +1133,15 @@ ::std::vector<::std::basic_string> GetParameterStrings() { #if defined(NDEBUG) || defined(RUN_MODELTEST_IN_DEBUG_MODE) #ifdef _WIN32 ORT_STRING_VIEW model_test_root_path = ORT_TSTR("..\\models"); + // thus, only the root path should be mounted. + ORT_STRING_VIEW model_zoo_path = ORT_TSTR("..\\models\\zoo"); #else ORT_STRING_VIEW model_test_root_path = ORT_TSTR("../models"); + ORT_STRING_VIEW model_zoo_path = ORT_TSTR("../models/zoo"); #endif for (auto p : kvp.second) { paths.push_back(ConcatPathComponent(model_test_root_path, p)); + paths.push_back(ConcatPathComponent(model_zoo_path, p)); } #endif diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc index 832a8a744c08b..0434b16dc66ce 100644 --- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc @@ -99,9 +99,8 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extr // CUDA: result mismatch due to not implementing NHWC support // TensorRT: results mismatch // ROCm: results mismatch - // QNN: conflict with layout transformer, need furture investigation test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider}); + {kCudaExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extrapolation_uint8) { @@ -131,7 +130,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extr test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extrapolation_int8) { @@ -159,7 +158,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extr 10, 10, 10}; test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); + test.Run(); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_extrapolation_uint8) { @@ -188,7 +187,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_e test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_extrapolation_int8) { @@ -215,7 +214,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_e 0, 0, 0}; test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); + test.Run(); } TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear) { @@ -261,9 +260,8 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear) { test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch - // QNN: conflict with layout transformer, need furture investigation test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider}); + {kCudaExecutionProvider, kRocmExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_uint8) { @@ -287,7 +285,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_uint8) { test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_int8) { @@ -309,7 +307,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_int8) { std::vector Y = {0, 0}; test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); + test.Run(); } // Since NNAPI(TFLite) only using the scale calculate using the input/output size @@ -399,7 +397,9 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_align_corners) { std::vector Y = {1.0f, 4.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + + // QNN: result mismatch ("NaN" instead of 1.0f on QNN CPU backend) + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); }; run_test(false); @@ -435,7 +435,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_align_corners_uin test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider}); }; run_test(false); @@ -465,7 +465,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_align_corners_int test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); // TensorRT: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); }; run_test(false); @@ -532,7 +532,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixe test.AddOutput("Y", {N, sizes[1], sizes[2], C}, Y); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider}); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixel_int8) { @@ -560,7 +560,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixe std::vector Y = {0, 2, -9}; test.AddOutput("Y", {N, sizes[1], sizes[2], C}, Y); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kQnnExecutionProvider}); // TensorRT: results mismatch + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: results mismatch } TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_asymmetric) { @@ -641,7 +641,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearUpSampleTest_4DBilinear_asymmetric_uint8) { Y, false, .0f, 1.0f); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider}); }; run_test(false); @@ -683,7 +683,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearUpSampleTest_4DBilinear_asymmetric_int8) { test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y, false, .0f, 1.0f); // TensorRT: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); }; run_test(false); @@ -1079,7 +1079,7 @@ TEST(ResizeOpTest, ResizeOpNearestUpSample_Floor_Align_Corners) { 13.0f, 13.0f, 13.0f, 14.0f, 14.0f, 15.0f, 15.0f, 16.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); // QNN: result diff + test.Run(); } TEST(ResizeOpTest, ResizeOpNearest_OneToOneMappingBetweenInputAndOutputDataDims) { @@ -1887,7 +1887,7 @@ void TestAntialiasing(std::map attributes, test.AddOutput("Y", output_shape, output_data); // TensorRT 8.5 supports operators up to Opset 17. Temporarily exclude TensorRT EP due to accurarcy issue. - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(ResizeOpTest, Antialias_Bilinear_No_ExcludeOutside) { diff --git a/onnxruntime/test/providers/qnn/resize_test.cc b/onnxruntime/test/providers/qnn/resize_test.cc index cf336ca9eeb8b..cd6865d443cc0 100644 --- a/onnxruntime/test/providers/qnn/resize_test.cc +++ b/onnxruntime/test/providers/qnn/resize_test.cc @@ -120,7 +120,7 @@ static void RunCPUResizeOpTest(const TestInputDef& input_def, const std:: const std::string& mode, const std::string& coordinate_transformation_mode, const std::string& nearest_mode, ExpectedEPNodeAssignment expected_ep_assignment, - int opset = 11) { + int opset = 19) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnCpu.dll"; @@ -138,7 +138,7 @@ static void RunCPUResizeOpTestWithScales(const TestInputDef& input_def, c const std::string& mode, const std::string& coordinate_transformation_mode, const std::string& nearest_mode, ExpectedEPNodeAssignment expected_ep_assignment, - int opset = 11) { + int opset = 19) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnCpu.dll"; @@ -157,7 +157,8 @@ static void RunQDQResizeOpTest(const TestInputDef& input_def, const std::vector& sizes_data, const std::string& mode, const std::string& coordinate_transformation_mode, const std::string& nearest_mode, - ExpectedEPNodeAssignment expected_ep_assignment) { + ExpectedEPNodeAssignment expected_ep_assignment, + int opset = 19) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; @@ -169,27 +170,20 @@ static void RunQDQResizeOpTest(const TestInputDef& input_def, GetQDQResizeModelBuilder(input_def, sizes_data, mode, coordinate_transformation_mode, nearest_mode), provider_options, - 18, // opset - expected_ep_assignment, - 1e-5f); + opset, + expected_ep_assignment); } // // CPU tests: // -// TODO: Our QNN CPU translation of ONNX Resize with "nearest" mode uses QNN's ResizeNearestNeighbor -// operator, which does not have a way to specify rounding (i.e., "nearest_mode" in ONNX). It is not clear -// what kind of rounding QNN's ResizeNearestNeighbor uses. Therefore, we do not yet know how to compare -// ONNX Resize to QNN ResizeNearestNeighbor. These tests should remain disabled until this behavior is -// clarified. If, for example, it turns out that ResizeNearestNeighbor uses "floor" rounding, then we should -// only compare against ONNX resize with "floor" rounding. - // Upsample that uses "round_prefer_floor" as the "nearest_mode". // coordinate_transformation_mode: "half_pixel" -TEST_F(QnnCPUBackendTests, DISABLED_ResizeUpsampleNearestHalfPixel_rpf) { - RunCPUResizeOpTest(TestInputDef({1, 2, 7, 5}, false, -10.0f, 10.0f), // Random input w/ range [-10, 10] - {1, 2, 21, 10}, // Sizes +TEST_F(QnnCPUBackendTests, ResizeUpsampleNearestHalfPixel_rpf) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 70); + RunCPUResizeOpTest(TestInputDef({1, 2, 7, 5}, false, input_data), + {1, 2, 21, 10}, // Sizes "nearest", "half_pixel", "round_prefer_floor", @@ -198,57 +192,72 @@ TEST_F(QnnCPUBackendTests, DISABLED_ResizeUpsampleNearestHalfPixel_rpf) { // Upsample that uses "round_prefer_ceil" as the "nearest_mode". // coordinate_transformation_mode: "half_pixel" -TEST_F(QnnCPUBackendTests, DISABLED_ResizeUpsampleNearestHalfPixel_rpc) { - RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, -10.0f, 10.0f), +TEST_F(QnnCPUBackendTests, ResizeUpsampleNearestHalfPixel_rpc) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 8); + RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, input_data), {1, 1, 7, 5}, "nearest", "half_pixel", "round_prefer_ceil", ExpectedEPNodeAssignment::All); } // Downsample that uses "round_prefer_ceil" as the "nearest_mode". // coordinate_transformation_mode: "half_pixel" -TEST_F(QnnCPUBackendTests, DISABLED_ResizeDownsampleNearestHalfPixel_rpc) { - RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, -10.0f, 10.0f), +TEST_F(QnnCPUBackendTests, ResizeDownsampleNearestHalfPixel_rpc) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 8); + RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, input_data), {1, 1, 1, 3}, "nearest", "half_pixel", "round_prefer_ceil", ExpectedEPNodeAssignment::All); } // Downsample that uses "round_prefer_floor" as the "nearest_mode". // coordinate_transformation_mode: "half_pixel" -TEST_F(QnnCPUBackendTests, DISABLED_ResizeDownsampleNearestHalfPixel_rpf) { - RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, -10.0f, 10.0f), +TEST_F(QnnCPUBackendTests, ResizeDownsampleNearestHalfPixel_rpf) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 8); + RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, input_data), {1, 1, 1, 2}, "nearest", "half_pixel", "round_prefer_ceil", ExpectedEPNodeAssignment::All); } // Upsample that uses "round_prefer_floor" as the "nearest_mode". // coordinate_transformation_mode: "align_corners" -// QNN v2.13: index #50 don't match, which is 4.67152 from -1.93515 -TEST_F(QnnCPUBackendTests, DISABLED_ResizeUpsampleNearestAlignCorners_rpf) { - RunCPUResizeOpTest(TestInputDef({1, 2, 7, 5}, false, -10.0f, 10.0f), +TEST_F(QnnCPUBackendTests, ResizeUpsampleNearestAlignCorners_rpf) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 70); + RunCPUResizeOpTest(TestInputDef({1, 2, 7, 5}, false, input_data), {1, 2, 21, 10}, "nearest", "align_corners", "round_prefer_floor", ExpectedEPNodeAssignment::All); } +// Upsample that uses "round_prefer_floor" as the "nearest_mode". +// coordinate_transformation_mode: "asymmetric" +TEST_F(QnnCPUBackendTests, ResizeUpsampleNearestAsymmetric_rpf) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 70); + RunCPUResizeOpTest(TestInputDef({1, 2, 7, 5}, false, input_data), + {1, 2, 21, 10}, "nearest", "asymmetric", "round_prefer_floor", + ExpectedEPNodeAssignment::All); +} + // Upsample that uses "round_prefer_ceil" as the "nearest_mode". // coordinate_transformation_mode: "align_corners" -TEST_F(QnnCPUBackendTests, DISABLED_ResizeUpsampleNearestAlignCorners_rpc) { - RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, -10.0f, 10.0f), +TEST_F(QnnCPUBackendTests, ResizeUpsampleNearestAlignCorners_rpc) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 8); + RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, input_data), {1, 1, 7, 5}, "nearest", "align_corners", "round_prefer_ceil", ExpectedEPNodeAssignment::All); } // Downsample that uses "round_prefer_ceil" as the "nearest_mode". // coordinate_transformation_mode: "align_corners" -TEST_F(QnnCPUBackendTests, DISABLED_ResizeDownsampleNearestAlignCorners_rpc) { - RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, -10.0f, 10.0f), +TEST_F(QnnCPUBackendTests, ResizeDownsampleNearestAlignCorners_rpc) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 8); + RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, input_data), {1, 1, 1, 3}, "nearest", "align_corners", "round_prefer_ceil", ExpectedEPNodeAssignment::All); } // Downsample that uses "round_prefer_floor" as the "nearest_mode". // coordinate_transformation_mode: "align_corners" -TEST_F(QnnCPUBackendTests, DISABLED_ResizeDownsampleNearestAlignCorners_rpf) { - RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, -10.0f, 10.0f), +TEST_F(QnnCPUBackendTests, ResizeDownsampleNearestAlignCorners_rpf) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 8); + RunCPUResizeOpTest(TestInputDef({1, 1, 2, 4}, false, input_data), {1, 1, 1, 2}, "nearest", "align_corners", "round_prefer_floor", ExpectedEPNodeAssignment::All); } @@ -258,76 +267,177 @@ TEST_F(QnnCPUBackendTests, DISABLED_ResizeDownsampleNearestAlignCorners_rpf) { // TEST_F(QnnCPUBackendTests, Resize2xLinearHalfPixel) { - RunCPUResizeOpTest(TestInputDef({1, 3, 4, 5}, false, -10.0f, 10.0f), + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 60); + RunCPUResizeOpTest(TestInputDef({1, 3, 4, 5}, false, input_data), {1, 3, 8, 10}, "linear", "half_pixel", "", ExpectedEPNodeAssignment::All); } TEST_F(QnnCPUBackendTests, Resize2xLinearHalfPixel_scales) { - RunCPUResizeOpTestWithScales(TestInputDef({1, 3, 4, 5}, false, -10.0f, 10.0f), + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 60); + RunCPUResizeOpTestWithScales(TestInputDef({1, 3, 4, 5}, false, input_data), {1.0f, 1.0f, 2.0f, 2.0f}, "linear", "half_pixel", "", ExpectedEPNodeAssignment::All); } TEST_F(QnnCPUBackendTests, Resize2xLinearAlignCorners) { - RunCPUResizeOpTest(TestInputDef({1, 3, 4, 5}, false, -10.0f, 10.0f), + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 60); + RunCPUResizeOpTest(TestInputDef({1, 3, 4, 5}, false, input_data), {1, 3, 8, 10}, "linear", "align_corners", "", ExpectedEPNodeAssignment::All); } TEST_F(QnnCPUBackendTests, Resize2xLinearAlignCorners_scales) { - RunCPUResizeOpTestWithScales(TestInputDef({1, 3, 4, 5}, false, -10.0f, 10.0f), + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 60); + RunCPUResizeOpTestWithScales(TestInputDef({1, 3, 4, 5}, false, input_data), {1.0f, 1.0f, 2.0f, 2.0f}, "linear", "align_corners", "", ExpectedEPNodeAssignment::All); } +// Test Resize downsample with mode: "linear", coordinate_transformation_mode: "align_corners" +// TODO: Enable ResizeOpTest.ResizeOpLinearDownSampleTest_4DBilinear_align_corners in cpu resize_op tests when fixed. +// +// Input f32[1,1,2,4]: 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0 +// Expected output f32[1, 1, 1, 2]: 1.0, 4.0 +// Actual output f32[1, 1, 1, 2]: NaN, NaN +TEST_F(QnnCPUBackendTests, DISABLED_Resize_DownSample_Linear_AlignCorners_scales) { + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + RunCPUResizeOpTestWithScales(TestInputDef({1, 1, 2, 4}, false, input_data), + {1.0f, 1.0f, 0.6f, 0.6f}, "linear", "align_corners", "", + ExpectedEPNodeAssignment::All); +} + +// Test Resize downsample with mode: "linear", coordinate_transformation_mode: "half_pixel" +// TODO: Enable ResizeOpTest.ResizeOpLinearDownSampleTest_4DBilinear cpu resize_op tests when fixed. +// +// Input f32[1,1,2,4]: 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0 +// Expected output f32[1, 1, 1, 2]: 2.6666 4.3333 +// Actual output f32[1, 1, 1, 2]: NaN, NaN +TEST_F(QnnCPUBackendTests, DISABLED_Resize_DownSample_Linear_HalfPixel_scales) { + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + RunCPUResizeOpTestWithScales(TestInputDef({1, 1, 2, 4}, false, input_data), + {1.0f, 1.0f, 0.6f, 0.6f}, "linear", "half_pixel", "", + ExpectedEPNodeAssignment::All); +} + #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) // // HTP tests: // +// Test QDQ Resize downsample with mode: "linear", coordinate_transformation_mode: "align_corners" +TEST_F(QnnHTPBackendTests, Resize_DownSample_Linear_AlignCorners) { + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + RunQDQResizeOpTest(TestInputDef({1, 1, 2, 4}, false, input_data), + {1, 1, 1, 2}, "linear", "align_corners", "", + ExpectedEPNodeAssignment::All); +} + +// Test QDQ Resize downsample with mode: "linear", coordinate_transformation_mode: "half_pixel" +TEST_F(QnnHTPBackendTests, Resize_DownSample_Linear_HalfPixel) { + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + RunQDQResizeOpTest(TestInputDef({1, 1, 2, 4}, false, input_data), + {1, 1, 1, 2}, "linear", "half_pixel", "", + ExpectedEPNodeAssignment::All); +} + +// Test 2x QDQ Resize mode: "linear", coordinate_transformation_mode: "pytorch_half_pixel" +// QNN EP uses QNN's Resize op. TEST_F(QnnHTPBackendTests, ResizeU8_2xLinearPytorchHalfPixel) { - RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f), + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 48); + RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, input_data), {1, 3, 8, 8}, "linear", "pytorch_half_pixel", "", ExpectedEPNodeAssignment::All); } -TEST_F(QnnHTPBackendTests, ResizeU8_2xNearestHalfPixelRoundPreferFloor) { - RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f), - {1, 3, 8, 8}, "nearest", "half_pixel", "round_prefer_floor", +// Test 2x QDQ Resize mode: "linear", coordinate_transformation_mode: "half_pixel" +// QNN EP uses QNN's Resize op. +TEST_F(QnnHTPBackendTests, ResizeU8_2xLinearHalfPixel) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 48); + RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, input_data), + {1, 3, 8, 8}, "linear", "half_pixel", "", ExpectedEPNodeAssignment::All); } -TEST_F(QnnHTPBackendTests, ResizeU8_2xNearestAsymmetricFloor) { - RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f), - {1, 3, 8, 8}, "nearest", "asymmetric", "floor", +// Test 2x QDQ Resize mode: "linear", coordinate_transformation_mode: "align_corners" +// QNN EP uses QNN's Resize op. +TEST_F(QnnHTPBackendTests, ResizeU8_2xLinearAlignCorners) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 48); + RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, input_data), + {1, 3, 8, 8}, "linear", "align_corners", "", ExpectedEPNodeAssignment::All); } -// TODO: Investigate with Qualcomm. The qnn-onnx-converter tool translates ONNX Resize [nearest, asymmetric, ceil] to -// QNN ResizeNearestNeighbor {align_corners: 0, half_pixel: 0}, which is NOT equivalent. It would be better to use -// QNN's own Resize operator (instead of ResizeNearestNeighbor), but it doesn't support the "asymmetric" coordinate -// transform mode. -// -// QNN v2.13: Inaccuracy detected for output 'output', element 189. -// Output quant params: scale=0.078431375324726105, zero_point=127. -// Expected val: -2.663428783416748 -// QNN QDQ val: 7.4509806632995605 (err 10.114409446716309) -// CPU QDQ val: -2.6666667461395264 (err 0.0032379627227783203) -TEST_F(QnnHTPBackendTests, DISABLED_ResizeU8_2xNearestAsymmetricCeil) { - RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f), - {1, 3, 8, 8}, "nearest", "asymmetric", "ceil", +// Test 2x QDQ Resize mode: "linear", coordinate_transformation_mode: "asymmetric" +// QNN EP uses QNN's Resize op. +TEST_F(QnnHTPBackendTests, ResizeU8_2xLinearAsymmetric) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 48); + RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, input_data), + {1, 3, 8, 8}, "linear", "asymmetric", "", ExpectedEPNodeAssignment::All); } +// Test 2x QDQ Resize mode: "nearest", coordinate_transformation_mode: "half_pixel", nearest_mode: "round_prefer_floor" +// QNN EP uses QNN's Resize op. +TEST_F(QnnHTPBackendTests, ResizeU8_2xNearestHalfPixelRoundPreferFloor) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 48); + RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, input_data), + {1, 3, 8, 8}, "nearest", "half_pixel", "round_prefer_floor", + ExpectedEPNodeAssignment::All); +} + +// Test that the nearest_mode "ceil" is not supported on the HTP backend. +TEST_F(QnnHTPBackendTests, ResizeU8_NearestModeCeil_Unsupported) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 48); + RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, input_data), + {1, 3, 8, 8}, "nearest", "asymmetric", "ceil", + ExpectedEPNodeAssignment::None); +} + +// Test 3x QDQ Resize mode: "nearest", coordinate_transformation_mode: "asymmetric", nearest_mode: "floor". +// QNN EP uses QNN's ResizeNearestNeighbor op. TEST_F(QnnHTPBackendTests, ResizeU8_3xNearestAsymmetricFloor) { - RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f), + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 48); + RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, input_data), {1, 3, 12, 12}, "nearest", "asymmetric", "floor", ExpectedEPNodeAssignment::All); } +// Test 2x QDQ Resize mode: "nearest", coordinate_transformation_mode: "asymmetric", nearest_mode: "round_prefer_floor" +// QNN EP uses QNN's Resize op. +TEST_F(QnnHTPBackendTests, ResizeU8_2xNearestAsymmetricRoundPreferFloor) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 8); + RunQDQResizeOpTest(TestInputDef({1, 2, 2, 2}, false, input_data), + {1, 2, 4, 4}, "nearest", "asymmetric", "round_prefer_floor", + ExpectedEPNodeAssignment::All); +} + +// Test 3x QDQ Resize mode: "nearest", coordinate_transformation_mode: "asymmetric", nearest_mode: "round_prefer_floor" +// QNN EP uses QNN's Resize op. +// +// TODO: Inaccuracy detected for output 'output_0', element 2. +// Output quant params: scale=0.078431375324726105, zero_point=127. +// Expected val: -3.3333334922790527 +// QNN QDQ val: -9.960784912109375 (err 6.6274514198303223) +// CPU QDQ val: -3.2941176891326904 (err 0.039215803146362305) +// +// More debugging info: +// Input elements f32[1,1,2,2] = -10.0000000 -3.33333349 3.33333302 10.0000000 +// ORT CPU EP (f32 model) outputs: -10.0000000 -10.0000000 -3.33333349 -3.33333349 -3.33333349 -3.33333349 -10.00 ... +// ORT CPU EP (qdq model) outputs: -9.96078491 -9.96078491 -3.29411769 -3.29411769 -3.29411769 -3.29411769 -9.961 ... +// ORT QNN EP (qdq model) outputs: -9.96078491 -9.96078491 -9.96078491 -3.37254906 -3.37254906 -3.37254906 -9.961 ... +TEST_F(QnnHTPBackendTests, DISABLED_ResizeU8_3xNearestAsymmetricRoundPreferFloor) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 4); + RunQDQResizeOpTest(TestInputDef({1, 1, 2, 2}, false, input_data), + {1, 1, 6, 6}, "nearest", "asymmetric", "round_prefer_floor", + ExpectedEPNodeAssignment::All); +} + +// Test 0.5x QDQ Resize mode: "nearest", coordinate_transformation_mode: "asymmetric", nearest_mode: "floor" +// QNN EP uses QNN's ResizeNearestNeighbor op. TEST_F(QnnHTPBackendTests, ResizeU8_HalfNearestAsymmetricFloor) { - RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f), + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 48); + RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, input_data), {1, 3, 2, 2}, "nearest", "asymmetric", "floor", ExpectedEPNodeAssignment::All); } diff --git a/onnxruntime/test/testdata/transform/fusion/layer_norm_fusion_scale_bias.onnx b/onnxruntime/test/testdata/transform/fusion/layer_norm_fusion_scale_bias.onnx new file mode 100644 index 0000000000000..ec0f9a97815b8 Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/layer_norm_fusion_scale_bias.onnx differ diff --git a/onnxruntime/test/testdata/transform/fusion/layer_norm_fusion_scale_bias.py b/onnxruntime/test/testdata/transform/fusion/layer_norm_fusion_scale_bias.py new file mode 100644 index 0000000000000..a59e263763a3d --- /dev/null +++ b/onnxruntime/test/testdata/transform/fusion/layer_norm_fusion_scale_bias.py @@ -0,0 +1,81 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +import onnx +from onnx import OperatorSetIdProto, TensorProto, helper + + +def GenerateModel(model_name, has_casts=False, has_identity=False): # noqa: N802 + nodes = [ # LayerNorm subgraph + helper.make_node("ReduceMean", ["A"], ["rd_out"], "reduce1", axes=[-1], keepdims=1), + helper.make_node("Sub", ["A", "rd_out"], ["sub_out"], "sub"), + helper.make_node("Pow", ["cast_sub_out" if has_casts else "sub_out", "pow_in_2"], ["pow_out"], "pow"), + helper.make_node("ReduceMean", ["pow_out"], ["rd2_out"], "reduce2", axes=[-1], keepdims=1), + helper.make_node("Add", ["rd2_out", "const_e12_f32"], ["add1_out"], "add1"), + helper.make_node("Sqrt", ["add1_out"], ["sqrt_out"], "sqrt"), + helper.make_node("Div", ["cast_sub_out" if has_casts else "sub_out", "sqrt_out"], ["div_out"], "div"), + helper.make_node( + "Mul", + ["gamma_id_out" if has_identity else "gamma", "cast_div_out" if has_casts else "div_out"], + ["mul_out"], + "mul", + ), + helper.make_node("Add", ["mul_out", "const_e6_f16_out" if has_identity else "const_e6_f16"], ["C"], "add2"), + ] + + if has_casts: + nodes.extend( + [ + helper.make_node("Cast", ["sub_out"], ["cast_sub_out"], "cast_sub", to=1), + helper.make_node("Cast", ["div_out"], ["cast_div_out"], "cast_2", to=10), + ] + ) + + if has_identity: + nodes.extend( + [ + helper.make_node("Identity", ["gamma"], ["gamma_id_out"], "gamma_identity"), + helper.make_node("Identity", ["const_e6_f16"], ["const_e6_f16_out"], "const_e6_f16_identity"), + ] + ) + + initializers = [ # initializers + helper.make_tensor("pow_in_2", TensorProto.FLOAT, [], [2]), + helper.make_tensor("const_e12_f32", TensorProto.FLOAT, [], [1e-12]), + helper.make_tensor("const_e6_f16", TensorProto.FLOAT16, [4], [1e-6, 1e-6, 1e-6, 1e-6]), + helper.make_tensor( + "gamma", + TensorProto.FLOAT16 if has_casts else TensorProto.FLOAT, + [4], + [1, 2, 3, 4], + ), + ] + + input_type = TensorProto.FLOAT16 if has_casts else TensorProto.FLOAT + output_type = TensorProto.FLOAT16 if has_casts else TensorProto.FLOAT + + graph = helper.make_graph( + nodes, + "LayerNorm", # name + [ # inputs + helper.make_tensor_value_info("A", input_type, [16, 32, 4]), + ], + [ # outputs + helper.make_tensor_value_info("C", output_type, [16, 32, 4]), + ], + initializers, + ) + + onnxdomain = OperatorSetIdProto() + onnxdomain.version = 12 + # The empty string ("") or absence of this field implies the operator set that is defined as part of the ONNX specification. + onnxdomain.domain = "" + msdomain = OperatorSetIdProto() + msdomain.version = 1 + msdomain.domain = "com.microsoft" + opsets = [onnxdomain, msdomain] + + model = helper.make_model(graph, opset_imports=opsets) + onnx.save(model, model_name) + + +GenerateModel("layer_norm_fusion_scale_bias.onnx", True, True) diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py index 4c72b6d98a088..f75d553a5f460 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py @@ -28,7 +28,8 @@ class PythonOpShapeInferStore: @classmethod def register(cls, kclass: torch.autograd.Function) -> None: - """Register a shape inference function for a torch.autograd.Function if there is staticmethod "infer_shape" defined. + """Register a shape inference function for a torch.autograd.Function if there is staticmethod + "infer_shape" defined. The signature of the shape inference function should be: @staticmethod @@ -51,6 +52,11 @@ def infer_shape( if hasattr(kclass, "infer_shape") and kclass_name not in cls._CLASS_MAP: cls._CLASS_MAP[kclass_name] = kclass.infer_shape + @classmethod + def register_func(cls, name: str, func: Callable) -> None: + """Register a shape inference function for a torch.autograd.Function by name.""" + cls._CLASS_MAP[name] = func + @classmethod def get_shape_infer(cls, name: str) -> Optional[Callable]: return cls._CLASS_MAP.get(name, None) @@ -228,9 +234,9 @@ def _export_pt_1_10(g, n, *args, **kwargs): input_float_tuples.extend(list(arg)) continue - is_inspect_activation = ( - func_full_qual_name == "onnxruntime.training.utils.hooks._subscriber_manager._InspectActivation" - ) + from onnxruntime.training.utils.hooks._statistics_subscriber import _InspectActivation + + is_inspect_activation = func_full_qual_name == get_fully_qualified_class_name(_InspectActivation) if is_inspect_activation and isinstance(arg, str): # _InspectActivation is a special case where the first argument is a string # that is used to determine the activation name to be inspected. @@ -307,14 +313,7 @@ def _export_pt_1_10(g, n, *args, **kwargs): _export = wrap_custom_export_function(_export_pt_1_10) -def _post_process_after_export(exported_model: ModelProto, enable_custom_autograd_function: bool) -> ModelProto: - """Post process the exported model.""" - if enable_custom_autograd_function: - exported_model = _post_process_enabling_autograd_function(exported_model) - return exported_model - - -def _post_process_enabling_autograd_function(exported_model: ModelProto) -> ModelProto: +def post_process_enabling_autograd_function(exported_model: ModelProto) -> ModelProto: # Loop all PythonOp, append "_ctx" as the first output. index = 0 for node in exported_model.graph.node: @@ -330,8 +329,7 @@ def _post_process_enabling_autograd_function(exported_model: ModelProto) -> Mode op_name_prefix = kclass_name break - if not node.name: - node.name = f"{op_name_prefix}_id_{index}" - index += 1 + node.name = f"{op_name_prefix}_id_{index}" + index += 1 return exported_model diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py index 845c7d83c2e7b..a5b96c4e37140 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py @@ -376,6 +376,16 @@ def wrap_all_outputs(result): result = backward_function(*wrapped_args) # Extract results as DLPack tensor list. + if isinstance(result, torch.Tensor): + result = [result] + elif isinstance(result, (tuple, list)): + result = list(result) + else: + raise wrap_exception( + ORTModuleIOError, + TypeError(f"ORTModule does not support the following model output type {type(result)}."), + ) + wrapped_returned_args = wrap_all_outputs(result) torch_interop_utils.unregister_grad_fn(id(ctx)) diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index 2227b630aee23..dfaac5f0fa836 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -19,11 +19,10 @@ import onnxruntime from onnxruntime.capi import _pybind_state as C from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference -from onnxruntime.training.utils import ORTModelInputOutputSchemaType +from onnxruntime.training.utils import ORTModelInputOutputSchemaType, onnx_dtype_to_pytorch from onnxruntime.training.utils.hooks import configure_ort_compatible_zero_stage3 from . import _are_deterministic_algorithms_enabled, _io, _logger, _onnx_models, _utils -from ._custom_autograd_function_exporter import _post_process_after_export from ._fallback import ( ORTModuleDeviceException, ORTModuleONNXModelException, @@ -141,9 +140,14 @@ def __init__( register_triton_op_executor() + self._zero_stage3_param_map = {} if self._runtime_options.enable_zero_stage3_support: # Cannot toggle feature enabling/disabling after the first time enabled. - configure_ort_compatible_zero_stage3() + from onnxruntime.training.utils.hooks._zero_offload_subscriber import _get_all_zero_stage3_params + + self._zero_stage3_param_map = _get_all_zero_stage3_params(self._flattened_module) + + configure_ort_compatible_zero_stage3(debug=False, stats_output_dir="ort_output", stats_overwrite=True) def _get_torch_gpu_allocator_function_addresses(self): if self._runtime_options.use_external_gpu_allocator and torch.cuda.is_available(): @@ -345,7 +349,8 @@ def _get_exported_model(self, input_schema: ORTModelInputOutputSchemaType, *inpu ) if os.path.exists(cache_dir) and os.path.isfile(filename): self._logger.info( - f"Cached model detected! Cached model will be used to save export and initialization time. If you want the model to be re-exported then DELETE {filename}." + f"Cached model detected! Cached model will be used to save export and initialization time." + f"If you want the model to be re-exported then DELETE {filename}." ) exported_model = onnx.load(filename) return exported_model @@ -409,9 +414,24 @@ def _get_exported_model(self, input_schema: ORTModelInputOutputSchemaType, *inpu ) exported_model = onnx.load_model_from_string(f.getvalue()) - exported_model = _post_process_after_export( - exported_model, self._runtime_options.enable_custom_autograd_function - ) + if self._runtime_options.enable_custom_autograd_function: + from ._custom_autograd_function_exporter import post_process_enabling_autograd_function + + exported_model = post_process_enabling_autograd_function(exported_model) + + if self._runtime_options.enable_zero_stage3_support: + from ._zero_stage3_compatibility import post_processing_enable_zero_stage3_compat + + exported_model = post_processing_enable_zero_stage3_compat( + exported_model, + self._zero_stage3_param_map, + [name for name, _ in self._flattened_module.named_parameters()], + ) + + # Cannot append pull weight trigger name to input names as following, otherwise, the later check ( + # https://github.com/microsoft/onnxruntime/blob/068300d97eb25e5b52324e7af54a45ed1fa6a4c3/orttraining/orttraining/python/training/ortmodule/_training_manager.py#L466C18-L466C18) + # find input info mismatch, will re-initialize the graph builder. + # self._input_info.require_grad_names.append(STAGE3_PULL_WEIGHT_TRIGGER_NAME) # Cache model for future runs if cache_dir: @@ -477,7 +497,14 @@ def _initialize_graph_builder(self): grad_builder_config = C.OrtModuleGraphBuilderConfiguration() grad_builder_config.initializer_names = initializer_names grad_builder_config.initializer_names_to_train = initializer_names_to_train - grad_builder_config.input_names_require_grad = self._input_info.require_grad_names + + input_names_require_grad = self._input_info.require_grad_names + if self._runtime_options.enable_zero_stage3_support: + from ._zero_stage3_compatibility import STAGE3_PULL_WEIGHT_TRIGGER_NAME + + # Add stage3 pull weight trigger name to require_grad_names, so that it will be included in the gradient graph. + input_names_require_grad.append(STAGE3_PULL_WEIGHT_TRIGGER_NAME) + grad_builder_config.input_names_require_grad = input_names_require_grad grad_builder_config.build_gradient_graph = self._export_mode == torch.onnx.TrainingMode.TRAINING grad_builder_config.enable_caching = self._runtime_options.enable_grad_acc_optimization grad_builder_config.loglevel = _logger.ortmodule_loglevel_to_onnxruntime_c_loglevel( @@ -553,6 +580,9 @@ def _enable_conditional_optimizations( inputs, kwargs ) + if self._runtime_options.enable_zero_stage3_support: + self._append_pull_weight_trigger_as_input(kwargs, detected_device) + _, embed_sparsity_results, label_sparsity_results = _io._combine_input_buffers_initializers( self._graph_initializers, self._graph_builder.get_graph_info().user_input_names, @@ -562,6 +592,7 @@ def _enable_conditional_optimizations( kwargs, detected_device, self._runtime_inspector, + self._zero_stage3_param_map, ) # Enable sparsity-based optimization when applicable. @@ -587,6 +618,21 @@ def _enable_conditional_optimizations( if self._runtime_options.print_memory_stat: self._runtime_inspector.enable_memory_inspector(self._original_module) + def _append_pull_weight_trigger_as_input(self, kwargs: Dict, device: torch.device): + from ._zero_stage3_compatibility import ( + STAGE3_PULL_WEIGHT_TRIGGER_NAME, + STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE, + STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE, + ) + + kwargs[STAGE3_PULL_WEIGHT_TRIGGER_NAME] = torch.zeros( + STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE, + dtype=onnx_dtype_to_pytorch(STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE), + device=device, + ).requires_grad_() + + return kwargs + def _log_feature_stats(self): if get_rank() != 0: return diff --git a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py index b7c01a1f5baf9..8d8be81c549d1 100644 --- a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py @@ -159,6 +159,9 @@ def forward(self, *inputs, **kwargs): # Assert that the input and model device match _utils._check_same_device(self._device, "Input argument to forward", *inputs) + if self._runtime_options.enable_zero_stage3_support: + self._append_pull_weight_trigger_as_input(kwargs, self._device) + prepared_input_list, _, _ = _io._combine_input_buffers_initializers( self._graph_initializers, self._graph_info.user_input_names, @@ -168,6 +171,7 @@ def forward(self, *inputs, **kwargs): kwargs, self._device, self._runtime_inspector, + self._zero_stage3_param_map, ) user_outputs, _ = InferenceManager.execution_session_run_forward( diff --git a/orttraining/orttraining/python/training/ortmodule/_io.py b/orttraining/orttraining/python/training/ortmodule/_io.py index 18b965c549645..e7c1b30daae0d 100644 --- a/orttraining/orttraining/python/training/ortmodule/_io.py +++ b/orttraining/orttraining/python/training/ortmodule/_io.py @@ -168,6 +168,7 @@ def _combine_input_buffers_initializers( kwargs: Mapping[str, ORTModelInputOutputType], device: torch.device, rt_inspector: RuntimeInspector, + zero_stage3_offload_param_map: Optional[Dict[str, torch.nn.parameter.Parameter]], ): """Creates forward `*inputs` list from user input and PyTorch initializers @@ -254,7 +255,12 @@ def _expand_inputs(current_input, non_none_inputs, name=""): ) # params is a list of all initializers known to the onnx graph - result.extend(params) + if zero_stage3_offload_param_map: + for p in params: + if p not in zero_stage3_offload_param_map.values(): + result.append(p) + else: + result.extend(params) return result, embed_sparsity_results, label_sparsity_results diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py index 3be4c05797978..19effe2086e0a 100644 --- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py @@ -311,6 +311,9 @@ def forward(self, *inputs, **kwargs): self._gradient_accumulation_manager.maybe_update_cache_before_run() + if self._runtime_options.enable_zero_stage3_support: + self._append_pull_weight_trigger_as_input(kwargs, self._device) + prepared_input_list, _, _ = _io._combine_input_buffers_initializers( self._graph_initializers, self._graph_info.user_input_names, @@ -320,6 +323,7 @@ def forward(self, *inputs, **kwargs): kwargs, self._device, self._runtime_inspector, + self._zero_stage3_param_map, ) outputs = unflatten_user_output( diff --git a/orttraining/orttraining/python/training/ortmodule/_zero_stage3_compatibility.py b/orttraining/orttraining/python/training/ortmodule/_zero_stage3_compatibility.py new file mode 100644 index 0000000000000..17756600d601e --- /dev/null +++ b/orttraining/orttraining/python/training/ortmodule/_zero_stage3_compatibility.py @@ -0,0 +1,312 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from typing import Dict, List, Optional, Tuple, Union + +import torch +from onnx import ModelProto, NodeProto, TensorProto, ValueInfoProto, helper + +from onnxruntime.capi._pybind_state import register_torch_autograd_function +from onnxruntime.training.utils import pytorch_dtype_to_onnx + +from ._custom_autograd_function_exporter import PythonOpShapeInferStore +from ._utils import get_fully_qualified_class_name + +STAGE3_PULL_WEIGHT_TRIGGER_NAME = "pull_weight_trigger" +STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE = TensorProto.FLOAT +STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE = [1] + + +def post_processing_enable_zero_stage3_compat( + exported_model: ModelProto, + zero_stage3_named_params: Dict[str, torch.nn.parameter.Parameter], + all_param_names: List[str], +) -> ModelProto: + """This function is used to enable zero stage3 compatibility. + + Args: + exported_model (ModelProto): The exported model. + zero_stage3_named_params (Optional[Dict[str, torch.nn.parameter.Parameter]]): The offload named parameters. + all_param_names (List[str]): All parameter names. + """ + + # Register symbolic shape inference functions for PythonOp used in DeepSpeed ZeRO stage3. + _register_symbolic_shape_infer_functions() + + # Create weight retrieving function using zero_stage3_named_params. + func_full_qual_name = _create_weight_retrieval_function(zero_stage3_named_params) + + consumer_map = {} + for node in exported_model.graph.node: + for inp in node.input: + if inp not in consumer_map: + consumer_map[inp] = [] + + if node not in consumer_map[inp]: + consumer_map[inp].append(node) + + def _get_param_pull_trigger_name(param_name: str) -> str: + return f"pull_{param_name}" + + def _get_func_name(node: NodeProto) -> Optional[str]: + for attr in node.attribute: + if attr.name == "func_name": + return attr.s.decode("utf-8") if isinstance(attr.s, bytes) else attr.s + return None + + # Create weight retrieving PythonOp. + new_input, weight_pull_node = _create_weight_retrieval_pythonop( + zero_stage3_named_params, + func_full_qual_name, + STAGE3_PULL_WEIGHT_TRIGGER_NAME, + [_get_param_pull_trigger_name(pname) for pname in zero_stage3_named_params], + STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE, + STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE, + ) + + from onnxruntime.training.utils.hooks._zero_offload_subscriber import ORTZeROOffloadPreForwardFunction + + prefowrad_function_name = get_fully_qualified_class_name(ORTZeROOffloadPreForwardFunction) + + # Connect weight consumers to use the full-sized parameter output of ORTZeROOffloadPreForwardFunction. + for graph_input in exported_model.graph.input: + if graph_input.name not in zero_stage3_named_params: + continue + + if graph_input.name not in consumer_map: + continue + + consumers = consumer_map[graph_input.name] + pre_forward_pythonop_node = None + + for c in consumers: + if c.op_type != "PythonOp": + continue + + func_name = _get_func_name(c) + if func_name == prefowrad_function_name: + assert ( + pre_forward_pythonop_node is None + ), "Multiple ORTZeROOffloadPreForwardFunction nodes found, it should not happen" + pre_forward_pythonop_node = c + + if pre_forward_pythonop_node is None: + raise RuntimeError( + "Fail to find ORTZeROOffloadPreForwardFunction for partitioned param: " + graph_input.name + ) + + index_offset_on_python_op_input = [] + for i, input_name in enumerate(pre_forward_pythonop_node.input): + if input_name == graph_input.name: + index_offset_on_python_op_input.append(i) + + assert ( + len(index_offset_on_python_op_input) == 1 + ), f"index_offset_on_python_op_input length is not 1: {index_offset_on_python_op_input}" + + reverse_index_among_inputs = index_offset_on_python_op_input[0] - len(pre_forward_pythonop_node.input) + new_input_name = _get_param_pull_trigger_name(graph_input.name) + pre_forward_pythonop_node.input[index_offset_on_python_op_input[0]] = new_input_name + + _update_python_op_input_related_attributes( + pre_forward_pythonop_node, + new_input_name, + len(STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE), # new rank + STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE, # new data type + ) + + output_index = reverse_index_among_inputs + len(pre_forward_pythonop_node.output) + pre_forward_pythonop_node.output[output_index] = graph_input.name + + # If the consumer of original `graph_input.name` is PythonOp, we need also update its attributes because now + # `graph_input.name` as output of pre_forward_pythonop_node, is full-sized parameter, the rank might differ + # from the original one. + for c in consumers: + if c == pre_forward_pythonop_node or c.op_type != "PythonOp": + continue + _update_python_op_input_related_attributes( + c, + graph_input.name, + len(zero_stage3_named_params[graph_input.name].ds_shape), # new rank + pytorch_dtype_to_onnx(zero_stage3_named_params[graph_input.name].dtype), # new data type + ) + + # Delete exported_model.graph.input + graph_inputs_to_remove = [ + graph_input for graph_input in exported_model.graph.input if graph_input.name in zero_stage3_named_params + ] + for input_to_remove in graph_inputs_to_remove: + exported_model.graph.input.remove(input_to_remove) + + # Re-order graph input to make sure the weight pull trigger is before all parameter inputs. + offset = 0 + for graph_input in exported_model.graph.input: + if graph_input.name in all_param_names: + break + offset += 1 + + exported_model.graph.input.insert(offset, new_input) + exported_model.graph.node.insert(0, weight_pull_node) + + return exported_model + + +def _create_weight_retrieval_function( + zero_stage3_named_params: Optional[Dict[str, torch.nn.parameter.Parameter]] +) -> str: + """This function is used to create a weight retrieving function using zero_stage3_named_params.""" + + class WeightRetrievalFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, weight_in_trigger): + params = list(zero_stage3_named_params.values()) + ctx.params = params + ctx.dtype = weight_in_trigger.dtype + ctx.device = weight_in_trigger.device + ctx.shape = weight_in_trigger.shape + return (torch.zeros(ctx.shape, device=ctx.device, dtype=ctx.dtype),) * len(params) + + @staticmethod + def backward(ctx, *grad_outputs): + return torch.zeros(ctx.shape, device=ctx.device, dtype=ctx.dtype) + + @staticmethod + def infer_shape( + node: NodeProto, + tensor_input_shapes: List[Optional[List[Union[int, str]]]], + tensor_input_dtypes: List[torch.onnx.TensorProtoDataType], + ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]: + param_count = len(zero_stage3_named_params.values()) + tensor_output_shapes = [ + tensor_input_shapes[0], + ] * param_count + tensor_output_dtypes = [ + tensor_input_dtypes[0], + ] * param_count + return tensor_output_shapes, tensor_output_dtypes + + func_full_qual_name = get_fully_qualified_class_name(WeightRetrievalFunction) + register_torch_autograd_function(func_full_qual_name, WeightRetrievalFunction) + PythonOpShapeInferStore.register(WeightRetrievalFunction) + + return func_full_qual_name + + +def _register_symbolic_shape_infer_functions(): + """This function is used to register symbolic shape inference functions for PythonOp used in + DeepSpeed ZeRO stage3.""" + + def _simple_pass_through_infer_shape( + node: NodeProto, + tensor_input_shapes: List[Optional[List[Union[int, str]]]], + tensor_input_dtypes: List[torch.onnx.TensorProtoDataType], + ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]: + return tensor_input_shapes, tensor_input_dtypes + + PythonOpShapeInferStore.register_func( + "deepspeed.runtime.zero.parameter_offload.PreBackwardFunction", _simple_pass_through_infer_shape + ) + PythonOpShapeInferStore.register_func( + "deepspeed.runtime.zero.parameter_offload.PostBackwardFunction", _simple_pass_through_infer_shape + ) + + def _linear_infer_shape( + node: NodeProto, + tensor_input_shapes: List[Optional[List[Union[int, str]]]], + tensor_input_dtypes: List[torch.onnx.TensorProtoDataType], + ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]: + # output = input.matmul(weight.t()) + tensor_input_shapes[0] # input + shape2 = tensor_input_shapes[1] # weight + output_shape = tensor_input_shapes[0] + output_shape[-1] = shape2[-2] + return [output_shape], [tensor_input_dtypes[0]] + + PythonOpShapeInferStore.register_func( + "deepspeed.runtime.zero.linear.LinearFunctionForZeroStage3", _linear_infer_shape + ) + + +def _create_weight_retrieval_pythonop( + zero_stage3_named_params: Optional[Dict[str, torch.nn.parameter.Parameter]], + func_full_qual_name: str, + input_name: str, + output_names: List[str], + STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE, + STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE: List[int], +) -> Tuple[ValueInfoProto, NodeProto]: + """This function is used to create a weight retrieving PythonOp.""" + offload_param_count = 0 if zero_stage3_named_params is None else len(zero_stage3_named_params) + new_input = helper.make_tensor_value_info( + input_name, STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE, STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE + ) + output_rank_for_pull_weight_trigger = len(STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE) + output_dtype_for_pull_weight_trigger = STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE + output_tensor_ranks = [ + output_rank_for_pull_weight_trigger, + ] * offload_param_count + output_tensor_types = [ + output_dtype_for_pull_weight_trigger, + ] * offload_param_count + + node_attributes = { + "comment": "", + "inplace": 0, + "input_convention": "d", + "input_tensor_ranks": [len(STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE)], + "input_tensor_types": [STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE], + "output_tensor_ranks": output_tensor_ranks, + "output_tensor_types": output_tensor_types, + "training_mode": 1, + "func_name": func_full_qual_name, + } + + weight_pull_node = helper.make_node( + "PythonOp", + [input_name], + ["pull_weight_trigger_ctx", *output_names], + "pull_weight_trigger", # node name + "PythonOp for weight retrieving.", + "com.microsoft", + **node_attributes, + ) + + return new_input, weight_pull_node + + +def _update_python_op_input_related_attributes(node: NodeProto, input_name: str, new_rank: int, new_dtype: int): + """This function is used to update PythonOp's input related attributes, e.g. + input_tensor_ranks and input_tensor_types. + + Args: + node (NodeProto): The PythonOp node. + input_name (str): The input name to be updated. + new_rank (int): The new rank of the input, to be used in input_tensor_ranks. + new_dtype (int): The new data type of the input, to be used in input_tensor_types. + """ + input_tensor_ranks = None + input_tensor_dtypes = None + rank_attr = None + dtype_attr = None + for attr in node.attribute: + if attr.name == "input_tensor_ranks": + input_tensor_ranks = attr.ints + rank_attr = attr + if attr.name == "input_tensor_types": + input_tensor_dtypes = attr.ints + dtype_attr = attr + + assert input_tensor_ranks is not None, "input_tensor_ranks is None" + assert input_tensor_dtypes is not None, "input_tensor_dtypes is None" + + for index, node_input_name in enumerate(node.input): + if node_input_name == input_name: + input_tensor_ranks[index] = new_rank + input_tensor_dtypes[index] = new_dtype + + node.attribute.remove(rank_attr) + node.attribute.remove(dtype_attr) + node.attribute.append(helper.make_attribute("input_tensor_ranks", input_tensor_ranks)) + node.attribute.append(helper.make_attribute("input_tensor_types", input_tensor_dtypes)) diff --git a/orttraining/orttraining/python/training/utils/__init__.py b/orttraining/orttraining/python/training/utils/__init__.py index acf2698d55eaf..fa7c9f2750cdd 100644 --- a/orttraining/orttraining/python/training/utils/__init__.py +++ b/orttraining/orttraining/python/training/utils/__init__.py @@ -9,7 +9,7 @@ extract_data_and_schema, unflatten_data_using_schema, ) -from onnxruntime.training.utils.torch_type_map import pytorch_dtype_to_onnx +from onnxruntime.training.utils.torch_type_map import onnx_dtype_to_pytorch, pytorch_dtype_to_onnx __all__ = [ "PrimitiveType", @@ -18,4 +18,5 @@ "extract_data_and_schema", "unflatten_data_using_schema", "pytorch_dtype_to_onnx", + "onnx_dtype_to_pytorch", ] diff --git a/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py b/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py index 6c8027b2fefaa..db1c69cf95ba4 100644 --- a/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py +++ b/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py @@ -6,6 +6,7 @@ import os import shutil import warnings +from io import TextIOWrapper from pathlib import Path from typing import List, Optional, Tuple, Union @@ -178,87 +179,97 @@ def _summarize_activations(self, tensor: torch.Tensor, depth: int, name: str, st order_file_path = step_path / "order.txt" tensor_file_path = step_path / output_file_name - # This is to try the best effort to align the count of numbers per line for easier comparison in diff views, - # though it does not always guarantee to do this way. - torch.set_printoptions(precision=6, linewidth=128) - - tensor_shape = tensor.shape - tensor_dtype = tensor.dtype - flatten_array = tensor.flatten().view(-1) - - if self._run_on_cpu: - flatten_array = flatten_array.to("cpu") - - if self._run_on_cpu: - num_nan = torch.isnan(flatten_array).sum() - num_inf = torch.isinf(flatten_array).sum() - num_neg = (flatten_array < 0).sum() - num_pos = (flatten_array > 0).sum() - num_zero = (flatten_array == 0).sum() - min_value = flatten_array.min() - max_value = flatten_array.max() - mean_value = flatten_array.mean() - std_value = flatten_array.std() - else: - # Split the calculation for each bucket, then do another round of calculation on the bucket results. - # This can at the best effort reduce the peak memory impact. - bucket_size = self._bucket_size - element_count = flatten_array.numel() - ceil_bucket_count = (element_count + bucket_size - 1) // (bucket_size) - nan_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device) - inf_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device) - neg_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device) - pos_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device) - zero_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device) - min_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device) - max_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device) - mean_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device) - std_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device) - - # Summary for each bucket - element_count_per_bucket = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device) - for i in range(ceil_bucket_count): - end = min((i + 1) * bucket_size, element_count) - bucket = flatten_array[i * bucket_size : end] - element_count_per_bucket[i] = bucket.numel() - - nan_buckets[i] = torch.isnan(bucket).sum() - inf_buckets[i] = torch.isinf(bucket).sum() - neg_buckets[i] = (bucket < 0).sum() - pos_buckets[i] = (bucket > 0).sum() - zero_buckets[i] = (bucket == 0).sum() - min_buckets[i] = bucket.min() - max_buckets[i] = bucket.max() - mean_buckets[i] = bucket.sum() - std_buckets[i] = bucket.std() - - # Reduction across all buckets - num_nan = nan_buckets.sum() - num_inf = inf_buckets.sum() - num_neg = neg_buckets.sum() - num_pos = pos_buckets.sum() - num_zero = zero_buckets.sum() - min_value = min_buckets.min() - max_value = max_buckets.max() - mean_value = float(mean_buckets.sum()) / float(element_count) - # Here we refer to - # https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups - # to calculate the combined standard deviation of all buckets. - s = (element_count_per_bucket - 1) * (std_buckets**2) + element_count_per_bucket * ( - (mean_buckets - mean_value) ** 2 - ) - std_value = torch.sqrt(s.sum() / (element_count - 1)) - with order_file_path.open(mode="a", encoding="utf-8") as f: f.write(f"{output_file_name}\n") with tensor_file_path.open(mode="w", encoding="utf-8") as f: - f.write( - f"{'>'*max(0, depth) + display_name} shape: {tensor_shape} dtype: {tensor_dtype} size: {flatten_array.size()} \n" - f"min: {min_value} max: {max_value}, mean: {mean_value}, " - f"std: {std_value} \n" - f"nan: {num_nan}, inf: {num_inf}\n" - ) - f.write(f"samples(top 128): {flatten_array[:128]}\n") - f.write(f"neg: {num_neg}, pos: {num_pos}, zero: {num_zero},\n") - f.write(f"{'='*16}\n") + _summarize_tensor(display_name, tensor, f, depth, self._run_on_cpu, self._bucket_size) + + +def _summarize_tensor( + display_name: str, + tensor: torch.Tensor, + f: TextIOWrapper, + depth: int = 0, + run_on_cpu: bool = False, + bucket_size: int = 1024 * 1024 * 1024 // 2, +): + # This is to try the best effort to align the count of numbers per line for easier comparison in diff views, + # though it does not always guarantee to do this way. + torch.set_printoptions(precision=6, linewidth=128) + + tensor_shape = tensor.shape + tensor_dtype = tensor.dtype + flatten_array = tensor.flatten().view(-1) + + if run_on_cpu: + flatten_array = flatten_array.to("cpu") + + if run_on_cpu: + num_nan = torch.isnan(flatten_array).sum() + num_inf = torch.isinf(flatten_array).sum() + num_neg = (flatten_array < 0).sum() + num_pos = (flatten_array > 0).sum() + num_zero = (flatten_array == 0).sum() + min_value = flatten_array.min() + max_value = flatten_array.max() + mean_value = flatten_array.mean() + std_value = flatten_array.std() + else: + # Split the calculation for each bucket, then do another round of calculation on the bucket results. + # This can at the best effort reduce the peak memory impact. + element_count = flatten_array.numel() + ceil_bucket_count = (element_count + bucket_size - 1) // (bucket_size) + nan_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device) + inf_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device) + neg_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device) + pos_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device) + zero_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device) + min_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device) + max_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device) + mean_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device) + std_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device) + + # Summary for each bucket + element_count_per_bucket = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device) + for i in range(ceil_bucket_count): + end = min((i + 1) * bucket_size, element_count) + bucket = flatten_array[i * bucket_size : end] + element_count_per_bucket[i] = bucket.numel() + + nan_buckets[i] = torch.isnan(bucket).sum() + inf_buckets[i] = torch.isinf(bucket).sum() + neg_buckets[i] = (bucket < 0).sum() + pos_buckets[i] = (bucket > 0).sum() + zero_buckets[i] = (bucket == 0).sum() + min_buckets[i] = bucket.min() + max_buckets[i] = bucket.max() + mean_buckets[i] = bucket.sum() + std_buckets[i] = bucket.std() + + # Reduction across all buckets + num_nan = nan_buckets.sum() + num_inf = inf_buckets.sum() + num_neg = neg_buckets.sum() + num_pos = pos_buckets.sum() + num_zero = zero_buckets.sum() + min_value = min_buckets.min() + max_value = max_buckets.max() + mean_value = float(mean_buckets.sum()) / float(element_count) + # Here we refer to + # https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups + # to calculate the combined standard deviation of all buckets. + s = (element_count_per_bucket - 1) * (std_buckets**2) + element_count_per_bucket * ( + (mean_buckets - mean_value) ** 2 + ) + std_value = torch.sqrt(s.sum() / (element_count - 1)) + + f.write( + f"{'>'*max(0, depth) + display_name} shape: {tensor_shape} dtype: {tensor_dtype} size: {flatten_array.size()} \n" + f"min: {min_value} max: {max_value}, mean: {mean_value}, " + f"std: {std_value} \n" + f"nan: {num_nan}, inf: {num_inf}\n" + ) + f.write(f"samples(top 128): {flatten_array[:128]}\n") + f.write(f"neg: {num_neg}, pos: {num_pos}, zero: {num_zero},\n") + f.write(f"{'='*16}\n") diff --git a/orttraining/orttraining/python/training/utils/hooks/_subscriber_manager.py b/orttraining/orttraining/python/training/utils/hooks/_subscriber_manager.py index db38f58d8f324..b2bc64be42fc1 100644 --- a/orttraining/orttraining/python/training/utils/hooks/_subscriber_manager.py +++ b/orttraining/orttraining/python/training/utils/hooks/_subscriber_manager.py @@ -29,14 +29,6 @@ def no_increase_global_step(): finally: ORT_NO_INCREASE_GLOBAL_STEP[0] = False - @staticmethod - def infer_shape( - node: onnx.NodeProto, - tensor_input_shapes: List[Optional[List[Union[int, str]]]], - tensor_input_dtypes: List[torch.onnx.TensorProtoDataType], - ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]: - return tensor_input_shapes, tensor_input_dtypes - class _IncrementStep(torch.autograd.Function): """This class is used to manage the global execution step, e.g. @@ -55,8 +47,9 @@ def forward(ctx, run_ctx: RuntimeStates, *input_tensor_list: Tuple[torch.Tensor, ctx.current_step = run_ctx.global_states.execution_step ctx.run_ctx = run_ctx - if ctx.current_step >= 0: - print(f"{'='*6} Completed forward pass for STEP {ctx.current_step} {'='*6}") + # Uncomment the following line for debugging purposes. + # if ctx.current_step >= 0: + # print(f"{'='*6} Completed forward pass for STEP {ctx.current_step} {'='*6}") if ORT_NO_INCREASE_GLOBAL_STEP[0] is False: ctx.run_ctx.global_states.execution_step += 1 @@ -191,7 +184,7 @@ def _reset_recursively(module: torch.nn.Module, depth: int, next_module_index: L next_module_index: list of int, carrying a global unique module index that can be used next. """ module_index = next_module_index[0] - module.id = module_index # STAGE3WARN: needed by DeepSpeed + module.id = module_index # STAGE3WARN#1: needed by DeepSpeed self._run_ctx.global_states.module_index_to_depth[module_index] = depth self._run_ctx.global_states.module_to_module_index[module] = module_index @@ -217,7 +210,7 @@ def _register_hooks_recursively(self, module: torch.nn.Module, depth: int, next_ next_module_index: list of int, carrying a global unique module index that can be used next. """ module_index = next_module_index[0] - module.id = module_index # STAGE3WARN: needed by DeepSpeed + module.id = module_index # STAGE3WARN#2: needed by DeepSpeed self._run_ctx.global_states.module_index_to_depth[module_index] = depth self._run_ctx.global_states.module_to_module_index[module] = module_index diff --git a/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py b/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py index 3d42e172eea82..ad1297962db71 100644 --- a/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py +++ b/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py @@ -23,25 +23,37 @@ from ._subscriber_base import RuntimeStates, SubscriberBase -# Used to monkey patch the original function -# Adapted from https://github.com/microsoft/DeepSpeed/blob/e8318634b4313eaad89842cf4322e1762d34ced3/deepspeed/runtime/zero/parameter_offload.py#L333 -def _setup_zero_stage3_ort_compatible_hooks(self): - self.hierarchy = 0 +def _get_ort_compatible_zero_stage3_hook_function(debug, stats_output_dir, stats_overwrite): + """Create ort compatible hook function for DeepSpeed ZeRO stage3. - from onnxruntime.training.utils.hooks import SubscriberManager, ZeROOffloadSubscriber - from onnxruntime.training.utils.hooks._zero_offload_subscriber import _zero_offload_one_time_initializer + Args: + debug: whether to enable convergence debugging. + stats_output_dir: the directory to store convergence stats. + stats_overwrite: whether to overwrite the stats file if it already exists. + """ + + # Used to monkey patch the original function + # Adapted from https://github.com/microsoft/DeepSpeed/blob/e8318634b4313eaad89842cf4322e1762d34ced3/deepspeed/runtime/zero/parameter_offload.py#L333 + def _setup_zero_stage3_ort_compatible_hooks(self): + self.hierarchy = 0 + + from onnxruntime.training.utils.hooks import StatisticsSubscriber, SubscriberManager, ZeROOffloadSubscriber + from onnxruntime.training.utils.hooks._zero_offload_subscriber import _zero_offload_one_time_initializer - # Each DeepSpeed engine has a separate subscriber manager. - self._offload_subscriber_manager = SubscriberManager() - self._offload_subscriber_manager.subscribe( - self.module, [ZeROOffloadSubscriber(self, _zero_offload_one_time_initializer)] - ) - self.forward_hooks.extend(self._offload_subscriber_manager._pre_forward_hooks) - self.forward_hooks.extend(self._offload_subscriber_manager._post_forward_hooks) + subscribers = [ZeROOffloadSubscriber(self, _zero_offload_one_time_initializer)] + if debug is True: + subscribers.append(StatisticsSubscriber(output_dir=stats_output_dir, override_output_dir=stats_overwrite)) + # Each DeepSpeed engine has a separate subscriber manager. + self._offload_subscriber_manager = SubscriberManager() + self._offload_subscriber_manager.subscribe(self.module, subscribers) + self.forward_hooks.extend(self._offload_subscriber_manager._pre_forward_hooks) + self.forward_hooks.extend(self._offload_subscriber_manager._post_forward_hooks) - # Add top module to stack trace - global FWD_MODULE_STACK # noqa: PLW0602 - FWD_MODULE_STACK.append(self.module) + # Add top module to stack trace + global FWD_MODULE_STACK # noqa: PLW0602 + FWD_MODULE_STACK.append(self.module) + + return _setup_zero_stage3_ort_compatible_hooks # Adapted from https://github.com/microsoft/DeepSpeed/blob/e8318634b4313eaad89842cf4322e1762d34ced3/deepspeed/runtime/zero/linear.py#L104 @@ -86,14 +98,16 @@ def collect_code(self, function: Callable): _zero_offload_one_time_initializer.collect_code(DeepSpeedZeRoOffload.setup_zero_stage3_hooks) # This is the function to enable ORT ZeRO offload. - def configure_ort_compatible_zero_stage3(): + def configure_ort_compatible_zero_stage3(debug=False, stats_output_dir="./", stats_overwrite=False): """Configure ZeRO stage3 to be ORT compatible. This function will overwrite the original DeepSpeed ZeRO stage3 hooks to make it ORT compatible. """ # Only done once no matter how many times this function is called for different modules. - DeepSpeedZeRoOffload.setup_zero_stage3_hooks = _setup_zero_stage3_ort_compatible_hooks + DeepSpeedZeRoOffload.setup_zero_stage3_hooks = _get_ort_compatible_zero_stage3_hook_function( + debug, stats_output_dir, stats_overwrite + ) from deepspeed.runtime.zero.linear import zero3_linear_wrap @@ -103,7 +117,7 @@ def configure_ort_compatible_zero_stage3(): except ImportError as e: warnings.warn(f"DeepSpeed import error {e}") - def configure_ort_compatible_zero_stage3(): + def configure_ort_compatible_zero_stage3(debug=False, stats_output_dir=None, stats_overwrite=False): raise RuntimeError("DeepSpeed is not installed, cannot configure ORT compatible ZeRO stage3.") @@ -115,13 +129,13 @@ def _get_params_for_current_module(module: torch.nn.Module) -> List[torch.nn.par """ from deepspeed.runtime.zero.partitioned_param_coordinator import iter_params - # Retrive the parameters that are not available for this module. + # Retrieve all parameters for this module. partitioned_params = [param for param in iter_params(module)] return partitioned_params -def _get_all_offloaded_params(module: torch.nn.Module) -> Dict[str, torch.nn.parameter.Parameter]: +def _get_all_zero_stage3_params(module: torch.nn.Module) -> Dict[str, torch.nn.parameter.Parameter]: """Retrieve all the parameters that are offloaded.""" from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus @@ -134,16 +148,13 @@ def _get_all_offloaded_params(module: torch.nn.Module) -> Dict[str, torch.nn.par class ORTZeROOffloadPreForwardFunction(torch.autograd.Function): - """This function is a common bridge to call original PyTorch's - pre_forward_function and post_backward_function. - """ + """This function is a common bridge to call original PyTorch's pre_forward_function""" @staticmethod def forward( ctx, module, pre_forward_with_kwargs_function, - post_backward_function, args_schema, kwargs_schema, args_tensor_count, @@ -155,7 +166,6 @@ def forward( ctx: context object module: the module to be called pre_forward_with_kwargs_function: the function to be called before forward (PyTorch's pre_forward_function) - post_backward_function: the function to be called after backward (PyTorch's post_backward_function) args_schema: the schema of the args, used to reconstruct the args in original form in PyTorch's pre_forward_function's inputs. kwargs_schema: the schema of the kwargs, used to reconstruct the kwargs in original form in @@ -168,6 +178,17 @@ def forward( args_tensors = tensor_list[:args_tensor_count] kwargs_tensors = tensor_list[args_tensor_count : args_tensor_count + kwargs_tensor_count] + # For PyTorch runs, the sizes are all 0, it does not need a gradient because + # param._detach().requires_grad_(False) is called. + # But for ORT runs, the sizes are all [1], as output of weight retrieval function. + # So we keep track of the shapes and dtypes of the passed-in tensors, then generate the grads in backward. + # While for both PyTorch and ORT runs, the grad is not important because they are not param grads + # anymore, they are only used for completing the full backward propagation. + passed_in_param_tensors = tensor_list[args_tensor_count + kwargs_tensor_count :] + ctx.shapes = [p.shape for p in passed_in_param_tensors] + ctx.dtypes = [p.dtype for p in passed_in_param_tensors] + ctx.devices = [p.device for p in passed_in_param_tensors] + args = unflatten_data_using_schema(args_tensors, args_schema) kwargs = unflatten_data_using_schema(kwargs_tensors, kwargs_schema) @@ -179,6 +200,8 @@ def forward( partitioned_params = _get_params_for_current_module(module) ctx.partitioned_params = partitioned_params + assert len(partitioned_params) == len(passed_in_param_tensors) + f_ret = pre_forward_with_kwargs_function(module, args, kwargs) if f_ret is None: @@ -188,7 +211,6 @@ def forward( updated_args, updated_kwargs = f_ret ctx.module = module - ctx.post_backward_function = post_backward_function updated_args_tensors, _ = extract_data_and_schema(updated_args) updated_kwargs_tensors, _ = extract_data_and_schema(updated_kwargs) @@ -203,17 +225,32 @@ def forward( @staticmethod def backward(ctx, *grads): updated_grads = grads - if ctx.post_backward_function is not None: - ret = ctx.post_backward_function(ctx.module, grads) - if ret is not None: - updated_grads = ret - # TODO(pengwa) Update grad for partitioned parameters. input_count = len(updated_grads) - len(ctx.partitioned_params) - zeros = [torch.zeros(0, dtype=p.dtype, device=p.device) for p in ctx.partitioned_params] - zero_grads = updated_grads[:input_count] + tuple(zeros) - - return (None, None, None, None, None, None, None, *zero_grads) + param_start_offset = input_count + + # Only need to accumulate grad explicitly for ORT run (e.g. ctx.shapes[0] == (1,)); + # In the PyTorch run, the accumulation happens automatically. + need_manual_grad_acc = len(ctx.shapes) > 0 and ctx.shapes[0] == (1,) + if need_manual_grad_acc: + for param_index, p in enumerate(ctx.partitioned_params): + g = updated_grads[param_index + param_start_offset] + if g is None: + raise RuntimeError(f"param {p} has no grad, this should not happen.") + # Param gradient accumulation is triggered here, along with the attached hooks, done by PyTorch. + assert p.shape == g.shape, f"param_index: {param_index} - param shape {p.shape} != grad shape {g.shape}" + p.backward(g) + + # At this point, the **real** param grads are already updated, the following grads are only used for + # completing the full backward propagation, will not affect parameter updates. + passed_in_param_grad = [ + torch.zeros(shape, dtype=dtype, device=device) + for shape, dtype, device in zip(ctx.shapes, ctx.dtypes, ctx.devices) + ] + + zero_grads = updated_grads[:input_count] + tuple(passed_in_param_grad) + + return (None, None, None, None, None, None, *zero_grads) @staticmethod def infer_shape( @@ -258,14 +295,14 @@ def forward( module: the module to be called post_forward_function: the function to be called after forward (PyTorch's post_forward_function) pre_backward_function: the function to be called before backward (PyTorch's pre_backward_function) - output_schema: the schema of the output, used to reconstruct the output in original form in + output_schema: the schema of the output, used to reconstruct the output in its original form in PyTorch's post_forward_function's inputs. output_tensors: the list of tensors. """ outputs = unflatten_data_using_schema(output_tensors, output_schema) - # STAGE3WARN: _post_forward_module_hook's second argument `input is not used, so we just pass a None here. + # STAGE3WARN#3: _post_forward_module_hook's second argument `input is not used, so we just pass a None here. updated_outputs = post_forward_function(module, None, outputs) if updated_outputs is None: @@ -341,11 +378,19 @@ def pre_forward_module_apply_impl( input and output for torch.autograd.Function, so we do flatten and unflatten here. """ + ## Handle `_post_backward_module_hook` - args_tensors, args_schema = extract_data_and_schema(args) - kwargs_tensors, kwargs_schema = extract_data_and_schema(kwargs) + # Put `_post_backward_module_hook` first because in backward, it is responsible for unloading parameters, + # we want ORTZeROOffloadPreForwardFunction's backward still be able to access the full sized parameters. + _post_backward_module_hook = self._functions.get("_post_backward_module_hook") + # STAGE3WARN#4: most logic in _post_backward_module_hook can be traced correctly so we don't need to + # wrap with PythonOp. For those cannot be traced, we handle them in STAGE3WARN#5. + updated_args = _post_backward_module_hook(module, args) - partitioned_params = _get_params_for_current_module(module) + ## Handle `_pre_forward_module_hook` + + args_tensors, args_schema = extract_data_and_schema(updated_args) + kwargs_tensors, kwargs_schema = extract_data_and_schema(kwargs) _pre_forward_module_hook = self._functions.get("_pre_forward_module_hook") @@ -358,18 +403,29 @@ def _wrap_pre_forward_module_hook(module, args, kwargs): if rets is not None: updated_args = rets - # STAGE3WARN: Moved from _post_backward_module_hook to make sure ORT run will trigger every iteration. + # STAGE3WARN#5: Moved from _post_backward_module_hook to make sure ORT run will trigger every iteration. module.ds_grads_remaining = 0 + return updated_args, updated_kwargs - all_tensors = args_tensors + kwargs_tensors + partitioned_params + # Need to pass the parameters as input to let the exporter trace the related weights for + # current ORTZeROOffloadPreForwardFunction + partitioned_params = _get_params_for_current_module(module) + # Don't require grad for passed-in parameter, otherwise it will be treated as a leaf node, in backward + # returned 0-sized grad did not match the param's gradient accumulator function's input shape metadata, + # PyTorch run will fail during backward. + # This will not harm parameter gradient build either in ORT or PyTorch, imagine the weights are used by + # computation anyway, so the gradient will be built. This hook only references the parameter, but won't + # generate a gradient path for it. + detached_partitioned_params = [p.detach().requires_grad_(False) for p in partitioned_params] + + all_tensors = args_tensors + kwargs_tensors + detached_partitioned_params self._check_all_tensor(all_tensors, module, "pre_forward_module_apply_impl input check") rets = ORTZeROOffloadPreForwardFunction.apply( module, _wrap_pre_forward_module_hook, - None, args_schema, kwargs_schema, args_tensor_count, @@ -385,11 +441,6 @@ def _wrap_pre_forward_module_hook(module, args, kwargs): updated_args = unflatten_data_using_schema(updated_args_tensors, args_schema) updated_kwargs = unflatten_data_using_schema(updated_kwargs_tensors, kwargs_schema) - _post_backward_module_hook = self._functions.get("_post_backward_module_hook") - # STAGE3WARN: Other part of _post_backward_module_hook can be traced correctly so we don't need to - # wrap with PythonOp. - updated_args = _post_backward_module_hook(module, updated_args) - return updated_args, updated_kwargs def post_forward_module_apply_impl( @@ -411,7 +462,7 @@ def post_forward_module_apply_impl( _post_forward_module_hook = self._functions.get("_post_forward_module_hook") def _wrap_post_forward_module_hook(module, input, outputs): - # STAGE3WARN: _post_forward_module_hook applied this for each tensor output, so we do a simple wrap here. + # STAGE3WARN#6: _post_forward_module_hook applied this for each tensor output, so we do a simple wrap here. from deepspeed.runtime.zero.partition_parameters import is_zero_param updated_outputs = _post_forward_module_hook(module, input, outputs) @@ -438,8 +489,8 @@ def _wrap_post_forward_module_hook(module, input, outputs): updated_outputs = unflatten_data_using_schema(updated_outputs_tensors, outputs_schema) _pre_backward_module_hook = self._functions.get("_pre_backward_module_hook") - # STAGE3WARN: _pre_backward_module_hook's second argument `input is not used, so we just pass a None here. - # STAGE3WARN: part of the original _pre_backward_module_hook can be traced correctly so we moved them into + # STAGE3WARN#7: _pre_backward_module_hook's second argument `input is not used, so we just pass a None here. + # STAGE3WARN#8: part of the original _pre_backward_module_hook can be traced correctly so we moved them into # _wrap_post_forward_module_hook above. updated_outputs = _pre_backward_module_hook(module, None, updated_outputs) diff --git a/orttraining/orttraining/python/training/utils/torch_type_map.py b/orttraining/orttraining/python/training/utils/torch_type_map.py index 699747723f457..bdacab8ad04fe 100644 --- a/orttraining/orttraining/python/training/utils/torch_type_map.py +++ b/orttraining/orttraining/python/training/utils/torch_type_map.py @@ -33,6 +33,8 @@ _DTYPE_TO_ONNX = {torch_dtype: onnx_dtype for k, (onnx_dtype, torch_dtype) in _CAST_PYTORCH_TO_ONNX.items()} +_ONNX_TO_DTYPE = {onnx_dtype: torch_dtype for torch_dtype, onnx_dtype in _DTYPE_TO_ONNX.items()} + def pytorch_dtype_to_onnx(dtype_or_scalar_type: Union[torch.dtype, str]) -> torch.onnx.TensorProtoDataType: """Converts a pytorch dtype or scalar type string to an onnx dtype.""" @@ -45,3 +47,10 @@ def pytorch_dtype_to_onnx(dtype_or_scalar_type: Union[torch.dtype, str]) -> torc if dtype not in _DTYPE_TO_ONNX: raise RuntimeError(f"Unsupported dtype {dtype}") return _DTYPE_TO_ONNX[dtype] + + +def onnx_dtype_to_pytorch(dtype: torch.onnx.TensorProtoDataType) -> torch.dtype: + """Converts an onnx dtype to a pytorch dtype.""" + if dtype not in _ONNX_TO_DTYPE: + raise RuntimeError(f"Unsupported dtype {dtype}") + return _ONNX_TO_DTYPE[dtype] diff --git a/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel_base.cc b/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel_base.cc index 4e7fcbc95bb1d..e1d4be24861f5 100644 --- a/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel_base.cc +++ b/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel_base.cc @@ -153,8 +153,11 @@ void PythonOpBase::RunForward(OpKernelContext* context, inplace_ != 0, kernel_invoke_id_); - ORT_ENFORCE(1 + returned_ortvalues.size() == static_cast(context->OutputCount()), - "Output count mismatch for PythonOp run"); + const size_t returned_output_count = 1 + returned_ortvalues.size(); + const size_t kernel_output_count = static_cast(context->OutputCount()); + ORT_ENFORCE(returned_output_count == kernel_output_count, "Output count mismatch for PythonOp run, ", + "returned_output_count: ", returned_output_count, ", expected kernel_output_count: ", + kernel_output_count); } void PythonOpBase::SetOutputs(OpKernelContext* context, void* diff_ctx, std::vector& returned_args) const { diff --git a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml index 21bc1c481b3e6..33fc9d94bac09 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml @@ -200,8 +200,11 @@ stages: - stage: arm64_test dependsOn: ['arm64_build'] jobs: - - template: templates/py-packaging-linux-test.yml + - template: templates/py-packaging-linux-test-cpu.yml parameters: arch: 'aarch64' machine_pool: 'onnxruntime-linux-ARM64-CPU-2019' - device: 'CPU' + base_image: 'arm64v8/almalinux:8' + devtoolset_rootpath: /opt/rh/gcc-toolset-12/root + ld_library_path_arg: /opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64 + prepend_path: '/opt/rh/gcc-toolset-12/root/usr/bin:' diff --git a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml index 3333a7d22a41b..8dd1f0c5c6461 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml @@ -222,7 +222,7 @@ jobs: clean: all pool: AMD-GPU dependsOn: - - Linux_Build + - Linux_Build_ubuntu timeoutInMinutes: 120 steps: diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml index c684e08ba1258..2161a9205f22d 100644 --- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml @@ -3,24 +3,38 @@ resources: - pipeline: build source: 'Python packaging pipeline' trigger: true + branch: main # branch to pick the artifact, Used only for manual triggered pipeline runs for testing the pipeline itself + #TODO: Remove the following dependency. Running python tests should not need to use manylinux. + repositories: + - repository: manylinux # The name used to reference this repository in the checkout step + type: Github + endpoint: Microsoft + name: pypa/manylinux + ref: 5eda9aded5462201e6310105728d33016e637ea7 stages: - stage: Linux_Test_CPU_x86_64_stage jobs: - - template: templates/py-packaging-linux-test.yml + - template: templates/py-packaging-linux-test-cpu.yml parameters: arch: 'x86_64' machine_pool: 'onnxruntime-Ubuntu2004-AMD-CPU' - device: 'CPU' + base_image: 'registry.access.redhat.com/ubi8/ubi' + devtoolset_rootpath: /opt/rh/gcc-toolset-12/root + ld_library_path_arg: /opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64 + prepend_path: '/opt/rh/gcc-toolset-12/root/usr/bin:' - stage: Linux_Test_CPU_aarch64_stage dependsOn: [] jobs: - - template: templates/py-packaging-linux-test.yml + - template: templates/py-packaging-linux-test-cpu.yml parameters: arch: 'aarch64' machine_pool: 'aiinfra-linux-ARM64-CPU-2019' - device: 'CPU' + base_image: 'arm64v8/almalinux:8' + devtoolset_rootpath: /opt/rh/gcc-toolset-12/root + ld_library_path_arg: /opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64 + prepend_path: '/opt/rh/gcc-toolset-12/root/usr/bin:' - stage: Packages_Somking_Test dependsOn: [] @@ -31,19 +45,6 @@ stages: machine_pool: vmImage: 'macOS-13' itemPattern: '*/*mac*x86_64.whl' - - template: templates/py-package-smoking-test.yml - parameters: - job_name: Test_WIN_64_Wheels - itemPattern: '*/*win_amd64.whl' - machine_pool: - vmImage: 'windows-2022' - - template: templates/py-package-smoking-test.yml - parameters: - job_name: Test_WIN_32_Wheels - itemPattern: '*/*win32.whl' - python_arch: 'x86' - machine_pool: - vmImage: 'windows-2022' - template: templates/py-package-smoking-test.yml parameters: job_name: Test_LINUX_x86_64_Wheels @@ -61,7 +62,7 @@ stages: - Linux_Test_CPU_aarch64_stage - Packages_Somking_Test jobs: - - template: templates/py-packaging-linux-test.yml + - template: templates/py-packaging-linux-test-cuda.yml parameters: arch: 'x86_64' machine_pool: 'Onnxruntime-Linux-GPU' diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml index 796938dc22a67..15fcec0511741 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml @@ -68,7 +68,7 @@ jobs: script: | mkdir -p $HOME/.onnx docker run --rm -e CFLAGS="${{parameters.OnnxruntimeCFlags}}" -e CXXFLAGS="${{parameters.OnnxruntimeCXXFlags}}" --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build \ - --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}} /bin/bash -c "python3 \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}} /bin/bash -c "python3.9 \ /onnxruntime_src/tools/ci_build/build.py --build_java --build_nodejs --build_dir /build --config Release \ --skip_submodule_sync --parallel --build_shared_lib ${{ parameters.AdditionalBuildFlags }} && cd /build/Release && make install DESTDIR=/build/linux-${{parameters.OnnxruntimeArch}}" workingDirectory: $(Build.SourcesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml index cee3bd9c9e968..8d5ca19a73535 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml @@ -39,36 +39,22 @@ jobs: versionSpec: $(PythonVersion) architecture: ${{ parameters.python_arch }} - - task: DownloadPipelineArtifact@2 - displayName: 'Download Pipeline Artifact' - inputs: - artifactName: 'onnxruntime' - targetPath: '$(Build.BinariesDirectory)/whl' - itemPattern: ${{parameters.itemPattern}} - # The public ADO project - ${{ if eq(variables['System.CollectionId'], 'f3ad12f2-e480-4533-baf2-635c95467d29') }}: - buildType: current - # The private ADO project - ${{ if eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c') }}: - project: '530acbc4-21bc-487d-8cd8-348ff451d2ff' - definition: 841 - preferTriggeringPipeline: true - runVersion: 'latest' - buildType: specific + - download: build # pipeline resource identifier. + artifact: 'onnxruntime' - task: Bash@3 inputs: targetType: 'inline' script: | set -ex - files=(whl/*.whl) + files=(*.whl) FILE_NAME="${files[0]}" FILE_NAME=$(basename $FILE_NAME) PYTHON_PACKAGE_NAME=$(echo "$FILE_NAME" | cut -f 1 -d '-') - python3 -m pip install --find-links "$(Build.BinariesDirectory)/whl" $PYTHON_PACKAGE_NAME - pip show $PYTHON_PACKAGE_NAME - python -c "import onnxruntime as ort; print(ort.__version__)" - workingDirectory: $(Build.BinariesDirectory) + python3 -m pip install --find-links "$(Pipeline.Workspace)/build/onnxruntime" $PYTHON_PACKAGE_NAME + python3 -m pip show $PYTHON_PACKAGE_NAME + python3 -c "import onnxruntime as ort; print(ort.__version__)" + workingDirectory: $(Pipeline.Workspace)/build/onnxruntime displayName: Test Package Installation - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cpu.yml new file mode 100644 index 0000000000000..cc90085e184dc --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cpu.yml @@ -0,0 +1,117 @@ +parameters: +- name: arch + type: string + +- name: base_image + type: string + +- name: devtoolset_rootpath + type: string + +- name: ld_library_path_arg + type: string + +- name: prepend_path + type: string + +- name: machine_pool + type: string + +- name: extra_job_id + type: string + default: '' + +- name: python_wheel_suffix + type: string + default: '' + + +# TODO: Ideally it should fetch information from the build that triggers it +- name: cmake_build_type + type: string + default: 'Release' + values: + - Debug + - Release + - RelWithDebInfo + - MinSizeRel + +- name: timeout + type: number + default: 120 + +jobs: +- job: Linux_Test_CPU${{ parameters.extra_job_id }}_${{ parameters.arch }} + timeoutInMinutes: ${{ parameters.timeout }} + variables: + skipComponentGovernanceDetection: true + workspace: + clean: all + pool: ${{ parameters.machine_pool }} + steps: + - checkout: self + clean: true + submodules: none + # The public ADO project + - ${{ if eq(variables['System.CollectionId'], 'f3ad12f2-e480-4533-baf2-635c95467d29') }}: + - download: current # pipeline resource identifier. + artifact: 'drop-linux-cpu-${{ parameters.arch }}' + + - download: current # pipeline resource identifier. + artifact: 'onnxruntime${{ parameters.python_wheel_suffix }}' + + - bash: | + set -e -x + mv "$(Pipeline.Workspace)/drop-linux-cpu-${{ parameters.arch }}" $(Build.BinariesDirectory)/${{parameters.cmake_build_type}} + mv "$(Pipeline.Workspace)/onnxruntime${{ parameters.python_wheel_suffix }}" "$(Build.BinariesDirectory)/whl" + cp -r "$(Build.BinariesDirectory)/whl" $(Build.BinariesDirectory)/tmp + find "$(Build.BinariesDirectory)/tmp" -name '*.whl' -exec bash -c 'unzip -d "${1%.*}" "$1"' _ {} \; + # The private ADO project + - ${{ if eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c') }}: + - download: build # pipeline resource identifier. + artifact: 'drop-linux-cpu-${{ parameters.arch }}' + + - download: build # pipeline resource identifier. + artifact: 'onnxruntime${{ parameters.python_wheel_suffix }}' + + - bash: | + set -e -x + ls $(Pipeline.Workspace)/build + mv "$(Pipeline.Workspace)/build/drop-linux-cpu-${{ parameters.arch }}" $(Build.BinariesDirectory)/${{parameters.cmake_build_type}} + mv "$(Pipeline.Workspace)/build/onnxruntime${{ parameters.python_wheel_suffix }}" "$(Build.BinariesDirectory)/whl" + cp -r "$(Build.BinariesDirectory)/whl" $(Build.BinariesDirectory)/tmp + find "$(Build.BinariesDirectory)/tmp" -name '*.whl' -exec bash -c 'unzip -d "${1%.*}" "$1"' _ {} \; + + # The BinSkim task uses a dotnet program which doesn't support ARM CPUs yet + - ${{ if eq(parameters.arch, 'x86_64') }}: + - task: BinSkim@4 + displayName: 'Run BinSkim' + inputs: + AnalyzeTargetGlob: '$(Build.BinariesDirectory)/tmp/**/*.so' + continueOnError: true + + #- task: PostAnalysis@2 + # inputs: + # GdnBreakAllTools: true + # GdnBreakPolicy: M365 + # GdnBreakPolicyMinSev: Error + + - template: get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu + Context: tools/ci_build/github/linux/docker/inference/x64/python/cpu + DockerBuildArgs: "--build-arg POLICY=manylinux_2_28 --build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=${{ parameters.base_image }} --build-arg PLATFORM=${{ parameters.arch }} --build-arg PREPEND_PATH=${{ parameters.prepend_path }} --build-arg LD_LIBRARY_PATH_ARG=${{ parameters.ld_library_path_arg }} --build-arg DEVTOOLSET_ROOTPATH=${{ parameters.devtoolset_rootpath }}" + Repository: onnxruntimecpubuildpython${{ parameters.arch }} + ${{ if eq(parameters.arch, 'aarch64') }}: + UpdateDepsTxt: false + + - task: Bash@3 + displayName: 'Bash Script' + inputs: + targetType: filePath + filePath: tools/ci_build/github/linux/run_python_dockertest.sh + arguments: -d CPU -c ${{parameters.cmake_build_type}} -i onnxruntimecpubuildpython${{ parameters.arch }} + + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml new file mode 100644 index 0000000000000..43ed0172825bc --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml @@ -0,0 +1,98 @@ +parameters: +- name: arch + type: string + +- name: device + type: string + values: + - CPU + - GPU + +- name: machine_pool + type: string + +- name: extra_job_id + type: string + default: '' + +- name: python_wheel_suffix + type: string + default: '' + + +# TODO: Ideally it should fetch information from the build that triggers it +- name: cmake_build_type + type: string + default: 'Release' + values: + - Debug + - Release + - RelWithDebInfo + - MinSizeRel + +- name: timeout + type: number + default: 120 + +jobs: +- job: Linux_Test_GPU${{ parameters.extra_job_id }}_${{ parameters.arch }} + timeoutInMinutes: ${{ parameters.timeout }} + variables: + skipComponentGovernanceDetection: true + workspace: + clean: all + pool: ${{ parameters.machine_pool }} + steps: + - checkout: self + clean: true + submodules: none + # The public ADO project + # - ${{ if eq(variables['System.CollectionId'], 'f3ad12f2-e480-4533-baf2-635c95467d29') }}: + + # The private ADO project + - ${{ if eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c') }}: + - download: build # pipeline resource identifier. + artifact: 'drop-linux-gpu-${{ parameters.arch }}' + + - download: build # pipeline resource identifier. + artifact: 'onnxruntime${{ parameters.python_wheel_suffix }}' + + - bash: | + set -e -x + ls $(Pipeline.Workspace)/build + mv "$(Pipeline.Workspace)/build/drop-linux-gpu-${{ parameters.arch }}" $(Build.BinariesDirectory)/${{parameters.cmake_build_type}} + mv "$(Pipeline.Workspace)/build/onnxruntime${{ parameters.python_wheel_suffix }}" "$(Build.BinariesDirectory)/whl" + cp -r "$(Build.BinariesDirectory)/whl" $(Build.BinariesDirectory)/tmp + find "$(Build.BinariesDirectory)/tmp" -name '*.whl' -exec bash -c 'unzip -d "${1%.*}" "$1"' _ {} \; + + # The BinSkim task uses a dotnet program which doesn't support ARM CPUs yet + - ${{ if eq(parameters.arch, 'x86_64') }}: + - task: BinSkim@4 + displayName: 'Run BinSkim' + inputs: + AnalyzeTargetGlob: '$(Build.BinariesDirectory)/tmp/**/*.so' + continueOnError: true + + #- task: PostAnalysis@2 + # inputs: + # GdnBreakAllTools: true + # GdnBreakPolicy: M365 + # GdnBreakPolicyMinSev: Error + + - template: get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: "--network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u ) --build-arg PLATFORM=${{ parameters.arch }}" + Repository: onnxruntimecuda118xtrt86build${{ parameters.arch }} + + - task: Bash@3 + displayName: 'Bash Script' + inputs: + targetType: filePath + filePath: tools/ci_build/github/linux/run_python_dockertest.sh + arguments: -d GPU -c ${{parameters.cmake_build_type}} -i onnxruntimecuda118xtrt86build${{ parameters.arch }} + + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test.yml deleted file mode 100644 index 8ddc917e8591e..0000000000000 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test.yml +++ /dev/null @@ -1,85 +0,0 @@ -parameters: -- name: arch - type: string - -- name: device - type: string - -- name: machine_pool - type: string - -- name: extra_job_id - type: string - default: '' - -- name: python_wheel_suffix - type: string - default: '' - - -# TODO: Ideally it should fetch information from the build that triggers it -- name: cmake_build_type - type: string - default: 'Release' - values: - - Debug - - Release - - RelWithDebInfo - - MinSizeRel - -- name: timeout - type: number - default: 120 - -jobs: -- job: Linux_Test_${{ parameters.device }}${{ parameters.extra_job_id }}_${{ parameters.arch }} - timeoutInMinutes: ${{ parameters.timeout }} - variables: - skipComponentGovernanceDetection: true - workspace: - clean: all - pool: ${{ parameters.machine_pool }} - steps: - - task: DownloadPipelineArtifact@2 - displayName: 'Download Pipeline Artifact' - inputs: - artifactName: 'drop-linux-${{ lower(parameters.device) }}-${{ parameters.arch }}' - targetPath: '$(Build.BinariesDirectory)/${{parameters.cmake_build_type}}' - # The public ADO project - ${{ if eq(variables['System.CollectionId'], 'f3ad12f2-e480-4533-baf2-635c95467d29') }}: - buildType: current - # The private ADO project - ${{ if eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c') }}: - project: '530acbc4-21bc-487d-8cd8-348ff451d2ff' - definition: 841 - preferTriggeringPipeline: true - runVersion: 'latest' - buildType: specific - - - task: DownloadPipelineArtifact@2 - displayName: 'Download Pipeline Artifact' - inputs: - artifactName: 'onnxruntime${{ parameters.python_wheel_suffix }}' - targetPath: '$(Build.BinariesDirectory)/whl' - # The public ADO project - ${{ if eq(variables['System.CollectionId'], 'f3ad12f2-e480-4533-baf2-635c95467d29') }}: - buildType: current - # The private ADO project - ${{ if eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c') }}: - project: '530acbc4-21bc-487d-8cd8-348ff451d2ff' - definition: 841 - preferTriggeringPipeline: true - runVersion: 'latest' - buildType: specific - - - - task: Bash@3 - displayName: 'Bash Script' - inputs: - targetType: filePath - filePath: tools/ci_build/github/linux/run_python_tests.sh - arguments: -d ${{ parameters.device }} -c ${{parameters.cmake_build_type}} - - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() diff --git a/tools/ci_build/github/linux/copy_strip_binary.sh b/tools/ci_build/github/linux/copy_strip_binary.sh index b875a3937aaa9..63690b69fc91a 100755 --- a/tools/ci_build/github/linux/copy_strip_binary.sh +++ b/tools/ci_build/github/linux/copy_strip_binary.sh @@ -48,6 +48,7 @@ fi cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_c_api.h $BINARY_DIR/$ARTIFACT_NAME/include cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_cxx_api.h $BINARY_DIR/$ARTIFACT_NAME/include cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_cxx_inline.h $BINARY_DIR/$ARTIFACT_NAME/include +cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_float16.h $BINARY_DIR/$ARTIFACT_NAME/include cp $SOURCE_DIR/include/onnxruntime/core/providers/cpu/cpu_provider_factory.h $BINARY_DIR/$ARTIFACT_NAME/include cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h $BINARY_DIR/$ARTIFACT_NAME/include cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h $BINARY_DIR/$ARTIFACT_NAME/include diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu index a9a1e6b39a8cb..af87852561e0a 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu @@ -1,9 +1,9 @@ -ARG BASEIMAGE=amd64/almalinux:8 +ARG BASEIMAGE=registry.access.redhat.com/ubi8/ubi ARG POLICY=manylinux_2_28 ARG PLATFORM=x86_64 ARG DEVTOOLSET_ROOTPATH=/opt/rh/gcc-toolset-12/root ARG LD_LIBRARY_PATH_ARG=${DEVTOOLSET_ROOTPATH}/usr/lib64:${DEVTOOLSET_ROOTPATH}/usr/lib:${DEVTOOLSET_ROOTPATH}/usr/lib64/dyninst:${DEVTOOLSET_ROOTPATH}/usr/lib/dyninst:/usr/local/lib64 -ARG PREPEND_PATH=${DEVTOOLSET_ROOTPATH}/usr/bin: +ARG PREPEND_PATH=/usr/lib/jvm/msopenjdk-11/bin:${DEVTOOLSET_ROOTPATH}/usr/bin: #Build manylinux2014 docker image begin FROM $BASEIMAGE AS runtime_base @@ -26,7 +26,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors # setup entrypoint, this will wrap commands with `linux32` with i686 images COPY build_scripts/install-entrypoint.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ @@ -35,7 +34,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint ENTRYPOINT ["manylinux-entrypoint"] COPY build_scripts/install-runtime-packages.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/ @@ -137,9 +135,7 @@ COPY --from=build_git /manylinux-rootfs / COPY --from=build_cpython /manylinux-rootfs / COPY --from=all_python /opt/_internal /opt/_internal/ COPY build_scripts/finalize.sh \ - build_scripts/update-system-packages.sh \ build_scripts/python-tag-abi-tag.py \ - build_scripts/requirements3.8.txt \ build_scripts/requirements3.9.txt \ build_scripts/requirements3.10.txt \ @@ -156,6 +152,7 @@ CMD ["/bin/bash"] #Build manylinux2014 docker image end ENV PATH ${DEVTOOLSET_ROOTPATH}/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 index dab8df6703c4f..933b0211b0e6c 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 @@ -31,7 +31,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors # setup entrypoint, this will wrap commands with `linux32` with i686 images COPY build_scripts/install-entrypoint.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ @@ -40,7 +39,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint ENTRYPOINT ["manylinux-entrypoint"] COPY build_scripts/install-runtime-packages.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/ @@ -140,7 +138,6 @@ COPY --from=build_git /manylinux-rootfs / COPY --from=build_cpython /manylinux-rootfs / COPY --from=all_python /opt/_internal /opt/_internal/ COPY build_scripts/finalize.sh \ - build_scripts/update-system-packages.sh \ build_scripts/python-tag-abi-tag.py \ build_scripts/requirements3.8.txt \ build_scripts/requirements3.9.txt \ @@ -156,7 +153,7 @@ ENV SSL_CERT_FILE=/opt/_internal/certs.pem CMD ["/bin/bash"] #Build manylinux2014 docker image end - +ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 #Add our own dependencies ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_4 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_4 index 303e83eb23bca..003bb2324c049 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_4 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_4 @@ -31,7 +31,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors # setup entrypoint, this will wrap commands with `linux32` with i686 images COPY build_scripts/install-entrypoint.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ @@ -40,7 +39,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint ENTRYPOINT ["manylinux-entrypoint"] COPY build_scripts/install-runtime-packages.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/ @@ -140,7 +138,6 @@ COPY --from=build_git /manylinux-rootfs / COPY --from=build_cpython /manylinux-rootfs / COPY --from=all_python /opt/_internal /opt/_internal/ COPY build_scripts/finalize.sh \ - build_scripts/update-system-packages.sh \ build_scripts/python-tag-abi-tag.py \ build_scripts/requirements3.8.txt \ build_scripts/requirements3.9.txt \ @@ -163,7 +160,7 @@ RUN v="8.4.1-1.cuda11.6" &&\ yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo &&\ yum -y install libnvinfer8-${v} libnvparsers8-${v} libnvonnxparsers8-${v} libnvinfer-plugin8-${v} \ libnvinfer-devel-${v} libnvparsers-devel-${v} libnvonnxparsers-devel-${v} libnvinfer-plugin-devel-${v} - +ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 #Add our own dependencies ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_5 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_5 index d17e4b24582fe..0337ffc5e00a0 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_5 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_5 @@ -31,7 +31,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors # setup entrypoint, this will wrap commands with `linux32` with i686 images COPY build_scripts/install-entrypoint.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ @@ -40,7 +39,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint ENTRYPOINT ["manylinux-entrypoint"] COPY build_scripts/install-runtime-packages.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/ @@ -140,7 +138,6 @@ COPY --from=build_git /manylinux-rootfs / COPY --from=build_cpython /manylinux-rootfs / COPY --from=all_python /opt/_internal /opt/_internal/ COPY build_scripts/finalize.sh \ - build_scripts/update-system-packages.sh \ build_scripts/python-tag-abi-tag.py \ build_scripts/requirements3.8.txt \ build_scripts/requirements3.9.txt \ @@ -163,7 +160,7 @@ RUN v="8.5.1-1.cuda11.8" &&\ yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo &&\ yum -y install libnvinfer8-${v} libnvparsers8-${v} libnvonnxparsers8-${v} libnvinfer-plugin8-${v} \ libnvinfer-devel-${v} libnvparsers-devel-${v} libnvonnxparsers-devel-${v} libnvinfer-plugin-devel-${v} - +ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 #Add our own dependencies ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 index 3c0ac22e38b5a..2c953a10cbf64 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 @@ -31,7 +31,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors # setup entrypoint, this will wrap commands with `linux32` with i686 images COPY build_scripts/install-entrypoint.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ @@ -40,7 +39,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint ENTRYPOINT ["manylinux-entrypoint"] COPY build_scripts/install-runtime-packages.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/ @@ -147,7 +145,6 @@ COPY --from=build_git /manylinux-rootfs / COPY --from=build_cpython /manylinux-rootfs / COPY --from=all_python /opt/_internal /opt/_internal/ COPY build_scripts/finalize.sh \ - build_scripts/update-system-packages.sh \ build_scripts/python-tag-abi-tag.py \ build_scripts/requirements3.7.txt \ build_scripts/requirements3.8.txt \ @@ -171,7 +168,7 @@ RUN v="8.6.1.6-1.cuda11.8" &&\ yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo &&\ yum -y install libnvinfer8-${v} libnvparsers8-${v} libnvonnxparsers8-${v} libnvinfer-plugin8-${v} libnvinfer-vc-plugin8-${v}\ libnvinfer-devel-${v} libnvparsers-devel-${v} libnvonnxparsers-devel-${v} libnvinfer-plugin-devel-${v} libnvinfer-vc-plugin-devel-${v} libnvinfer-headers-devel-${v} libnvinfer-headers-plugin-devel-${v} - +ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 #Add our own dependencies ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 index 326e15d58456a..09ab7951552a0 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 @@ -31,7 +31,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors # setup entrypoint, this will wrap commands with `linux32` with i686 images COPY build_scripts/install-entrypoint.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ @@ -40,7 +39,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint ENTRYPOINT ["manylinux-entrypoint"] COPY build_scripts/install-runtime-packages.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/ @@ -140,7 +138,6 @@ COPY --from=build_git /manylinux-rootfs / COPY --from=build_cpython /manylinux-rootfs / COPY --from=all_python /opt/_internal /opt/_internal/ COPY build_scripts/finalize.sh \ - build_scripts/update-system-packages.sh \ build_scripts/python-tag-abi-tag.py \ build_scripts/requirements3.8.txt \ build_scripts/requirements3.9.txt \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 index c211fa9b9e2b8..83a974469234f 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 @@ -7,40 +7,30 @@ # Build base image with required system packages FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 AS base -# The local directory into which to build and install CMAKE -ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update &&\ - apt-get install -y sudo git bash unattended-upgrades wget -RUN unattended-upgrade + apt-get install -y git bash wget # Install python3 RUN apt-get install -y --no-install-recommends \ python3 \ python3-pip \ python3-dev \ - python3-wheel &&\ - cd /usr/local/bin &&\ - ln -s /usr/bin/python3 python &&\ - ln -s /usr/bin/pip3 pip; + python3-wheel + RUN pip install --upgrade pip -RUN pip install setuptools>=41.0.0 # Install TensorRT RUN v="8.6.1.6-1+cuda11.8" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ - sudo apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} libnvinfer-lean8=${v} libnvinfer-vc-plugin8=${v} libnvinfer-dispatch8=${v}\ + apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} libnvinfer-lean8=${v} libnvinfer-vc-plugin8=${v} libnvinfer-dispatch8=${v}\ libnvinfer-headers-dev=${v} libnvinfer-headers-plugin-dev=${v} libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} libnvinfer-lean-dev=${v} libnvinfer-vc-plugin-dev=${v} libnvinfer-dispatch-dev=${v}\ python3-libnvinfer=${v} libnvinfer-samples=${v} tensorrt-dev=${v} tensorrt-libs=${v} -# Install Valgrind -RUN apt-get install -y valgrind - ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_dotnet.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh index a1ade39e57e16..adb0464d6496a 100755 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh @@ -1,9 +1,8 @@ #!/bin/bash set -e -x -os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1) +os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1) echo "installing for CentOS version : $os_major_version" - -dnf install -y glibc-langpack-\* glibc-locale-source which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel java-11-openjdk-devel graphviz gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran -locale \ No newline at end of file +dnf install -y python39-devel glibc-langpack-\* glibc-locale-source which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel java-11-openjdk-devel graphviz gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran +locale diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh index 7ecd0525c7e7e..7598ab0a7a536 100755 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh @@ -14,20 +14,20 @@ function GetFile { echo "File '$path' already exists. Skipping download" return 0 else - rm -rf $path + rm -rf "$path" fi fi if [[ -f $uri ]]; then echo "'$uri' is a file path, copying file to '$path'" - cp $uri $path + cp "$uri" "$path" return $? fi echo "Downloading $uri" # Use aria2c if available, otherwise use curl if command -v aria2c > /dev/null; then - aria2c -q -d $(dirname $path) -o $(basename $path) "$uri" + aria2c -q -d "$(dirname $path)" -o "$(basename $path)" "$uri" else curl "$uri" -sSL --retry $download_retries --retry-delay $retry_wait_time_seconds --create-dirs -o "$path" --fail fi @@ -38,9 +38,10 @@ mkdir -p /tmp/src cd /tmp/src +CPU_ARCH=$(uname -m) echo "Installing cmake" -GetFile https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.27.3-linux-`uname -m`.tar.gz -tar -zxf /tmp/src/cmake-3.27.3-linux-`uname -m`.tar.gz --strip=1 -C /usr +GetFile "https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-$CPU_ARCH.tar.gz" "/tmp/src/cmake.tar.gz" +tar -zxf /tmp/src/cmake.tar.gz --strip=1 -C /usr echo "Installing Ninja" GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz @@ -52,7 +53,7 @@ mv ./build-cmake/ninja /usr/bin popd echo "Installing Node.js" -CPU_ARCH=`uname -m` + if [[ "$CPU_ARCH" = "x86_64" ]]; then NODEJS_ARCH=x64 elif [[ "$CPU_ARCH" = "aarch64" ]]; then @@ -64,16 +65,5 @@ fi GetFile https://nodejs.org/dist/v18.17.1/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz tar --strip 1 -xf /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz -C /usr -# The Python version in CentOS 7's python3 package is no longer supported (3.6) so we will build Python from source. -echo "Installing Python" -PYTHON_VERSION="3.8.17" -GetFile https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz /tmp/src/Python-${PYTHON_VERSION}.tgz -tar -zxf Python-${PYTHON_VERSION}.tgz -pushd Python-${PYTHON_VERSION} -./configure -make -make install -popd - cd / rm -rf /tmp/src diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x64/default/cpu/Dockerfile index 0324f377b8e9e..caf9583807b62 100644 --- a/tools/ci_build/github/linux/docker/inference/x64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x64/default/cpu/Dockerfile @@ -5,10 +5,10 @@ ARG BASEIMAGE=amd64/almalinux:8 FROM $BASEIMAGE -ENV PATH /opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ENV PATH /usr/lib/jvm/msopenjdk-11/bin:/opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV LANG=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8 - +ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_centos.sh index 8e18a237a807e..b5f8bf1a49a19 100755 --- a/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_centos.sh +++ b/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_centos.sh @@ -1,9 +1,9 @@ #!/bin/bash set -e -x -os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1) +os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1) echo "installing for CentOS version : $os_major_version" - -dnf install -y python39-devel glibc-langpack-\* glibc-locale-source which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel java-11-openjdk-devel graphviz gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran -locale \ No newline at end of file +rpm -Uvh https://packages.microsoft.com/config/centos/$os_major_version/packages-microsoft-prod.rpm +dnf install -y python39-devel glibc-langpack-\* glibc-locale-source which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel msopenjdk-11 graphviz gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran +locale diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile index 386759890d085..318791072f46d 100644 --- a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile @@ -4,8 +4,10 @@ # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline FROM nvidia/cuda:11.8.0-cudnn8-devel-ubi8 +ENV PATH /usr/lib/jvm/msopenjdk-11/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV LANG=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8 +ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/scripts/install_centos.sh index 3cf259dc7240e..31e3e40f1b7ee 100755 --- a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/scripts/install_centos.sh +++ b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/scripts/install_centos.sh @@ -1,9 +1,9 @@ #!/bin/bash set -e -x -os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1) +os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1) echo "installing for CentOS version : $os_major_version" - -dnf install -y python39-devel python3-devel glibc-langpack-\* glibc-locale-source which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel java-11-openjdk-devel -locale \ No newline at end of file +rpm -Uvh https://packages.microsoft.com/config/centos/$os_major_version/packages-microsoft-prod.rpm +dnf install -y python39-devel glibc-langpack-\* glibc-locale-source which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel msopenjdk-11 +locale diff --git a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu index 33660cbb3f2e5..06e75ee1a39f6 100644 --- a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu @@ -26,7 +26,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors # setup entrypoint, this will wrap commands with `linux32` with i686 images COPY build_scripts/install-entrypoint.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ @@ -35,7 +34,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint ENTRYPOINT ["manylinux-entrypoint"] COPY build_scripts/install-runtime-packages.sh \ - build_scripts/update-system-packages.sh \ build_scripts/build_utils.sh \ /build_scripts/ RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/ @@ -132,7 +130,6 @@ COPY --from=build_git /manylinux-rootfs / COPY --from=build_cpython /manylinux-rootfs / COPY --from=all_python /opt/_internal /opt/_internal/ COPY build_scripts/finalize.sh \ - build_scripts/update-system-packages.sh \ build_scripts/python-tag-abi-tag.py \ build_scripts/requirements3.8.txt \ build_scripts/requirements3.9.txt \ diff --git a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_centos.sh index 98bb730a43776..c81e57c60c9da 100755 --- a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_centos.sh +++ b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_centos.sh @@ -1,11 +1,11 @@ #!/bin/bash set -e -os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1) +os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1) echo "installing for os major version : $os_major_version" dnf install -y glibc-langpack-\* -yum install -y which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget +yum install -y which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget # export PATH=/opt/python/cp38-cp38/bin:$PATH @@ -17,4 +17,4 @@ mkdir build cd build cmake .. cmake --install . -cd ../.. \ No newline at end of file +cd ../.. diff --git a/tools/ci_build/github/linux/docker/manylinux.patch b/tools/ci_build/github/linux/docker/manylinux.patch index f1821f9197525..75923e746f93c 100644 --- a/tools/ci_build/github/linux/docker/manylinux.patch +++ b/tools/ci_build/github/linux/docker/manylinux.patch @@ -94,7 +94,7 @@ index 9ef1e99..ec52833 100755 +fi \ No newline at end of file diff --git a/install-runtime-packages.sh b/install-runtime-packages.sh -index 137d2e2..4269afb 100755 +index 137d2e2..203b4bc 100755 --- a/install-runtime-packages.sh +++ b/install-runtime-packages.sh @@ -33,7 +33,7 @@ source $MY_DIR/build_utils.sh @@ -130,7 +130,7 @@ index 137d2e2..4269afb 100755 elif [ "${AUDITWHEEL_ARCH}" == "aarch64" ] || [ "${AUDITWHEEL_ARCH}" == "ppc64le" ] || [ "${AUDITWHEEL_ARCH}" == "s390x" ]; then # Software collection (for devtoolset-10) yum -y install centos-release-scl-rh -@@ -86,19 +88,18 @@ if [ "${AUDITWHEEL_POLICY}" == "manylinux2014" ]; then +@@ -86,19 +88,21 @@ if [ "${AUDITWHEEL_POLICY}" == "manylinux2014" ]; then fi elif [ "${AUDITWHEEL_POLICY}" == "manylinux_2_28" ]; then PACKAGE_MANAGER=dnf @@ -148,6 +148,9 @@ index 137d2e2..4269afb 100755 - TOOLCHAIN_DEPS="gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran" - if [ "${AUDITWHEEL_ARCH}" == "x86_64" ]; then - TOOLCHAIN_DEPS="${TOOLCHAIN_DEPS} yasm" ++ if test -f "/etc/yum.repos.d/ubi.repo"; then ++ sed -i 's/enabled\s*=\s*1/enabled = 1\nexclude=dotnet* aspnet* netstandard*/g' /etc/yum.repos.d/ubi.repo ++ fi + if [[ -d /usr/local/cuda ]]; then + TOOLCHAIN_DEPS="gcc gcc-c++" + else @@ -155,7 +158,7 @@ index 137d2e2..4269afb 100755 fi elif [ "${AUDITWHEEL_POLICY}" == "musllinux_1_1" ]; then TOOLCHAIN_DEPS="binutils gcc g++ gfortran" -@@ -121,12 +122,6 @@ else +@@ -121,12 +125,6 @@ else exit 1 fi diff --git a/tools/ci_build/github/linux/docker/scripts/install_dotnet.sh b/tools/ci_build/github/linux/docker/scripts/install_dotnet.sh index b9accb134b26d..c4689ed19c148 100755 --- a/tools/ci_build/github/linux/docker/scripts/install_dotnet.sh +++ b/tools/ci_build/github/linux/docker/scripts/install_dotnet.sh @@ -2,13 +2,15 @@ set -e -x if [ -f /etc/redhat-release ]; then - dnf update --refresh -y \ - && dnf install -y dotnet-sdk-6.0 + # If you found the following command went successfully but dotnet command still reports no sdk was found, most likely + # it was because the dotnet packages were installed from more than one dnf repos. + dnf install -y dotnet-sdk-6.0 dotnet-runtime-6.0 elif [ -f /etc/os-release ]; then # Get Ubuntu version - declare repo_version=$(if command -v lsb_release &> /dev/null; then lsb_release -r -s; else grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"'; fi) + declare repo_version + repo_version=$(if command -v lsb_release &> /dev/null; then lsb_release -r -s; else grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"'; fi) # Download Microsoft signing key and repository - wget https://packages.microsoft.com/config/ubuntu/$repo_version/packages-microsoft-prod.deb -O packages-microsoft-prod.deb + wget "https://packages.microsoft.com/config/ubuntu/$repo_version/packages-microsoft-prod.deb" -O packages-microsoft-prod.deb # Install Microsoft signing key and repository dpkg -i packages-microsoft-prod.deb # Clean up diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh index 4f544a50cb94d..63b953a95add6 100755 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh @@ -1,17 +1,18 @@ #!/bin/bash set -e -os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1) +os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1) echo "installing for os major version : $os_major_version" if [ "$os_major_version" -gt 7 ]; then PACKAGE_MANAGER="dnf" - $PACKAGE_MANAGER install -y which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget + $PACKAGE_MANAGER install -y which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget else PACKAGE_MANAGER="yum" - $PACKAGE_MANAGER install -y which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make libunwind bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget + $PACKAGE_MANAGER install -y which redhat-lsb-core expat-devel tar unzip zlib-devel make libunwind bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget fi +rpm -Uvh https://packages.microsoft.com/config/centos/$os_major_version/packages-microsoft-prod.rpm # Install Java # Install automatic documentation generation dependencies -$PACKAGE_MANAGER install -y java-11-openjdk-devel graphviz +$PACKAGE_MANAGER install -y msopenjdk-11 graphviz diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh index a1cb4be5b72c9..8c79918120d8d 100755 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh @@ -3,18 +3,20 @@ set -e -x # Development tools and libraries if [ -f /etc/redhat-release ]; then - yum update && yum -y install graphviz - os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1) + dnf -y install graphviz elif [ -f /etc/os-release ]; then apt-get update && apt-get install -y graphviz - os_major_version=$(cat /etc/os-release | tr -dc '0-9.'|cut -d \. -f1) else echo "Unsupported OS" exit 1 fi # Install dotnet -source $(cd "$(dirname "${BASH_SOURCE[0]}")/.." &> /dev/null && pwd)/install_dotnet.sh +LOCAL_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" +PARENT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." &> /dev/null && pwd)" +# ShellCheck is unable to follow dynamic paths, such as source "$somedir/file". +# shellcheck disable=SC1091 +source "$PARENT_DIR/install_dotnet.sh" if [ ! -d "/opt/conda/bin" ]; then PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11") @@ -22,23 +24,17 @@ else PYTHON_EXES=("/opt/conda/bin/python") fi -SYS_LONG_BIT=$(getconf LONG_BIT) mkdir -p /tmp/src -GLIBC_VERSION=$(getconf GNU_LIBC_VERSION | cut -f 2 -d \.) - -if [[ $SYS_LONG_BIT = "64" ]]; then - LIBDIR="lib64" -else - LIBDIR="lib" -fi cd /tmp/src -source $(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)/install_shared_deps.sh +# shellcheck disable=SC1091 +source "$LOCAL_DIR/install_shared_deps.sh" cd /tmp/src if ! [ -x "$(command -v protoc)" ]; then - source ${0/%install_deps.sh/..\/install_protobuf.sh} +# shellcheck disable=SC1091 + source "$PARENT_DIR/install_protobuf.sh" fi export ONNX_ML=1 @@ -46,7 +42,7 @@ export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=OFF -DONNX_WERROR=OFF" for PYTHON_EXE in "${PYTHON_EXES[@]}" do - ${PYTHON_EXE} -m pip install -r ${0/%install_deps\.sh/requirements\.txt} + ${PYTHON_EXE} -m pip install -r "${0/%install_deps\.sh/requirements\.txt}" done cd / diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_aten.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_aten.sh index ed220b487d06c..1f85f72aef423 100755 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_aten.sh +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_aten.sh @@ -11,7 +11,7 @@ else PYTHON_EXES=("/opt/conda/bin/python") fi -os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1) +os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1) SYS_LONG_BIT=$(getconf LONG_BIT) mkdir -p /tmp/src diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh index e141e0793a2bd..ad3366b0bb3b6 100755 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh @@ -11,7 +11,7 @@ else PYTHON_EXES=("/opt/conda/bin/python") fi -os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1) +os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1) SYS_LONG_BIT=$(getconf LONG_BIT) mkdir -p /tmp/src diff --git a/tools/ci_build/github/linux/run_python_dockertest.sh b/tools/ci_build/github/linux/run_python_dockertest.sh new file mode 100755 index 0000000000000..332dd9c7284c0 --- /dev/null +++ b/tools/ci_build/github/linux/run_python_dockertest.sh @@ -0,0 +1,29 @@ +#!/bin/bash +set -e -x +BUILD_CONFIG="Release" + +while getopts "i:d:x:c:" parameter_Option +do case "${parameter_Option}" +in +i) DOCKER_IMAGE=${OPTARG};; +d) DEVICE=${OPTARG};; +c) BUILD_CONFIG=${OPTARG};; +esac +done + +if [ $DEVICE = "GPU" ]; then + ADDITIONAL_DOCKER_PARAMETER="--gpus all" +fi + +mkdir -p $HOME/.onnx +docker run --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src \ + --volume $BUILD_BINARIESDIRECTORY:/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + -w /onnxruntime_src \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + $ADDITIONAL_DOCKER_PARAMETER \ + $DOCKER_IMAGE tools/ci_build/github/linux/run_python_tests.sh -d $DEVICE -c $BUILD_CONFIG diff --git a/tools/ci_build/github/linux/run_python_tests.sh b/tools/ci_build/github/linux/run_python_tests.sh index c11ea42cd0541..f080c7e8c39d8 100755 --- a/tools/ci_build/github/linux/run_python_tests.sh +++ b/tools/ci_build/github/linux/run_python_tests.sh @@ -15,7 +15,8 @@ c) BUILD_CONFIG=${OPTARG};; esac done -cd $BUILD_BINARIESDIRECTORY +export PATH=/opt/python/cp38-cp38/bin:$PATH +cd /build files=(whl/*.whl) FILE_NAME="${files[0]}" FILE_NAME=$(basename $FILE_NAME) @@ -23,7 +24,7 @@ PYTHON_PACKAGE_NAME=$(echo "$FILE_NAME" | cut -f 1 -d '-') echo "Package name:$PYTHON_PACKAGE_NAME" -BUILD_ARGS="--build_dir $BUILD_BINARIESDIRECTORY --config $BUILD_CONFIG --test --skip_submodule_sync --parallel --enable_lto --build_wheel " +BUILD_ARGS="--build_dir /build --config $BUILD_CONFIG --test --skip_submodule_sync --parallel --enable_lto --build_wheel " ARCH=$(uname -m) @@ -35,20 +36,15 @@ if [ $BUILD_DEVICE == "GPU" ]; then BUILD_ARGS="$BUILD_ARGS --use_cuda --use_tensorrt --cuda_version=11.8 --tensorrt_home=/usr --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8" fi # We assume the machine doesn't have gcc and python development header files, so we don't build onnxruntime from source -sudo rm -rf /build /onnxruntime_src -sudo ln -s $BUILD_SOURCESDIRECTORY /onnxruntime_src python3 -m pip install --upgrade pip -python3 -m pip uninstall -y $PYTHON_PACKAGE_NAME ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml onnx -qq # Install the packages that are needed for installing the onnxruntime python package -python3 -m pip install -r $BUILD_BINARIESDIRECTORY/$BUILD_CONFIG/requirements.txt +python3 -m pip install -r /build/$BUILD_CONFIG/requirements.txt # Install the packages that are needed for running test scripts -# Install the latest ONNX release which may contain not fixed bugs. However, it is what most people use. -python3 -m pip install onnx pytest +python3 -m pip install pytest # The "--no-index" flag is crucial. The local whl folder is just an additional source. Pypi's doc says "there is no # ordering in the locations that are searched" if we don't disable the default one with "--no-index" -python3 -m pip install --no-index --find-links $BUILD_BINARIESDIRECTORY/whl $PYTHON_PACKAGE_NAME -ln -s /data/models $BUILD_BINARIESDIRECTORY -cd $BUILD_BINARIESDIRECTORY/$BUILD_CONFIG +python3 -m pip install --no-index --find-links /build/whl $PYTHON_PACKAGE_NAME +cd /build/$BUILD_CONFIG # Restore file permissions xargs -a perms.txt chmod a+x -python3 $BUILD_SOURCESDIRECTORY/tools/ci_build/build.py $BUILD_ARGS --ctest_path '' +python3 /onnxruntime_src/tools/ci_build/build.py $BUILD_ARGS --ctest_path '' diff --git a/tools/scripts/python_test.sh b/tools/scripts/python_test.sh old mode 100644 new mode 100755 diff --git a/tools/scripts/symbolic_shape_infer_test.sh b/tools/scripts/symbolic_shape_infer_test.sh old mode 100644 new mode 100755