Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/microsoft/onnxruntime into …
Browse files (browse the repository at this point in the history)
…artif
  • Loading branch information
xadupre committed Nov 21, 2023
2 parents db4f6e2 + 29a409a commit e2c0839
Show file tree
Hide file tree
Showing 41 changed files with 988 additions and 200 deletions.
1 change: 0 additions & 1 deletion cmake/external/abseil-cpp.natvis
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
<Intrinsic Name="_capacity" Expression="_commonfields().capacity_"/>
<Intrinsic Name="_control" Expression="_commonfields().control_"/>
<Intrinsic Name="_slots" Expression="(slot_type*)(_commonfields().slots_)"/>
<DisplayString Condition="_size() == 0">empty</DisplayString>
<DisplayString IncludeView="noparens">size={ _size() }</DisplayString>
<DisplayString ExcludeView="noparens">size=({_size()})</DisplayString>
<Expand>
Expand Down
4 changes: 2 additions & 2 deletions docs/ContribOperators.md
Original file line number Diff line number Diff line change
Expand Up @@ -2649,8 +2649,8 @@ This version of the operator has been available since version 1 of the 'com.micr
#### Type Constraints

<dl>
<dt><tt>T1</tt> : tensor(float), tensor(float16)</dt>
<dd>Constrain input and output types to float/half_float tensors.</dd>
<dt><tt>T1</tt> : tensor(float), tensor(float16), tensor(bfloat16)</dt>
<dd>Constrain input and output types to float/half_float/brain_float tensors.</dd>
<dt><tt>T2</tt> : tensor(uint8)</dt>
<dd>Constrain quantized weight types to uint8.</dd>
</dl>
Expand Down
2 changes: 1 addition & 1 deletion docs/OperatorKernels.md
Original file line number Diff line number Diff line change
Expand Up @@ -840,7 +840,7 @@ Do not modify directly.*
|Inverse|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
|Irfft|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
|LongformerAttention|*in* input:**T**<br> *in* weight:**T**<br> *in* bias:**T**<br> *in* mask:**T**<br> *in* global_weight:**T**<br> *in* global_bias:**T**<br> *in* global:**G**<br> *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
|MatMulBnb4|*in* A:**T1**<br> *in* B:**T2**<br> *in* absmax:**T1**<br> *out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(uint8)|
|MatMulBnb4|*in* A:**T1**<br> *in* B:**T2**<br> *in* absmax:**T1**<br> *out* Y:**T1**|1+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)<br/> **T2** = tensor(uint8)|
|MatMulNBits|*in* A:**T1**<br> *in* B:**T2**<br> *in* scales:**T1**<br> *in* zero_points:**T2**<br> *out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(uint8)|
|MoE|*in* input:**T**<br> *in* router_probs:**T**<br> *in* fc1_experts_weights:**T**<br> *in* fc2_experts_weights:**T**<br> *in* fc1_experts_bias:**T**<br> *in* fc2_experts_bias:**T**<br> *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
|MultiHeadAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* bias:**T**<br> *in* key_padding_mask:**M**<br> *in* relative_position_bias:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**|1+|**T** = tensor(float), tensor(float16)|
Expand Down
70 changes: 60 additions & 10 deletions js/.eslintrc.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,18 @@

module.exports = {
root: true,
ignorePatterns: ['**/*.js', 'ort-schema/', 'common/test/type-tests/', 'test/data/', 'node_modules/', 'dist/'],
ignorePatterns: [
'**/*.js',
'node_modules/',
'ort-schema/',
'common/test/type-tests/',
'web/types.d.ts',
'test/data/',
'dist/',
],
env: { 'es6': true },
parser: '@typescript-eslint/parser',
parserOptions: { 'project': 'tsconfig.json', 'sourceType': 'module' },
parserOptions: { 'project': true, 'sourceType': 'module' },
plugins: ['@typescript-eslint', 'prefer-arrow', 'header', 'import', 'unicorn', 'jsdoc'],
rules: {
'unicorn/filename-case': 'error',
Expand Down Expand Up @@ -144,15 +152,56 @@ module.exports = {
'no-unused-expressions': 'off',
}
}, {
files: ['web/lib/**/*.ts'],
excludedFiles: 'web/lib/wasm/proxy-worker/**/*',
parserOptions: { 'project': 'web/tsconfig.json' },
rules: {
'no-underscore-dangle': 'off',
files: ['web/lib/**/*.ts'], rules: {
'no-underscore-dangle': ['error', {
'allow': [
'_free',
'_malloc',
'_JsepGetNodeName',
'_JsepOutput',
'_OrtAddFreeDimensionOverride',
'_OrtAddRunConfigEntry',
'_OrtAddSessionConfigEntry',
'_OrtAppendExecutionProvider',
'_OrtBindInput',
'_OrtBindOutput',
'_OrtClearBoundOutputs',
'_OrtCreateBinding',
'_OrtCreateRunOptions',
'_OrtCreateSession',
'_OrtCreateSessionOptions',
'_OrtCreateTensor',
'_OrtEndProfiling',
'_OrtFree',
'_OrtGetInputName',
'_OrtGetInputOutputCount',
'_OrtGetLastError',
'_OrtGetOutputName',
'_OrtGetTensorData',
'_OrtInit',
'_OrtReleaseBinding',
'_OrtReleaseRunOptions',
'_OrtReleaseSession',
'_OrtReleaseSessionOptions',
'_OrtReleaseTensor',
'_OrtRun',
'_OrtRunWithBinding',
'_OrtTrainingCopyParametersFromBuffer',
'_OrtTrainingCopyParametersToBuffer',
'_OrtTrainingCreateSession',
'_OrtTrainingEvalStep',
'_OrtTrainingGetModelInputOutputCount',
'_OrtTrainingGetModelInputOutputName',
'_OrtTrainingGetParametersSize',
'_OrtTrainingLazyResetGrad',
'_OrtTrainingLoadCheckpoint',
'_OrtTrainingOptimizerStep',
'_OrtTrainingReleaseCheckpoint',
'_OrtTrainingReleaseSession',
'_OrtTrainingRunTrainStep'
]
}]
}
}, {
files: ['web/lib/wasm/proxy-worker/**/*.ts'],
parserOptions: { 'project': 'web/lib/wasm/proxy-worker/tsconfig.json' },
}, {
files: ['web/lib/onnxjs/**/*.ts'], rules: {
// TODO: those rules are useful. should turn on them in future (webgl refactor)
Expand All @@ -164,6 +213,7 @@ module.exports = {
'import/no-internal-modules': 'off',
'prefer-arrow/prefer-arrow-functions': 'off',
'no-param-reassign': 'off',
'no-underscore-dangle': 'off',
'guard-for-in': 'off'
}
}, {
Expand Down
8 changes: 4 additions & 4 deletions js/web/lib/onnxjs/attribute-with-cache-key.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ class AttributeWithCacheKeyImpl {
Object.assign(this, attribute);
}

private _cacheKey: string;
private key: string;
public get cacheKey(): string {
if (!this._cacheKey) {
this._cacheKey =
if (!this.key) {
this.key =
Object.getOwnPropertyNames(this).sort().map(name => `${(this as Record<string, unknown>)[name]}`).join(';');
}
return this._cacheKey;
return this.key;
}
}

Expand Down
8 changes: 4 additions & 4 deletions js/web/lib/wasm/jsep/webgpu/attribute-with-cache-key.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ class AttributeWithCacheKeyImpl {
Object.assign(this, attribute);
}

private _cacheKey: string;
private key: string;
public get cacheKey(): string {
if (!this._cacheKey) {
this._cacheKey =
if (!this.key) {
this.key =
Object.getOwnPropertyNames(this).sort().map(name => `${(this as Record<string, unknown>)[name]}`).join(';');
}
return this._cacheKey;
return this.key;
}
}

Expand Down
1 change: 0 additions & 1 deletion js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new
['BiasSplitGelu', [biasSplitGelu]],
['Cast', [unaryOps.cast, unaryOps.parseCastAttributes]],
['Ceil', [unaryOps.ceil]],
['ClipV10', [unaryOps.clipV10]],
['Clip', [unaryOps.clip]],
['Concat', [concat, parseConcatAttributes]],
['Conv', [conv, parseConvAttributes]],
Expand Down
30 changes: 24 additions & 6 deletions js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ type BinaryFunctionCall = BuiltinFunctionName|BinaryCustomExpression|{

const createBinaryOpProgramShader =
(shaderHelper: ShaderHelper, dimsA: readonly number[], dimsB: readonly number[], dimsOutput: readonly number[],
vectorize: boolean, doBroadcast: boolean, funcCall: BinaryFunctionCall, typeA: number, typeB: number,
typeOutput: number, useShapesUniforms: boolean, additionalImplementation?: string) => {
vectorize: boolean, doBroadcast: boolean, sharedDimensionDivisibleBy4: boolean, funcCall: BinaryFunctionCall,
typeA: number, typeB: number, typeOutput: number, useShapesUniforms: boolean,
additionalImplementation?: string) => {
let expressionScalar: BinaryCustomExpression;
let expressionVector: BinaryCustomExpression;
if (typeof funcCall === 'string') {
Expand All @@ -42,6 +43,8 @@ const createBinaryOpProgramShader =
if (doBroadcast) {
const isAOneElement = ShapeUtil.size(dimsA) === 1;
const isBOneElement = ShapeUtil.size(dimsB) === 1;
const aLastDimDivisibleBy4 = dimsA.length > 0 && dimsA[dimsA.length - 1] % 4 === 0;
const bLastDimDivisibleBy4 = dimsB.length > 0 && dimsB[dimsB.length - 1] % 4 === 0;
if (isAOneElement || isBOneElement) {
assignment = output.setByOffset(
'global_idx',
Expand All @@ -55,7 +58,14 @@ const createBinaryOpProgramShader =
let offsetB = ${b.broadcastedIndicesToOffset('outputIndices', output)};
${
output.setByOffset(
'global_idx', expressionVector(a.getByOffset('offsetA / 4u'), b.getByOffset('offsetB / 4u')))}
'global_idx',
expressionVector(
sharedDimensionDivisibleBy4 || aLastDimDivisibleBy4 ?
a.getByOffset('offsetA / 4u') :
`${a.type.value}(${a.getByOffset('offsetA / 4u')}[offsetA % 4u])`,
sharedDimensionDivisibleBy4 || bLastDimDivisibleBy4 ?
b.getByOffset('offsetB / 4u') :
`${b.type.value}(${b.getByOffset('offsetB / 4u')}[offsetB % 4u])`))}
`;
}
} else {
Expand Down Expand Up @@ -118,6 +128,7 @@ const createBinaryOpProgramInfo =
let outputSize = ShapeUtil.size(a.dims);

let vectorize = false;
let sharedDimensionDivisibleBy4 = false;

// TODO: deal with zero-sized tensors (eg. dims=[1,0])
const cacheKeyAux = [isBroadcast];
Expand All @@ -130,8 +141,12 @@ const createBinaryOpProgramInfo =
outputSize = ShapeUtil.size(outputShape);
const isAOneElement = ShapeUtil.size(a.dims) === 1;
const isBOneElement = ShapeUtil.size(b.dims) === 1;
const aLastDimDivisibleBy4 = a.dims.length > 0 && a.dims[a.dims.length - 1] % 4 === 0;
const bLastDimDivisibleBy4 = b.dims.length > 0 && b.dims[b.dims.length - 1] % 4 === 0;
cacheKeyAux.push(isAOneElement);
cacheKeyAux.push(isBOneElement);
cacheKeyAux.push(aLastDimDivisibleBy4);
cacheKeyAux.push(bLastDimDivisibleBy4);
// check whether vectorize can be enabled
let sharedDimension = 1;
for (let i = 1; i < outputShape.length; i++) {
Expand All @@ -143,7 +158,10 @@ const createBinaryOpProgramInfo =
break;
}
}
if (sharedDimension % 4 === 0 || isAOneElement || isBOneElement) {
if (sharedDimension % 4 === 0) {
sharedDimensionDivisibleBy4 = true;
vectorize = true;
} else if (isAOneElement || isBOneElement || aLastDimDivisibleBy4 || bLastDimDivisibleBy4) {
vectorize = true;
}
} else {
Expand All @@ -160,8 +178,8 @@ const createBinaryOpProgramInfo =
inputDependencies: useShapesUniforms ? ['rank', 'rank'] : ['dims', 'dims'],
},
getShaderSource: (shaderHelper) => createBinaryOpProgramShader(
shaderHelper, a.dims, b.dims, outputShape, vectorize, isBroadcast, funcCall, a.dataType, b.dataType,
outputDataType, useShapesUniforms, additionalImplementation),
shaderHelper, a.dims, b.dims, outputShape, vectorize, isBroadcast, sharedDimensionDivisibleBy4, funcCall,
a.dataType, b.dataType, outputDataType, useShapesUniforms, additionalImplementation),
getRunData: () => ({
outputs: [{dims: outputShape, dataType: outputDataType}],
dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */ / 4 /* component size */)},
Expand Down
19 changes: 8 additions & 11 deletions js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,14 @@ export interface ClipAttributes extends AttributeWithCacheKey {
readonly max: number;
}

export const clipV10 = (context: ComputeContext, attributes: ClipAttributes): void => {
/**
 * Builds the ClipAttributes (min/max bounds) from the operator's input tensors.
 *
 * - `min` is element 0 of inputs[1] when at least two inputs are supplied, otherwise MIN_CLIP.
 * - `max` is element 0 of inputs[2] when at least three inputs are supplied, otherwise MAX_CLIP.
 *
 * The result is wrapped with createAttributeWithCacheKey so it carries a cache key.
 *
 * NOTE(review): both bounds are read through getFloat32Array(); this presumably assumes the min/max
 * tensors hold float32 data — confirm the behavior for other data types (e.g. float16).
 *
 * @param inputs the input tensor views of the node; inputs[0] is not read here.
 * @returns attributes holding the resolved `min` and `max`.
 */
const generateClipAttributesFromInputs = (inputs: readonly TensorView[]): ClipAttributes => {
// Fall back to the default bound when the corresponding optional input is not present.
const min = (inputs.length >= 2) ? inputs[1].getFloat32Array()[0] : MIN_CLIP;
const max = (inputs.length >= 3) ? inputs[2].getFloat32Array()[0] : MAX_CLIP;
return createAttributeWithCacheKey({min, max});
};

export const clip = (context: ComputeContext, clipAttributes: ClipAttributes): void => {
const attributes = context.inputs.length === 1 ? clipAttributes : generateClipAttributesFromInputs(context.inputs);
const dataType = tensorTypeToWsglStorageType(context.inputs[0].dataType);
context.compute(
createElementwiseProgramInfo(
Expand All @@ -135,16 +142,6 @@ export const clipV10 = (context: ComputeContext, attributes: ClipAttributes): vo
attributes.cacheKey),
{inputs: [0]});
};
const generateClipAttributesFromInputs = (inputs: readonly TensorView[]): ClipAttributes => {
const min = (inputs.length >= 2) ? inputs[1].getFloat32Array()[0] : MIN_CLIP;
const max = (inputs.length >= 3) ? inputs[2].getFloat32Array()[0] : MAX_CLIP;
return createAttributeWithCacheKey({min, max});
};

export const clip = (context: ComputeContext): void => {
const attributes = generateClipAttributesFromInputs(context.inputs);
clipV10(context, attributes);
};

export const ceil = (context: ComputeContext): void => {
context.compute(createElementwiseProgramInfo(context.inputs[0], 'Ceil', 'ceil'));
Expand Down
2 changes: 2 additions & 0 deletions onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Inverse);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MatMulNBits);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MatMulNBits);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, MatMulBnb4);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MatMulBnb4);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MatMulBnb4);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Trilu);
Expand Down Expand Up @@ -313,6 +314,7 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Inverse)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MatMulNBits)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MatMulNBits)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, MatMulBnb4)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MatMulBnb4)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MatMulBnb4)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BiasSoftmax)>,
Expand Down
30 changes: 18 additions & 12 deletions onnxruntime/contrib_ops/cuda/math/gemm_float8.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,23 @@ namespace onnxruntime {
namespace contrib {
namespace cuda {

#define REGISTER_KERNEL() \
ONNX_OPERATOR_KERNEL_EX( \
GemmFloat8, \
kMSDomain, \
1, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("TA", BuildKernelDefConstraints<Float8E4M3FN, Float8E5M2, MLFloat16, BFloat16, float>()) \
.TypeConstraint("TB", BuildKernelDefConstraints<Float8E4M3FN, Float8E5M2, MLFloat16, BFloat16, float>()) \
.TypeConstraint("TR", BuildKernelDefConstraints<Float8E4M3FN, Float8E5M2, MLFloat16, BFloat16, float>()) \
.TypeConstraint("TS", BuildKernelDefConstraints<float>()), \
#if !defined(DISABLE_FLOAT8_TYPES)
// Type list used for the "TA", "TB" and "TR" constraints of GemmFloat8 below.
// The float8 types (Float8E4M3FN, Float8E5M2) are listed only when DISABLE_FLOAT8_TYPES is not defined.
#define GEMM_FLOAT8_CONSTRAINTS BuildKernelDefConstraints<Float8E4M3FN, Float8E5M2, MLFloat16, BFloat16, float>()
#else
// Float8 types are disabled in this build: restrict the constraints to MLFloat16, BFloat16 and float.
#define GEMM_FLOAT8_CONSTRAINTS BuildKernelDefConstraints<MLFloat16, BFloat16, float>()
#endif

// Expands ONNX_OPERATOR_KERNEL_EX for the GemmFloat8 operator (kMSDomain, version 1) on
// kCudaExecutionProvider, binding it to the GemmFloat8 kernel class.
// "TA", "TB" and "TR" take the types in GEMM_FLOAT8_CONSTRAINTS; "TS" accepts float only.
#define REGISTER_KERNEL() \
ONNX_OPERATOR_KERNEL_EX( \
GemmFloat8, \
kMSDomain, \
1, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("TA", GEMM_FLOAT8_CONSTRAINTS) \
.TypeConstraint("TB", GEMM_FLOAT8_CONSTRAINTS) \
.TypeConstraint("TR", GEMM_FLOAT8_CONSTRAINTS) \
.TypeConstraint("TS", BuildKernelDefConstraints<float>()), \
GemmFloat8);

REGISTER_KERNEL()
Expand All @@ -38,7 +44,7 @@ GemmFloat8::GemmFloat8(const OpKernelInfo& info) : CudaKernel(info) {
alpha_ = info.GetAttrOrDefault<float>("alpha", 1);
beta_ = info.GetAttrOrDefault<float>("beta", 0);

#if (CUDA_VERSION <= 12000)
#if (CUDA_VERSION < 12000)
ORT_ENFORCE(beta_ == 0, "CUDA < 12.0 does not support bias, beta must be 0.");
#endif

Expand Down
Loading

0 comments on commit e2c0839

Please sign in to comment.