Merge branch 'main' of https://github.com/microsoft/onnxruntime into …

…artif
microsoft · Nov 21, 2023 · e2c0839 · e2c0839
2 parents db4f6e2 + 29a409a
commit e2c0839
Show file tree

Hide file tree

Showing 41 changed files with 988 additions and 200 deletions.
diff --git a/cmake/external/abseil-cpp.natvis b/cmake/external/abseil-cpp.natvis
@@ -30,7 +30,6 @@
     <Intrinsic Name="_capacity" Expression="_commonfields().capacity_"/>
     <Intrinsic Name="_control" Expression="_commonfields().control_"/>
     <Intrinsic Name="_slots" Expression="(slot_type*)(_commonfields().slots_)"/>
-    <DisplayString Condition="_size() == 0">empty</DisplayString>
     <DisplayString IncludeView="noparens">size={ _size() }</DisplayString>
     <DisplayString ExcludeView="noparens">size=({_size()})</DisplayString>
     <Expand>

diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
@@ -2649,8 +2649,8 @@ This version of the operator has been available since version 1 of the 'com.micr
 #### Type Constraints
 
 <dl>
-<dt><tt>T1</tt> : tensor(float), tensor(float16)</dt>
-<dd>Constrain input and output types to float/half_float tensors.</dd>
+<dt><tt>T1</tt> : tensor(float), tensor(float16), tensor(bfloat16)</dt>
+<dd>Constrain input and output types to float/half_float/brain_float tensors.</dd>
 <dt><tt>T2</tt> : tensor(uint8)</dt>
 <dd>Constrain quantized weight types to uint8.</dd>
 </dl>

diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
@@ -840,7 +840,7 @@ Do not modify directly.*
 |Inverse|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
 |Irfft|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
 |LongformerAttention|*in* input:**T**<br> *in* weight:**T**<br> *in* bias:**T**<br> *in* mask:**T**<br> *in* global_weight:**T**<br> *in* global_bias:**T**<br> *in* global:**G**<br> *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
-|MatMulBnb4|*in* A:**T1**<br> *in* B:**T2**<br> *in* absmax:**T1**<br> *out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(uint8)|
+|MatMulBnb4|*in* A:**T1**<br> *in* B:**T2**<br> *in* absmax:**T1**<br> *out* Y:**T1**|1+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)<br/> **T2** = tensor(uint8)|
 |MatMulNBits|*in* A:**T1**<br> *in* B:**T2**<br> *in* scales:**T1**<br> *in* zero_points:**T2**<br> *out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(uint8)|
 |MoE|*in* input:**T**<br> *in* router_probs:**T**<br> *in* fc1_experts_weights:**T**<br> *in* fc2_experts_weights:**T**<br> *in* fc1_experts_bias:**T**<br> *in* fc2_experts_bias:**T**<br> *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
 |MultiHeadAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* bias:**T**<br> *in* key_padding_mask:**M**<br> *in* relative_position_bias:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**|1+|**T** = tensor(float), tensor(float16)|

diff --git a/js/.eslintrc.js b/js/.eslintrc.js
@@ -5,10 +5,18 @@
 
 module.exports = {
   root: true,
-  ignorePatterns: ['**/*.js', 'ort-schema/', 'common/test/type-tests/', 'test/data/', 'node_modules/', 'dist/'],
+  ignorePatterns: [
+    '**/*.js',
+    'node_modules/',
+    'ort-schema/',
+    'common/test/type-tests/',
+    'web/types.d.ts',
+    'test/data/',
+    'dist/',
+  ],
   env: { 'es6': true },
   parser: '@typescript-eslint/parser',
-  parserOptions: { 'project': 'tsconfig.json', 'sourceType': 'module' },
+  parserOptions: { 'project': true, 'sourceType': 'module' },
   plugins: ['@typescript-eslint', 'prefer-arrow', 'header', 'import', 'unicorn', 'jsdoc'],
   rules: {
     'unicorn/filename-case': 'error',
@@ -144,15 +152,56 @@ module.exports = {
       'no-unused-expressions': 'off',
     }
   }, {
-    files: ['web/lib/**/*.ts'],
-    excludedFiles: 'web/lib/wasm/proxy-worker/**/*',
-    parserOptions: { 'project': 'web/tsconfig.json' },
-    rules: {
-      'no-underscore-dangle': 'off',
+    files: ['web/lib/**/*.ts'], rules: {
+      'no-underscore-dangle': ['error', {
+        'allow': [
+          '_free',
+          '_malloc',
+          '_JsepGetNodeName',
+          '_JsepOutput',
+          '_OrtAddFreeDimensionOverride',
+          '_OrtAddRunConfigEntry',
+          '_OrtAddSessionConfigEntry',
+          '_OrtAppendExecutionProvider',
+          '_OrtBindInput',
+          '_OrtBindOutput',
+          '_OrtClearBoundOutputs',
+          '_OrtCreateBinding',
+          '_OrtCreateRunOptions',
+          '_OrtCreateSession',
+          '_OrtCreateSessionOptions',
+          '_OrtCreateTensor',
+          '_OrtEndProfiling',
+          '_OrtFree',
+          '_OrtGetInputName',
+          '_OrtGetInputOutputCount',
+          '_OrtGetLastError',
+          '_OrtGetOutputName',
+          '_OrtGetTensorData',
+          '_OrtInit',
+          '_OrtReleaseBinding',
+          '_OrtReleaseRunOptions',
+          '_OrtReleaseSession',
+          '_OrtReleaseSessionOptions',
+          '_OrtReleaseTensor',
+          '_OrtRun',
+          '_OrtRunWithBinding',
+          '_OrtTrainingCopyParametersFromBuffer',
+          '_OrtTrainingCopyParametersToBuffer',
+          '_OrtTrainingCreateSession',
+          '_OrtTrainingEvalStep',
+          '_OrtTrainingGetModelInputOutputCount',
+          '_OrtTrainingGetModelInputOutputName',
+          '_OrtTrainingGetParametersSize',
+          '_OrtTrainingLazyResetGrad',
+          '_OrtTrainingLoadCheckpoint',
+          '_OrtTrainingOptimizerStep',
+          '_OrtTrainingReleaseCheckpoint',
+          '_OrtTrainingReleaseSession',
+          '_OrtTrainingRunTrainStep'
+        ]
+      }]
     }
-  }, {
-    files: ['web/lib/wasm/proxy-worker/**/*.ts'],
-    parserOptions: { 'project': 'web/lib/wasm/proxy-worker/tsconfig.json' },
   }, {
     files: ['web/lib/onnxjs/**/*.ts'], rules: {
       // TODO: those rules are useful. should turn on them in future (webgl refactor)
@@ -164,6 +213,7 @@ module.exports = {
       'import/no-internal-modules': 'off',
       'prefer-arrow/prefer-arrow-functions': 'off',
       'no-param-reassign': 'off',
+      'no-underscore-dangle': 'off',
       'guard-for-in': 'off'
     }
   }, {

diff --git a/js/web/lib/onnxjs/attribute-with-cache-key.ts b/js/web/lib/onnxjs/attribute-with-cache-key.ts
@@ -6,13 +6,13 @@ class AttributeWithCacheKeyImpl {
     Object.assign(this, attribute);
   }
 
-  private _cacheKey: string;
+  private key: string;
   public get cacheKey(): string {
-    if (!this._cacheKey) {
-      this._cacheKey =
+    if (!this.key) {
+      this.key =
           Object.getOwnPropertyNames(this).sort().map(name => `${(this as Record<string, unknown>)[name]}`).join(';');
     }
-    return this._cacheKey;
+    return this.key;
   }
 }
 

diff --git a/js/web/lib/wasm/jsep/webgpu/attribute-with-cache-key.ts b/js/web/lib/wasm/jsep/webgpu/attribute-with-cache-key.ts
@@ -6,13 +6,13 @@ class AttributeWithCacheKeyImpl {
     Object.assign(this, attribute);
   }
 
-  private _cacheKey: string;
+  private key: string;
   public get cacheKey(): string {
-    if (!this._cacheKey) {
-      this._cacheKey =
+    if (!this.key) {
+      this.key =
           Object.getOwnPropertyNames(this).sort().map(name => `${(this as Record<string, unknown>)[name]}`).join(';');
     }
-    return this._cacheKey;
+    return this.key;
   }
 }
 

diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -55,7 +55,6 @@ export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new
   ['BiasSplitGelu', [biasSplitGelu]],
   ['Cast', [unaryOps.cast, unaryOps.parseCastAttributes]],
   ['Ceil', [unaryOps.ceil]],
-  ['ClipV10', [unaryOps.clipV10]],
   ['Clip', [unaryOps.clip]],
   ['Concat', [concat, parseConcatAttributes]],
   ['Conv', [conv, parseConvAttributes]],

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts
@@ -17,8 +17,9 @@ type BinaryFunctionCall = BuiltinFunctionName|BinaryCustomExpression|{
 
 const createBinaryOpProgramShader =
     (shaderHelper: ShaderHelper, dimsA: readonly number[], dimsB: readonly number[], dimsOutput: readonly number[],
-     vectorize: boolean, doBroadcast: boolean, funcCall: BinaryFunctionCall, typeA: number, typeB: number,
-     typeOutput: number, useShapesUniforms: boolean, additionalImplementation?: string) => {
+     vectorize: boolean, doBroadcast: boolean, sharedDimensionDivisibleBy4: boolean, funcCall: BinaryFunctionCall,
+     typeA: number, typeB: number, typeOutput: number, useShapesUniforms: boolean,
+     additionalImplementation?: string) => {
       let expressionScalar: BinaryCustomExpression;
       let expressionVector: BinaryCustomExpression;
       if (typeof funcCall === 'string') {
@@ -42,6 +43,8 @@ const createBinaryOpProgramShader =
         if (doBroadcast) {
           const isAOneElement = ShapeUtil.size(dimsA) === 1;
           const isBOneElement = ShapeUtil.size(dimsB) === 1;
+          const aLastDimDivisibleBy4 = dimsA.length > 0 && dimsA[dimsA.length - 1] % 4 === 0;
+          const bLastDimDivisibleBy4 = dimsB.length > 0 && dimsB[dimsB.length - 1] % 4 === 0;
           if (isAOneElement || isBOneElement) {
             assignment = output.setByOffset(
                 'global_idx',
@@ -55,7 +58,14 @@ const createBinaryOpProgramShader =
             let offsetB = ${b.broadcastedIndicesToOffset('outputIndices', output)};
             ${
                 output.setByOffset(
-                    'global_idx', expressionVector(a.getByOffset('offsetA / 4u'), b.getByOffset('offsetB / 4u')))}
+                    'global_idx',
+                    expressionVector(
+                        sharedDimensionDivisibleBy4 || aLastDimDivisibleBy4 ?
+                            a.getByOffset('offsetA / 4u') :
+                            `${a.type.value}(${a.getByOffset('offsetA / 4u')}[offsetA % 4u])`,
+                        sharedDimensionDivisibleBy4 || bLastDimDivisibleBy4 ?
+                            b.getByOffset('offsetB / 4u') :
+                            `${b.type.value}(${b.getByOffset('offsetB / 4u')}[offsetB % 4u])`))}
           `;
           }
         } else {
@@ -118,6 +128,7 @@ const createBinaryOpProgramInfo =
       let outputSize = ShapeUtil.size(a.dims);
 
       let vectorize = false;
+      let sharedDimensionDivisibleBy4 = false;
 
       // TODO: deal with zero-sized tensors (eg. dims=[1,0])
       const cacheKeyAux = [isBroadcast];
@@ -130,8 +141,12 @@ const createBinaryOpProgramInfo =
         outputSize = ShapeUtil.size(outputShape);
         const isAOneElement = ShapeUtil.size(a.dims) === 1;
         const isBOneElement = ShapeUtil.size(b.dims) === 1;
+        const aLastDimDivisibleBy4 = a.dims.length > 0 && a.dims[a.dims.length - 1] % 4 === 0;
+        const bLastDimDivisibleBy4 = b.dims.length > 0 && b.dims[b.dims.length - 1] % 4 === 0;
         cacheKeyAux.push(isAOneElement);
         cacheKeyAux.push(isBOneElement);
+        cacheKeyAux.push(aLastDimDivisibleBy4);
+        cacheKeyAux.push(bLastDimDivisibleBy4);
         // check whether vectorize can be enabled
         let sharedDimension = 1;
         for (let i = 1; i < outputShape.length; i++) {
@@ -143,7 +158,10 @@ const createBinaryOpProgramInfo =
             break;
           }
         }
-        if (sharedDimension % 4 === 0 || isAOneElement || isBOneElement) {
+        if (sharedDimension % 4 === 0) {
+          sharedDimensionDivisibleBy4 = true;
+          vectorize = true;
+        } else if (isAOneElement || isBOneElement || aLastDimDivisibleBy4 || bLastDimDivisibleBy4) {
           vectorize = true;
         }
       } else {
@@ -160,8 +178,8 @@ const createBinaryOpProgramInfo =
           inputDependencies: useShapesUniforms ? ['rank', 'rank'] : ['dims', 'dims'],
         },
         getShaderSource: (shaderHelper) => createBinaryOpProgramShader(
-            shaderHelper, a.dims, b.dims, outputShape, vectorize, isBroadcast, funcCall, a.dataType, b.dataType,
-            outputDataType, useShapesUniforms, additionalImplementation),
+            shaderHelper, a.dims, b.dims, outputShape, vectorize, isBroadcast, sharedDimensionDivisibleBy4, funcCall,
+            a.dataType, b.dataType, outputDataType, useShapesUniforms, additionalImplementation),
         getRunData: () => ({
           outputs: [{dims: outputShape, dataType: outputDataType}],
           dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */ / 4 /* component size */)},

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts
@@ -124,7 +124,14 @@ export interface ClipAttributes extends AttributeWithCacheKey {
   readonly max: number;
 }
 
-export const clipV10 = (context: ComputeContext, attributes: ClipAttributes): void => {
+const generateClipAttributesFromInputs = (inputs: readonly TensorView[]): ClipAttributes => {
+  const min = (inputs.length >= 2) ? inputs[1].getFloat32Array()[0] : MIN_CLIP;
+  const max = (inputs.length >= 3) ? inputs[2].getFloat32Array()[0] : MAX_CLIP;
+  return createAttributeWithCacheKey({min, max});
+};
+
+export const clip = (context: ComputeContext, clipAttributes: ClipAttributes): void => {
+  const attributes = context.inputs.length === 1 ? clipAttributes : generateClipAttributesFromInputs(context.inputs);
   const dataType = tensorTypeToWsglStorageType(context.inputs[0].dataType);
   context.compute(
       createElementwiseProgramInfo(
@@ -135,16 +142,6 @@ export const clipV10 = (context: ComputeContext, attributes: ClipAttributes): vo
           attributes.cacheKey),
       {inputs: [0]});
 };
-const generateClipAttributesFromInputs = (inputs: readonly TensorView[]): ClipAttributes => {
-  const min = (inputs.length >= 2) ? inputs[1].getFloat32Array()[0] : MIN_CLIP;
-  const max = (inputs.length >= 3) ? inputs[2].getFloat32Array()[0] : MAX_CLIP;
-  return createAttributeWithCacheKey({min, max});
-};
-
-export const clip = (context: ComputeContext): void => {
-  const attributes = generateClipAttributesFromInputs(context.inputs);
-  clipV10(context, attributes);
-};
 
 export const ceil = (context: ComputeContext): void => {
   context.compute(createElementwiseProgramInfo(context.inputs[0], 'Ceil', 'ceil'));

diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
@@ -121,6 +121,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Inverse);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MatMulNBits);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MatMulNBits);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, MatMulBnb4);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MatMulBnb4);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MatMulBnb4);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Trilu);
@@ -313,6 +314,7 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
     BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Inverse)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MatMulNBits)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MatMulNBits)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, MatMulBnb4)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, MatMulBnb4)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, MatMulBnb4)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BiasSoftmax)>,

diff --git a/onnxruntime/contrib_ops/cuda/math/gemm_float8.cc b/onnxruntime/contrib_ops/cuda/math/gemm_float8.cc
@@ -14,17 +14,23 @@ namespace onnxruntime {
 namespace contrib {
 namespace cuda {
 
-#define REGISTER_KERNEL()                                                                                          \
-  ONNX_OPERATOR_KERNEL_EX(                                                                                         \
-      GemmFloat8,                                                                                                  \
-      kMSDomain,                                                                                                   \
-      1,                                                                                                           \
-      kCudaExecutionProvider,                                                                                      \
-      (*KernelDefBuilder::Create())                                                                                \
-          .TypeConstraint("TA", BuildKernelDefConstraints<Float8E4M3FN, Float8E5M2, MLFloat16, BFloat16, float>()) \
-          .TypeConstraint("TB", BuildKernelDefConstraints<Float8E4M3FN, Float8E5M2, MLFloat16, BFloat16, float>()) \
-          .TypeConstraint("TR", BuildKernelDefConstraints<Float8E4M3FN, Float8E5M2, MLFloat16, BFloat16, float>()) \
-          .TypeConstraint("TS", BuildKernelDefConstraints<float>()),                                               \
+#if !defined(DISABLE_FLOAT8_TYPES)
+#define GEMM_FLOAT8_CONSTRAINTS BuildKernelDefConstraints<Float8E4M3FN, Float8E5M2, MLFloat16, BFloat16, float>()
+#else
+#define GEMM_FLOAT8_CONSTRAINTS BuildKernelDefConstraints<MLFloat16, BFloat16, float>()
+#endif
+
+#define REGISTER_KERNEL()                                            \
+  ONNX_OPERATOR_KERNEL_EX(                                           \
+      GemmFloat8,                                                    \
+      kMSDomain,                                                     \
+      1,                                                             \
+      kCudaExecutionProvider,                                        \
+      (*KernelDefBuilder::Create())                                  \
+          .TypeConstraint("TA", GEMM_FLOAT8_CONSTRAINTS)             \
+          .TypeConstraint("TB", GEMM_FLOAT8_CONSTRAINTS)             \
+          .TypeConstraint("TR", GEMM_FLOAT8_CONSTRAINTS)             \
+          .TypeConstraint("TS", BuildKernelDefConstraints<float>()), \
       GemmFloat8);
 
 REGISTER_KERNEL()
@@ -38,7 +44,7 @@ GemmFloat8::GemmFloat8(const OpKernelInfo& info) : CudaKernel(info) {
   alpha_ = info.GetAttrOrDefault<float>("alpha", 1);
   beta_ = info.GetAttrOrDefault<float>("beta", 0);
 
-#if (CUDA_VERSION <= 12000)
+#if (CUDA_VERSION < 12000)
   ORT_ENFORCE(beta_ == 0, "CUDA < 12.0 does not support bias, beta must be 0.");
 #endif