Merge remote-tracking branch 'origin/main' into skottmckay/CheckInitializerIsConstInNNAPI
microsoft · Feb 6, 2024 · 9ca6ac9 · 9ca6ac9
2 parents 1a4da1a + bedf0ee
commit 9ca6ac9
Show file tree

Hide file tree

Showing 83 changed files with 2,693 additions and 298 deletions.
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
@@ -7,7 +7,7 @@ jobs:
   triage:
     runs-on: ubuntu-latest
     steps:
-    - uses: github/issue-labeler@v3.3
+    - uses: github/issue-labeler@v3.4
       with:
         repo-token: "${{ secrets.GITHUB_TOKEN }}"
         configuration-path: .github/labeler.yml

diff --git a/.github/workflows/publish-java-apidocs.yml b/.github/workflows/publish-java-apidocs.yml
@@ -30,7 +30,7 @@ jobs:
           java-version: '11'
           distribution: 'adopt'
       - name: Build with Gradle
-        uses: gradle/gradle-build-action@v2
+        uses: gradle/gradle-build-action@v3
         with:
           build-root-directory: java
           gradle-executable: java/gradlew

diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
@@ -473,6 +473,9 @@ file(GLOB onnxruntime_python_transformers_models_llama_src CONFIGURE_DEPENDS
 file(GLOB onnxruntime_python_transformers_models_longformer_src CONFIGURE_DEPENDS
     "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/longformer/*.py"
 )
+file(GLOB onnxruntime_python_transformers_models_phi2_src CONFIGURE_DEPENDS
+    "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/phi2/*.py"
+)
 file(GLOB onnxruntime_python_transformers_models_stable_diffusion_src CONFIGURE_DEPENDS
     "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/stable_diffusion/*.py"
 )
@@ -543,6 +546,7 @@ add_custom_command(
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/gpt2
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/llama
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/longformer
+  COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/phi2
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/stable_diffusion
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/t5
   COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/whisper
@@ -646,6 +650,9 @@ add_custom_command(
   COMMAND ${CMAKE_COMMAND} -E copy
       ${onnxruntime_python_transformers_models_longformer_src}
       $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/longformer/
+  COMMAND ${CMAKE_COMMAND} -E copy
+      ${onnxruntime_python_transformers_models_phi2_src}
+      $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/phi2/
   COMMAND ${CMAKE_COMMAND} -E copy
       ${onnxruntime_python_transformers_models_stable_diffusion_src}
       $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/stable_diffusion/

diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h b/include/onnxruntime/core/providers/cuda/cuda_context.h
@@ -37,6 +37,7 @@ struct CudaContext : public CustomOpContext {
   bool cudnn_conv1d_pad_to_nc1d = false;
   bool enable_skip_layer_norm_strict_mode = false;
   bool prefer_nhwc = false;
+  bool use_tf32 = true;
 
   void Init(const OrtKernelContext& kernel_ctx) {
     cuda_stream = FetchResource<cudaStream_t>(kernel_ctx, CudaResource::cuda_stream_t);
@@ -52,6 +53,7 @@ struct CudaContext : public CustomOpContext {
     cudnn_conv1d_pad_to_nc1d = FetchResource<bool>(kernel_ctx, CudaResource::cudnn_conv1d_pad_to_nc1d_t);
     enable_skip_layer_norm_strict_mode = FetchResource<bool>(kernel_ctx, CudaResource::enable_skip_layer_norm_strict_mode_t);
     prefer_nhwc = FetchResource<bool>(kernel_ctx, CudaResource::prefer_nhwc_t);
+    use_tf32 = FetchResource<bool>(kernel_ctx, CudaResource::use_tf32_t);
   }
 
   template <typename T>

diff --git a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h
@@ -37,4 +37,5 @@ struct OrtCUDAProviderOptionsV2 {
                                                                                                                // The strict mode has better accuracy but lower performance.
   int prefer_nhwc = 0;                                                                                         // make the CUDA EP NHWC preferred
   int use_ep_level_unified_stream = 0;                                                                         // flag specifying if ep level stream is used or not
+  int use_tf32 = 1;                                                                                            // use TF32
 };
diff --git a/include/onnxruntime/core/providers/cuda/cuda_resource.h b/include/onnxruntime/core/providers/cuda/cuda_resource.h
@@ -18,4 +18,5 @@ enum CudaResource : int {
   cudnn_conv1d_pad_to_nc1d_t,
   enable_skip_layer_norm_strict_mode_t,
   prefer_nhwc_t,
+  use_tf32_t,
 };
diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md
@@ -41,6 +41,7 @@ Do not modify directly.*
 | Erf | ai.onnx(9-12,13+) |  |
 | Exp | ai.onnx(6-12,13+) |  |
 | Expand | ai.onnx(8-12,13+) |  |
+| FastGelu | com.microsoft(1+) |  |
 | Flatten | ai.onnx(1-8,9-10,11-12,13+) |  |
 | Floor | ai.onnx(6-12,13+) |  |
 | FusedConv | com.microsoft(1+) |  |

diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -13,6 +13,7 @@ import {convTranspose, parseConvTransposeAttributes} from './ops/conv-transpose'
 import {cumsum, parseCumSumAttributes} from './ops/cumsum';
 import {einsum, parseEinsumAttributes} from './ops/einsum';
 import {expand} from './ops/expand';
+import {fastGelu} from './ops/fast-gelu';
 import {gather, parseGatherAttributes} from './ops/gather';
 import {gatherElements, parseGatherElementsAttributes} from './ops/gather-elements';
 import {gemm, parseGemmAttributes} from './ops/gemm';
@@ -72,6 +73,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new
   ['Erf', [unaryOps.erf]],
   ['Exp', [unaryOps.exp]],
   ['Expand', [expand]],
+  ['FastGelu', [fastGelu]],
   ['Floor', [unaryOps.floor]],
   ['FusedConv', [conv, parseConvAttributes]],
   ['Gather', [gather, parseGatherAttributes]],

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/bias-split-gelu.ts b/js/web/lib/wasm/jsep/webgpu/ops/bias-split-gelu.ts
@@ -43,7 +43,7 @@ const createBiasSplitGeluProgramInfo = (inputs: readonly TensorView[]): ProgramI
 
   ${shaderHelper.declareVariables(input, bias, output)}
 
-  ${erfImpl(`vec4<${dataType}>`, dataType)}
+  ${erfImpl(dataType)}
 
   ${shaderHelper.mainStart()}
     ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/fast-gelu.ts b/js/web/lib/wasm/jsep/webgpu/ops/fast-gelu.ts
@@ -0,0 +1,69 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {DataType} from '../../../wasm-common';
+import {TensorView} from '../../tensor-view';
+import {ShapeUtil} from '../../util';
+import {ComputeContext, ProgramInfo} from '../types';
+
+import {inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglValueType, UniformsArrayType, WORKGROUP_SIZE} from './common';
+import * as unary from './unary-op';
+
+// FastGelu: Y = 0.5 * X * (1 + tanh(0.797885 * X + 0.035677 * X * X * X)), where X is the input plus any bias.
+
+/**
+ * Builds the ProgramInfo for FastGelu when a bias tensor is supplied as `inputTensors[1]`.
+ *
+ * The generated shader loads `x` at `global_idx`, loads the matching bias values, adds them, and stores
+ * `unary.fastGeluExpression(x + bias)` to `y`. The x/bias/y variables are all created with a trailing argument
+ * of 4 (presumably the vec4 component count - confirm against './common').
+ *
+ * NOTE(review): `outputSize` and `biasLength` are captured from `inputTensors` here, while the output dims in
+ * getRunData come from its own `inputs` argument - presumably the same tensors; confirm with the caller.
+ *
+ * @param inputTensors - `[0]` is the data tensor and `[1]` the bias; both are indexed unconditionally.
+ * @returns program info whose single output has the dims and data type of `inputs[0]`.
+ */
+const createFastGeluProgramInfo = (inputTensors: readonly TensorView[]): ProgramInfo => {
+  const dataType = inputTensors[0].dataType;
+  const outputSize = ShapeUtil.size(inputTensors[0].dims);
+  const biasLength = ShapeUtil.size(inputTensors[1].dims);
+  // A whole 4-element group of bias is loaded at once only when the bias length is a multiple of 4;
+  // otherwise the four lanes are gathered one element at a time (see singleElementBias below).
+  const useVec4 = biasLength % 4 === 0;
+  const getShaderSource = (shaderHelper: ShaderHelper): string => {
+    const x = inputVariable('x', dataType, [1], 4);
+    const bias = inputVariable('bias', dataType, [1], 4);
+    const y = outputVariable('y', dataType, [1], 4);
+
+    // The values for these are supplied by programUniforms in getRunData, listed in the same order:
+    // ceil(outputSize / 4) and biasLength.
+    const uniforms: UniformsArrayType = [{name: 'output_vec_size', type: 'u32'}, {name: 'bias_size', type: 'u32'}];
+
+    // WGSL snippet that loads lane `i` of the bias: the flat element index (global_idx * 4 + i) is wrapped by
+    // bias_size, then the component is picked out of the 4-element group that contains it.
+    const singleElementBias = (i: 0|1|2|3) => `
+      let bias${i}_offset: u32 = (global_idx * 4 + ${i}) % uniforms.bias_size;
+      let bias${i} = ${bias.getByOffset(`bias${i}_offset / 4`)}[bias${i}_offset % 4];`;
+    // Both branches define a WGSL value named `bias`; the index is taken modulo the bias size, so the bias
+    // repeats cyclically over the flattened input.
+    const biasGetExpression = useVec4 ?
+        `
+      let bias = ${bias.getByOffset('global_idx % (uniforms.bias_size / 4)')};` :
+        `${singleElementBias(0)}${singleElementBias(1)}${singleElementBias(2)}${singleElementBias(3)}
+      let bias = ${x.type.value}(bias0, bias1, bias2, bias3);`;
+
+    // Shader layout: uniform/variable declarations, the fastGeluImpl helpers, then a main body that is guarded
+    // by output_vec_size.
+    return `${shaderHelper.registerUniforms(uniforms).declareVariables(x, bias, y)}
+
+    ${unary.fastGeluImpl(tensorTypeToWsglValueType(dataType))}
+
+    ${shaderHelper.mainStart(WORKGROUP_SIZE)}
+      ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_vec_size')}
+
+      let x = ${x.getByOffset('global_idx')};
+      ${biasGetExpression}
+      let x_in = x + bias;
+      ${y.setByOffset('global_idx', unary.fastGeluExpression('x_in'))}
+    }`;
+  };
+
+  return {
+    name: 'FastGeluWithBias',
+    // useVec4 selects between two different shader sources, so it is part of the cache hint.
+    shaderCache: {hint: `${useVec4}`, inputDependencies: ['type', 'type']},
+    getShaderSource,
+    getRunData: (inputs) => ({
+      outputs: [{dims: inputs[0].dims, dataType: inputs[0].dataType}],
+      programUniforms:
+          [{type: DataType.uint32, data: Math.ceil(outputSize / 4)}, {type: DataType.uint32, data: biasLength}],
+      // Each invocation covers 4 elements, hence the extra division by 4.
+      dispatchGroup: {x: Math.ceil(outputSize / WORKGROUP_SIZE / 4)}
+    })
+  };
+};
+
+/**
+ * FastGelu kernel entry point.
+ *
+ * When there is no second input, or the second input has a ShapeUtil.size of 0, this delegates to the
+ * element-wise `unary.fastGelu`. Otherwise it runs the program built by createFastGeluProgramInfo, which adds
+ * the second input as a bias before applying the activation.
+ */
+export const fastGelu = (context: ComputeContext): void => {
+  if (context.inputs.length < 2 || ShapeUtil.size(context.inputs[1].dims) === 0) {
+    unary.fastGelu(context);
+  } else {
+    context.compute(createFastGeluProgramInfo(context.inputs));
+  }
+};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts
@@ -178,24 +178,23 @@ export const elu = (context: ComputeContext, attributes: AlphaAttributes): void
       attributes.cacheKey));
 };
 
-export const erfImpl = (dataType: string, varType = 'f32') => `
+export const erfImpl = (varType = 'f32') => `
 const r0: ${varType} = 0.3275911;
 const r1: ${varType} = 0.254829592;
 const r2: ${varType} = -0.284496736;
 const r3: ${varType} = 1.421413741;
 const r4: ${varType} = -1.453152027;
 const r5: ${varType} = 1.061405429;
 
-fn erf_vf32(v: ${dataType}) -> ${dataType} {
+fn erf_vf32(v: vec4<${varType}>) -> vec4<${varType}> {
   let absv = abs(v);
   let x = 1.0 / (1.0 + r0 * absv);
   return sign(v) * (1.0 - ((((r5 * x + r4) * x + r3) * x + r2) * x + r1) * x * exp(-absv * absv));
 }`;
 
 export const erf = (context: ComputeContext): void => {
   const dataType = tensorTypeToWsglValueType(context.inputs[0].dataType);
-  context.compute(createElementwiseProgramInfo(
-      context.inputs[0], 'Erf', a => `erf_vf32(${a})`, erfImpl(`vec4<${dataType}>`, dataType)));
+  context.compute(createElementwiseProgramInfo(context.inputs[0], 'Erf', a => `erf_vf32(${a})`, erfImpl(dataType)));
 };
 
 export const exp = (context: ComputeContext): void => {
@@ -209,8 +208,7 @@ export const floor = (context: ComputeContext): void => {
 export const gelu = (context: ComputeContext): void => {
   const dataType = tensorTypeToWsglValueType(context.inputs[0].dataType);
   context.compute(createElementwiseProgramInfo(
-      context.inputs[0], 'Gelu', a => `0.5 * ${a} * (1.0 + erf_vf32(${a} * 0.7071067811865475))`,
-      erfImpl(`vec4<${dataType}>`, dataType)));
+      context.inputs[0], 'Gelu', a => `0.5 * ${a} * (1.0 + erf_vf32(${a} * 0.7071067811865475))`, erfImpl(dataType)));
 };
 
 export const leakyRelu = (context: ComputeContext, attributes: AlphaAttributes): void => {
@@ -278,10 +276,31 @@ export const tan = (context: ComputeContext): void => {
   context.compute(createElementwiseProgramInfo(context.inputs[0], 'Tan', 'tan'));
 };
 
+/**
+ * Returns a WGSL expression string of the form `sign(a) * (1 - exp(-2 * abs(a))) / (1 + exp(-2 * abs(a)))`,
+ * which is algebraically equal to tanh(a). The text of `a` is spliced in three times, so it is evaluated at
+ * each occurrence.
+ */
+export const tanhExpression = (a: string) => `sign(${a}) * (1 - exp(-2 * abs(${a}))) / (1 + exp(-2 * abs(${a})))`;
+
 export const tanh = (context: ComputeContext): void => {
   // TODO: revisit after https://github.com/gpuweb/gpuweb/issues/4458 is resolved
+  context.compute(createElementwiseProgramInfo(context.inputs[0], 'Tanh', tanhExpression));
+};
+
+/**
+ * Returns the WGSL declarations that fastGeluExpression relies on: the constants fast_gelu_a, fast_gelu_b and
+ * fast_gelu_c typed as `varType`, and `tanh_v`, a function over vec4<varType> whose body is tanhExpression('v').
+ *
+ * @param varType - WGSL scalar type name used for the constants and the vector element type; defaults to 'f32'.
+ */
+export const fastGeluImpl = (varType = 'f32') => `
+const fast_gelu_a: ${varType} = 0.5;
+const fast_gelu_b: ${varType} = 0.7978845608028654;
+const fast_gelu_c: ${varType} = 0.035677408136300125;
+
+fn tanh_v(v: vec4<${varType}>) -> vec4<${varType}> {
+  return ${tanhExpression('v')};
+}
+`;
+
+/**
+ * Returns the WGSL expression for FastGelu of `x`. With the constants declared by fastGeluImpl it is
+ * algebraically 0.5 * x * (1 + tanh(fast_gelu_b * x + fast_gelu_c * x * x * x)). The expression references
+ * fast_gelu_a/b/c and tanh_v, so the shader must also contain the fastGeluImpl declarations.
+ */
+export const fastGeluExpression = (x: string) =>
+    `(fast_gelu_a + fast_gelu_a * tanh_v(${x} * (fast_gelu_c * ${x} * ${x} + fast_gelu_b))) * ${x}`;
+
+/**
+ * Element-wise FastGelu with no bias: runs createElementwiseProgramInfo on input 0 with fastGeluExpression as the
+ * per-element expression and fastGeluImpl(dataType) as the accompanying shader declarations. The fifth argument
+ * is left `undefined` and the input's data type is passed last (presumably the output data type - confirm
+ * against createElementwiseProgramInfo's signature).
+ */
+export const fastGelu = (context: ComputeContext): void => {
+  const dataType = tensorTypeToWsglValueType(context.inputs[0].dataType);
   context.compute(createElementwiseProgramInfo(
-      context.inputs[0], 'Tanh', a => `sign(${a}) * (1 - exp(-2 * abs(${a}))) / (1 + exp(-2 * abs(${a})))`));
+      context.inputs[0], 'FastGelu', fastGeluExpression, fastGeluImpl(dataType), undefined,
+      context.inputs[0].dataType));
 };
 
 export const thresholdedRelu = (context: ComputeContext, attributes: AlphaAttributes): number => {