add arm64 bfloat16 fastmath mode option for transformers benchmarking script #19294

Merged (1 commit) on Feb 12, 2024
13 changes: 13 additions & 0 deletions onnxruntime/python/tools/transformers/benchmark.py
@@ -36,6 +36,8 @@
python benchmark.py -e torchscript onnxruntime -p "int8" -o
Run OnnxRuntime with the ROCM provider and graph optimization script:
python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm
Run OnnxRuntime with bfloat16 fastmath mode kernels on aarch64 platforms with bfloat16 support:
python benchmark.py --enable_arm64_bfloat16_fastmath_mlas_gemm

It is recommended to use run_benchmark.sh to launch benchmark.
"""
@@ -106,6 +108,7 @@ def run_onnxruntime(
use_raw_attention_mask,
model_fusion_statistics,
model_source,
enable_arm64_bfloat16_fastmath_mlas_gemm,
args,
):
import onnxruntime
@@ -209,6 +212,7 @@ def run_onnxruntime(
enable_all_optimization=True,
num_threads=num_threads,
verbose=verbose,
enable_mlas_gemm_fastmath_arm64_bfloat16=enable_arm64_bfloat16_fastmath_mlas_gemm,
)
if ort_session is None:
continue
@@ -764,6 +768,14 @@ def parse_arguments():
help="Manually set the model's layer number",
)

parser.add_argument(
"--enable_arm64_bfloat16_fastmath_mlas_gemm",
required=False,
action="store_true",
help="Enable bfloat16 MLAS GEMM kernels on aarch64. Supported only by the CPU execution provider.",
)
parser.set_defaults(enable_arm64_bfloat16_fastmath_mlas_gemm=False)
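
The argument added above follows the standard argparse boolean-flag pattern. A minimal standalone sketch (note that `action="store_true"` already defaults the value to `False`, so the `set_defaults` call is redundant but harmless, mirroring the script's style):

```python
import argparse

# Standalone sketch of the flag added in this PR; store_true already
# defaults the attribute to False, so set_defaults is belt-and-braces.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable_arm64_bfloat16_fastmath_mlas_gemm",
    required=False,
    action="store_true",
    help="Enable bfloat16 MLAS GEMM kernels on aarch64 (CPU EP only).",
)
parser.set_defaults(enable_arm64_bfloat16_fastmath_mlas_gemm=False)

# Omitting the flag yields False; passing it yields True.
print(parser.parse_args([]).enable_arm64_bfloat16_fastmath_mlas_gemm)
print(parser.parse_args(
    ["--enable_arm64_bfloat16_fastmath_mlas_gemm"]
).enable_arm64_bfloat16_fastmath_mlas_gemm)
```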

FusionOptions.add_arguments(parser)

args = parser.parse_args()
@@ -909,6 +921,7 @@ def main():
use_raw_attention_mask,
model_fusion_statistics,
args.model_source,
args.enable_arm64_bfloat16_fastmath_mlas_gemm,
args,
)
except Exception:
4 changes: 4 additions & 0 deletions onnxruntime/python/tools/transformers/benchmark_helper.py
@@ -85,6 +85,7 @@ def create_onnxruntime_session(
num_threads=-1,
enable_profiling=False,
verbose=False,
enable_mlas_gemm_fastmath_arm64_bfloat16=False,
provider_options={}, # map execution provider name to its option # noqa: B006
):
session = None
@@ -136,6 +137,9 @@
if provider_options:
providers = [(name, provider_options[name]) if name in provider_options else name for name in providers]

if enable_mlas_gemm_fastmath_arm64_bfloat16:
sess_options.add_session_config_entry("mlas.enable_gemm_fastmath_arm64_bfloat16", "1")

session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=providers)
except Exception:
logger.error("Exception", exc_info=True)
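
The helper's new branch reduces to setting a single ORT session-config entry. A minimal sketch using a hypothetical `session_config_entries` helper (not part of the PR) to show the mapping from the CLI flag to the config key; the real code sets the entry via `sess_options.add_session_config_entry` on an `onnxruntime.SessionOptions` object, as in the diff above:

```python
# Hypothetical helper (not in the PR) mirroring the branch added to
# create_onnxruntime_session: the flag maps to one session-config entry.
def session_config_entries(enable_mlas_gemm_fastmath_arm64_bfloat16=False):
    entries = {}
    if enable_mlas_gemm_fastmath_arm64_bfloat16:
        # Key taken verbatim from the diff; ORT config values are strings.
        entries["mlas.enable_gemm_fastmath_arm64_bfloat16"] = "1"
    return entries

print(session_config_entries(False))  # {}
print(session_config_entries(True))   # {'mlas.enable_gemm_fastmath_arm64_bfloat16': '1'}
```

Keeping the mode off by default matches the PR: bfloat16 fastmath trades accuracy for speed and only helps on aarch64 parts with bfloat16 support.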
9 changes: 8 additions & 1 deletion onnxruntime/python/tools/transformers/run_benchmark.sh
100644 → 100755
@@ -1,4 +1,4 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
@@ -34,6 +34,9 @@
run_cpu_fp32=false
run_cpu_int8=false

# Set this to true to enable bfloat16 fastmath gemm kernels on aarch64 platforms with bfloat16 support
arm64_bfloat16_fastmath_mode=false

average_over=1000
# CPU takes longer time to run, only run 100 inferences to get average latency.
if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then
@@ -63,7 +66,7 @@
# export CUDA_VISIBLE_DEVICES=1

# This script will generate a logs file with a list of commands used in tests.
echo echo "ort=$run_ort torch=$run_torch torch2=$run_torch2 torchscript=$run_torchscript tensorflow=$run_tensorflow gpu_fp32=$run_gpu_fp32 gpu_fp16=$run_gpu_fp16 cpu=$run_cpu optimizer=$use_optimizer batch=$batch_sizes sequence=$sequence_length models=$models_to_test" >> benchmark.log
echo echo "ort=$run_ort torch=$run_torch torch2=$run_torch2 torchscript=$run_torchscript tensorflow=$run_tensorflow gpu_fp32=$run_gpu_fp32 gpu_fp16=$run_gpu_fp16 cpu=$run_cpu optimizer=$use_optimizer batch=$batch_sizes sequence=$sequence_length models=$models_to_test arm64_bfloat16_fastmath_mode=$arm64_bfloat16_fastmath_mode" >> benchmark.log

# Set it to false to skip testing. You can use it to dry run this script with the log file.
run_tests=true
@@ -127,11 +130,15 @@
benchmark_options="$benchmark_options --force_num_layers $layer_number"
fi

if [ "$arm64_bfloat16_fastmath_mode" = true ] ; then
benchmark_options="$benchmark_options --enable_arm64_bfloat16_fastmath_mlas_gemm"
fi

# -------------------------------------------
run_one_test() {
if [ "$run_ort" = true ] ; then
echo python $benchmark_script -m $1 $onnx_export_options $2 $3 $4 >> benchmark.log

echo python $benchmark_script -m $1 $benchmark_options $2 $3 $4 -i $input_counts >> benchmark.log

if [ "$run_tests" = true ] ; then
python $benchmark_script -m $1 $onnx_export_options $2 $3 $4
python $benchmark_script -m $1 $benchmark_options $2 $3 $4 -i $input_counts