diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index f506516442b1e..17c53957ad54c 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -36,6 +36,8 @@ python benchmark.py -e torchscript onnxruntime -p "int8" -o Run OnnxRuntime with the ROCM provider and graph optimization script: python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm + Run OnnxRuntime with bfloat16 fastmath mode kernels on aarch64 platforms with bfloat16 support: + python benchmark.py --enable_arm64_bfloat16_fastmath_mlas_gemm It is recommended to use run_benchmark.sh to launch benchmark. """ @@ -106,6 +108,7 @@ def run_onnxruntime( use_raw_attention_mask, model_fusion_statistics, model_source, + enable_arm64_bfloat16_fastmath_mlas_gemm, args, ): import onnxruntime @@ -209,6 +212,7 @@ def run_onnxruntime( enable_all_optimization=True, num_threads=num_threads, verbose=verbose, + enable_mlas_gemm_fastmath_arm64_bfloat16=enable_arm64_bfloat16_fastmath_mlas_gemm, ) if ort_session is None: continue @@ -764,6 +768,14 @@ def parse_arguments(): help="Manually set the model's layer number", ) + parser.add_argument( + "--enable_arm64_bfloat16_fastmath_mlas_gemm", + required=False, + action="store_true", + help="Enable bfloat16 mlas gemm kernels on aarch64. Supported only for CPU EP ", + ) + parser.set_defaults(enable_arm64_bfloat16_fastmath_mlas_gemm=False) + FusionOptions.add_arguments(parser) args = parser.parse_args() @@ -909,6 +921,7 @@ def main(): use_raw_attention_mask, model_fusion_statistics, args.model_source, + args.enable_arm64_bfloat16_fastmath_mlas_gemm, args, ) except Exception: diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index b6f7a44450c62..018495829f02c 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -85,6 +85,7 @@ def create_onnxruntime_session( num_threads=-1, enable_profiling=False, verbose=False, + enable_mlas_gemm_fastmath_arm64_bfloat16=False, provider_options={}, # map execution provider name to its option # noqa: B006 ): session = None @@ -136,6 +137,9 @@ def create_onnxruntime_session( if provider_options: providers = [(name, provider_options[name]) if name in provider_options else name for name in providers] + if enable_mlas_gemm_fastmath_arm64_bfloat16: + sess_options.add_session_config_entry("mlas.enable_gemm_fastmath_arm64_bfloat16", "1") + session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=providers) except Exception: logger.error("Exception", exc_info=True) diff --git a/onnxruntime/python/tools/transformers/run_benchmark.sh b/onnxruntime/python/tools/transformers/run_benchmark.sh old mode 100644 new mode 100755 index f0422839c11eb..64d6ecde618f6 --- a/onnxruntime/python/tools/transformers/run_benchmark.sh +++ b/onnxruntime/python/tools/transformers/run_benchmark.sh @@ -34,6 +34,9 @@ run_gpu_fp16=true run_cpu_fp32=false run_cpu_int8=false +# Set this to true to enable bfloat16 fastmath gemm kernels on aarch64 platforms with bfloat16 support +arm64_bfloat16_fastmath_mode=false + average_over=1000 # CPU takes longer time to run, only run 100 inferences to get average latency. if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then @@ -63,7 +66,7 @@ models_to_test="bert-base-cased roberta-base distilbert-base-uncased" # export CUDA_VISIBLE_DEVICES=1 # This script will generate a logs file with a list of commands used in tests. -echo echo "ort=$run_ort torch=$run_torch torch2=$run_torch2 torchscript=$run_torchscript tensorflow=$run_tensorflow gpu_fp32=$run_gpu_fp32 gpu_fp16=$run_gpu_fp16 cpu=$run_cpu optimizer=$use_optimizer batch=$batch_sizes sequence=$sequence_length models=$models_to_test" >> benchmark.log +echo echo "ort=$run_ort torch=$run_torch torch2=$run_torch2 torchscript=$run_torchscript tensorflow=$run_tensorflow gpu_fp32=$run_gpu_fp32 gpu_fp16=$run_gpu_fp16 cpu=$run_cpu optimizer=$use_optimizer batch=$batch_sizes sequence=$sequence_length models=$models_to_test" arm64_bfloat16_fastmath_mode=$arm64_bfloat16_fastmath_mode >> benchmark.log # Set it to false to skip testing. You can use it to dry run this script with the log file. run_tests=true @@ -127,6 +130,10 @@ if [ "$force_layer_number" = true ] ; then benchmark_options="$benchmark_options --force_num_layers $layer_number" fi +if [ "$arm64_bfloat16_fastmath_mode" = true ] ; then + benchmark_options="$benchmark_options --enable_arm64_bfloat16_fastmath_mlas_gemm" +fi + # ------------------------------------------- run_one_test() { if [ "$run_ort" = true ] ; then