From 2b3071119ab93f7c6c4122ea8f2fda1678c40f07 Mon Sep 17 00:00:00 2001
From: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Date: Fri, 5 Apr 2024 07:02:01 -0700
Subject: [PATCH] Add onnxruntime/test/run_benchmark.py helper script. (#19234)

### Description
Add the onnxruntime/test/run_benchmark.py helper script, which repeats benchmark runs until the measured coefficient of variation (stddev / mean) is within a target tolerance. It works with [Google Benchmark](https://github.com/google/benchmark) programs like `onnxruntime_mlas_benchmark`.

### Motivation and Context
Benchmark results can vary from run to run. This script automates the repeated running needed to get results that are stable enough.
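For illustration, an invocation and its output might look like the following. The benchmark pattern, names, and timing values here are made up; the command-line options and the pipe-delimited result table follow what the script defines and prints.

```
python onnxruntime/test/run_benchmark.py --program ./onnxruntime_mlas_benchmark --pattern "SGEMM.*" --repetitions 10 --max-cv 0.05

name|median_real_time|median_cpu_time
-|-|-
SGEMM/M:256/N:256/K:256|123456 ns|123400 ns
```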
---
 onnxruntime/test/run_benchmark.py        | 201 +++++++++++++++++++++++
 onnxruntime/test/run_benchmark.readme.md |  21 +++
 2 files changed, 222 insertions(+)
 create mode 100755 onnxruntime/test/run_benchmark.py
 create mode 100644 onnxruntime/test/run_benchmark.readme.md

diff --git a/onnxruntime/test/run_benchmark.py b/onnxruntime/test/run_benchmark.py
new file mode 100755
index 0000000000000..0c00bb7ac0a04
--- /dev/null
+++ b/onnxruntime/test/run_benchmark.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+from __future__ import annotations
+
+import argparse
+import dataclasses
+import json
+import pathlib
+import subprocess
+import sys
+import tempfile
+
+
+def warn(message: str):
+    print(f"WARNING: {message}", file=sys.stderr)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Benchmark (https://github.com/google/benchmark) program runner. "
+        "Runs a benchmark program until the benchmark measurements are within the desired coefficient of variation "
+        "(CV) (stddev / mean) tolerance and outputs those measurements."
+    )
+
+    parser.add_argument(
+        "--program",
+        required=True,
+        type=pathlib.Path,
+        help="Path to the benchmark program to run.",
+    )
+
+    parser.add_argument(
+        "--pattern",
+        required=True,
+        dest="patterns",
+        action="extend",
+        nargs="+",
+        help="Benchmark test name pattern to specify which benchmark tests to run. "
+        "Each pattern value will have its own invocation of the benchmark program (passed to the benchmark program "
+        "with the --benchmark_filter option). "
+        "To list the benchmark test names, run the benchmark program with the --benchmark_list_tests option.",
+    )
+    parser.add_argument(
+        "--repetitions",
+        type=int,
+        default=10,
+        help="Number of benchmark run repetitions (passed to the benchmark program with the "
+        "--benchmark_repetitions option).",
+    )
+
+    parser.add_argument(
+        "--max-cv",
+        type=float,
+        default=0.05,
+        help="Maximum allowed CV (stddev / mean) value. "
+        "The CV value is a number, not a percentage. E.g., a value of 0.05 corresponds to 5%%.",
+    )
+    parser.add_argument(
+        "--max-attempts",
+        type=int,
+        default=3,
+        help="Maximum number of times to attempt running the benchmark program.",
+    )
+
+    parser.add_argument(
+        "--show-program-output",
+        action="store_true",
+        help="Display the output from the benchmark program.",
+    )
+
+    return parser.parse_args()
+
+
+@dataclasses.dataclass
+class BenchmarkResult:
+    name: str
+    median_real_time: float
+    median_cpu_time: float
+    time_unit: str
+
+
+def run_benchmark(
+    program: pathlib.Path,
+    output_file: pathlib.Path,
+    show_output: bool,
+    pattern: str,
+    repetitions: int,
+    max_cv: float,
+    max_attempts: int,
+) -> list[BenchmarkResult]:
+    benchmark_cmd = [
+        f"{program}",
+        f"--benchmark_filter={pattern}",
+        f"--benchmark_repetitions={repetitions}",
+        "--benchmark_report_aggregates_only",
+        f"--benchmark_out={output_file}",
+        "--benchmark_out_format=json",
+    ]
+
+    def check_cv(entries):
+        valid = True
+
+        for entry in entries:
+            if entry.get("aggregate_name") != "cv":
+                continue
+
+            run_name = entry["run_name"]
+
+            real_time_cv = float(entry["real_time"])
+            if real_time_cv > max_cv:
+                warn(f"real_time CV exceeds limit for run '{run_name}': {real_time_cv} > {max_cv}")
+                valid = False
+
+            cpu_time_cv = float(entry["cpu_time"])
+            if cpu_time_cv > max_cv:
+                warn(f"cpu_time CV exceeds limit for run '{run_name}': {cpu_time_cv} > {max_cv}")
+                valid = False
+
+        return valid
+
+    def process_entries(entries) -> list[BenchmarkResult]:
+        results = []
+
+        for entry in entries:
+            if entry.get("aggregate_name") != "median":
+                continue
+
+            result = BenchmarkResult(
+                name=entry["run_name"],
+                median_real_time=float(entry["real_time"]),
+                median_cpu_time=float(entry["cpu_time"]),
+                time_unit=entry["time_unit"],
+            )
+
+            results.append(result)
+
+        return results
+
+    attempts = 0
+    while attempts < max_attempts:
+        attempts += 1
+
+        output_handle = None if show_output else subprocess.DEVNULL
+        subprocess.run(
+            benchmark_cmd,
+            check=True,
+            stdout=output_handle,
+            stderr=output_handle,
+            creationflags=subprocess.HIGH_PRIORITY_CLASS,
+        )
+
+        with open(output_file) as output:
+            output_json = json.load(output)
+            entries = output_json["benchmarks"]
+
+        if not check_cv(entries):
+            warn("Discarding benchmark run.")
+            continue
+
+        return process_entries(entries)
+
+    raise RuntimeError("Failed to get measurements within the CV limit.")
+
+
+def main():
+    args = parse_args()
+
+    program = args.program.resolve(strict=True)
+
+    benchmark_results: list[BenchmarkResult] = []
+
+    with tempfile.TemporaryDirectory() as temp_dir_name:
+        temp_dir = pathlib.Path(temp_dir_name)
+        output_file = temp_dir / "benchmark.out.json"
+
+        for pattern in args.patterns:
+            benchmark_results += run_benchmark(
+                program=program,
+                output_file=output_file,
+                show_output=args.show_program_output,
+                pattern=pattern,
+                repetitions=args.repetitions,
+                max_cv=args.max_cv,
+                max_attempts=args.max_attempts,
+            )
+
+    print("name|median_real_time|median_cpu_time")
+    print("-|-|-")
+    for result in benchmark_results:
+        print(
+            f"{result.name}|"
+            f"{round(result.median_real_time)} {result.time_unit}|"
+            f"{round(result.median_cpu_time)} {result.time_unit}"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/onnxruntime/test/run_benchmark.readme.md b/onnxruntime/test/run_benchmark.readme.md
new file mode 100644
index 0000000000000..06a9cbb85df13
--- /dev/null
+++ b/onnxruntime/test/run_benchmark.readme.md
@@ -0,0 +1,21 @@
+# run_benchmark.py
+
+`run_benchmark.py` is a helper script that runs a
+[Google Benchmark](https://github.com/google/benchmark) program
+repeatedly until the measurements are within the desired
+[coefficient of variation](https://en.wikipedia.org/wiki/Coefficient_of_variation) and then outputs the measurements.
+
+It can be useful for obtaining measurements that are stable enough when repeated invocations of a benchmark program
+show some measurement variance across runs.
+
+Note that the script runs the benchmark program with specific options and parses specifically formatted output, so it
+is only expected to work with Google Benchmark programs.
+
+## Example usage
+
+To run a benchmark program and get measurements for benchmark test(s) with a particular name:
+
+```
+python run_benchmark.py --program <path to benchmark program> --pattern <benchmark test name pattern>
+```
+
+For more detailed usage information, run it with the `--help` option.
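As a rough sketch of the data `run_benchmark.py` consumes, the snippet below builds a minimal stand-in for the Google Benchmark JSON output and applies the same kind of CV threshold check the script performs. The field names mirror the keys read in `check_cv()` and `process_entries()`; the benchmark name and numeric values are made up for illustration.

```python
# Minimal, self-contained sketch of the JSON shape run_benchmark.py reads.
# The field names follow the keys accessed in check_cv()/process_entries();
# the benchmark name and values below are made up for illustration.
sample_output = {
    "benchmarks": [
        {
            "run_name": "ExampleBenchmark/1024",
            "aggregate_name": "median",
            "real_time": 152.0,
            "cpu_time": 150.0,
            "time_unit": "ns",
        },
        {
            "run_name": "ExampleBenchmark/1024",
            "aggregate_name": "cv",
            # For the "cv" aggregate, real_time/cpu_time hold the coefficient of
            # variation (stddev / mean) across repetitions rather than a time.
            "real_time": 0.03,
            "cpu_time": 0.02,
            "time_unit": "ns",
        },
    ]
}

max_cv = 0.05  # same default as the script's --max-cv option

cv_entries = [e for e in sample_output["benchmarks"] if e.get("aggregate_name") == "cv"]
within_limit = all(
    float(e["real_time"]) <= max_cv and float(e["cpu_time"]) <= max_cv for e in cv_entries
)
print(within_limit)  # True for these made-up values, so the run would be accepted
```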