diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py
index b18736ffb..5b5646ee3 100644
--- a/tuner/tuner/libtuner.py
+++ b/tuner/tuner/libtuner.py
@@ -793,24 +793,6 @@ def benchmark(
 ):
     logging.debug("benchmark()")
 
-    task_list = [
-        BenchmarkPack(
-            iree_benchmark_module_flags=tuning_client.get_iree_benchmark_module_flags(),
-            benchmark_timeout=tuning_client.get_benchmark_timeout_s(),
-            candidate_tracker=candidate_trackers[i],
-        )
-        for i in compiled_candidates
-        if i != 0
-    ]
-    worker_context_queue = create_worker_context_queue(args.devices)
-    candidate_results = multiprocess_progress_wrapper(
-        num_worker=len(args.devices),
-        task_list=task_list,
-        function=run_iree_benchmark_module_command,
-        initializer=init_worker_context,
-        initializer_inputs=(worker_context_queue,),
-    )
-
     # Benchmarking baselines on each involved device.
     worker_context_queue = create_worker_context_queue(args.devices)
     baseline_task_list = [
@@ -831,6 +813,24 @@ def benchmark(
     for r in baseline_results:
         baseline_times_by_device[r.device_id] = r.time
 
+    task_list = [
+        BenchmarkPack(
+            iree_benchmark_module_flags=tuning_client.get_iree_benchmark_module_flags(),
+            benchmark_timeout=tuning_client.get_benchmark_timeout_s(),
+            candidate_tracker=candidate_trackers[i],
+        )
+        for i in compiled_candidates
+        if i != 0
+    ]
+    worker_context_queue = create_worker_context_queue(args.devices)
+    candidate_results = multiprocess_progress_wrapper(
+        num_worker=len(args.devices),
+        task_list=task_list,
+        function=run_iree_benchmark_module_command,
+        initializer=init_worker_context,
+        initializer_inputs=(worker_context_queue,),
+    )
+
     # Select top candidates
     def get_speedup(result: BenchmarkResult) -> float:
         return result.time / baseline_times_by_device[result.device_id]
@@ -848,4 +848,45 @@ def get_speedup(result: BenchmarkResult) -> float:
     )
     top_candidates = [result.candidate_id for result in best_results]
+
+    # Benchmarking baselines on each involved device again to check for performance regressions.
+    worker_context_queue = create_worker_context_queue(args.devices)
+    baseline_task_list = [
+        BenchmarkPack(
+            iree_benchmark_module_flags=tuning_client.get_iree_benchmark_module_flags(),
+            benchmark_timeout=tuning_client.get_benchmark_timeout_s(),
+            candidate_tracker=candidate_trackers[0],
+        )
+    ] * len(args.devices)
+    post_baseline_results = multiprocess_progress_wrapper(
+        num_worker=len(args.devices),
+        task_list=baseline_task_list,
+        function=run_iree_benchmark_module_command,
+        initializer=init_worker_context,
+        initializer_inputs=(worker_context_queue,),
+    )
+    post_baseline_times_by_device = {}
+    for r in post_baseline_results:
+        post_baseline_times_by_device[r.device_id] = r.time
+
+    assert (
+        baseline_times_by_device.keys() == post_baseline_times_by_device.keys()
+    ), "Error: The device IDs in baseline and post-baseline results do not match."
+
+    regression_detected = False
+    for device_id in baseline_times_by_device:
+        baseline_time = baseline_times_by_device[device_id]
+        post_time = post_baseline_times_by_device[device_id]
+
+        if post_time > baseline_time * 1.03:
+            regression_detected = True
+            percentage_slower = ((post_time - baseline_time) / baseline_time) * 100
+            logging.info(
+                f"Performance regression detected on device {device_id}: "
+                f"Baseline time = {baseline_time}, Post-baseline time = {post_time}, "
+                f"Slower by {percentage_slower:.3f}%"
+            )
+
+    if not regression_detected:
+        logging.info("No performance regressions detected.")
 
     return top_candidates
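
Note on the added check: its core is the per-device comparison of the second baseline run against the first with a 3% tolerance (`baseline_time * 1.03`), and it is purely informational, logging regressions while still returning `top_candidates` unchanged. Below is a minimal standalone sketch of that comparison; the helper name `detect_regressions`, the `tolerance` parameter, and the `hip://` device IDs in the usage example are illustrative and not part of the patch, which inlines the logic directly in `benchmark()`.

```python
import logging

def detect_regressions(
    baseline_times_by_device: dict[str, float],
    post_baseline_times_by_device: dict[str, float],
    tolerance: float = 0.03,
) -> bool:
    """Compare two baseline runs per device; log and report any slowdown beyond tolerance."""
    assert (
        baseline_times_by_device.keys() == post_baseline_times_by_device.keys()
    ), "The device IDs in baseline and post-baseline results do not match."

    regression_detected = False
    for device_id, baseline_time in baseline_times_by_device.items():
        post_time = post_baseline_times_by_device[device_id]
        # Flag the device if the second run is more than `tolerance`
        # (3% by default) slower than the first.
        if post_time > baseline_time * (1 + tolerance):
            regression_detected = True
            percentage_slower = (post_time - baseline_time) / baseline_time * 100
            logging.info(
                f"Performance regression detected on device {device_id}: "
                f"slower by {percentage_slower:.3f}%"
            )
    return regression_detected

# Example: hip://1 is 5% slower than its first baseline, so a regression is reported.
assert detect_regressions(
    {"hip://0": 1.00, "hip://1": 1.00},
    {"hip://0": 1.01, "hip://1": 1.05},
)
```

Because the post-baseline tasks reuse `candidate_trackers[0]` (the untuned baseline) with one task per device, a regression here points at the device itself drifting during the run rather than at any candidate, which is presumably why the patch only logs rather than invalidating the selected candidates.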