diff --git a/tuner/tuner/candidate_gen.py b/tuner/tuner/candidate_gen.py
index a1ee421d0..1e1388373 100644
--- a/tuner/tuner/candidate_gen.py
+++ b/tuner/tuner/candidate_gen.py
@@ -25,7 +25,6 @@
 from abc import abstractmethod
 
 from iree.compiler import ir # type: ignore
-
 from iree.compiler.dialects import iree_codegen # type: ignore
 
 from .common import *
@@ -355,7 +354,7 @@ def main():
         prefetch_shared_memory=args.prefetch_shared_memory_options,
         no_reduce_shared_memory_bank_conflicts=args.no_reduce_shared_memory_bank_conflicts_options,
     )
-    specs: list[ir.Module] = generate_configs_and_td_specs(
+    specs = generate_configs_and_td_specs(
         mlir_module,
         tuner_ctx,
         args.limit,
@@ -369,7 +368,7 @@ def main():
         spec_path = spec_dir / f"{candidate_num}_spec.mlir"
         spec_dir.mkdir(parents=True, exist_ok=True)
         with open(spec_path, "w") as f:
-            local_scope_spec_str: str = spec.operation.get_asm(use_local_scope=True)
+            local_scope_spec_str = spec.operation.get_asm(use_local_scope=True)
             f.write(local_scope_spec_str)
 
 
diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py
index 28c2e4429..1b4dc032d 100644
--- a/tuner/tuner/libtuner.py
+++ b/tuner/tuner/libtuner.py
@@ -245,8 +245,9 @@ def validate_baselines_device_ids_match(
 def validate_baseline_regression(
     first_baseline_by_device: dict[str, float],
     second_baseline_by_device: dict[str, float],
-) -> bool:
-    regression_detected = False
+) -> list[str]:
+    regression_device_ids = []
+
     for device_id in first_baseline_by_device:
         if device_id not in second_baseline_by_device:
             continue
@@ -262,9 +263,9 @@ def validate_baseline_regression(
                 f"Baseline time = {first_baseline_time}, Post-baseline time = {second_baseline_time}, "
                 f"Slower by {percentage_slower:.3f}%"
             )
-            regression_detected = True
+            regression_device_ids.append(device_id)
 
-    return regression_detected
+    return regression_device_ids
 
 
 class ExecutionPhases(str, Enum):
@@ -785,6 +786,31 @@ def collision_handler(index_hash_list: list[tuple[int, str]]) -> tuple[bool, list[int]]:
     return collision_detected, unique_indexes
 
 
+def benchmark_candidates(candidate_indices, devices, tuning_client, candidate_trackers):
+    """
+    Runs the benchmarking for a given list of candidate indices.
+    """
+    worker_context_queue = create_worker_context_queue(devices)
+
+    task_list = [
+        BenchmarkPack(
+            iree_benchmark_module_flags=tuning_client.get_iree_benchmark_module_flags(),
+            benchmark_timeout=tuning_client.get_benchmark_timeout_s(),
+            candidate_tracker=candidate_trackers[idx],
+        )
+        for idx in candidate_indices
+    ]
+
+    # Perform benchmarking.
+    return multiprocess_progress_wrapper(
+        num_worker=len(devices),
+        task_list=task_list,
+        function=run_iree_benchmark_module_command,
+        initializer=init_worker_context,
+        initializer_inputs=(worker_context_queue,),
+    )
+
+
 def compile(
     args: argparse.Namespace,
     path_config: PathConfig,
@@ -873,12 +899,13 @@ def select_best_benchmark_results(
     baseline_results: list[BenchmarkResult],
     num_candidates: Optional[int],
 ) -> list[BenchmarkResult]:
-    filtered_candidate_results = [r for r in candidate_results if math.isfinite(r.time)]
+    filtered_candidate_results = validate_benchmark_results(candidate_results)
     if len(filtered_candidate_results) == 0:
         logging.error("No successful candidate benchmarks.")
         return []
     fallback_baseline_time: Optional[float] = None
     filtered_baseline_results: list[BenchmarkResult] = []
+    # TODO(Bangtian): use median number instead of last valid baseline result as fallback.
     for r in baseline_results:
         if math.isfinite(r.time):
             filtered_baseline_results.append(r)
@@ -889,9 +916,8 @@ def select_best_benchmark_results(
         logging.warning(
             f"All baseline benchmarks failed. Baselines will not be used to select top candidates"
         )
-    baseline_times_by_device = {}
-    for r in filtered_baseline_results:
-        baseline_times_by_device[r.device_id] = r.time
+
+    baseline_times_by_device = map_baseline_by_device(filtered_baseline_results)
 
     # Select top candidates
     def get_speedup(result: BenchmarkResult) -> float:
@@ -938,41 +964,45 @@ def benchmark(
         logging.warning("No candidates to benchmark.")
         return []
 
-    task_list = [
-        BenchmarkPack(
-            iree_benchmark_module_flags=tuning_client.get_iree_benchmark_module_flags(),
-            benchmark_timeout=tuning_client.get_benchmark_timeout_s(),
-            candidate_tracker=candidate_trackers[i],
-        )
-        for i in compiled_candidates
-        if i != 0
-    ]
-    worker_context_queue = create_worker_context_queue(args.devices)
-    candidate_results: list[BenchmarkResult] = multiprocess_progress_wrapper(
-        num_worker=len(args.devices),
-        task_list=task_list,
-        function=run_iree_benchmark_module_command,
-        initializer=init_worker_context,
-        initializer_inputs=(worker_context_queue,),
+    # Benchmarking baselines on each involved device.
+    baseline_indices = [0] * len(args.devices)
+    baseline_results = benchmark_candidates(
+        candidate_indices=baseline_indices,
+        devices=args.devices,
+        tuning_client=tuning_client,
+        candidate_trackers=candidate_trackers,
     )
 
-    # Benchmarking baselines on each involved device.
-    worker_context_queue = create_worker_context_queue(args.devices)
-    baseline_task_list = [
-        BenchmarkPack(
-            iree_benchmark_module_flags=tuning_client.get_iree_benchmark_module_flags(),
-            benchmark_timeout=tuning_client.get_benchmark_timeout_s(),
-            candidate_tracker=candidate_trackers[0],
-        )
-    ] * len(args.devices)
-    baseline_results: list[BenchmarkResult] = multiprocess_progress_wrapper(
-        num_worker=len(args.devices),
-        task_list=baseline_task_list,
-        function=run_iree_benchmark_module_command,
-        initializer=init_worker_context,
-        initializer_inputs=(worker_context_queue,),
+    baseline_times_by_device = map_baseline_by_device(baseline_results)
+
+    candidate_indices = [i for i in compiled_candidates if i != 0]
+    candidate_results = benchmark_candidates(
+        candidate_indices=candidate_indices,
+        devices=args.devices,
+        tuning_client=tuning_client,
+        candidate_trackers=candidate_trackers,
     )
 
+    # Benchmarking baselines again to check for performance regressions.
+    # These may indicate machine instability, overheating, etc.
+    post_baseline_indices = [0] * len(args.devices)
+    post_baseline_results = benchmark_candidates(
+        candidate_indices=post_baseline_indices,
+        devices=args.devices,
+        tuning_client=tuning_client,
+        candidate_trackers=candidate_trackers,
+    )
+    post_baseline_times_by_device = map_baseline_by_device(post_baseline_results)
+
+    if not validate_baselines_device_ids_match(baseline_times_by_device, post_baseline_times_by_device):
+        logging.warning("Device ID mismatch between baseline runs.")
+
+    regression_devices = validate_baseline_regression(baseline_times_by_device, post_baseline_times_by_device)
+    if regression_devices:
+        logging.warning(
+            f"Performance regressions detected for the following devices: {', '.join(regression_devices)}"
+        )
+
     best_results: list[BenchmarkResult] = select_best_benchmark_results(
         candidate_results=candidate_results,
         baseline_results=baseline_results,
diff --git a/tuner/tuner/libtuner_test.py b/tuner/tuner/libtuner_test.py
index 1ef32daf1..cda738470 100644
--- a/tuner/tuner/libtuner_test.py
+++ b/tuner/tuner/libtuner_test.py
@@ -6,8 +6,6 @@
 
 import argparse
 import math
-import pytest
-import json
 from subprocess import CompletedProcess
 from unittest.mock import call, patch, MagicMock
 from . import libtuner
@@ -270,3 +268,27 @@ def test_validate_baselines_device_id_match():
         first_baseline, second_baseline
     )
     assert result is True
+
+
+def test_validate_baseline_regression():
+    first_baseline = {"hip://0": 1000.0, "hip://1": 2000.0}
+    second_baseline = {"hip://0": 1100.0, "hip://1": 1900.0}
+    regression_devices = libtuner.validate_baseline_regression(
+        first_baseline, second_baseline
+    )
+    assert regression_devices == ["hip://0"]
+
+    first_baseline = {"hip://0": 1000.0, "hip://1": 2000.0}
+    second_baseline = {"hip://0": 1000.0, "hip://1": 2000.0}
+
+    regression_devices = libtuner.validate_baseline_regression(
+        first_baseline, second_baseline
+    )
+    assert regression_devices == []
+
+    first_baseline = {"hip://0": 1000.0, "hip://1": 2000.0}
+    second_baseline = {"hip://0": 1100.0}
+    regression_devices = libtuner.validate_baseline_regression(
+        first_baseline, second_baseline
+    )
+    assert regression_devices == ["hip://0"]
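
Note: validate_benchmark_results and map_baseline_by_device are called in the hunks above but defined outside this diff. As a reading aid, here is a minimal Python sketch of what they are assumed to do, mirroring the inline code they replace in select_best_benchmark_results; the signatures below are inferred from the call sites and are not part of this patch:

# Hypothetical sketch only; the real helpers live elsewhere in libtuner.py.
def validate_benchmark_results(
    benchmark_results: list[BenchmarkResult],
) -> list[BenchmarkResult]:
    # Keep only results with a finite time, dropping failed or timed-out runs.
    return [r for r in benchmark_results if math.isfinite(r.time)]


def map_baseline_by_device(baseline_results: list[BenchmarkResult]) -> dict[str, float]:
    # Map each device id to its (last-seen) baseline time, as the replaced loop did.
    return {r.device_id: r.time for r in baseline_results}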