Skip to content

Commit

Permalink
Add option to profile all iterations or only last
Browse files Browse the repository at this point in the history
  • Loading branch information
pentschev committed Nov 4, 2024
1 parent 173c1e5 commit 2b15bb4
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 26 deletions.
19 changes: 13 additions & 6 deletions dask_cuda/benchmarks/common.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import contextlib
from argparse import Namespace
from functools import partial
from typing import Any, Callable, List, Mapping, NamedTuple, Optional, Tuple
Expand All @@ -7,7 +8,7 @@
import pandas as pd

import dask
from distributed import Client
from distributed import Client, performance_report

from dask_cuda.benchmarks.utils import (
address_to_index,
Expand Down Expand Up @@ -87,14 +88,20 @@ def run_benchmark(client: Client, args: Namespace, config: Config):
    If ``args.profile`` is set, all non-warmup runs are profiled together;
    if ``args.profile_last`` is set, only the final run is profiled.
"""

results = []
for _ in range(max(0, args.warmup_runs)):
config.bench_once(client, args, write_profile=None)
for _ in range(max(1, args.runs) - 1):
res = config.bench_once(client, args, write_profile=None)
results.append(res)
results.append(config.bench_once(client, args, write_profile=args.profile))
return results

ctx = contextlib.nullcontext()
if args.profile is not None:
ctx = performance_report(filename=args.profile)
with ctx:
for _ in range(max(1, args.runs) - 1):
res = config.bench_once(client, args, write_profile=None)
results.append(res)
results.append(config.bench_once(client, args, write_profile=args.profile_last))
return results


def gather_bench_results(client: Client, args: Namespace, config: Config):
Expand Down
7 changes: 3 additions & 4 deletions dask_cuda/benchmarks/local_cudf_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,9 @@ def bench_once(client, args, write_profile=None):
"False": False,
}.get(args.shuffle, args.shuffle)

if write_profile is None:
ctx = contextlib.nullcontext()
else:
ctx = performance_report(filename=args.profile)
ctx = contextlib.nullcontext()
if write_profile is not None:
ctx = performance_report(filename=write_profile)

with ctx:
t1 = clock()
Expand Down
2 changes: 1 addition & 1 deletion dask_cuda/benchmarks/local_cudf_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def bench_once(client, args, write_profile=None):
if args.backend == "explicit-comms":
ctx1 = dask.config.set(explicit_comms=True)
if write_profile is not None:
ctx2 = performance_report(filename=args.profile)
ctx2 = performance_report(filename=write_profile)

with ctx1:
with ctx2:
Expand Down
7 changes: 3 additions & 4 deletions dask_cuda/benchmarks/local_cudf_shuffle.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,9 @@ def create_data(
def bench_once(client, args, write_profile=None):
data_processed, df = create_data(client, args)

if write_profile is None:
ctx = contextlib.nullcontext()
else:
ctx = performance_report(filename=args.profile)
ctx = contextlib.nullcontext()
if write_profile is not None:
ctx = performance_report(filename=write_profile)

with ctx:
if args.backend in {"dask", "dask-noop"}:
Expand Down
9 changes: 4 additions & 5 deletions dask_cuda/benchmarks/local_cupy.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,11 @@ def bench_once(client, args, write_profile=None):
chunksize = x.chunksize
data_processed = sum(arg.nbytes for arg in func_args)

# Execute the operations to benchmark
if args.profile is not None and write_profile is not None:
ctx = performance_report(filename=args.profile)
else:
ctx = contextlib.nullcontext()
ctx = contextlib.nullcontext()
if write_profile is not None:
ctx = performance_report(filename=write_profile)

# Execute the operations to benchmark
with ctx:
rng = start_range(message=args.operation, color="purple")
result = func(*func_args)
Expand Down
9 changes: 4 additions & 5 deletions dask_cuda/benchmarks/local_cupy_map_overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,11 @@ def bench_once(client, args, write_profile=None):

data_processed = x.nbytes

# Execute the operations to benchmark
if args.profile is not None and write_profile is not None:
ctx = performance_report(filename=args.profile)
else:
ctx = contextlib.nullcontext()
ctx = contextlib.nullcontext()
if write_profile is not None:
ctx = performance_report(filename=write_profile)

# Execute the operations to benchmark
with ctx:
result = x.map_overlap(mean_filter, args.kernel_size, shape=ks)
if args.backend == "dask-noop":
Expand Down
11 changes: 10 additions & 1 deletion dask_cuda/benchmarks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,16 @@ def parse_benchmark_args(
metavar="PATH",
default=None,
type=str,
help="Write dask profile report (E.g. dask-report.html)",
help="Write dask profile report (E.g. dask-report.html) on all "
"iterations (excluding warmup).",
)
parser.add_argument(
"--profile-last",
metavar="PATH",
default=None,
type=str,
help="Write dask profile report (E.g. dask-report.html) on last "
"iteration only.",
)
# See save_benchmark_data for more information
parser.add_argument(
Expand Down

0 comments on commit 2b15bb4

Please sign in to comment.