From 51eca7b7d52b29373af3d4cc6261798410efba27 Mon Sep 17 00:00:00 2001
From: Jez Ng
Date: Mon, 17 Jun 2024 12:04:20 -0700
Subject: [PATCH] Make CUDA graph benchmarking overridable on a per-op basis

Summary: some operators need to do gpu-cpu syncs, which is not supported
under graph capture

Reviewed By: davidberard98

Differential Revision: D58680076

fbshipit-source-id: 7c86c484990445512723ebdda25ef4af8cfffde5
---
 torchbenchmark/util/triton_op.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/torchbenchmark/util/triton_op.py b/torchbenchmark/util/triton_op.py
index fd0fa817d..3a67c1024 100644
--- a/torchbenchmark/util/triton_op.py
+++ b/torchbenchmark/util/triton_op.py
@@ -421,6 +421,7 @@ class BenchmarkOperator(metaclass=PostInitProcessor):
     _input_iter: Optional[Generator] = None
     extra_args: List[str] = []
     example_inputs: Any = None
+    use_cuda_graphs: bool = True
 
     # By default, only collect latency metrics
     # Each operator can override to define their own default metrics
@@ -743,9 +744,18 @@ def _init_extra_metrics() -> Dict[str, Any]:
         if set(["latency", "tflops", "speedup", "compile_time"]) & set(
             self.required_metrics
         ):
-            with torch.cuda.stream(torch.cuda.Stream()):
-                metrics.latency = triton.testing.do_bench_cudagraph(
+            if self.use_cuda_graphs:
+                with torch.cuda.stream(torch.cuda.Stream()):
+                    metrics.latency = triton.testing.do_bench_cudagraph(
+                        fn,
+                        rep=rep,
+                        return_mode="median",
+                        grad_to_none=self.get_grad_to_none(self.example_inputs),
+                    )
+            else:
+                metrics.latency = triton.testing.do_bench(
                     fn,
+                    warmup=warmup,
                     rep=rep,
                     return_mode="median",
                     grad_to_none=self.get_grad_to_none(self.example_inputs),
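
A minimal sketch of how an operator could opt out of CUDA graph benchmarking by overriding the new use_cuda_graphs class attribute. The operator name, input sizes, and the get_input_iter/register_benchmark pattern below are assumptions modeled on other TritonBench operators, not part of this patch:

    # Hypothetical operator that disables CUDA graph benchmarking because its
    # benchmarked function performs a GPU->CPU sync (tensor.item()), which is
    # not allowed under CUDA graph capture.
    from typing import Callable, Generator

    import torch

    from torchbenchmark.util.triton_op import BenchmarkOperator, register_benchmark


    class Operator(BenchmarkOperator):
        # Fall back to triton.testing.do_bench instead of do_bench_cudagraph
        # when measuring latency for this operator.
        use_cuda_graphs = False

        def get_input_iter(self) -> Generator:
            # Yield one positional-arg tuple per input configuration.
            for size in (2**16, 2**20, 2**24):
                yield (torch.randn(size, device=self.device),)

        @register_benchmark()
        def eager_sum_to_scalar(self, x: torch.Tensor) -> Callable:
            # .item() forces a device-to-host synchronization, which would fail
            # inside graph capture; with use_cuda_graphs = False the framework
            # benchmarks this callable with plain do_bench (including warmup).
            return lambda: x.sum().item()

With the override in place, the new else branch in the diff above measures latency with triton.testing.do_bench rather than do_bench_cudagraph, so the sync never runs under graph capture.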