From 51eca7b7d52b29373af3d4cc6261798410efba27 Mon Sep 17 00:00:00 2001
From: Jez Ng
Date: Mon, 17 Jun 2024 12:04:20 -0700
Subject: [PATCH] Make CUDA graph benchmarking overridable on a per-op basis

Summary: some operators need to do gpu-cpu syncs, which is not supported
under graph capture

Reviewed By: davidberard98

Differential Revision: D58680076

fbshipit-source-id: 7c86c484990445512723ebdda25ef4af8cfffde5
---
 torchbenchmark/util/triton_op.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/torchbenchmark/util/triton_op.py b/torchbenchmark/util/triton_op.py
index fd0fa817d..3a67c1024 100644
--- a/torchbenchmark/util/triton_op.py
+++ b/torchbenchmark/util/triton_op.py
@@ -421,6 +421,7 @@ class BenchmarkOperator(metaclass=PostInitProcessor):
     _input_iter: Optional[Generator] = None
     extra_args: List[str] = []
     example_inputs: Any = None
+    use_cuda_graphs: bool = True
 
     # By default, only collect latency metrics
     # Each operator can override to define their own default metrics
@@ -743,9 +744,18 @@ def _init_extra_metrics() -> Dict[str, Any]:
         if set(["latency", "tflops", "speedup", "compile_time"]) & set(
             self.required_metrics
         ):
-            with torch.cuda.stream(torch.cuda.Stream()):
-                metrics.latency = triton.testing.do_bench_cudagraph(
+            if self.use_cuda_graphs:
+                with torch.cuda.stream(torch.cuda.Stream()):
+                    metrics.latency = triton.testing.do_bench_cudagraph(
+                        fn,
+                        rep=rep,
+                        return_mode="median",
+                        grad_to_none=self.get_grad_to_none(self.example_inputs),
+                    )
+            else:
+                metrics.latency = triton.testing.do_bench(
                     fn,
+                    warmup=warmup,
                     rep=rep,
                     return_mode="median",
                     grad_to_none=self.get_grad_to_none(self.example_inputs),
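
A minimal sketch of how an operator could opt out of CUDA graph benchmarking by overriding the new use_cuda_graphs class attribute. The operator name, input sizes, and the get_input_iter/register_benchmark pattern below are assumptions modeled on other TritonBench operators, not part of this patch:

    # Hypothetical operator that disables CUDA graph benchmarking because its
    # benchmarked function performs a GPU->CPU sync (tensor.item()), which is
    # not allowed under CUDA graph capture.
    from typing import Callable, Generator

    import torch

    from torchbenchmark.util.triton_op import BenchmarkOperator, register_benchmark


    class Operator(BenchmarkOperator):
        # Fall back to triton.testing.do_bench instead of do_bench_cudagraph
        # when measuring latency for this operator.
        use_cuda_graphs = False

        def get_input_iter(self) -> Generator:
            # Yield one positional-arg tuple per input configuration.
            for size in (2**16, 2**20, 2**24):
                yield (torch.randn(size, device=self.device),)

        @register_benchmark()
        def eager_sum_to_scalar(self, x: torch.Tensor) -> Callable:
            # .item() forces a device-to-host synchronization, which would fail
            # inside graph capture; with use_cuda_graphs = False the framework
            # benchmarks this callable with plain do_bench (including warmup).
            return lambda: x.sum().item()

With the override in place, the new else branch in the diff above measures latency with triton.testing.do_bench rather than do_bench_cudagraph, so the sync never runs under graph capture.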