Increase warmup and rep for FA benchmark #2256

Open

anmyachev wants to merge 17 commits into main from amyachev/bench-time. Changes shown are from 16 of the 17 commits.

Commits (17)
b1d2a0b  Increase 'warmup' and 'rep' for FA benchmark (anmyachev, Sep 16, 2024)
339b709  Merge branch 'main' into amyachev/bench-time (anmyachev, Sep 16, 2024)
5ebbd01  Use 150ms (anmyachev, Sep 16, 2024)
b1cc599  Update benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchm… (anmyachev, Sep 17, 2024)
0ad146f  Update benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchm… (anmyachev, Sep 17, 2024)
bbf0557  Merge branch 'main' into amyachev/bench-time (anmyachev, Sep 19, 2024)
81fec9a  Update benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchm… (anmyachev, Sep 23, 2024)
42e653a  Merge branch 'main' into amyachev/bench-time (anmyachev, Sep 23, 2024)
8f81c13  Update benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchm… (anmyachev, Sep 23, 2024)
5d08d3a  Merge branch 'main' into amyachev/bench-time (anmyachev, Sep 29, 2024)
b2d3398  Update benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchm… (anmyachev, Sep 29, 2024)
fe806b1  Update benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchm… (anmyachev, Sep 30, 2024)
bf49b0d  fix after merge (anmyachev, Sep 30, 2024)
7493632  Update benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchm… (anmyachev, Sep 30, 2024)
524f81d  Update benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchm… (anmyachev, Sep 30, 2024)
4d40864  Update benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchm… (anmyachev, Sep 30, 2024)
b0d91ce  Update benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchm… (anmyachev, Sep 30, 2024)
Changes to benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py
@@ -234,10 +234,11 @@ def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, provider):
    v = torch.randn((Z, H, N_CTX, D_HEAD), device='xpu', dtype=dtype)
    sm_scale = 0.125
    quantiles = [0.5, 0.0, 1.0]
+   warmup, rep = 10, 500
    if provider == 'onednn':
        _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(
            lambda: torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=
-           CAUSAL, scale=sm_scale), warmup=10, rep=10,
+           CAUSAL, scale=sm_scale), warmup=warmup, rep=rep,
            quantiles=quantiles)

    elif provider == 'triton':
@@ -257,7 +258,8 @@ def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, provider):
        ), attn_mask=None, dropout_p=0.0, is_causal=CAUSAL, scale=sm_scale).to(torch.float32)
        atol = 1e-1 if N_CTX == 16384 else 1e-2
        benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=atol, rtol=1e-3, err_msg='triton to torch')
-       _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, warmup=10, rep=10, quantiles=quantiles)
+       _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, warmup=warmup, rep=rep,
+                                                             quantiles=quantiles)

    elif provider == 'xetla':
        module_name = f'flash_attn_causal_{CAUSAL}'.lower()
@@ -272,8 +274,7 @@ def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, provider):
        l = torch.empty((size_ml, ), device='xpu', dtype=torch.float)

        xetla_fn = lambda: func(q, k, v, out, dropout_mask, bias, m, l, Z, H, D_HEAD, N_CTX, N_CTX, sm_scale)
-       _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xetla_fn, warmup=10, rep=10, quantiles=quantiles)
-
+       _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xetla_fn, warmup=warmup, rep=rep, quantiles=quantiles)
    else:
        raise NotImplementedError(f'Unsupported provider {provider}')
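For context, below is a minimal sketch of how a Triton-style do_bench helper typically interprets warmup and rep, assuming benchmark_suit.do_bench follows the upstream convention where both arguments are time budgets in milliseconds that get converted into iteration counts from a quick runtime estimate. The helper name do_bench_sketch and the sync hook are hypothetical and introduced only for illustration; this is not the actual benchmark_suit implementation.

import time


def do_bench_sketch(fn, warmup=10, rep=500, quantiles=None, sync=lambda: None):
    # Hypothetical stand-in for a do_bench-style helper: `warmup` and `rep`
    # are millisecond budgets, not iteration counts. `sync` is a device
    # synchronization hook (a no-op by default).

    # Estimate the cost of one call so the budgets translate into counts.
    sync()
    t0 = time.perf_counter()
    fn()
    sync()
    estimate_ms = max((time.perf_counter() - t0) * 1e3, 1e-6)

    n_warmup = max(1, int(warmup / estimate_ms))
    n_repeat = max(1, int(rep / estimate_ms))

    # Untimed warmup iterations.
    for _ in range(n_warmup):
        fn()

    # Timed iterations.
    times_ms = []
    for _ in range(n_repeat):
        sync()
        t0 = time.perf_counter()
        fn()
        sync()
        times_ms.append((time.perf_counter() - t0) * 1e3)

    times_ms.sort()
    mean_ms = sum(times_ms) / len(times_ms)
    if quantiles is not None:
        # e.g. quantiles=[0.5, 0.0, 1.0] -> median, min, max
        idx = [round(q * (len(times_ms) - 1)) for q in quantiles]
        return [times_ms[i] for i in idx], mean_ms
    return times_ms, mean_ms

Under this interpretation, a fast attention kernel measured with warmup=10, rep=10 gets only a handful of timed runs, while rep=500 yields hundreds of timed iterations, so the reported quantiles and coefficient of variation become much less noisy. A call would look like do_bench_sketch(triton_fn, warmup=warmup, rep=rep, quantiles=[0.5, 0.0, 1.0], sync=torch.xpu.synchronize) on XPU builds of PyTorch that expose torch.xpu.synchronize.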