Update flash_attention_fwd_benchmark.py #2265

Closed · anmyachev wants to merge 8 commits
Changes from 7 commits
6 changes: 3 additions & 3 deletions .github/workflows/triton-benchmarks.yml
@@ -179,7 +179,7 @@ jobs:
if: ${{ steps.install.outcome == 'success' && !cancelled() }}
run: |
cd benchmarks/triton_kernels_benchmark
- python flash_attention_fwd_benchmark.py --reports $REPORTS
+ ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE python flash_attention_fwd_benchmark.py --reports $REPORTS

TAG=${{ inputs.tag || 'ci' }}
source ../../scripts/capture-hw-details.sh
@@ -194,7 +194,7 @@ jobs:
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
IGC_VISAOptions=" -enableBCR -nolocalra -printregusage -DPASTokenReduction -enableHalfLSC -abiver 2" \
IGC_DisableLoopUnroll=1 \
- python flash_attention_fwd_benchmark.py --reports $REPORTS
+ ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE python flash_attention_fwd_benchmark.py --reports $REPORTS

TAG=${{ inputs.tag || 'ci' }}-dflt
source ../../scripts/capture-hw-details.sh
@@ -209,7 +209,7 @@ jobs:
TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
IGC_VISAOptions=" -enableBCR -nolocalra -printregusage -DPASTokenReduction -enableHalfLSC -abiver 2" \
IGC_DisableLoopUnroll=1 \
- python flash_attention_fwd_benchmark.py --reports $REPORTS
+ ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE python flash_attention_fwd_benchmark.py --reports $REPORTS

TAG=${{ inputs.tag || 'ci' }}-adv
source ../../scripts/capture-hw-details.sh
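All three benchmark steps receive the same one-line change: prefixing the command puts ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE into the environment of that single invocation, without affecting the rest of the job. A minimal local equivalent of what the CI step does might look like the sketch below (the REPORTS fallback value here is an assumption, not taken from the workflow):

    import os
    import subprocess

    # Run the benchmark the way the CI step does, with the Level Zero
    # device hierarchy forced to COMPOSITE for this child process only.
    env = dict(os.environ, ZE_FLAT_DEVICE_HIERARCHY="COMPOSITE")

    subprocess.run(
        ["python", "flash_attention_fwd_benchmark.py",
         "--reports", os.environ.get("REPORTS", "./reports")],  # fallback path is hypothetical
        cwd="benchmarks/triton_kernels_benchmark",
        env=env,
        check=True,
    )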
benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py
@@ -225,12 +225,10 @@ def benchmark(Z, H, N_CTX, D_HEAD, provider):

  elif provider == 'triton':
      triton_fn = lambda: forward(q, k, v, causal, sm_scale)
-     if benchmark_suit.USE_IPEX_OPTION:
-         # FIXME: use torch sdpa for result check after https://github.com/intel/intel-xpu-backend-for-triton/issues/2042 fixed
-         torch_fn = lambda: torch.nn.functional.scaled_dot_product_attention(
-             q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False, scale=sm_scale).to(torch.float32)
-         atol = 1e-1 if N_CTX == 16384 else 1e-2
-         benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=atol, rtol=1e-3, err_msg='triton to torch')
+     torch_fn = lambda: torch.nn.functional.scaled_dot_product_attention(
+         q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False, scale=sm_scale).to(torch.float32)
+     atol = 1e-1 if N_CTX == 16384 else 1e-2
+     benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=atol, rtol=1e-3, err_msg='triton to torch')
      _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, warmup=10, rep=10, quantiles=quantiles,
                                                            fast_flush=False)

Review comment from @anmyachev (Contributor, Author), Sep 19, 2024, on the torch_fn line:

    Using ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE, the available memory is doubled and there is no longer an out-of-memory error for upstream PyTorch (however, this affects performance).
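For context on the comment above: in COMPOSITE mode Level Zero exposes a multi-tile GPU as a single root device, so one process sees the memory of all tiles, whereas in FLAT mode each tile appears as a separate, smaller device. A hedged sketch of how one might observe this, assuming a PyTorch build with XPU support (torch.xpu is available upstream from PyTorch 2.4):

    import os

    # The variable is read when the Level Zero driver initializes, so it
    # should be set before importing torch; setting it later has no effect.
    os.environ.setdefault("ZE_FLAT_DEVICE_HIERARCHY", "COMPOSITE")

    import torch  # noqa: E402

    if torch.xpu.is_available():
        props = torch.xpu.get_device_properties(0)
        # In COMPOSITE mode total_memory should cover the whole card;
        # in FLAT mode it reports a single tile's memory.
        print(f"{props.name}: {props.total_memory / 2**30:.1f} GiB visible")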
