Skip to content

Commit

Permalink
Use iree_device to run tests on different hip devices to avoid conflict/queue times (#597)
Browse files Browse the repository at this point in the history

Use the `--iree-device` pytest option to run the llama benchmark tests on
different HIP devices, to avoid device conflicts and queue times.

---------

Signed-off-by: aviator19941 <[email protected]>
  • Loading branch information
aviator19941 authored Nov 28, 2024
1 parent ba8dd7d commit 1896d7a
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 35 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-llama-large-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ jobs:
iree-base-runtime
- name: Run llama tests
run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --html=out/llm/llama/benchmark/index.html
run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --iree-device=hip://7 --html=out/llm/llama/benchmark/index.html

- name: Deploy to GitHub Pages
uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci-llama-quick-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ jobs:
iree-base-runtime
- name: Run llama 8b f16 decomposed test
run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-quick-llama-test
run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --iree-device=hip://0 --run-quick-llama-test

- name: Upload llama executable files
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
Expand Down
7 changes: 5 additions & 2 deletions sharktank/sharktank/utils/export_artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,9 +270,12 @@ def iree_benchmark_vmfb(
f"--device=hip://{i}" for i in range(self.tensor_parallelism_size)
]
else:
rocr_visible_devices = [f"ROCR_VISIBLE_DEVICES={hip_device_id}"]
hip_device_arg = int(hip_device_id.split("://")[1])
rocr_visible_devices = [
f"ROCR_VISIBLE_DEVICES={','.join(str(i) for i in range(hip_device_arg + 1))}"
]
params = [f"--parameters=model={irpa_path}"]
devices = [f"--device=hip://{hip_device_id}"]
devices = [f"--device={hip_device_id}"]
benchmark_args += rocr_visible_devices
benchmark_args += [
"iree-benchmark-module",
Expand Down
65 changes: 34 additions & 31 deletions sharktank/tests/models/llama/benchmark_amdgpu_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ def setUpClass(cls):
cls.directory_created = True

def setUp(self):
self.hip_device_id = os.getenv("HIP_DEVICE_ID", default="0")
self.compile_args = [
"--iree-dispatch-creation-enable-aggressive-fusion=true",
"--iree-global-opt-propagate-transposes=true",
Expand Down Expand Up @@ -181,15 +180,15 @@ def testBenchmark8B_f16_Decomposed(self):
)
# benchmark prefill
self.llama8b_f16_decomposed_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_prefill_args,
cwd=self.repo_root,
)
# benchmark decode
self.llama8b_f16_decomposed_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_decode_args,
Expand Down Expand Up @@ -223,7 +222,7 @@ def testBenchmark8B_f16_Non_Decomposed_Prefill(self):
)
# benchmark prefill
self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_prefill_nondecomposed_args_fp16,
Expand Down Expand Up @@ -257,15 +256,15 @@ def testBenchmark8B_f16_Non_Decomposed(self):
)
# benchmark prefill
self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_prefill_args,
cwd=self.repo_root,
)
# benchmark decode
self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_decode_args,
Expand Down Expand Up @@ -297,15 +296,15 @@ def testBenchmark8B_fp8_Decomposed(self):
)
# benchmark prefill
self.llama8b_fp8_decomposed_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path_fp8,
args=self.iree_run_prefill_args,
cwd=self.repo_root,
)
# benchmark decode
self.llama8b_fp8_decomposed_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path_fp8,
args=self.iree_run_decode_args,
Expand Down Expand Up @@ -337,15 +336,15 @@ def testBenchmark8B_fp8_Non_Decomposed(self):
)
# benchmark prefill
self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path_fp8,
args=self.iree_run_prefill_args,
cwd=self.repo_root,
)
# benchmark decode
self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path_fp8,
args=self.iree_run_decode_args,
Expand Down Expand Up @@ -481,15 +480,15 @@ def testBenchmark70B_f16_TP8_Decomposed(self):
)
# benchmark prefill
self.llama70b_f16_decomposed_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_prefill_args,
cwd=self.repo_root,
)
# benchmark decode
self.llama70b_f16_decomposed_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_decode_args,
Expand Down Expand Up @@ -528,22 +527,24 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self):
)
# benchmark prefill
self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_prefill_args,
cwd=self.repo_root,
)
# benchmark decode
self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_decode_args,
cwd=self.repo_root,
)

@pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
@pytest.mark.xfail(
reason="70b fp8 irpa does not exist", strict=True, raises=ExportMlirException
)
def testBenchmark70B_fp8_TP8_Decomposed(self):
output_file_name = self.dir_path_70b / "fp8_decomposed"
output_mlir = self.llama70b_fp8_decomposed_artifacts.create_file(
Expand Down Expand Up @@ -574,22 +575,24 @@ def testBenchmark70B_fp8_TP8_Decomposed(self):
)
# benchmark prefill
self.llama70b_fp8_decomposed_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path_fp8,
args=self.iree_run_prefill_args,
cwd=self.repo_root,
)
# benchmark decode
self.llama70b_fp8_decomposed_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path_fp8,
args=self.iree_run_decode_args,
cwd=self.repo_root,
)

@pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
@pytest.mark.xfail(
reason="70b fp8 irpa does not exist", strict=True, raises=ExportMlirException
)
def testBenchmark70B_fp8_TP8_Non_Decomposed(self):
output_file_name = self.dir_path_70b / "fp8_torch"
output_mlir = self.llama70b_fp8_torch_sdpa_artifacts.create_file(
Expand All @@ -603,7 +606,7 @@ def testBenchmark70B_fp8_TP8_Non_Decomposed(self):
)
output_shard_file_name = (
self.artifacts_dir
/ f"f8/tp8/llama3.1_70b_f8_tp{self.tensor_parallelism_size}_parameters.irpa"
/ f"f8/tp8/llama3.1_70b_fp8_tp{self.tensor_parallelism_size}_parameters.irpa"
)
if output_shard_file_name.exists():
self.irpa_path = output_shard_file_name
Expand All @@ -620,15 +623,15 @@ def testBenchmark70B_fp8_TP8_Non_Decomposed(self):
)
# benchmark prefill
self.llama70b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path_fp8,
args=self.iree_run_prefill_args,
cwd=self.repo_root,
)
# benchmark decode
self.llama70b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path_fp8,
args=self.iree_run_decode_args,
Expand Down Expand Up @@ -764,15 +767,15 @@ def testBenchmark405B_f16_TP8_Decomposed(self):
)
# benchmark prefill
self.llama405b_f16_decomposed_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_prefill_args,
cwd=self.repo_root,
)
# benchmark decode
self.llama405b_f16_decomposed_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_decode_args,
Expand Down Expand Up @@ -814,15 +817,15 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self):
)
# benchmark prefill
self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_prefill_args,
cwd=self.repo_root,
)
# benchmark decode
self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_decode_args,
Expand All @@ -845,7 +848,7 @@ def testBenchmark405B_fp8_TP8_Decomposed(self):
)
output_shard_file_name = (
self.artifacts_dir
/ f"f8/tp8/llama3.1_405b_f8_tp{self.tensor_parallelism_size}_parameters.irpa"
/ f"f8/tp8/llama3.1_405b_fp8_tp{self.tensor_parallelism_size}_parameters.irpa"
)
if output_shard_file_name.exists():
self.irpa_path = output_shard_file_name
Expand All @@ -862,15 +865,15 @@ def testBenchmark405B_fp8_TP8_Decomposed(self):
)
# benchmark prefill
self.llama405b_fp8_decomposed_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path_fp8,
args=self.iree_run_prefill_args,
cwd=self.repo_root,
)
# benchmark decode
self.llama405b_fp8_decomposed_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path_fp8,
args=self.iree_run_decode_args,
Expand All @@ -893,7 +896,7 @@ def testBenchmark405B_fp8_TP8_Non_Decomposed(self):
)
output_shard_file_name = (
self.artifacts_dir
/ f"f8/tp8/llama3.1_405b_f8_tp{self.tensor_parallelism_size}_parameters.irpa"
/ f"f8/tp8/llama3.1_405b_fp8_tp{self.tensor_parallelism_size}_parameters.irpa"
)
if output_shard_file_name.exists():
self.irpa_path = output_shard_file_name
Expand All @@ -910,15 +913,15 @@ def testBenchmark405B_fp8_TP8_Non_Decomposed(self):
)
# benchmark prefill
self.llama405b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path_fp8,
args=self.iree_run_prefill_args,
cwd=self.repo_root,
)
# benchmark decode
self.llama405b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
hip_device_id=self.hip_device_id,
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path_fp8,
args=self.iree_run_decode_args,
Expand Down

0 comments on commit 1896d7a

Please sign in to comment.