From 38819604ac83eddf1ac1993f0343ceed9db82307 Mon Sep 17 00:00:00 2001 From: vikram singh shekhawat Date: Tue, 4 Jun 2024 13:13:50 +0300 Subject: [PATCH 1/6] Upgrade HPU image to v1.16.0 Upgrade HPU/Gaudi image to v1.16.0. --- .github/workflows/hpu-gaudi2.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml index ba337cb83bf3..4423bd0001ec 100644 --- a/.github/workflows/hpu-gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -39,13 +39,14 @@ jobs: # The type of runner that the job will run on runs-on: [self-hosted, intel, gaudi2] container: - image: vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest + image: vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest ports: - 80 options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice env: PT_HPU_LAZY_MODE: 0 + TORCHINDUCTOR_COMPILE_THREADS: 1 TEST_LIST: | test_accelerator.py test_autotuning.py @@ -129,7 +130,7 @@ jobs: unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE} + export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS} TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}') echo "TEST_LIST ${TEST_LIST}" - echo "PT_HPU_LAZY_MODE ${PT_HPU_LAZY_MODE}" pytest --verbose unit/ -k "${TEST_LIST}" From a71ce4a2a77344c9ddd0b987f9721fbe73b9fe0d Mon Sep 17 00:00:00 2001 From: vikram singh shekhawat Date: Wed, 26 Jun 2024 14:17:40 +0300 Subject: [PATCH 2/6] Updated HPU image to 1.16.2. --- .github/workflows/hpu-gaudi2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml index 46aabb54d82a..ffada46f1ca7 100644 --- a/.github/workflows/hpu-gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -39,7 +39,7 @@ jobs: # The type of runner that the job will run on runs-on: [self-hosted, intel, gaudi2] container: - image: vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest + image: vault.habana.ai/gaudi-docker/1.16.2/ubuntu22.04/habanalabs/pytorch-installer-2.2.2/:latest ports: - 80 options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice From f47840fe087c282f22a4691fd5acf26e9d577966 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Wed, 26 Jun 2024 07:54:53 -0700 Subject: [PATCH 3/6] Fix typo on docker image name --- .github/workflows/hpu-gaudi2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml index ffada46f1ca7..8fcf2bb29a63 100644 --- a/.github/workflows/hpu-gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -39,7 +39,7 @@ jobs: # The type of runner that the job will run on runs-on: [self-hosted, intel, gaudi2] container: - image: vault.habana.ai/gaudi-docker/1.16.2/ubuntu22.04/habanalabs/pytorch-installer-2.2.2/:latest + image: vault.habana.ai/gaudi-docker/1.16.2/ubuntu22.04/habanalabs/pytorch-installer-2.2.2/latest ports: - 80 options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice From 105c4b4868537ead8523192ea546ea43ec71b0c1 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Wed, 26 Jun 2024 07:57:53 -0700 Subject: [PATCH 4/6] Typo part 2 --- .github/workflows/hpu-gaudi2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml index 8fcf2bb29a63..bead08211ee8 100644 --- a/.github/workflows/hpu-gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -39,7 +39,7 @@ jobs: # The type of runner that the job will run on runs-on: [self-hosted, intel, gaudi2] container: - image: vault.habana.ai/gaudi-docker/1.16.2/ubuntu22.04/habanalabs/pytorch-installer-2.2.2/latest + image: vault.habana.ai/gaudi-docker/1.16.2/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest ports: - 80 options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice From 457ba74c14708c539b31889a5b124bc549504e4d Mon Sep 17 00:00:00 2001 From: vikram singh shekhawat Date: Thu, 27 Jun 2024 13:43:17 +0300 Subject: [PATCH 5/6] Enabled dmesg logs and added hl-smi list command. --- .github/workflows/hpu-gaudi2.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml index bead08211ee8..6949b830ab64 100644 --- a/.github/workflows/hpu-gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -105,6 +105,8 @@ jobs: run: | ldd --version hl-smi + hl-smi -L + dmesg -T python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" From 3ec2c438f3ffb183f1fdcd108be1ca4564a43079 Mon Sep 17 00:00:00 2001 From: vikram singh shekhawat Date: Wed, 17 Jul 2024 07:11:11 +0300 Subject: [PATCH 6/6] Remove dmesg and duplicate hl-smi cmd from hpu-gaudi2 workflow. --- .github/workflows/hpu-gaudi2.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml index 6949b830ab64..ac19638e67de 100644 --- a/.github/workflows/hpu-gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -104,9 +104,7 @@ jobs: - name: Check container state run: | ldd --version - hl-smi hl-smi -L - dmesg -T python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())"