From 2989cf7707e82d81b3663150916177fa8a9007c8 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 12 Mar 2024 15:21:28 -0700 Subject: [PATCH] Add CI for Habana Labs HPU/Gaudi2 (#5244) Add basic workflow that tests on hpu-gaudi2. Currently, ops are not implemented, so the full unit tests are not yet enabled. --- .github/workflows/hpu-gaudi2.yml | 48 ++++++++++++++++++++++++++++++++ tests/unit/util.py | 3 ++ 2 files changed, 51 insertions(+) create mode 100644 .github/workflows/hpu-gaudi2.yml diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml new file mode 100644 index 000000000000..a64a337d50af --- /dev/null +++ b/.github/workflows/hpu-gaudi2.yml @@ -0,0 +1,48 @@ +name: hpu-gaudi2 + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - ".github/workflows/hpu-gaudi2.yml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + +jobs: + unit-tests: + # The type of runner that the job will run on + runs-on: [self-hosted, intel, gaudi2] + container: + image: vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + ports: + - 80 + options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v3 + + - name: Check container state + run: | + ldd --version + hl-smi + python -c "import torch; print('torch:', torch.__version__, torch)" + python -c "import torch; print('CUDA available:', torch.cuda.is_available())" + + - name: Install deepspeed + run: | + pip install .[dev] + ds_report + + - name: Python environment + run: | + pip list diff --git a/tests/unit/util.py b/tests/unit/util.py index 75c3000bd4a2..e8e0f476371b 100644 --- a/tests/unit/util.py +++ b/tests/unit/util.py @@ -47,11 +47,14 @@ def bf16_required_version_check(accelerator_check=True): cuda_version_available = CUDA_MAJOR >= 11 nccl_version_available = NCCL_MAJOR > 2 or (NCCL_MAJOR == 2 and NCCL_MINOR >= 10) npu_available = get_accelerator().device_name() == 'npu' + hpu_available = get_accelerator().device_name() == 'hpu' if torch_version_available and cuda_version_available and nccl_version_available and accelerator_pass: return True elif npu_available: return True + elif hpu_available: + return True else: return False