Skip to content

Commit

Permalink
test gpu ci for distributed (#1636)
Browse files (browse the repository at this point in the history)
Summary: Pull Request resolved: #1636

Differential Revision: D52841757
  • Loading branch information
henrylhtsang authored and facebook-github-bot committed Jan 17, 2024
1 parent e88c06e commit c729d11
Showing 1 changed file with 39 additions and 15 deletions.
54 changes: 39 additions & 15 deletions .github/workflows/unittest_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,33 @@
name: Unit Test CI

on:
# TODO: re-enable when GPU unit tests are working
# push:
# paths-ignore:
# - "docs/*"
# - "third_party/*"
# - .gitignore
# - "*.md"
# pull_request:
# paths-ignore:
# - "docs/*"
# - "third_party/*"
# - .gitignore
# - "*.md"
pull_request:
# paths-ignore:
# - "docs/*"
# - "third_party/*"
# - .gitignore
# - "*.md"
workflow_dispatch:

jobs:
# build on cpu hosts and upload to GHA
build_on_cpu:
runs-on: ${{ matrix.os }}
timeout-minutes: 30
strategy:
matrix:
include:
- os: linux.2xlarge
# ideally we would also run on 3.9 and 3.10, but we are limited in resources.
python-version: 3.8
python-tag: "py38"
cuda-tag: "cu11"
cuda-tag: "cu118"
steps:
# Check out the repository to the GitHub Actions runner
- name: Check ldd --version
Expand Down Expand Up @@ -83,6 +83,8 @@ jobs:
# here is the issue: https://github.com/conda/conda/issues/10972
- name: Build TorchRec Binary
run: |
export CU_VERSION=${{ matrix.cuda-tag }}
export CHANNEL="nightly"
conda run -n build_binary \
python setup.py bdist_wheel \
--python-tag=${{ matrix.python-tag }}
Expand All @@ -95,11 +97,12 @@ jobs:
# download from GHA, test on gpu
test_on_gpu:
runs-on: ${{ matrix.os }}
timeout-minutes: 30
strategy:
matrix:
os: [linux.4xlarge.nvidia.gpu]
os: [linux.8xlarge.nvidia.gpu]
python-version: [3.8]
cuda-tag: ["cu11"]
cuda-tag: ["cu118"]
needs: build_on_cpu
# the glibc version should match the one used to build the binary;
# in this case, it's 2.26
Expand Down Expand Up @@ -165,12 +168,29 @@ jobs:
- name: Install PyTorch and CUDA
shell: bash
run: |
conda install -n build_binary -y pytorch pytorch-cuda=11.8 -c pytorch-nightly -c nvidia
- name: Install fbgemm
conda run -n build_binary \
python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu118
- name: Test torch installation
shell: bash
run: |
conda run -n build_binary \
python -c "import torch"
- name: Install FBGEMM
shell: bash
run: |
conda run -n build_binary \
pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu118
python -m pip install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu118
- name: Test fbgemm installation
shell: bash
run: |
conda run -n build_binary \
python -c "import fbgemm_gpu"
- name: Test cuda
shell: bash
run: |
conda run -n build_binary \
python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count())"
nvidia-smi
# download wheel from GHA
- name: Download wheel
uses: actions/download-artifact@v2
Expand All @@ -181,7 +201,11 @@ jobs:
- name: Install TorchRec GPU
run: |
rm -r dist || true
conda run -n build_binary python -m pip install dist/*.whl
conda run -n build_binary python -m pip install *.whl
- name: Install Dependencies
shell: bash
run: |
conda run -n build_binary python -m pip install -r requirements.txt
- name: Test torchrec installation
shell: bash
run: |
Expand All @@ -192,4 +216,4 @@ jobs:
conda run -n build_binary \
python -m pip install pytest
conda run -n build_binary \
python -m pytest torchrec -v -s -W ignore::pytest.PytestCollectionWarning --continue-on-collection-errors
python -m pytest torchrec/distributed -v -s -W ignore::pytest.PytestCollectionWarning --continue-on-collection-errors

0 comments on commit c729d11

Please sign in to comment.