# Source: workflow file for the run "test gpu ci for distributed #1486".
# Viewer notice: this file contains bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters.
# This workflow builds the TorchRec wheel on a CPU host, uploads it as a GitHub
# Actions artifact, then installs that wheel on a GPU host and runs the
# torchrec/distributed unit tests with pytest.
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Unit Test CI

# Triggers: every pull request, plus manual runs from the Actions tab.
# The push trigger and the paths-ignore filters are intentionally left
# commented out; uncomment them to re-enable.
on:
  # push:
  #   paths-ignore:
  #     - "docs/*"
  #     - "third_party/*"
  #     - .gitignore
  #     - "*.md"
  pull_request:
  #   paths-ignore:
  #     - "docs/*"
  #     - "third_party/*"
  #     - .gitignore
  #     - "*.md"
  workflow_dispatch:

jobs:
# build on cpu hosts and upload to GHA
  build_on_cpu:
    runs-on: ${{ matrix.os }}
    timeout-minutes: 60
    strategy:
      matrix:
        include:
          - os: linux.2xlarge
            # ideally we run on 3.9 and 3.10 as well, however we are limited in resources.
            # Quoted so the version stays a string (an unquoted 3.10 would parse as 3.1).
            python-version: "3.8"
            python-tag: "py38"
            cuda-tag: "cu118"
    steps:
      # Record the host glibc version; the test job prints the same so the two can be compared.
      - name: Check ldd --version
        run: ldd --version
      # Checkout the repository to the GitHub Actions runner
      - name: Checkout
        uses: actions/checkout@v2
      - name: Update pip
        run: |
          sudo yum update -y
          sudo yum -y install git python3-pip
          sudo pip3 install --upgrade pip
      - name: Setup conda
        run: |
          wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
          bash ~/miniconda.sh -b -p $HOME/miniconda -u
      - name: setup Path
        run: |
          # GITHUB_PATH takes directories to prepend to PATH for later steps;
          # environment variables must be written to GITHUB_ENV instead.
          # Use $HOME so the paths match the install prefix chosen above.
          echo "$HOME/miniconda/bin" >> "$GITHUB_PATH"
          echo "CONDA=$HOME/miniconda" >> "$GITHUB_ENV"
      - name: create conda env
        run: |
          # -y keeps the non-interactive run from depending on a confirmation prompt.
          conda create -y --name build_binary python=${{ matrix.python-version }}
          conda info
      - name: check python version no Conda
        run: |
          python --version
      - name: check python version
        run: |
          conda run -n build_binary python --version
      - name: Install C/C++ compilers
        run: |
          sudo yum install -y gcc gcc-c++
      - name: Install PyTorch and CUDA
        shell: bash
        run: |
          conda install -n build_binary -y pytorch pytorch-cuda=11.8 -c pytorch-nightly -c nvidia
      - name: Install Dependencies
        shell: bash
        run: |
          conda run -n build_binary python -m pip install -r requirements.txt
      # Import each build prerequisite separately so a failure names the missing package.
      - name: Test Installation of dependencies
        run: |
          conda run -n build_binary python -c "import torch.distributed"
          echo "torch.distributed succeeded"
          conda run -n build_binary python -c "import skbuild"
          echo "skbuild succeeded"
          conda run -n build_binary python -c "import numpy"
          echo "numpy succeeded"
      # for the conda run with quotes, we have to use "\" and double quotes
      # here is the issue: https://github.com/conda/conda/issues/10972
      - name: Build TorchRec Binary
        run: |
          export CU_VERSION=${{ matrix.cuda-tag }}
          export CHANNEL="nightly"
          conda run -n build_binary \
            python setup.py bdist_wheel \
            --python-tag=${{ matrix.python-tag }}
      # The artifact name must stay in sync with the download step of the test job.
      - name: Upload wheel as GHA artifact
        uses: actions/upload-artifact@v2
        with:
          name: torchrec_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl
          path: dist/*.whl
# download from GHA, test on gpu
  test_on_gpu:
    runs-on: ${{ matrix.os }}
    timeout-minutes: 30
    strategy:
      matrix:
        os: [linux.g4dn.12xlarge.nvidia.gpu]
        # Quoted so the version stays a string (an unquoted 3.10 would parse as 3.1).
        python-version: ["3.8"]
        cuda-tag: ["cu118"]
    # Runs only after the wheel has been built and uploaded.
    needs: build_on_cpu
    # the glibc version should match the version of the one we used to build the binary
    # for this case, it's 2.26
    steps:
      - name: Check ldd --version
        run: ldd --version
      - name: check cpu info
        shell: bash
        run: |
          cat /proc/cpuinfo
      - name: check distribution info
        shell: bash
        run: |
          cat /proc/version
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
      - name: check gpu info
        shell: bash
        run: |
          sudo yum install lshw -y
          sudo lshw -C display
      # Checkout the repository to the GitHub Actions runner
      - name: Checkout
        uses: actions/checkout@v2
      - name: Update pip
        run: |
          sudo yum update -y
          sudo yum -y install git python3-pip
          sudo pip3 install --upgrade pip
      - name: Setup conda
        run: |
          wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
          # -u matches the build job and lets the installer reuse an existing prefix.
          bash ~/miniconda.sh -b -p $HOME/miniconda -u
      - name: setup Path
        run: |
          # GITHUB_PATH takes directories to prepend to PATH for later steps;
          # environment variables must be written to GITHUB_ENV instead.
          # Use $HOME so the paths match the install prefix chosen above.
          echo "$HOME/miniconda/bin" >> "$GITHUB_PATH"
          echo "CONDA=$HOME/miniconda" >> "$GITHUB_ENV"
      - name: create conda env
        run: |
          # -y keeps the non-interactive run from depending on a confirmation prompt.
          conda create -y --name build_binary python=${{ matrix.python-version }}
          conda info
      - name: check python version no Conda
        run: |
          python --version
      - name: check python version
        run: |
          conda run -n build_binary python --version
      - name: Install C/C++ compilers
        run: |
          sudo yum install -y gcc gcc-c++
      - name: Install PyTorch and CUDA
        shell: bash
        run: |
          conda run -n build_binary \
            python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu118
      - name: Test torch installation
        shell: bash
        run: |
          conda run -n build_binary \
            python -c "import torch"
      - name: Install FBGEMM
        shell: bash
        run: |
          conda run -n build_binary \
            python -m pip install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu118
      - name: Test fbgemm installation
        shell: bash
        run: |
          conda run -n build_binary \
            python -c "import fbgemm_gpu"
      # Print CUDA availability and device count as seen by torch, then the driver's view.
      - name: Test cuda
        shell: bash
        run: |
          conda run -n build_binary \
            python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count())"
          nvidia-smi
      # download wheel from GHA
      # The artifact name must stay in sync with the upload step of the build job.
      - name: Download wheel
        uses: actions/download-artifact@v2
        with:
          name: torchrec_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl
      - name: Display structure of downloaded files
        run: ls -R
      - name: Install TorchRec GPU
        run: |
          # Best-effort removal of any dist directory; a missing one is not an error.
          rm -r dist || true
          conda run -n build_binary python -m pip install *.whl
      - name: Install Dependencies
        shell: bash
        run: |
          conda run -n build_binary python -m pip install -r requirements.txt
      - name: Test torchrec installation
        shell: bash
        run: |
          conda run -n build_binary \
            python -c "import torchrec"
      # Run unit tests
      - name: Test with pytest
        run: |
          conda run -n build_binary \
            python -m pip install pytest
          conda run -n build_binary \
            python -m pytest torchrec/distributed -v -s -W ignore::pytest.PytestCollectionWarning --continue-on-collection-errors