# Summary of the changes carried by this patch:
#   * validate_binaries.sh: the conda environment name lives in the exported
#     CONDA_ENV variable ("build_binary"); the conda create / conda run
#     invocations shown here refer to "${CONDA_ENV}" instead of the literal name.
#   * When CUDA_VERSION matches cu*, LD_LIBRARY_PATH is stored as a conda
#     environment variable so fbgemm_gpu can locate libnvrtc.so. The inherited
#     LD_LIBRARY_PATH is appended only when it is non-empty, so the stored value
#     never ends in ":" (an empty entry makes the loader search the current
#     directory).
#   * The find name pattern is quoted so the shell hands it to find unexpanded.
#   * validate-binaries.yml: adds a pull_request trigger that ignores docs/*,
#     third_party/*, .gitignore and *.md.
diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index 85ad0de47..6750b5b74 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -7,8 +7,9 @@ export PYTORCH_CUDA_PKG="" +export CONDA_ENV="build_binary" -conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}" +conda create -y -n "${CONDA_ENV}" python="${MATRIX_PYTHON_VERSION}" -conda run -n build_binary python --version +conda run -n "${CONDA_ENV}" python --version @@ -49,41 +50,60 @@ elif [[ ${MATRIX_CHANNEL} = 'release' ]]; then export PYTORCH_URL="https://download.pytorch.org/whl/${CUDA_VERSION}" fi + +echo "CU_VERSION: ${CUDA_VERSION}" +echo "MATRIX_CHANNEL: ${MATRIX_CHANNEL}" +echo "CONDA_ENV: ${CONDA_ENV}" + +# shellcheck disable=SC2155 +export CONDA_PREFIX=$(conda run -n "${CONDA_ENV}" printenv CONDA_PREFIX) + +find / -name '*cuda*' + +if [[ $CUDA_VERSION = cu* ]]; then + # Setting LD_LIBRARY_PATH fixes the runtime error with fbgemm_gpu not + # being able to locate libnvrtc.so + echo "[NOVA] Setting LD_LIBRARY_PATH ..." 
+ conda env config vars set -n "${CONDA_ENV}" \ + LD_LIBRARY_PATH="/usr/local/lib:/usr/lib64:${CONDA_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" +fi + + # install pytorch # switch back to conda once torch nightly is fixed # if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then # export PYTORCH_CUDA_PKG="pytorch-cuda=${MATRIX_GPU_ARCH_VERSION}" # fi -conda run -n build_binary pip install torch --index-url "$PYTORCH_URL" +conda run -n "${CONDA_ENV}" pip install torch --index-url "$PYTORCH_URL" # install fbgemm -conda run -n build_binary pip install fbgemm-gpu --index-url "$PYTORCH_URL" +conda run -n "${CONDA_ENV}" pip install fbgemm-gpu --index-url "$PYTORCH_URL" # install requirements from pypi -conda run -n build_binary pip install torchmetrics==1.0.3 +conda run -n "${CONDA_ENV}" pip install torchmetrics==1.0.3 # install torchrec -conda run -n build_binary pip install torchrec --index-url "$PYTORCH_URL" +conda run -n "${CONDA_ENV}" pip install torchrec --index-url "$PYTORCH_URL" # Run small import test -conda run -n build_binary python -c "import torch; import fbgemm_gpu; import torchrec" +conda run -n "${CONDA_ENV}" python -c "import torch; import fbgemm_gpu; import torchrec" # check directory ls -R # check if cuda available -conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())" +conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.cuda.is_available())" # check cuda version -conda run -n build_binary python -c "import torch; print(torch.version.cuda)" +conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.version.cuda)" # Finally run smoke test # python 3.11 needs torchx-nightly -conda run -n build_binary pip install torchx-nightly iopath +conda run -n "${CONDA_ENV}" pip install torchx-nightly iopath if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then - conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py + conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --gpu 2 
--script test_installation.py else - conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only + conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only fi @@ -93,8 +113,8 @@ if [[ ${MATRIX_CHANNEL} != 'release' ]]; then exit 0 else # Check version matches only for release binaries - torchrec_version=$(conda run -n build_binary pip show torchrec | grep Version | cut -d' ' -f2) - fbgemm_version=$(conda run -n build_binary pip show fbgemm_gpu | grep Version | cut -d' ' -f2) + torchrec_version=$(conda run -n "${CONDA_ENV}" pip show torchrec | grep Version | cut -d' ' -f2) + fbgemm_version=$(conda run -n "${CONDA_ENV}" pip show fbgemm_gpu | grep Version | cut -d' ' -f2) if [ "$torchrec_version" != "$fbgemm_version" ]; then echo "Error: TorchRec package version does not match FBGEMM package version" @@ -102,22 +122,22 @@ else fi fi -conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}" +conda create -y -n "${CONDA_ENV}" python="${MATRIX_PYTHON_VERSION}" -conda run -n build_binary python --version +conda run -n "${CONDA_ENV}" python --version if [[ ${MATRIX_GPU_ARCH_VERSION} != '12.4' ]]; then exit 0 fi echo "checking pypi release" -conda run -n build_binary pip install torch -conda run -n build_binary pip install fbgemm-gpu -conda run -n build_binary pip install torchrec +conda run -n "${CONDA_ENV}" pip install torch +conda run -n "${CONDA_ENV}" pip install fbgemm-gpu +conda run -n "${CONDA_ENV}" pip install torchrec # Check version matching again for PyPI -torchrec_version=$(conda run -n build_binary pip show torchrec | grep Version | cut -d' ' -f2) -fbgemm_version=$(conda run -n build_binary pip show fbgemm_gpu | grep Version | cut -d' ' -f2) +torchrec_version=$(conda run -n "${CONDA_ENV}" pip show torchrec | grep Version | cut -d' ' -f2) +fbgemm_version=$(conda run -n "${CONDA_ENV}" pip show fbgemm_gpu | grep Version | cut -d' ' -f2) if [ 
"$torchrec_version" != "$fbgemm_version" ]; then echo "Error: TorchRec package version does not match FBGEMM package version" @@ -128,13 +148,13 @@ fi ls -R # check if cuda available -conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())" +conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.cuda.is_available())" # check cuda version -conda run -n build_binary python -c "import torch; print(torch.version.cuda)" +conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.version.cuda)" # python 3.11 needs torchx-nightly -conda run -n build_binary pip install torchx-nightly iopath +conda run -n "${CONDA_ENV}" pip install torchx-nightly iopath # Finally run smoke test -conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py +conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py diff --git a/.github/workflows/validate-binaries.yml b/.github/workflows/validate-binaries.yml index 248857214..98d69d721 100644 --- a/.github/workflows/validate-binaries.yml +++ b/.github/workflows/validate-binaries.yml @@ -1,6 +1,12 @@ name: Validate binaries on: + pull_request: + paths-ignore: + - "docs/*" + - "third_party/*" + - .gitignore + - "*.md" workflow_call: inputs: channel: