test gpu ci for distributed #1483

Workflow file for this run

# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: Unit Test CI
on:
  # push:
  #   paths-ignore:
  #     - "docs/*"
  #     - "third_party/*"
  #     - .gitignore
  #     - "*.md"
  pull_request:
    # paths-ignore:
    #   - "docs/*"
    #   - "third_party/*"
    #   - .gitignore
    #   - "*.md"
  workflow_dispatch:
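# Note: with the path filters commented out, pull_request triggers on every PR;
# workflow_dispatch additionally allows manual runs from the Actions tab.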
jobs:
  # build on cpu hosts and upload to GHA
  build_on_cpu:
    runs-on: ${{ matrix.os }}
    timeout-minutes: 30
    strategy:
      matrix:
        include:
          - os: linux.2xlarge
            # Ideally we would run on 3.9 and 3.10 as well, but we are limited in resources.
            python-version: 3.8
            python-tag: "py38"
            cuda-tag: "cu118"
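            # python-version and cuda-tag feed into the wheel artifact name below;
            # python-tag is passed to bdist_wheel when the wheel is built.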
    steps:
      - name: Check ldd --version
        run: ldd --version
      # Checkout the repository to the GitHub Actions runner
      - name: Checkout
        uses: actions/checkout@v2
      - name: Update pip
        run: |
          sudo yum update -y
          sudo yum -y install git python3-pip
          sudo pip3 install --upgrade pip
      - name: Setup conda
        run: |
          wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
          bash ~/miniconda.sh -b -p $HOME/miniconda -u
      - name: Setup Path
        run: |
          echo "/home/ec2-user/miniconda/bin" >> $GITHUB_PATH
          echo "CONDA=/home/ec2-user/miniconda" >> $GITHUB_ENV
      - name: Create conda env
        run: |
          conda create --name build_binary python=${{ matrix.python-version }}
          conda info
      - name: Check Python version (no Conda)
        run: |
          python --version
      - name: Check Python version (Conda)
        run: |
          conda run -n build_binary python --version
      - name: Install C/C++ compilers
        run: |
          sudo yum install -y gcc gcc-c++
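      # The pytorch-cuda version below (11.8) should line up with the cuda-tag (cu118)
      # used for the wheel name and with the CUDA wheel index used in the GPU test job.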
      - name: Install PyTorch and CUDA
        shell: bash
        run: |
          conda install -n build_binary -y pytorch pytorch-cuda=11.8 -c pytorch-nightly -c nvidia
      - name: Install Dependencies
        shell: bash
        run: |
          conda run -n build_binary python -m pip install -r requirements.txt
      - name: Test Installation of dependencies
        run: |
          conda run -n build_binary python -c "import torch.distributed"
          echo "torch.distributed succeeded"
          conda run -n build_binary python -c "import skbuild"
          echo "skbuild succeeded"
          conda run -n build_binary python -c "import numpy"
          echo "numpy succeeded"
      # For conda run commands that contain quotes, we have to escape with "\" and use double quotes.
      # Here is the issue: https://github.com/conda/conda/issues/10972
      - name: Build TorchRec Binary
        run: |
          export CU_VERSION=${{ matrix.cuda-tag }}
          export CHANNEL="nightly"
          conda run -n build_binary \
            python setup.py bdist_wheel \
            --python-tag=${{ matrix.python-tag }}
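      # setup.py bdist_wheel writes the wheel into dist/, which is what the upload step below picks up.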
      - name: Upload wheel as GHA artifact
        uses: actions/upload-artifact@v2
        with:
          name: torchrec_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl
          path: dist/*.whl
  # download from GHA, test on gpu
  test_on_gpu:
    runs-on: ${{ matrix.os }}
    timeout-minutes: 30
    strategy:
      matrix:
        os: [linux.g5.12xlarge.nvidia.gpu]
        python-version: [3.8]
        cuda-tag: ["cu118"]
    needs: build_on_cpu
    # The glibc version on this runner should match the one used to build the binary;
    # in this case, it's 2.26.
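    # needs: build_on_cpu makes this job wait for the CPU build so it can download the wheel artifact produced there.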
    steps:
      - name: Check ldd --version
        run: ldd --version
      - name: Check cpu info
        shell: bash
        run: |
          cat /proc/cpuinfo
      - name: Check distribution info
        shell: bash
        run: |
          cat /proc/version
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
      - name: Check gpu info
        shell: bash
        run: |
          sudo yum install lshw -y
          sudo lshw -C display
      # Checkout the repository to the GitHub Actions runner
      - name: Checkout
        uses: actions/checkout@v2
      - name: Update pip
        run: |
          sudo yum update -y
          sudo yum -y install git python3-pip
          sudo pip3 install --upgrade pip
      - name: Setup conda
        run: |
          wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
          bash ~/miniconda.sh -b -p $HOME/miniconda
      - name: Setup Path
        run: |
          echo "/home/ec2-user/miniconda/bin" >> $GITHUB_PATH
          echo "CONDA=/home/ec2-user/miniconda" >> $GITHUB_ENV
      - name: Create conda env
        run: |
          conda create --name build_binary python=${{ matrix.python-version }}
          conda info
      - name: Check Python version (no Conda)
        run: |
          python --version
      - name: Check Python version (Conda)
        run: |
          conda run -n build_binary python --version
      - name: Install C/C++ compilers
        run: |
          sudo yum install -y gcc gcc-c++
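      # In the GPU job, torch is installed from the nightly cu118 wheel index (rather than conda),
      # presumably so its CUDA build lines up with the cu118 tag of the wheel built above.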
      - name: Install PyTorch and CUDA
        shell: bash
        run: |
          conda run -n build_binary \
            python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu118
      - name: Test torch installation
        shell: bash
        run: |
          conda run -n build_binary \
            python -c "import torch"
      - name: Install FBGEMM
        shell: bash
        run: |
          conda run -n build_binary \
            python -m pip install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu118
      - name: Test fbgemm installation
        shell: bash
        run: |
          conda run -n build_binary \
            python -c "import fbgemm_gpu"
      - name: Test cuda
        shell: bash
        run: |
          conda run -n build_binary \
            python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count())"
          nvidia-smi
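      # On the linux.g5.12xlarge.nvidia.gpu runner, cuda.is_available() should be True and
      # device_count() / nvidia-smi should report the instance's NVIDIA GPUs.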
      # download wheel from GHA
      - name: Download wheel
        uses: actions/download-artifact@v2
        with:
          name: torchrec_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl
      - name: Display structure of downloaded files
        run: ls -R
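      # rm -r dist below clears any leftover dist/ directory, likely so the wheel downloaded
      # above (at the workspace root) is the one that gets installed.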
      - name: Install TorchRec GPU
        run: |
          rm -r dist || true
          conda run -n build_binary python -m pip install *.whl
      - name: Install Dependencies
        shell: bash
        run: |
          conda run -n build_binary python -m pip install -r requirements.txt
      - name: Test torchrec installation
        shell: bash
        run: |
          conda run -n build_binary \
            python -c "import torchrec"
      # Run unit tests
      - name: Test with pytest
        run: |
          conda run -n build_binary \
            python -m pip install pytest
          conda run -n build_binary \
            python -m pytest torchrec -v -s -W ignore::pytest.PytestCollectionWarning --continue-on-collection-errors