-
Notifications
You must be signed in to change notification settings - Fork 465
219 lines (215 loc) · 7.24 KB
/
unittest_ci.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# This workflow builds the TorchRec wheel on a CPU runner, then installs that wheel on a GPU runner and runs the distributed unit tests with pytest.
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: Unit Test CI

# Triggers: every pull request, plus manual runs via workflow_dispatch.
# The push trigger and the paths-ignore filters are intentionally kept
# commented out.
on:
  # push:
  #   paths-ignore:
  #     - "docs/*"
  #     - "third_party/*"
  #     - .gitignore
  #     - "*.md"
  pull_request:
    # paths-ignore:
    #   - "docs/*"
    #   - "third_party/*"
    #   - .gitignore
    #   - "*.md"
  workflow_dispatch:

jobs:
  # Build the TorchRec wheel on a CPU host and upload it as a GHA artifact.
  build_on_cpu:
    runs-on: ${{ matrix.os }}
    timeout-minutes: 60
    strategy:
      matrix:
        include:
          - os: linux.2xlarge
            # ideally we run on 3.9 and 3.10 as well, however we are limited
            # in resources.
            # Quoted so YAML keeps the version a string (an unquoted 3.10
            # would be read as the float 3.1).
            python-version: "3.8"
            python-tag: "py38"
            cuda-tag: "cu118"
    steps:
      - name: Check ldd --version
        run: ldd --version
      # Checkout the repository to the GitHub Actions runner
      - name: Checkout
        uses: actions/checkout@v2
      - name: Update pip
        run: |
          sudo yum update -y
          sudo yum -y install git python3-pip
          sudo pip3 install --upgrade pip
      - name: Setup conda
        run: |
          wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
          bash ~/miniconda.sh -b -p "$HOME/miniconda" -u
      - name: setup Path
        run: |
          # Put the conda binaries on PATH for all following steps. Use the
          # same $HOME-based prefix the installer was given above.
          echo "$HOME/miniconda/bin" >> "$GITHUB_PATH"
          # CONDA is an environment variable, so it must be written to
          # GITHUB_ENV; appending "CONDA=..." to GITHUB_PATH only adds a
          # bogus directory entry to PATH and never sets the variable.
          echo "CONDA=$HOME/miniconda" >> "$GITHUB_ENV"
      - name: create conda env
        run: |
          # -y: never wait on a confirmation prompt in a non-interactive job.
          conda create -y --name build_binary python=${{ matrix.python-version }}
          conda info
      - name: check python version no Conda
        run: |
          python --version
      - name: check python version
        run: |
          conda run -n build_binary python --version
      - name: Install C/C++ compilers
        run: |
          sudo yum install -y gcc gcc-c++
      - name: Install PyTorch and CUDA
        shell: bash
        run: |
          conda install -n build_binary -y pytorch pytorch-cuda=11.8 -c pytorch-nightly -c nvidia
      - name: Install Dependencies
        shell: bash
        run: |
          conda run -n build_binary python -m pip install -r requirements.txt
      - name: Test Installation of dependencies
        run: |
          conda run -n build_binary python -c "import torch.distributed"
          echo "torch.distributed succeeded"
          conda run -n build_binary python -c "import skbuild"
          echo "skbuild succeeded"
          conda run -n build_binary python -c "import numpy"
          echo "numpy succeeded"
      # for the conda run with quotes, we have to use "\" and double quotes
      # here is the issue: https://github.com/conda/conda/issues/10972
      - name: Build TorchRec Binary
        run: |
          export CU_VERSION=${{ matrix.cuda-tag }}
          export CHANNEL="nightly"
          conda run -n build_binary \
            python setup.py bdist_wheel \
            --python-tag=${{ matrix.python-tag }}
      # The artifact name must stay in sync with the "Download wheel" step of
      # test_on_gpu.
      # NOTE(review): the v2 artifact actions are old; if they are upgraded,
      # upload and download must be bumped together - confirm that the
      # runners support the newer action runtime first.
      - name: Upload wheel as GHA artifact
        uses: actions/upload-artifact@v2
        with:
          name: torchrec_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl
          path: dist/*.whl

  # Download the wheel from GHA, install it on a GPU host and run the tests.
  test_on_gpu:
    runs-on: ${{ matrix.os }}
    timeout-minutes: 30
    strategy:
      matrix:
        os: [linux.g4dn.12xlarge.nvidia.gpu]
        # Quoted for the same reason as in build_on_cpu; the rendered value
        # is part of the artifact name, so both jobs must agree on it.
        python-version: ["3.8"]
        cuda-tag: ["cu118"]
    needs: build_on_cpu
    # the glibc version should match the version of the one we used to build
    # the binary; for this case, it's 2.26
    steps:
      - name: Check ldd --version
        run: ldd --version
      - name: check cpu info
        shell: bash
        run: |
          cat /proc/cpuinfo
      - name: check distribution info
        shell: bash
        run: |
          cat /proc/version
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
      - name: check gpu info
        shell: bash
        run: |
          sudo yum install lshw -y
          sudo lshw -C display
      # Checkout the repository to the GitHub Actions runner
      - name: Checkout
        uses: actions/checkout@v2
      - name: Update pip
        run: |
          sudo yum update -y
          sudo yum -y install git python3-pip
          sudo pip3 install --upgrade pip
      - name: Setup conda
        run: |
          wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
          # -u matches the build job and lets the installer reuse an existing
          # prefix instead of aborting.
          bash ~/miniconda.sh -b -p "$HOME/miniconda" -u
      - name: setup Path
        run: |
          # Put the conda binaries on PATH for all following steps. Use the
          # same $HOME-based prefix the installer was given above.
          echo "$HOME/miniconda/bin" >> "$GITHUB_PATH"
          # CONDA is an environment variable, so it must be written to
          # GITHUB_ENV; appending "CONDA=..." to GITHUB_PATH only adds a
          # bogus directory entry to PATH and never sets the variable.
          echo "CONDA=$HOME/miniconda" >> "$GITHUB_ENV"
      - name: create conda env
        run: |
          # -y: never wait on a confirmation prompt in a non-interactive job.
          conda create -y --name build_binary python=${{ matrix.python-version }}
          conda info
      - name: check python version no Conda
        run: |
          python --version
      - name: check python version
        run: |
          conda run -n build_binary python --version
      - name: Install C/C++ compilers
        run: |
          sudo yum install -y gcc gcc-c++
      - name: Install PyTorch and CUDA
        shell: bash
        run: |
          conda run -n build_binary \
            python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu118
      - name: Test torch installation
        shell: bash
        run: |
          conda run -n build_binary \
            python -c "import torch"
      - name: Install FBGEMM
        shell: bash
        run: |
          conda run -n build_binary \
            python -m pip install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu118
      - name: Test fbgemm installation
        shell: bash
        run: |
          conda run -n build_binary \
            python -c "import fbgemm_gpu"
      - name: Test cuda
        shell: bash
        run: |
          conda run -n build_binary \
            python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count())"
          nvidia-smi
      # download wheel from GHA; the name must match the upload step of
      # build_on_cpu
      - name: Download wheel
        uses: actions/download-artifact@v2
        with:
          name: torchrec_${{ matrix.python-version }}_${{ matrix.cuda-tag }}.whl
      - name: Display structure of downloaded files
        run: ls -R
      - name: Install TorchRec GPU
        run: |
          # Best-effort cleanup of any dist directory before installing the
          # downloaded wheel from the workspace root.
          rm -r dist || true
          conda run -n build_binary python -m pip install *.whl
      - name: Install Dependencies
        shell: bash
        run: |
          conda run -n build_binary python -m pip install -r requirements.txt
      - name: Test torchrec installation
        shell: bash
        run: |
          conda run -n build_binary \
            python -c "import torchrec"
      # Run unit tests
      - name: Test with pytest
        run: |
          conda run -n build_binary \
            python -m pip install pytest
          conda run -n build_binary \
            python -m pytest torchrec/distributed -v -s \
            -W ignore::pytest.PytestCollectionWarning \
            --continue-on-collection-errors