# Loss too big when using TP #1466
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: Optimum Neuron - Common tests on Trainium

# Run on pushes and pull requests targeting main, but only when the packaging
# script, Python sources under optimum/, or this workflow file are touched.
on:
  push:
    branches: [main]
    paths:
      - "setup.py"
      - "optimum/**.py"
      - ".github/workflows/test_trainium_common.yml"
  pull_request:
    branches: [main]
    paths:
      - "setup.py"
      - "optimum/**.py"
      - ".github/workflows/test_trainium_common.yml"
# Group runs by workflow name plus the pull request's head ref; when head_ref
# is empty the expression falls back to the run id. A newer run in the same
# group cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
jobs:
  optimum-neuron-tests:
    name: Run common tests on Trainium 1
    runs-on:
      group: aws-trn1-32xlarge
    env:
      # Paths skipped by every pytest invocation below. The variable is
      # expanded unquoted on purpose so the shell splits it into arguments.
      TESTS_TO_IGNORE_FLAGS: --ignore tests/distributed/ --ignore tests/test_examples.py
    steps:
      - name: Install Neuron runtime
        run: |
          . /etc/os-release
          sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
          deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
          EOF
          wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
          sudo apt-get update -y
          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.14.0-6e27b8d5b aws-neuronx-collectives=2.22.26.0-17a033bc8 -y
          export PATH=/opt/aws/neuron/bin:$PATH
          # `export` only affects this step's shell; GITHUB_PATH makes the
          # Neuron tools directory visible to all following steps as well.
          echo "/opt/aws/neuron/bin" >> "$GITHUB_PATH"

      - name: Install cv2 dependencies
        run: |
          sudo apt-get install ffmpeg libsm6 libxext6 -y

      - name: Checkout
        uses: actions/checkout@v4

      - name: Install python dependencies
        run: |
          sudo apt install python3-venv python3-dev -y
          python3 -m venv aws_neuron_venv_pytorch
          source aws_neuron_venv_pytorch/bin/activate
          python -m pip install -U pip
          pip install --upgrade setuptools==69.5.1
          python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
          # Quoted so the shell never treats the extras brackets as a glob.
          python -m pip install ".[neuronx,tests]"

      # Collection runs before the real test run in a separate step.
      - name: Collect tests on Neuron Cores
        env:
          # Passed through the environment instead of being interpolated
          # into the script text.
          HF_TOKEN: ${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }}
          USE_VENV: "false"
        run: |
          source aws_neuron_venv_pytorch/bin/activate
          pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --collect-only

      - name: Run tests on Neuron cores
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }}
          USE_VENV: "false"
        run: |
          source aws_neuron_venv_pytorch/bin/activate
          pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --durations=0 -v

      - name: Collect staging tests on Neuron Cores
        env:
          HUGGINGFACE_CO_STAGING: "1"
        run: |
          source aws_neuron_venv_pytorch/bin/activate
          pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests -s --collect-only

      - name: Run staging tests on Neuron cores
        env:
          HUGGINGFACE_CO_STAGING: "1"
        run: |
          source aws_neuron_venv_pytorch/bin/activate
          pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests -s