Skip to content

Loss too big when using TP #1466

Loss too big when using TP

Loss too big when using TP #1466

name: Optimum Neuron - Common tests on Trainium
on:
push:
branches: [ main ]
paths:
- "setup.py"
- "optimum/**.py"
- ".github/workflows/test_trainium_common.yml"
pull_request:
branches: [ main ]
paths:
- "setup.py"
- "optimum/**.py"
- ".github/workflows/test_trainium_common.yml"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
optimum-neuron-tests:
name: Run common tests on Trainium 1
runs-on:
group: aws-trn1-32xlarge
env:
TESTS_TO_IGNORE_FLAGS: --ignore tests/distributed/ --ignore tests/test_examples.py
steps:
- name: Install Neuron runtime
run: |
. /etc/os-release
sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
EOF
wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
sudo apt-get update -y
sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.14.0-6e27b8d5b aws-neuronx-collectives=2.22.26.0-17a033bc8 -y
export PATH=/opt/aws/neuron/bin:$PATH
- name: Install cv2 dependencies
run: |
sudo apt-get install ffmpeg libsm6 libxext6 -y
- name: Checkout
uses: actions/checkout@v2
- name: Install python dependencies
run: |
sudo apt install python3-venv python3-dev -y
python3 -m venv aws_neuron_venv_pytorch
source aws_neuron_venv_pytorch/bin/activate
python -m pip install -U pip
pip install --upgrade setuptools==69.5.1
python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
python -m pip install .[neuronx,tests]
- name: Collect tests on Neuron Cores
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --collect-only
- name: Run tests on Neuron cores
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --durations=0 -v
- name: Collect staging tests on Neuron Cores
run: |
source aws_neuron_venv_pytorch/bin/activate
HUGGINGFACE_CO_STAGING=1 pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests -s --collect-only
- name: Run staging tests on Neuron cores
run: |
source aws_neuron_venv_pytorch/bin/activate
HUGGINGFACE_CO_STAGING=1 pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests -s