Skip to content

Commit

Permalink
Merge branch 'master' into loadams/lamb-bf16
Browse files Browse the repository at this point in the history
  • Loading branch information
loadams authored Jan 3, 2024
2 parents 023aa62 + 691458f commit 35aabdc
Show file tree
Hide file tree
Showing 456 changed files with 28,417 additions and 1,029 deletions.
56 changes: 0 additions & 56 deletions .github/workflows/amd-mi100.yml

This file was deleted.

1 change: 1 addition & 0 deletions .github/workflows/amd-mi200.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ jobs:
run: |
git clone https://github.com/ROCmSoftwarePlatform/apex.git
cd apex
git checkout torch_2.1_higher
CURRENT_VER=$(git rev-parse HEAD)
INSTALLED_VER=$(cat /blob/amd-apex/.venv_installed_version)
if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then
Expand Down
59 changes: 0 additions & 59 deletions .github/workflows/auto-sync.yml

This file was deleted.

6 changes: 0 additions & 6 deletions .github/workflows/cpu-inference.yml
Original file line number Diff line number Diff line change
@@ -1,13 +1,7 @@
name: cpu-inference

on:
pull_request:
paths-ignore:
- 'docs/**'
- 'blogs/**'
workflow_dispatch:
merge_group:
branches: [ master ]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/formatting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ concurrency:
jobs:

# formatting and basic install on cpu-only machine
formatting:
unit-tests:
runs-on: ubuntu-20.04

steps:
Expand Down
65 changes: 65 additions & 0 deletions .github/workflows/nv-a6000.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Workflow: inference-v2 unit tests plus the DeepSpeed-MII test suite,
# executed inside an NVIDIA PyTorch container on a self-hosted runner
# labelled `a6000`.
name: nv-a6000

on:
  # Only run for pull requests that touch the paths listed below.
  pull_request:
    paths:
      - 'accelerator/cuda_accelerator.py'
      - 'deepspeed/inference/v2/**'
      - 'tests/unit/inference/v2/**'
      - '.github/workflows/nv-a6000.yml'
  # Allow manual runs from the Actions tab.
  workflow_dispatch:

# One active run per workflow + ref; a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read
  issues: write

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, a6000]
    container:
      image: nvcr.io/nvidia/pytorch:23.03-py3
      ports:
        - 80
      options: --gpus all --shm-size "8G"

    steps:
      - uses: actions/checkout@v3

      # Print toolchain / driver / torch details so failures can be triaged
      # from the log alone.
      - name: Check container state
        run: |
          ldd --version
          nvcc --version
          nvidia-smi
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      # Installs transformers from the tip of its default branch (shallow
      # clone); the short SHA is echoed so the tested revision is recorded.
      - name: Install transformers
        run: |
          git clone --depth=1 https://github.com/huggingface/transformers
          cd transformers
          git rev-parse --short HEAD
          python -m pip install .

      # Installs the checked-out DeepSpeed tree with its dev/1bit/autotuning
      # extras, then prints the ds_report summary.
      - name: Install deepspeed
        run: |
          python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
          python -m pip install .[dev,1bit,autotuning]
          ds_report

      - name: Python environment
        run: |
          python -m pip list

      # Two pytest passes over tests/unit, selected by marker:
      # `inference_v2` first, then `inference_v2_ops`.
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12"
          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12"

      # Clones DeepSpeed-MII (shallow), installs it with dev extras and runs
      # its full tests/ directory.
      - name: MII unit tests
        run: |
          git clone --depth=1 https://github.com/microsoft/DeepSpeed-MII.git
          cd DeepSpeed-MII
          pip install .[dev]
          cd tests
          python -m pytest --color=yes --durations=0 --verbose -rF ./
6 changes: 4 additions & 2 deletions .github/workflows/nv-accelerate-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ on:
paths-ignore:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- 'tests/unit/inference/v2/**'
merge_group:
branches: [ master ]
schedule:
Expand All @@ -16,7 +18,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]
runs-on: [self-hosted, nvidia, cu116, v100]

steps:
- uses: actions/checkout@v3
Expand All @@ -26,7 +28,7 @@ jobs:

- name: Install pytorch
run: |
pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
pip install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand Down
17 changes: 11 additions & 6 deletions .github/workflows/nv-ds-chat.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,7 @@ jobs:
pip install .[dev]
ds_report
- name: Python environment
run: |
pip list
- name: DS-Chat unit tests
- name: Install deepspeed-chat
run: |
BRANCH="master"
if [[ ! -z "${{ github.event.inputs.dse_branch }}" ]]; then
Expand All @@ -50,8 +46,17 @@ jobs:
git clone -b $BRANCH https://github.com/microsoft/DeepSpeedExamples.git
cd DeepSpeedExamples/applications/DeepSpeed-Chat
pip install -r requirements.txt
pip install -e .
- name: Python environment
run: |
pip list
- name: DS-Chat unit tests
run: |
cd DeepSpeedExamples/applications/DeepSpeed-Chat
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd training/tests
cd tests
pytest $PYTEST_OPTS ./
- name: Open GitHub issue if nightly CI fails
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/nv-inference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ on:
paths-ignore:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- 'tests/unit/inference/v2/**'
merge_group:
branches: [ master ]
schedule:
Expand Down Expand Up @@ -34,6 +36,7 @@ jobs:
run: |
git clone https://github.com/huggingface/transformers
cd transformers
git checkout f370bebdc
git rev-parse --short HEAD
pip install .
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/nv-lightning-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ on:
paths-ignore:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- 'tests/unit/inference/v2/**'
merge_group:
branches: [ master ]
schedule:
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/nv-megatron.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ on:
paths-ignore:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- 'tests/unit/inference/v2/**'
merge_group:
branches: [ master ]
schedule:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-mii.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,5 @@ jobs:
cd DeepSpeed-MII
pip install .[dev]
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
cd tests/legacy
pytest $PYTEST_OPTS --forked -m "deepspeed" ./
6 changes: 4 additions & 2 deletions .github/workflows/nv-pre-compile-ops.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ on:
paths-ignore:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- 'tests/unit/inference/v2/**'
merge_group:
branches: [ master ]
schedule:
Expand All @@ -17,7 +19,7 @@ concurrency:
cancel-in-progress: true

jobs:
build-ops:
unit-tests:
runs-on: ubuntu-20.04
container:
image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116
Expand All @@ -33,7 +35,7 @@ jobs:
#python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Compile DeepSpeed Ops
run: |
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
- name: DS Report
run: |
ds_report
Loading

0 comments on commit 35aabdc

Please sign in to comment.