Skip to content

Commit

Permalink
Merge branch 'microsoft:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
Rohan138 authored Sep 27, 2024
2 parents 79029ac + d45cfd3 commit 4c6f1a5
Show file tree
Hide file tree
Showing 1,006 changed files with 77,742 additions and 7,685 deletions.
56 changes: 0 additions & 56 deletions .github/workflows/amd-mi100.yml

This file was deleted.

12 changes: 9 additions & 3 deletions .github/workflows/amd-mi200.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
name: amd-mi200

on:
workflow_dispatch:
pull_request:
paths:
- '.github/workflows/amd-mi200.yml'
- 'requirements/**'
schedule:
- cron: "0 0 * * *"

Expand All @@ -20,14 +25,14 @@ jobs:
# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv

- name: Install pytorch
run: |
pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm6.0
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand All @@ -44,6 +49,7 @@ jobs:
run: |
git clone https://github.com/ROCmSoftwarePlatform/apex.git
cd apex
git checkout torch_2.1_higher
CURRENT_VER=$(git rev-parse HEAD)
INSTALLED_VER=$(cat /blob/amd-apex/.venv_installed_version)
if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then
Expand Down Expand Up @@ -71,7 +77,7 @@ jobs:
pytest $PYTEST_OPTS -m 'sequential' unit/
- name: Open GitHub issue if nightly CI fails
if: failure()
if: ${{ failure() && (github.event_name == 'schedule') }}
uses: JasonEtco/create-an-issue@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
Expand Down
59 changes: 0 additions & 59 deletions .github/workflows/auto-sync.yml

This file was deleted.

68 changes: 46 additions & 22 deletions .github/workflows/cpu-inference.yml
Original file line number Diff line number Diff line change
@@ -1,58 +1,74 @@
name: cpu-inference

on:
workflow_dispatch:
pull_request:
paths-ignore:
- 'docs/**'
- 'blogs/**'
paths:
- '.github/workflows/cpu-inference.yml'
- 'requirements/**'
- 'deepspeed/__init__.py'
- 'deepspeed/inference/**'
- '!deepspeed/inference/v2/**' # exclude v2 dir
- 'tests/unit/inference/**'
- '!tests/unit/inference/v2/**' # exclude v2 tests dir
merge_group:
branches: [ master ]
schedule:
- cron: "0 0 * * 0"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: ubuntu-20.04
runs-on: [self-hosted, cpu]

env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3

- id: setup-venv
uses: ./.github/workflows/setup-venv

- name: Install gcc-9
run: |
sudo add-apt-repository -u ppa:ubuntu-toolchain-r/test
sudo apt install -y gcc-9 g++-9
# set gcc-9 and g++9 to default
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 99
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 99
- name: Check gcc version
run: |
# Get gcc version
gcc --version
g++ --version
- name: Detect instruction sets on instance
run: |
lscpu
pip install cmake
git clone https://github.com/intel/intel-extension-for-pytorch
cd intel-extension-for-pytorch/tests/cpu/isa
cmake .
make
./cpu_features
- name: Install numactl
run: |
sudo apt-get install -y numactl
- name: Install oneCCL Bindings for PyTorch
- name: Install dependencies
run: |
python -m pip install intel_extension_for_pytorch
python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu
pip install torch
# check installed version
pip list |grep \\\<torch\\\>
- name: Install oneCCL
run: |
pip install cmake
git clone https://github.com/oneapi-src/oneCCL
cd oneCCL
mkdir build
cd build
cmake ..
make
make install
#source ./_install/env/setvars.sh
# test whether oneCCL is correctly installed
#mpirun -n 2 ./examples/benchmark/benchmark
make -j install
- name: Install transformers
run: |
Expand All @@ -67,13 +83,21 @@ jobs:
pip install .[dev,1bit,autotuning,inf]
ds_report
- name: Python environment
- name: Python environment check
run: |
pip list
source oneCCL/build/_install/env/setvars.sh
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6
# check whether the environment is properly set up
python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())"
- name: Unit tests
run: |
# prep oneCCL for CCLBackend comm ops building
source oneCCL/build/_install/env/setvars.sh
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' -m 'inference_ops' -m 'inference' unit/
cd tests
# LOCAL_SIZE=2 forces the CPU to report 2 devices; this helps run the tests on the default GitHub runner
LOCAL_SIZE=2 COLUMNS=240 HF_HOME=~/tmp/hf_home/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
LOCAL_SIZE=2 COLUMNS=240 HF_HOME=~/tmp/hf_home/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
name: nv-torch-latest-cpu
name: cpu-torch-latest

on:
workflow_dispatch:
pull_request:
paths-ignore:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- 'tests/unit/inference/v2/**'
merge_group:
branches: [ master ]
schedule:
Expand All @@ -16,17 +19,21 @@ concurrency:

jobs:
unit-tests:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv

- name: Install system packages
run: |
sudo apt-get install -y numactl pdsh
- name: Install pytorch
run: |
pip install torch==1.12.0+cpu torchvision==0.13.0+cpu torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cpu
pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand All @@ -43,5 +50,5 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="1.12"
TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="1.12"
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.4"
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.4"
18 changes: 10 additions & 8 deletions .github/workflows/formatting.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: Formatting

on:
workflow_dispatch:
pull_request:
branches:
'**'
Expand All @@ -16,23 +17,24 @@ concurrency:
jobs:

# formatting and basic install on cpu-only machine
formatting:
runs-on: ubuntu-20.04
unit-tests:
runs-on: ubuntu-22.04

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: environment
run: |
which python
python --version
- name: Install deepspeed
- name: Install dependencies
run: |
pip install .[dev,autotuning,triton]
ds_report
# Previously we would do `pip install .[dev]`, but this has been causing
# out-of-space errors starting with the torch 2.1.0 release
grep -E "clang-format|pre-commit" requirements/requirements-dev.txt | xargs pip install
- name: Formatting checks
run: |
pip show pre-commit clang-format
pre-commit run --all-files
pip show pre-commit clang-format
pre-commit run --all-files
Loading

0 comments on commit 4c6f1a5

Please sign in to comment.