Merge branch 'microsoft:master' into master

ROCm · Sep 27, 2024 · 4c6f1a5 · 4c6f1a5
2 parents 79029ac + d45cfd3
commit 4c6f1a5
Show file tree

Hide file tree

Showing 1,006 changed files with 77,742 additions and 7,685 deletions.
diff --git a/.github/workflows/amd-mi100.yml b/.github/workflows/amd-mi100.yml
diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml
@@ -1,6 +1,11 @@
 name: amd-mi200
 
 on:
+  workflow_dispatch:
+  pull_request:
+    paths:
+      - '.github/workflows/amd-mi200.yml'
+      - 'requirements/**'
   schedule:
     - cron: "0 0 * * *"
 
@@ -20,14 +25,14 @@ jobs:
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv
 
       - name: Install pytorch
         run: |
-          pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
+          pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm6.0
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -44,6 +49,7 @@ jobs:
         run: |
           git clone https://github.com/ROCmSoftwarePlatform/apex.git
           cd apex
+          git checkout torch_2.1_higher
           CURRENT_VER=$(git rev-parse HEAD)
           INSTALLED_VER=$(cat /blob/amd-apex/.venv_installed_version)
           if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then
@@ -71,7 +77,7 @@ jobs:
           pytest $PYTEST_OPTS -m 'sequential' unit/
 
       - name: Open GitHub issue if nightly CI fails
-        if: failure()
+        if: ${{ failure() && (github.event_name == 'schedule') }}
         uses: JasonEtco/create-an-issue@v2
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

diff --git a/.github/workflows/auto-sync.yml b/.github/workflows/auto-sync.yml
diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml
@@ -1,58 +1,74 @@
 name: cpu-inference
 
 on:
+  workflow_dispatch:
   pull_request:
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
+    paths:
+      - '.github/workflows/cpu-inference.yml'
+      - 'requirements/**'
+      - 'deepspeed/__init__.py'
+      - 'deepspeed/inference/**'
+      - '!deepspeed/inference/v2/**' # exclude v2 dir
+      - 'tests/unit/inference/**'
+      - '!tests/unit/inference/v2/**' # exclude v2 tests dir
   merge_group:
     branches: [ master ]
+  schedule:
+        - cron: "0 0 * * 0"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
   unit-tests:
-    runs-on: ubuntu-20.04
+    runs-on: [self-hosted, cpu]
+
+    env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv
 
+      - name: Install gcc-9
+        run: |
+          sudo add-apt-repository -u ppa:ubuntu-toolchain-r/test
+          sudo apt install -y gcc-9 g++-9
+          # set gcc-9 and g++9 to default
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 99
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 99
+
+      - name: Check gcc version
+        run: |
+          # Get gcc version
+          gcc --version
+          g++ --version
+
       - name: Detect instruction sets on instance
         run: |
           lscpu
-          pip install cmake
-          git clone https://github.com/intel/intel-extension-for-pytorch
-          cd intel-extension-for-pytorch/tests/cpu/isa
-          cmake .
-          make
-          ./cpu_features
 
       - name: Install numactl
         run: |
           sudo apt-get install -y numactl
 
-      - name: Install oneCCL Bindings for PyTorch
+      - name: Install dependencies
         run: |
-          python -m pip install intel_extension_for_pytorch
-          python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu
+          pip install torch
+          # check installed version
+          pip list |grep \\\<torch\\\>
 
       - name: Install oneCCL
         run: |
+          pip install cmake
           git clone https://github.com/oneapi-src/oneCCL
           cd oneCCL
           mkdir build
           cd build
           cmake ..
-          make
-          make install
-          #source ./_install/env/setvars.sh
-          # test whether oneCCL is correctly installed
-          #mpirun -n 2 ./examples/benchmark/benchmark
+          make -j install
 
       - name: Install transformers
         run: |
@@ -67,13 +83,21 @@ jobs:
           pip install .[dev,1bit,autotuning,inf]
           ds_report
 
-      - name: Python environment
+      - name: Python environment check
         run: |
           pip list
+          source oneCCL/build/_install/env/setvars.sh
+          export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6
+          # check whether the environment is properly setup
+          python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())"
 
       - name: Unit tests
         run: |
+          # prep oneCCL for CCLBackend comm ops building
           source oneCCL/build/_install/env/setvars.sh
+          export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          cd tests
-          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' -m 'inference_ops' -m 'inference' unit/
+          cd  tests
+          # LOCAL_SIZE=2 enforce CPU to report 2 devices, this helps run the test on github default runner
+          LOCAL_SIZE=2 COLUMNS=240 HF_HOME=~/tmp/hf_home/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
+          LOCAL_SIZE=2 COLUMNS=240 HF_HOME=~/tmp/hf_home/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
diff --git a/.github/workflows/nv-torch-latest-cpu.yml → .github/workflows/cpu-torch-latest.yml b/.github/workflows/nv-torch-latest-cpu.yml → .github/workflows/cpu-torch-latest.yml
@@ -1,10 +1,13 @@
-name: nv-torch-latest-cpu
+name: cpu-torch-latest
 
 on:
+  workflow_dispatch:
   pull_request:
     paths-ignore:
       - 'docs/**'
       - 'blogs/**'
+      - 'deepspeed/inference/v2/**'
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
@@ -16,17 +19,21 @@ concurrency:
 
 jobs:
   unit-tests:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv
 
+      - name: Install system packages
+        run: |
+          sudo apt-get install -y numactl pdsh
+
       - name: Install pytorch
         run: |
-          pip install torch==1.12.0+cpu torchvision==0.13.0+cpu torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -43,5 +50,5 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="1.12"
-          TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="1.12"
+          HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.4"
+          HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.4"
diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
@@ -1,6 +1,7 @@
 name: Formatting
 
 on:
+  workflow_dispatch:
   pull_request:
     branches:
       '**'
@@ -16,23 +17,24 @@ concurrency:
 jobs:
 
   # formatting and basic install on cpu-only machine
-  formatting:
-    runs-on: ubuntu-20.04
+  unit-tests:
+    runs-on: ubuntu-22.04
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: environment
         run: |
           which python
           python --version
 
-      - name: Install deepspeed
+      - name: Install dependencies
         run: |
-          pip install .[dev,autotuning,triton]
-          ds_report
+          # Previously we would do pip install .[dev] but this is causing out of
+          # space errors start with torch 2.1.0 release
+          grep -E "clang-format|pre-commit" requirements/requirements-dev.txt | xargs pip install
 
       - name: Formatting checks
         run: |
-           pip show pre-commit clang-format
-           pre-commit run --all-files
+          pip show pre-commit clang-format
+          pre-commit run --all-files