Merge branch 'master' into zeroOptParamsFlatenning

microsoft · Jan 10, 2024 · 57b0112 · 57b0112
2 parents c10b626 + c1e0205
commit 57b0112
Show file tree

Hide file tree

Showing 38 changed files with 900 additions and 220 deletions.
diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml
@@ -1,22 +1,49 @@
 name: cpu-inference
 
 on:
+  pull_request:
+    paths:
+      - '.github/workflows/cpu-inference.yml'
+      - 'requirements/**'
+      - 'deepspeed/__init__.py'
+      - 'deepspeed/inference/**'
+      - '!deepspeed/inference/v2/**' # exclude v2 dir
+      - 'tests/unit/inference/**'
+      - '!tests/unit/inference/v2/**' # exclude v2 tests dir
   workflow_dispatch:
+  merge_group:
+    branches: [ master ]
+  schedule:
+        - cron: "0 0 * * 0"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
   unit-tests:
-    runs-on: ubuntu-20.04
+    runs-on: [self-hosted, cpu]
 
     steps:
       - uses: actions/checkout@v3
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv
 
+      - name: Install gcc-9
+        run: |
+          sudo add-apt-repository -u ppa:ubuntu-toolchain-r/test
+          sudo apt install -y gcc-9 g++-9
+          # set gcc-9 and g++9 to default
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 99
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 99
+
+      - name: Check gcc version
+        run: |
+          # Get gcc version
+          gcc --version
+          g++ --version
+
       - name: Detect instruction sets on instance
         run: |
           lscpu
@@ -33,8 +60,16 @@ jobs:
 
       - name: Install oneCCL Bindings for PyTorch
         run: |
+          pip install torch
           python -m pip install intel_extension_for_pytorch
-          python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu
+          # the curl line is for troubleshooting
+          curl -L https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+          python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+          pip install py-cpuinfo
+          # check installed version
+          pip list |grep \\\<torch\\\>
+          pip list |grep intel-extension-for-pytorch
+          pip list |grep oneccl-bind-pt
 
       - name: Install oneCCL
         run: |
@@ -62,14 +97,22 @@ jobs:
           pip install .[dev,1bit,autotuning,inf]
           ds_report
 
-      - name: Python environment
+      - name: Python environment check
         run: |
           pip list
+          source oneCCL/build/_install/env/setvars.sh
+          export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6
+          # check whether the environment is properly setup
+          python -c "import torch;import intel_extension_for_pytorch as ipex;import oneccl_bindings_for_pytorch;print('done')"
+          python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())"
 
       - name: Unit tests
         run: |
+          # prep oneCCL for CCLBackend comm ops building
           source oneCCL/build/_install/env/setvars.sh
+          export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          cd tests
-          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
-          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
+          cd  tests
+          # LOCAL_SIZE=2 enforce CPU to report 2 devices, this helps run the test on github default runner
+          LOCAL_SIZE=2 COLUMNS=240 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
+          LOCAL_SIZE=2 COLUMNS=240 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml
@@ -2,11 +2,14 @@ name: nv-inference
 
 on:
   pull_request:
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
-      - 'deepspeed/inference/v2/**'
-      - 'tests/unit/inference/v2/**'
+    paths:
+      - '.github/workflows/nv-inference.yml'
+      - 'requirements/**'
+      - 'deepspeed/__init__.py'
+      - 'deepspeed/inference/**'
+      - '!deepspeed/inference/v2/**' # exclude v2 dir
+      - 'tests/unit/inference/**'
+      - '!tests/unit/inference/v2/**' # exclude v2 tests dir
   merge_group:
     branches: [ master ]
   schedule:

diff --git a/.github/workflows/nv-sd.yml b/.github/workflows/nv-sd.yml
@@ -10,7 +10,10 @@ on:
       - "tests/unit/inference/test_stable_diffusion.py"
       - "deepspeed/model_implementations/diffusers/unet.py"
       - "deepspeed/model_implementations/diffusers/vae.py"
+      - "deepspeed/module_inject/containers/vae.py"
+      - "deepspeed/module_inject/containers/unet.py"
       - ".github/workflows/nv-sd.yml"
+      - "requirements/requirements-sd.txt"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}