From d92bea83e57eb9a7d753082d2748e38abfd155b7 Mon Sep 17 00:00:00 2001 From: sjh Date: Mon, 21 Oct 2024 10:21:16 +0800 Subject: [PATCH 1/2] add requirements --- .github/workflows/deepspeed.yaml | 85 ++++++++++++++++--------- README.md | 15 +++++ requirements/requirements_deepspeed.txt | 29 +++++++++ 3 files changed, 98 insertions(+), 31 deletions(-) create mode 100644 requirements/requirements_deepspeed.txt diff --git a/.github/workflows/deepspeed.yaml b/.github/workflows/deepspeed.yaml index d423ad708..8fa2947d8 100644 --- a/.github/workflows/deepspeed.yaml +++ b/.github/workflows/deepspeed.yaml @@ -7,13 +7,15 @@ on: workflow_dispatch: pull_request: paths: - - '.github/workflows/deepspeed.yml' + - '.github/workflows/deepspeed.yaml' + - '.github/workflows/deepspeed.yaml' - 'requirements/**' schedule: - cron: "0 0 * * *" push: paths: - - '.github/workflows/deepspeed.yml' + - '.github/workflows/deepspeed.yaml' + - '.github/workflows/deepspeed.yaml' concurrency: @@ -39,31 +41,26 @@ jobs: - /etc/ascend_install.info:/etc/ascend_install.info options: --network host --name deepspeed_unit-tests - --device /dev/davinci5 + --device /dev/davinci6 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc --shm-size "20g" --entrypoint /bin/bash - - env: - PT_HPU_LAZY_MODE: 0 - TORCHINDUCTOR_COMPILE_THREADS: 1 - TEST_LIST: | - test_accelerator.py - test_autotuning.py - test_compression.py - + + steps: - uses: actions/checkout@v4 - name: Install pytorch run: | npu-smi info - apt-get update + apt-get update + apt-get install sudo + pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple source /root/.bashrc - pip install torch==2.2.0 torchvision==0.17.0 torch_npu==2.2.0 numpy==1.26.4 cloudpickle tornado ml-dtypes -i https://pypi.tuna.tsinghua.edu.cn/simple + pip install torch==2.2.0 torchvision==0.17.0 torch_npu==2.2.0 torchaudio==2.2.0 numpy==1.26.4 cloudpickle tornado ml-dtypes python << EOF if __name__ == '__main__': @@ -74,24 +71,30 @@ jobs: 
print(f"Device Count: {torch.npu.device_count()}") print(f"Device Available: {torch.npu.is_available()}") EOF - - # - name: Install transformers - # run: | - # source /root/.bashrc - # echo "y" | apt-get install git - # git clone https://github.com/huggingface/transformers - # cd transformers - # git rev-parse --short HEAD - # pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple - - - name: Install deepspeed + + - name: Install transformers run: | source /root/.bashrc echo "y" | apt-get install git - git clone --depth=1 https://github.com/microsoft/DeepSpeed.git - cd DeepSpeed - pip install .[dev,autotuning] -i https://pypi.tuna.tsinghua.edu.cn/simple - ds_report + git clone https://github.com/huggingface/transformers + cd transformers + git rev-parse --short HEAD + pip install . + + - name: Install deepspeed + uses: nick-fields/retry@v3 + with: + timeout_minutes: 30 + max_attempts: 3 + retry_on: error + command: | + source /root/.bashrc + git clone --depth=1 https://github.com/microsoft/DeepSpeed.git + pip install -r requirements/requirements_deepspeed.txt + cd DeepSpeed + pip install .[1bit,autotuning,inf] + + ds_report - name: Python environment run: | @@ -102,9 +105,29 @@ jobs: run: | source /root/.bashrc unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch + cd DeepSpeed/tests/unit/ - cd /DeepSpeed - pytest --verbose tests/* + pytest --verbose accelerator/* + pytest --verbose autotuning/* + pytest --verbose checkpoint/test_reshape_checkpoint.py + pytest --verbose checkpoint/test_moe_checkpoint.py + pytest --verbose checkpoint/test_shared_weights.py + pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py + pytest --verbose model_parallelism/* + pytest --verbose moe/test_moe_tp.py + pytest --verbose monitor/* + pytest --verbose utils/* + pytest --verbose runtime/test_ds_config_model.py + pytest --verbose runtime/pipe/test_pipe_schedule.py + pytest --verbose runtime/zero/test_zero_config.py + pytest --verbose 
runtime/zero/test_zero_tiled.py + pytest --verbose runtime/zero/test_zeropp.py + pytest --verbose runtime/test_autocast.py + pytest --verbose runtime/test_data.py + pytest --verbose runtime/test_runtime_utils.py + pytest --verbose runtime/activation_checkpointing/* + pytest --verbose runtime/utils/* + pytest --verbose runtime/zero/test_zero_dynamic_class.py diff --git a/README.md b/README.md index 93ccb298e..780ec34e5 100644 --- a/README.md +++ b/README.md @@ -44,5 +44,20 @@ This [PR](https://github.com/microsoft/onnxruntime/pull/15833) refactored the Ex Update on 2023.06.08 This [PR](https://github.com/microsoft/onnxruntime/pull/14731) introduced a missing registration of CANN Identity operator for version greater than 14. It has been fixed in this [PR](https://github.com/microsoft/onnxruntime/pull/16210). +## Deepspeed Ascend CI +The Deepspeed source code is taken from the `main` branch of `microsoft/deepspeed` and is run and tested daily on Ascend hardware. + +------------------------------------------------------------ + +| Key | Value | +| :---: | :---: | +| CPU | AArch64 | +| NPU | Ascend910B | +| OS | Ubuntu | +| Period | UTC 0000 daily | +| Branch | main | +| Status | ![Deepspeed](https://github.com/Ascend/Ascend-CI/actions/workflows/deepspeed.yaml/badge.svg) | +| Recheck By Hand | comment 'recheck' in any issue | + ## Pytorch Ascend CI TBD diff --git a/requirements/requirements_deepspeed.txt b/requirements/requirements_deepspeed.txt new file mode 100644 index 000000000..902f2b442 --- /dev/null +++ b/requirements/requirements_deepspeed.txt @@ -0,0 +1,29 @@ +accelerate +clang-format==18.1.3 +comet_ml>=3.41.0 +# deepspeed-kernels ; sys_platform == 'linux' +docutils<0.18 +future +importlib-metadata>=4 +mup +pre-commit>=2.20.0 +pytest>=7.2.0 +pytest-forked +pytest-randomly +pytest-xdist +qtorch==0.3.0 +recommonmark +sphinx +sphinx-rtd-theme +tensorboard +torchvision +transformers>=4.39.0 +wandb +# google +# lm-eval==0.3.0 +# protobuf +# qtorch +# safetensors +# 
sentencepiece +# transformers>=4.32.1 +# tabulate From a4fd1ca187267de73c701d0fe923d7451c14edb7 Mon Sep 17 00:00:00 2001 From: sjh Date: Mon, 4 Nov 2024 09:26:26 +0800 Subject: [PATCH 2/2] update deepspeed.yaml --- .github/workflows/deepspeed.yaml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/deepspeed.yaml b/.github/workflows/deepspeed.yaml index 8fa2947d8..ba4eea1c3 100644 --- a/.github/workflows/deepspeed.yaml +++ b/.github/workflows/deepspeed.yaml @@ -7,7 +7,6 @@ on: workflow_dispatch: pull_request: paths: - - '.github/workflows/deepspeed.yaml' - '.github/workflows/deepspeed.yaml' - 'requirements/**' schedule: @@ -15,7 +14,6 @@ on: push: paths: - '.github/workflows/deepspeed.yaml' - - '.github/workflows/deepspeed.yaml' concurrency: @@ -41,14 +39,13 @@ jobs: - /etc/ascend_install.info:/etc/ascend_install.info options: --network host --name deepspeed_unit-tests - --device /dev/davinci6 + --device /dev/davinci4 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc --shm-size "20g" --entrypoint /bin/bash - steps: - uses: actions/checkout@v4 @@ -132,4 +129,4 @@ jobs: - + \ No newline at end of file