diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml
index 304dc421020a..0f72c60b2b61 100644
--- a/.github/workflows/cpu-inference.yml
+++ b/.github/workflows/cpu-inference.yml
@@ -68,7 +68,7 @@ jobs:
           python -m pip install intel_extension_for_pytorch
           # the curl line is for troubleshooting
           curl -L https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
-          python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+          python -m pip install oneccl_bind_pt --index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
           pip install py-cpuinfo
           # check installed version
           pip list |grep \\\
diff --git a/.github/workflows/nv-human-eval.yml b/.github/workflows/nv-human-eval.yml
new file mode 100644
index 000000000000..82f7e9beacca
--- /dev/null
+++ b/.github/workflows/nv-human-eval.yml
@@ -0,0 +1,53 @@
+name: nv-human-eval
+
+on:
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  unit-tests:
+    runs-on: [self-hosted, nvidia, a6000]
+    container:
+      image: nvcr.io/nvidia/pytorch:23.03-py3
+      ports:
+        - 80
+      options: --gpus all --shm-size "8G"
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Check container state
+        run: |
+          ldd --version
+          nvcc --version
+          nvidia-smi
+          python -c "import torch; print('torch:', torch.__version__, torch)"
+          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+      - name: Install transformers
+        run: |
+          git clone --depth=1 https://github.com/huggingface/transformers
+          cd transformers
+          git rev-parse --short HEAD
+          python -m pip install .
+      - name: Clone Human Eval
+        run: |
+          git clone --depth=1 https://github.com/openai/human-eval.git
+          sed -i '/exec(check_program, exec_globals)/ s/^# //' human-eval/human_eval/execution.py
+          cd human-eval
+          git rev-parse --short HEAD
+          python -m pip install .
+      - name: Install deepspeed
+        run: |
+          python -m pip install .[dev,1bit,autotuning]
+          ds_report
+      - name: Python environment
+        run: |
+          python -m pip list
+      - name: Unit tests
+        run: |
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+          cd tests
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.0" --cuda_ver="12"
diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml
index 0cbe212666de..a24376c8973d 100644
--- a/.github/workflows/nv-inference.yml
+++ b/.github/workflows/nv-inference.yml
@@ -32,7 +32,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --extra-index-url https://download.pytorch.org/whl/cu118
+          pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu118
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml
index 16100cafe87d..e540b5acaf33 100644
--- a/.github/workflows/nv-nightly.yml
+++ b/.github/workflows/nv-nightly.yml
@@ -25,7 +25,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --index-url https://download.pytorch.org/whl/cu116
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/.github/workflows/nv-torch-latest-cpu.yml b/.github/workflows/nv-torch-latest-cpu.yml
index 4075a46ec913..60f9332f835d 100644
--- a/.github/workflows/nv-torch-latest-cpu.yml
+++ b/.github/workflows/nv-torch-latest-cpu.yml
@@ -29,7 +29,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
diff --git a/blogs/deepspeed-chat/README.md b/blogs/deepspeed-chat/README.md
index 43501652b98e..75b0841231a4 100644
--- a/blogs/deepspeed-chat/README.md
+++ b/blogs/deepspeed-chat/README.md
@@ -311,7 +311,7 @@ Furthermore, we would like to point out that our effective performance is 19x hi
 
 ***(II) Scalability Analysis.*** The best effective throughput for different model sizes is achieved at different GPU count. This is in part because some of the larger model sizes require more memory to run. However, a large part of this behavior stems from DeepSpeed-HE’s scalability properties that we discuss next.
 
-Figure 7 shows that DeepSeed-RLHF has achieved good scaling overall on up to 64 GPUs. However, if we look more closely, it shows that DeepSpeed-RLHF training achieves super-linear scaling at small scale, followed by near linear or sub-linear scaling at larger scales. This is due to the interaction between memory availability and max global batch size.
+Figure 7 shows that DeepSpeed-RLHF has achieved good scaling overall on up to 64 GPUs. However, if we look more closely, it shows that DeepSpeed-RLHF training achieves super-linear scaling at small scale, followed by near linear or sub-linear scaling at larger scales. This is due to the interaction between memory availability and max global batch size.
 
 As DeepSpeed-HE is powered by ZeRO-based technology for training, it allows model states to be partitioned across the available GPUs. As a result, the memory consumption per GPU reduces with the increase in the number of GPUs, allowing DeepSpeed-HE to support a larger batch per GPU resulting in super-linear scaling. However, at large scale, while the available memory continues to increase, the maximum global batch size (1024, in our case, with a sequence length of 512) limits the batch size per GPU, resulting in near-linear or sub-linear scaling. As a result, for a given max global batch size, DeepSpeed-HE achieves the best throughput and cost efficiency at the boundary of super-linear and sub-linear scalability, and the exact point is mostly determined by the largest batch size that can be run per GPU as the function of available memory and global batch size.
 
diff --git a/blogs/deepspeed-chat/chinese/README.md b/blogs/deepspeed-chat/chinese/README.md
index 03bf9b69449f..7a45735b3f09 100644
--- a/blogs/deepspeed-chat/chinese/README.md
+++ b/blogs/deepspeed-chat/chinese/README.md
@@ -300,7 +300,7 @@ DeepSpeed-HE可以在训练和推理之间无缝更改模型分区,以支持
 
 ***(II) 可扩展性分析。*** 不同模型大小的最佳有效吞吐量取决于不同的 GPU 数量。部分原因是因为一些较大的模型大小需要更多的内存来运行。基于此，我们接下来讨论 DeepSpeed-HE 的可扩展性特性。
 
-图 7 显示 DeepSeed-RLHF 在多达 64 个 GPU的集群 上实现了良好的整体扩展。然而，如果我们仔细观察，可以发现 DeepSpeed-RLHF 训练在小规模时实现了超线性扩展，随后在较大规模时实现了接近线性或次线性扩展。这是由于内存可用性和最大全局批量大小之间的相互作用。
+图 7 显示 DeepSpeed-RLHF 在多达 64 个 GPU的集群 上实现了良好的整体扩展。然而，如果我们仔细观察，可以发现 DeepSpeed-RLHF 训练在小规模时实现了超线性扩展，随后在较大规模时实现了接近线性或次线性扩展。这是由于内存可用性和最大全局批量大小之间的相互作用。
 
 DeepSpeed-HE 的核心技术基于 ZeRO，用于训练过程中将模型状态分割到每个GPU上。这意味着随着 GPU 数量的增加，每个 GPU 的内存消耗会减少，使得 DeepSpeed-HE 能够在每个 GPU 上支持更大的批量，从而实现超线性扩展。然而，在大规模情况下，尽管可用内存持续增加，但最大全局批量大小仍然限制了每个 GPU 的批量大小，导致接近线性或次线性扩展。因此，在给定的最大全局批量大小（例如，我们设置为 1024 个句子，每个句子长度为 512）下，DeepSpeed-HE 在超线性和次线性可扩展性之间实现了最佳的吞吐量和成本效益。具体的平衡点主要取决于每个 GPU 上可运行的最大批量大小，而这又受到可用内存和全局批量大小的函数所决定。
 
diff --git a/blogs/deepspeed-chat/japanese/README.md b/blogs/deepspeed-chat/japanese/README.md
index e7aa62721417..23ead99f5c8d 100644
--- a/blogs/deepspeed-chat/japanese/README.md
+++ b/blogs/deepspeed-chat/japanese/README.md
@@ -305,7 +305,7 @@ DeepSpeed-RLHFは、Colossal-AIや、ネイティブのPyTorchを用いたHuggin
 
 ***(II) スケーラビリティ分析*** モデルサイズごとに、最良のスループットを得られるGPU数は異なります。これは、モデルサイズが大きくなると、実行に多くのメモリを必要とすることに加え、以下に説明する DeepSpeed-HE のスケーラビリティ特性にも起因しています。
 
-図7は、DeepSeed-RLHF が最大 64 GPU で全体的に良好なスケーラビリティを達成したことを示しています。しかし、より詳細に見ると、DeepSpeed-RLHFの訓練では、小規模な環境では超線形（super linear）なスケーリングを達成し、大規模では線形（linear）またはそれ以下のスケーラビリティになっていることが分かります。これは、メモリの可用性と最大グローバルバッチサイズとの間の相互作用によるものです。
+図7は、DeepSpeed-RLHF が最大 64 GPU で全体的に良好なスケーラビリティを達成したことを示しています。しかし、より詳細に見ると、DeepSpeed-RLHFの訓練では、小規模な環境では超線形（super linear）なスケーリングを達成し、大規模では線形（linear）またはそれ以下のスケーラビリティになっていることが分かります。これは、メモリの可用性と最大グローバルバッチサイズとの間の相互作用によるものです。
 
 DeepSpeed-HEはトレーニングにZeROの技術を採用しているため、利用可能なGPU間でモデルを分割することが可能です。その結果、GPUあたりのメモリ消費量はGPU数の増加とともに減少し、DeepSpeed-HEはGPUあたりでより大きなバッチサイズをサポートできるようになり、超線形のスケーリングが実現できます。しかし、より大規模になると、利用可能なメモリが増加し続ける一方で、最大グローバルバッチサイズが制限されているため、GPUあたりのバッチサイズを小さくすることになり、線形またはそれ以下のスケーリングになります。その結果、与えられた最大グローバルバッチサイズに対して、DeepSpeed-HEは、スーパーリニアとサブリニアのスケーラビリティの境界で最高のスループットとコスト効率を達成し、正確なポイントは、利用可能なメモリとグローバルバッチサイズの関数としてGPUごとに実行できる最大バッチサイズによってほぼ決定されます。
 
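The README hunks above (and their Chinese and Japanese counterparts) explain the scaling behaviour: ZeRO partitioning shrinks per-GPU memory use as GPUs are added, so the memory-limited per-GPU batch grows, while the fixed global batch of 1024 samples (sequence length 512) caps it at large scale. The sketch below is illustrative only and not part of this patch; the global batch size comes from the blog text, and the memory-limited batch model is a made-up assumption.

```python
# Illustration of the scaling argument from the DeepSpeed-Chat README hunks above.
MAX_GLOBAL_BATCH = 1024       # fixed global batch size quoted in the blog (seq len 512)
MEM_LIMITED_BATCH_1GPU = 4    # hypothetical largest batch a single GPU could hold

for n_gpus in (8, 16, 32, 64):
    # ZeRO partitions model states, so per-GPU memory headroom grows with GPU count;
    # crudely model the memory-limited batch as growing linearly with that headroom.
    mem_limited = MEM_LIMITED_BATCH_1GPU * n_gpus
    # ...but the fixed global batch caps how much work each GPU can actually receive.
    per_gpu_batch = min(mem_limited, MAX_GLOBAL_BATCH // n_gpus)
    print(f"{n_gpus:>2} GPUs -> per-GPU batch {per_gpu_batch}")

# Small scale: the memory term binds and grows with GPU count (super-linear throughput).
# Large scale: MAX_GLOBAL_BATCH // n_gpus binds and shrinks (near- or sub-linear).
```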
diff --git a/deepspeed/runtime/compiler.py b/deepspeed/runtime/compiler.py
index b2b612c85180..b5e4e33425d0 100644
--- a/deepspeed/runtime/compiler.py
+++ b/deepspeed/runtime/compiler.py
@@ -35,7 +35,7 @@ def get_backend_fn(backend: Union[str, Callable]) -> Union[str, Callable]:
         return backend
 
     elif isinstance(backend, str):
-        if backend in torch._dynamo.list_backends():
+        if backend in torch._dynamo.list_backends(exclude_tags=()):
             return backend
 
         # Get module name from backend name
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 4132a4527d6a..03813afa4ed1 100644
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -864,7 +864,9 @@ def _create_fp32_partitions(self):
                             self.device).clone().float().detach())
 
             self.fp32_partitioned_groups_flat[i].requires_grad = True  # keep this in case internal optimizer uses it
-            self.fp32_partitioned_groups_flat[i].ds_id = '_'.join(map(str, self.fp16_partitioned_groups_flat_id[i]))
+            ds_id_begin = str(self.fp16_partitioned_groups_flat_id[i][0])
+            ds_id_end = str(self.fp16_partitioned_groups_flat_id[i][-1])
+            self.fp32_partitioned_groups_flat[i].ds_id = ds_id_begin + '_' + ds_id_end
 
         if len(swappable_fp32_tensors) > 0:
             self.optimizer_swapper.initialize_parameters(parameters=swappable_fp32_tensors,
diff --git a/tests/pytest.ini b/tests/pytest.ini
index 8d043c8b3f9d..f841c47afc0c 100644
--- a/tests/pytest.ini
+++ b/tests/pytest.ini
@@ -1,12 +1,13 @@
 [pytest]
-addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops and not inference_v2 and not inference_v2_ops and not stable_diffusion"
+addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops and not inference_v2 and not inference_v2_ops and not stable_diffusion and not evaluation"
 markers =
     sequential:Tests that need to be run sequentially
     inference:Inference model tests
     inference_ops:Individual inference operator tests
-    inference_v2: Inference tests for the v2 stack
-    inference_v2_ops: Op tests for the v2 stack
+    inference_v2:Inference tests for the v2 stack
+    inference_v2_ops:Op tests for the v2 stack
     seq_inference:Inference model tests to run sequentially
     nightly:Tests that should be run nightly
     world_size:Change world size of individual tests in a class
     stable_diffusion:Tests that run Stable Diffusion
+    evaluation:Tests that evaluate model correctness
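The pytest.ini hunk above registers a new `evaluation` marker and adds `not evaluation` to the default `addopts`, while the nv-human-eval workflow selects it explicitly with `-m 'evaluation' -k "test_human_eval"`. A minimal sketch of how that gating works; the placeholder test below is not part of this patch.

```python
import pytest


@pytest.mark.evaluation  # marker registered in tests/pytest.ini above
def test_marker_gating_placeholder():
    # Excluded by the default addopts ("... and not evaluation"); it runs only when
    # a run opts in with -m evaluation, as the nv-human-eval workflow does.
    assert True  # placeholder assertion, for illustration only
```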
diff --git a/tests/unit/inference/test_human_eval.py b/tests/unit/inference/test_human_eval.py
new file mode 100644
index 000000000000..2525aeb5aa0e
--- /dev/null
+++ b/tests/unit/inference/test_human_eval.py
@@ -0,0 +1,73 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import pytest
+import os
+import torch
+from deepspeed.accelerator import get_accelerator
+
+
+@pytest.mark.evaluation
+@pytest.mark.parametrize("model_name", ["codellama/CodeLlama-7b-Python-hf"])
+def test_human_eval(model_name):
+    import mii
+    import numpy
+    from transformers import pipeline
+    from human_eval.data import write_jsonl, read_problems
+    from human_eval.evaluation import evaluate_functional_correctness
+
+    def generate_base_completion(pipe, problem_prompt: str) -> str:
+        return pipe(problem_prompt, do_sample=True)[0]["generated_text"]
+
+    def generate_mii_completion(pipe, problem_prompt: str) -> str:
+        return pipe(problem_prompt, max_new_tokens=512)[0].generated_text
+
+    def generate_samples(pipe, generation_function):
+        samples = [
+            dict(task_id=task_id, completion=generation_function(pipe, problems[task_id]["prompt"]))
+            for task_id in problems for _ in range(num_samples_per_task)
+        ]
+        return samples
+
+    # Loading Problems
+    problems = read_problems("../../human-eval/data/HumanEval.jsonl.gz")
+    num_samples_per_task = 20
+
+    # Initializing HuggingFace Pipeline
+    local_rank = os.getenv("LOCAL_RANK", "0")
+    device = torch.device(get_accelerator().device_name(local_rank))
+    base_pipe = pipeline(model=model_name,
+                         device=torch.device(get_accelerator().device_name(local_rank)),
+                         max_length=512,
+                         return_full_text=False)
+
+    # Generating Base Samples
+    base_samples = generate_samples(base_pipe, generate_base_completion)
+
+    # Base Pipeline Teardown
+    del base_pipe
+    get_accelerator().empty_cache()
+
+    # Initializing DeepSpeed-MII Pipeline
+    mii_pipe = mii.pipeline(model_name)
+
+    # Generating MII Samples
+    mii_samples = generate_samples(mii_pipe, generate_mii_completion)
+
+    # MII Pipeline Teardown
+    mii_pipe.destroy()
+
+    # Writing Samples
+    write_jsonl("base_samples.jsonl", base_samples)
+    write_jsonl("mii_samples.jsonl", mii_samples)
+
+    # Evaluating Samples
+    base_results = evaluate_functional_correctness("base_samples.jsonl")
+    mii_results = evaluate_functional_correctness("mii_samples.jsonl")
+
+    # Executing Assertions
+    for key in base_results.keys():
+        assert numpy.allclose(base_results[key], mii_results[key], rtol=0.10), \
+            f"Base result: {base_results[key]}, MII result: {mii_results[key]}, outside of rtol."
diff --git a/version.txt b/version.txt
index 288adf538f0e..dffa40ec3c2c 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.13.3
+0.13.4
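For context on the deepspeed/runtime/compiler.py hunk above: `torch._dynamo.list_backends()` filters out some registered backends by default, and passing `exclude_tags=()` (the keyword used in the patch) lists every registered backend, so a configured backend name that carries one of the filtered tags can still match by name. A hedged sketch; the default tag filter and the example backend names depend on the installed PyTorch version.

```python
import torch

# The default call typically omits backends tagged "debug"/"experimental"
# (e.g. "eager" or "aot_eager" on recent PyTorch releases).
print(torch._dynamo.list_backends())

# An empty exclude_tags tuple disables the filter, so the membership check in
# deepspeed/runtime/compiler.py now also accepts those backends by name.
print(torch._dynamo.list_backends(exclude_tags=()))
```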