From f2739b4f722203b781722cec84a374d0d798b084 Mon Sep 17 00:00:00 2001
From: Joe Mayer <114769929+jomayeri@users.noreply.github.com>
Date: Thu, 29 Aug 2024 08:59:32 -0700
Subject: [PATCH 1/5] Change GDS to 1 AIO thread (#6459)

The `numThreads` config option determines how many threads are used to read
from the file. In the CPU case these threads are created via AIO; in the GDS
case they are handled by the GDS library via cufile.json. If we also created
AIO threads, the effect would be multiplicative: for example, 8 AIO threads *
8 GDS threads would mean 64 threads reading from the file when the user only
intended 8.

Co-authored-by: Olatunji Ruwase
---
 csrc/gds/py_lib/deepspeed_py_gds_handle.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
index 3a35ad3145a0..15fd516acaae 100644
--- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
@@ -20,7 +20,7 @@ deepspeed_gds_handle_t::deepspeed_gds_handle_t(const int block_size,
                                                const bool single_submit,
                                                const bool overlap_events,
                                                const int num_threads)
-    : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads)
+    : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, 1)
 {
     _init_cuFile(block_size, queue_depth, num_threads);
 }

From 0cd9bf59785f24aefae4c407f184bf71339ed5d4 Mon Sep 17 00:00:00 2001
From: Yizhou Wang
Date: Fri, 30 Aug 2024 00:42:53 +0800
Subject: [PATCH 2/5] [CCL] fix condition issue in ccl.py (#6443)

The previous condition check was wrong: it caused the condition to always
evaluate to True.

---------

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
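A quick illustration of the bug (not part of the patch): `NotImplementedBuilder`
in the old check is a class object, and a bare class object is always truthy,
so the `or` made the whole expression True whenever `builder` was not None.
A minimal sketch with stand-in classes rather than DeepSpeed's real builders:

```python
# Minimal sketch with stand-in classes (not DeepSpeed's real builders).
class NotImplementedBuilder:
    pass

class CCLCommBuilder:
    pass

builder = CCLCommBuilder()  # a successfully created builder

# Old check: short-circuits to the (truthy) class object, so it is always True.
print(bool(builder is None or NotImplementedBuilder))                       # True  -- wrong
# Fixed check: only True when no usable builder was created.
print(bool(builder is None or isinstance(builder, NotImplementedBuilder)))  # False -- correct
```
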
 deepspeed/comm/ccl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py
index cdf4c030f5d6..8753cf9f70ed 100644
--- a/deepspeed/comm/ccl.py
+++ b/deepspeed/comm/ccl.py
@@ -15,7 +15,7 @@

 def build_ccl_op():
     builder = get_accelerator().create_op_builder("CCLCommBuilder")
-    if builder is None or NotImplementedBuilder:
+    if builder is None or isinstance(builder, NotImplementedBuilder):
         return None
     ccl_cpp_module = builder.load()
     print(f'DeepSpeed {builder.absolute_name()} built successfully')

From a7ffe540fc98a945b9d3d2a9c05653e160d62dcf Mon Sep 17 00:00:00 2001
From: Ramya Ramineni <62723901+rraminen@users.noreply.github.com>
Date: Thu, 29 Aug 2024 12:15:52 -0500
Subject: [PATCH 3/5] Avoid gds build errors on ROCm (#6456)

This PR avoids the error below during a DeepSpeed build on ROCm. The error
occurs because the GDSBuilder extension is not compatible with ROCm.

```
Traceback (most recent call last):
  File "", line 1, in
  File "/tmp/pip-req-build-lv1v39xc/setup.py", line 180, in
    op_compatible = builder.is_compatible()
  File "/tmp/pip-req-build-lv1v39xc/op_builder/gds.py", line 47, in is_compatible
    CUDA_LIB64 = os.path.join(CUDA_HOME, "lib64")
  File "/opt/conda/envs/py_3.9/lib/python3.9/posixpath.py", line 76, in join
    a = os.fspath(a)
TypeError: expected str, bytes or os.PathLike object, not NoneType
Total number of unsupported CUDA function calls: 0
Total number of replaced kernel launches: 1
----------------------------------------
ERROR: Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output
```

cc: @jithunnair-amd

---------

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 op_builder/gds.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/op_builder/gds.py b/op_builder/gds.py
index 01c2d5a245d1..727ebdf48372 100644
--- a/op_builder/gds.py
+++ b/op_builder/gds.py
@@ -36,6 +36,11 @@ def extra_ldflags(self):
         return super().extra_ldflags() + ['-lcufile']

     def is_compatible(self, verbose=False):
+        if self.is_rocm_pytorch():
+            if verbose:
+                self.warning(f'{self.NAME} is not compatible with ROCM')
+            return False
+
         try:
             import torch.utils.cpp_extension
         except ImportError:

From 89c4d9f5a73d06eb8db8037d61188c628ceeba51 Mon Sep 17 00:00:00 2001
From: Raza Sikander <54884406+raza-sikander@users.noreply.github.com>
Date: Fri, 30 Aug 2024 03:35:20 +0530
Subject: [PATCH 4/5] TestLowCpuMemUsage UT get device by device_name (#6397)

Co-authored-by: Shaik Raza Sikander
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
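Background for this change (not part of the patch): `transformers.pipeline`
has traditionally treated an integer `device` as a CUDA ordinal, so passing
`local_rank` directly only works on CUDA machines. Resolving the device
string through the accelerator keeps the test portable across backends. A
rough sketch of what the new helper resolves to, assuming DeepSpeed is
installed; the printed device depends on the active accelerator (for example
"cuda:0", "xpu:0", or "hpu:0"):

```python
# Rough sketch (assumes DeepSpeed is installed; output depends on the
# active accelerator, e.g. device(type='cuda', index=0) on an NVIDIA box).
import torch
from deepspeed.accelerator import get_accelerator

local_rank = 0
device = torch.device(get_accelerator().device_name(local_rank))
print(device)
```
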
 tests/unit/inference/test_inference.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py
index eadf670d9328..581a2ce433ed 100644
--- a/tests/unit/inference/test_inference.py
+++ b/tests/unit/inference/test_inference.py
@@ -298,6 +298,12 @@ def verify_injection(module):
     verify_injection(model)


+# Used to Get Device name
+def getDeviceId(local_rank):
+    device = torch.device(f"{get_accelerator().device_name(local_rank)}")
+    return device
+
+
 # Verify that test is valid
 def validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton):
     model, task = model_w_task
@@ -484,8 +490,8 @@ def test(
             pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.")

         local_rank = int(os.getenv("LOCAL_RANK", "0"))
-
-        pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=local_rank, framework="pt")
+        device = getDeviceId(local_rank)
+        pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=device, framework="pt")
         bs_output = pipe(query, **inf_kwargs)
         pipe.model = deepspeed.init_inference(pipe.model,
                                               mp_size=self.world_size,

From 9b7fc5452471392b0f58844219fcfdd14a9cdc77 Mon Sep 17 00:00:00 2001
From: Logan Adams <114770087+loadams@users.noreply.github.com>
Date: Thu, 29 Aug 2024 16:43:21 -0700
Subject: [PATCH 5/5] Add workflow to build DS without torch to better test before releases (#6450)

- Adds a nightly workflow that confirms DeepSpeed can be built without torch
  as a dependency, since such build breaks often only surface when doing a
  release.

---
 .github/workflows/no-torch.yml | 45 ++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 .github/workflows/no-torch.yml

diff --git a/.github/workflows/no-torch.yml b/.github/workflows/no-torch.yml
new file mode 100644
index 000000000000..794c02b3c4db
--- /dev/null
+++ b/.github/workflows/no-torch.yml
@@ -0,0 +1,45 @@
+name: no-torch
+
+on:
+  workflow_dispatch:
+  pull_request:
+    paths:
+      - '.github/workflows/no-torch.yml'
+  schedule:
+    - cron: "0 0 * * *"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  issues: write
+
+jobs:
+  unit-tests:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - id: setup-venv
+        uses: ./.github/workflows/setup-venv
+
+      - name: Python environment
+        run: |
+          pip uninstall torch --yes
+          pip list
+
+      - name: Build deepspeed
+        run: |
+          DS_BUILD_STRING=" " python setup.py sdist
+
+      - name: Open GitHub issue if nightly CI fails
+        if: ${{ failure() && (github.event_name == 'schedule') }}
+        uses: JasonEtco/create-an-issue@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+          update_existing: true
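
A note on the failure mode this nightly job guards against (not part of the
patch): builds break without torch when build-time code imports torch
unconditionally. The sketch below shows the general defensive-import pattern
such code needs; the names are illustrative and are not DeepSpeed's actual
build code.

```python
# Minimal sketch of a torch-optional build helper (illustrative names only,
# not DeepSpeed's actual setup code): tolerate a missing torch at sdist-build
# time instead of failing at import.
try:
    import torch  # optional at build time
    TORCH_AVAILABLE = True
except ImportError:
    torch = None
    TORCH_AVAILABLE = False

def torch_version_or_none():
    # Report the installed torch version, or None when building without torch.
    return torch.__version__ if TORCH_AVAILABLE else None

if __name__ == "__main__":
    print(f"torch available: {TORCH_AVAILABLE}, version: {torch_version_or_none()}")
```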