From f2739b4f722203b781722cec84a374d0d798b084 Mon Sep 17 00:00:00 2001
From: Joe Mayer <114769929+jomayeri@users.noreply.github.com>
Date: Thu, 29 Aug 2024 08:59:32 -0700
Subject: [PATCH 1/5] Change GDS to 1 AIO thread (#6459)

The `numThreads` config option determines how many threads are used to read
from the file. In the CPU case these threads are created via AIO; in the GDS
case they are handled by the GDS library via cufile.json. If we also created
AIO threads, the effect would be multiplicative: for example, 8 AIO threads *
8 GDS threads would mean 64 threads reading from the file when the user only
intended 8.

Co-authored-by: Olatunji Ruwase
---
 csrc/gds/py_lib/deepspeed_py_gds_handle.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
index 3a35ad3145a0..15fd516acaae 100644
--- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
@@ -20,7 +20,7 @@ deepspeed_gds_handle_t::deepspeed_gds_handle_t(const int block_size,
                                                const bool single_submit,
                                                const bool overlap_events,
                                                const int num_threads)
-    : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads)
+    : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, 1)
 {
     _init_cuFile(block_size, queue_depth, num_threads);
 }

From 0cd9bf59785f24aefae4c407f184bf71339ed5d4 Mon Sep 17 00:00:00 2001
From: Yizhou Wang
Date: Fri, 30 Aug 2024 00:42:53 +0800
Subject: [PATCH 2/5] [CCL] fix condition issue in ccl.py (#6443)

The previous condition check was wrong: it caused the condition to always
evaluate to True.

---------

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
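A quick illustration of the bug (not part of the patch): `NotImplementedBuilder`
in the old check is a class object, and a bare class object is always truthy,
so the `or` made the whole expression True whenever `builder` was not None.
A minimal sketch with stand-in classes rather than DeepSpeed's real builders:

```python
# Minimal sketch with stand-in classes (not DeepSpeed's real builders).
class NotImplementedBuilder:
    pass

class CCLCommBuilder:
    pass

builder = CCLCommBuilder()  # a successfully created builder

# Old check: short-circuits to the (truthy) class object, so it is always True.
print(bool(builder is None or NotImplementedBuilder))                       # True  -- wrong
# Fixed check: only True when no usable builder was created.
print(bool(builder is None or isinstance(builder, NotImplementedBuilder)))  # False -- correct
```
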
 deepspeed/comm/ccl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py
index cdf4c030f5d6..8753cf9f70ed 100644
--- a/deepspeed/comm/ccl.py
+++ b/deepspeed/comm/ccl.py
@@ -15,7 +15,7 @@

 def build_ccl_op():
     builder = get_accelerator().create_op_builder("CCLCommBuilder")
-    if builder is None or NotImplementedBuilder:
+    if builder is None or isinstance(builder, NotImplementedBuilder):
         return None
     ccl_cpp_module = builder.load()
     print(f'DeepSpeed {builder.absolute_name()} built successfully')

From a7ffe540fc98a945b9d3d2a9c05653e160d62dcf Mon Sep 17 00:00:00 2001
From: Ramya Ramineni <62723901+rraminen@users.noreply.github.com>
Date: Thu, 29 Aug 2024 12:15:52 -0500
Subject: [PATCH 3/5] Avoid gds build errors on ROCm (#6456)

This PR avoids the error below during a DeepSpeed build on ROCm. The error
occurs because the GDSBuilder extension is not compatible with ROCm.

```
Traceback (most recent call last):
  File "", line 1, in
  File "/tmp/pip-req-build-lv1v39xc/setup.py", line 180, in
    op_compatible = builder.is_compatible()
  File "/tmp/pip-req-build-lv1v39xc/op_builder/gds.py", line 47, in is_compatible
    CUDA_LIB64 = os.path.join(CUDA_HOME, "lib64")
  File "/opt/conda/envs/py_3.9/lib/python3.9/posixpath.py", line 76, in join
    a = os.fspath(a)
TypeError: expected str, bytes or os.PathLike object, not NoneType
Total number of unsupported CUDA function calls: 0
Total number of replaced kernel launches: 1
----------------------------------------
ERROR: Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output
```

cc: @jithunnair-amd

---------

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 op_builder/gds.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/op_builder/gds.py b/op_builder/gds.py
index 01c2d5a245d1..727ebdf48372 100644
--- a/op_builder/gds.py
+++ b/op_builder/gds.py
@@ -36,6 +36,11 @@ def extra_ldflags(self):
         return super().extra_ldflags() + ['-lcufile']

     def is_compatible(self, verbose=False):
+        if self.is_rocm_pytorch():
+            if verbose:
+                self.warning(f'{self.NAME} is not compatible with ROCM')
+            return False
+
         try:
             import torch.utils.cpp_extension
         except ImportError:

From 89c4d9f5a73d06eb8db8037d61188c628ceeba51 Mon Sep 17 00:00:00 2001
From: Raza Sikander <54884406+raza-sikander@users.noreply.github.com>
Date: Fri, 30 Aug 2024 03:35:20 +0530
Subject: [PATCH 4/5] TestLowCpuMemUsage UT get device by device_name (#6397)

Co-authored-by: Shaik Raza Sikander
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
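Background for this change (not part of the patch): `transformers.pipeline`
has traditionally treated an integer `device` as a CUDA ordinal, so passing
`local_rank` directly only works on CUDA machines. Resolving the device
string through the accelerator keeps the test portable across backends. A
rough sketch of what the new helper resolves to, assuming DeepSpeed is
installed; the printed device depends on the active accelerator (for example
"cuda:0", "xpu:0", or "hpu:0"):

```python
# Rough sketch (assumes DeepSpeed is installed; output depends on the
# active accelerator, e.g. device(type='cuda', index=0) on an NVIDIA box).
import torch
from deepspeed.accelerator import get_accelerator

local_rank = 0
device = torch.device(get_accelerator().device_name(local_rank))
print(device)
```
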
 tests/unit/inference/test_inference.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py
index eadf670d9328..581a2ce433ed 100644
--- a/tests/unit/inference/test_inference.py
+++ b/tests/unit/inference/test_inference.py
@@ -298,6 +298,12 @@ def verify_injection(module):
     verify_injection(model)


+# Used to Get Device name
+def getDeviceId(local_rank):
+    device = torch.device(f"{get_accelerator().device_name(local_rank)}")
+    return device
+
+
 # Verify that test is valid
 def validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton):
     model, task = model_w_task
@@ -484,8 +490,8 @@ def test(
             pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.")

         local_rank = int(os.getenv("LOCAL_RANK", "0"))
-
-        pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=local_rank, framework="pt")
+        device = getDeviceId(local_rank)
+        pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=device, framework="pt")
         bs_output = pipe(query, **inf_kwargs)
         pipe.model = deepspeed.init_inference(pipe.model,
                                               mp_size=self.world_size,

From 9b7fc5452471392b0f58844219fcfdd14a9cdc77 Mon Sep 17 00:00:00 2001
From: Logan Adams <114770087+loadams@users.noreply.github.com>
Date: Thu, 29 Aug 2024 16:43:21 -0700
Subject: [PATCH 5/5] Add workflow to build DS without torch to better test before releases (#6450)

- Adds a nightly workflow that confirms DeepSpeed can be built without torch
  as a dependency, since such build breaks often only surface when doing a
  release.

---
 .github/workflows/no-torch.yml | 45 ++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 .github/workflows/no-torch.yml

diff --git a/.github/workflows/no-torch.yml b/.github/workflows/no-torch.yml
new file mode 100644
index 000000000000..794c02b3c4db
--- /dev/null
+++ b/.github/workflows/no-torch.yml
@@ -0,0 +1,45 @@
+name: no-torch
+
+on:
+  workflow_dispatch:
+  pull_request:
+    paths:
+      - '.github/workflows/no-torch.yml'
+  schedule:
+    - cron: "0 0 * * *"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  issues: write
+
+jobs:
+  unit-tests:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - id: setup-venv
+        uses: ./.github/workflows/setup-venv
+
+      - name: Python environment
+        run: |
+          pip uninstall torch --yes
+          pip list
+
+      - name: Build deepspeed
+        run: |
+          DS_BUILD_STRING=" " python setup.py sdist
+
+      - name: Open GitHub issue if nightly CI fails
+        if: ${{ failure() && (github.event_name == 'schedule') }}
+        uses: JasonEtco/create-an-issue@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+          update_existing: true
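
A note on the failure mode this nightly job guards against (not part of the
patch): builds break without torch when build-time code imports torch
unconditionally. The sketch below shows the general defensive-import pattern
such code needs; the names are illustrative and are not DeepSpeed's actual
build code.

```python
# Minimal sketch of a torch-optional build helper (illustrative names only,
# not DeepSpeed's actual setup code): tolerate a missing torch at sdist-build
# time instead of failing at import.
try:
    import torch  # optional at build time
    TORCH_AVAILABLE = True
except ImportError:
    torch = None
    TORCH_AVAILABLE = False

def torch_version_or_none():
    # Report the installed torch version, or None when building without torch.
    return torch.__version__ if TORCH_AVAILABLE else None

if __name__ == "__main__":
    print(f"torch available: {TORCH_AVAILABLE}, version: {torch_version_or_none()}")
```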