
Commit

Merge branch 'master' into rearrange_ops
tjruwase authored Aug 31, 2024
2 parents c1eb49a + 9b7fc54 commit ddd0021
Showing 5 changed files with 60 additions and 4 deletions.
45 changes: 45 additions & 0 deletions .github/workflows/no-torch.yml
@@ -0,0 +1,45 @@
name: no-torch

on:
workflow_dispatch:
pull_request:
paths:
- '.github/workflows/no-torch.yml'
schedule:
- cron: "0 0 * * *"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

permissions:
contents: read
issues: write

jobs:
unit-tests:
runs-on: ubuntu-22.04

steps:
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv

- name: Python environment
run: |
pip uninstall torch --yes
pip list
- name: Build deepspeed
run: |
DS_BUILD_STRING=" " python setup.py sdist
- name: Open GitHub issue if nightly CI fails
if: ${{ failure() && (github.event_name == 'schedule') }}
uses: JasonEtco/create-an-issue@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
update_existing: true
2 changes: 1 addition & 1 deletion csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
@@ -20,7 +20,7 @@ deepspeed_gds_handle_t::deepspeed_gds_handle_t(const int block_size,
const bool single_submit,
const bool overlap_events,
const int num_threads)
- : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads)
+ : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, 1)
{
_init_cuFile(block_size, queue_depth, num_threads);
}
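The constructor change above pins the base deepspeed_io_handle_t to a single thread while _init_cuFile still receives the caller's num_threads, so the thread count now only governs the GDS/cuFile side. A hedged sketch of how this surfaces from Python, assuming the loaded extension exposes a gds_handle class with the same constructor arguments as the C++ signature above (the class name and argument values are illustrative, not verified against the bindings):

from deepspeed.ops.op_builder import GDSBuilder

gds_module = GDSBuilder().load()  # builds/loads the gds extension if compatible

# Arguments mirror the C++ constructor in the diff:
# (block_size, queue_depth, single_submit, overlap_events, num_threads).
# After this commit, num_threads reaches _init_cuFile only; the underlying
# deepspeed_io_handle_t is always constructed with a single thread.
handle = gds_module.gds_handle(1024 * 1024, 128, False, False, 8)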
2 changes: 1 addition & 1 deletion deepspeed/comm/ccl.py
@@ -15,7 +15,7 @@

def build_ccl_op():
builder = get_accelerator().create_op_builder("CCLCommBuilder")
- if builder is None or NotImplementedBuilder:
+ if builder is None or isinstance(builder, NotImplementedBuilder):
return None
ccl_cpp_module = builder.load()
print(f'DeepSpeed {builder.absolute_name()} built successfully')
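The ccl.py fix addresses a truthiness bug: NotImplementedBuilder is a class object and therefore always truthy, so the old condition was true for every builder and build_ccl_op() returned None even when a working CCLCommBuilder was available. A minimal self-contained sketch of the difference (the classes below are stand-ins, not DeepSpeed's real builders):

class NotImplementedBuilder:  # stand-in for DeepSpeed's placeholder builder
    pass


class CCLCommBuilder:  # stand-in for a real, working builder
    pass


builder = CCLCommBuilder()

# Old check: the bare class object is truthy, so this is always True
# and the function bails out regardless of the builder it was given.
print(builder is None or NotImplementedBuilder)                        # True

# Fixed check: only bail out when there is no builder or it is the placeholder.
print(builder is None or isinstance(builder, NotImplementedBuilder))   # False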
5 changes: 5 additions & 0 deletions op_builder/gds.py
@@ -36,6 +36,11 @@ def extra_ldflags(self):
return super().extra_ldflags() + ['-lcufile']

def is_compatible(self, verbose=False):
if self.is_rocm_pytorch():
if verbose:
self.warning(f'{self.NAME} is not compatible with ROCM')
return False

try:
import torch.utils.cpp_extension
except ImportError:
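The op_builder/gds.py change makes the GDS builder refuse ROCm builds up front, since GPUDirect Storage depends on cuFile, before the existing torch-import check runs. A short usage sketch, assuming the class is exposed as GDSBuilder under deepspeed.ops.op_builder like the other op builders (the import path is an assumption):

from deepspeed.ops.op_builder import GDSBuilder  # assumed export path

builder = GDSBuilder()
# On a ROCm build of PyTorch this now returns False immediately
# (and emits the warning when verbose=True) instead of proceeding
# to the torch/cuFile checks further down in is_compatible().
if builder.is_compatible(verbose=True):
    gds_module = builder.load()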
10 changes: 8 additions & 2 deletions tests/unit/inference/test_inference.py
@@ -298,6 +298,12 @@ def verify_injection(module):
verify_injection(model)


# Used to Get Device name
def getDeviceId(local_rank):
device = torch.device(f"{get_accelerator().device_name(local_rank)}")
return device


# Verify that test is valid
def validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton):
model, task = model_w_task
@@ -484,8 +490,8 @@ def test(
pytest.skip(f"Accelerator {get_accelerator().device_name()} does not support {dtype}.")

local_rank = int(os.getenv("LOCAL_RANK", "0"))
-
- pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=local_rank, framework="pt")
+ device = getDeviceId(local_rank)
+ pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=device, framework="pt")
bs_output = pipe(query, **inf_kwargs)
pipe.model = deepspeed.init_inference(pipe.model,
mp_size=self.world_size,
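The test now passes an explicit torch.device built from the accelerator name instead of a bare local_rank integer, which transformers' pipeline would otherwise treat as a CUDA ordinal; building the device from get_accelerator().device_name() keeps the test correct on non-CUDA accelerators. A standalone sketch of the same idea, using a plain CUDA/CPU fallback in place of DeepSpeed's get_accelerator() (the model and task below are arbitrary examples):

import torch
from transformers import pipeline


def get_device(local_rank: int) -> torch.device:
    # Mirrors the test helper: turn a rank into an explicit device object.
    # Plain CUDA/CPU stands in for get_accelerator().device_name(local_rank).
    if torch.cuda.is_available():
        return torch.device(f"cuda:{local_rank}")
    return torch.device("cpu")


device = get_device(0)
# pipeline() accepts an int, a string, or a torch.device; passing the device
# object keeps the accelerator type explicit rather than implying CUDA.
pipe = pipeline("text-generation", model="gpt2", device=device, framework="pt")
print(pipe("DeepSpeed inference smoke test:", max_new_tokens=8)[0]["generated_text"])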
