Descriptive preamble (text before the first "diff --git" header is not part of any hunk).

Summary of the changes carried by this patch:
  * .github/workflows/no-torch.yml (new file): workflow "no-torch", triggered manually,
    by pull requests touching the workflow file, and by the cron schedule "0 0 * * *".
    It uninstalls torch, lists the installed packages, builds an sdist with
    DS_BUILD_STRING=" ", and opens an issue when a scheduled run fails.
  * csrc/gds/py_lib/deepspeed_py_gds_handle.cpp: the base deepspeed_io_handle_t
    constructor now receives the literal 1 instead of num_threads; num_threads is
    still forwarded to _init_cuFile.
  * deepspeed/comm/ccl.py: the condition "builder is None or NotImplementedBuilder"
    tested a bare name for truthiness (true for an ordinary class object, which would
    make build_ccl_op always return None); it becomes an isinstance() check on builder.
  * op_builder/gds.py: is_compatible() returns False early, with an optional warning,
    when is_rocm_pytorch() is true.
  * tests/unit/inference/test_inference.py: adds the getDeviceId() helper and passes a
    torch.device, rather than the integer local rank, to pipeline().

NOTE(review): the line structure and indentation below were reconstructed from syntax
because the flattened input had lost them. Hunk line counts match the @@ headers, but
verify the whitespace of context lines against the target tree (or apply with a
whitespace-tolerant option) before relying on this patch.

diff --git a/.github/workflows/no-torch.yml b/.github/workflows/no-torch.yml
new file mode 100644
index 000000000000..794c02b3c4db
--- /dev/null
+++ b/.github/workflows/no-torch.yml
@@ -0,0 +1,45 @@
+name: no-torch
+
+on:
+  workflow_dispatch:
+  pull_request:
+    paths:
+      - '.github/workflows/no-torch.yml'
+  schedule:
+    - cron: "0 0 * * *"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  issues: write
+
+jobs:
+  unit-tests:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - id: setup-venv
+        uses: ./.github/workflows/setup-venv
+
+      - name: Python environment
+        run: |
+          pip uninstall torch --yes
+          pip list
+
+      - name: Build deepspeed
+        run: |
+          DS_BUILD_STRING=" " python setup.py sdist
+
+      - name: Open GitHub issue if nightly CI fails
+        if: ${{ failure() && (github.event_name == 'schedule') }}
+        uses: JasonEtco/create-an-issue@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+          update_existing: true
diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
index 3a35ad3145a0..15fd516acaae 100644
--- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
@@ -20,7 +20,7 @@ deepspeed_gds_handle_t::deepspeed_gds_handle_t(const int block_size,
                                                const bool single_submit,
                                                const bool overlap_events,
                                                const int num_threads)
-    : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads)
+    : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, 1)
 {
     _init_cuFile(block_size, queue_depth, num_threads);
 }
diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py
index cdf4c030f5d6..8753cf9f70ed 100644
--- a/deepspeed/comm/ccl.py
+++ b/deepspeed/comm/ccl.py
@@ -15,7 +15,7 @@
 
 def build_ccl_op():
     builder = get_accelerator().create_op_builder("CCLCommBuilder")
-    if builder is None or NotImplementedBuilder:
+    if builder is None or isinstance(builder, NotImplementedBuilder):
         return None
     ccl_cpp_module = builder.load()
     print(f'DeepSpeed {builder.absolute_name()} built successfully')
diff --git a/op_builder/gds.py b/op_builder/gds.py
index 01c2d5a245d1..727ebdf48372 100644
--- a/op_builder/gds.py
+++ b/op_builder/gds.py
@@ -36,6 +36,11 @@ def extra_ldflags(self):
         return super().extra_ldflags() + ['-lcufile']
 
     def is_compatible(self, verbose=False):
+        if self.is_rocm_pytorch():
+            if verbose:
+                self.warning(f'{self.NAME} is not compatible with ROCM')
+            return False
+
         try:
             import torch.utils.cpp_extension
         except ImportError:
diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py
index eadf670d9328..581a2ce433ed 100644
--- a/tests/unit/inference/test_inference.py
+++ b/tests/unit/inference/test_inference.py
@@ -298,6 +298,12 @@ def verify_injection(module):
     verify_injection(model)
 
 
+# Used to Get Device name
+def getDeviceId(local_rank):
+    device = torch.device(f"{get_accelerator().device_name(local_rank)}")
+    return device
+
+
 # Verify that test is valid
 def validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton):
     model, task = model_w_task
@@ -484,8 +490,8 @@ def test(
             pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.")
 
         local_rank = int(os.getenv("LOCAL_RANK", "0"))
-
-        pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=local_rank, framework="pt")
+        device = getDeviceId(local_rank)
+        pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=device, framework="pt")
         bs_output = pipe(query, **inf_kwargs)
         pipe.model = deepspeed.init_inference(pipe.model,
                                               mp_size=self.world_size,