microsoft · mrwyattii · Dec 4, 2023 · Dec 4, 2023 · Dec 4, 2023 · Dec 13, 2023
diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml
@@ -18,7 +18,7 @@ concurrency:
 
 jobs:
   unit-tests:
-    runs-on: [self-hosted, nvidia, cu111, v100]
+    runs-on: [self-hosted, nvidia, cu116, v100]
 
     steps:
       - uses: actions/checkout@v3
@@ -28,7 +28,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
+          pip install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 

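diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml
@@ -29,7 +29,7 @@ jobs: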
       - name: Install pytorch
         run: |
           # use the same pytorch version as transformers CI
-          pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu118 --index-url https://download.pytorch.org/whl/cu118
+          pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -38,9 +38,13 @@ jobs:
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if needed switch to the last known good SHA until transformers@master is fixed
-          git checkout e7e9261a2
+          #git checkout e7e9261a2
           git rev-parse --short HEAD
-          pip install .
+          pip install .[testing]
+          # find reqs used in ds integration tests
+          find examples/pytorch -regextype posix-egrep -regex '.*(language-modeling|question-answering|summarization|image-classification|text-classification|translation).*/requirements.txt' -exec grep -v 'torch' {} \; | xargs -I {} pip install --upgrade {}
+          # force protobuf version due to issues
+          pip install "protobuf<4.21.0"
 
       - name: Install deepspeed
         run: |
@@ -55,10 +59,4 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd transformers
-          pip install .[testing]
-          # find reqs used in ds integration tests
-          find examples/pytorch -regextype posix-egrep -regex '.*(language-modeling|question-answering|summarization|image-classification|text-classification|translation).*/requirements.txt' -exec grep -v 'torch' {} \; | xargs -I {} pip install --upgrade {}
-          # force protobuf version due to issues
-          pip install "protobuf<4.21.0"
-          pip list
-          WANDB_DISABLED=true RUN_SLOW=1 pytest $PYTEST_OPTS tests/deepspeed
+          WANDB_DISABLED=true RUN_SLOW=1 pytest $PYTEST_OPTS tests/deepspeed -k "not zero_to_fp32_zero3_qa_mpnet and not zero_to_fp32_zero3_mlm_funnel and not zero_to_fp32_zero3_trans_m2m_100 and not zero_to_fp32_zero3_mlm_flaubert and not zero_to_fp32_zero3_trans_marian and not zero_to_fp32_zero3_clm_prophetnet and not zero_to_fp32_zero3_clas_bert and not zero_to_fp32_zero3_trans_fsmt"
diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py
@@ -56,13 +56,21 @@ def __init__(self, args, world_info_base64):
     def backend_exists(self):
         return shutil.which('pdsh')
 
+    def parse_user_args(self):
+        # pdsh splits each forwarded argument on whitespace, so an argument
+        # that contains a space is wrapped in double quotes, with any embedded
+        # '"' backslash-escaped; arguments without a space pass through as-is.
+        def _quote_if_spaced(token):
+            if " " not in token:
+                return token
+            return '"{}"'.format(token.replace('"', '\\"'))
+
+        return [_quote_if_spaced(token) for token in self.args.user_args]
+
     @property
     def name(self):
         return "pdsh"
 
-    def parse_user_args(self):
-        return list(map(lambda x: x if x.startswith("-") else f"'{x}'", self.args.user_args))
-
     def get_cmd(self, environment, active_resources):
         environment['PDSH_RCMD_TYPE'] = 'ssh'
         if self.args.ssh_port is not None:  # only specify ssh port if it is specified

diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py
@@ -12,7 +12,6 @@
 import os
 import re
 import sys
-import shlex
 import json
 import base64
 import argparse
@@ -389,9 +388,6 @@ def parse_num_nodes(str_num_nodes: str, elastic_training: bool):
 def main(args=None):
     args = parse_args(args)
 
-    # For when argparse interprets remaining args as a single string
-    args.user_args = shlex.split(" ".join(list(map(lambda x: x if x.startswith("-") else f'"{x}"', args.user_args))))
-
     if args.elastic_training:
         assert args.master_addr != "", "Master Addr is required when elastic training is enabled"
 

diff --git a/tests/unit/launcher/test_user_args.py b/tests/unit/launcher/test_user_args.py
@@ -0,0 +1,64 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import pytest
+import subprocess
+
+from deepspeed.accelerator import get_accelerator
+
+if not get_accelerator().is_available():
+    pytest.skip("only supported in accelerator environments.", allow_module_level=True)
+# Source of the script the tests launch: it prints a marker once argparse accepts its arguments.
+user_arg_test_script = """import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument("--prompt", type=str)
+parser.add_argument("--local_rank", type=int, default=0)
+parser.add_argument("--world_size", type=int, default=1)
+args = parser.parse_args()
+print("ARG PARSE SUCCESS")
+"""
+
+
+@pytest.fixture(scope="function")
+def user_script_fp(tmpdir):
+    """Write the argparse helper script into the test's tmpdir and return its path."""
+    script_path = tmpdir.join("user_arg_test.py")
+    script_path.write(user_arg_test_script)
+    return script_path
+
+
+@pytest.fixture(scope="function")
+def cmd(user_script_fp, prompt, multi_node):
+    """Build the deepspeed launch command; --force_multi is added for the multi-node case."""
+    launcher_flags = ("--force_multi", ) if multi_node else ()
+    resource_flags = ("--num_nodes", "1", "--num_gpus", "1")
+    script_args = (user_script_fp, "--prompt", prompt)
+    return ("deepspeed", ) + launcher_flags + resource_flags + script_args
+
+
+@pytest.mark.parametrize("prompt", [
+    '''"I am 6' tall"''', """'I am 72" tall'""", """'"translate English to Romanian: "'""",
+    '''I'm going to tell them "DeepSpeed is the best"'''
+])
+@pytest.mark.parametrize("multi_node", [True, False])
+def test_user_args(cmd):
+    """Run the launch command and check the helper script parsed the quoted prompt."""
+    out, err = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
+    assert "ARG PARSE SUCCESS" in out.decode("utf-8"), f"User args not parsed correctly: {err.decode('utf-8')}"
+
+
+def test_bash_string_args(tmpdir, user_script_fp):
+    """Forward a single-quoted prompt to deepspeed through bash + xargs and check it parses."""
+    bash_script = f"""
+    ARGS="--prompt 'DeepSpeed is the best'"
+    echo ${{ARGS}}|xargs deepspeed --num_nodes 1 --num_gpus 1 {user_script_fp}
+    """
+    bash_fp = tmpdir.join("bash_script.sh")
+    bash_fp.write(bash_script)
+
+    # Run the generated script and capture both streams for the failure message.
+    launch = subprocess.Popen(["bash", bash_fp], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    out, err = launch.communicate()
+    assert "ARG PARSE SUCCESS" in out.decode("utf-8"), f"User args not parsed correctly: {err.decode('utf-8')}"