From 32581a4d127cc7927e8af4054eb36469ac5523e4 Mon Sep 17 00:00:00 2001
From: Lzhang-hub <57925599+Lzhang-hub@users.noreply.github.com>
Date: Tue, 2 Apr 2024 06:45:03 +0800
Subject: [PATCH] resolve KeyError: 'PDSH_SSH_ARGS_APPEND' (#5318)

When starting a job with
`deepspeed --hostfile hostfile --master_addr $MASTER_IP --ssh_port 20023 src/train_bash.py`
the launcher fails with:

KeyError: 'PDSH_SSH_ARGS_APPEND'

The error is raised in
https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/launcher/multinode_runner.py#L77
because PDSH_SSH_ARGS_APPEND is not set in the environment.

---------

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 deepspeed/launcher/multinode_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py
index 44e694952ffe..ce58deadc281 100644
--- a/deepspeed/launcher/multinode_runner.py
+++ b/deepspeed/launcher/multinode_runner.py
@@ -74,7 +74,8 @@ def name(self):
     def get_cmd(self, environment, active_resources):
         environment['PDSH_RCMD_TYPE'] = 'ssh'
         if self.args.ssh_port is not None:  # only specify ssh port if it is specified
-            environment["PDSH_SSH_ARGS_APPEND"] += f" -p {self.args.ssh_port}"
+            environment["PDSH_SSH_ARGS_APPEND"] = f"{environment.get('PDSH_SSH_ARGS_APPEND', '')} \
+            -p {self.args.ssh_port}"
 
         active_workers = ",".join(active_resources.keys())
         logger.info("Running on the following workers: %s" % active_workers)