diff --git a/blogs/deepspeed-fastgen/chinese/README.md b/blogs/deepspeed-fastgen/chinese/README.md index fb9cc7319ab6..1e92e4169450 100644 --- a/blogs/deepspeed-fastgen/chinese/README.md +++ b/blogs/deepspeed-fastgen/chinese/README.md @@ -226,6 +226,10 @@ DeepSpeed-FastGen 是 [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII * [LLaMA](https://huggingface.co/models?other=llama) 和 [LLaMA-2](https://huggingface.co/models?other=llama-2) * [Mistral](https://huggingface.co/models?other=mistral) * [OPT](https://huggingface.co/models?other=opt) +* [Falcon](https://huggingface.co/models?other=falcon) +* [Mixtral](https://huggingface.co/models?other=mixtral) +* [Phi-2](https://huggingface.co/models?other=phi-msft) +* [Qwen](https://huggingface.co/models?other=qwen) 所有当前模型都利用了后端的 [HuggingFace](https://github.com/huggingface) API 来提供模型权重和模型对应的分词器。 diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index 0ca89dd0497a..12dd629b1b34 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -117,6 +117,12 @@ def parse_args(args=None): help="(optional) IP address of node 0, will be " "inferred via 'hostname -I' if not specified.") + parser.add_argument("--node_rank", + default=-1, + type=int, + help="ID of each node in the range [0:N). " + "Only required when --no_ssh is set.") + parser.add_argument("--launcher", default=PDSH_LAUNCHER, type=str, @@ -145,6 +151,10 @@ def parse_args(args=None): help="Do not pass local_rank as an argument when calling " "the user's training script.") + parser.add_argument("--no_ssh", + action="store_true", + help="Launch training independently on each node without ssh setup.") + parser.add_argument("--no_ssh_check", action="store_true", help="Do not perform ssh check in multi-node launcher model") @@ -427,7 +437,7 @@ def main(args=None): env = os.environ.copy() # validate that passwordless-ssh is workly properly with this hostfile - if multi_node_exec and not args.no_ssh_check: + if multi_node_exec and not args.no_ssh_check and not args.no_ssh: first_host = list(active_resources.keys())[0] try: ssh_check_cmd = "ssh -o PasswordAuthentication=no " @@ -483,16 +493,22 @@ def main(args=None): if args.elastic_training: assert not args.no_local_rank, "--no_local_rank argument is not supported in Elastic training" + if args.no_ssh: + assert (0 <= args.node_rank < + len(active_resources)), "Launching training without ssh, but --node_rank is not set correctly." + # encode world info as base64 to make it easier to pass via command line world_info_base64 = encode_world_info(active_resources) - multi_node_exec = args.force_multi or len(active_resources) > 1 + multi_node_exec = (args.force_multi or len(active_resources) > 1) and not args.no_ssh if not multi_node_exec: deepspeed_launch = [ sys.executable, "-u", "-m", "deepspeed.launcher.launch", f"--world_info={world_info_base64}", f"--master_addr={args.master_addr}", f"--master_port={args.master_port}" ] + if args.no_ssh: + deepspeed_launch.append(f"--node_rank={args.node_rank}") if args.no_python: deepspeed_launch.append("--no_python") if args.module: