Skip to content

Commit

Permalink
Merge branch 'master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
HeyangQin authored Jul 16, 2024
2 parents a89fceb + 78c6c44 commit b8929ca
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 2 deletions.
4 changes: 4 additions & 0 deletions blogs/deepspeed-fastgen/chinese/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,10 @@ DeepSpeed-FastGen 是 [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII
* [LLaMA](https://huggingface.co/models?other=llama)[LLaMA-2](https://huggingface.co/models?other=llama-2)
* [Mistral](https://huggingface.co/models?other=mistral)
* [OPT](https://huggingface.co/models?other=opt)
* [Falcon](https://huggingface.co/models?other=falcon)
* [Mixtral](https://huggingface.co/models?other=mixtral)
* [Phi-2](https://huggingface.co/models?other=phi-msft)
* [Qwen](https://huggingface.co/models?other=qwen)

所有当前模型都利用了后端的 [HuggingFace](https://github.com/huggingface) API 来提供模型权重和模型对应的分词器。

Expand Down
20 changes: 18 additions & 2 deletions deepspeed/launcher/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,12 @@ def parse_args(args=None):
help="(optional) IP address of node 0, will be "
"inferred via 'hostname -I' if not specified.")

parser.add_argument("--node_rank",
default=-1,
type=int,
help="ID of each node in the range [0:N). "
"Only required when --no_ssh is set.")

parser.add_argument("--launcher",
default=PDSH_LAUNCHER,
type=str,
Expand Down Expand Up @@ -145,6 +151,10 @@ def parse_args(args=None):
help="Do not pass local_rank as an argument when calling "
"the user's training script.")

parser.add_argument("--no_ssh",
action="store_true",
help="Launch training independently on each node without ssh setup.")

parser.add_argument("--no_ssh_check",
action="store_true",
help="Do not perform ssh check in multi-node launcher model")
Expand Down Expand Up @@ -427,7 +437,7 @@ def main(args=None):
env = os.environ.copy()

# validate that passwordless-ssh is workly properly with this hostfile
if multi_node_exec and not args.no_ssh_check:
if multi_node_exec and not args.no_ssh_check and not args.no_ssh:
first_host = list(active_resources.keys())[0]
try:
ssh_check_cmd = "ssh -o PasswordAuthentication=no "
Expand Down Expand Up @@ -483,16 +493,22 @@ def main(args=None):
if args.elastic_training:
assert not args.no_local_rank, "--no_local_rank argument is not supported in Elastic training"

if args.no_ssh:
assert (0 <= args.node_rank <
len(active_resources)), "Launching training without ssh, but --node_rank is not set correctly."

# encode world info as base64 to make it easier to pass via command line
world_info_base64 = encode_world_info(active_resources)

multi_node_exec = args.force_multi or len(active_resources) > 1
multi_node_exec = (args.force_multi or len(active_resources) > 1) and not args.no_ssh

if not multi_node_exec:
deepspeed_launch = [
sys.executable, "-u", "-m", "deepspeed.launcher.launch", f"--world_info={world_info_base64}",
f"--master_addr={args.master_addr}", f"--master_port={args.master_port}"
]
if args.no_ssh:
deepspeed_launch.append(f"--node_rank={args.node_rank}")
if args.no_python:
deepspeed_launch.append("--no_python")
if args.module:
Expand Down

0 comments on commit b8929ca

Please sign in to comment.