From 3f875d95193fbd3a0c7f0c0dcc8d39469061bb66 Mon Sep 17 00:00:00 2001 From: shiyuan680 <72335504+shiyuan680@users.noreply.github.com> Date: Sun, 21 Apr 2024 07:35:50 +0800 Subject: [PATCH] add device config env for the accelerator (#5396) Thanks to [this PR](https://github.com/microsoft/DeepSpeed/pull/5369) and to @delock for contributing ideas. As mentioned in [that PR](https://github.com/microsoft/DeepSpeed/pull/5369), each device has its own environment variables. We add visible_devices_envs() and set_visible_devices_envs() methods to the accelerator class so that each accelerator can implement its env settings behind the interface, which generalizes better to other accelerators. This commit has been tested on NPU systems, each with 8 Ascend NPUs. --------- Co-authored-by: yangcheng Co-authored-by: eigen2017 Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- README.md | 11 ++++++----- accelerator/abstract_accelerator.py | 8 ++++++++ accelerator/cpu_accelerator.py | 8 ++++++++ accelerator/cuda_accelerator.py | 7 +++++++ accelerator/hpu_accelerator.py | 7 +++++++ accelerator/mps_accelerator.py | 9 +++++++++ accelerator/npu_accelerator.py | 7 +++++++ accelerator/xpu_accelerator.py | 7 +++++++ deepspeed/launcher/launch.py | 11 +++++++---- 9 files changed, 66 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a1335caa4949..201b9016f8ab 100755 --- a/README.md +++ b/README.md @@ -159,11 +159,12 @@ dynamically link them at runtime. ## Contributed HW support * DeepSpeed now support various HW accelerators. 
-| Contributor | Hardware | Accelerator Name | Contributor validated | Upstream validated | -| ----------- | -------- | ---------------- | --------------------- | ------------------ | -| Intel | Intel(R) Gaudi(R) 2 AI accelerator | hpu | Yes | Yes | -| Intel | Intel(R) Xeon(R) Processors | cpu | Yes | Yes | -| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | Yes | +| Contributor | Hardware | Accelerator Name | Contributor validated | Upstream validated | +|-------------|-------------------------------------|------------------| --------------------- |--------------------| +| Huawei | Huawei Ascend NPU | npu | Yes | No | +| Intel | Intel(R) Gaudi(R) 2 AI accelerator | hpu | Yes | Yes | +| Intel | Intel(R) Xeon(R) Processors | cpu | Yes | Yes | +| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | Yes | ## PyPI We regularly push releases to [PyPI](https://pypi.org/project/deepspeed/) and encourage users to install from there in most cases. diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index 3c5d799e293e..768d5ea34e5e 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -287,3 +287,11 @@ def build_extension(self): @abc.abstractmethod def export_envs(self): ... + + @abc.abstractmethod + def visible_devices_envs(self): + ... + + @abc.abstractmethod + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + ... 
diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index a0171723cfb8..237e7f51dcb4 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -322,3 +322,11 @@ def build_extension(self): def export_envs(self): return [] + + # TODO: cpu's visible-devices env is not yet confirmed; keep CUDA_VISIBLE_DEVICES for now + def visible_devices_envs(self): + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index 3d5e9c168c16..2fc0cfd94125 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -360,3 +360,10 @@ def build_extension(self): def export_envs(self): return ['NCCL'] + + def visible_devices_envs(self): + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py index 30b115e8b1ab..326efc8fa01b 100644 --- a/accelerator/hpu_accelerator.py +++ b/accelerator/hpu_accelerator.py @@ -294,3 +294,10 @@ def build_extension(self): def export_envs(self): return [] + + def visible_devices_envs(self): + return ['HABANA_VISIBLE_MODULES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index 972b33caece1..ff70b860d7c7 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -258,3 +258,12 @@ def build_extension(self): def export_envs(self): return [] + + # TODO: mps's visible-devices env is not yet confirmed; keep CUDA_VISIBLE_DEVICES for now 
+ def visible_devices_envs(self): + # TODO: could not find visible devices env for mps + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py index 472157e32c02..5d891ecb707d 100644 --- a/accelerator/npu_accelerator.py +++ b/accelerator/npu_accelerator.py @@ -278,3 +278,10 @@ def build_extension(self): def export_envs(self): return ['ASCEND', 'HCCL', 'LD_LIBRARY', 'PATH'] + + def visible_devices_envs(self): + return ['ASCEND_RT_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py index 3f65263946ab..c59f60077d2f 100644 --- a/accelerator/xpu_accelerator.py +++ b/accelerator/xpu_accelerator.py @@ -289,3 +289,10 @@ def build_extension(self): def export_envs(self): return [] + + def visible_devices_envs(self): + return ['ZE_AFFINITY_MASK'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index ffb9cbc18fbd..079182a3b45b 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -22,6 +22,7 @@ from collections import defaultdict from typing import Dict from argparse import ArgumentParser, REMAINDER +from deepspeed.accelerator import get_accelerator from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT from ..nebula.constants import DLTS_POD_ENV_PATH from ..utils import logger, get_numactl_cmd @@ -146,8 +147,8 @@ def main(): node_list = list(world_info.keys()) args.nnodes = len(node_list) 
local_node = node_list[args.node_rank] - local_gpu_ids = world_info[local_node] - num_local_procs = len(local_gpu_ids) + local_accelerator_ids = world_info[local_node] + num_local_procs = len(local_accelerator_ids) logger.info(f"nnodes={args.nnodes}, num_local_procs={num_local_procs}, node_rank={args.node_rank}") global_rank_mapping = defaultdict(list) @@ -161,8 +162,10 @@ def main(): curr_global_rank += 1 logger.info(f"global_rank_mapping={global_rank_mapping}") logger.info(f"dist_world_size={dist_world_size}") - current_env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, local_gpu_ids)) - logger.info(f"Setting CUDA_VISIBLE_DEVICES={current_env['CUDA_VISIBLE_DEVICES']}") + + get_accelerator().set_visible_devices_envs(current_env, local_accelerator_ids) + for env in get_accelerator().visible_devices_envs(): + logger.info(f"Setting {env}={current_env[env]}") # set PyTorch distributed related environmental variables current_env["MASTER_ADDR"] = args.master_addr