
Commit

Merge branch 'master' into loadams/lamb-bf16
loadams authored Aug 27, 2024
2 parents 0b330c1 + 8ac42ed commit 117b4df
Showing 2 changed files with 43 additions and 35 deletions.
23 changes: 14 additions & 9 deletions deepspeed/runtime/zero/mics.py
@@ -17,7 +17,7 @@
 from deepspeed.runtime.zero.parameter_offload import DeepSpeedZeRoOffload
 from deepspeed.runtime.zero.partition_parameters import Init, AllGatherCoalescedHandle, ZeroParamStatus
 from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3
-from deepspeed.utils import instrument_w_nvtx, log_dist
+from deepspeed.utils import instrument_w_nvtx, log_dist, logger
 from deepspeed.accelerator import get_accelerator
 from torch import Tensor
 from torch.nn import Parameter
@@ -88,6 +88,8 @@ def __init__(self,
                 if it was constructed in the context.
             data_parallel_group (``deepspeed.comm`` process group, optional):
                 The group of processes to partition among. Defaults to all processes.
+                Synonymous with sequence data parallel group for param partitioning
+                across both sequence and data parallel groups.
             mem_efficient_linear (bool, optional): Replace
                 torch.nn.functional.linear with an implementation that allows
                 DeepSpeed to partition parameters. Defaults to ``True``.
@@ -149,16 +151,19 @@ def __init__(self,
             dist.init_distributed()
             assert dist.is_initialized(), "Parameters cannot be scattered without initializing deepspeed.comm"
 
-        if data_parallel_group is None and sequence_data_parallel_group is None:
+        if data_parallel_group is None:
             ds_process_group = dist.get_world_group()
-        elif sequence_data_parallel_group is not None:
-            ds_process_group = sequence_data_parallel_group
-        elif data_parallel_group is not None:
+        else:
             ds_process_group = data_parallel_group
-        else: # both given
-            raise ValueError(
-                "Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments."
-            )
+
+        if sequence_data_parallel_group is not None:
+            logger.warning(
+                f"sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.")
+            if data_parallel_group is not None:
+                raise ValueError(
+                    "Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments."
+                )
+            self.ds_process_group = sequence_data_parallel_group
 
         self.mics_comm_groups = create_mics_comm_groups(
             _ds_config.mics_shard_size,
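Net effect of the mics.py hunks above (mirrored in partition_parameters.py below): data_parallel_group becomes the preferred argument, an omitted group falls back to the world group, and sequence_data_parallel_group is kept as a deprecated alias that logs a warning and rejects being combined with data_parallel_group. A minimal standalone sketch of that resolution order — resolve_group is a hypothetical helper, and the string "WORLD" stands in for dist.get_world_group() so the sketch runs without DeepSpeed:

import logging

logger = logging.getLogger(__name__)

def resolve_group(data_parallel_group=None, sequence_data_parallel_group=None):
    # New default: partition across data_parallel_group, or the whole world if unset.
    group = "WORLD" if data_parallel_group is None else data_parallel_group

    # Deprecated path: sequence_data_parallel_group still works, but only warns,
    # and combining it with data_parallel_group is rejected.
    if sequence_data_parallel_group is not None:
        logger.warning("'sequence_data_parallel_group' is deprecated and will be removed. "
                       "Use 'data_parallel_group' instead.")
        if data_parallel_group is not None:
            raise ValueError("Both 'data_parallel_group' and 'sequence_data_parallel_group' "
                             "were specified. Please provide only one of these arguments.")
        group = sequence_data_parallel_group
    return group

# resolve_group() -> "WORLD"; resolve_group(sequence_data_parallel_group=g) -> g, with a warning;
# resolve_group(data_parallel_group=g1, sequence_data_parallel_group=g2) -> ValueError.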
55 changes: 29 additions & 26 deletions deepspeed/runtime/zero/partition_parameters.py
@@ -814,24 +814,22 @@ class Init(InsertPostInitMethodToModuleSubClasses):
     apply_param_persistence = False
     override_module_apply = get_config_default(DeepSpeedZeroConfig, "override_module_apply")
 
-    def __init__(
-        self,
-        module=None,
-        data_parallel_group=None,
-        mem_efficient_linear=True,
-        remote_device=None,
-        pin_memory=False,
-        config_dict_or_path=None,
-        config=None,
-        enabled=True,
-        dtype=None,
-        mpu=None,
-        zero_param_parallel_group=None,
-        zero_quantized_weights=False,
-        zero_quantized_nontrainable_weights=False,
-        sequence_data_parallel_group=None,
-        param_swapper=None,
-    ):
+    def __init__(self,
+                 module=None,
+                 data_parallel_group=None,
+                 mem_efficient_linear=True,
+                 remote_device=None,
+                 pin_memory=False,
+                 config_dict_or_path=None,
+                 config=None,
+                 enabled=True,
+                 dtype=None,
+                 mpu=None,
+                 zero_param_parallel_group=None,
+                 zero_quantized_weights=False,
+                 zero_quantized_nontrainable_weights=False,
+                 sequence_data_parallel_group=None,
+                 param_swapper=None):
         """A context to enable massive model construction for training with
         ZeRO-3. Models are automatically partitioned (or, sharded) across the
         system and converted to half precision.
@@ -841,6 +839,8 @@ def __init__(
                 if it was constructed in the context.
             data_parallel_group (``deepspeed.comm`` process group, optional):
                 The group of processes to partition among. Defaults to all processes.
+                Synonymous with sequence data parallel group for param partitioning
+                across both sequence and data parallel groups.
             mem_efficient_linear (bool, optional): Replace
                 torch.nn.functional.linear with an implementation that allows
                 DeepSpeed to partition parameters. Defaults to ``True``.
@@ -940,16 +940,19 @@ def __init__(
             init_distributed()
             assert dist.is_initialized(), "Parameters cannot be scattered without initializing deepspeed.comm"
 
-        if data_parallel_group is None and sequence_data_parallel_group is None:
+        if data_parallel_group is None:
             self.ds_process_group = dist.get_world_group()
-        elif sequence_data_parallel_group is not None:
-            self.ds_process_group = sequence_data_parallel_group
-        elif data_parallel_group is not None:
+        else:
             self.ds_process_group = data_parallel_group
-        else: # both given
-            raise ValueError(
-                "Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments."
-            )
+
+        if sequence_data_parallel_group is not None:
+            logger.warning(
+                f"sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.")
+            if data_parallel_group is not None:
+                raise ValueError(
+                    "Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments."
+                )
+            self.ds_process_group = sequence_data_parallel_group
 
         self.rank = dist.get_rank(group=self.ds_process_group)
         self.dp_world_size = dist.get_world_size(group=self.ds_process_group)
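For callers, the partition_parameters.py change above means existing deepspeed.zero.Init call sites keep working: omitting the group partitions over the world group, data_parallel_group selects an explicit deepspeed.comm group, and sequence_data_parallel_group still functions but logs a deprecation warning. A minimal usage sketch under those assumptions — the stage-3 config dict and the torch.nn.Linear model are placeholders, and the snippet assumes a distributed run launched via the deepspeed launcher:

import torch
import deepspeed

ds_config = {"zero_optimization": {"stage": 3}}  # placeholder ZeRO-3 config

# Preferred after this commit: let the group default to the world group, or pass
# an explicit deepspeed.comm process group via data_parallel_group=...
with deepspeed.zero.Init(config_dict_or_path=ds_config):
    model = torch.nn.Linear(1024, 1024)  # parameters are partitioned as they are constructed

# Still accepted, but now logs the deprecation warning added in this commit:
# with deepspeed.zero.Init(config_dict_or_path=ds_config, sequence_data_parallel_group=my_group):
#     ...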
