uniform deepspeed overflow check #5424
base: master
@@ -1473,6 +1473,7 @@ def _configure_bf16_optimizer(self, optimizer):
     timers = self.timers if self.wall_clock_breakdown() else NoopTimer()
     optimizer = BF16_Optimizer(optimizer,
                                self.param_names,
+                               deepspeed=self,
Review comment: Don't pass the engine object here; see the related comment on CheckOverflow.__init__ below.
                               mpu=self.mpu,
                               clip_grad=clip_grad,
                               allgather_bucket_size=self.zero_allgather_bucket_size(),
@@ -1532,6 +1533,7 @@ def _configure_zero_optimizer(self, optimizer):
         overlap_comm=overlap_comm,
         offload_optimizer_config=self.zero_offload_optimizer(),
         mpu=self.mpu,
+        deepspeed=self,
         postscale_gradients=self.postscale_gradients(),
         gradient_predivide_factor=self.gradient_predivide_factor(),
         gradient_accumulation_steps=self.gradient_accumulation_steps(),
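Both hunks pass the engine handle down so the optimizer's overflow checker can consult engine state. A minimal sketch of that intent, assuming the optimizer simply forwards the handle to CheckOverflow as modified by this PR (the optimizer class and its parameter names below are illustrative, not the actual BF16_Optimizer or ZeRO constructor):

from deepspeed.runtime.utils import CheckOverflow

class SketchOptimizer:
    # Illustrative only: shows how `deepspeed=self` from engine.py could be
    # threaded through to the overflow checker.
    def __init__(self, init_optimizer, param_names, deepspeed=None, mpu=None):
        self.overflow_checker = CheckOverflow(param_groups=init_optimizer.param_groups,
                                              mpu=mpu,
                                              deepspeed=deepspeed,     # engine handle from the hunks above
                                              partition_grads=True)    # assumption: set when gradients are ZeRO-partitioned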
@@ -181,12 +181,13 @@ def get_norm_with_moe_layers_fast(all_groups_norm, group):
 class CheckOverflow(object):
     '''Checks for overflow in gradient across parallel process'''

-    def __init__(self, param_groups=None, mpu=None, zero_reduce_scatter=False, deepspeed=None):
+    def __init__(self, param_groups=None, mpu=None, zero_reduce_scatter=False, deepspeed=None, partition_grads=False):
Review comment: Passing the DeepSpeed engine into a submodule is not a good design and creates all sorts of cyclic-reference issues. It is better to pass the specific attributes that are needed, such as enable_backward_allreduce.
         self.mpu = mpu
         self.params = [] if param_groups else None
         self.zero_reduce_scatter = zero_reduce_scatter
         self.deepspeed = deepspeed
         self.has_moe_params = False
+        self.partition_grads = partition_grads
         if param_groups:
             for group in param_groups:
                 for param in group:
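A hypothetical sketch of the alternative the reviewer suggests: hand CheckOverflow the specific flags it reads instead of the whole engine object. The extra parameter names are assumptions for illustration, not DeepSpeed's actual API:

class CheckOverflow(object):
    '''Checks for overflow in gradient across parallel processes'''

    # Hypothetical constructor following the review suggestion: plain flags
    # instead of a back-reference to the engine, avoiding cyclic references.
    def __init__(self, param_groups=None, mpu=None, zero_reduce_scatter=False,
                 enable_backward_allreduce=True, using_pipeline=False, partition_grads=False):
        self.mpu = mpu
        self.params = [] if param_groups else None
        self.zero_reduce_scatter = zero_reduce_scatter
        self.enable_backward_allreduce = enable_backward_allreduce
        self.using_pipeline = using_pipeline
        self.partition_grads = partition_grads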
@@ -234,7 +235,7 @@ def check(self, param_groups=None):
     # `params` is a list / generator of torch.Variable
     def has_overflow_serial(self, params):
         for i, p in enumerate(params):
-            if p.grad is not None and self._has_inf_or_nan(p.grad.data, i):
+            if p.grad is not None and self._has_inf_or_nan(p.grad.data):
                 return True
         return False
@@ -261,15 +262,16 @@ def has_overflow(self, params, has_moe_params=None):
                 not using_pipeline and self.deepspeed.enable_backward_allreduce is False):
             dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_data_parallel_group())
             dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_model_parallel_group())
-        elif self.deepspeed is not None and self.deepspeed.enable_backward_allreduce is False:
+        elif self.deepspeed is not None and (self.deepspeed.enable_backward_allreduce is False
+                                             or self.partition_grads is True):
             dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=dist.get_world_group())

         overflow = overflow_gpu[0].item()
         return bool(overflow)

     # `x` is a torch.Tensor
     @staticmethod
-    def _has_inf_or_nan(x, i):
+    def _has_inf_or_nan(x):
         try:
             # if x is half, the .float() incurs an additional deep copy, but it's necessary if
             # Pytorch's .sum() creates a one-element tensor of the same type as x
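The hunk is truncated at the try block. For context, a sketch of the inf/NaN check this static method performs, following the common apex/DeepSpeed pattern rather than the literal body from this PR:

import torch

class OverflowCheckSketch:
    # Sketch of the standard inf/NaN test: sum the tensor in float32 and
    # inspect the resulting scalar.
    @staticmethod
    def _has_inf_or_nan(x):
        try:
            # If x is half, .float() incurs an extra copy, but it avoids .sum()
            # producing a one-element half tensor that can itself overflow.
            cpu_sum = float(x.float().sum())
        except RuntimeError as instance:
            # Some backends raise when the sum cannot be converted to a Python
            # float; treat that as an overflow rather than a hard failure.
            if "value cannot be converted" not in instance.args[0]:
                raise
            return True
        else:
            # inf, -inf, or NaN (NaN != NaN) all indicate overflow.
            if cpu_sum in (float('inf'), -float('inf')) or cpu_sum != cpu_sum:
                return True
            return False

# Example: OverflowCheckSketch._has_inf_or_nan(torch.tensor([1.0, float('nan')])) -> True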
Review comment: Should self.overflow be a class member, given that it seems to be used only once?
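A hypothetical illustration of the point behind that question (not code from this PR): a value that is read only once can stay local rather than being stored on the instance.

class Example:
    def overflow_as_local(self, overflow_gpu):
        # Local variable: sufficient when the value is consumed exactly once.
        overflow = overflow_gpu[0].item()
        return bool(overflow)

    def overflow_as_member(self, overflow_gpu):
        # Instance attribute: only worthwhile if other methods read it later.
        self.overflow = overflow_gpu[0].item()
        return bool(self.overflow)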