Reducing the memory-overhead of creating model for multi-GPU run (#1244)
Co-authored-by: Jeff Rasley <[email protected]>
RezaYazdaniAminabadi and jeffra authored Aug 26, 2021
1 parent 274c375 commit 49b6a63
Showing 2 changed files with 12 additions and 14 deletions.
23 changes: 9 additions & 14 deletions deepspeed/inference/engine.py
@@ -44,15 +44,6 @@ def __init__(self,
         self.quantize_merge_count = 1
         self.quantization_scales = None
 
-        if self.mpu:
-            self.mp_world_size = dist.get_world_size(
-                group=self.mpu.get_model_parallel_group())
-            self.mp_group = self.mpu.get_model_parallel_group()
-        elif self.mp_world_size > 1 and not dist.is_initialized():
-            self._create_model_parallel_group()
-        else:
-            self.module.to(torch.cuda.current_device())
-
         self._check_quantize_setting(quantization_setting)
 
         if self.checkpoint:
@@ -62,13 +53,22 @@ def __init__(self,
         if self.dtype:
             self._convert_to_dtype()
 
+        if self.mpu:
+            self.mp_world_size = dist.get_world_size(
+                group=self.mpu.get_model_parallel_group())
+            self.mp_group = self.mpu.get_model_parallel_group()
+        elif self.mp_world_size > 1 and not dist.is_initialized():
+            self._create_model_parallel_group()
+
         # apply injection policy
         if self.injection_dict:
             for client_module, injection_policy in self.injection_dict.items():
                 self._apply_injection_policy(client_module, injection_policy)
         elif replace_method == 'auto':
             self._apply_injection_policy()
 
+        self.module.to(torch.cuda.current_device())
+
         if self.mp_world_size > 1:
             self.model_orig_fwd = self.module.forward
             self.module.forward = self.forward
@@ -96,11 +96,6 @@ def _create_model_parallel_group(self):
         ranks = [i for i in range(self.mp_world_size)]
         self.mp_group = dist.new_group(ranks)
 
-        self.module.to(torch.cuda.current_device())
-        for p in self.module.parameters():
-            if torch.is_tensor(p):
-                dist.broadcast(p, 0)
-
     def _check_quantize_setting(self, quantization_setting):
         self.quatize_bits = 8
         self.mlp_extra_grouping = False
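
For context, a minimal usage sketch of the inference-engine path this commit changes. The `deepspeed.init_inference` call, the GPT-2 model, and the launcher command are illustrative assumptions rather than part of the commit; the point is that, after this change, the module is converted to the target dtype and has the injection policy applied before the single `module.to(torch.cuda.current_device())` call, so the full-precision model no longer has to fit on the GPU first.

# Hypothetical example (not from the commit): exercise the InferenceEngine path.
# Launch with e.g.: deepspeed --num_gpus 2 run_inference.py
import torch
import deepspeed
from transformers import GPT2LMHeadModel  # any torch.nn.Module works; GPT-2 is only an example

model = GPT2LMHeadModel.from_pretrained("gpt2")  # built on CPU, in full precision

# With this commit, dtype conversion and tensor-parallel/kernel injection run first;
# only then is the (smaller, partitioned) module moved to the current GPU.
engine = deepspeed.init_inference(model,
                                  mp_size=2,            # model-parallel world size
                                  dtype=torch.half,     # converted before .to(cuda)
                                  replace_method='auto')

input_ids = torch.tensor([[50256]]).to(torch.cuda.current_device())
outputs = engine(input_ids)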
3 changes: 3 additions & 0 deletions deepspeed/module_inject/replace_module.py
@@ -137,6 +137,9 @@ def replace_with_policy(child, policy_cls, inference=False, preln=True, layer_id
 
         if inference:
             hidden_size, num_attention_heads = policy.get_hidden_heads()
+            assert num_attention_heads % mp_size == 0,\
+                "To run the model parallel across the GPUs, the attention_heads require to be divisible by the world_size!" +\
+                "This is because the attention computation is partitioned evenly among the parallel GPUs."
 
         attn_linear_layer, qkvw, qkvb, dense_w, dense_b, scale_attention = policy.attention()
         mlp_linear_layer, _h4h_w, _h4h_b, _4hh_w, _4hh_b = policy.mlp()
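
A quick illustration of the new assertion (the helper name and numbers below are made up, not from the commit): the attention computation is partitioned evenly across the model-parallel GPUs, so the head count has to be a multiple of the model-parallel world size.

# Hypothetical helper mirroring the new check in replace_with_policy.
def heads_per_gpu(num_attention_heads: int, mp_size: int) -> int:
    assert num_attention_heads % mp_size == 0, \
        "attention heads must be divisible by the model-parallel world size"
    return num_attention_heads // mp_size

print(heads_per_gpu(12, mp_size=4))    # 3 heads per GPU: allowed
# heads_per_gpu(12, mp_size=5)         # raises AssertionError, as the new check would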
