From f4cb866c2eba60a72d79c3741d12d098fe4e45b8 Mon Sep 17 00:00:00 2001 From: Nadav Elyahu <88962733+nelyahu@users.noreply.github.com> Date: Tue, 4 Jun 2024 11:25:36 +0300 Subject: [PATCH] estimate_zero2_model_states_mem_needs: fixing memory estiamtion (#5099) was considering 4 bytes per model param, and 4 bytes per gradient. fixed it to 2 bytes - under the assumption of FP16/BF16 --------- Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/zero/stage_1_and_2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index 3d5ff5e6b43e..61652854211d 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -2432,7 +2432,9 @@ def estimate_zero2_model_states_mem_needs(total_params, gpu_mem = 2 * total_params cpu_mem = total_params * max(4 * total_gpus, 16) * additional_buffer_factor else: - gpu_mem = 4 * total_params + int(16 * total_params / total_gpus) + # GPU's total_params multipliers: 2 = params_16bit, + # 18 = 2_grads_16bit + 4_grads_32bit + 4_params_32bit + 8_optimizer_states_32bit(momentum and variance) + gpu_mem = 2 * total_params + int(18 * total_params / total_gpus) cpu_mem = total_params * 4 * num_gpus_per_node * additional_buffer_factor return int(cpu_mem), int(gpu_mem)