
Commit 168d9a3

fix allreduce latency and mem usage when tp is in use
cli99 committed Nov 13, 2024
1 parent acbaf25 commit 168d9a3
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions llm_analysis/analysis.py
@@ -475,9 +475,9 @@ def get_memory_optimizer_state_and_gradient_per_layer(

         memory_optimizer_state_others_per_layer = op_bytes_per_params * (
             (self.get_num_params_per_layer_attn() +
-             +self.get_num_params_per_layer_router() +
-             self.get_num_params_per_layer_layernorm())
-        ) / self.parallelism_config.tp_size / sharded_dp_size
+             +self.get_num_params_per_layer_router()) /
+            self.parallelism_config.tp_size +
+            self.get_num_params_per_layer_layernorm()) / sharded_dp_size

         memory_optimizer_state_per_layer = memory_optimizer_state_mlp_per_layer + memory_optimizer_state_others_per_layer
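
The memory change above stops dividing the layernorm parameters by tp_size: layernorm weights are typically replicated across tensor-parallel ranks, so only the attention and router parameters are sharded by TP, while everything is still divided by the sharded data-parallel degree. A minimal sketch of the old and new formulas with hypothetical per-layer sizes (all numbers below are made up, not taken from the repository):

    op_bytes_per_params = 12          # e.g. fp32 Adam: momentum + variance + master weights
    attn_params = 16_000_000          # hypothetical attention params per layer
    router_params = 0                 # hypothetical (dense model, no MoE router)
    layernorm_params = 8_192          # hypothetical layernorm params per layer
    tp_size, sharded_dp_size = 4, 8   # hypothetical parallelism degrees

    # before: layernorm params were also sharded by tp_size
    old_bytes = op_bytes_per_params * (attn_params + router_params +
                                       layernorm_params) / tp_size / sharded_dp_size
    # after: only attention + router are sharded by tp_size; layernorm is replicated per TP rank
    new_bytes = op_bytes_per_params * ((attn_params + router_params) / tp_size +
                                       layernorm_params) / sharded_dp_size
    # new_bytes > old_bytes, i.e. the old estimate understated optimizer-state memory under TP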

@@ -1218,9 +1218,9 @@ def get_latency_fwd_per_tp_comm(self, batch_size: int, seq_len: int,

         elems_per_all_reduce = (2 * batch_size * seq_len *
                                 self.model_config.hidden_dim * (tp_size - 1) /
                                 tp_size)
-        latency_per_all_reduce = (
-            elems_per_all_reduce * dtype_bytes /
-            (self.gpu_config.intra_node_bandwidth_in_GB_per_sec * 10**9))
+        # assuming tp_size <= number of GPUs per node, thus using intra-node bandwidth
+        latency_per_all_reduce = (elems_per_all_reduce * dtype_bytes /
+                                  (self.get_intra_node_bandwidth() * 10**9))

         return max(
             latency_per_all_reduce,
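
The latency change above keeps the ring all-reduce volume of 2 * (tp_size - 1) / tp_size of the activation elements per rank and switches the bandwidth lookup to get_intra_node_bandwidth(), on the stated assumption that tp_size does not exceed the number of GPUs in a node. A rough worked example with hypothetical values (get_intra_node_bandwidth() is assumed to return an effective intra-node bandwidth in GB/s; all numbers below are made up):

    batch_size, seq_len, hidden_dim = 8, 2048, 4096   # hypothetical shapes
    tp_size, dtype_bytes = 4, 2                       # hypothetical: TP=4, fp16 activations
    intra_node_bw_GB_per_sec = 300                    # hypothetical effective NVLink bandwidth

    elems_per_all_reduce = (2 * batch_size * seq_len * hidden_dim *
                            (tp_size - 1) / tp_size)  # ring all-reduce traffic per rank
    latency_per_all_reduce = (elems_per_all_reduce * dtype_bytes /
                              (intra_node_bw_GB_per_sec * 10**9))
    print(f"{latency_per_all_reduce * 1e6:.0f} us per tensor-parallel all-reduce")  # ~671 us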