From b82dc2b511a62c7efad4825fe325fe8d3082866a Mon Sep 17 00:00:00 2001 From: Boris Sarana Date: Fri, 15 Nov 2024 09:22:28 -0800 Subject: [PATCH] Do barriers only once per PG initialization (#2558) Summary: Context: https://fb.workplace.com/groups/970281557043698/permalink/1794944644577381/ The barriers during PG init introduce significant overhead, especially on large jobs. Differential Revision: D65912439 --- torchrec/distributed/comm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torchrec/distributed/comm.py b/torchrec/distributed/comm.py index e3e50a2d1..f3edd56e7 100644 --- a/torchrec/distributed/comm.py +++ b/torchrec/distributed/comm.py @@ -135,8 +135,8 @@ def intra_and_cross_node_pg( "[Connection] intra_group: [%d] -> [%s]" % (my_rank, peers) ) _INTRA_PG = curr_intra_group_pg - - dist.barrier() + assert _INTRA_PG is not None + dist.barrier() if _CROSS_PG is None: for l_rank in range(local_size): @@ -147,7 +147,7 @@ def intra_and_cross_node_pg( "[Connection] cross_group: [%d] -> [%s]" % (my_rank, peers) ) _CROSS_PG = curr_cross_group_pg - - dist.barrier() + assert _CROSS_PG is not None + dist.barrier() return _INTRA_PG, _CROSS_PG