diff --git a/musiclm_pytorch/distributed.py b/musiclm_pytorch/distributed.py
index fa5d640..71364ac 100644
--- a/musiclm_pytorch/distributed.py
+++ b/musiclm_pytorch/distributed.py
@@ -1,26 +1,34 @@
 import torch
 from torch import nn
 from torch.autograd import Function
-import torch.distributed as distributed
+import torch.distributed as dist
 
 from einops import rearrange
 
 # distributed helpers
 
+def all_gather_same_dim(t):
+    world_size = dist.get_world_size()
+    gathered_tensors = [torch.empty_like(t, device = t.device, dtype = t.dtype) for i in range(world_size)]
+    dist.all_gather(gathered_tensors, t)
+    return gathered_tensors
+
 def all_gather_variable_dim(t, dim = 0, sizes = None):
-    device, rank, world_size = t.device, distributed.get_rank(), distributed.get_world_size()
+    device, rank, world_size = t.device, dist.get_rank(), dist.get_world_size()
 
     if not exists(sizes):
         size = torch.tensor(t.shape[dim], device = device, dtype = torch.long)
-        sizes = [torch.empty_like(size, device = device, dtype = torch.long) for i in range(world_size)]
-        distributed.all_gather(sizes, size)
+        sizes = all_gather_same_dim(size)
         sizes = torch.stack(sizes)
 
+    if torch.unique(sizes).numel() == 1:
+        gathered_tensors = all_gather_same_dim(t)
+        return torch.cat(gathered_tensors, dim = dim), sizes
+
     max_size = sizes.amax().item()
-    padded_t = pad_dim_to(t, max_size, dim = dim)
 
-    gathered_tensors = [torch.empty(padded_t.shape, device = device, dtype = padded_t.dtype) for i in range(world_size)]
-    distributed.all_gather(gathered_tensors, padded_t)
+    padded_t = pad_dim_to(t, max_size, dim = dim)
+    gathered_tensors = all_gather_same_dim(padded_t)
 
     gathered_tensor = torch.cat(gathered_tensors, dim = dim)
     seq = torch.arange(max_size, device = device)
@@ -45,9 +53,9 @@ def forward(ctx, x, dim, sizes, all_reduce_grads):
 
     @staticmethod
     def backward(ctx, grads, _):
-        batch_sizes, rank = ctx.batch_sizes, distributed.get_rank()
+        batch_sizes, rank = ctx.batch_sizes, dist.get_rank()
         if ctx.all_reduce_grads:
-            distributed.all_reduce(grads)
+            dist.all_reduce(grads)
 
         grads_by_rank = grads.split(batch_sizes, dim = ctx.dim)
         return grads_by_rank[rank], None, None, None
@@ -62,7 +70,7 @@ def __init__(
         super().__init__()
         self.dim = dim
         self.all_reduce_grads = all_reduce_grads
-        self.is_distributed = distributed.is_initialized() and distributed.get_world_size() > 1
+        self.is_distributed = dist.is_initialized() and dist.get_world_size() > 1
 
     def forward(
         self,
diff --git a/setup.py b/setup.py
index cd7be6c..0f5a209 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'musiclm-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.2.7',
+  version = '0.2.8',
   license='MIT',
   description = 'MusicLM - AudioLM + Audio CLIP to text to music synthesis',
   author = 'Phil Wang',
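
Below is a minimal usage sketch, not part of the patch, of how the patched all_gather_variable_dim might be exercised when ranks hold different batch sizes. It assumes two CPU processes on the "gloo" backend and that the helper is importable as musiclm_pytorch.distributed.all_gather_variable_dim; when every rank contributes the same size, the newly added all_gather_same_dim fast path is taken instead of padding.

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

from musiclm_pytorch.distributed import all_gather_variable_dim

def run(rank, world_size):
    # single-machine rendezvous for the sketch
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group('gloo', rank = rank, world_size = world_size)

    # each rank contributes a different number of rows along dim 0,
    # so the helper pads to the largest size, gathers, then trims the padding
    t = torch.randn(rank + 1, 4)
    gathered, sizes = all_gather_variable_dim(t, dim = 0)

    # expected with world_size = 2: gathered.shape == (3, 4), sizes == [1, 2]
    print(rank, gathered.shape, sizes.tolist())

    dist.destroy_process_group()

if __name__ == '__main__':
    world_size = 2
    mp.spawn(run, args = (world_size,), nprocs = world_size)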