From c52d1e2baf386d5dc6566b820583a3d71e6827d6 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 19 May 2023 23:36:11 +0800 Subject: [PATCH 01/74] add show_straggler argument to log_summary() --- deepspeed/comm/comm.py | 6 ++++-- deepspeed/utils/comms_logging.py | 31 +++++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py index 4a9b51b9e601..d13d2e664eee 100644 --- a/deepspeed/comm/comm.py +++ b/deepspeed/comm/comm.py @@ -405,11 +405,13 @@ def monitored_barrier(group=None, return cdb.barrier(group=group, timeout=timeout, wait_all_ranks=wait_all_ranks) -def log_summary(): +def log_summary(show_straggler=False): global cdb barrier(log_name='log_summary_barrier') if cdb.get_rank() == 0: - comms_logger.log_all() + comms_logger.log_all(print_log=True, show_straggler=show_straggler) + else: + comms_logger.log_all(print_log=False, show_straggler=show_straggler) barrier(log_name='log_summary_barrier') diff --git a/deepspeed/utils/comms_logging.py b/deepspeed/utils/comms_logging.py index 2400fa55b20e..7f3423b06363 100644 --- a/deepspeed/utils/comms_logging.py +++ b/deepspeed/utils/comms_logging.py @@ -122,23 +122,38 @@ def append(self, raw_name, record_name, latency, msg_size): log_dist(log_str, [0]) # Print summary at end of iteration, epoch, or training - def log_all(self): + def log_all(self, print_log=True, show_straggler=False): + import torch from deepspeed.utils.timer import trim_mean - print( - f"{'Comm. Op': <20}{'Message Size': <20}{'Count': <20}{'Total Latency(ms)': <20}{'Avg Latency(ms)': <20}{'tput_avg (Gbps)': <20}{'busbw_avg (Gbps)': <20}" - ) + import deepspeed.comm as dist + from deepspeed.comm.reduce_op import ReduceOp + if print_log: + msg = f"{'Comm. 
Op': <20}{'Message Size': <20}{'Count': <20}{'Total Latency(ms)': <20}" + if show_straggler: + msg += f"{'Total straggler(ms)': <20}" + msg += f"{'Avg Latency(ms)': <20}{'tput_avg (Gbps)': <20}{'busbw_avg (Gbps)': <20}" + print(msg) for record_name in self.comms_dict.keys(): - print(record_name) + if print_log: + print(record_name) for msg_size, vals in sorted(self.comms_dict[record_name].items()): # vals[0] is the count for each msg size count = vals[0] # vals[1] is a list of latency records for each msg size total_lat = sum(vals[1]) + lats = torch.tensor(vals[1]) + min_lats = torch.tensor(vals[1]) + dist.all_reduce(min_lats, op=ReduceOp.MIN) + delta_lats = lats - min_lats + total_straggler = (lats - min_lats).sum().item() # vals[2] and vals[3] are the lists of algbw and busbw, respectively # Get rid of outliers when we print avg_lat = trim_mean(vals[1], 0.1) avg_algbw = trim_mean(vals[2], 0.1) avg_busbw = trim_mean(vals[3], 0.1) - print( - f"{' ': <20}{convert_size(msg_size): <20}{count: <20}{total_lat: <20.2f}{avg_lat: <20.2f}{avg_algbw: <20.2f}{avg_busbw: <20.2f}" - ) + if print_log: + msg = f"{' ': <20}{convert_size(msg_size): <20}{count: <20}{total_lat: <20.2f}" + if show_straggler: + msg += f"{total_straggler: <20.2f}" + msg += f"{avg_lat: <20.2f}{avg_algbw: <20.2f}{avg_busbw: <20.2f}" + print(msg) From de368dbfe79fc83b396f9039cebfcac8ea751ab0 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Sat, 20 May 2023 16:50:03 +0800 Subject: [PATCH 02/74] Show straggler effect logging in seperate table --- deepspeed/utils/comms_logging.py | 40 ++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/deepspeed/utils/comms_logging.py b/deepspeed/utils/comms_logging.py index 7f3423b06363..8d9defebe0fe 100644 --- a/deepspeed/utils/comms_logging.py +++ b/deepspeed/utils/comms_logging.py @@ -128,11 +128,7 @@ def log_all(self, print_log=True, show_straggler=False): import deepspeed.comm as dist from deepspeed.comm.reduce_op import ReduceOp if print_log: - msg = f"{'Comm. Op': <20}{'Message Size': <20}{'Count': <20}{'Total Latency(ms)': <20}" - if show_straggler: - msg += f"{'Total straggler(ms)': <20}" - msg += f"{'Avg Latency(ms)': <20}{'tput_avg (Gbps)': <20}{'busbw_avg (Gbps)': <20}" - print(msg) + print(f"{'Comm. 
Op': <20}{'Message Size': <20}{'Count': <20}{'Total Latency(ms)': <20}{'Avg Latency(ms)': <20}{'tput_avg (Gbps)': <20}{'busbw_avg (Gbps)': <20}") for record_name in self.comms_dict.keys(): if print_log: print(record_name) @@ -141,19 +137,33 @@ def log_all(self, print_log=True, show_straggler=False): count = vals[0] # vals[1] is a list of latency records for each msg size total_lat = sum(vals[1]) - lats = torch.tensor(vals[1]) - min_lats = torch.tensor(vals[1]) - dist.all_reduce(min_lats, op=ReduceOp.MIN) - delta_lats = lats - min_lats - total_straggler = (lats - min_lats).sum().item() # vals[2] and vals[3] are the lists of algbw and busbw, respectively # Get rid of outliers when we print avg_lat = trim_mean(vals[1], 0.1) avg_algbw = trim_mean(vals[2], 0.1) avg_busbw = trim_mean(vals[3], 0.1) if print_log: - msg = f"{' ': <20}{convert_size(msg_size): <20}{count: <20}{total_lat: <20.2f}" - if show_straggler: - msg += f"{total_straggler: <20.2f}" - msg += f"{avg_lat: <20.2f}{avg_algbw: <20.2f}{avg_busbw: <20.2f}" - print(msg) + print(f"{' ': <20}{convert_size(msg_size): <20}{count: <20}{total_lat: <20.2f}{avg_lat: <20.2f}{avg_algbw: <20.2f}{avg_busbw: <20.2f}") + + if show_straggler: + if print_log: + print("_______________________________") + print("Breakdown with straggler effect") + print("-------------------------------") + print(f"{'Comm. Op': <20}{'Message Size': <20}{'Count': <20}{'Total comm lat(ms)': <20}{'Total straggler(ms)': <20}{'Avg comm lat(ms)': <20}{'Avg straggler(ms)': <20}") + for record_name in self.comms_dict.keys(): + if print_log: + print(record_name) + for msg_size, vals in sorted(self.comms_dict[record_name].items()): + # vals[0] is the count for each msg size + count = vals[0] + # vals[1] is a list of latency records for each msg size + lats = torch.tensor(vals[1]) + min_lats = torch.tensor(vals[1]) + dist.all_reduce(min_lats, op=ReduceOp.MIN) + total_lat = min_lats.sum().item() + total_straggler = (lats - min_lats).sum().item() + avg_lat = trim_mean(min_lats.tolist(), 0.1) + avg_straggler = trim_mean((lats - min_lats).tolist(), 0.1) + if print_log: + print(f"{' ': <20}{convert_size(msg_size): <20}{count: <20}{total_lat: <20.2f}{total_straggler: <20.2f}{avg_lat: <20.2f}{avg_straggler: <20.2f}") From 6884e33a91156c34a2bb3e0795efc3d7697b4c23 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Sat, 20 May 2023 17:01:33 +0800 Subject: [PATCH 03/74] fix formatting --- deepspeed/utils/comms_logging.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/deepspeed/utils/comms_logging.py b/deepspeed/utils/comms_logging.py index 8d9defebe0fe..8e6558cfb9dd 100644 --- a/deepspeed/utils/comms_logging.py +++ b/deepspeed/utils/comms_logging.py @@ -128,7 +128,9 @@ def log_all(self, print_log=True, show_straggler=False): import deepspeed.comm as dist from deepspeed.comm.reduce_op import ReduceOp if print_log: - print(f"{'Comm. Op': <20}{'Message Size': <20}{'Count': <20}{'Total Latency(ms)': <20}{'Avg Latency(ms)': <20}{'tput_avg (Gbps)': <20}{'busbw_avg (Gbps)': <20}") + print( + f"{'Comm. 
Op': <20}{'Message Size': <20}{'Count': <20}{'Total Latency(ms)': <20}{'Avg Latency(ms)': <20}{'tput_avg (Gbps)': <20}{'busbw_avg (Gbps)': <20}" + ) for record_name in self.comms_dict.keys(): if print_log: print(record_name) @@ -143,14 +145,18 @@ def log_all(self, print_log=True, show_straggler=False): avg_algbw = trim_mean(vals[2], 0.1) avg_busbw = trim_mean(vals[3], 0.1) if print_log: - print(f"{' ': <20}{convert_size(msg_size): <20}{count: <20}{total_lat: <20.2f}{avg_lat: <20.2f}{avg_algbw: <20.2f}{avg_busbw: <20.2f}") + print( + f"{' ': <20}{convert_size(msg_size): <20}{count: <20}{total_lat: <20.2f}{avg_lat: <20.2f}{avg_algbw: <20.2f}{avg_busbw: <20.2f}" + ) if show_straggler: if print_log: print("_______________________________") print("Breakdown with straggler effect") print("-------------------------------") - print(f"{'Comm. Op': <20}{'Message Size': <20}{'Count': <20}{'Total comm lat(ms)': <20}{'Total straggler(ms)': <20}{'Avg comm lat(ms)': <20}{'Avg straggler(ms)': <20}") + print( + f"{'Comm. Op': <20}{'Message Size': <20}{'Count': <20}{'Total comm lat(ms)': <20}{'Total straggler(ms)': <20}{'Avg comm lat(ms)': <20}{'Avg straggler(ms)': <20}" + ) for record_name in self.comms_dict.keys(): if print_log: print(record_name) @@ -166,4 +172,6 @@ def log_all(self, print_log=True, show_straggler=False): avg_lat = trim_mean(min_lats.tolist(), 0.1) avg_straggler = trim_mean((lats - min_lats).tolist(), 0.1) if print_log: - print(f"{' ': <20}{convert_size(msg_size): <20}{count: <20}{total_lat: <20.2f}{total_straggler: <20.2f}{avg_lat: <20.2f}{avg_straggler: <20.2f}") + print( + f"{' ': <20}{convert_size(msg_size): <20}{count: <20}{total_lat: <20.2f}{total_straggler: <20.2f}{avg_lat: <20.2f}{avg_straggler: <20.2f}" + ) From 206d4550f433468d1daf6a5675de113ec8830cd1 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Sun, 21 May 2023 17:01:51 +0800 Subject: [PATCH 04/74] add docs for log_summary with straggler effect --- docs/_tutorials/comms-logging.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/_tutorials/comms-logging.md b/docs/_tutorials/comms-logging.md index b6a352b60f68..7313688a03e6 100644 --- a/docs/_tutorials/comms-logging.md +++ b/docs/_tutorials/comms-logging.md @@ -13,7 +13,7 @@ In this tutorial, we introduce DeepSpeed communication logging and provide examp NOTE: All logging communication calls are synchronized in order to provide accurate timing information. This may hamper performance if your model heavily uses asynchronous communication operations. -Logging communication calls is vital to ensure networking resources are fully utilized. The DeepSpeed communication logger enables the detection and logging of all communication operations launched under `deepspeed.comm`. Each communication operation can all be directly printed to the console immediately after completion (via the `verbose` config option), or a summary may be printed with a call to `deepspeed.comm.log_summary()` in the client code at the completion of training, an epoch, after N training iterations, etc. +Logging communication calls is vital to ensure networking resources are fully utilized. The DeepSpeed communication logger enables the detection and logging of all communication operations launched under `deepspeed.comm`. 
Each communication operation can all be directly printed to the console immediately after completion (via the `verbose` config option), or a summary may be printed with a call to `deepspeed.comm.log_summary()` or `deepspeed.com.log_summary(show_straggler=True)` in the client code at the completion of training, an epoch, after N training iterations, etc. ## Usage @@ -114,3 +114,14 @@ broadcast | [Caller Func: _broadcast_model] reduce_scatter_tensor | [Caller Func: reduce_scatter_fn] 678.86 MB 80 1527.17 13.94 1211.75 1136.01 ``` + +Straggler effect can be shown by supply optional argument `show_straggler=True` to `deepspeed.comm.log_summary()` call. Straggler effect is defined as the time a rank waits for the slowest rank to start communication. For each collective, `log_summary` would get the minimum collective time among all ranks, compute straggler effect as follows: + +``` +straggler = sum(t_collectives - allreduce(t_collectives, MIN)) +``` + +Print straggler effect with the following `log_summary` call in the example above: +``` + dist.log_summary(show_straggler=True) +``` From 4a9ad5d0637ebb439fddf654819141ef685ceb7c Mon Sep 17 00:00:00 2001 From: "Wang, Yi A" Date: Wed, 24 May 2023 06:17:08 -0700 Subject: [PATCH 05/74] fix opt-350m shard loading issue in AutoTP Signed-off-by: Wang, Yi A --- deepspeed/module_inject/replace_module.py | 15 +++++++++------ tests/unit/inference/test_checkpoint_sharding.py | 5 +++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index f0fe81f28714..6fdd218713e3 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -794,17 +794,20 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No import re -def skip_level_0_prefix(model, name): +def skip_level_0_prefix(model, state_dict): model = str(model) key = re.search(r": (.*?)Model", model) if key is None: key = re.search(r": (.*?)Stack", model) if key is None: key = re.match(r"(.*?)Model", model) - if key is not None and key.group(1).lower() in "bloom": - # if keys start with 'model.', don't skip level 0 prefix - if not re.match("^model[.]", name): - return True + # if keys start with 'model.', don't skip level 0 prefix + if state_dict != None: + for item in state_dict.keys(): + if re.match("^model[.]", item): + return False + if key is not None and key.group(1).lower() in ["bloom", "opt"]: + return True return False @@ -860,7 +863,7 @@ def _replace_module(model, policies, prefix='', layer_id=0, level_id=0, state_di load_buffer(child, state_dict, checking_key) _, layer_id = _replace_module(child, policies, - prefix if level_id == 0 and skip_level_0_prefix(model, name) else \ + prefix if level_id == 0 and skip_level_0_prefix(model, state_dict) else \ prefix + name + '.', layer_id=layer_id, level_id=level_id + 1, diff --git a/tests/unit/inference/test_checkpoint_sharding.py b/tests/unit/inference/test_checkpoint_sharding.py index 31cb923b4164..8e3afb0636a7 100644 --- a/tests/unit/inference/test_checkpoint_sharding.py +++ b/tests/unit/inference/test_checkpoint_sharding.py @@ -31,8 +31,9 @@ def find_dtype(module): assert (found_dtype == expected_dtype), f"Expected transformer dtype {expected_dtype}, but found {found_dtype}" -@pytest.fixture( - params=["bigscience/bloom-560m", "EleutherAI/gpt-j-6B", "EleutherAI/gpt-neo-125M", "facebook/opt-125m"]) +@pytest.fixture(params=[ + "bigscience/bloom-560m", "EleutherAI/gpt-j-6B", 
"EleutherAI/gpt-neo-125M", "facebook/opt-350m", "facebook/opt-125m" +]) def model_name(request): return request.param From d5552ef66f88aa28b2bcf97a2addd5732fee3cde Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Mon, 26 Jun 2023 09:41:49 -0400 Subject: [PATCH 06/74] init version of CCLBacked allreduce_latency --- csrc/cpu/comm/ccl.cpp | 27 +++++++++++++++++++++++++++ deepspeed/comm/ccl.py | 5 ++++- deepspeed/comm/comm.py | 24 +++++++++++++++++++----- deepspeed/comm/torch.py | 4 ++++ deepspeed/launcher/launch.py | 3 +++ deepspeed/module_inject/layers.py | 13 +++++++++++-- 6 files changed, 68 insertions(+), 8 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 0f974fb859e0..97643e019b0a 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -168,6 +168,32 @@ void all_reduce(torch::Tensor& data, py::object op, py::object group, bool async .wait()); } +void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group, bool async_op) +{ + auto data_ptr = data.data_ptr(); + auto numel = data.numel(); + auto datatype = data.scalar_type(); + auto reduce_op = op; + +/* + if SHM for this data_size/type had not been created + create data_size/type + set all local SHM_flag to 1 + + for i = 0 to world_size - 1: + if i == my_rank: + continue + wait until ith rank SHM_flag+i == 1 + copy data to ith rank SHM_buf+numel*i + set SHM_flag+i to 2 + + wait until all SHM_flag == 2 + reduce result and save to data + + set all SHM_flag to 0 +*/ +} + void all_reduce_caching(torch::Tensor& data, py::object op, std::string match_id, @@ -206,6 +232,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) m.def("get_world_size", &get_world_size, "get world size"); m.def("broadcast", &broadcast, "ccl broadcast"); m.def("all_reduce", &all_reduce, "ccl all_reduce"); + m.def("all_reduce_low_latency", &all_reduce_low_latency, "low latency all_reduce implementation"); m.def("all_reduce_caching", &all_reduce_caching, "ccl all_reduce with caching"); m.def("barrier", &barrier, "barrier"); } diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py index 147f610e2b85..9ca403f2e53b 100644 --- a/deepspeed/comm/ccl.py +++ b/deepspeed/comm/ccl.py @@ -24,7 +24,7 @@ def build_ccl_op(): class CCLBackend(TorchBackend): def __init__(self, name='ccl', rank=-1, world_size=-1, mpu=None, timeout=None, init_method=None): - super(CCLBackend, self).__init__(backend='ccl', + super(CCLBackend, self).__init__(backend='gloo', name='torch', rank=rank, world_size=world_size, @@ -58,5 +58,8 @@ def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): else: self.ccl_comm_op.all_reduce(tensor, op, group, async_op) + def all_reduce_low_latency(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): + self.ccl_comm_op.all_reduce_low_latency(tensor, op, group, async_op) + def barrier(self, group=None, async_op=False): self.ccl_comm_op.barrier(group, async_op) diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py index ea73cafad128..6918c57bea70 100644 --- a/deepspeed/comm/comm.py +++ b/deepspeed/comm/comm.py @@ -75,6 +75,11 @@ def configure( prof_ops=None, verbose=None, debug=None, + #enabled=True, + #prof_all=True, + #prof_ops=False, + #verbose=False, + #debug=False, ): if deepspeed_config is not None: @@ -473,14 +478,23 @@ def all_reduce(tensor, prof=False, log_name='all_reduce', debug=get_caller_func()): - #if profile_comm: - # context of the timers? - # timers.start() - # TensorBoard logging for comm calls.? 
+ #with torch.profiler.record_function("cdb allreduce"): + # global cdb + # return cdb.all_reduce(tensor, op, group, async_op) global cdb - #print(f'op = {op}, cdb= {cdb.name}') return cdb.all_reduce(tensor, op, group, async_op) + #return ccl_backend.ccl_comm_op.all_reduce(tensor, op, group, async_op) +@timed_op +def all_reduce_low_latency(tensor, + op=ReduceOp.SUM, + group=None, + async_op=False, + prof=False, + log_name='all_reduce', + debug=get_caller_func()): + global cdb + return cdb.all_reduce_low_latency(tensor, op, group, async_op) @timed_op def all_reduce_coalesced(tensors, diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index 208416c533aa..c75142c620da 100644 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -105,6 +105,10 @@ def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, asyn op = self._reduce_op(op) return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op) + def all_reduce_low_latency(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): + op = self._reduce_op(op) + return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op) + def all_reduce_coalesced(self, tensors, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): """ proxy func to torch.distributed.all_reduce_coalesced, which is included in PyTorch 1.13 and above diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index b3cb9d1205a9..a95c24bec21e 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -221,6 +221,8 @@ def main(): local_rank = dist_rank % num_local_procs current_env["RANK"] = str(dist_rank) current_env["LOCAL_RANK"] = str(local_rank) + current_env["CCL_LOCAL_RANK"] = str(local_rank) + current_env["CCL_LOCAL_SIZE"] = str(num_local_procs) # spawn the processes cmd = [] @@ -243,6 +245,7 @@ def main(): cmd.append(f"--local_rank={local_rank}") cmd += args.training_script_args + print (cmd) if args.enable_each_rank_log != "None": log_file = os.path.join(args.enable_each_rank_log, f"{log_name_prefix}_rank{dist_rank}.log") log_fd = open(log_file, 'w') diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py index a8dd4b05f1b0..9bac1b207af8 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -11,6 +11,8 @@ from torch.nn.parameter import Parameter from deepspeed.accelerator import get_accelerator +global output_tensors +output_tensors = {} class LinearAllreduce(nn.Module): @@ -21,9 +23,16 @@ def __init__(self, weight, bias=None, mp_group=None): self.mp_group = mp_group def forward(self, input): - output = torch.matmul(input, self.weight.transpose(-1, -2)) + global output_tensors + output_size = input.size()[:-1] + self.weight.transpose(-1,-2).size()[1:] + if not output_size in output_tensors: + output_tensors[output_size] = torch.empty(output_size, device=input.device, dtype=input.dtype) + output = output_tensors[output_size] + torch.matmul(input, self.weight.transpose(-1, -2), out=output) + #output = torch.matmul(input, self.weight.transpose(-1, -2)) + if self.mp_group is not None: - dist.all_reduce(output, group=self.mp_group) + dist.all_reduce_low_latency(output, group=self.mp_group) if self.bias is not None: output += self.bias return output From f0ea3eb16474c025bc72d26bc5017e3cc648fab8 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Mon, 26 Jun 2023 09:44:11 -0400 Subject: [PATCH 07/74] remove torch-ccl as dependency --- accelerator/cpu_accelerator.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index b1fb036fed07..826f300ea252 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -5,7 +5,7 @@ import torch from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator -import oneccl_bindings_for_pytorch # noqa: F401 +#import oneccl_bindings_for_pytorch # noqa: F401 import psutil import os From 6caf695c736f0d52c16c57c6408cb29c5a77e92c Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 28 Jun 2023 03:59:50 -0400 Subject: [PATCH 08/74] init allreduce for latency without actual reduce operation --- csrc/cpu/comm/ccl.cpp | 153 +++++++++++++++++++++++++++++++++++------- 1 file changed, 130 insertions(+), 23 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 97643e019b0a..dd7f07a05f74 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -6,11 +6,64 @@ #include #include +#include +#include +#include +#include +#include +#include + +struct SharedData { + const char *name; + int descriptor; + void *bytes; + size_t nbytes; +}; + +int world_rank = -1; +int world_size = -1; + +void shared_open(SharedData *data, const char *name, size_t nbytes) { + int d = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR); + if (d != -1) { + void *bytes = mmap(NULL, nbytes, PROT_READ|PROT_WRITE, MAP_SHARED, d, 0); + data->name = name; + data->descriptor = d; + data->bytes = bytes; + data->nbytes = nbytes; + printf ("(%d)shared_open %s done\n", world_rank, name); + } else { + printf ("(%d)shared_open %s failed\n", world_rank, name); + data->descriptor = -1; + } +} + +void shared_create(SharedData *data, const char *name, void *bytes, size_t nbytes) { + int d = shm_open(name, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); + if (d != -1) { + if (nbytes = write(d, bytes, nbytes)) { + shared_open(data, name, nbytes); + } + } else { + printf ("(%d)shared_create %s failed\n", world_rank, name); + } + printf ("(%d)shared_create %s done\n", world_rank, name); +} + +void shared_close(SharedData *data) { + if (data->descriptor != -1) { + munmap(data->bytes, data->nbytes); + shm_unlink(data->name); + } +} std::set _comm_ids; std::set _colors; ccl::vector_class _ccl_comms; +ccl::communicator& _get_comm_from_group() { return _ccl_comms[0]; } +ccl::communicator& _get_comm_from_group(py::object group) { return _ccl_comms[0]; } + #define CCLCHECK(cmd) \ do { \ cmd; \ @@ -21,11 +74,16 @@ ccl::vector_class _ccl_comms; bool is_initialized = 0; -int world_rank = -1; -int world_size = -1; - ccl::shared_ptr_class kvs; +SharedData allreduce_buffer; +char buffer_name[100] = "allreduce_buffer"; +struct allreduce_workspace { + int state; + char buffer[32768]; +}; +struct allreduce_workspace *buffer; + void initialize(int size, int rank, torch::Tensor& kvs_data) { if (is_initialized) return; @@ -41,6 +99,21 @@ void initialize(int size, int rank, torch::Tensor& kvs_data) } _ccl_comms.emplace_back(ccl::create_communicator(size, rank, kvs)); + + //sprintf (buffer_name, "allreduce_buffer_%d", rank); + if (rank == 0) { + buffer = (struct allreduce_workspace*) malloc(size*sizeof(struct allreduce_workspace)); + shared_create(&allreduce_buffer, buffer_name, buffer, size*sizeof(struct allreduce_workspace)); + buffer = (struct allreduce_workspace*)allreduce_buffer.bytes; + for (int i=0; i(end - start); + count++; + auto t = duration.count(); + total += t; + if (t>max) max = t; + if (t100.0) max_count ++; + if (count == 17920 && world_rank == 0) { + 
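        // Debug instrumentation: per-call latencies are accumulated in microseconds; after a
        // fixed number of calls, rank 0 prints the mean, min and max latency together with the
        // number of calls that finished under 30us (min_count) or took over 100us (max_count).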
printf ("%f, %f, %f, %d, %d\n", total/count, min, max, min_count, max_count); + } +} + +void wait_buffer_state_until(int index, int state) +{ + volatile int *state_ptr = &(buffer[index].state); + + while (*state_ptr != state); } void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group, bool async_op) @@ -175,23 +271,34 @@ void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group auto datatype = data.scalar_type(); auto reduce_op = op; -/* - if SHM for this data_size/type had not been created - create data_size/type - set all local SHM_flag to 1 - - for i = 0 to world_size - 1: - if i == my_rank: - continue - wait until ith rank SHM_flag+i == 1 - copy data to ith rank SHM_buf+numel*i - set SHM_flag+i to 2 + memcpy(buffer[world_rank].buffer, data_ptr, numel*2); + buffer[world_rank].state = 1; - wait until all SHM_flag == 2 - reduce result and save to data - - set all SHM_flag to 0 -*/ + if (world_rank == 0) { + // compute allreduce result on rank 0 + for (int i=1; i< world_size; i++) { + // wait until the other rank copy the buffer + wait_buffer_state_until(i, 1); + memcpy(buffer[0].buffer, buffer[i].buffer, numel*2); + } + buffer[world_rank].state = 2; + memcpy(data_ptr, buffer[0].buffer, numel*2); + } + if (world_rank != 0) { + wait_buffer_state_until(0, 2); + memcpy(data_ptr, buffer[0].buffer, numel*2); + buffer[world_rank].state = 2; + } + if (world_rank == 0) { + for (int i=1; i< world_size; i++) { + wait_buffer_state_until(i, 2); + } + buffer[world_rank].state = 0; + } + if (world_rank != 0) { + wait_buffer_state_until(0, 0); + buffer[world_rank].state = 0; + } } void all_reduce_caching(torch::Tensor& data, From f689f22779ca9cce5c00e32568cd6560120c4443 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 28 Jun 2023 06:10:45 -0400 Subject: [PATCH 09/74] first version of SHM based direct allreduce --- csrc/cpu/comm/ccl.cpp | 73 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 8 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index dd7f07a05f74..1c78aeae5058 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include struct SharedData { const char *name; @@ -31,7 +33,6 @@ void shared_open(SharedData *data, const char *name, size_t nbytes) { data->descriptor = d; data->bytes = bytes; data->nbytes = nbytes; - printf ("(%d)shared_open %s done\n", world_rank, name); } else { printf ("(%d)shared_open %s failed\n", world_rank, name); data->descriptor = -1; @@ -47,7 +48,6 @@ void shared_create(SharedData *data, const char *name, void *bytes, size_t nbyte } else { printf ("(%d)shared_create %s failed\n", world_rank, name); } - printf ("(%d)shared_create %s done\n", world_rank, name); } void shared_close(SharedData *data) { @@ -226,6 +226,7 @@ void broadcast(torch::Tensor& data, int src, py::object group, bool async_op) } float total = 0.0f; +float total_sq = 0.0f; float min = 1000.0f; float max = 0.0f; int count = 0; @@ -248,12 +249,10 @@ void all_reduce(torch::Tensor& data, py::object op, py::object group, bool async count++; auto t = duration.count(); total += t; - if (t>max) max = t; - if (t100.0) max_count ++; + total_sq += t*t; + auto segma = sqrt(total_sq/count-total*total/count/count); if (count == 17920 && world_rank == 0) { - printf ("%f, %f, %f, %d, %d\n", total/count, min, max, min_count, max_count); + printf ("average duration: %f, std: %f\n", total/count, segma); } } @@ -264,6 +263,52 @@ void 
wait_buffer_state_until(int index, int state) while (*state_ptr != state); } +__m512 cvt_bf16_to_fp32(const __m256i src) __attribute__((target("avx512bw"))); +inline __m512 cvt_bf16_to_fp32(const __m256i src) { + auto y = _mm512_cvtepu16_epi32(src); + return _mm512_castsi512_ps(_mm512_bslli_epi128(y, 2)); +} + +inline __m256i cvt_fp32_to_bf16(const __m512 src) __attribute__((target("avx512bw"))); +inline __m256i cvt_fp32_to_bf16(const __m512 src) { +//#if (defined CPU_CAPABILITY_AVX512_BF16) +#if 0 + return reinterpret_cast<__m256i>(_mm512_cvtneps_pbh(src)); +#else + __m512i value = _mm512_castps_si512(src); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + // uint32_t lsb = (input >> 16) & 1; + auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones); + // uint32_t rounding_bias = 0x7fff + lsb; + t_value = _mm512_add_epi32(t_value, vec_bias); + // input += rounding_bias; + t_value = _mm512_add_epi32(t_value, value); + // input = input >> 16; + t_value = _mm512_srli_epi32(t_value, 16); + // Check NaN before converting back to bf16 + t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); + return _mm512_cvtusepi32_epi16(t_value); +#endif +} + +void reduce_bf16_buffers(void* inout, void* in, int num_elements) __attribute__((target("avx512bw"))); +void reduce_bf16_buffers(void* inout, void* in, int num_elements) +{ + for (int i=0; i(end - start); + count++; + auto t = duration.count(); + total += t; + total_sq += t*t; + auto segma = sqrt(total_sq/count-total*total/count/count); + if (count == 17920 && world_rank == 0) { + printf ("average duration: %f, std: %f\n", total/count, segma); + } } void all_reduce_caching(torch::Tensor& data, From bc48c7e99c2619035505914b2fc060125da6e2b8 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 29 Jun 2023 08:49:47 -0400 Subject: [PATCH 10/74] tweak reduce kernel --- csrc/cpu/comm/ccl.cpp | 55 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 1c78aeae5058..f007362b3203 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -295,17 +295,56 @@ inline __m256i cvt_fp32_to_bf16(const __m512 src) { } void reduce_bf16_buffers(void* inout, void* in, int num_elements) __attribute__((target("avx512bw"))); +void reduce_8_bf16_buffers(void* inout, void* in1, void* in2, void* in3, + void* in4, void* in5, void* in6, void* in7, + int num_elements) __attribute__((target("avx512bw"))); + +void reduce_all_bf16_buffers(struct allreduce_workspace * buffer, int num_elements, int num_buffers) +{ + if (num_buffers == 8) { + reduce_8_bf16_buffers(buffer[0].buffer, buffer[1].buffer, + buffer[2].buffer, buffer[3].buffer, + buffer[4].buffer, buffer[5].buffer, + buffer[6].buffer, buffer[7].buffer, + num_elements); + } else { + for (int i=1; i Date: Thu, 29 Jun 2023 10:17:19 -0400 Subject: [PATCH 11/74] SHM allreduce support 2-8 ranks --- csrc/cpu/comm/ccl.cpp | 124 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 120 insertions(+), 4 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index f007362b3203..93ad5f27e075 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -295,6 +295,20 @@ inline __m256i cvt_fp32_to_bf16(const __m512 src) { } void reduce_bf16_buffers(void* inout, void* in, int num_elements) __attribute__((target("avx512bw"))); + +void 
reduce_3_bf16_buffers(void* inout, void* in1, void* in2, + int num_elements) __attribute__((target("avx512bw"))); +void reduce_4_bf16_buffers(void* inout, void* in1, void* in2, void* in3, + int num_elements) __attribute__((target("avx512bw"))); +void reduce_5_bf16_buffers(void* inout, void* in1, void* in2, void* in3, + void* in4, + int num_elements) __attribute__((target("avx512bw"))); +void reduce_6_bf16_buffers(void* inout, void* in1, void* in2, void* in3, + void* in4, void* in5, + int num_elements) __attribute__((target("avx512bw"))); +void reduce_7_bf16_buffers(void* inout, void* in1, void* in2, void* in3, + void* in4, void* in5, void* in6, + int num_elements) __attribute__((target("avx512bw"))); void reduce_8_bf16_buffers(void* inout, void* in1, void* in2, void* in3, void* in4, void* in5, void* in6, void* in7, int num_elements) __attribute__((target("avx512bw"))); @@ -302,10 +316,25 @@ void reduce_8_bf16_buffers(void* inout, void* in1, void* in2, void* in3, void reduce_all_bf16_buffers(struct allreduce_workspace * buffer, int num_elements, int num_buffers) { if (num_buffers == 8) { - reduce_8_bf16_buffers(buffer[0].buffer, buffer[1].buffer, - buffer[2].buffer, buffer[3].buffer, - buffer[4].buffer, buffer[5].buffer, - buffer[6].buffer, buffer[7].buffer, + reduce_8_bf16_buffers(buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, + buffer[3].buffer, buffer[4].buffer, buffer[5].buffer, + buffer[6].buffer, buffer[7].buffer, num_elements); + } else if (num_buffers == 7) { + reduce_7_bf16_buffers(buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, + buffer[3].buffer, buffer[4].buffer, buffer[5].buffer, + buffer[6].buffer, num_elements); + } else if (num_buffers == 6) { + reduce_6_bf16_buffers(buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, + buffer[3].buffer, buffer[4].buffer, buffer[5].buffer, + num_elements); + } else if (num_buffers == 5) { + reduce_5_bf16_buffers(buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, + buffer[3].buffer, buffer[4].buffer, num_elements); + } else if (num_buffers == 4) { + reduce_4_bf16_buffers(buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, + buffer[3].buffer, num_elements); + } else if (num_buffers == 3) { + reduce_3_bf16_buffers(buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, num_elements); } else { for (int i=1; i Date: Fri, 30 Jun 2023 10:06:05 +0800 Subject: [PATCH 12/74] clean up --- accelerator/cpu_accelerator.py | 1 - csrc/cpu/comm/ccl.cpp | 386 +++++++++++++++++------------- deepspeed/comm/comm.py | 23 +- deepspeed/launcher/launch.py | 3 - deepspeed/module_inject/layers.py | 16 +- 5 files changed, 236 insertions(+), 193 deletions(-) diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 826f300ea252..824b912df1ec 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -5,7 +5,6 @@ import torch from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator -#import oneccl_bindings_for_pytorch # noqa: F401 import psutil import os diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 93ad5f27e075..afac58e8c4f9 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -5,52 +5,53 @@ #include -#include -#include -#include -#include -#include #include -#include #include #include +#include +#include +#include +#include +#include +#include struct SharedData { - const char *name; + const char* name; int descriptor; - void *bytes; + void* bytes; size_t nbytes; }; int world_rank = -1; int world_size = -1; -void shared_open(SharedData *data, const char 
*name, size_t nbytes) { +void shared_open(SharedData* data, const char* name, size_t nbytes) +{ int d = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR); if (d != -1) { - void *bytes = mmap(NULL, nbytes, PROT_READ|PROT_WRITE, MAP_SHARED, d, 0); + void* bytes = mmap(NULL, nbytes, PROT_READ | PROT_WRITE, MAP_SHARED, d, 0); data->name = name; data->descriptor = d; data->bytes = bytes; data->nbytes = nbytes; } else { - printf ("(%d)shared_open %s failed\n", world_rank, name); + printf("(%d)shared_open %s failed\n", world_rank, name); data->descriptor = -1; } } -void shared_create(SharedData *data, const char *name, void *bytes, size_t nbytes) { +void shared_create(SharedData* data, const char* name, void* bytes, size_t nbytes) +{ int d = shm_open(name, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); if (d != -1) { - if (nbytes = write(d, bytes, nbytes)) { - shared_open(data, name, nbytes); - } + if (nbytes = write(d, bytes, nbytes)) { shared_open(data, name, nbytes); } } else { - printf ("(%d)shared_create %s failed\n", world_rank, name); + printf("(%d)shared_create %s failed\n", world_rank, name); } } -void shared_close(SharedData *data) { +void shared_close(SharedData* data) +{ if (data->descriptor != -1) { munmap(data->bytes, data->nbytes); shm_unlink(data->name); @@ -82,7 +83,7 @@ struct allreduce_workspace { int state; char buffer[32768]; }; -struct allreduce_workspace *buffer; +struct allreduce_workspace* buffer; void initialize(int size, int rank, torch::Tensor& kvs_data) { @@ -100,18 +101,16 @@ void initialize(int size, int rank, torch::Tensor& kvs_data) _ccl_comms.emplace_back(ccl::create_communicator(size, rank, kvs)); - //sprintf (buffer_name, "allreduce_buffer_%d", rank); if (rank == 0) { - buffer = (struct allreduce_workspace*) malloc(size*sizeof(struct allreduce_workspace)); - shared_create(&allreduce_buffer, buffer_name, buffer, size*sizeof(struct allreduce_workspace)); + buffer = (struct allreduce_workspace*)malloc(size * sizeof(struct allreduce_workspace)); + shared_create( + &allreduce_buffer, buffer_name, buffer, size * sizeof(struct allreduce_workspace)); buffer = (struct allreduce_workspace*)allreduce_buffer.bytes; - for (int i=0; i(_mm512_cvtneps_pbh(src)); #else - __m512i value = _mm512_castps_si512(src); - __m512i nan = _mm512_set1_epi32(0xffff); - auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); - __m512i ones = _mm512_set1_epi32(0x1); - __m512i vec_bias = _mm512_set1_epi32(0x7fff); - // uint32_t lsb = (input >> 16) & 1; - auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones); - // uint32_t rounding_bias = 0x7fff + lsb; - t_value = _mm512_add_epi32(t_value, vec_bias); - // input += rounding_bias; - t_value = _mm512_add_epi32(t_value, value); - // input = input >> 16; - t_value = _mm512_srli_epi32(t_value, 16); - // Check NaN before converting back to bf16 - t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); - return _mm512_cvtusepi32_epi16(t_value); + __m512i value = _mm512_castps_si512(src); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + // uint32_t lsb = (input >> 16) & 1; + auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones); + // uint32_t rounding_bias = 0x7fff + lsb; + t_value = _mm512_add_epi32(t_value, vec_bias); + // input += rounding_bias; + t_value = _mm512_add_epi32(t_value, value); + // input = input >> 16; + t_value = _mm512_srli_epi32(t_value, 16); + // Check NaN before 
converting back to bf16 + t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); + return _mm512_cvtusepi32_epi16(t_value); #endif } -void reduce_bf16_buffers(void* inout, void* in, int num_elements) __attribute__((target("avx512bw"))); - -void reduce_3_bf16_buffers(void* inout, void* in1, void* in2, - int num_elements) __attribute__((target("avx512bw"))); -void reduce_4_bf16_buffers(void* inout, void* in1, void* in2, void* in3, - int num_elements) __attribute__((target("avx512bw"))); -void reduce_5_bf16_buffers(void* inout, void* in1, void* in2, void* in3, - void* in4, - int num_elements) __attribute__((target("avx512bw"))); -void reduce_6_bf16_buffers(void* inout, void* in1, void* in2, void* in3, - void* in4, void* in5, - int num_elements) __attribute__((target("avx512bw"))); -void reduce_7_bf16_buffers(void* inout, void* in1, void* in2, void* in3, - void* in4, void* in5, void* in6, - int num_elements) __attribute__((target("avx512bw"))); -void reduce_8_bf16_buffers(void* inout, void* in1, void* in2, void* in3, - void* in4, void* in5, void* in6, void* in7, - int num_elements) __attribute__((target("avx512bw"))); - -void reduce_all_bf16_buffers(struct allreduce_workspace * buffer, int num_elements, int num_buffers) +void reduce_bf16_buffers(void* in_out, void* in, int num_elements) + __attribute__((target("avx512bw"))); + +void reduce_3_bf16_buffers(void* in_out, void* in1, void* in2, int num_elements) + __attribute__((target("avx512bw"))); +void reduce_4_bf16_buffers(void* in_out, void* in1, void* in2, void* in3, int num_elements) + __attribute__((target("avx512bw"))); +void reduce_5_bf16_buffers(void* in_out, + void* in1, + void* in2, + void* in3, + void* in4, + int num_elements) __attribute__((target("avx512bw"))); +void reduce_6_bf16_buffers(void* in_out, + void* in1, + void* in2, + void* in3, + void* in4, + void* in5, + int num_elements) __attribute__((target("avx512bw"))); +void reduce_7_bf16_buffers(void* in_out, + void* in1, + void* in2, + void* in3, + void* in4, + void* in5, + void* in6, + int num_elements) __attribute__((target("avx512bw"))); +void reduce_8_bf16_buffers(void* in_out, + void* in1, + void* in2, + void* in3, + void* in4, + void* in5, + void* in6, + void* in7, + int num_elements) __attribute__((target("avx512bw"))); + +void reduce_all_bf16_buffers(struct allreduce_workspace* buffer, int num_elements, int num_buffers) { if (num_buffers == 8) { - reduce_8_bf16_buffers(buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, - buffer[3].buffer, buffer[4].buffer, buffer[5].buffer, - buffer[6].buffer, buffer[7].buffer, num_elements); + reduce_8_bf16_buffers(buffer[0].buffer, + buffer[1].buffer, + buffer[2].buffer, + buffer[3].buffer, + buffer[4].buffer, + buffer[5].buffer, + buffer[6].buffer, + buffer[7].buffer, + num_elements); } else if (num_buffers == 7) { - reduce_7_bf16_buffers(buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, - buffer[3].buffer, buffer[4].buffer, buffer[5].buffer, - buffer[6].buffer, num_elements); + reduce_7_bf16_buffers(buffer[0].buffer, + buffer[1].buffer, + buffer[2].buffer, + buffer[3].buffer, + buffer[4].buffer, + buffer[5].buffer, + buffer[6].buffer, + num_elements); } else if (num_buffers == 6) { - reduce_6_bf16_buffers(buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, - buffer[3].buffer, buffer[4].buffer, buffer[5].buffer, + reduce_6_bf16_buffers(buffer[0].buffer, + buffer[1].buffer, + buffer[2].buffer, + buffer[3].buffer, + buffer[4].buffer, + buffer[5].buffer, num_elements); } else if (num_buffers == 5) { - 
reduce_5_bf16_buffers(buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, - buffer[3].buffer, buffer[4].buffer, num_elements); + reduce_5_bf16_buffers(buffer[0].buffer, + buffer[1].buffer, + buffer[2].buffer, + buffer[3].buffer, + buffer[4].buffer, + num_elements); } else if (num_buffers == 4) { - reduce_4_bf16_buffers(buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, - buffer[3].buffer, num_elements); + reduce_4_bf16_buffers( + buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, buffer[3].buffer, num_elements); } else if (num_buffers == 3) { - reduce_3_bf16_buffers(buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, - num_elements); + reduce_3_bf16_buffers(buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, num_elements); } else { - for (int i=1; i Date: Fri, 30 Jun 2023 10:11:44 +0800 Subject: [PATCH 13/74] remove oneCCL binding for pytorch from workflow, use gloo to bootstrap CCLBackend --- .github/workflows/cpu-inference.yml | 3 +-- docs/_tutorials/accelerator-abstraction-interface.md | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index f139d84ee6e9..2c05b7810b7b 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -40,10 +40,9 @@ jobs: run: | sudo apt-get install -y numactl - - name: Install oneCCL Bindings for PyTorch + - name: Install Intel Extension for PyTorch run: | python -m pip install intel_extension_for_pytorch - python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu - name: Install oneCCL run: | diff --git a/docs/_tutorials/accelerator-abstraction-interface.md b/docs/_tutorials/accelerator-abstraction-interface.md index bc0db6d809f4..29b7e3a921f3 100644 --- a/docs/_tutorials/accelerator-abstraction-interface.md +++ b/docs/_tutorials/accelerator-abstraction-interface.md @@ -94,7 +94,6 @@ To run DeepSpeed model on CPU, use the following steps to prepare environment: ``` python -m pip install intel_extension_for_pytorch -python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu git clone https://github.com/oneapi-src/oneCCL cd oneCCL mkdir build From 3151996b7747b84e29518cbe134fe4fa3601b58a Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 30 Jun 2023 13:08:31 +0800 Subject: [PATCH 14/74] add gpt-neox autotp support --- deepspeed/module_inject/auto_tp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index 396fe7db2447..d5a3941a6c72 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -108,6 +108,8 @@ def tp_parser(model): gem_list = gem_list + [layer] elif 'down_proj' in layer: gem_list = gem_list + [layer] + elif 'attention.dense' in layer and 'GPTNeoX' in str(model): + gem_list = gem_list + [layer] layer_list = [] if gem_list != []: gem_list = list(set(gem_list)) From c5dd6dc7e734e3eb70d791bfebd6940ce889f6c2 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 30 Jun 2023 13:49:29 +0800 Subject: [PATCH 15/74] fallback to oneccl if input is too large --- csrc/cpu/comm/ccl.cpp | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index afac58e8c4f9..bc1faad31fbe 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -75,13 +75,15 @@ ccl::communicator& _get_comm_from_group(py::object group) { return _ccl_comms[0] bool is_initialized = 0; +#define 
MAX_BUF_SIZE 65536 + ccl::shared_ptr_class kvs; SharedData allreduce_buffer; char buffer_name[100] = "allreduce_buffer"; struct allreduce_workspace { int state; - char buffer[32768]; + char buffer[MAX_BUF_SIZE]; }; struct allreduce_workspace* buffer; @@ -521,8 +523,21 @@ void reduce_8_bf16_buffers(void* in_out, void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group, bool async_op) { - auto data_ptr = data.data_ptr(); auto numel = data.numel(); + + if (numel * 2 > MAX_BUF_SIZE) { + // fallback to oneccl allreduce + CCLCHECK(ccl::allreduce(data.data_ptr(), + data.data_ptr(), + data.numel(), + get_ccl_datatype(data.scalar_type()), + get_ccl_reduce_op(op, data), + _get_comm_from_group(group)) + .wait()); + return; + } + + auto data_ptr = data.data_ptr(); auto datatype = data.scalar_type(); auto reduce_op = op; From f402f0b01996132d3840fc3e0a02fe42534b958b Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 30 Jun 2023 10:51:58 -0400 Subject: [PATCH 16/74] code clean up --- csrc/cpu/comm/ccl.cpp | 608 +++++++++++++++++++----------------------- 1 file changed, 269 insertions(+), 339 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index bc1faad31fbe..a390e70a0fc3 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -13,8 +13,10 @@ #include #include #include +#include #include +// SHM building blocks struct SharedData { const char* name; int descriptor; @@ -22,9 +24,6 @@ struct SharedData { size_t nbytes; }; -int world_rank = -1; -int world_size = -1; - void shared_open(SharedData* data, const char* name, size_t nbytes) { int d = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR); @@ -35,7 +34,7 @@ void shared_open(SharedData* data, const char* name, size_t nbytes) data->bytes = bytes; data->nbytes = nbytes; } else { - printf("(%d)shared_open %s failed\n", world_rank, name); + printf("shared_open %s failed\n", name); data->descriptor = -1; } } @@ -46,7 +45,7 @@ void shared_create(SharedData* data, const char* name, void* bytes, size_t nbyte if (d != -1) { if (nbytes = write(d, bytes, nbytes)) { shared_open(data, name, nbytes); } } else { - printf("(%d)shared_create %s failed\n", world_rank, name); + printf("shared_create %s failed\n", name); } } @@ -58,6 +57,221 @@ void shared_close(SharedData* data) } } +// SHM based allreduce helper functions +#define MAX_BUF_SIZE 65536 +#define SHM_BUFFER_NAME "deepspeed_allreduce_buffer" +SharedData allreduce_buffer; +struct allreduce_workspace { + int state; + char buffer[MAX_BUF_SIZE]; +}; +struct allreduce_workspace* buffer; + +void wait_buffer_state_until(int index, int state) +{ + volatile int* state_ptr = &(buffer[index].state); + + while (*state_ptr != state) + ; +} + +__m512 cvt_bf16_to_fp32(const __m256i src) __attribute__((target("avx512bw"))); +inline __m512 cvt_bf16_to_fp32(const __m256i src) +{ + auto y = _mm512_cvtepu16_epi32(src); + return _mm512_castsi512_ps(_mm512_bslli_epi128(y, 2)); +} + +inline __m256i cvt_fp32_to_bf16(const __m512 src) __attribute__((target("avx512bw"))); +inline __m256i cvt_fp32_to_bf16(const __m512 src) +{ +#if 0 + return reinterpret_cast<__m256i>(_mm512_cvtneps_pbh(src)); +#else + __m512i value = _mm512_castps_si512(src); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + // uint32_t lsb = (input >> 16) & 1; + auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones); + // uint32_t rounding_bias = 0x7fff + lsb; 
+ t_value = _mm512_add_epi32(t_value, vec_bias); + // input += rounding_bias; + t_value = _mm512_add_epi32(t_value, value); + // input = input >> 16; + t_value = _mm512_srli_epi32(t_value, 16); + // Check NaN before converting back to bf16 + t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); + return _mm512_cvtusepi32_epi16(t_value); +#endif +} + +void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in) + __attribute__((target("avx512bw"))); + +void reduce_3_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2) + __attribute__((target("avx512bw"))); +void reduce_4_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2, void* in3) + __attribute__((target("avx512bw"))); +void reduce_5_bf16_buffers(int num_elements, void* in_out, + void* in1, + void* in2, + void* in3, + void* in4) __attribute__((target("avx512bw"))); +void reduce_6_bf16_buffers(int num_elements, void* in_out, + void* in1, + void* in2, + void* in3, + void* in4, + void* in5) __attribute__((target("avx512bw"))); +void reduce_7_bf16_buffers(int num_elements, void* in_out, + void* in1, + void* in2, + void* in3, + void* in4, + void* in5, + void* in6) __attribute__((target("avx512bw"))); +void reduce_8_bf16_buffers(int num_elements, void* in_out, + void* in1, + void* in2, + void* in3, + void* in4, + void* in5, + void* in6, + void* in7) __attribute__((target("avx512bw"))); + +void reduce_all_bf16_buffers(struct allreduce_workspace* buffer, int num_elements, int num_buffers) +{ + switch (num_buffers) { + case 8: + reduce_8_bf16_buffers(num_elements, buffer[0].buffer, + buffer[1].buffer, + buffer[2].buffer, + buffer[3].buffer, + buffer[4].buffer, + buffer[5].buffer, + buffer[6].buffer, + buffer[7].buffer); + break; + case 7: + reduce_7_bf16_buffers(num_elements, buffer[0].buffer, + buffer[1].buffer, + buffer[2].buffer, + buffer[3].buffer, + buffer[4].buffer, + buffer[5].buffer, + buffer[6].buffer); + break; + case 6: + reduce_6_bf16_buffers(num_elements, buffer[0].buffer, + buffer[1].buffer, + buffer[2].buffer, + buffer[3].buffer, + buffer[4].buffer, + buffer[5].buffer); + break; + case 5: + reduce_5_bf16_buffers(num_elements, buffer[0].buffer, + buffer[1].buffer, + buffer[2].buffer, + buffer[3].buffer, + buffer[4].buffer); + break; + case 4: + reduce_4_bf16_buffers(num_elements, buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, buffer[3].buffer); + break; + case 3: + reduce_3_bf16_buffers(num_elements, buffer[0].buffer, buffer[1].buffer, buffer[2].buffer); + break; + default: + for (int i = 1; i < num_buffers; i++) { + reduce_2_bf16_buffers(num_elements, buffer[0].buffer, buffer[i].buffer); + } + break; + } +} + +#define REPEAT(N, x) REPEAT_##N(x) +#define REPEAT_1(x) x(1) +#define REPEAT_2(x) REPEAT_1(x);x(2) +#define REPEAT_3(x) REPEAT_2(x);x(3) +#define REPEAT_4(x) REPEAT_3(x);x(4) +#define REPEAT_5(x) REPEAT_4(x);x(5) +#define REPEAT_6(x) REPEAT_5(x);x(6) +#define REPEAT_7(x) REPEAT_6(x);x(7) + +#define CVT_ADD(x) do {\ + auto in##x##_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in##x + i))); \ + inout_val = _mm512_add_ps(inout_val, in##x##_val);} while(0) + +void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in1) +{ + for (int i = 0; i < num_elements * 2; i += 32) { + auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); + REPEAT(1, CVT_ADD); + _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); + } +} + +void reduce_3_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2) +{ + for (int i = 0; i < 
num_elements * 2; i += 32) { + auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); + REPEAT(2, CVT_ADD); + _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); + } +} + +void reduce_4_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2, void* in3) +{ + for (int i = 0; i < num_elements * 2; i += 32) { + auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); + REPEAT(3, CVT_ADD); + _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); + } +} + +void reduce_5_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2, void* in3, void* in4) +{ + for (int i = 0; i < num_elements * 2; i += 32) { + auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); + REPEAT(4, CVT_ADD); + _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); + } +} + +void reduce_6_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2, void* in3, void* in4, void* in5) +{ + for (int i = 0; i < num_elements * 2; i += 32) { + auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); + REPEAT(5, CVT_ADD); + _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); + } +} + +void reduce_7_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2, void* in3, void* in4, void* in5, void* in6) +{ + for (int i = 0; i < num_elements * 2; i += 32) { + auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); + REPEAT(6, CVT_ADD); + _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); + } +} + +void reduce_8_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2, void* in3, void* in4, void* in5, void* in6, void* in7) +{ + for (int i = 0; i < num_elements * 2; i += 32) { + auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); + REPEAT(7, CVT_ADD); + _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); + } +} + +// Communicatiooon settings +int world_rank = -1; +int world_size = -1; + std::set _comm_ids; std::set _colors; ccl::vector_class _ccl_comms; @@ -75,21 +289,24 @@ ccl::communicator& _get_comm_from_group(py::object group) { return _ccl_comms[0] bool is_initialized = 0; -#define MAX_BUF_SIZE 65536 - ccl::shared_ptr_class kvs; -SharedData allreduce_buffer; -char buffer_name[100] = "allreduce_buffer"; -struct allreduce_workspace { - int state; - char buffer[MAX_BUF_SIZE]; -}; -struct allreduce_workspace* buffer; +bool all_ranks_local_p = false; void initialize(int size, int rank, torch::Tensor& kvs_data) { if (is_initialized) return; + + // Check whetehr all ranks is on the same physical machine. 
+ // If true, we will use an SHM based low latency allreduce + + int ws = std::stoi(std::getenv("WORLD_SIZE")); + int ls = std::stoi(std::getenv("LOCAL_SIZE")); + + if (ws >= 1 && ws == ls) { + all_ranks_local_p = true; + } + world_size = size; world_rank = rank; is_initialized = 1; @@ -103,18 +320,21 @@ void initialize(int size, int rank, torch::Tensor& kvs_data) _ccl_comms.emplace_back(ccl::create_communicator(size, rank, kvs)); - if (rank == 0) { - buffer = (struct allreduce_workspace*)malloc(size * sizeof(struct allreduce_workspace)); - shared_create( - &allreduce_buffer, buffer_name, buffer, size * sizeof(struct allreduce_workspace)); + // create shared workspace for SHM based allreduce + if (all_ranks_local_p) { + if (rank == 0) { + buffer = (struct allreduce_workspace*)malloc(size * sizeof(struct allreduce_workspace)); + shared_create( + &allreduce_buffer, SHM_BUFFER_NAME, buffer, size * sizeof(struct allreduce_workspace)); + buffer = (struct allreduce_workspace*)allreduce_buffer.bytes; + for (int i = 0; i < size; i++) { buffer[i].state = 0; } + } + CCLCHECK(ccl::barrier(_get_comm_from_group()).wait()); + if (rank != 0) { + shared_open(&allreduce_buffer, SHM_BUFFER_NAME, size * sizeof(struct allreduce_workspace)); + } buffer = (struct allreduce_workspace*)allreduce_buffer.bytes; - for (int i = 0; i < size; i++) { buffer[i].state = 0; } } - CCLCHECK(ccl::barrier(_get_comm_from_group()).wait()); - if (rank != 0) { - shared_open(&allreduce_buffer, buffer_name, size * sizeof(struct allreduce_workspace)); - } - buffer = (struct allreduce_workspace*)allreduce_buffer.bytes; } /* @@ -236,7 +456,6 @@ int max_count = 0; // TODO: implement torch's async_op behavior, document it. void all_reduce(torch::Tensor& data, py::object op, py::object group, bool async_op) { - auto start = std::chrono::high_resolution_clock::now(); CCLCHECK(ccl::allreduce(data.data_ptr(), data.data_ptr(), data.numel(), @@ -244,288 +463,40 @@ void all_reduce(torch::Tensor& data, py::object op, py::object group, bool async get_ccl_reduce_op(op, data), _get_comm_from_group(group)) .wait()); - auto end = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end - start); - count++; - auto t = duration.count(); - total += t; - total_sq += t * t; - auto segma = sqrt(total_sq / count - total * total / count / count); - if (count == 17920 && world_rank == 0) { - printf("average duration: %f, std: %f\n", total / count, segma); - } -} - -void wait_buffer_state_until(int index, int state) -{ - volatile int* state_ptr = &(buffer[index].state); - - while (*state_ptr != state) - ; -} - -__m512 cvt_bf16_to_fp32(const __m256i src) __attribute__((target("avx512bw"))); -inline __m512 cvt_bf16_to_fp32(const __m256i src) -{ - auto y = _mm512_cvtepu16_epi32(src); - return _mm512_castsi512_ps(_mm512_bslli_epi128(y, 2)); -} - -inline __m256i cvt_fp32_to_bf16(const __m512 src) __attribute__((target("avx512bw"))); -inline __m256i cvt_fp32_to_bf16(const __m512 src) -{ -// #if (defined CPU_CAPABILITY_AVX512_BF16) -#if 0 - return reinterpret_cast<__m256i>(_mm512_cvtneps_pbh(src)); -#else - __m512i value = _mm512_castps_si512(src); - __m512i nan = _mm512_set1_epi32(0xffff); - auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); - __m512i ones = _mm512_set1_epi32(0x1); - __m512i vec_bias = _mm512_set1_epi32(0x7fff); - // uint32_t lsb = (input >> 16) & 1; - auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones); - // uint32_t rounding_bias = 0x7fff + lsb; - t_value = _mm512_add_epi32(t_value, 
vec_bias); - // input += rounding_bias; - t_value = _mm512_add_epi32(t_value, value); - // input = input >> 16; - t_value = _mm512_srli_epi32(t_value, 16); - // Check NaN before converting back to bf16 - t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); - return _mm512_cvtusepi32_epi16(t_value); -#endif -} - -void reduce_bf16_buffers(void* in_out, void* in, int num_elements) - __attribute__((target("avx512bw"))); - -void reduce_3_bf16_buffers(void* in_out, void* in1, void* in2, int num_elements) - __attribute__((target("avx512bw"))); -void reduce_4_bf16_buffers(void* in_out, void* in1, void* in2, void* in3, int num_elements) - __attribute__((target("avx512bw"))); -void reduce_5_bf16_buffers(void* in_out, - void* in1, - void* in2, - void* in3, - void* in4, - int num_elements) __attribute__((target("avx512bw"))); -void reduce_6_bf16_buffers(void* in_out, - void* in1, - void* in2, - void* in3, - void* in4, - void* in5, - int num_elements) __attribute__((target("avx512bw"))); -void reduce_7_bf16_buffers(void* in_out, - void* in1, - void* in2, - void* in3, - void* in4, - void* in5, - void* in6, - int num_elements) __attribute__((target("avx512bw"))); -void reduce_8_bf16_buffers(void* in_out, - void* in1, - void* in2, - void* in3, - void* in4, - void* in5, - void* in6, - void* in7, - int num_elements) __attribute__((target("avx512bw"))); - -void reduce_all_bf16_buffers(struct allreduce_workspace* buffer, int num_elements, int num_buffers) -{ - if (num_buffers == 8) { - reduce_8_bf16_buffers(buffer[0].buffer, - buffer[1].buffer, - buffer[2].buffer, - buffer[3].buffer, - buffer[4].buffer, - buffer[5].buffer, - buffer[6].buffer, - buffer[7].buffer, - num_elements); - } else if (num_buffers == 7) { - reduce_7_bf16_buffers(buffer[0].buffer, - buffer[1].buffer, - buffer[2].buffer, - buffer[3].buffer, - buffer[4].buffer, - buffer[5].buffer, - buffer[6].buffer, - num_elements); - } else if (num_buffers == 6) { - reduce_6_bf16_buffers(buffer[0].buffer, - buffer[1].buffer, - buffer[2].buffer, - buffer[3].buffer, - buffer[4].buffer, - buffer[5].buffer, - num_elements); - } else if (num_buffers == 5) { - reduce_5_bf16_buffers(buffer[0].buffer, - buffer[1].buffer, - buffer[2].buffer, - buffer[3].buffer, - buffer[4].buffer, - num_elements); - } else if (num_buffers == 4) { - reduce_4_bf16_buffers( - buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, buffer[3].buffer, num_elements); - } else if (num_buffers == 3) { - reduce_3_bf16_buffers(buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, num_elements); - } else { - for (int i = 1; i < num_buffers; i++) { - reduce_bf16_buffers(buffer[0].buffer, buffer[i].buffer, num_elements); - } - } -} - -void reduce_bf16_buffers(void* in_out, void* in, int num_elements) -{ - for (int i = 0; i < num_elements * 2; i += 32) { - auto in1 = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in + i))); - auto inout1 = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - inout1 = _mm512_add_ps(inout1, in1); - _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout1)); - } -} - -void reduce_3_bf16_buffers(void* in_out, void* in1, void* in2, int num_elements) -{ - for (int i = 0; i < num_elements * 2; i += 32) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - auto in1_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in1 + i))); - inout_val = _mm512_add_ps(inout_val, in1_val); - auto in2_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in2 + i))); - inout_val = _mm512_add_ps(inout_val, in2_val); - 
_mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); - } -} - -void reduce_4_bf16_buffers(void* in_out, void* in1, void* in2, void* in3, int num_elements) -{ - for (int i = 0; i < num_elements * 2; i += 32) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - auto in1_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in1 + i))); - inout_val = _mm512_add_ps(inout_val, in1_val); - auto in2_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in2 + i))); - inout_val = _mm512_add_ps(inout_val, in2_val); - auto in3_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in3 + i))); - inout_val = _mm512_add_ps(inout_val, in3_val); - _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); - } } -void reduce_5_bf16_buffers(void* in_out, - void* in1, - void* in2, - void* in3, - void* in4, - int num_elements) -{ - for (int i = 0; i < num_elements * 2; i += 32) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - auto in1_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in1 + i))); - inout_val = _mm512_add_ps(inout_val, in1_val); - auto in2_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in2 + i))); - inout_val = _mm512_add_ps(inout_val, in2_val); - auto in3_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in3 + i))); - inout_val = _mm512_add_ps(inout_val, in3_val); - auto in4_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in4 + i))); - inout_val = _mm512_add_ps(inout_val, in4_val); - _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); - } -} - -void reduce_6_bf16_buffers(void* in_out, - void* in1, - void* in2, - void* in3, - void* in4, - void* in5, - int num_elements) -{ - for (int i = 0; i < num_elements * 2; i += 32) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - auto in1_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in1 + i))); - inout_val = _mm512_add_ps(inout_val, in1_val); - auto in2_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in2 + i))); - inout_val = _mm512_add_ps(inout_val, in2_val); - auto in3_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in3 + i))); - inout_val = _mm512_add_ps(inout_val, in3_val); - auto in4_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in4 + i))); - inout_val = _mm512_add_ps(inout_val, in4_val); - auto in5_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in5 + i))); - inout_val = _mm512_add_ps(inout_val, in5_val); - _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); - } -} - -void reduce_7_bf16_buffers(void* in_out, - void* in1, - void* in2, - void* in3, - void* in4, - void* in5, - void* in6, - int num_elements) -{ - for (int i = 0; i < num_elements * 2; i += 32) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - auto in1_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in1 + i))); - inout_val = _mm512_add_ps(inout_val, in1_val); - auto in2_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in2 + i))); - inout_val = _mm512_add_ps(inout_val, in2_val); - auto in3_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in3 + i))); - inout_val = _mm512_add_ps(inout_val, in3_val); - auto in4_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in4 + i))); - inout_val = _mm512_add_ps(inout_val, in4_val); - auto in5_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in5 + i))); - inout_val = _mm512_add_ps(inout_val, in5_val); - auto in6_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in6 + i))); - 
inout_val = _mm512_add_ps(inout_val, in6_val); - _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); - } -} - -void reduce_8_bf16_buffers(void* in_out, - void* in1, - void* in2, - void* in3, - void* in4, - void* in5, - void* in6, - void* in7, - int num_elements) +void all_reduce_caching(torch::Tensor& data, + py::object op, + std::string match_id, + py::object group, + bool async_op) { - for (int i = 0; i < num_elements * 2; i += 32) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - auto in1_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in1 + i))); - inout_val = _mm512_add_ps(inout_val, in1_val); - auto in2_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in2 + i))); - inout_val = _mm512_add_ps(inout_val, in2_val); - auto in3_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in3 + i))); - inout_val = _mm512_add_ps(inout_val, in3_val); - auto in4_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in4 + i))); - inout_val = _mm512_add_ps(inout_val, in4_val); - auto in5_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in5 + i))); - inout_val = _mm512_add_ps(inout_val, in5_val); - auto in6_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in6 + i))); - inout_val = _mm512_add_ps(inout_val, in6_val); - auto in7_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in7 + i))); - inout_val = _mm512_add_ps(inout_val, in7_val); - _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); - } + ccl::allreduce_attr attr = ccl::default_allreduce_attr; + auto match_str = ccl::v1::string(match_id); + attr.template set(true); + attr.template set(match_str); + // To control this, use operation attribute and set true value for to_cache field and unique + // string (for example, tensor name) for match_id field. Note that: + // match_id should be the same for a specific communication operation across all ranks. + // If the same tensor is a part of different communication operations, match_id should have + // different values for each of these operations. 
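The comment above states the contract for match_id: it must be identical across ranks for the same logical operation, and distinct when the same tensor takes part in different operations. A hypothetical helper that satisfies that contract, illustrative only and not part of this patch or of DeepSpeed's API:

    # Hypothetical helper: a match_id that is stable across ranks for one logical
    # op and unique per (tensor, operation) pair.
    def make_match_id(tensor_name: str, op_tag: str) -> str:
        return f"{tensor_name}:{op_tag}"

    # e.g. make_match_id("decoder.layers.0.mlp.weight", "allreduce_grad")
    #   -> "decoder.layers.0.mlp.weight:allreduce_grad"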
+ CCLCHECK(ccl::allreduce(data.data_ptr(), + data.data_ptr(), + data.numel(), + get_ccl_datatype(data.scalar_type()), + get_ccl_reduce_op(op, data), + _get_comm_from_group(group), + attr) + .wait()); } void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group, bool async_op) { auto numel = data.numel(); + auto datatype = data.scalar_type(); + auto reduce_op = op; - if (numel * 2 > MAX_BUF_SIZE) { + if (numel * 2 > MAX_BUF_SIZE || !all_ranks_local_p) { // fallback to oneccl allreduce CCLCHECK(ccl::allreduce(data.data_ptr(), data.data_ptr(), @@ -538,10 +509,7 @@ void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group } auto data_ptr = data.data_ptr(); - auto datatype = data.scalar_type(); - auto reduce_op = op; - auto start = std::chrono::high_resolution_clock::now(); memcpy(buffer[world_rank].buffer, data_ptr, numel * 2); buffer[world_rank].state = 1; @@ -552,9 +520,6 @@ void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group wait_buffer_state_until(i, 1); } reduce_all_bf16_buffers(buffer, numel, world_size); - // for (int i=1; i< world_size; i++) { - // reduce_bf16_buffers(buffer[0].buffer, buffer[i].buffer, numel); - // } buffer[world_rank].state = 2; memcpy(data_ptr, buffer[0].buffer, numel * 2); } @@ -571,41 +536,6 @@ void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group wait_buffer_state_until(0, 0); buffer[world_rank].state = 0; } - auto end = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end - start); - count++; - auto t = duration.count(); - total += t; - total_sq += t * t; - auto segma = sqrt(total_sq / count - total * total / count / count); - if (count == 17920 && world_rank == 0) { - printf("average duration: %f, std: %f\n", total / count, segma); - } -} - -void all_reduce_caching(torch::Tensor& data, - py::object op, - std::string match_id, - py::object group, - bool async_op) -{ - ccl::allreduce_attr attr = ccl::default_allreduce_attr; - auto match_str = ccl::v1::string(match_id); - attr.template set(true); - attr.template set(match_str); - // To control this, use operation attribute and set true value for to_cache field and unique - // string (for example, tensor name) for match_id field. Note that: - // match_id should be the same for a specific communication operation across all ranks. - // If the same tensor is a part of different communication operations, match_id should have - // different values for each of these operations. 
- CCLCHECK(ccl::allreduce(data.data_ptr(), - data.data_ptr(), - data.numel(), - get_ccl_datatype(data.scalar_type()), - get_ccl_reduce_op(op, data), - _get_comm_from_group(group), - attr) - .wait()); } void barrier(py::object group, bool async_op) From 6ecf721603b65d4f987c7b35413c76035a6e16ef Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 30 Jun 2023 11:20:02 -0400 Subject: [PATCH 17/74] first clean up code --- csrc/cpu/comm/ccl.cpp | 174 +++++++++++++----------------------------- 1 file changed, 53 insertions(+), 121 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index a390e70a0fc3..ab3e046a6778 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -65,11 +65,11 @@ struct allreduce_workspace { int state; char buffer[MAX_BUF_SIZE]; }; -struct allreduce_workspace* buffer; +struct allreduce_workspace* workspace; void wait_buffer_state_until(int index, int state) { - volatile int* state_ptr = &(buffer[index].state); + volatile int* state_ptr = &(workspace[index].state); while (*state_ptr != state) ; @@ -132,63 +132,17 @@ void reduce_7_bf16_buffers(int num_elements, void* in_out, void* in4, void* in5, void* in6) __attribute__((target("avx512bw"))); -void reduce_8_bf16_buffers(int num_elements, void* in_out, - void* in1, - void* in2, - void* in3, - void* in4, - void* in5, - void* in6, - void* in7) __attribute__((target("avx512bw"))); -void reduce_all_bf16_buffers(struct allreduce_workspace* buffer, int num_elements, int num_buffers) +void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) __attribute__((target("avx512bw"))); + +void reduce_all_bf16_buffers(struct allreduce_workspace* workspace, int num_elements, int num_buffers) { - switch (num_buffers) { - case 8: - reduce_8_bf16_buffers(num_elements, buffer[0].buffer, - buffer[1].buffer, - buffer[2].buffer, - buffer[3].buffer, - buffer[4].buffer, - buffer[5].buffer, - buffer[6].buffer, - buffer[7].buffer); - break; - case 7: - reduce_7_bf16_buffers(num_elements, buffer[0].buffer, - buffer[1].buffer, - buffer[2].buffer, - buffer[3].buffer, - buffer[4].buffer, - buffer[5].buffer, - buffer[6].buffer); - break; - case 6: - reduce_6_bf16_buffers(num_elements, buffer[0].buffer, - buffer[1].buffer, - buffer[2].buffer, - buffer[3].buffer, - buffer[4].buffer, - buffer[5].buffer); - break; - case 5: - reduce_5_bf16_buffers(num_elements, buffer[0].buffer, - buffer[1].buffer, - buffer[2].buffer, - buffer[3].buffer, - buffer[4].buffer); - break; - case 4: - reduce_4_bf16_buffers(num_elements, buffer[0].buffer, buffer[1].buffer, buffer[2].buffer, buffer[3].buffer); - break; - case 3: - reduce_3_bf16_buffers(num_elements, buffer[0].buffer, buffer[1].buffer, buffer[2].buffer); - break; - default: + if (num_buffers >=3 && num_buffers <=8) { + reduce_bf16_buffers(num_elements, num_buffers, workspace); + } else { for (int i = 1; i < num_buffers; i++) { - reduce_2_bf16_buffers(num_elements, buffer[0].buffer, buffer[i].buffer); + reduce_2_bf16_buffers(num_elements, workspace[0].buffer, workspace[i].buffer); } - break; } } @@ -202,68 +156,46 @@ void reduce_all_bf16_buffers(struct allreduce_workspace* buffer, int num_element #define REPEAT_7(x) REPEAT_6(x);x(7) #define CVT_ADD(x) do {\ - auto in##x##_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in##x + i))); \ + auto in##x##_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[x].buffer + i))); \ inout_val = _mm512_add_ps(inout_val, in##x##_val);} while(0) -void reduce_2_bf16_buffers(int num_elements, void* 
in_out, void* in1) -{ - for (int i = 0; i < num_elements * 2; i += 32) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - REPEAT(1, CVT_ADD); - _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); - } -} - -void reduce_3_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2) +void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) { for (int i = 0; i < num_elements * 2; i += 32) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - REPEAT(2, CVT_ADD); - _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); - } -} - -void reduce_4_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2, void* in3) -{ - for (int i = 0; i < num_elements * 2; i += 32) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - REPEAT(3, CVT_ADD); - _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); - } -} - -void reduce_5_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2, void* in3, void* in4) -{ - for (int i = 0; i < num_elements * 2; i += 32) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - REPEAT(4, CVT_ADD); - _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); - } -} - -void reduce_6_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2, void* in3, void* in4, void* in5) -{ - for (int i = 0; i < num_elements * 2; i += 32) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - REPEAT(5, CVT_ADD); - _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); - } -} - -void reduce_7_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2, void* in3, void* in4, void* in5, void* in6) -{ - for (int i = 0; i < num_elements * 2; i += 32) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - REPEAT(6, CVT_ADD); - _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); + auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[0].buffer + i))); + switch (num_buffers) { + case 8: + REPEAT(7, CVT_ADD); + break; + case 7: + REPEAT(6, CVT_ADD); + break; + case 6: + REPEAT(5, CVT_ADD); + break; + case 5: + REPEAT(4, CVT_ADD); + break; + case 4: + REPEAT(3, CVT_ADD); + break; + case 3: + REPEAT(2, CVT_ADD); + break; + default: + assert(!"Should not get here."); + } + _mm256_storeu_si256((__m256i*)(workspace[0].buffer + i), cvt_fp32_to_bf16(inout_val)); } } -void reduce_8_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2, void* in3, void* in4, void* in5, void* in6, void* in7) +void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in1) { for (int i = 0; i < num_elements * 2; i += 32) { auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - REPEAT(7, CVT_ADD); + auto in1_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in1 + i))); + inout_val = _mm512_add_ps(inout_val, in1_val); + REPEAT(1, CVT_ADD); _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); } } @@ -323,17 +255,17 @@ void initialize(int size, int rank, torch::Tensor& kvs_data) // create shared workspace for SHM based allreduce if (all_ranks_local_p) { if (rank == 0) { - buffer = (struct allreduce_workspace*)malloc(size * sizeof(struct allreduce_workspace)); + workspace = (struct allreduce_workspace*)malloc(size * sizeof(struct allreduce_workspace)); shared_create( - 
&allreduce_buffer, SHM_BUFFER_NAME, buffer, size * sizeof(struct allreduce_workspace)); - buffer = (struct allreduce_workspace*)allreduce_buffer.bytes; - for (int i = 0; i < size; i++) { buffer[i].state = 0; } + &allreduce_buffer, SHM_BUFFER_NAME, workspace, size * sizeof(struct allreduce_workspace)); + workspace = (struct allreduce_workspace*)allreduce_buffer.bytes; + for (int i = 0; i < size; i++) { workspace[i].state = 0; } } CCLCHECK(ccl::barrier(_get_comm_from_group()).wait()); if (rank != 0) { shared_open(&allreduce_buffer, SHM_BUFFER_NAME, size * sizeof(struct allreduce_workspace)); } - buffer = (struct allreduce_workspace*)allreduce_buffer.bytes; + workspace = (struct allreduce_workspace*)allreduce_buffer.bytes; } } @@ -510,8 +442,8 @@ void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group auto data_ptr = data.data_ptr(); - memcpy(buffer[world_rank].buffer, data_ptr, numel * 2); - buffer[world_rank].state = 1; + memcpy(workspace[world_rank].buffer, data_ptr, numel * 2); + workspace[world_rank].state = 1; if (world_rank == 0) { // compute allreduce result on rank 0 @@ -519,22 +451,22 @@ void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group // wait until the other rank copy the buffer wait_buffer_state_until(i, 1); } - reduce_all_bf16_buffers(buffer, numel, world_size); - buffer[world_rank].state = 2; - memcpy(data_ptr, buffer[0].buffer, numel * 2); + reduce_all_bf16_buffers(workspace, numel, world_size); + workspace[world_rank].state = 2; + memcpy(data_ptr, workspace[0].buffer, numel * 2); } if (world_rank != 0) { wait_buffer_state_until(0, 2); - memcpy(data_ptr, buffer[0].buffer, numel * 2); - buffer[world_rank].state = 2; + memcpy(data_ptr, workspace[0].buffer, numel * 2); + workspace[world_rank].state = 2; } if (world_rank == 0) { for (int i = 1; i < world_size; i++) { wait_buffer_state_until(i, 2); } - buffer[world_rank].state = 0; + workspace[world_rank].state = 0; } if (world_rank != 0) { wait_buffer_state_until(0, 0); - buffer[world_rank].state = 0; + workspace[world_rank].state = 0; } } From d6a3ac8959882d351a14b167c17ab67a7c194c70 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 30 Jun 2023 12:04:07 -0400 Subject: [PATCH 18/74] add checks for allreduce_low_latency, remove warning --- csrc/cpu/comm/ccl.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index ab3e046a6778..f9939a0ab8d2 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -192,11 +192,11 @@ void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_wor void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in1) { for (int i = 0; i < num_elements * 2; i += 32) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in_out + i))); - auto in1_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(in1 + i))); + auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)((char*)in_out + i))); + auto in1_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)((char*)in1 + i))); inout_val = _mm512_add_ps(inout_val, in1_val); REPEAT(1, CVT_ADD); - _mm256_storeu_si256((__m256i*)(in_out + i), cvt_fp32_to_bf16(inout_val)); + _mm256_storeu_si256((__m256i*)((char*)in_out + i), cvt_fp32_to_bf16(inout_val)); } } @@ -424,11 +424,16 @@ void all_reduce_caching(torch::Tensor& data, void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group, bool async_op) { + static py::object ReduceOp = 
py::module_::import("deepspeed.comm").attr("ReduceOp"); + static auto ReduceOpSum = (int)py::int_(ReduceOp.attr("SUM").attr("value")); + + assert (py::int_(op.attr("value")) == ReduceOpSum); + auto numel = data.numel(); - auto datatype = data.scalar_type(); - auto reduce_op = op; - if (numel * 2 > MAX_BUF_SIZE || !all_ranks_local_p) { + if (numel * 2 > MAX_BUF_SIZE + || data.scalar_type() != c10::ScalarType::BFloat16 + || !all_ranks_local_p) { // fallback to oneccl allreduce CCLCHECK(ccl::allreduce(data.data_ptr(), data.data_ptr(), From 3364a93b446baf61fb85a6979e280d22bf63960e Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 30 Jun 2023 12:09:38 -0400 Subject: [PATCH 19/74] remove redudant declaration, fix 2 ranks --- csrc/cpu/comm/ccl.cpp | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index f9939a0ab8d2..6ee39d12af12 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -110,29 +110,6 @@ inline __m256i cvt_fp32_to_bf16(const __m512 src) void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in) __attribute__((target("avx512bw"))); -void reduce_3_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2) - __attribute__((target("avx512bw"))); -void reduce_4_bf16_buffers(int num_elements, void* in_out, void* in1, void* in2, void* in3) - __attribute__((target("avx512bw"))); -void reduce_5_bf16_buffers(int num_elements, void* in_out, - void* in1, - void* in2, - void* in3, - void* in4) __attribute__((target("avx512bw"))); -void reduce_6_bf16_buffers(int num_elements, void* in_out, - void* in1, - void* in2, - void* in3, - void* in4, - void* in5) __attribute__((target("avx512bw"))); -void reduce_7_bf16_buffers(int num_elements, void* in_out, - void* in1, - void* in2, - void* in3, - void* in4, - void* in5, - void* in6) __attribute__((target("avx512bw"))); - void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) __attribute__((target("avx512bw"))); void reduce_all_bf16_buffers(struct allreduce_workspace* workspace, int num_elements, int num_buffers) @@ -195,7 +172,6 @@ void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in1) auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)((char*)in_out + i))); auto in1_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)((char*)in1 + i))); inout_val = _mm512_add_ps(inout_val, in1_val); - REPEAT(1, CVT_ADD); _mm256_storeu_si256((__m256i*)((char*)in_out + i), cvt_fp32_to_bf16(inout_val)); } } From afc67f68bc9d33a66d3534145723766a1c57b8b0 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Sun, 2 Jul 2023 22:09:49 -0400 Subject: [PATCH 20/74] remove avx512f path --- csrc/cpu/comm/ccl.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 6ee39d12af12..7ae38308b12a 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -85,9 +85,6 @@ inline __m512 cvt_bf16_to_fp32(const __m256i src) inline __m256i cvt_fp32_to_bf16(const __m512 src) __attribute__((target("avx512bw"))); inline __m256i cvt_fp32_to_bf16(const __m512 src) { -#if 0 - return reinterpret_cast<__m256i>(_mm512_cvtneps_pbh(src)); -#else __m512i value = _mm512_castps_si512(src); __m512i nan = _mm512_set1_epi32(0xffff); auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); @@ -104,7 +101,6 @@ inline __m256i cvt_fp32_to_bf16(const __m512 src) // Check NaN before converting back to bf16 t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); return 
_mm512_cvtusepi32_epi16(t_value); -#endif } void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in) From 6f05cbfa729b6e0a1e58021c0997a7df611cc4d3 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Mon, 3 Jul 2023 22:50:48 -0400 Subject: [PATCH 21/74] check whether buffer size is divisible by 16 --- csrc/cpu/comm/ccl.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 7ae38308b12a..800063aac282 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -132,6 +132,7 @@ void reduce_all_bf16_buffers(struct allreduce_workspace* workspace, int num_elem auto in##x##_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[x].buffer + i))); \ inout_val = _mm512_add_ps(inout_val, in##x##_val);} while(0) +// num_elements must be divisible by 16 (caller check) void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) { for (int i = 0; i < num_elements * 2; i += 32) { @@ -404,6 +405,7 @@ void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group auto numel = data.numel(); if (numel * 2 > MAX_BUF_SIZE + || (numel % 16) != 0 || data.scalar_type() != c10::ScalarType::BFloat16 || !all_ranks_local_p) { // fallback to oneccl allreduce From 4a38410b5e5c1d6e078844416f1cde3db85946ce Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Tue, 4 Jul 2023 14:31:21 +0800 Subject: [PATCH 22/74] autoTP linear allreduce should go to allreduce_low_latency --- deepspeed/module_inject/layers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py index 7c08e9df7be1..9149794c70f0 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -31,8 +31,7 @@ def forward(self, input): #output = output_tensors[output_size] #torch.matmul(input, self.weight.transpose(-1, -2), out=output) output = torch.matmul(input, self.weight.transpose(-1, -2)) - if self.mp_group is not None: - dist.all_reduce_low_latency(output, group=self.mp_group) + dist.all_reduce_low_latency(output, group=self.mp_group) if self.bias is not None: output += self.bias return output From c88f3bda27022fa8d19faf45be7172bd2a1e978f Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Tue, 4 Jul 2023 09:00:22 -0400 Subject: [PATCH 23/74] cleanup profile code --- csrc/cpu/comm/ccl.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 800063aac282..6d83a80481ad 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -351,13 +351,6 @@ void broadcast(torch::Tensor& data, int src, py::object group, bool async_op) .wait()); } -float total = 0.0f; -float total_sq = 0.0f; -float min = 1000.0f; -float max = 0.0f; -int count = 0; -int min_count = 0; -int max_count = 0; // TODO: implement torch's async_op behavior, document it. 
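At this point in the series the SHM fast path in all_reduce_low_latency is taken only when four conditions hold; otherwise the call falls back to the oneCCL allreduce. A Python paraphrase of that check, illustrative only (MAX_BUF_SIZE is still 65536 bytes here and is enlarged in a later patch; the function name is an assumption):

    # Paraphrase of the eligibility test for the SHM fast path (not DeepSpeed API).
    MAX_BUF_SIZE = 65536  # bytes at this point in the series; enlarged later

    def shm_fast_path_eligible(numel, is_bf16, all_ranks_local):
        return (numel * 2 <= MAX_BUF_SIZE    # bf16 payload fits the per-rank buffer
                and numel % 16 == 0          # the SIMD loop handles 16 elements per step
                and is_bf16                  # only BFloat16 is handled by the SHM kernel
                and all_ranks_local)         # all ranks share one physical machine

When the fast path is taken, ranks coordinate through the per-rank state field in the shared workspace: 1 means the rank's data has been copied in, 2 means the reduced result is ready, 0 means the rank is done, and a later patch in this series adds release fences so the buffer writes are ordered before each state update.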
void all_reduce(torch::Tensor& data, py::object op, py::object group, bool async_op) { From bd5ea3f445efd3ac192cc8863a580c3adebf2df3 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Tue, 4 Jul 2023 12:07:09 -0400 Subject: [PATCH 24/74] make SHM buffer larger --- csrc/cpu/comm/ccl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 6d83a80481ad..34d9912c9e02 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -58,7 +58,7 @@ void shared_close(SharedData* data) } // SHM based allreduce helper functions -#define MAX_BUF_SIZE 65536 +#define MAX_BUF_SIZE 1048576 #define SHM_BUFFER_NAME "deepspeed_allreduce_buffer" SharedData allreduce_buffer; struct allreduce_workspace { From 3feebc181dd40d01ff94925e35d590cc93752e3f Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Tue, 4 Jul 2023 21:54:42 -0400 Subject: [PATCH 25/74] add mfence to ensure memory order before each buffer state update --- csrc/cpu/comm/ccl.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 34d9912c9e02..6f22b580118c 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -11,10 +11,10 @@ #include #include #include -#include #include #include #include +#include // SHM building blocks struct SharedData { @@ -415,6 +415,7 @@ void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group auto data_ptr = data.data_ptr(); memcpy(workspace[world_rank].buffer, data_ptr, numel * 2); + std::atomic_thread_fence(std::memory_order_release); workspace[world_rank].state = 1; if (world_rank == 0) { @@ -424,16 +425,19 @@ void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group wait_buffer_state_until(i, 1); } reduce_all_bf16_buffers(workspace, numel, world_size); + std::atomic_thread_fence(std::memory_order_release); workspace[world_rank].state = 2; memcpy(data_ptr, workspace[0].buffer, numel * 2); } if (world_rank != 0) { wait_buffer_state_until(0, 2); memcpy(data_ptr, workspace[0].buffer, numel * 2); + std::atomic_thread_fence(std::memory_order_release); workspace[world_rank].state = 2; } if (world_rank == 0) { for (int i = 1; i < world_size; i++) { wait_buffer_state_until(i, 2); } + std::atomic_thread_fence(std::memory_order_release); workspace[world_rank].state = 0; } if (world_rank != 0) { From 0ee639f9c31351df2059bd02a7ca6dd86b4ffe3b Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 6 Jul 2023 15:39:01 +0800 Subject: [PATCH 26/74] fix typo --- deepspeed/utils/numa.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/deepspeed/utils/numa.py b/deepspeed/utils/numa.py index 07ba77bb6e80..fdc0d45acaa0 100644 --- a/deepspeed/utils/numa.py +++ b/deepspeed/utils/numa.py @@ -121,11 +121,34 @@ def get_numactl_cmd(bind_core_list, num_local_procs, local_rank): # check if all cores belong to same numa, if true, bind process to that numa domain with -m parameter numa_cores = get_numa_cores() num_numas = len(numa_cores) + + numa_mode = "normal" + + non_empty_numa_list = [] + empty_numa_list = [] for i in range(num_numas): - if set(core_list_for_rank) <= set(numa_cores[i]): - numactl_cmd.append("-m") - numactl_cmd.append(f"{i}") - break + # look for empty numa which is HBM numa + if numa_cores[i] == []: + empty_numa_list.append[i] + else: + non_empty_numa_list.append[i] + + if empty_numa_list != [] and len(empty_numa_list) == len(non_empty_numa_list): + numa_mode = "flat_hbm" + numa_dict = 
dict(zip(non_empty_numa_list, empty_numa_list)) + + if numa_mode == "normal": + for i in range(num_numas): + if set(core_list_for_rank) <= set(numa_cores[i]): + numactl_cmd.append("-m") + numactl_cmd.append(f"{i}") + break + elif numa_mode == "flat_hbm": + for i in range(num_numas): + if set(core_list_for_rank) <= set(numa_cores[i]): + numactl_cmd.append("-p") + numactl_cmd.append(f"{numa_dict[i]}") + break numactl_cmd.append("-C") last_core = core_list_for_rank[0] From a75064a99714206cff466061df76b143394021fa Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 6 Jul 2023 04:09:54 -0400 Subject: [PATCH 27/74] fix typo --- deepspeed/utils/numa.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepspeed/utils/numa.py b/deepspeed/utils/numa.py index fdc0d45acaa0..d3a76cbbf00c 100644 --- a/deepspeed/utils/numa.py +++ b/deepspeed/utils/numa.py @@ -129,9 +129,9 @@ def get_numactl_cmd(bind_core_list, num_local_procs, local_rank): for i in range(num_numas): # look for empty numa which is HBM numa if numa_cores[i] == []: - empty_numa_list.append[i] + empty_numa_list.append([i]) else: - non_empty_numa_list.append[i] + non_empty_numa_list.append([i]) if empty_numa_list != [] and len(empty_numa_list) == len(non_empty_numa_list): numa_mode = "flat_hbm" From 9a1160307099c9ef8006012f87b1a34e185690ae Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 6 Jul 2023 04:27:59 -0400 Subject: [PATCH 28/74] add temp print cmd --- deepspeed/launcher/launch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index b3cb9d1205a9..39db33ac2ad7 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -243,6 +243,7 @@ def main(): cmd.append(f"--local_rank={local_rank}") cmd += args.training_script_args + print (cmd) if args.enable_each_rank_log != "None": log_file = os.path.join(args.enable_each_rank_log, f"{log_name_prefix}_rank{dist_rank}.log") log_fd = open(log_file, 'w') From 4107634d8f1145f9e6796c761cb12111b8c1ba10 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 6 Jul 2023 04:34:46 -0400 Subject: [PATCH 29/74] fix error in HBM path --- deepspeed/utils/numa.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepspeed/utils/numa.py b/deepspeed/utils/numa.py index d3a76cbbf00c..e6fd13c4377a 100644 --- a/deepspeed/utils/numa.py +++ b/deepspeed/utils/numa.py @@ -129,9 +129,9 @@ def get_numactl_cmd(bind_core_list, num_local_procs, local_rank): for i in range(num_numas): # look for empty numa which is HBM numa if numa_cores[i] == []: - empty_numa_list.append([i]) + empty_numa_list.append(i) else: - non_empty_numa_list.append([i]) + non_empty_numa_list.append(i) if empty_numa_list != [] and len(empty_numa_list) == len(non_empty_numa_list): numa_mode = "flat_hbm" From 3723eb15a42211982dab65125364548c4184489d Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 6 Jul 2023 04:43:15 -0400 Subject: [PATCH 30/74] remove cmd print --- deepspeed/launcher/launch.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index 39db33ac2ad7..b3cb9d1205a9 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -243,7 +243,6 @@ def main(): cmd.append(f"--local_rank={local_rank}") cmd += args.training_script_args - print (cmd) if args.enable_each_rank_log != "None": log_file = os.path.join(args.enable_each_rank_log, f"{log_name_prefix}_rank{dist_rank}.log") log_fd = open(log_file, 'w') From 
accf0e38a1112e75d0bccbfbcef0733b098056a4 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 6 Jul 2023 06:22:28 -0400 Subject: [PATCH 31/74] remove buffer reuse in linear allreduce since it does not bring visible benefits --- deepspeed/module_inject/layers.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py index 9149794c70f0..3433f5d672c8 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -11,9 +11,6 @@ from torch.nn.parameter import Parameter from deepspeed.accelerator import get_accelerator -global output_tensors -output_tensors = {} - class LinearAllreduce(nn.Module): @@ -24,12 +21,6 @@ def __init__(self, weight, bias=None, mp_group=None): self.mp_group = mp_group def forward(self, input): - #global output_tensors - #output_size = input.size()[:-1] + self.weight.transpose(-1,-2).size()[1:] - #if not output_size in output_tensors: - # output_tensors[output_size] = torch.empty(output_size, device=input.device, dtype=input.dtype) - #output = output_tensors[output_size] - #torch.matmul(input, self.weight.transpose(-1, -2), out=output) output = torch.matmul(input, self.weight.transpose(-1, -2)) dist.all_reduce_low_latency(output, group=self.mp_group) if self.bias is not None: From 0d2699ace9d474880b724591c146bd5966af25b6 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Mon, 10 Jul 2023 10:38:11 +0800 Subject: [PATCH 32/74] add support for fakenuma support --- accelerator/cpu_accelerator.py | 4 +++- deepspeed/utils/numa.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 824b912df1ec..09a79519da3f 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -47,9 +47,11 @@ def device_count(self): # Ignore these NUMA nodes with no cores. 
numa_core_lists = get_numa_cores() numa_count = 0 + prev_core_list = [] for core_list in numa_core_lists: - if len(core_list) > 0: + if len(core_list) > 0 and core_list != prev_core_list: numa_count += 1 + prev_core_list = core_list return numa_count def synchronize(self, device_index=None): diff --git a/deepspeed/utils/numa.py b/deepspeed/utils/numa.py index e6fd13c4377a..b9283d783b61 100644 --- a/deepspeed/utils/numa.py +++ b/deepspeed/utils/numa.py @@ -126,6 +126,9 @@ def get_numactl_cmd(bind_core_list, num_local_procs, local_rank): non_empty_numa_list = [] empty_numa_list = [] + previous_numa_cores = [] + numa_node_list = [] + numa_node_list_list = [] for i in range(num_numas): # look for empty numa which is HBM numa if numa_cores[i] == []: @@ -133,9 +136,25 @@ def get_numactl_cmd(bind_core_list, num_local_procs, local_rank): else: non_empty_numa_list.append(i) + # check for fakenuma + if numa_cores[i] == previous_numa_cores: + if numa_node_list == []: + #first duplication, add previous node into list + numa_node_list.append(i - 1) + numa_node_list.append(i) + else: + if numa_node_list != []: + numa_node_list_list.append(numa_node_list) + numa_node_list = [] + previous_numa_cores = numa_cores[i] + if numa_node_list != []: + numa_node_list_list.append(numa_node_list) + if empty_numa_list != [] and len(empty_numa_list) == len(non_empty_numa_list): numa_mode = "flat_hbm" numa_dict = dict(zip(non_empty_numa_list, empty_numa_list)) + elif numa_node_list_list != []: + numa_mode = "fake" if numa_mode == "normal": for i in range(num_numas): @@ -149,6 +168,18 @@ def get_numactl_cmd(bind_core_list, num_local_procs, local_rank): numactl_cmd.append("-p") numactl_cmd.append(f"{numa_dict[i]}") break + elif numa_mode == "fake": + for i in range(num_numas): + if set(core_list_for_rank) <= set(numa_cores[i]): + for nodes in numa_node_list_list: + if i in nodes: + numactl_cmd.append("-m") + numactl_cmd.append(f"{','.join(map(str, nodes))}") + break + # the following construct break the outer loop if inner loop breaks + else: + continue + break numactl_cmd.append("-C") last_core = core_list_for_rank[0] From 0ac06bc0ebc9490936561f9e6818800288616b5b Mon Sep 17 00:00:00 2001 From: baodii Date: Thu, 6 Jul 2023 16:46:55 +0800 Subject: [PATCH 33/74] fix llama meta data error when model device is meta and LLaMa lm_head wrongly loading error --- deepspeed/module_inject/replace_module.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index f0fe81f28714..1ea89b596dbf 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -485,7 +485,14 @@ def _replace_module(r_module, prev_name='', prev_class_name=''): else: class_name = prev_class_name + '.' + prev_name checking_key = prefix + '.' + class_name + '.' + name + '.' if class_name != "" else prefix + '.' + name + '.' 
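A worked example of the key computed above, with illustrative values, showing the prefix that is matched against state_dict keys:

    # Illustrative values only.
    prefix, class_name, name = "model", "decoder.layers.0", "self_attn.q_proj"
    checking_key = prefix + '.' + class_name + '.' + name + '.' if class_name != "" else prefix + '.' + name + '.'
    # checking_key == "model.decoder.layers.0.self_attn.q_proj."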
- if child.__class__ in [nn.Linear, nn.Embedding, nn.LayerNorm] and state_dict != None: + try: + import transformers + LlamaRMSNorm = transformers.models.llama.modeling_llama.LlamaRMSNorm + except: + LlamaRMSNorm = None + if child.__class__ in [nn.Linear, nn.Embedding, nn.LayerNorm, LlamaRMSNorm] and state_dict != None: + if child.__class__ is LlamaRMSNorm and not child.weight.is_meta: + continue if any(checking_key in item for item in state_dict): load(child, state_dict, checking_key, mp_group) else: @@ -786,6 +793,11 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No if embedding_weight is not None and hasattr(replaced_module, "lm_head") and hasattr( replaced_module.lm_head, "weight") and replaced_module.lm_head.weight.is_meta: replaced_module.lm_head.weight = embedding_weight + if sd is not None: + if 'lm_head.weight' in sd.keys() and hasattr(replaced_module, 'lm_head'): + replaced_module.lm_head.weight = torch.nn.parameter.Parameter(data=torch.empty_like(sd['lm_head.weight'].data, device="cpu"), + requires_grad=sd['lm_head.weight'].data.requires_grad) + replaced_module.lm_head.weight.data.copy_(sd['lm_head.weight']) return replaced_module @@ -831,7 +843,12 @@ def _replace_module(model, policies, prefix='', layer_id=0, level_id=0, state_di OPTLearnedPositionalEmbedding = transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding except: OPTLearnedPositionalEmbedding = None - load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm, OPTLearnedPositionalEmbedding] + try: + import transformers + LlamaRMSNorm = transformers.models.llama.modeling_llama.LlamaRMSNorm + except: + LlamaRMSNorm = None + load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm, OPTLearnedPositionalEmbedding, LlamaRMSNorm] for name, child in model.named_children(): if child.__class__ in policies: replaced_module = policies[child.__class__][0](child, From e86fc8302c8f1d55e368d5d8d93e23bc47f4d149 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Mon, 10 Jul 2023 14:06:45 +0800 Subject: [PATCH 34/74] fix llama meta data error when model device is meta and LLaMa lm_head --- deepspeed/module_inject/replace_module.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 97949f612e89..2a247c0dd2d8 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -461,7 +461,14 @@ def _replace_module(r_module, prev_name='', prev_class_name=''): else: class_name = prev_class_name + '.' + prev_name checking_key = prefix + '.' + class_name + '.' + name + '.' if class_name != "" else prefix + '.' + name + '.' 
- if child.__class__ in [nn.Linear, nn.Embedding, nn.LayerNorm] and state_dict != None: + try: + import transformers + LlamaRMSNorm = transformers.models.llama.modeling_llama.LlamaRMSNorm + except: + LlamaRMSNorm = None + if child.__class__ in [nn.Linear, nn.Embedding, nn.LayerNorm, LlamaRMSNorm] and state_dict != None: + if child.__class__ is LlamaRMSNorm and not child.weight.is_meta: + continue if any(checking_key in item for item in state_dict): load(child, state_dict, checking_key, mp_group) else: @@ -772,6 +779,12 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No if embedding_weight is not None and hasattr(replaced_module, "lm_head") and hasattr( replaced_module.lm_head, "weight") and replaced_module.lm_head.weight.is_meta: replaced_module.lm_head.weight = embedding_weight + if sd is not None: + if 'lm_head.weight' in sd.keys() and hasattr(replaced_module, 'lm_head'): + replaced_module.lm_head.weight = torch.nn.parameter.Parameter( + data=torch.empty_like(sd['lm_head.weight'].data, device="cpu"), + requires_grad=sd['lm_head.weight'].data.requires_grad) + replaced_module.lm_head.weight.data.copy_(sd['lm_head.weight']) return replaced_module @@ -820,7 +833,12 @@ def _replace_module(model, policies, prefix='', layer_id=0, level_id=0, state_di OPTLearnedPositionalEmbedding = transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding except: OPTLearnedPositionalEmbedding = None - load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm, OPTLearnedPositionalEmbedding] + try: + import transformers + LlamaRMSNorm = transformers.models.llama.modeling_llama.LlamaRMSNorm + except: + LlamaRMSNorm = None + load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm, OPTLearnedPositionalEmbedding, LlamaRMSNorm] for name, child in model.named_children(): if child.__class__ in policies: replaced_module = policies[child.__class__][0](child, From 0be3b7eea1dc7b48bed02da3e696a9381218c348 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Tue, 11 Jul 2023 05:11:25 -0400 Subject: [PATCH 35/74] support FP32 SHM allreduce --- csrc/cpu/comm/ccl.cpp | 182 +++++++++++++++++++++++++++++------------- 1 file changed, 126 insertions(+), 56 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 6f22b580118c..0ea15fe16fdc 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -11,10 +11,10 @@ #include #include #include -#include +#include #include +#include #include -#include // SHM building blocks struct SharedData { @@ -106,31 +106,70 @@ inline __m256i cvt_fp32_to_bf16(const __m512 src) void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in) __attribute__((target("avx512bw"))); -void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) __attribute__((target("avx512bw"))); +void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) + __attribute__((target("avx512bw"))); + +void reduce_2_f32_buffers(int num_elements, void* in_out, void* in) + __attribute__((target("avx512bw"))); + +void reduce_f32_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) + __attribute__((target("avx512bw"))); -void reduce_all_bf16_buffers(struct allreduce_workspace* workspace, int num_elements, int num_buffers) +void reduce_all_buffers(struct allreduce_workspace* workspace, + int num_elements, + c10::ScalarType scalar_type, + int num_buffers) { - if (num_buffers >=3 && num_buffers <=8) { - reduce_bf16_buffers(num_elements, num_buffers, workspace); - } 
else { - for (int i = 1; i < num_buffers; i++) { - reduce_2_bf16_buffers(num_elements, workspace[0].buffer, workspace[i].buffer); - } + switch (scalar_type) { + case c10::ScalarType::BFloat16: + if (num_buffers >= 3 && num_buffers <= 8) { + reduce_bf16_buffers(num_elements, num_buffers, workspace); + } else { + for (int i = 1; i < num_buffers; i++) { + reduce_2_bf16_buffers(num_elements, workspace[0].buffer, workspace[i].buffer); + } + } + break; + case c10::ScalarType::Float: + if (num_buffers >= 3 && num_buffers <= 8) { + reduce_f32_buffers(num_elements, num_buffers, workspace); + } else { + for (int i = 1; i < num_buffers; i++) { + reduce_2_f32_buffers(num_elements, workspace[0].buffer, workspace[i].buffer); + } + } + break; + default: assert(!"Should not get here"); } } #define REPEAT(N, x) REPEAT_##N(x) #define REPEAT_1(x) x(1) -#define REPEAT_2(x) REPEAT_1(x);x(2) -#define REPEAT_3(x) REPEAT_2(x);x(3) -#define REPEAT_4(x) REPEAT_3(x);x(4) -#define REPEAT_5(x) REPEAT_4(x);x(5) -#define REPEAT_6(x) REPEAT_5(x);x(6) -#define REPEAT_7(x) REPEAT_6(x);x(7) - -#define CVT_ADD(x) do {\ - auto in##x##_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[x].buffer + i))); \ - inout_val = _mm512_add_ps(inout_val, in##x##_val);} while(0) +#define REPEAT_2(x) \ + REPEAT_1(x); \ + x(2) +#define REPEAT_3(x) \ + REPEAT_2(x); \ + x(3) +#define REPEAT_4(x) \ + REPEAT_3(x); \ + x(4) +#define REPEAT_5(x) \ + REPEAT_4(x); \ + x(5) +#define REPEAT_6(x) \ + REPEAT_5(x); \ + x(6) +#define REPEAT_7(x) \ + REPEAT_6(x); \ + x(7) + +#define CVT_ADD_BF16(x) \ + do { \ + auto in##x##_val = \ + cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[x].buffer + i))); \ + inout_val = _mm512_add_ps(inout_val, in##x##_val); \ + } while (0) // num_elements must be divisible by 16 (caller check) void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) @@ -138,26 +177,13 @@ void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_wor for (int i = 0; i < num_elements * 2; i += 32) { auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[0].buffer + i))); switch (num_buffers) { - case 8: - REPEAT(7, CVT_ADD); - break; - case 7: - REPEAT(6, CVT_ADD); - break; - case 6: - REPEAT(5, CVT_ADD); - break; - case 5: - REPEAT(4, CVT_ADD); - break; - case 4: - REPEAT(3, CVT_ADD); - break; - case 3: - REPEAT(2, CVT_ADD); - break; - default: - assert(!"Should not get here."); + case 8: REPEAT(7, CVT_ADD_BF16); break; + case 7: REPEAT(6, CVT_ADD_BF16); break; + case 6: REPEAT(5, CVT_ADD_BF16); break; + case 5: REPEAT(4, CVT_ADD_BF16); break; + case 4: REPEAT(3, CVT_ADD_BF16); break; + case 3: REPEAT(2, CVT_ADD_BF16); break; + default: assert(!"Should not get here."); } _mm256_storeu_si256((__m256i*)(workspace[0].buffer + i), cvt_fp32_to_bf16(inout_val)); } @@ -173,6 +199,40 @@ void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in1) } } +#define CVT_ADD_F32(x) \ + do { \ + auto in##x##_val = _mm256_loadu_ps((float*)(workspace[x].buffer + i)); \ + inout_val = _mm256_add_ps(inout_val, in##x##_val); \ + } while (0) + +// num_elements must be divisible by 16 (caller check) +void reduce_f32_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) +{ + for (int i = 0; i < num_elements * 4; i += 32) { + auto inout_val = _mm256_loadu_ps((float*)(workspace[0].buffer + i)); + switch (num_buffers) { + case 8: REPEAT(7, CVT_ADD_F32); break; + case 7: REPEAT(6, CVT_ADD_F32); break; + case 6: REPEAT(5, CVT_ADD_F32); 
break; + case 5: REPEAT(4, CVT_ADD_F32); break; + case 4: REPEAT(3, CVT_ADD_F32); break; + case 3: REPEAT(2, CVT_ADD_F32); break; + default: assert(!"Should not get here."); + } + _mm256_storeu_ps((float*)(workspace[0].buffer + i), inout_val); + } +} + +void reduce_2_f32_buffers(int num_elements, void* in_out, void* in1) +{ + for (int i = 0; i < num_elements * 4; i += 32) { + auto inout_val = _mm256_loadu_ps((float*)((char*)in_out + i)); + auto in1_val = _mm256_loadu_ps((float*)((char*)in1 + i)); + inout_val = _mm256_add_ps(inout_val, in1_val); + _mm256_storeu_ps((float*)((char*)in_out + i), inout_val); + } +} + // Communicatiooon settings int world_rank = -1; int world_size = -1; @@ -202,15 +262,13 @@ void initialize(int size, int rank, torch::Tensor& kvs_data) { if (is_initialized) return; - // Check whetehr all ranks is on the same physical machine. + // Check whether all ranks is on the same physical machine. // If true, we will use an SHM based low latency allreduce int ws = std::stoi(std::getenv("WORLD_SIZE")); int ls = std::stoi(std::getenv("LOCAL_SIZE")); - if (ws >= 1 && ws == ls) { - all_ranks_local_p = true; - } + if (ws >= 1 && ws == ls) { all_ranks_local_p = true; } world_size = size; world_rank = rank; @@ -228,15 +286,19 @@ void initialize(int size, int rank, torch::Tensor& kvs_data) // create shared workspace for SHM based allreduce if (all_ranks_local_p) { if (rank == 0) { - workspace = (struct allreduce_workspace*)malloc(size * sizeof(struct allreduce_workspace)); - shared_create( - &allreduce_buffer, SHM_BUFFER_NAME, workspace, size * sizeof(struct allreduce_workspace)); + workspace = + (struct allreduce_workspace*)malloc(size * sizeof(struct allreduce_workspace)); + shared_create(&allreduce_buffer, + SHM_BUFFER_NAME, + workspace, + size * sizeof(struct allreduce_workspace)); workspace = (struct allreduce_workspace*)allreduce_buffer.bytes; for (int i = 0; i < size; i++) { workspace[i].state = 0; } } CCLCHECK(ccl::barrier(_get_comm_from_group()).wait()); if (rank != 0) { - shared_open(&allreduce_buffer, SHM_BUFFER_NAME, size * sizeof(struct allreduce_workspace)); + shared_open( + &allreduce_buffer, SHM_BUFFER_NAME, size * sizeof(struct allreduce_workspace)); } workspace = (struct allreduce_workspace*)allreduce_buffer.bytes; } @@ -397,10 +459,18 @@ void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group auto numel = data.numel(); - if (numel * 2 > MAX_BUF_SIZE - || (numel % 16) != 0 - || data.scalar_type() != c10::ScalarType::BFloat16 - || !all_ranks_local_p) { + int data_size = 0; + bool data_type_fallback = false; + + switch (data.scalar_type()) { + case c10::ScalarType::BFloat16: data_size = numel * 2; break; + case c10::ScalarType::Float: data_size = numel * 4; break; + default: data_type_fallback = true; + } + + if (data_size > MAX_BUF_SIZE || (numel % 16) != 0 || + data_type_fallback || + !all_ranks_local_p) { // fallback to oneccl allreduce CCLCHECK(ccl::allreduce(data.data_ptr(), data.data_ptr(), @@ -414,7 +484,7 @@ void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group auto data_ptr = data.data_ptr(); - memcpy(workspace[world_rank].buffer, data_ptr, numel * 2); + memcpy(workspace[world_rank].buffer, data_ptr, data_size); std::atomic_thread_fence(std::memory_order_release); workspace[world_rank].state = 1; @@ -424,14 +494,14 @@ void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group // wait until the other rank copy the buffer wait_buffer_state_until(i, 1); } - 
reduce_all_bf16_buffers(workspace, numel, world_size); + reduce_all_buffers(workspace, numel, data.scalar_type(), world_size); std::atomic_thread_fence(std::memory_order_release); workspace[world_rank].state = 2; - memcpy(data_ptr, workspace[0].buffer, numel * 2); + memcpy(data_ptr, workspace[0].buffer, data_size); } if (world_rank != 0) { wait_buffer_state_until(0, 2); - memcpy(data_ptr, workspace[0].buffer, numel * 2); + memcpy(data_ptr, workspace[0].buffer, data_size); std::atomic_thread_fence(std::memory_order_release); workspace[world_rank].state = 2; } From 0706acd78dadaed07278d4ca6d883939c157098f Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 20 Jul 2023 05:41:07 -0400 Subject: [PATCH 36/74] allow number of heads not divisible by number of ranks --- deepspeed/module_inject/replace_module.py | 38 +++++++++++++++-------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 4c488e3bf2cd..9c6e6336bc25 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -304,6 +304,9 @@ def replace_transformer_layer(orig_layer_impl, model, checkpoint_dict, config, m seed = -1 local_rank = -1 + global num_heads + num_heads = -1 + mp_replace = ReplaceWithTensorSlicing(mp_group=config.tensor_parallel.tp_group, mp_size=config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1) @@ -364,6 +367,11 @@ def replace_with_policy(child, policy_cls, triangular_masking, inference=False, return _container.module + def get_shard_size(total_size, num_slices): + num_units = num_heads + my_slices = num_units // num_slices + (1 if dist.get_rank() < (num_units % num_slices) else 0) + return total_size // num_units * my_slices + def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): mp_size = config.tensor_parallel.tp_size mp_group = config.tensor_parallel.tp_group @@ -374,12 +382,11 @@ def _replace(child, name, conv_linear_layer): mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group) weight_shape = child.weight.shape if name in all_reduce_linears: - new_weight = torch.empty(( - weight_shape[1] if conv_linear_layer else weight_shape[0], - (weight_shape[0] if conv_linear_layer else weight_shape[1]) // mp_size, - ), - device=child.weight.device, - dtype=child.weight.dtype) + new_weight = torch.empty( + (weight_shape[1] if conv_linear_layer else weight_shape[0], + get_shard_size(weight_shape[0] if conv_linear_layer else weight_shape[1], mp_size)), + device=child.weight.device, + dtype=child.weight.dtype) if conv_linear_layer: child.weight.data = child.weight.data.transpose(-1, -2).contiguous() data = mp_replace.copy(new_weight, child.weight.data) @@ -391,8 +398,8 @@ def _replace(child, name, conv_linear_layer): torch.nn.parameter.Parameter(new_bias.to(get_accelerator().current_device_name())), mp_group) else: new_weight = torch.empty(( - (weight_shape[1] if conv_linear_layer else weight_shape[0]) // mp_size, - weight_shape[0] // mp_size if conv_linear_layer else weight_shape[1], + get_shard_size(weight_shape[1] if conv_linear_layer else weight_shape[0], mp_size), + get_shard_size(weight_shape[0], mp_size) if conv_linear_layer else weight_shape[1], ), device=child.weight.device, dtype=child.weight.dtype) @@ -400,7 +407,7 @@ def _replace(child, name, conv_linear_layer): child.weight.data = child.weight.data.transpose(-1, -2).contiguous() data = mp_replace.copy(new_weight, child.weight.data) - new_bias = torch.empty((weight_shape[0] // mp_size), + 
new_bias = torch.empty(get_shard_size(weight_shape[0], mp_size), device=child.weight.device, dtype=child.weight.dtype) bias_data = None if child.bias is None else mp_replace.copy(new_bias, child.bias.data).to( @@ -412,13 +419,13 @@ def _slice_embedding(child, name, conv_linear_layer): if getattr(child, "replaced", False) == True: return mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group) - new_weight = torch.empty((child.weight.shape[0], child.weight.shape[1] // mp_size), + new_weight = torch.empty((child.weight.shape[0], get_shard_size(child.weight.shape[1], mp_size)), device=child.weight.device, dtype=child.weight.dtype) data = mp_replace.copy(new_weight, child.weight.ds_tensor.data if hasattr(child.weight, 'ds_tensor') else \ child.weight.data) - new_embedding = nn.Embedding(child.weight.shape[0], child.weight.shape[1] // mp_size) + new_embedding = nn.Embedding(child.weight.shape[0], get_shard_size(child.weight.shape[1], mp_size)) new_embedding.weight.data.copy_(data) setattr(child, "replaced", True) return new_embedding @@ -432,8 +439,13 @@ def update_mp_params(child): ]: if hasattr(child, param): param_val = getattr(child, param) - assert param_val % mp_size == 0, f"{param} ({param_val}) must be divisible by mp_size ({mp_size})" - setattr(child, param, param_val // mp_size) + if param in ["n_heads", "num_heads", "num_kv", "num_attention_heads", "num_attn_heads"]: + global num_heads + num_heads = param_val + setattr(child, param, param_val // mp_size + (1 if dist.get_rank() < + (param_val % mp_size) else 0)) + else: + setattr(child, param, get_shard_size(param_val, mp_size)) setattr(child, "replaced", True) conv_linear_layer = False From 43d2c67bfc68ddca308e28b491d44f9f34bd1593 Mon Sep 17 00:00:00 2001 From: baodi Date: Fri, 21 Jul 2023 10:11:25 +0800 Subject: [PATCH 37/74] Update replace_module.py delete lm_head weight load part --- deepspeed/module_inject/replace_module.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 1ea89b596dbf..cf90758234dd 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -793,11 +793,7 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No if embedding_weight is not None and hasattr(replaced_module, "lm_head") and hasattr( replaced_module.lm_head, "weight") and replaced_module.lm_head.weight.is_meta: replaced_module.lm_head.weight = embedding_weight - if sd is not None: - if 'lm_head.weight' in sd.keys() and hasattr(replaced_module, 'lm_head'): - replaced_module.lm_head.weight = torch.nn.parameter.Parameter(data=torch.empty_like(sd['lm_head.weight'].data, device="cpu"), - requires_grad=sd['lm_head.weight'].data.requires_grad) - replaced_module.lm_head.weight.data.copy_(sd['lm_head.weight']) + return replaced_module From 1e90f03c506464e1f7fc578a65f9426566a7e747 Mon Sep 17 00:00:00 2001 From: baodi Date: Fri, 21 Jul 2023 11:11:57 +0800 Subject: [PATCH 38/74] Update replace_module.py delete debug code --- deepspeed/module_inject/replace_module.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index cf90758234dd..7ca4b7165317 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -491,8 +491,6 @@ def _replace_module(r_module, prev_name='', prev_class_name=''): except: LlamaRMSNorm = None if child.__class__ in [nn.Linear, 
nn.Embedding, nn.LayerNorm, LlamaRMSNorm] and state_dict != None: - if child.__class__ is LlamaRMSNorm and not child.weight.is_meta: - continue if any(checking_key in item for item in state_dict): load(child, state_dict, checking_key, mp_group) else: From 0bf785f893e1ef9c3acf140553cff93eb861b4d0 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 21 Jul 2023 01:18:32 -0400 Subject: [PATCH 39/74] get num_heads from model config, more robust --- deepspeed/module_inject/replace_module.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 9c6e6336bc25..9843e30fb113 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -304,9 +304,6 @@ def replace_transformer_layer(orig_layer_impl, model, checkpoint_dict, config, m seed = -1 local_rank = -1 - global num_heads - num_heads = -1 - mp_replace = ReplaceWithTensorSlicing(mp_group=config.tensor_parallel.tp_group, mp_size=config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1) @@ -368,7 +365,7 @@ def replace_with_policy(child, policy_cls, triangular_masking, inference=False, return _container.module def get_shard_size(total_size, num_slices): - num_units = num_heads + num_units = model_config.num_attention_heads my_slices = num_units // num_slices + (1 if dist.get_rank() < (num_units % num_slices) else 0) return total_size // num_units * my_slices @@ -440,8 +437,6 @@ def update_mp_params(child): if hasattr(child, param): param_val = getattr(child, param) if param in ["n_heads", "num_heads", "num_kv", "num_attention_heads", "num_attn_heads"]: - global num_heads - num_heads = param_val setattr(child, param, param_val // mp_size + (1 if dist.get_rank() < (param_val % mp_size) else 0)) else: From 72b9e1addb8b932e4862fdcbe948e188b0e4aaa5 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 21 Jul 2023 01:23:07 -0400 Subject: [PATCH 40/74] simplify logic where num_head itself is sharded --- deepspeed/module_inject/replace_module.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 9843e30fb113..cb4365cd77b1 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -436,11 +436,7 @@ def update_mp_params(child): ]: if hasattr(child, param): param_val = getattr(child, param) - if param in ["n_heads", "num_heads", "num_kv", "num_attention_heads", "num_attn_heads"]: - setattr(child, param, param_val // mp_size + (1 if dist.get_rank() < - (param_val % mp_size) else 0)) - else: - setattr(child, param, get_shard_size(param_val, mp_size)) + setattr(child, param, get_shard_size(param_val, mp_size)) setattr(child, "replaced", True) conv_linear_layer = False From 5ed9a5662c1464e28933b2267c716dd353c7b8a2 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 21 Jul 2023 01:32:15 -0400 Subject: [PATCH 41/74] name tweaks --- deepspeed/module_inject/replace_module.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index cb4365cd77b1..238cffd79711 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -365,9 +365,9 @@ def replace_with_policy(child, policy_cls, triangular_masking, inference=False, return _container.module def get_shard_size(total_size, num_slices): - num_units = 
model_config.num_attention_heads - my_slices = num_units // num_slices + (1 if dist.get_rank() < (num_units % num_slices) else 0) - return total_size // num_units * my_slices + num_heads = model_config.num_attention_heads + my_slices = num_heads // num_slices + (1 if dist.get_rank() < (num_heads % num_slices) else 0) + return total_size // num_heads * my_slices def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): mp_size = config.tensor_parallel.tp_size From 73f499d9226a10f6304b9d8cbd425e4d7aac0912 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 21 Jul 2023 01:39:09 -0400 Subject: [PATCH 42/74] make code more robust where num_attention_heads may not be defined in model_config --- deepspeed/module_inject/replace_module.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 238cffd79711..0b937bfe1928 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -365,9 +365,15 @@ def replace_with_policy(child, policy_cls, triangular_masking, inference=False, return _container.module def get_shard_size(total_size, num_slices): - num_heads = model_config.num_attention_heads - my_slices = num_heads // num_slices + (1 if dist.get_rank() < (num_heads % num_slices) else 0) - return total_size // num_heads * my_slices + if hasattr(model_config, 'num_attention_heads'): + num_heads = model_config.num_attention_heads + my_slices = num_heads // num_slices + (1 if dist.get_rank() < (num_heads % num_slices) else 0) + return total_size // num_heads * my_slices + else: + if total_size % num_slices == 0: + return total_size // num_slices + else: + assert False, f"Number of attention heads ({total_size}) must be divisible by mp_size ({num_slices})" def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): mp_size = config.tensor_parallel.tp_size From 12c0628631da848ee61ec75ad61c62431513db03 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Mon, 24 Jul 2023 22:39:28 -0400 Subject: [PATCH 43/74] support num_key_value_heads < num_attention_heads which is used by llama2 --- deepspeed/module_inject/replace_module.py | 26 ++++++++++++++++------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 0b937bfe1928..7bcd059496b3 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -364,16 +364,26 @@ def replace_with_policy(child, policy_cls, triangular_masking, inference=False, return _container.module - def get_shard_size(total_size, num_slices): - if hasattr(model_config, 'num_attention_heads'): - num_heads = model_config.num_attention_heads - my_slices = num_heads // num_slices + (1 if dist.get_rank() < (num_heads % num_slices) else 0) - return total_size // num_heads * my_slices + def get_shard_size(total_size, mp_size): + num_kv_heads = None + + # 1. Try to get num_key_heads from model_config.num_key_value_heads + if hasattr(model_config, 'num_key_value_heads'): + num_kv_heads = model_config.num_key_value_heads + + # 2. Fallback to model_config.num_attention_heads when necessary + if num_kv_heads == None and hasattr(model_config, 'num_attention_heads'): + num_kv_heads = model_config.num_attention_heads + + # 3. 
When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division + if num_kv_heads != None: + my_slices = num_kv_heads // mp_size + (1 if dist.get_rank() < (num_kv_heads % mp_size) else 0) + return total_size // num_kv_heads * my_slices else: - if total_size % num_slices == 0: - return total_size // num_slices + if total_size % mp_size == 0: + return total_size // mp_size else: - assert False, f"Number of attention heads ({total_size}) must be divisible by mp_size ({num_slices})" + assert False, f"Number of attention heads ({total_size}) must be divisible by mp_size ({mp_size})" def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): mp_size = config.tensor_parallel.tp_size From 8f23d9bfebb59dd7046edfc8a4a978124f9276fc Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Mon, 24 Jul 2023 23:43:04 -0400 Subject: [PATCH 44/74] add test for 5 ranks --- tests/unit/inference/test_inference.py | 31 ++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 4209bfa02ab4..50bf8bc661fa 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -519,6 +519,37 @@ def test( print(local_rank, "deepspeed", ds_output) assert assert_fn(bs_output, ds_output) + @pytest.mark.world_size(5) + def test_odd_world_size( + self, + model_w_task, + query, + inf_kwargs, + assert_fn, + dtype, + ): + invalid_test_msg = validate_test(model_w_task, dtype, enable_cuda_graph=False, enable_triton=False) + if invalid_test_msg: + pytest.skip(invalid_test_msg) + + model, task = model_w_task + local_rank = int(os.getenv("LOCAL_RANK", "0")) + world_size = int(os.getenv("WORLD_SIZE", "5")) + + # We have to load these large models on CPU with pipeline because not + # enough GPU memory + pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt") + bs_output = pipe(query, **inf_kwargs) + + pipe.model = deepspeed.init_inference(pipe.model, mp_size=world_size, dtype=dtype) + # Switch device to GPU so that input tensors are not on CPU + pipe.device = torch.device(get_accelerator().device_name(local_rank)) + ds_output = pipe(query, **inf_kwargs) + + print(local_rank, "baseline", bs_output) + print(local_rank, "deepspeed", ds_output) + assert assert_fn(bs_output, ds_output) + @pytest.mark.nightly @pytest.mark.parametrize( From 9c53bd74e314c0361c77a12f82a5a256892a4a8d Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Tue, 25 Jul 2023 00:54:04 -0400 Subject: [PATCH 45/74] change odd rank # to 3 to avoid test skip --- tests/unit/inference/test_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 50bf8bc661fa..28a458422e4f 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -519,7 +519,7 @@ def test( print(local_rank, "deepspeed", ds_output) assert assert_fn(bs_output, ds_output) - @pytest.mark.world_size(5) + @pytest.mark.world_size(3) def test_odd_world_size( self, model_w_task, @@ -534,7 +534,7 @@ def test_odd_world_size( model, task = model_w_task local_rank = int(os.getenv("LOCAL_RANK", "0")) - world_size = int(os.getenv("WORLD_SIZE", "5")) + world_size = int(os.getenv("WORLD_SIZE", "3")) # We have to load these large models on CPU with pipeline because not # enough GPU memory From a04fa977e906799f6a147f83fcdeb2eec2538ad8 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" 
Date: Wed, 26 Jul 2023 20:49:56 -0400 Subject: [PATCH 46/74] Run SHM allreduce's reduce kernel with openmp to further improve perf. 10KB message: 15us -> 12us --- csrc/cpu/comm/ccl.cpp | 10 ++++++---- op_builder/cpu/comm.py | 3 +++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 0ea15fe16fdc..b110c408aeae 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -174,6 +174,7 @@ void reduce_all_buffers(struct allreduce_workspace* workspace, // num_elements must be divisible by 16 (caller check) void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) { +#pragma omp parallel for for (int i = 0; i < num_elements * 2; i += 32) { auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[0].buffer + i))); switch (num_buffers) { @@ -191,6 +192,7 @@ void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_wor void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in1) { +#pragma omp parallel for for (int i = 0; i < num_elements * 2; i += 32) { auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)((char*)in_out + i))); auto in1_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)((char*)in1 + i))); @@ -208,6 +210,7 @@ void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in1) // num_elements must be divisible by 16 (caller check) void reduce_f32_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) { +#pragma omp parallel for for (int i = 0; i < num_elements * 4; i += 32) { auto inout_val = _mm256_loadu_ps((float*)(workspace[0].buffer + i)); switch (num_buffers) { @@ -225,6 +228,7 @@ void reduce_f32_buffers(int num_elements, int num_buffers, struct allreduce_work void reduce_2_f32_buffers(int num_elements, void* in_out, void* in1) { +#pragma omp parallel for for (int i = 0; i < num_elements * 4; i += 32) { auto inout_val = _mm256_loadu_ps((float*)((char*)in_out + i)); auto in1_val = _mm256_loadu_ps((float*)((char*)in1 + i)); @@ -455,7 +459,7 @@ void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group static py::object ReduceOp = py::module_::import("deepspeed.comm").attr("ReduceOp"); static auto ReduceOpSum = (int)py::int_(ReduceOp.attr("SUM").attr("value")); - assert (py::int_(op.attr("value")) == ReduceOpSum); + assert(py::int_(op.attr("value")) == ReduceOpSum); auto numel = data.numel(); @@ -468,9 +472,7 @@ void all_reduce_low_latency(torch::Tensor& data, py::object op, py::object group default: data_type_fallback = true; } - if (data_size > MAX_BUF_SIZE || (numel % 16) != 0 || - data_type_fallback || - !all_ranks_local_p) { + if (data_size > MAX_BUF_SIZE || (numel % 16) != 0 || data_type_fallback || !all_ranks_local_p) { // fallback to oneccl allreduce CCLCHECK(ccl::allreduce(data.data_ptr(), data.data_ptr(), diff --git a/op_builder/cpu/comm.py b/op_builder/cpu/comm.py index c076ee48376d..ec908eb0622b 100644 --- a/op_builder/cpu/comm.py +++ b/op_builder/cpu/comm.py @@ -25,6 +25,9 @@ def include_paths(self): includes = ['csrc/cpu/includes'] return includes + def cxx_args(self): + return ['-O2', '-fopenmp'] + def is_compatible(self, verbose=True): # TODO: add soft compatibility check for private binary release. # a soft check, as in we know it can be trivially changed. 
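Note on the sharding scheme introduced by the patches that follow: instead of requiring every tensor-parallel dimension to be divisible by the world size, shard sizes are derived from the number of (key-value) attention heads, with the first num_kv_heads % mp_size ranks each taking one extra head; a rank's slice offset is then the prefix sum of the per-rank sizes. The snippet below is a minimal, self-contained sketch of that policy for illustration only, not the DeepSpeed code itself: it takes num_kv_heads and rank as explicit arguments, whereas the helper added later in deepspeed/utils/tp_shard.py reads them from a module-level global and dist.get_rank(), and the example numbers (hidden size 4096, 32 heads, 3 ranks) are arbitrary.

    # Sketch only: uneven, head-based tensor-parallel sharding.
    def get_shard_size(total_size, mp_size, num_kv_heads=None, rank=0):
        if num_kv_heads is not None:
            # Ranks below (num_kv_heads % mp_size) own one extra head.
            my_heads = num_kv_heads // mp_size + (1 if rank < num_kv_heads % mp_size else 0)
            return total_size * my_heads // num_kv_heads
        # Without head information, fall back to strict even division.
        assert total_size % mp_size == 0, \
            f"total_size ({total_size}) must be divisible by mp_size ({mp_size})"
        return total_size // mp_size

    def get_shard_size_list(total_size, mp_size, num_kv_heads=None):
        return [get_shard_size(total_size, mp_size, num_kv_heads, r) for r in range(mp_size)]

    # Example: 4096 hidden units over 3 ranks with 32 heads -> [1408, 1408, 1280];
    # the offset of rank r is sum(get_shard_size_list(...)[:r]), which is how the
    # later patches slice the alibi tensor, the attention bias and the lm_head input.
    print(get_shard_size_list(4096, 3, num_kv_heads=32))
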
From 27fde308692e267b1e21211d495ad6b68c04533d Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 9 Aug 2023 10:17:01 -0400 Subject: [PATCH 47/74] add get_shard_size function --- deepspeed/module_inject/auto_tp.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index 8c1f78d0cfec..c6768d605c7a 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -402,6 +402,27 @@ def update_linear_policies(self): else: self.linear_policies = {nn.Linear: self._replace, nn.Embedding: self._slice_embedding} + def get_shard_size(total_size, mp_size): + num_kv_heads = None + + # 1. Try to get num_key_heads from model_config.num_key_value_heads + if hasattr(model_config, 'num_key_value_heads'): + num_kv_heads = model_config.num_key_value_heads + + # 2. Fallback to model_config.num_attention_heads when necessary + if num_kv_heads == None and hasattr(model_config, 'num_attention_heads'): + num_kv_heads = model_config.num_attention_heads + + # 3. When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division + if num_kv_heads != None: + my_slices = num_kv_heads // mp_size + (1 if dist.get_rank() < (num_kv_heads % mp_size) else 0) + return total_size // num_kv_heads * my_slices + else: + if total_size % mp_size == 0: + return total_size // mp_size + else: + assert False, f"Number of attention heads ({total_size}) must be divisible by mp_size ({mp_size})" + def _replace_module(self, r_module, prev_name='', prev_class_name=''): for name, child in r_module.named_children(): if prev_class_name == "": From 8e1fd277b026e0aa17bbcc9196ffdcd571425130 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 10 Aug 2023 02:56:07 -0400 Subject: [PATCH 48/74] modify sharding mechanism according to latest auto TP --- deepspeed/module_inject/auto_tp.py | 40 ++++++----------------- deepspeed/module_inject/fusedqkv_utils.py | 9 ++--- deepspeed/module_inject/replace_module.py | 38 +++++++++------------ deepspeed/utils/tp_shard.py | 18 ++++++++++ 4 files changed, 48 insertions(+), 57 deletions(-) create mode 100644 deepspeed/utils/tp_shard.py diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index c6768d605c7a..039b9892a6bd 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -14,6 +14,7 @@ from .layers import LinearAllreduce, LinearLayer from deepspeed.accelerator import get_accelerator from .fusedqkv_utils import require_tp_fused_qkvw, prepare_tp_fused_qkvw +from deepspeed.utils.tp_shard import get_shard_size class ReplaceWithTensorSlicing: @@ -300,7 +301,7 @@ def _replace(self, child, name, conv_linear_layer): # MPT block qkv weight's allocation is different from other models, it's [3,num_head,head_dim,hidden_size] # instead of [num_head,3,head_dim,hidden_size] new_weight = torch.empty(( - weight_shape[0] // self.mp_size, + get_shard_size(weight_shape[0], self.mp_size), weight_shape[1], ), device=child.weight.device, @@ -319,7 +320,7 @@ def _replace(self, child, name, conv_linear_layer): if self.conv_linear_layer: child.weight.data = child.weight.data.transpose(-1, -2).contiguous() data = child.weight.data.split( - (weight_shape[0] if self.conv_linear_layer else weight_shape[1]) // self.mp_size, dim=1) + get_shard_size(weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size), dim=1) data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) 
setattr(child, "replaced", True) @@ -342,13 +343,13 @@ def _replace(self, child, name, conv_linear_layer): module_str, child.bias.data, self.mp_size, mp_replace.gpu_index).to( get_accelerator().current_device_name()) else: - data = child.weight.data.split((weight_shape[0]) // self.mp_size, + data = child.weight.data.split(get_shard_size(weight_shape[0], self.mp_size), dim=1 if self.conv_linear_layer else 0) data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) if child.bias is not None: bias_data = child.bias.data.split( - (weight_shape[1] if self.conv_linear_layer else weight_shape[0]) // self.mp_size, dim=0) + get_shard_size(weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size), dim=0) bias_data = bias_data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) else: bias_data = None @@ -362,12 +363,12 @@ def _slice_embedding(self, child, name, conv_linear_layer): mp_replace = ReplaceWithTensorSlicing(mp_group=self.mp_group) if hasattr(child.weight, 'ds_tensor'): - data = child.weight.ds_tensor.data.split(child.weight.shape[1] // self.mp_size, dim=1) + data = child.weight.ds_tensor.data.split(get_shard_size(child.weight.shape[1], self.mp_size), dim=1) else: - data = child.weight.data.split(child.weight.shape[1] // self.mp_size, dim=1) + data = child.weight.data.split(get_shard_size(child.weight.shape[1], self.mp_size), dim=1) data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) - new_embedding = nn.Embedding(child.weight.shape[0], child.weight.shape[1] // self.mp_size) + new_embedding = nn.Embedding(child.weight.shape[0], get_shard_size(child.weight.shape[1], self.mp_size)) new_embedding.weight.data.copy_(data) setattr(child, "replaced", True) return new_embedding @@ -381,8 +382,8 @@ def update_mp_params(self, child): ]: if hasattr(child, param): param_val = getattr(child, param) - assert param_val % self.mp_size == 0, f"{param} ({param_val}) must be divisible by mp_size ({self.mp_size})" - setattr(child, param, param_val // self.mp_size) + #assert param_val % self.mp_size == 0, f"{param} ({param_val}) must be divisible by mp_size ({self.mp_size})" + setattr(child, param, get_shard_size(param_val, self.mp_size)) setattr(child, "replaced", True) def update_linear_policies(self): @@ -402,27 +403,6 @@ def update_linear_policies(self): else: self.linear_policies = {nn.Linear: self._replace, nn.Embedding: self._slice_embedding} - def get_shard_size(total_size, mp_size): - num_kv_heads = None - - # 1. Try to get num_key_heads from model_config.num_key_value_heads - if hasattr(model_config, 'num_key_value_heads'): - num_kv_heads = model_config.num_key_value_heads - - # 2. Fallback to model_config.num_attention_heads when necessary - if num_kv_heads == None and hasattr(model_config, 'num_attention_heads'): - num_kv_heads = model_config.num_attention_heads - - # 3. 
When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division - if num_kv_heads != None: - my_slices = num_kv_heads // mp_size + (1 if dist.get_rank() < (num_kv_heads % mp_size) else 0) - return total_size // num_kv_heads * my_slices - else: - if total_size % mp_size == 0: - return total_size // mp_size - else: - assert False, f"Number of attention heads ({total_size}) must be divisible by mp_size ({mp_size})" - def _replace_module(self, r_module, prev_name='', prev_class_name=''): for name, child in r_module.named_children(): if prev_class_name == "": diff --git a/deepspeed/module_inject/fusedqkv_utils.py b/deepspeed/module_inject/fusedqkv_utils.py index 30a5bb75db23..d7850cb85aa3 100644 --- a/deepspeed/module_inject/fusedqkv_utils.py +++ b/deepspeed/module_inject/fusedqkv_utils.py @@ -4,6 +4,7 @@ # DeepSpeed Team import torch from deepspeed.utils.logging import warning_once +from deepspeed.utils.tp_shard import get_shard_size import re @@ -41,14 +42,14 @@ def _codegen_type_transpose(input, mp_size, codegen_mp_num=4): #input : [3*hidden_dim, hidden_dim](weight) or [3*hidden_dim](bias) shape = input.shape - dst_shape = shape[0] // mp_size + dst_shape = get_shard_size(shape[0], mp_size) num_mp_blocks = input.reshape(codegen_mp_num, shape[0] // codegen_mp_num, shape[1]) #num_mp_blocks : [codegen_mp_num, 3*hidden_dim/codegen_mp_num, :] src_split = list(torch.split(num_mp_blocks, num_mp_blocks.shape[1] // 3, dim=1)) src_split = [x.reshape(codegen_mp_num * mp_size, -1, shape[1]) for x in src_split] - split_fusedqkv = split_by_qkvlist_and_refuse(src_split, shape[0] // 3 // mp_size, 0, 1) + split_fusedqkv = split_by_qkvlist_and_refuse(src_split, get_shard_size(shape[0] // 3, mp_size), 0, 1) tp_fuseqkv_weight = torch.cat(split_fusedqkv, dim=0).reshape(shape[0], -1) return tp_fuseqkv_weight[gpu_index * dst_shape:(gpu_index + 1) * dst_shape] @@ -57,10 +58,10 @@ def _glm_type_transpose(input, mp_size): #input : [3*hidden_dim, hidden_dim](weight) or [3*hidden_dim](bias) shape = input.shape - dst_shape = shape[0] // mp_size + dst_shape = get_shard_size(shape[0], mp_size) src_split = torch.split(input, shape[0] // 3, dim=0) - split_fusedqkv = split_by_qkvlist_and_refuse(src_split, shape[0] // 3 // mp_size) + split_fusedqkv = split_by_qkvlist_and_refuse(src_split, get_shard_size(shape[0] // 3, mp_size)) tp_fuseqkv_weight = torch.cat(split_fusedqkv, dim=0) return tp_fuseqkv_weight[gpu_index * dst_shape:(gpu_index + 1) * dst_shape] diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index d93146857a66..4010f164420d 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -16,6 +16,7 @@ from .auto_tp import AutoTP, ReplaceWithTensorSlicing, Loading from deepspeed import comm as dist +from deepspeed.utils.tp_shard import set_num_kv_heads from torch import nn from .load_checkpoint import load_model_with_checkpoint @@ -263,27 +264,6 @@ def replace_with_policy(child, policy_cls, triangular_masking, inference=False, return _container.module - def get_shard_size(total_size, mp_size): - num_kv_heads = None - - # 1. Try to get num_key_heads from model_config.num_key_value_heads - if hasattr(model_config, 'num_key_value_heads'): - num_kv_heads = model_config.num_key_value_heads - - # 2. Fallback to model_config.num_attention_heads when necessary - if num_kv_heads == None and hasattr(model_config, 'num_attention_heads'): - num_kv_heads = model_config.num_attention_heads - - # 3. 
When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division - if num_kv_heads != None: - my_slices = num_kv_heads // mp_size + (1 if dist.get_rank() < (num_kv_heads % mp_size) else 0) - return total_size // num_kv_heads * my_slices - else: - if total_size % mp_size == 0: - return total_size // mp_size - else: - assert False, f"Number of attention heads ({total_size}) must be divisible by mp_size ({mp_size})" - def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): #mp_replace = ReplaceWithTensorSlicing(mp_group=config.tensor_parallel.tp_group) @@ -293,10 +273,22 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): # 2. Set the tensor parallelism config _autotp.set_tensor_parallel_config(config.tensor_parallel.tp_size, config.tensor_parallel.tp_group) - # 3. Set linear policies + # 3. Try to get num_key_heads from model_config.num_key_value_heads + num_kv_heads = None + if hasattr(model_config, 'num_key_value_heads'): + num_kv_heads = model_config.num_key_value_heads + + # 4. Fallback to model_config.num_attention_heads when necessary + if num_kv_heads == None and hasattr(model_config, 'num_attention_heads'): + num_kv_heads = model_config.num_attention_heads + + # 5. When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division + set_num_kv_heads(num_kv_heads) + + # 6. Set linear policies _autotp.update_linear_policies() - # 4. Replace modules + # 7. Replace modules return _autotp._replace_module(module) def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): diff --git a/deepspeed/utils/tp_shard.py b/deepspeed/utils/tp_shard.py new file mode 100644 index 000000000000..42d6f50f7e31 --- /dev/null +++ b/deepspeed/utils/tp_shard.py @@ -0,0 +1,18 @@ +from deepspeed import comm as dist +global num_kv_heads + +def set_num_kv_heads(num): + global num_kv_heads + num_kv_heads = num + +def get_shard_size(total_size, mp_size): + global num_kv_heads + # When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division + if num_kv_heads != None: + my_slices = num_kv_heads // mp_size + (1 if dist.get_rank() < (num_kv_heads % mp_size) else 0) + return total_size // num_kv_heads * my_slices + else: + if total_size % mp_size == 0: + return total_size // mp_size + else: + assert False, f"Number of attention heads ({total_size}) must be divisible by mp_size ({mp_size})" From e21231df26082dd33f4f48e99f6bf5031ac3254c Mon Sep 17 00:00:00 2001 From: blzheng Date: Wed, 16 Aug 2023 10:52:27 +0800 Subject: [PATCH 49/74] Add optimizations for lm_head & embed_out (#11) * add lm_head tensor parallel * fix conflict * add embed_out tp * add llama2 autoTP support in replace_module (#4022) --------- Co-authored-by: Lai, Yejing Co-authored-by: Dino Chen --- deepspeed/module_inject/layers.py | 29 +++++++++++++++ deepspeed/module_inject/replace_module.py | 44 +++++++++++++++++++++-- 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py index 3433f5d672c8..704a4c399107 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -28,6 +28,35 @@ def forward(self, input): return output +class LmHeadLinearAllreduce(nn.Module): + + def __init__( + self, + weight, + rank, + world_size, + bias=None, + mp_group=None, + ): + super(LmHeadLinearAllreduce, self).__init__() + self.weight = weight + self.bias = bias + self.mp_group = mp_group + self.rank = rank + 
self.world_size = world_size + + def forward(self, input): + assert input.shape[-1] % self.world_size == 0, 'Please ensure that self.world_size is divisible by input.shape[-1]' + input_shard = input.shape[-1] // self.world_size + output = torch.matmul(input[:, :, self.rank * input_shard:(self.rank + 1) * input_shard], + self.weight.transpose(-1, -2)) + if self.mp_group is not None: + dist.all_reduce(output, group=self.mp_group) + if self.bias is not None: + output += self.bias + return output + + class LinearLayer(nn.Module): def __init__(self, weight_shape=None, dtype=torch.half, weight=None, bias=None): diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index f81a8e41a4be..92e91fb8b2a1 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -19,7 +19,7 @@ from deepspeed import comm as dist from torch import nn -from .layers import LinearAllreduce, LinearLayer +from .layers import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce from .load_checkpoint import load_model_with_checkpoint import time @@ -387,6 +387,9 @@ def _replace(child, name, conv_linear_layer): if child.bias is not None: new_bias.data.copy_(child.bias.data) setattr(child, "replaced", True) + if name == "lm_head" or name == 'embed_out': + return LmHeadLinearAllreduce(data, dist.get_rank(), dist.get_world_size(), child.bias if child.bias is None else \ + torch.nn.parameter.Parameter(new_bias.to(get_accelerator().current_device_name())), mp_group) return LinearAllreduce(data, child.bias if child.bias is None else \ torch.nn.parameter.Parameter(new_bias.to(get_accelerator().current_device_name())), mp_group) else: @@ -428,7 +431,7 @@ def update_mp_params(child): return for param in [ "n_heads", "inner_dim", "num_heads", "num_kv", "num_attention_heads", "num_attn_heads", - "all_head_size", "embed_dim", "hidden_size" + "all_head_size", "embed_dim", "hidden_size", "num_key_value_heads" ]: if hasattr(child, param): param_val = getattr(child, param) @@ -452,6 +455,24 @@ def update_mp_params(child): else: linear_policies = {nn.Linear: _replace, nn.Embedding: _slice_embedding} + def _replace_last_linear_module(r_module): + for name, child in r_module.named_children(): + if name == "lm_head" or name == 'embed_out': + checking_key = name + '.' 
+ if child.__class__ in [nn.Linear, nn.Embedding, nn.LayerNorm] and state_dict != None: + if any(checking_key in item for item in state_dict): + load(child, state_dict, checking_key, mp_group) + else: + continue + if len(child._buffers) != 0 and state_dict != None: + load_buffer(child, state_dict, checking_key) + if child.__class__ in linear_policies: + setattr(r_module, name, linear_policies[child.__class__](child, name, conv_linear_layer)) + else: + update_mp_params(child) + _replace_module(child, name) + return r_module + def _replace_module(r_module, prev_name='', prev_class_name=''): for name, child in r_module.named_children(): if prev_class_name == "": @@ -491,6 +512,8 @@ def _replace_module(r_module, prev_name='', prev_class_name=''): _replace_module(child, name, class_name) return r_module + if "lm_head" in str(module) or 'embed_out' in str(module): + return _replace_last_linear_module(module) return _replace_module(module) def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): @@ -755,6 +778,8 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No policy = {} if orig_class is not None: policy.update({orig_class: (replace_fn, _replace_policy)}) + origin_layer = torch.nn.modules.linear.Linear + policy.update({origin_layer: (replace_fn, (list(model.named_modules())[-1][0]))}) else: for plcy in replace_policies: # instantiate a throw-away policy in order to populate the _orig_layer_class @@ -777,6 +802,12 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No if embedding_weight is not None and hasattr(replaced_module, "lm_head") and hasattr( replaced_module.lm_head, "weight") and replaced_module.lm_head.weight.is_meta: replaced_module.lm_head.weight = embedding_weight + #if sd is not None: + # if 'lm_head.weight' in sd.keys() and hasattr(replaced_module, 'lm_head'): + # replaced_module.lm_head.weight = torch.nn.parameter.Parameter( + # data=torch.empty_like(sd['lm_head.weight'].data, device="cpu"), + # requires_grad=sd['lm_head.weight'].data.requires_grad) + # replaced_module.lm_head.weight.data.copy_(sd['lm_head.weight']) return replaced_module @@ -832,7 +863,14 @@ def _replace_module(model, policies, prefix='', layer_id=0, level_id=0, state_di LlamaRMSNorm = None load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm, OPTLearnedPositionalEmbedding, LlamaRMSNorm] for name, child in model.named_children(): - if child.__class__ in policies: + if name == "lm_head" or name =="embed_out": + if child.__class__ in policies: + replaced_module = policies[child.__class__][0](model, + policies[child.__class__][-1], + layer_id, + prefix=prefix + name, + state_dict=state_dict) + elif child.__class__ in policies: replaced_module = policies[child.__class__][0](child, policies[child.__class__][-1], layer_id, From 2dac94f35a8630973d58988a9f94623cdb5f5052 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Thu, 17 Aug 2023 10:07:27 +0000 Subject: [PATCH 50/74] fix accuracy issue --- deepspeed/module_inject/auto_tp.py | 12 ++++++------ deepspeed/module_inject/auto_tp_model_utils.py | 9 +++++---- deepspeed/utils/tp_shard.py | 14 +++++++++++--- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index c458fc0ac86f..17a9f7aebec4 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -14,7 +14,7 @@ from .layers import LinearAllreduce, LinearLayer from deepspeed.accelerator import get_accelerator from .fusedqkv_utils 
import require_tp_fused_qkvw, prepare_tp_fused_qkvw -from deepspeed.utils.tp_shard import get_shard_size +from deepspeed.utils.tp_shard import get_shard_size, get_shard_size_list class ReplaceWithTensorSlicing: @@ -320,7 +320,7 @@ def _replace(self, child, name, conv_linear_layer): if self.conv_linear_layer: child.weight.data = child.weight.data.transpose(-1, -2).contiguous() data = child.weight.data.split( - get_shard_size(weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size), dim=1) + get_shard_size_list(weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size), dim=1) data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) setattr(child, "replaced", True) @@ -343,13 +343,13 @@ def _replace(self, child, name, conv_linear_layer): module_str, child.bias.data, self.mp_size, mp_replace.gpu_index).to( get_accelerator().current_device_name()) else: - data = child.weight.data.split(get_shard_size(weight_shape[0], self.mp_size), + data = child.weight.data.split(get_shard_size_list(weight_shape[0], self.mp_size), dim=1 if self.conv_linear_layer else 0) data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) if child.bias is not None: bias_data = child.bias.data.split( - get_shard_size(weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size), dim=0) + get_shard_size_list(weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size), dim=0) bias_data = bias_data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) bias_data = torch.nn.parameter.Parameter(bias_data, requires_grad=False) else: @@ -365,9 +365,9 @@ def _slice_embedding(self, child, name, conv_linear_layer): mp_replace = ReplaceWithTensorSlicing(mp_group=self.mp_group) if hasattr(child.weight, 'ds_tensor'): - data = child.weight.ds_tensor.data.split(get_shard_size(child.weight.shape[1], self.mp_size), dim=1) + data = child.weight.ds_tensor.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size), dim=1) else: - data = child.weight.data.split(get_shard_size(child.weight.shape[1], self.mp_size), dim=1) + data = child.weight.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size), dim=1) data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) data = torch.nn.parameter.Parameter(data, requires_grad=False) diff --git a/deepspeed/module_inject/auto_tp_model_utils.py b/deepspeed/module_inject/auto_tp_model_utils.py index d31dfd17a2a9..445619dcd37a 100644 --- a/deepspeed/module_inject/auto_tp_model_utils.py +++ b/deepspeed/module_inject/auto_tp_model_utils.py @@ -6,6 +6,7 @@ from deepspeed import comm as dist import torch from typing import Optional +from deepspeed.utils.tp_shard import get_shard_size, get_shard_size_list def build_bloom_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor: @@ -51,8 +52,8 @@ def build_bloom_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :] alibi = slopes[..., None] * arange_tensor if dist.is_initialized(): - num_heads_per_rank = int(num_heads / dist.get_world_size()) - offset = dist.get_rank() * num_heads_per_rank + num_heads_per_rank = get_shard_size(num_heads, dist.get_world_size()) + offset = sum(get_shard_size_list(num_heads, dist.get_world_size())[0:dist.get_rank()]) alibi = alibi.view(batch_size, num_heads, 1, seq_length) alibi = alibi[:, offset:num_heads_per_rank + offset, :, :] return 
alibi.reshape(batch_size * num_heads_per_rank, 1, seq_length).to(dtype) @@ -72,7 +73,7 @@ def build_mpt_atten_bias_tensor(self, prefix_mask=prefix_mask, sequence_id=sequence_id) if dist.is_initialized(): - num_heads_per_rank = int(self.config.n_heads / dist.get_world_size()) - offset = dist.get_rank() * num_heads_per_rank + num_heads_per_rank = get_shard_size(self.config.n_heads, dist.get_world_size()) + offset = sum(get_shard_size_list(self.config.n_heads, dist.get_world_size())[0:dist.get_rank()]) attn_bias = attn_bias[:, offset:num_heads_per_rank + offset, :, :] return attn_bias, attention_mask diff --git a/deepspeed/utils/tp_shard.py b/deepspeed/utils/tp_shard.py index 42d6f50f7e31..67bb6c1f7872 100644 --- a/deepspeed/utils/tp_shard.py +++ b/deepspeed/utils/tp_shard.py @@ -5,14 +5,22 @@ def set_num_kv_heads(num): global num_kv_heads num_kv_heads = num -def get_shard_size(total_size, mp_size): +def get_shard_size(total_size, mp_size, rank=None): global num_kv_heads # When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division if num_kv_heads != None: - my_slices = num_kv_heads // mp_size + (1 if dist.get_rank() < (num_kv_heads % mp_size) else 0) - return total_size // num_kv_heads * my_slices + if (rank == None): + rank = dist.get_rank() + my_slices = (num_kv_heads // mp_size) + (1 if rank < (num_kv_heads % mp_size) else 0) + return (total_size // num_kv_heads) * my_slices else: if total_size % mp_size == 0: return total_size // mp_size else: assert False, f"Number of attention heads ({total_size}) must be divisible by mp_size ({mp_size})" + +def get_shard_size_list(total_size, mp_size): + shard_sizes = [] + for i in range(mp_size): + shard_sizes.append(get_shard_size(total_size, mp_size, i)) + return shard_sizes From 71f9f401b2302d8ba75c2f90a6f307041955bfee Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Mon, 21 Aug 2023 01:38:06 +0000 Subject: [PATCH 51/74] fix format --- deepspeed/module_inject/auto_tp.py | 10 ++++++---- deepspeed/utils/tp_shard.py | 8 ++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index 17a9f7aebec4..4198f73b42dd 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -319,8 +319,9 @@ def _replace(self, child, name, conv_linear_layer): if self.conv_linear_layer: child.weight.data = child.weight.data.transpose(-1, -2).contiguous() - data = child.weight.data.split( - get_shard_size_list(weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size), dim=1) + data = child.weight.data.split(get_shard_size_list( + weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size), + dim=1) data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) setattr(child, "replaced", True) @@ -348,8 +349,9 @@ def _replace(self, child, name, conv_linear_layer): data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) if child.bias is not None: - bias_data = child.bias.data.split( - get_shard_size_list(weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size), dim=0) + bias_data = child.bias.data.split(get_shard_size_list( + weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size), + dim=0) bias_data = bias_data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) bias_data = torch.nn.parameter.Parameter(bias_data, requires_grad=False) else: diff --git a/deepspeed/utils/tp_shard.py 
b/deepspeed/utils/tp_shard.py index 67bb6c1f7872..61a017184e42 100644 --- a/deepspeed/utils/tp_shard.py +++ b/deepspeed/utils/tp_shard.py @@ -1,10 +1,17 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from deepspeed import comm as dist global num_kv_heads + def set_num_kv_heads(num): global num_kv_heads num_kv_heads = num + def get_shard_size(total_size, mp_size, rank=None): global num_kv_heads # When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division @@ -19,6 +26,7 @@ def get_shard_size(total_size, mp_size, rank=None): else: assert False, f"Number of attention heads ({total_size}) must be divisible by mp_size ({mp_size})" + def get_shard_size_list(total_size, mp_size): shard_sizes = [] for i in range(mp_size): From db9db6bfe29da97daa7c95784558a3cdf4503f2e Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 23 Aug 2023 06:22:12 +0000 Subject: [PATCH 52/74] skip tests with fusedqkv --- tests/unit/inference/test_inference.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index fdd53425a947..9d3876265dd2 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -529,6 +529,8 @@ def test_odd_world_size( pytest.skip(invalid_test_msg) model, task = model_w_task + if model == "Salesforce/codegen-350M-mono": + pytest.skip("fusedqkv does not supported by odd world_size") local_rank = int(os.getenv("LOCAL_RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "3")) From 9d5eae34b0361f9ef0acfca4e9823acd5a6f8c4d Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 23 Aug 2023 06:26:43 +0000 Subject: [PATCH 53/74] remove skip of fusedqkv tests --- tests/unit/inference/test_inference.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 6639654b9ecc..896f5cc13db7 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -558,8 +558,6 @@ def test_odd_world_size( pytest.skip(invalid_test_msg) model, task = model_w_task - if model == "Salesforce/codegen-350M-mono": - pytest.skip("fusedqkv does not supported by odd world_size") local_rank = int(os.getenv("LOCAL_RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "3")) From 25e656dbd9666746be3daa237edb9a044b24b863 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 23 Aug 2023 06:32:15 +0000 Subject: [PATCH 54/74] skip test fusedqkv with odd number of ranks --- tests/unit/inference/test_inference.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 896f5cc13db7..6639654b9ecc 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -558,6 +558,8 @@ def test_odd_world_size( pytest.skip(invalid_test_msg) model, task = model_w_task + if model == "Salesforce/codegen-350M-mono": + pytest.skip("fusedqkv does not supported by odd world_size") local_rank = int(os.getenv("LOCAL_RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "3")) From 5eba475975336fbdac9d52c652af5e9afc78804d Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Thu, 24 Aug 2023 12:07:52 -0400 Subject: [PATCH 55/74] fix lm head overriden issue, move it from checkpoint in-loop loading to out loop Signed-off-by: Wang, Yi --- deepspeed/module_inject/load_checkpoint.py | 7 ------- deepspeed/module_inject/replace_module.py | 
22 ++++++++++++++-------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/deepspeed/module_inject/load_checkpoint.py b/deepspeed/module_inject/load_checkpoint.py index fee5da4bfe52..dc1ee3bcff9d 100644 --- a/deepspeed/module_inject/load_checkpoint.py +++ b/deepspeed/module_inject/load_checkpoint.py @@ -255,13 +255,6 @@ def load_module_recursive(module, prefix='', level=0): load_module_recursive(r_module) - embedding_weight = None - - for n, p in r_module.named_parameters(): - if "word_embeddings." in n or "embed_tokens." in n or "wte." in n: - embedding_weight = p - if embedding_weight is not None and r_module.lm_head.weight.is_meta: - r_module.lm_head.weight = embedding_weight for sd_ in sd: del sd_ sd = None diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 7ed0f8f1fff7..7395b45ed749 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -309,6 +309,13 @@ def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): checkpoint=checkpoint[i]) pbar.update(1) gc.collect() + embedding_weight = None + for n, p in replaced_module.named_parameters(): + if "word_embeddings." in n or "embed_tokens." in n or "wte." in n: + embedding_weight = p + if embedding_weight is not None and hasattr(replaced_module, "lm_head") and hasattr( + replaced_module.lm_head, "weight") and replaced_module.lm_head.weight.is_meta: + replaced_module.lm_head.weight = embedding_weight else: replaced_module = replace_module(model=model, orig_class=orig_layer_impl, @@ -386,6 +393,13 @@ def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): container=container_g) sds = [None for _ in sds] gc.collect() + embedding_weight = None + for n, p in replaced_module.named_parameters(): + if "word_embeddings." in n or "embed_tokens." in n or "wte." in n: + embedding_weight = p + if embedding_weight is not None and hasattr(replaced_module, "lm_head") and hasattr( + replaced_module.lm_head, "weight") and replaced_module.lm_head.weight.is_meta: + replaced_module.lm_head.weight = embedding_weight print(f"checkpoint loading time at rank {rank}: {time.time()-start_time} sec") if config.save_mp_checkpoint_path is not None: @@ -554,14 +568,6 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No "You can find some samples here: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py" replaced_module, _ = _replace_module(model, policy, state_dict=sd) - if checkpoint is not None: - embedding_weight = None - for n, p in replaced_module.named_parameters(): - if "word_embeddings." in n or "embed_tokens." in n or "wte." 
in n: - embedding_weight = p - if embedding_weight is not None and hasattr(replaced_module, "lm_head") and hasattr( - replaced_module.lm_head, "weight") and replaced_module.lm_head.weight.is_meta: - replaced_module.lm_head.weight = embedding_weight return replaced_module From 6ef90933c46116f2a3e2d3405db63c50373aab72 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 25 Aug 2023 15:16:46 +0000 Subject: [PATCH 56/74] change all_reduce_low_latency to inference_all_reduce --- deepspeed/module_inject/layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py index a233918db124..ec30086500d1 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -22,7 +22,7 @@ def __init__(self, weight, bias=None, mp_group=None): def forward(self, input): output = torch.matmul(input, self.weight.transpose(-1, -2)) - dist.all_reduce_low_latency(output, group=self.mp_group) + dist.inference_all_reduce(output, group=self.mp_group) if self.bias is not None: output += self.bias return output From 5efdc8f9d90e916286dc99ac766875bb325b6707 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Fri, 25 Aug 2023 15:44:33 +0000 Subject: [PATCH 57/74] merge lm-head updates from lyj/lmhead_tp branch --- deepspeed/module_inject/auto_tp.py | 20 ++++++++++++- deepspeed/module_inject/layers.py | 3 +- deepspeed/module_inject/replace_module.py | 34 ++++++++--------------- 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index 0fb2ebb07e19..0f4ff33f095f 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -11,7 +11,7 @@ from typing import Optional import torch from deepspeed import comm as dist -from .layers import LinearAllreduce, LinearLayer +from .layers import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce from deepspeed.accelerator import get_accelerator from .fusedqkv_utils import require_tp_fused_qkvw, prepare_tp_fused_qkvw from deepspeed.utils.tp_shard import get_shard_size, get_shard_size_list @@ -310,6 +310,11 @@ def _replace(self, child, name, conv_linear_layer): data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) setattr(child, "replaced", True) + if name == "lm_head" or name == 'embed_out': + return LmHeadLinearAllreduce( + torch.nn.parameter.Parameter(data, requires_grad=False), dist.get_rank(), dist.get_world_size(), + child.bias if child.bias is None else torch.nn.parameter.Parameter( + child.bias.to(get_accelerator().current_device_name())), self.mp_group) return LinearAllreduce(torch.nn.parameter.Parameter(data, requires_grad=False), child.bias if child.bias is None else \ torch.nn.parameter.Parameter(child.bias.to(get_accelerator().current_device_name())), self.mp_group) else: @@ -428,3 +433,16 @@ def _replace_module(self, r_module, prev_name='', prev_class_name=''): self.update_mp_params(child) self._replace_module(child, name, class_name) return r_module + + def _replace_last_linear_module(self, r_module): + if hasattr(r_module, "lm_head"): + name = "lm_head" + child = r_module.lm_head + elif hasattr(r_module, "embed_out"): + name = "embed_out" + child = r_module.embed_out + else: + return r_module + if child.__class__ in self.linear_policies: + setattr(r_module, name, self.linear_policies[child.__class__](child, name, self.conv_linear_layer)) + return r_module diff --git a/deepspeed/module_inject/layers.py 
b/deepspeed/module_inject/layers.py index ec30086500d1..7a565560dec9 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -22,7 +22,8 @@ def __init__(self, weight, bias=None, mp_group=None): def forward(self, input): output = torch.matmul(input, self.weight.transpose(-1, -2)) - dist.inference_all_reduce(output, group=self.mp_group) + if self.mp_group is not None: + dist.inference_all_reduce(output, group=self.mp_group) if self.bias is not None: output += self.bias return output diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 1492d67be8a0..fd46b1d3c815 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -289,6 +289,8 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): _autotp.update_linear_policies() # 7. Replace modules + if hasattr(module, "lm_head") or hasattr(module, 'embed_out'): + return _autotp._replace_last_linear_module(module) return _autotp._replace_module(module) def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): @@ -553,8 +555,6 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No policy = {} if orig_class is not None: policy.update({orig_class: (replace_fn, _replace_policy)}) - origin_layer = torch.nn.modules.linear.Linear - policy.update({origin_layer: (replace_fn, (list(model.named_modules())[-1][0]))}) else: for plcy in replace_policies: # instantiate a throw-away policy in order to populate the _orig_layer_class @@ -577,12 +577,14 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No if embedding_weight is not None and hasattr(replaced_module, "lm_head") and hasattr( replaced_module.lm_head, "weight") and replaced_module.lm_head.weight.is_meta: replaced_module.lm_head.weight = embedding_weight - #if sd is not None: - # if 'lm_head.weight' in sd.keys() and hasattr(replaced_module, 'lm_head'): - # replaced_module.lm_head.weight = torch.nn.parameter.Parameter( - # data=torch.empty_like(sd['lm_head.weight'].data, device="cpu"), - # requires_grad=sd['lm_head.weight'].data.requires_grad) - # replaced_module.lm_head.weight.data.copy_(sd['lm_head.weight']) + + # enable tensor parallel for the last linear + if hasattr(replaced_module, "lm_head") and hasattr(replaced_module.lm_head, + "weight") and not replaced_module.lm_head.weight.is_meta: + replaced_module = replace_fn(replaced_module, ("lm_head", ), 0, "lm_head") + elif hasattr(replaced_module, "embed_out") and hasattr(replaced_module.embed_out, + "weight") and not replaced_module.embed_out.weight.is_meta: + replaced_module = replace_fn(replaced_module, ("embed_out", ), 0, "embed_out") return replaced_module @@ -621,21 +623,9 @@ def _replace_module(model, policies, prefix='', layer_id=0, level_id=0, state_di OPTLearnedPositionalEmbedding = transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding except: OPTLearnedPositionalEmbedding = None - try: - import transformers - LlamaRMSNorm = transformers.models.llama.modeling_llama.LlamaRMSNorm - except: - LlamaRMSNorm = None - load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm, OPTLearnedPositionalEmbedding, LlamaRMSNorm] + load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm, OPTLearnedPositionalEmbedding] for name, child in model.named_children(): - if name == "lm_head" or name == "embed_out": - if child.__class__ in policies: - replaced_module = policies[child.__class__][0](model, - policies[child.__class__][-1], - 
layer_id, - prefix=prefix + name, - state_dict=state_dict) - elif child.__class__ in policies: + if child.__class__ in policies: replaced_module = policies[child.__class__][0](child, policies[child.__class__][-1], layer_id, From aa64514459526f02c48fef6192a2228ba1d3fdcd Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Tue, 29 Aug 2023 20:56:32 -0700 Subject: [PATCH 58/74] cherry pick fix for activation size not divisible by attention heads --- deepspeed/utils/tp_shard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/utils/tp_shard.py b/deepspeed/utils/tp_shard.py index 61a017184e42..a3352ada3bcb 100644 --- a/deepspeed/utils/tp_shard.py +++ b/deepspeed/utils/tp_shard.py @@ -19,7 +19,7 @@ def get_shard_size(total_size, mp_size, rank=None): if (rank == None): rank = dist.get_rank() my_slices = (num_kv_heads // mp_size) + (1 if rank < (num_kv_heads % mp_size) else 0) - return (total_size // num_kv_heads) * my_slices + return total_size * my_slices // num_kv_heads else: if total_size % mp_size == 0: return total_size // mp_size From 8b0a88745c43ee1bb1950435d9ea0fd418c13a31 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Tue, 5 Sep 2023 10:44:06 +0000 Subject: [PATCH 59/74] Support uneven sharding for lm_head --- deepspeed/module_inject/layers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py index 7a565560dec9..f1870bb10487 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -10,6 +10,7 @@ from torch.nn.parameter import Parameter from deepspeed.accelerator import get_accelerator +from deepspeed.utils.tp_shard import get_shard_size, get_shard_size_list class LinearAllreduce(nn.Module): @@ -47,10 +48,9 @@ def __init__( self.world_size = world_size def forward(self, input): - assert input.shape[ - -1] % self.world_size == 0, 'Please ensure that self.world_size is divisible by input.shape[-1]' - input_shard = input.shape[-1] // self.world_size - output = torch.matmul(input[:, :, self.rank * input_shard:(self.rank + 1) * input_shard], + input_shard_size = get_shard_size(input.shape[-1], self.world_size) + input_shard_offset = sum(get_shard_size_list(input.shape[-1], self.world_size)[0:self.rank]) + output = torch.matmul(input[:, :, input_shard_offset:input_shard_offset + input_shard_size], self.weight.transpose(-1, -2)) if self.mp_group is not None: dist.inference_all_reduce(output, group=self.mp_group) From f15e6d48c140a51272d2337141e965109db613c6 Mon Sep 17 00:00:00 2001 From: Yejing-Lai <55339926+Yejing-Lai@users.noreply.github.com> Date: Wed, 13 Sep 2023 14:18:32 +0800 Subject: [PATCH 60/74] fix CPU loading model OOM. 
(#13) * fix model partition load cpu mem increase * fix format * fix format --- deepspeed/module_inject/auto_tp.py | 34 ++++++++++++++--------- deepspeed/module_inject/replace_module.py | 12 ++++---- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index 33faff8e0e67..a71c2c26bb80 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -312,16 +312,21 @@ def _replace(self, child, name, conv_linear_layer): data = child.weight.data.split(get_shard_size_list( weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size), dim=1) - data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) + data_dc = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()).clone().detach() + del data setattr(child, "replaced", True) if name == "lm_head" or name == 'embed_out': - return LmHeadLinearAllreduce( - torch.nn.parameter.Parameter(data, requires_grad=False), dist.get_rank(), dist.get_world_size(), - child.bias if child.bias is None else torch.nn.parameter.Parameter( - child.bias.to(get_accelerator().current_device_name())), self.mp_group) - return LinearAllreduce(torch.nn.parameter.Parameter(data, requires_grad=False), child.bias if child.bias is None else \ - torch.nn.parameter.Parameter(child.bias.to(get_accelerator().current_device_name())), self.mp_group) + return LmHeadLinearAllreduce(weight=torch.nn.parameter.Parameter(data_dc, requires_grad=False), + rank=dist.get_rank(), + world_size=dist.get_world_size(), + bias=child.bias if child.bias is None else torch.nn.parameter.Parameter( + child.bias.to(get_accelerator().current_device_name())), + mp_group=self.mp_group) + return LinearAllreduce(weight=torch.nn.parameter.Parameter(data_dc, requires_grad=False), + bias=child.bias if child.bias is None else \ + torch.nn.parameter.Parameter(child.bias.to(get_accelerator().current_device_name())), + mp_group=self.mp_group) else: # if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size] @@ -333,15 +338,16 @@ def _replace(self, child, name, conv_linear_layer): #for detecting fused type module_str = str(self.module).strip() #The copy is a regular copy, The shape of dst and src is the same - data = prepare_tp_fused_qkvw(module_str, child.weight.data, self.mp_size, mp_replace.gpu_index) + data_dc = prepare_tp_fused_qkvw(module_str, child.weight.data, self.mp_size, mp_replace.gpu_index) - bias_data = None if child.bias is None else prepare_tp_fused_qkvw( + bias_data_dc = None if child.bias is None else prepare_tp_fused_qkvw( module_str, child.bias.data, self.mp_size, mp_replace.gpu_index).to( get_accelerator().current_device_name()) else: data = child.weight.data.split(get_shard_size_list(weight_shape[0], self.mp_size), dim=1 if self.conv_linear_layer else 0) - data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) + data_dc = data[mp_replace.gpu_index].clone().detach() + del data if child.bias is not None: bias_data = child.bias.data.split(get_shard_size_list( @@ -349,12 +355,14 @@ def _replace(self, child, name, conv_linear_layer): dim=0) bias_data = bias_data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) bias_data = torch.nn.parameter.Parameter(bias_data, requires_grad=False) + bias_data_dc = bias_data.clone().detach() + del bias_data else: - bias_data = None + bias_data_dc = None setattr(child, "replaced", True) - return 
LinearLayer(weight=torch.nn.parameter.Parameter(data.to(get_accelerator().current_device_name()), requires_grad=False), \ - bias=bias_data) + return LinearLayer(weight=torch.nn.parameter.Parameter(data_dc.to(get_accelerator().current_device_name()), requires_grad=False), \ + bias=bias_data_dc) def _slice_embedding(self, child, name, conv_linear_layer): if getattr(child, "replaced", False) == True: diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index eeb9fda216a3..3b0df2ee198e 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -333,10 +333,10 @@ def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): # enable tensor parallel for the last linear if hasattr(replaced_module, "lm_head") and hasattr(replaced_module.lm_head, - "weight") and not replaced_module.lm_head.weight.is_meta: + "weight") and not replaced_module.lm_head.weight.is_meta: replaced_module = replace_fn(replaced_module, ("lm_head", ), 0, "lm_head") - elif hasattr(replaced_module, "embed_out") and hasattr(replaced_module.embed_out, - "weight") and not replaced_module.embed_out.weight.is_meta: + elif hasattr(replaced_module, "embed_out") and hasattr( + replaced_module.embed_out, "weight") and not replaced_module.embed_out.weight.is_meta: replaced_module = replace_fn(replaced_module, ("embed_out", ), 0, "embed_out") else: replaced_module = replace_module(model=model, @@ -425,10 +425,10 @@ def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): # enable tensor parallel for the last linear if hasattr(replaced_module, "lm_head") and hasattr(replaced_module.lm_head, - "weight") and not replaced_module.lm_head.weight.is_meta: + "weight") and not replaced_module.lm_head.weight.is_meta: replaced_module = replace_fn(replaced_module, ("lm_head", ), 0, "lm_head") - elif hasattr(replaced_module, "embed_out") and hasattr(replaced_module.embed_out, - "weight") and not replaced_module.embed_out.weight.is_meta: + elif hasattr(replaced_module, "embed_out") and hasattr( + replaced_module.embed_out, "weight") and not replaced_module.embed_out.weight.is_meta: replaced_module = replace_fn(replaced_module, ("embed_out", ), 0, "embed_out") print(f"checkpoint loading time at rank {rank}: {time.time()-start_time} sec") From f02d40f7fd47fdb89d504554575008e4487c0d00 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 13 Sep 2023 15:55:32 +0800 Subject: [PATCH 61/74] merge latest change in uneven_heads --- deepspeed/module_inject/auto_tp.py | 2 +- deepspeed/module_inject/auto_tp_model_utils.py | 2 +- deepspeed/module_inject/fusedqkv_utils.py | 17 ++++++++--------- deepspeed/module_inject/layers.py | 1 - deepspeed/module_inject/replace_module.py | 14 +++++++------- deepspeed/utils/tp_shard.py | 5 +++++ tests/unit/inference/test_inference.py | 2 +- 7 files changed, 23 insertions(+), 20 deletions(-) diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index a71c2c26bb80..2d687e487a40 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -14,7 +14,7 @@ from .layers import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce from deepspeed.accelerator import get_accelerator from .fusedqkv_utils import require_tp_fused_qkvw, prepare_tp_fused_qkvw -from deepspeed.utils.tp_shard import get_shard_size, get_shard_size_list +from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list class ReplaceWithTensorSlicing: diff --git 
a/deepspeed/module_inject/auto_tp_model_utils.py b/deepspeed/module_inject/auto_tp_model_utils.py index 847494a30cd8..51e52e3258dd 100644 --- a/deepspeed/module_inject/auto_tp_model_utils.py +++ b/deepspeed/module_inject/auto_tp_model_utils.py @@ -6,7 +6,7 @@ from deepspeed import comm as dist import torch from typing import Optional -from deepspeed.utils.tp_shard import get_shard_size, get_shard_size_list +from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list def build_bloom_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor: diff --git a/deepspeed/module_inject/fusedqkv_utils.py b/deepspeed/module_inject/fusedqkv_utils.py index e292d3e70bb1..2e8f6b5917ed 100644 --- a/deepspeed/module_inject/fusedqkv_utils.py +++ b/deepspeed/module_inject/fusedqkv_utils.py @@ -4,7 +4,7 @@ # DeepSpeed Team import torch from deepspeed.utils.logging import warning_once -from deepspeed.utils.tp_shard import get_shard_size +from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list, get_num_kv_heads import re @@ -40,7 +40,8 @@ def prepare_tp_fused_qkvw(module_str, src, mp_size, gpu_index): def _codegen_type_transpose(input, mp_size, codegen_mp_num=4): # codegen_mp_num defined in https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py - #TODO: assert num_heads % (mp_size*codegen_mp_num) == 0 + assert get_num_kv_heads() % ( + mp_size * codegen_mp_num) == 0, "codgen autoTP requires num_kv_heads % (mp_size*codegen_mp_num) == 0" #input : [3*hidden_dim, hidden_dim](weight) or [3*hidden_dim](bias) shape = input.shape @@ -60,18 +61,16 @@ def _glm_type_transpose(input, mp_size): #input : [3*hidden_dim, hidden_dim](weight) or [3*hidden_dim](bias) shape = input.shape - dst_shape = get_shard_size(shape[0], mp_size) src_split = torch.split(input, shape[0] // 3, dim=0) - split_fusedqkv = split_by_qkvlist_and_refuse(src_split, get_shard_size(shape[0] // 3, mp_size)) - tp_fuseqkv_weight = torch.cat(split_fusedqkv, dim=0) - - return tp_fuseqkv_weight[gpu_index * dst_shape:(gpu_index + 1) * dst_shape] + split_fusedqkv = split_by_qkvlist_and_refuse(src_split, get_shard_size_list(shape[0] // 3, mp_size)) + return split_fusedqkv[gpu_index] def _bloom_type_transpose(input, mp_size): shape = input.shape - dst_shape = shape[0] // mp_size - return input[gpu_index * dst_shape:(gpu_index + 1) * dst_shape] + + split_fusedqkv = input.split(get_shard_size_list(shape[0], mp_size), dim=0) + return split_fusedqkv[gpu_index] def _transpose_fused_qkvw(src, mp_size, fused_qkv_type=None): diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py index f1870bb10487..b3302b659e05 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -10,7 +10,6 @@ from torch.nn.parameter import Parameter from deepspeed.accelerator import get_accelerator -from deepspeed.utils.tp_shard import get_shard_size, get_shard_size_list class LinearAllreduce(nn.Module): diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 7032e75ba872..51e2d33606de 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -16,7 +16,7 @@ from .auto_tp import AutoTP, ReplaceWithTensorSlicing, Loading from deepspeed import comm as dist -from deepspeed.utils.tp_shard import set_num_kv_heads +from deepspeed.module_inject.tp_shard import set_num_kv_heads from .load_checkpoint import 
load_model_with_checkpoint import time @@ -274,12 +274,12 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): # 3. Try to get num_key_heads from model_config.num_key_value_heads num_kv_heads = None - if hasattr(model_config, 'num_key_value_heads'): - num_kv_heads = model_config.num_key_value_heads - - # 4. Fallback to model_config.num_attention_heads when necessary - if num_kv_heads == None and hasattr(model_config, 'num_attention_heads'): - num_kv_heads = model_config.num_attention_heads + kv_head_names = ['num_key_value_heads', 'num_attention_heads', 'n_heads'] + for name in kv_head_names: + if hasattr(model_config, name): + num_kv_heads = getattr(model_config, name) + if num_kv_heads != None: + break # 5. When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division set_num_kv_heads(num_kv_heads) diff --git a/deepspeed/utils/tp_shard.py b/deepspeed/utils/tp_shard.py index a3352ada3bcb..8e2fa78d883f 100644 --- a/deepspeed/utils/tp_shard.py +++ b/deepspeed/utils/tp_shard.py @@ -12,6 +12,11 @@ def set_num_kv_heads(num): num_kv_heads = num +def get_num_kv_heads(): + global num_kv_heads + return num_kv_heads + + def get_shard_size(total_size, mp_size, rank=None): global num_kv_heads # When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 6639654b9ecc..7e2b3465e100 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -559,7 +559,7 @@ def test_odd_world_size( model, task = model_w_task if model == "Salesforce/codegen-350M-mono": - pytest.skip("fusedqkv does not supported by odd world_size") + pytest.skip("codegen does not supported by odd world_size") local_rank = int(os.getenv("LOCAL_RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "3")) From 3a8ad631f38c3c79ef6538887743f01fb3a1fd4c Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Wed, 13 Sep 2023 08:45:53 +0000 Subject: [PATCH 62/74] move tp_shard to module_inject --- deepspeed/{utils => module_inject}/tp_shard.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename deepspeed/{utils => module_inject}/tp_shard.py (100%) diff --git a/deepspeed/utils/tp_shard.py b/deepspeed/module_inject/tp_shard.py similarity index 100% rename from deepspeed/utils/tp_shard.py rename to deepspeed/module_inject/tp_shard.py From f0ef3eaa959617eb5d29d7fc4132fde8e6773cbe Mon Sep 17 00:00:00 2001 From: baodi Date: Thu, 28 Sep 2023 07:50:58 +0800 Subject: [PATCH 63/74] support baichuan model (#14) --- deepspeed/module_inject/auto_tp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index a71c2c26bb80..eb5495ac9e00 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -117,7 +117,7 @@ class Loading(): def is_load_module(module): load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm] - load_layer_names = ["LPLayerNorm", "SharedEmbedding", "OPTLearnedPositionalEmbedding", "LlamaRMSNorm"] + load_layer_names = ["LPLayerNorm", "SharedEmbedding", "OPTLearnedPositionalEmbedding", "LlamaRMSNorm", "RMSNorm"] return module.__class__ in load_layers or module._get_name() in load_layer_names def load_buffer(module, state_dict, prefix): From 91f56a2d9e4e17ece4f361c6a01467049cf0fea1 Mon Sep 17 00:00:00 2001 From: Guokai Ma Date: Fri, 20 Oct 2023 10:35:16 +0800 Subject: [PATCH 64/74] fix bug in 
lm_head, cherry pick from #4522 --- deepspeed/module_inject/replace_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 0e6c05d616fc..4c7ba75f69d9 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -340,13 +340,13 @@ def set_lm_head(module): checkpoint=checkpoint_file) pbar.update(1) gc.collect() - replaced_module = set_lm_head(replaced_module) else: replaced_module = replace_module(model=model, orig_class=orig_layer_impl, replace_fn=replace_fn, _replace_policy=config.injection_policy_tuple) + replaced_module = set_lm_head(replaced_module) quantizer = GroupQuantizer(q_int8=quantize) world_size = dist.get_world_size() if dist.is_initialized() else 1 rank = dist.get_rank() if dist.is_initialized() else 0 From 57ff508ea592ff752fd323b383c32177d5bce7b5 Mon Sep 17 00:00:00 2001 From: Yejing-Lai <55339926+Yejing-Lai@users.noreply.github.com> Date: Tue, 31 Oct 2023 16:24:46 +0800 Subject: [PATCH 65/74] fix uneven heads issue (#25) --- deepspeed/module_inject/tp_shard.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/deepspeed/module_inject/tp_shard.py b/deepspeed/module_inject/tp_shard.py index 8e2fa78d883f..77756e43c62f 100644 --- a/deepspeed/module_inject/tp_shard.py +++ b/deepspeed/module_inject/tp_shard.py @@ -19,17 +19,14 @@ def get_num_kv_heads(): def get_shard_size(total_size, mp_size, rank=None): global num_kv_heads - # When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division - if num_kv_heads != None: - if (rank == None): - rank = dist.get_rank() + # When we have num_kv_heads defined, uneven division is possible, otherwise enforce near even division + if rank == None: + rank = dist.get_rank() + if num_kv_heads != None and total_size % num_kv_heads == 0: my_slices = (num_kv_heads // mp_size) + (1 if rank < (num_kv_heads % mp_size) else 0) return total_size * my_slices // num_kv_heads else: - if total_size % mp_size == 0: - return total_size // mp_size - else: - assert False, f"Number of attention heads ({total_size}) must be divisible by mp_size ({mp_size})" + return total_size // mp_size + (1 if rank < (total_size % mp_size) else 0) def get_shard_size_list(total_size, mp_size): From 09a348c92a6a85db099c00b71f5a488cce9cdb93 Mon Sep 17 00:00:00 2001 From: Yejing-Lai <55339926+Yejing-Lai@users.noreply.github.com> Date: Thu, 16 Nov 2023 09:46:11 +0800 Subject: [PATCH 66/74] fix imbalance autotp issue (#31) --- deepspeed/module_inject/auto_tp.py | 15 +++++++++------ deepspeed/module_inject/layers.py | 4 ++-- deepspeed/module_inject/tp_shard.py | 13 ++++++++----- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index 4e993876fbe8..d930c9f2f413 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -121,7 +121,9 @@ class Loading(): def is_load_module(module): load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm] - load_layer_names = ["LPLayerNorm", "SharedEmbedding", "OPTLearnedPositionalEmbedding", "LlamaRMSNorm", "RMSNorm"] + load_layer_names = [ + "LPLayerNorm", "SharedEmbedding", "OPTLearnedPositionalEmbedding", "LlamaRMSNorm", "RMSNorm" + ] return module.__class__ in load_layers or module._get_name() in load_layer_names def load_buffer(module, state_dict, prefix): @@ -314,7 +316,7 @@ def _replace(self, child, name, 
conv_linear_layer): if self.conv_linear_layer: child.weight.data = child.weight.data.transpose(-1, -2).contiguous() data = child.weight.data.split(get_shard_size_list( - weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size), + weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size, name), dim=1) data_dc = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()).clone().detach() del data @@ -344,14 +346,14 @@ def _replace(self, child, name, conv_linear_layer): module_str, child.bias.data, self.mp_size, mp_replace.gpu_index).to( get_accelerator().current_device_name()) else: - data = child.weight.data.split(get_shard_size_list(weight_shape[0], self.mp_size), + data = child.weight.data.split(get_shard_size_list(weight_shape[0], self.mp_size, name), dim=1 if self.conv_linear_layer else 0) data_dc = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()).clone().detach() del data if child.bias is not None: bias_data = child.bias.data.split(get_shard_size_list( - weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size), + weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size, name), dim=0) bias_data = bias_data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) bias_data_dc = torch.nn.parameter.Parameter(bias_data, requires_grad=False) @@ -369,9 +371,10 @@ def _slice_embedding(self, child, name, conv_linear_layer): mp_replace = ReplaceWithTensorSlicing(mp_group=self.mp_group) if hasattr(child.weight, 'ds_tensor'): - data = child.weight.ds_tensor.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size), dim=1) + data = child.weight.ds_tensor.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size, name), + dim=1) else: - data = child.weight.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size), dim=1) + data = child.weight.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size, name), dim=1) data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) data = torch.nn.parameter.Parameter(data, requires_grad=False) diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py index 969826ad0289..5fb957faa80c 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -48,8 +48,8 @@ def __init__( self.world_size = world_size def forward(self, input): - input_shard_size = get_shard_size(input.shape[-1], self.world_size) - input_shard_offset = sum(get_shard_size_list(input.shape[-1], self.world_size)[0:self.rank]) + input_shard_size = get_shard_size(input.shape[-1], self.world_size, "lm_head") + input_shard_offset = sum(get_shard_size_list(input.shape[-1], self.world_size, "lm_head")[0:self.rank]) output = torch.matmul(input[:, :, input_shard_offset:input_shard_offset + input_shard_size], self.weight.transpose(-1, -2)) if self.mp_group is not None: diff --git a/deepspeed/module_inject/tp_shard.py b/deepspeed/module_inject/tp_shard.py index 77756e43c62f..cf5cd4aafe04 100644 --- a/deepspeed/module_inject/tp_shard.py +++ b/deepspeed/module_inject/tp_shard.py @@ -17,20 +17,23 @@ def get_num_kv_heads(): return num_kv_heads -def get_shard_size(total_size, mp_size, rank=None): +def get_shard_size(total_size, mp_size, name=None, rank=None): global num_kv_heads + last_linear = ["lm_head", "embed_out"] # When we have num_kv_heads defined, uneven division is possible, otherwise enforce near even division if rank == None: rank = dist.get_rank() - if num_kv_heads != 
None and total_size % num_kv_heads == 0: + if num_kv_heads != None and total_size % num_kv_heads == 0 and "mlp" not in str(name) and str( + name) not in last_linear: my_slices = (num_kv_heads // mp_size) + (1 if rank < (num_kv_heads % mp_size) else 0) return total_size * my_slices // num_kv_heads else: - return total_size // mp_size + (1 if rank < (total_size % mp_size) else 0) + grain_size = total_size // 64 + return (grain_size // mp_size + (1 if rank < (grain_size % mp_size) else 0)) * 64 -def get_shard_size_list(total_size, mp_size): +def get_shard_size_list(total_size, mp_size, name=None): shard_sizes = [] for i in range(mp_size): - shard_sizes.append(get_shard_size(total_size, mp_size, i)) + shard_sizes.append(get_shard_size(total_size, mp_size, name, i)) return shard_sizes From 14f5058c36ed42168b91fe6d3e3af439a921128b Mon Sep 17 00:00:00 2001 From: Yejing-Lai <55339926+Yejing-Lai@users.noreply.github.com> Date: Tue, 21 Nov 2023 16:44:29 +0800 Subject: [PATCH 67/74] fix splt shape < 64 issue & add num_kv_heads to mp_params (#33) --- deepspeed/module_inject/auto_tp.py | 2 +- deepspeed/module_inject/replace_module.py | 2 +- deepspeed/module_inject/tp_shard.py | 7 +++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index d930c9f2f413..20dd19fe32a4 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -388,7 +388,7 @@ def update_mp_params(self, child): return for param in [ "n_heads", "inner_dim", "num_heads", "num_kv", "num_attention_heads", "num_attn_heads", - "all_head_size", "embed_dim", "hidden_size", "num_key_value_heads" + "all_head_size", "embed_dim", "hidden_size", "num_key_value_heads", "num_kv_heads" ]: if hasattr(child, param): param_val = getattr(child, param) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 4c7ba75f69d9..54523f54f6fd 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -274,7 +274,7 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): # 3. 
Try to get num_key_heads from model_config.num_key_value_heads num_kv_heads = None - kv_head_names = ['num_key_value_heads', 'num_attention_heads', 'n_heads'] + kv_head_names = ['num_kv_heads', 'num_key_value_heads', 'num_attention_heads', 'n_heads'] for name in kv_head_names: if hasattr(model_config, name): num_kv_heads = getattr(model_config, name) diff --git a/deepspeed/module_inject/tp_shard.py b/deepspeed/module_inject/tp_shard.py index cf5cd4aafe04..4586b8de9b47 100644 --- a/deepspeed/module_inject/tp_shard.py +++ b/deepspeed/module_inject/tp_shard.py @@ -28,8 +28,11 @@ def get_shard_size(total_size, mp_size, name=None, rank=None): my_slices = (num_kv_heads // mp_size) + (1 if rank < (num_kv_heads % mp_size) else 0) return total_size * my_slices // num_kv_heads else: - grain_size = total_size // 64 - return (grain_size // mp_size + (1 if rank < (grain_size % mp_size) else 0)) * 64 + if total_size >= 64: + grain_size = total_size // 64 + return (grain_size // mp_size + (1 if rank < (grain_size % mp_size) else 0)) * 64 + else: + return total_size // mp_size + (1 if rank < (total_size % mp_size) else 0) def get_shard_size_list(total_size, mp_size, name=None): From 8d6043291d99feb7a55a209d5018c664109ce700 Mon Sep 17 00:00:00 2001 From: baodi Date: Thu, 23 Nov 2023 11:02:57 +0800 Subject: [PATCH 68/74] Baodi/support baichuan (#23) * support baichuan model * support baichuan without changing model script --- deepspeed/inference/engine.py | 7 ++++++- deepspeed/module_inject/auto_tp_model_utils.py | 8 ++++++++ deepspeed/module_inject/fusedqkv_utils.py | 3 ++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index 149d20cd9305..d45ffb3abe55 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -26,7 +26,8 @@ from ..module_inject.auto_tp import AutoTP from ..module_inject.replace_policy import generic_policies -from ..module_inject.auto_tp_model_utils import build_bloom_alibi_tensor, build_mpt_atten_bias_tensor, build_mpt_alibi_tensor + +from ..module_inject.auto_tp_model_utils import build_bloom_alibi_tensor, build_mpt_atten_bias_tensor, build_mpt_alibi_tensor, get_alibi_mask from ..ops.transformer.inference.ds_attention import DeepSpeedSelfAttention from ..model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference @@ -220,6 +221,10 @@ def build_alibi_tensor(self): if hasattr(self.module.transformer, 'build_mpt_alibi_tensor'): self.module.transformer.build_mpt_alibi_tensor_orig = self.module.transformer.build_mpt_alibi_tensor self.module.transformer.__class__.build_mpt_alibi_tensor = build_mpt_alibi_tensor + if hasattr(self.module, 'model'): + if hasattr(self.module.model, 'get_alibi_mask'): + self.module.model.get_alibi_mask_orig = self.module.model.get_alibi_mask + self.module.model.__class__.get_alibi_mask = get_alibi_mask def build_attn_bias(self): if hasattr(self.module, 'transformer'): diff --git a/deepspeed/module_inject/auto_tp_model_utils.py b/deepspeed/module_inject/auto_tp_model_utils.py index 51e52e3258dd..46f6f2a4a24b 100644 --- a/deepspeed/module_inject/auto_tp_model_utils.py +++ b/deepspeed/module_inject/auto_tp_model_utils.py @@ -60,6 +60,14 @@ def build_bloom_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype else: return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype) +def get_alibi_mask(self, tensor, seq_length_with_past): + mask = self.get_alibi_mask_orig(tensor, seq_length_with_past) + if not self.training and 
dist.is_initialized(): + num_heads_per_rank = get_shard_size(self.n_head, dist.get_world_size()) + offset = sum(get_shard_size_list(self.n_head, dist.get_world_size())[0:dist.get_rank()]) + mask = mask[offset:num_heads_per_rank + offset, :seq_length_with_past, :seq_length_with_past] + + return mask def build_mpt_atten_bias_tensor(self, device, diff --git a/deepspeed/module_inject/fusedqkv_utils.py b/deepspeed/module_inject/fusedqkv_utils.py index 2e8f6b5917ed..05b3dd368a0e 100644 --- a/deepspeed/module_inject/fusedqkv_utils.py +++ b/deepspeed/module_inject/fusedqkv_utils.py @@ -17,7 +17,7 @@ def split_by_qkvlist_and_refuse(qkv_list, split_size, split_dim=0, cat_dim=0): def require_tp_fused_qkvw(name, mp_size): - fused_qkvw_name_list = ['qkv_proj', 'query_key_value', 'attn.Wqkv'] + fused_qkvw_name_list = ['qkv_proj', 'query_key_value', 'attn.Wqkv', 'self_attn.W_pack'] if mp_size == 1: return False @@ -36,6 +36,7 @@ def prepare_tp_fused_qkvw(module_str, src, mp_size, gpu_index): 'GLMBlock': 'glmtype', "MPTBlock": 'glmtype', "MptBlock": 'glmtype', + "BaichuanLayer": 'glmtype', } def _codegen_type_transpose(input, mp_size, codegen_mp_num=4): From 0ebb1ed3d252ef11ecaef47b22e3471cd86a5c8b Mon Sep 17 00:00:00 2001 From: baodi Date: Thu, 23 Nov 2023 16:12:00 +0800 Subject: [PATCH 69/74] fix Baichuan-7B qkv order error (#35) --- deepspeed/module_inject/fusedqkv_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepspeed/module_inject/fusedqkv_utils.py b/deepspeed/module_inject/fusedqkv_utils.py index 05b3dd368a0e..0fca37c6b5e8 100644 --- a/deepspeed/module_inject/fusedqkv_utils.py +++ b/deepspeed/module_inject/fusedqkv_utils.py @@ -37,6 +37,7 @@ def prepare_tp_fused_qkvw(module_str, src, mp_size, gpu_index): "MPTBlock": 'glmtype', "MptBlock": 'glmtype', "BaichuanLayer": 'glmtype', + "DecoderLayer": 'glmtype', } def _codegen_type_transpose(input, mp_size, codegen_mp_num=4): From 547ac962a7f4722b71357212e9a3bef4f168785e Mon Sep 17 00:00:00 2001 From: Yejing-Lai <55339926+Yejing-Lai@users.noreply.github.com> Date: Thu, 23 Nov 2023 16:46:03 +0800 Subject: [PATCH 70/74] fix baichuan lm_head replace issue (#34) --- deepspeed/module_inject/replace_module.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 54523f54f6fd..76549aee3f8a 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -320,10 +320,13 @@ def set_lm_head(module): module.lm_head, "weight") and module.lm_head.weight.is_meta: module.lm_head.weight = embedding_weight # enable tensor parallel for the last linear - if hasattr(module, "lm_head") and hasattr(module.lm_head, "weight") and not module.lm_head.weight.is_meta: + if hasattr(module, "lm_head") and hasattr(module.lm_head, + "weight") and not module.lm_head.weight.is_meta and isinstance( + module.lm_head, torch.nn.Linear): module = replace_wo_policy(module, ("lm_head", ), 0, "lm_head") elif hasattr(module, "embed_out") and hasattr(module.embed_out, - "weight") and not module.embed_out.weight.is_meta: + "weight") and not module.embed_out.weight.is_meta and isinstance( + module.embed_out, torch.nn.Linear): module = replace_wo_policy(module, ("embed_out", ), 0, "embed_out") return module From cd070bf8ddf6b55c12e545a7945cdd8878515e30 Mon Sep 17 00:00:00 2001 From: Liangliang-Ma <1906710196@qq.com> Date: Mon, 27 Nov 2023 17:29:17 +0800 Subject: [PATCH 71/74] shorten inference_all_reduce call stack (#37) --- 
csrc/cpu/comm/ccl.cpp | 4 ++-- deepspeed/comm/ccl.py | 46 +++++++++++++++++++++++++++++-------------- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 9a04b6f873f2..6428ab5cbfa5 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -537,7 +537,7 @@ static void parallel_memcpy(void* to, void* from, size_t n_bytes) } } -void inference_all_reduce(torch::Tensor& data, py::object op, std::vector group, bool async_op) +void inference_all_reduce(torch::Tensor& data, py::object op, bool async_op) { static py::object ReduceOp = py::module_::import("deepspeed.comm").attr("ReduceOp"); static auto ReduceOpSum = (int)py::int_(ReduceOp.attr("SUM").attr("value")); @@ -562,7 +562,7 @@ void inference_all_reduce(torch::Tensor& data, py::object op, std::vector g data.numel(), get_ccl_datatype(data.scalar_type()), get_ccl_reduce_op(op, data), - _get_comm_from_group(group)) + _get_comm_from_group()) .wait()); return; } diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py index 6e915d0ca430..982e13c1ce1a 100644 --- a/deepspeed/comm/ccl.py +++ b/deepspeed/comm/ccl.py @@ -61,7 +61,8 @@ def is_initialized(self): def run_collective(self, name, **kwargs): if name in self.available_coll: - kwargs['group'] = self.get_all_ranks_from_group(kwargs['group']) + if 'group' in kwargs: + kwargs['group'] = self.get_all_ranks_from_group(kwargs['group']) if 'dst' in kwargs: kwargs['dst'] = kwargs['group'].index(kwargs['dst']) if 'src' in kwargs: @@ -71,23 +72,38 @@ def run_collective(self, name, **kwargs): return CCLHandler(self.ccl_comm_op) else: func = "super(CCLBackend, self)." + name - return eval(func)(*(kwargs.values())) + eval(func)(*(kwargs.values())) + return CCLHandler(self.ccl_comm_op) def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): use_caching = False if use_caching: match_id = f"{tensor.size()}-{op}" - return self.run_collective(name="all_reduce_caching", - tensor=tensor, - op=op, - match_id=match_id, - group=group, - async_op=async_op) + name = "all_reduce_caching" + if name in self.available_coll: + group = self.get_all_ranks_from_group(group) + return self.ccl_comm_op.all_reduce_caching(tensor, op, match_id, group, async_op) + else: + return self.run_collective(name=name, + tensor=tensor, + op=op, + match_id=match_id, + group=group, + async_op=async_op) else: - return self.run_collective(name="all_reduce", tensor=tensor, op=op, group=group, async_op=async_op) + name = "all_reduce" + if name in self.available_coll: + group = self.get_all_ranks_from_group(group) + return self.ccl_comm_op.all_reduce(tensor, op, group, async_op) + else: + return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op) def inference_all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): - return self.run_collective(name="inference_all_reduce", tensor=tensor, op=op, group=group, async_op=async_op) + name = "inference_all_reduce" + if name in self.available_coll: + return self.ccl_comm_op.inference_all_reduce(tensor, op, async_op) + else: + return self.run_collective(name=name, tensor=tensor, op=op, group=None, async_op=async_op) def broadcast(self, tensor, src, group=None, async_op=False): return self.run_collective(name="broadcast", tensor=tensor, src=src, group=group, async_op=async_op) @@ -120,11 +136,11 @@ def all_to_all_single(self, output, input, output_split_sizes, input_split_sizes input_split_sizes=input_split_sizes, group=group) - def send(self, tensor, dst, group=None, 
async_op=False): - return self.run_collective(name="send", tensor=tensor, dst=dst, group=group, async_op=async_op) + def send(self, tensor, dst, group=None, tag=0): + return self.run_collective(name="send", tensor=tensor, dst=dst, group=group, tag=tag) - def recv(self, tensor, src, group=None, async_op=False): - return self.run_collective(name="recv", tensor=tensor, src=src, group=group, async_op=async_op) + def recv(self, tensor, src, group=None, tag=0): + return self.run_collective(name="recv", tensor=tensor, src=src, group=group, tag=tag) def gather(self, tensor, gather_list, dst, group=None, async_op=False): return self.run_collective(name="gather", tensor=tensor, gather_list=gather_list, dst=dst, group=group) @@ -170,7 +186,7 @@ def get_all_ranks_from_group(self, group): while True: results.append(super(CCLBackend, self).get_global_rank(group, rank)) rank += 1 - except ValueError: + except (ValueError, RuntimeError): pass if tuple(results) not in self.groups: self._new_group(results, group) From e8ab894537a0747e11b5327050622061da1a372b Mon Sep 17 00:00:00 2001 From: Yejing-Lai <55339926+Yejing-Lai@users.noreply.github.com> Date: Fri, 15 Dec 2023 15:36:57 +0800 Subject: [PATCH 72/74] Enable starcode autotp (#38) * enable starcode autotp * add get_n_embd --- deepspeed/module_inject/fusedqkv_utils.py | 17 +++++++++++++++-- deepspeed/module_inject/replace_module.py | 14 +++++++++++++- deepspeed/module_inject/tp_shard.py | 10 ++++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/deepspeed/module_inject/fusedqkv_utils.py b/deepspeed/module_inject/fusedqkv_utils.py index 0fca37c6b5e8..c1b6d229dcf5 100644 --- a/deepspeed/module_inject/fusedqkv_utils.py +++ b/deepspeed/module_inject/fusedqkv_utils.py @@ -4,7 +4,7 @@ # DeepSpeed Team import torch from deepspeed.utils.logging import warning_once -from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list, get_num_kv_heads +from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list, get_num_kv_heads, get_n_embd import re @@ -17,7 +17,8 @@ def split_by_qkvlist_and_refuse(qkv_list, split_size, split_dim=0, cat_dim=0): def require_tp_fused_qkvw(name, mp_size): - fused_qkvw_name_list = ['qkv_proj', 'query_key_value', 'attn.Wqkv', 'self_attn.W_pack'] + # 'c_attn' is for starcoder + fused_qkvw_name_list = ['qkv_proj', 'query_key_value', 'attn.Wqkv', 'self_attn.W_pack', 'c_attn'] if mp_size == 1: return False @@ -38,6 +39,7 @@ def prepare_tp_fused_qkvw(module_str, src, mp_size, gpu_index): "MptBlock": 'glmtype', "BaichuanLayer": 'glmtype', "DecoderLayer": 'glmtype', + "GPTBigCodeBlock": 'bigcodetype' # starcoder } def _codegen_type_transpose(input, mp_size, codegen_mp_num=4): @@ -74,6 +76,15 @@ def _bloom_type_transpose(input, mp_size): split_fusedqkv = input.split(get_shard_size_list(shape[0], mp_size), dim=0) return split_fusedqkv[gpu_index] + def _bigcode_type_transpose(input, mp_size): + n_embd = get_n_embd() + q = input[:n_embd] + kv = input[n_embd:] + shape = q.shape + + split_q = q.split(get_shard_size_list(shape[0], mp_size), dim=0) + return torch.cat((split_q[gpu_index], kv), dim=0) + def _transpose_fused_qkvw(src, mp_size, fused_qkv_type=None): # suppose num_heads=n, q(n)_w means the n-th q head linear weight, the weight format are as following @@ -87,6 +98,8 @@ def _transpose_fused_qkvw(src, mp_size, fused_qkv_type=None): return _codegen_type_transpose(src, mp_size) elif fused_qkv_type == 'glmtype': return _glm_type_transpose(src, mp_size) + elif fused_qkv_type == 'bigcodetype': 
+ return _bigcode_type_transpose(src, mp_size) raise ValueError("unknown fused_qkv_type") diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 76549aee3f8a..2e603fa5ebab 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -16,7 +16,7 @@ from .auto_tp import AutoTP, ReplaceWithTensorSlicing, Loading from deepspeed import comm as dist -from deepspeed.module_inject.tp_shard import set_num_kv_heads +from deepspeed.module_inject.tp_shard import set_num_kv_heads, set_n_embd from .load_checkpoint import load_model_with_checkpoint import time @@ -281,6 +281,18 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): if num_kv_heads != None: break + # 4.1 Get n_embd + n_embd = None + multi_query_n_embd_names = ['n_embd'] + for name in multi_query_n_embd_names: + if hasattr(model_config, name): + n_embd = getattr(model_config, name) + if n_embd != None: + break + + # 4.2 set n_embd + set_n_embd(n_embd) + # 5. When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division set_num_kv_heads(num_kv_heads) diff --git a/deepspeed/module_inject/tp_shard.py b/deepspeed/module_inject/tp_shard.py index 4586b8de9b47..f838e03a2e3e 100644 --- a/deepspeed/module_inject/tp_shard.py +++ b/deepspeed/module_inject/tp_shard.py @@ -12,11 +12,21 @@ def set_num_kv_heads(num): num_kv_heads = num +def set_n_embd(num): + global n_embd + n_embd = num + + def get_num_kv_heads(): global num_kv_heads return num_kv_heads +def get_n_embd(): + global n_embd + return n_embd + + def get_shard_size(total_size, mp_size, name=None, rank=None): global num_kv_heads last_linear = ["lm_head", "embed_out"] From 092b0f26d8fa65000a9c941b8d3f94414e8685f6 Mon Sep 17 00:00:00 2001 From: Yejing-Lai <55339926+Yejing-Lai@users.noreply.github.com> Date: Thu, 4 Jan 2024 10:02:51 +0800 Subject: [PATCH 73/74] fix falcon-40b accuracy issue (#39) --- deepspeed/module_inject/fusedqkv_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepspeed/module_inject/fusedqkv_utils.py b/deepspeed/module_inject/fusedqkv_utils.py index c1b6d229dcf5..24f7dbdd39d8 100644 --- a/deepspeed/module_inject/fusedqkv_utils.py +++ b/deepspeed/module_inject/fusedqkv_utils.py @@ -37,6 +37,7 @@ def prepare_tp_fused_qkvw(module_str, src, mp_size, gpu_index): 'GLMBlock': 'glmtype', "MPTBlock": 'glmtype', "MptBlock": 'glmtype', + "FalconDecoderLayer": 'bloomtype', "BaichuanLayer": 'glmtype', "DecoderLayer": 'glmtype', "GPTBigCodeBlock": 'bigcodetype' # starcoder From 94873fe12cab7fc3b5729609e7e237de6e4b3951 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Thu, 4 Jan 2024 14:02:16 +0800 Subject: [PATCH 74/74] fix t5 and mistral model load from config meta tensor bug (#42) --- deepspeed/module_inject/auto_tp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index 20dd19fe32a4..10bc1bb94140 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -122,7 +122,7 @@ class Loading(): def is_load_module(module): load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm] load_layer_names = [ - "LPLayerNorm", "SharedEmbedding", "OPTLearnedPositionalEmbedding", "LlamaRMSNorm", "RMSNorm" + "LPLayerNorm", "SharedEmbedding", "OPTLearnedPositionalEmbedding", "LlamaRMSNorm", "RMSNorm", "MistralRMSNorm", "T5LayerNorm", ] return module.__class__ in load_layers or module._get_name() in load_layer_names
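
The patches above revolve around two ideas: shard sizes are computed at key-value-head granularity, so head counts that do not divide evenly by the tensor-parallel world size still yield a valid partition, and the last linear (lm_head / embed_out) runs as an input-sharded matmul whose partial products are combined with an inference all-reduce. The standalone Python sketch below re-implements that arithmetic outside DeepSpeed purely for illustration; the helper names, example sizes, and the loop that stands in for the all-reduce are assumptions of the sketch rather than part of the patches, and the 64-element granularity fallback added in the later commits is omitted.

# A minimal, self-contained sketch of the sharding arithmetic used in this patch
# series.  shard_size()/shard_size_list() mirror get_shard_size() and
# get_shard_size_list() from deepspeed/module_inject/tp_shard.py (without the
# 64-element granularity fallback).  torch.distributed is not used: summing the
# per-rank partial outputs in a loop stands in for the inference all-reduce that
# LmHeadLinearAllreduce issues on the mp_group.  All sizes are made up.

import torch


def shard_size(total_size, mp_size, num_kv_heads=None, rank=0):
    # Head-granular split: ranks with rank < (num_kv_heads % mp_size) get one
    # extra KV head, so e.g. 40 heads on 3 ranks become 14/13/13.
    if num_kv_heads is not None and total_size % num_kv_heads == 0:
        my_heads = num_kv_heads // mp_size + (1 if rank < num_kv_heads % mp_size else 0)
        return total_size * my_heads // num_kv_heads
    # Fallback (used for lm_head/embed_out): near-even split of the raw size.
    return total_size // mp_size + (1 if rank < total_size % mp_size else 0)


def shard_size_list(total_size, mp_size, num_kv_heads=None):
    return [shard_size(total_size, mp_size, num_kv_heads, r) for r in range(mp_size)]


if __name__ == "__main__":
    mp = 3                        # tensor-parallel world size (odd on purpose)
    hidden, kv_heads = 5120, 40   # hidden size and number of KV heads
    attn_sizes = shard_size_list(hidden, mp, kv_heads)   # head-granular shards
    print(attn_sizes, sum(attn_sizes) == hidden)         # [1792, 1664, 1664] True

    lm_sizes = shard_size_list(hidden, mp)               # near-even lm_head shards
    print(lm_sizes)                                      # [1707, 1707, 1706]

    # LmHeadLinearAllreduce: each rank keeps a column slice of lm_head.weight and
    # multiplies it with the matching slice of the activation; all-reducing the
    # partial products reproduces the dense matmul.
    vocab = 1000
    x = torch.randn(2, 4, hidden, dtype=torch.float64)
    w = torch.randn(vocab, hidden, dtype=torch.float64)  # lm_head weight
    reference = torch.matmul(x, w.transpose(-1, -2))

    summed = torch.zeros_like(reference)
    offset = 0
    for rank, size in enumerate(lm_sizes):
        x_shard = x[:, :, offset:offset + size]
        w_shard = w[:, offset:offset + size]
        summed += torch.matmul(x_shard, w_shard.transpose(-1, -2))  # all-reduce stand-in
        offset += size
    print(torch.allclose(reference, summed))             # True

    # Multi-query fused QKV (GPTBigCode / starcoder): the first n_embd rows of the
    # fused c_attn weight are Q, the trailing 2*head_dim rows are the single shared
    # K/V head.  Each rank keeps only its Q slice plus a full copy of K/V, which is
    # what the bigcodetype branch of prepare_tp_fused_qkvw does.
    n_heads, head_dim = 10, 128
    n_embd = n_heads * head_dim
    c_attn_w = torch.randn(n_embd + 2 * head_dim, n_embd)
    q, kv = c_attn_w[:n_embd], c_attn_w[n_embd:]
    q_shards = torch.split(q, shard_size_list(n_embd, mp, n_heads), dim=0)
    rank0_w = torch.cat((q_shards[0], kv), dim=0)
    print(list(rank0_w.shape))                           # [768, 1280], i.e. 512 + 2*128 rows

Running the sketch prints the per-rank shard sizes for an odd world size, confirms that the sharded lm_head matmul matches the dense result, and shows the per-rank shape of a multi-query fused QKV weight.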