From 8f5ae7cdad6cc413c1d24f2aba745e5f1ffe9dd6 Mon Sep 17 00:00:00 2001
From: jiangzishan
Date: Fri, 30 Aug 2024 03:07:52 +0800
Subject: [PATCH] [micro_perf] use less memory to avoid using L2 Cache; fix
 device2host/host2device bugs.

---
 byte_micro_perf/backends/GPU/backend_gpu.py | 27 +++++++++++++++++----
 byte_micro_perf/backends/backend.py         | 23 +++---------------
 byte_micro_perf/backends/module_store.py    |  8 ++----
 3 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/byte_micro_perf/backends/GPU/backend_gpu.py b/byte_micro_perf/backends/GPU/backend_gpu.py
index 3d4bb033..1fb5baf0 100644
--- a/byte_micro_perf/backends/GPU/backend_gpu.py
+++ b/byte_micro_perf/backends/GPU/backend_gpu.py
@@ -211,10 +211,17 @@ def build_tensor(self, input_shapes, dtype):
 
         bytes_per_cnt = dtype_size * element_num
 
-        # compute max avail tensors for compute
-        avail_bytes = (self.memory_limit - 4) * 1024**3
-        avail_cnts = avail_bytes // bytes_per_cnt
-        max_data_cnt = min(self.iterations, avail_cnts)
+        # avoid using the L2 cache (assume 256 MB for now):
+        # bytes_per_cnt > 256 MB: a single buffer already exceeds L2
+        # bytes_per_cnt < 256 MB: allocate enough buffers to exceed 256 MB, then keep only the first and last
+
+        assume_l2_cache_size = 256 * 1024**2
+        if bytes_per_cnt > self.memory_limit * 0.9 * 1024**3:
+            return [], 0, bytes_per_cnt
+        elif bytes_per_cnt > assume_l2_cache_size:
+            max_data_cnt = 1
+        else:
+            max_data_cnt = math.ceil(assume_l2_cache_size / bytes_per_cnt)
 
         # create input tensors for each op
         input_tensors_list = []
@@ -241,7 +248,17 @@ def build_tensor(self, input_shapes, dtype):
             self.op.process_inputs(*(input_tensor))
             for input_tensor in input_tensors_list
         ]
-        return input_tensors_list, max_data_cnt, bytes_per_cnt
+
+        # keep at most two buffers (first and last) to cap memory while still defeating L2 reuse
+        if max_data_cnt > 2:
+            max_data_cnt = 2
+            new_tensor_list = []
+            new_tensor_list.append(input_tensors_list[0])
+            new_tensor_list.append(input_tensors_list[-1])
+        else:
+            new_tensor_list = input_tensors_list
+
+        return new_tensor_list, max_data_cnt, bytes_per_cnt
 
diff --git a/byte_micro_perf/backends/backend.py b/byte_micro_perf/backends/backend.py
index e684ab41..59cfa6df 100644
--- a/byte_micro_perf/backends/backend.py
+++ b/byte_micro_perf/backends/backend.py
@@ -222,30 +222,17 @@ def perf(self, input_shapes: List[List[int]], dtype):
 
         if tensor_cnt > 0:
             try:
-                # random select input tensors
-                input_index_list = [
-                    random.randint(0, tensor_cnt - 1) for _ in range(self.iterations)
-                ]
-
                 # warmup
                 num_warm_up = 10
                 for _ in range(num_warm_up):
                     self._run_operation(self.op, tensor_list[0])
-
-                # ccl ops need barrier
-                if self.op_name in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast", "p2p"]:
-                    self.barier()
-
                 # test perf
-                num_test_perf = 5
+                num_test_perf = 10
                 self.device_synchronize()
                 start_time = time.perf_counter_ns()
                 for i in range(num_test_perf):
-                    self._run_operation(
-                        self.op,
-                        tensor_list[input_index_list[i]]
-                    )
+                    self._run_operation(self.op, tensor_list[0])
                 self.device_synchronize()
                 end_time = time.perf_counter_ns()
 
@@ -257,7 +244,6 @@ def perf(self, input_shapes: List[List[int]], dtype):
             else:
                 prefer_iterations = min(max(int(max_perf_seconds // op_duration), 10), self.iterations)
 
-            # ccl ops need barrier
             if self.op_name in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast", "p2p"]:
                 self.barier()
 
@@ -266,10 +252,7 @@ def perf(self, input_shapes: List[List[int]], dtype):
             self.device_synchronize()
             start_time = time.perf_counter_ns()
             for i in range(prefer_iterations):
-                self._run_operation(
-                    self.op,
-                    tensor_list[input_index_list[i]]
-                )
+                self._run_operation(self.op, tensor_list[i % tensor_cnt])
             self.device_synchronize()
             end_time = time.perf_counter_ns()
 
diff --git a/byte_micro_perf/backends/module_store.py b/byte_micro_perf/backends/module_store.py
index 9f4293a2..630d2db0 100644
--- a/byte_micro_perf/backends/module_store.py
+++ b/byte_micro_perf/backends/module_store.py
@@ -183,9 +183,7 @@ def custom_create_tensors(self, input_shapes, torch_dtype, xpu_device):
         device_tensor = torch.randn(input_shapes[0], dtype=torch_dtype, device=xpu_device)
         return [host_tensor, device_tensor]
 
-    def forward(self, input_tensors):
-        host_tensor = input_tensors[0]
-        device_tensor = input_tensors[1]
+    def forward(self, host_tensor, device_tensor):
         device_tensor.copy_(host_tensor, non_blocking=True)
         return device_tensor
 
@@ -199,9 +197,7 @@ def custom_create_tensors(self, input_shapes, torch_dtype, xpu_device):
         host_tensor = torch.randn(input_shapes[0], dtype=torch_dtype, device="cpu")
         return [device_tensor, host_tensor]
 
-    def forward(self, input_tensors):
-        device_tensor = input_tensors[0]
-        host_tensor= input_tensors[1]
+    def forward(self, device_tensor, host_tensor):
         host_tensor.copy_(device_tensor, non_blocking=True)
         return host_tensor
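Reviewer note (a sketch, not part of the patch): the buffer-count heuristic in the first hunk can be read in isolation as below. The 256 MB L2 size and the 0.9 memory-limit margin mirror the patch; MEMORY_LIMIT_GB is a hypothetical stand-in for self.memory_limit.

import math

MEMORY_LIMIT_GB = 80                   # hypothetical stand-in for self.memory_limit
ASSUMED_L2_CACHE_SIZE = 256 * 1024**2  # the patch assumes a 256 MB L2 cache

def max_buffer_count(bytes_per_cnt: int) -> int:
    # One tensor set would not fit in ~90% of device memory: allocate nothing.
    if bytes_per_cnt > MEMORY_LIMIT_GB * 0.9 * 1024**3:
        return 0
    # One set is already bigger than L2, so repeated runs cannot be served from cache.
    if bytes_per_cnt > ASSUMED_L2_CACHE_SIZE:
        return 1
    # Otherwise allocate enough sets to span the whole assumed L2 cache.
    return math.ceil(ASSUMED_L2_CACHE_SIZE / bytes_per_cnt)

print(max_buffer_count(1024**2))        # 256 sets of 1 MB span the assumed L2
print(max_buffer_count(512 * 1024**2))  # one 512 MB set already exceeds L2 -> 1

build_tensor then caps the count at 2, keeping only the first and last allocation, so the retained pair is as far apart in memory as possible while total allocation stays small.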
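Similarly, a minimal sketch of the fixed measurement loop (third hunk of backend.py), assuming a CUDA device is available; torch.relu and the two plain tensors are illustrative stand-ins for self._run_operation and the per-op input tuples, and 100 iterations is an arbitrary choice.

import time
import torch

# Two 256 MB buffers: together they exceed a 256 MB L2, so alternating
# between them forces DRAM traffic instead of cache hits.
tensor_list = [torch.randn(64 * 1024**2, device="cuda") for _ in range(2)]
tensor_cnt = len(tensor_list)
prefer_iterations = 100

torch.cuda.synchronize()
start_time = time.perf_counter_ns()
for i in range(prefer_iterations):
    # i % tensor_cnt cycles through the retained buffers, mirroring
    # self._run_operation(self.op, tensor_list[i % tensor_cnt]) in the patch.
    torch.relu(tensor_list[i % tensor_cnt])
torch.cuda.synchronize()
end_time = time.perf_counter_ns()

print(f"avg latency: {(end_time - start_time) / prefer_iterations / 1e3:.2f} us")

Cycling deterministically replaces the old random input_index_list: with only two retained buffers, i % tensor_cnt guarantees consecutive iterations touch different memory, which random indexing did not.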