Skip to content

Commit

Permalink
[micro_perf] use less memory to avoid using the L2 cache; fix device2host / host2device bug.
Browse files Browse the repository at this point in the history
  • Loading branch information
suisiyuan committed Aug 29, 2024
1 parent 22e9b39 commit 8f5ae7c
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 31 deletions.
27 changes: 22 additions & 5 deletions byte_micro_perf/backends/GPU/backend_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,10 +211,17 @@ def build_tensor(self, input_shapes, dtype):
bytes_per_cnt = dtype_size * element_num


# compute max avail tensors for compute
avail_bytes = (self.memory_limit - 4) * 1024**3
avail_cnts = avail_bytes // bytes_per_cnt
max_data_cnt = min(self.iterations, avail_cnts)
# Avoid the L2 cache (assumed to be 256 MB for now):
# if bytes_per_cnt > 256 MB, a single buffer is enough;
# if bytes_per_cnt < 256 MB, allocate enough buffers for the total to exceed 256 MB, then keep only the first and last buffers.

assume_l2_cache_size = 256 * 1024**2
if bytes_per_cnt > self.memory_limit * 0.9 * 1024 ** 3:
return [], 0, bytes_per_cnt
elif bytes_per_cnt > assume_l2_cache_size:
max_data_cnt = 1
else:
max_data_cnt = math.ceil(assume_l2_cache_size / bytes_per_cnt)

# create input tensors for each op
input_tensors_list = []
Expand All @@ -241,7 +248,17 @@ def build_tensor(self, input_shapes, dtype):
self.op.process_inputs(*(input_tensor))
for input_tensor in input_tensors_list
]
return input_tensors_list, max_data_cnt, bytes_per_cnt


if max_data_cnt > 2:
max_data_cnt = 2
new_tensor_list = []
new_tensor_list.append(input_tensors_list[0])
new_tensor_list.append(input_tensors_list[-1])
else:
new_tensor_list = input_tensors_list

return new_tensor_list, max_data_cnt, bytes_per_cnt



Expand Down
23 changes: 3 additions & 20 deletions byte_micro_perf/backends/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,30 +222,17 @@ def perf(self, input_shapes: List[List[int]], dtype):

if tensor_cnt > 0:
try:
# random select input tensors
input_index_list = [
random.randint(0, tensor_cnt - 1) for _ in range(self.iterations)
]

# warmup
num_warm_up = 10
for _ in range(num_warm_up):
self._run_operation(self.op, tensor_list[0])


# ccl ops need barrier
if self.op_name in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast", "p2p"]:
self.barier()

# test perf
num_test_perf = 5
num_test_perf = 10
self.device_synchronize()
start_time = time.perf_counter_ns()
for i in range(num_test_perf):
self._run_operation(
self.op,
tensor_list[input_index_list[i]]
)
self._run_operation(self.op, tensor_list[0])
self.device_synchronize()
end_time = time.perf_counter_ns()

Expand All @@ -257,7 +244,6 @@ def perf(self, input_shapes: List[List[int]], dtype):
else:
prefer_iterations = min(max(int(max_perf_seconds // op_duration), 10), self.iterations)


# ccl ops need barrier
if self.op_name in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast", "p2p"]:
self.barier()
Expand All @@ -266,10 +252,7 @@ def perf(self, input_shapes: List[List[int]], dtype):
self.device_synchronize()
start_time = time.perf_counter_ns()
for i in range(prefer_iterations):
self._run_operation(
self.op,
tensor_list[input_index_list[i]]
)
self._run_operation(self.op, tensor_list[i % tensor_cnt])
self.device_synchronize()
end_time = time.perf_counter_ns()

Expand Down
8 changes: 2 additions & 6 deletions byte_micro_perf/backends/module_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,7 @@ def custom_create_tensors(self, input_shapes, torch_dtype, xpu_device):
device_tensor = torch.randn(input_shapes[0], dtype=torch_dtype, device=xpu_device)
return [host_tensor, device_tensor]

def forward(self, input_tensors):
host_tensor = input_tensors[0]
device_tensor = input_tensors[1]
def forward(self, host_tensor, device_tensor):
    """Copy `host_tensor` into `device_tensor` (non-blocking) and return the destination tensor."""
    dst = device_tensor
    dst.copy_(host_tensor, non_blocking=True)
    return dst

Expand All @@ -199,9 +197,7 @@ def custom_create_tensors(self, input_shapes, torch_dtype, xpu_device):
host_tensor = torch.randn(input_shapes[0], dtype=torch_dtype, device="cpu")
return [device_tensor, host_tensor]

def forward(self, input_tensors):
device_tensor = input_tensors[0]
host_tensor= input_tensors[1]
def forward(self, device_tensor, host_tensor):
    """Copy `device_tensor` into `host_tensor` (non-blocking) and return the destination tensor."""
    dst = host_tensor
    dst.copy_(device_tensor, non_blocking=True)
    return dst

Expand Down

0 comments on commit 8f5ae7c

Please sign in to comment.