Merge pull request bytedance#65 from bytedance/optimize_micro_report

[micro_perf] fix host2device process_inputs bug; add tensor shape to micro_perf report.
hliuca · Apr 16, 2024 · 041c295
2 parents 713a0d5 + 772ca20
commit 041c295
Show file tree

Hide file tree

Showing 4 changed files with 14 additions and 7 deletions.
diff --git a/byte_micro_perf/backends/backend.py b/byte_micro_perf/backends/backend.py
@@ -131,17 +131,22 @@ def perf(self, input_shapes: List[List[int]], dtype):
         ]
 
         # warmup
-        for _ in range(10):
+        num_warm_up = 10
+        for _ in range(num_warm_up):
             self._run_operation(self.op, inputs_list[0])
-        self.device_synchronize()
 
-        start_time = time.time()
+        # perf
+        self.device_synchronize()
+        start_time = time.perf_counter_ns()
         for i in range(self.iterations):
             result = self._run_operation(self.op, inputs_list[input_index_list[i]])
         self.device_synchronize()
-        execution_time = time.time() - start_time
+        end_time = time.perf_counter_ns()
+
+        # time in us
+        exec_time = (end_time - start_time) / 1e3
+        latency = round(exec_time / self.iterations, 2)
 
-        latency = round(execution_time * 1e6 / self.iterations, 2)
         if self.op_name in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast"]:
             report = dump_communication_ops_report(
                 self.op_name,

diff --git a/byte_micro_perf/backends/module_store.py b/byte_micro_perf/backends/module_store.py
@@ -228,7 +228,7 @@ def __init__(self, xpu_device):
 
     def process_inputs(self, input_tensors):
         new_inputs = input_tensors.cpu()
-        return new_inputs
+        return [new_inputs]
 
     def forward(self, input_tensors):
         assert input_tensors.device.type == "cpu"

diff --git a/byte_micro_perf/backends/utils.py b/byte_micro_perf/backends/utils.py
@@ -48,6 +48,7 @@ def dump_communication_ops_report(
 
     report = {
         "Dtype": dtype,
+        "Tensor Shapes": input_shapes, 
         "Memory Size(MB)": round(mb, 2),
         "Group": group_size,
         "Kernel bandwidth(GB/s)": round(algo_bw, 2),
@@ -99,6 +100,7 @@ def dump_computation_ops_report(
 
     report = {
         "Dtype": dtype,
+        "Tensor Shapes": input_shapes, 
         "Memory Size(MB)": round(mb, 2),
         "Kernel bandwidth(GB/s)": round(algo_bw, 2),
         "Bandwidth Utilization(%)": bandwidth_utils,

diff --git a/byte_micro_perf/requirements.txt b/byte_micro_perf/requirements.txt
@@ -1,6 +1,6 @@
 matplotlib
 pandas
-virtualenv
+virtualenv==16.7.12
 scikit-learn
 prompt_toolkit
 tqdm