Fix divide by zero and unit conversion for throughput display (#500)

* Fix divide by zero
* Convert ns to secs for throughput metrics
triton-inference-server · Mar 8, 2024 · 58b2a74 · 58b2a74
1 parent 8893fe6
commit 58b2a74
Showing 1 changed file with 9 additions and 3 deletions.
diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
@@ -337,7 +337,8 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
 
             # request latencies
             req_latency = res_timestamps[-1] - req_timestamp
-            request_latencies.append(req_latency)
+            request_latencies.append(req_latency)  # nanosec
+            req_latency = req_latency / 1e9  # sec
 
             # time to first token
             time_to_first_tokens.append(res_timestamps[0] - req_timestamp)
@@ -351,10 +352,15 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
 
             # inter token latency
             for (t1, _), (t2, n2) in pairwise(zip(res_timestamps, num_output_tokens)):
-                inter_token_latencies.append(round((t2 - t1) / n2))
+                # TMA-1676: handle empty first/last responses
+                # If the latter response of a pair has zero tokens (e.g. an
+                # empty string), default its token count to one so that the
+                # inter-token latency calculation does not divide by zero.
+                num_token = 1 if n2 == 0 else n2
+                inter_token_latencies.append(round((t2 - t1) / num_token))
 
         # request throughput
-        benchmark_duration = max_res_timestamp - min_req_timestamp
+        benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9  # sec (converted from nanosec)
         request_throughputs = [len(requests) / benchmark_duration]
 
         return LLMMetrics(