diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
index 55e4dd9a3..5854f32e8 100755
--- a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
+++ b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
@@ -337,7 +337,8 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
 
             # request latencies
             req_latency = res_timestamps[-1] - req_timestamp
-            request_latencies.append(req_latency)
+            request_latencies.append(req_latency)  # nanosec
+            req_latency = req_latency / 1e9  # sec
 
             # time to first token
             time_to_first_tokens.append(res_timestamps[0] - req_timestamp)
@@ -351,10 +352,15 @@ def _parse_requests(self, requests: dict) -> LLMMetrics:
 
             # inter token latency
             for (t1, _), (t2, n2) in pairwise(zip(res_timestamps, num_output_tokens)):
-                inter_token_latencies.append(round((t2 - t1) / n2))
+                # TMA-1676: handle empty first/last responses
+                # if the latter response has zero tokens (e.g. an empty string),
+                # default it to one token for the inter token latency
+                # calculation, to avoid division by zero.
+                num_token = 1 if n2 == 0 else n2
+                inter_token_latencies.append(round((t2 - t1) / num_token))
 
         # request throughput
-        benchmark_duration = max_res_timestamp - min_req_timestamp
+        benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9  # sec
         request_throughputs = [len(requests) / benchmark_duration]
 
         return LLMMetrics(
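
For reference, a minimal standalone sketch of the two behavioral changes in this diff: the zero-token guard in the inter token latency loop and the nanosecond-to-second conversion used for request throughput. It assumes itertools.pairwise (Python >= 3.10) in place of whatever pairwise helper llm_metrics.py actually imports, and the timestamp/token values below are illustrative, not taken from the PR.

from itertools import pairwise  # assumes Python >= 3.10

# Illustrative inputs: nanosecond timestamps of three responses and the
# number of tokens in each response (the last response is empty).
res_timestamps = [1_000_000_000, 1_500_000_000, 2_100_000_000]
num_output_tokens = [5, 10, 0]

# Inter token latency with the TMA-1676 guard: a zero-token response is
# treated as one token so the division below can never divide by zero.
inter_token_latencies = []
for (t1, _), (t2, n2) in pairwise(zip(res_timestamps, num_output_tokens)):
    num_token = 1 if n2 == 0 else n2
    inter_token_latencies.append(round((t2 - t1) / num_token))
print(inter_token_latencies)  # [50000000, 600000000] (nanosec)

# Request throughput: the benchmark window is measured in nanoseconds,
# so it is scaled to seconds before dividing the request count by it.
min_req_timestamp, max_res_timestamp = 1_000_000_000, 2_100_000_000
benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9  # sec
request_throughputs = [1 / benchmark_duration]  # 1 stands in for len(requests)
print(request_throughputs)  # ~[0.909] requests/sec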