From 7d7271f8d2c6a6cb174ccf78e35a04e9b0f8faad Mon Sep 17 00:00:00 2001
From: braf
Date: Mon, 16 Oct 2023 19:35:28 +0000
Subject: [PATCH] Change to use lists and mean()

---
 model_analyzer/perf_analyzer/perf_analyzer.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py
index 83e95f763..e508934d8 100755
--- a/model_analyzer/perf_analyzer/perf_analyzer.py
+++ b/model_analyzer/perf_analyzer/perf_analyzer.py
@@ -22,6 +22,7 @@
 import re
 import signal
 import tempfile
+from statistics import mean
 from subprocess import STDOUT, Popen
 from typing import Dict, List
 
@@ -532,34 +533,28 @@ def _extract_llm_records(self, perf_config, metrics):
         self._llm_records[perf_config["model-name"]].append(record)
 
     def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float:
-        total_first_token_latency = 0
+        total_first_token_latencies = []
         for request in llm_output["experiments"][0]["requests"]:
-            total_first_token_latency += (
+            total_first_token_latencies.append(
                 request["response_timestamps"][0] - request["timestamp"]
             )
-        avg_first_token_latency = total_first_token_latency / len(
-            llm_output["experiments"][0]["requests"]
-        )
+        avg_first_token_latency = mean(total_first_token_latencies)
 
         return avg_first_token_latency
 
     def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float:
-        total_token_to_token_latency = 0.0
+        token_to_token_latencies = []
         for request in llm_output["experiments"][0]["requests"]:
-            total_response_to_response_latency = 0
+            response_to_response_latencies = []
             prev_response = request["response_timestamps"][0]
             for response in request["response_timestamps"][1:]:
-                total_response_to_response_latency = response - prev_response
+                response_to_response_latencies.append(response - prev_response)
                 prev_response = response
-            total_token_to_token_latency += total_response_to_response_latency / len(
-                request["response_timestamps"]
-            )
+            token_to_token_latencies.append(mean(response_to_response_latencies))
 
-        avg_token_to_token_latency = total_token_to_token_latency / len(
-            llm_output["experiments"][0]["requests"]
-        )
+        avg_token_to_token_latency = mean(token_to_token_latencies)
 
         return avg_token_to_token_latency
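
Note (not part of the commit): below is a minimal, runnable sketch of the same averaging logic, restated with comprehensions and zip() for brevity. It assumes only the llm_output layout visible in the diff ("experiments" -> "requests" -> "timestamp" / "response_timestamps"); the function names and sample timestamps are hypothetical. Two behavioral points worth flagging: the old code assigned (=) rather than accumulated the response-to-response latency, so it effectively kept only the last gap per request, which the list-based rewrite corrects; and statistics.mean() raises StatisticsError on an empty sequence, so a request with a single response timestamp now fails where the old code would have produced 0.

# Standalone sketch of the patched averaging logic -- illustrative only.
# The llm_output shape comes from the diff above; the function names and
# sample data here are hypothetical.
from statistics import mean
from typing import Dict


def avg_first_token_latency(llm_output: Dict) -> float:
    # One latency per request: first response timestamp minus request start.
    latencies = [
        request["response_timestamps"][0] - request["timestamp"]
        for request in llm_output["experiments"][0]["requests"]
    ]
    return mean(latencies)


def avg_token_to_token_latency(llm_output: Dict) -> float:
    # Mean gap between consecutive responses within each request, then the
    # mean of those per-request means. statistics.mean() raises
    # StatisticsError on an empty sequence, so a request with only one
    # response timestamp would fail here (as in the patched method).
    per_request_means = []
    for request in llm_output["experiments"][0]["requests"]:
        stamps = request["response_timestamps"]
        gaps = [b - a for a, b in zip(stamps, stamps[1:])]
        per_request_means.append(mean(gaps))
    return mean(per_request_means)


if __name__ == "__main__":
    # Hypothetical data: two requests, timestamps in arbitrary time units.
    llm_output = {
        "experiments": [
            {
                "requests": [
                    {"timestamp": 0, "response_timestamps": [10, 30, 60]},
                    {"timestamp": 5, "response_timestamps": [20, 35, 65]},
                ]
            }
        ]
    }
    print(avg_first_token_latency(llm_output))     # (10 + 15) / 2 = 12.5
    print(avg_token_to_token_latency(llm_output))  # (25 + 22.5) / 2 = 23.75

One further consequence of the switch, visible in the diff itself: the old code divided by len(request["response_timestamps"]) (N), while mean() over the gaps divides by N - 1, so the two versions differ slightly even for requests the old accumulation handled.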