diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py
index 5d68eb916..0075aa6b9 100755
--- a/model_analyzer/perf_analyzer/perf_analyzer.py
+++ b/model_analyzer/perf_analyzer/perf_analyzer.py
@@ -236,7 +236,7 @@ def get_llm_records(self):
         if self._llm_records:
             return self._llm_records
         raise TritonModelAnalyzerException(
-            "Attempted to get perf_analyzer results" "without calling run first."
+            "Attempted to get perf_analyzer results without calling run first."
         )

     def output(self):
@@ -514,11 +514,11 @@ def _extract_llm_records(self, perf_config, metrics):
         with open(perf_config["profile-export-file"], mode="r") as f:
             llm_output = json.load(f)

-            avg_first_token_to_token_latency = (
-                self._calculate_avg_first_token_to_token_latency(llm_output)
+            avg_first_token_latency = self._calculate_avg_first_token_latency(
+                llm_output
             )
             record = PerfAnalyzer.llm_metric_table[0][PerfAnalyzer.RECORD_CLASS](
-                value=avg_first_token_to_token_latency
+                value=avg_first_token_latency
             )  # type: ignore
             self._llm_records[perf_config["model-name"]].append(record)

@@ -531,31 +531,33 @@ def _extract_llm_records(self, perf_config, metrics):
             )  # type: ignore
             self._llm_records[perf_config["model-name"]].append(record)

-    def _calculate_avg_first_token_to_token_latency(self, llm_output: Dict) -> float:
+    def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float:
         total_first_token_latency = 0
         for request in llm_output["experiments"][0]["requests"]:
             total_first_token_latency += (
                 request["response_timestamps"][0] - request["timestamp"]
             )
-        avg_first_token_to_token_latency = total_first_token_latency / len(
+        avg_first_token_latency = total_first_token_latency / len(
             llm_output["experiments"][0]["requests"]
         )

-        return avg_first_token_to_token_latency
+        return avg_first_token_latency

     def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float:
-        total_token_latency = 0.0
+        total_token_to_token_latency = 0.0
         for request in llm_output["experiments"][0]["requests"]:
-            total_response_latency = 0
-            for response_timestamp in request["response_timestamps"]:
-                total_response_latency += response_timestamp - request["timestamp"]
+            total_response_to_response_latency = 0
+            prev_response = request["response_timestamps"][0]
+            for response in request["response_timestamps"][1:]:
+                total_response_to_response_latency += response - prev_response
+                prev_response = response

-            total_token_latency += total_response_latency / len(
+            total_token_to_token_latency += total_response_to_response_latency / len(
                 request["response_timestamps"]
             )

-        avg_token_to_token_latency = total_token_latency / len(
+        avg_token_to_token_latency = total_token_to_token_latency / len(
             llm_output["experiments"][0]["requests"]
         )

diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py
index e5a336400..dc65f5665 100755
--- a/tests/common/test_utils.py
+++ b/tests/common/test_utils.py
@@ -235,7 +235,7 @@ def convert_avg_gpu_metrics_to_data(avg_gpu_metric_values):
 def construct_perf_analyzer_config(
     model_name="my-model",
     output_file_name="my-model-results.csv",
-    output_llm_file_name="my-model-llm-results.csv",
+    output_llm_file_name="my-model-results.json",
     batch_size=DEFAULT_BATCH_SIZES,
     concurrency=1,
     request_rate=None,
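
Note (not part of the patch): a minimal standalone sketch of the two averages the renamed helpers compute, run against a hypothetical profile-export payload. Field names mirror the diff above; the timestamps are made up for illustration only.

# Hypothetical profile-export payload; real perf_analyzer exports use
# nanosecond timestamps and carry additional fields.
llm_output = {
    "experiments": [
        {
            "requests": [
                {"timestamp": 100, "response_timestamps": [140, 150, 162]},
                {"timestamp": 200, "response_timestamps": [230, 244]},
            ]
        }
    ]
}

requests = llm_output["experiments"][0]["requests"]

# Avg first-token latency: first response minus request timestamp, averaged
# over requests -> ((140 - 100) + (230 - 200)) / 2 = 35.0
avg_first_token_latency = sum(
    r["response_timestamps"][0] - r["timestamp"] for r in requests
) / len(requests)

# Avg token-to-token latency: sum the gaps between consecutive responses within
# a request, divide by that request's response count (as the diff does), then
# average over requests -> ((10 + 12) / 3 + 14 / 2) / 2 ~= 7.17
avg_token_to_token_latency = sum(
    sum(b - a for a, b in zip(r["response_timestamps"], r["response_timestamps"][1:]))
    / len(r["response_timestamps"])
    for r in requests
) / len(requests)

print(avg_first_token_latency, avg_token_to_token_latency)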