Commit

Fixes based on hwoo review
nv-braf committed Oct 16, 2023
1 parent 53730da commit e109116
Showing 2 changed files with 16 additions and 14 deletions.
28 changes: 15 additions & 13 deletions model_analyzer/perf_analyzer/perf_analyzer.py
@@ -236,7 +236,7 @@ def get_llm_records(self):
         if self._llm_records:
             return self._llm_records
         raise TritonModelAnalyzerException(
-            "Attempted to get perf_analyzer results" "without calling run first."
+            "Attempted to get perf_analyzer results without calling run first."
         )

     def output(self):
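
Aside on the hunk above: Python implicitly concatenates adjacent string
literals with no separator, so the old split literal rendered as
"Attempted to get perf_analyzer resultswithout calling run first."; the fix
merges it into a single literal with the space restored. A quick REPL check:

    >>> "Attempted to get perf_analyzer results" "without calling run first."
    'Attempted to get perf_analyzer resultswithout calling run first.'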
@@ -514,11 +514,11 @@ def _extract_llm_records(self, perf_config, metrics):
         with open(perf_config["profile-export-file"], mode="r") as f:
             llm_output = json.load(f)

-        avg_first_token_to_token_latency = (
-            self._calculate_avg_first_token_to_token_latency(llm_output)
+        avg_first_token_latency = self._calculate_avg_first_token_latency(
+            llm_output
         )
         record = PerfAnalyzer.llm_metric_table[0][PerfAnalyzer.RECORD_CLASS](
-            value=avg_first_token_to_token_latency
+            value=avg_first_token_latency
         )  # type: ignore

         self._llm_records[perf_config["model-name"]].append(record)
@@ -531,31 +531,33 @@ def _extract_llm_records(self, perf_config, metrics):
         )  # type: ignore
         self._llm_records[perf_config["model-name"]].append(record)

-    def _calculate_avg_first_token_to_token_latency(self, llm_output: Dict) -> float:
+    def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float:
         total_first_token_latency = 0
         for request in llm_output["experiments"][0]["requests"]:
             total_first_token_latency += (
                 request["response_timestamps"][0] - request["timestamp"]
             )

-        avg_first_token_to_token_latency = total_first_token_latency / len(
+        avg_first_token_latency = total_first_token_latency / len(
             llm_output["experiments"][0]["requests"]
         )

-        return avg_first_token_to_token_latency
+        return avg_first_token_latency

     def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float:
-        total_token_latency = 0.0
+        total_token_to_token_latency = 0.0
         for request in llm_output["experiments"][0]["requests"]:
-            total_response_latency = 0
-            for response_timestamp in request["response_timestamps"]:
-                total_response_latency += response_timestamp - request["timestamp"]
+            total_response_to_response_latency = 0
+            prev_response = request["response_timestamps"][0]
+            for response in request["response_timestamps"][1:]:
+                total_response_to_reponse_latency = response - prev_response

Check notice (Code scanning / CodeQL): Unused local variable
Variable total_response_to_reponse_latency is not used.

+                prev_response = response

-            total_token_latency += total_response_latency / len(
+            total_token_to_token_latency += total_response_to_response_latency / len(
                 request["response_timestamps"]
             )

-        avg_token_to_token_latency = total_token_latency / len(
+        avg_token_to_token_latency = total_token_to_token_latency / len(
             llm_output["experiments"][0]["requests"]
         )
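
The CodeQL notice above flags a genuine slip in the new inner loop: the delta
is assigned with = to a misspelled name (total_response_to_reponse_latency),
so the total_response_to_response_latency accumulator initialized just above
is never updated and every per-request average comes out 0. Below is a minimal
sketch of what the loop presumably intends, written as a standalone function
rather than the PerfAnalyzer method: accumulate consecutive response-timestamp
deltas with += before averaging. This is an inferred fix, not the committed
code:

    from typing import Dict


    def calculate_avg_token_to_token_latency(llm_output: Dict) -> float:
        total_token_to_token_latency = 0.0
        requests = llm_output["experiments"][0]["requests"]
        for request in requests:
            # Sum the gaps between consecutive response timestamps.
            total_response_to_response_latency = 0
            prev_response = request["response_timestamps"][0]
            for response in request["response_timestamps"][1:]:
                total_response_to_response_latency += response - prev_response
                prev_response = response

            # Per-request average; divided by the response count, as in the
            # committed version.
            total_token_to_token_latency += total_response_to_response_latency / len(
                request["response_timestamps"]
            )

        # Average across all requests in the experiment.
        return total_token_to_token_latency / len(requests)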
2 changes: 1 addition & 1 deletion tests/common/test_utils.py
@@ -235,7 +235,7 @@ def convert_avg_gpu_metrics_to_data(avg_gpu_metric_values):
 def construct_perf_analyzer_config(
     model_name="my-model",
     output_file_name="my-model-results.csv",
-    output_llm_file_name="my-model-llm-results.csv",
+    output_llm_file_name="my-model-results.json",
     batch_size=DEFAULT_BATCH_SIZES,
     concurrency=1,
     request_rate=None,
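
For context on the test change: the new default output_llm_file_name points at
a JSON file, matching the profile export that _extract_llm_records opens via
perf_config["profile-export-file"]. Judging only from the keys the code reads,
the export presumably looks something like this hypothetical minimal example
(illustrative timestamp values, not real perf_analyzer output):

    {
      "experiments": [
        {
          "requests": [
            {"timestamp": 100, "response_timestamps": [140, 155, 171]}
          ]
        }
      ]
    }

On that request, _calculate_avg_first_token_latency would report 140 - 100 =
40: the gap between the request timestamp and its first response.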
