Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Capture LLM metrics from PA #774

Merged
merged 13 commits into from
Oct 17, 2023
103 changes: 94 additions & 9 deletions model_analyzer/perf_analyzer/perf_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@

import csv
import glob
import json
import logging
import os
import re
import signal
import tempfile
from statistics import mean
from subprocess import STDOUT, Popen
from typing import Dict, List

Expand Down Expand Up @@ -118,6 +120,14 @@ def get_gpu_metrics():
]
return gpu_metrics

@staticmethod
def get_llm_metrics():
    """
    Returns
    -------
    list
        The PerfAnalyzer.RECORD_CLASS entry of every row in
        PerfAnalyzer.llm_metric_table, in table order
    """

    record_classes = []
    for table_row in PerfAnalyzer.llm_metric_table:
        record_classes.append(table_row[PerfAnalyzer.RECORD_CLASS])
    return record_classes

def __init__(self, path, config, max_retries, timeout, max_cpu_util):
"""
Parameters
Expand All @@ -143,6 +153,7 @@ def __init__(self, path, config, max_retries, timeout, max_cpu_util):
self._output = ""
self._perf_records = {}
self._gpu_records = []
self._llm_records = {}
self._max_cpu_util = max_cpu_util

def run(self, metrics, env=None):
Expand Down Expand Up @@ -216,6 +227,19 @@ def get_gpu_records(self):

return self._gpu_records

def get_llm_records(self):
    """
    Returns
    -------
    The LLM records from the last perf_analyzer run

    Raises
    ------
    TritonModelAnalyzerException
        If no LLM records are currently stored on this instance
    """

    # Guard first: an empty record store is reported as "run was never called"
    if not self._llm_records:
        raise TritonModelAnalyzerException(
            "Attempted to get perf_analyzer results without calling run first."
        )

    return self._llm_records

def output(self):
"""
Returns
Expand Down Expand Up @@ -457,21 +481,82 @@ def _parse_outputs(self, metrics):
logger.debug(
f"Reading PA results from {perf_config['latency-report-file']}"
)
with open(perf_config["latency-report-file"], mode="r") as f:
csv_reader = csv.DictReader(f, delimiter=",")

for row in csv_reader:
self._perf_records[
perf_config["model-name"]
] = self._extract_perf_records_from_row(metrics, row)
self._gpu_records = self._extract_gpu_records_from_row(metrics, row)
self._extract_gpu_records(perf_config, metrics)
self._extract_llm_records(perf_config, metrics)

for perf_config in [
mrc.perf_config() for mrc in self._config.model_run_configs()
]:
# Remove the latency file and all associated composing model latency files
# Remove the latency/profile export files and all associated composing model latency files
for f in glob.glob(f"*{perf_config['latency-report-file']}"):
os.remove(f)
for f in glob.glob(f"*{perf_config['profile-export-file']}"):
os.remove(f)

def _extract_gpu_records(self, perf_config, metrics):
if perf_config["profile-export-file"]:
return

with open(perf_config["latency-report-file"], mode="r") as f:
csv_reader = csv.DictReader(f, delimiter=",")

for row in csv_reader:
self._perf_records[
perf_config["model-name"]
] = self._extract_perf_records_from_row(metrics, row)
self._gpu_records = self._extract_gpu_records_from_row(metrics, row)

def _extract_llm_records(self, perf_config, metrics):
if not perf_config["profile-export-file"]:
return

self._llm_records[perf_config["model-name"]] = []

with open(perf_config["profile-export-file"], mode="r") as f:
llm_output = json.load(f)

avg_first_token_latency = self._calculate_avg_first_token_latency(
llm_output
)
record = PerfAnalyzer.llm_metric_table[0][PerfAnalyzer.RECORD_CLASS](
value=avg_first_token_latency
) # type: ignore

self._llm_records[perf_config["model-name"]].append(record)

avg_token_to_token_latency = self._calculate_avg_token_to_token_latency(
llm_output
)
record = PerfAnalyzer.llm_metric_table[1][PerfAnalyzer.RECORD_CLASS](
value=avg_token_to_token_latency
) # type: ignore
self._llm_records[perf_config["model-name"]].append(record)

def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float:
total_first_token_latencies = []
for request in llm_output["experiments"][0]["requests"]:
total_first_token_latencies.append(
request["response_timestamps"][0] - request["timestamp"]
)

avg_first_token_latency = mean(total_first_token_latencies)

return avg_first_token_latency

def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float:
token_to_token_latencies = []
for request in llm_output["experiments"][0]["requests"]:
response_to_response_latencies = []
prev_response = request["response_timestamps"][0]
for response in request["response_timestamps"][1:]:
response_to_response_latencies.append(response - prev_response)
prev_response = response

token_to_token_latencies.append(mean(response_to_response_latencies))
nv-braf marked this conversation as resolved.
Show resolved Hide resolved

avg_token_to_token_latency = mean(token_to_token_latencies)
nv-braf marked this conversation as resolved.
Show resolved Hide resolved

return avg_token_to_token_latency

def _extract_perf_records_from_row(
self, requested_metrics: List[Record], row_metrics: Dict[str, str]
Expand Down
9 changes: 8 additions & 1 deletion model_analyzer/perf_analyzer/perf_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ class PerfAnalyzerConfig:
"metrics-interval",
"bls-composing-models",
"request-parameter",
"request-period",
]

input_to_options = [
Expand All @@ -82,6 +83,7 @@ class PerfAnalyzerConfig:
"url",
"protocol",
"latency-report-file",
"profile-export-file",
"http-header",
]

Expand Down Expand Up @@ -112,6 +114,7 @@ def __init__(self):
"-u": None,
"-i": None,
"-f": None,
"--profile-export-file": None,
"-H": None,
}
self._verbose = {"-v": None, "-v -v": None, "--verbose-csv": None}
Expand All @@ -123,6 +126,7 @@ def __init__(self):
"url": "-u",
"protocol": "-i",
"latency-report-file": "-f",
"profile-export-file": "--profile-export-file",
"http-header": "-H",
}

Expand Down Expand Up @@ -193,6 +197,9 @@ def update_config_from_profile_config(self, model_name, profile_config):
"verbose-csv": "--verbose-csv",
}

if profile_config.is_llm_model():
params.update({"profile-export-file": model_name + "-results.json"})

if profile_config.triton_launch_mode == "c_api":
params.update(
{
Expand Down Expand Up @@ -307,7 +314,7 @@ def remove_url_from_cli_string(cls, cli_string):
@classmethod
def remove_mrc_from_cli_string(cls, cli_string):
"""
utility function strips the measruement request count
utility function strips the measurement request count
from a cli string representation

Parameters
Expand Down
20 changes: 15 additions & 5 deletions model_analyzer/record/metrics_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ class MetricsManager:
"gpu_power_usage",
"cpu_available_ram",
"cpu_used_ram",
"avg_first_token_latency",
"avg_token_to_token_latency",
]

def __init__(self, config, client, server, gpus, result_manager, state_manager):
Expand Down Expand Up @@ -116,6 +118,7 @@ def __init__(self, config, client, server, gpus, result_manager, state_manager):
self._gpu_metrics,
self._perf_metrics,
self._cpu_metrics,
self._llm_metrics,
) = self._categorize_metrics(self.metrics, self._config.collect_cpu_metrics)
self._gpus = gpus
self._init_state()
Expand Down Expand Up @@ -160,21 +163,23 @@ def _categorize_metrics(metric_tags, collect_cpu_metrics=False):

Returns
-------
(list,list,list)
tuple of three lists (DCGM, PerfAnalyzer, CPU) metrics
(list,list,list,list)
tuple of four lists (DCGM, PerfAnalyzer, CPU, LLM) metrics
"""

gpu_metrics, perf_metrics, cpu_metrics = [], [], []
gpu_metrics, perf_metrics, cpu_metrics, llm_metrics = [], [], [], []
# Separates metrics and objectives into related lists
for metric in MetricsManager.get_metric_types(metric_tags):
if metric in PerfAnalyzer.get_gpu_metrics():
gpu_metrics.append(metric)
elif metric in PerfAnalyzer.get_perf_metrics():
perf_metrics.append(metric)
elif metric in PerfAnalyzer.get_llm_metrics():
llm_metrics.append(metric)
elif collect_cpu_metrics and (metric in CPUMonitor.cpu_metrics):
cpu_metrics.append(metric)

return gpu_metrics, perf_metrics, cpu_metrics
return gpu_metrics, perf_metrics, cpu_metrics, llm_metrics

def profile_server(self):
"""
Expand Down Expand Up @@ -556,6 +561,9 @@ def _run_perf_analyzer(
)

metrics_to_gather = self._perf_metrics + self._gpu_metrics
if self._config.is_llm_model():
metrics_to_gather += self._llm_metrics

status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env)

self._write_perf_analyzer_output(perf_output_writer, perf_analyzer)
Expand All @@ -564,7 +572,9 @@ def _run_perf_analyzer(
self._handle_unsuccessful_perf_analyzer_run(perf_analyzer)
return (None, None)

perf_records = perf_analyzer.get_perf_records()
perf_records = (
perf_analyzer.get_perf_records() + perf_analyzer.get_llm_records()
)
gpu_records = perf_analyzer.get_gpu_records()

aggregated_perf_records = self._aggregate_perf_records(perf_records)
Expand Down
2 changes: 1 addition & 1 deletion model_analyzer/record/types/avg_first_token_latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
@total_ordering
class AvgFirstTokenLatency(DecreasingRecord):
"""
A record for perf_analyzer avg first token to token latency metric
A record for perf_analyzer average first token latency metric
"""

tag = "avg_first_token_latency"
Expand Down
2 changes: 1 addition & 1 deletion model_analyzer/record/types/avg_token_to_token_latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
@total_ordering
class AvgTokenToTokenLatency(DecreasingRecord):
"""
A record for perf_analyzer avg token-to-token latency metric
A record for perf_analyzer average token-to-token latency metric
"""

tag = "avg_token_to_token_latency"
Expand Down
6 changes: 6 additions & 0 deletions tests/common/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ def convert_avg_gpu_metrics_to_data(avg_gpu_metric_values):
def construct_perf_analyzer_config(
model_name="my-model",
output_file_name="my-model-results.csv",
output_llm_file_name="my-model-results.json",
batch_size=DEFAULT_BATCH_SIZES,
concurrency=1,
request_rate=None,
Expand All @@ -253,6 +254,8 @@ def construct_perf_analyzer_config(
The name of the model
output_file_name: str
The name of the output file
output_llm_file_name: str
The name of the LLM output file
nv-braf marked this conversation as resolved.
Show resolved Hide resolved
batch_size: int
The batch size for this PA configuration
concurrency: int
Expand All @@ -279,6 +282,9 @@ def construct_perf_analyzer_config(
pa_config._options["-f"] = output_file_name
pa_config._options["-b"] = batch_size

if llm_search_mode:
pa_config._options["--profile-export-file"] = output_llm_file_name

if request_rate:
pa_config._args["request-rate-range"] = request_rate
elif llm_search_mode:
Expand Down
Loading