Profiling model using genai-perf #849

Merged · 5 commits · Mar 27, 2024
65 changes: 42 additions & 23 deletions model_analyzer/perf_analyzer/perf_analyzer.py
@@ -115,15 +115,15 @@ class PerfAnalyzer:
]

llm_metric_table = [
["time_to_first_token_avg", "Time to First Token (ns) avg", TimeToFirstTokenAvg, "1000"],
["time_to_first_token_min", "Time to First Token (ns) min", TimeToFirstTokenMin, "1000"],
["time_to_first_token_max", "Time to First Token (ns) max", TimeToFirstTokenMax, "1000"],
["time_to_first_token_p99", "Time to First Token (ns) p99", TimeToFirstTokenP99, "1000"],
["time_to_first_token_p95", "Time to First Token (ns) p95", TimeToFirstTokenP95, "1000"],
["time_to_first_token_p90", "Time to First Token (ns) p90", TimeToFirstTokenP90, "1000"],
["time_to_first_token_p75", "Time to First Token (ns) p75", TimeToFirstTokenP75, "1000"],
["time_to_first_token_p50", "Time to First Token (ns) p50", TimeToFirstTokenP50, "1000"],
["time_to_first_token_p25", "Time to First Token (ns) p25", TimeToFirstTokenP25, "1000"],
["time_to_first_token_avg", "Time To First Token (ns) avg", TimeToFirstTokenAvg, "1000"],
["time_to_first_token_min", "Time To First Token (ns) min", TimeToFirstTokenMin, "1000"],
["time_to_first_token_max", "Time To First Token (ns) max", TimeToFirstTokenMax, "1000"],
["time_to_first_token_p99", "Time To First Token (ns) p99", TimeToFirstTokenP99, "1000"],
["time_to_first_token_p95", "Time To First Token (ns) p95", TimeToFirstTokenP95, "1000"],
["time_to_first_token_p90", "Time To First Token (ns) p90", TimeToFirstTokenP90, "1000"],
["time_to_first_token_p75", "Time To First Token (ns) p75", TimeToFirstTokenP75, "1000"],
["time_to_first_token_p50", "Time To First Token (ns) p50", TimeToFirstTokenP50, "1000"],
["time_to_first_token_p25", "Time To First Token (ns) p25", TimeToFirstTokenP25, "1000"],
["inter_token_latency_avg", "Inter Token Latency (ns) avg", InterTokenLatencyAvg, "1000"],
["inter_token_latency_min", "Inter Token Latency (ns) min", InterTokenLatencyMin, "1000"],
["inter_token_latency_max", "Inter Token Latency (ns) max", InterTokenLatencyMax, "1000"],
@@ -323,14 +323,34 @@ def _get_cmd(self):
return cmd

def _get_single_model_cmd(self, index):
cmd = [self.bin_path]
if self._is_multi_model():
cmd += ["--enable-mpi"]
cmd += self._get_pa_cli_command(index).replace("=", " ").split()
# TODO: TMA-1771 - hook up the user defined CLI options
if self._model_type == "LLM":
cmd = [
"genai-perf",
"-m",
self._config.models_name(),
"--streaming",
"--",
]
cmd += (
self._get_pa_cli_command(index, exclude_model_name=True)
.replace("=", " ")
.split()
)
else:
cmd = [self.bin_path]
if self._is_multi_model():
cmd += ["--enable-mpi"]
cmd += self._get_pa_cli_command(index).replace("=", " ").split()

return cmd

def _get_pa_cli_command(self, index):
return self._config.model_run_configs()[index].perf_config().to_cli_string()
def _get_pa_cli_command(self, index, exclude_model_name=False):
return (
self._config.model_run_configs()[index]
.perf_config()
.to_cli_string(exclude_model_name)
)

def _create_env(self, env):
perf_analyzer_env = os.environ.copy()
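With the LLM branch in _get_single_model_cmd above, profiling hands off to genai-perf and forwards the remaining perf_analyzer arguments after the "--" separator. An illustrative command, where the model name and the trailing flags are hypothetical and would really come from the user's perf_analyzer config:

cmd = [
    "genai-perf",
    "-m", "llama-7b",        # hypothetical model name
    "--streaming",
    "--",                    # everything after this is passed through to perf_analyzer
    "-i", "grpc",
    "--concurrency-range", "4",
]
print(" ".join(cmd))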
@@ -552,16 +572,16 @@ def _parse_llm_outputs(self, metrics):

perf_config = self._config.model_run_configs()[0].perf_config()

logger.debug(f"Reading PA results from {GENAI_PERF_CSV}")
logger.debug(f"Reading GENAI-PERF results from {GENAI_PERF_CSV}")
with open(GENAI_PERF_CSV, mode="r") as f:
csv_reader = csv.DictReader(f, delimiter=",")
csv_reader = list(csv.DictReader(f, delimiter=","))

# See test_perf_analyzer::test_pa_llm_csv_output() for CSV output example
self._llm_records[perf_config["model-name"]] = self._extract_llm_records(
metrics, csv_reader
)

os.remove(f)
os.remove(GENAI_PERF_CSV)
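The DictReader is materialized into a list so it can be re-scanned once per requested metric. A self-contained sketch of that parsing step, assuming a file name and using the column labels from the test fixture at the bottom of this diff (real genai-perf output may contain additional rows):

import csv

GENAI_PERF_CSV = "profile_export_genai_perf.csv"  # assumed name, for illustration only

with open(GENAI_PERF_CSV, mode="r") as f:
    rows = list(csv.DictReader(f, delimiter=","))

# Each row maps the "Metric" label to its per-statistic columns (avg, min, max, p99, ...).
for row in rows:
    if row["Metric"] == "Time To First Token (ns)":
        print(float(row["avg"]))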

def _extract_perf_records_from_row(
self, requested_metrics: List[Record], row_metrics: Dict[str, str]
@@ -632,13 +652,14 @@ def _extract_llm_records(

for requested_metric in requested_metrics:
new_llm_record = self._get_llm_record_from_csv(requested_metric, csv_reader)
llm_records.append(new_llm_record)
if new_llm_record:
llm_records.append(new_llm_record)

return llm_records

def _get_llm_record_from_csv(
self, requested_metric: Record, csv_reader: DictReader
) -> Record:
) -> Optional[Record]:
for row in csv_reader:
for key, value in row.items():
metric_string = f"{row['Metric']} {key}"
@@ -655,9 +676,7 @@ def _get_llm_record_from_csv(
llm_record = llm_metric[PerfAnalyzer.RECORD_CLASS](adjusted_value) # type: ignore
return llm_record

raise TritonModelAnalyzerException(
f"Did not find {requested_metric.tag} in genai-perf CSV file"
)
return None
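With this change, record extraction degrades gracefully: a requested metric that is absent from the genai-perf CSV simply yields no record instead of raising. A toy version of the lookup, with the rows and return value simplified to plain values:

from typing import Optional

def value_from_csv(metric_string, rows) -> Optional[float]:
    # rows is the list of dicts produced by csv.DictReader, as above.
    for row in rows:
        for key, value in row.items():
            if key == "Metric":
                continue
            if f"{row['Metric']} {key}" == metric_string:
                return float(value)
    return None  # caller skips the metric instead of raising an exception

rows = [{"Metric": "Time To First Token (ns)", "avg": "4238735"}]
print(value_from_csv("Time To First Token (ns) avg", rows))   # 4238735.0
print(value_from_csv("Inter Token Latency (ns) avg", rows))   # None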

def _find_corresponding_llm_metric_row(self, metric_string: str) -> Optional[List]:
for row in PerfAnalyzer.llm_metric_table:
11 changes: 8 additions & 3 deletions model_analyzer/perf_analyzer/perf_config.py
@@ -14,6 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

from model_analyzer.config.input.config_defaults import DEFAULT_MEASUREMENT_MODE
from model_analyzer.constants import SECONDS_TO_MILLISECONDS_MULTIPLIER
from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
@@ -325,7 +327,7 @@ def remove_mrc_from_cli_string(cls, cli_string):

return " ".join(perf_str_tokens)

def to_cli_string(self):
def to_cli_string(self, exclude_model_name: bool = False) -> str:
"""
Utility function to convert a config into a
string of arguments to the perf_analyzer with CLI.
@@ -340,19 +342,22 @@ def to_cli_string(self):

# single dashed options, then verbose flags, then main args
args = []
args.extend(self._parse_short_options())
args.extend(self._parse_short_options(exclude_model_name))
args.extend(self._parse_verbose_options())
args.extend(self._parse_long_options())

return " ".join(args)

def _parse_short_options(self):
def _parse_short_options(self, exclude_model_name: bool = False) -> List:
"""
Parse the perf analyzer single dash options
"""
temp_args = []
for key, value in self._options.items():
if value:
if exclude_model_name and key == "-m":
continue

if key in self._additive_args:
for additive_value in value:
temp_args.append(f"{key} {additive_value}")
50 changes: 45 additions & 5 deletions model_analyzer/record/metrics_manager.py
@@ -69,6 +69,25 @@ class MetricsManager:
"gpu_power_usage",
"cpu_available_ram",
"cpu_used_ram",
"time_to_first_token_avg",
"time_to_first_token_min",
"time_to_first_token_max",
"time_to_first_token_p99",
"time_to_first_token_p95",
"time_to_first_token_p90",
"time_to_first_token_p75",
"time_to_first_token_p50",
"time_to_first_token_p25",
"inter_token_latency_avg",
"inter_token_latency_min",
"inter_token_latency_max",
"inter_token_latency_p99",
"inter_token_latency_p95",
"inter_token_latency_p90",
"inter_token_latency_p75",
"inter_token_latency_p50",
"inter_token_latency_p25",
"output_token_throughput",
]

def __init__(self, config, client, server, gpus, result_manager, state_manager):
@@ -115,6 +134,7 @@ def __init__(self, config, client, server, gpus, result_manager, state_manager):
(
self._gpu_metrics,
self._perf_metrics,
self._llm_metrics,
self._cpu_metrics,
) = self._categorize_metrics(self.metrics, self._config.collect_cpu_metrics)
self._gpus = gpus
@@ -160,21 +180,23 @@ def _categorize_metrics(metric_tags, collect_cpu_metrics=False):

Returns
-------
(list,list,list)
tuple of three lists (DCGM, PerfAnalyzer, CPU) metrics
(list,list,list,list)
tuple of four lists (DCGM, PerfAnalyzer, LLM, CPU) metrics
"""

gpu_metrics, perf_metrics, cpu_metrics = [], [], []
gpu_metrics, perf_metrics, llm_metrics, cpu_metrics = [], [], [], []
# Separates metrics and objectives into related lists
for metric in MetricsManager.get_metric_types(metric_tags):
if metric in PerfAnalyzer.get_gpu_metrics():
gpu_metrics.append(metric)
elif metric in PerfAnalyzer.get_perf_metrics():
perf_metrics.append(metric)
elif metric in PerfAnalyzer.get_llm_metrics():
llm_metrics.append(metric)
elif collect_cpu_metrics and (metric in CPUMonitor.cpu_metrics):
cpu_metrics.append(metric)

return gpu_metrics, perf_metrics, cpu_metrics
return gpu_metrics, perf_metrics, llm_metrics, cpu_metrics
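Callers now unpack four lists instead of three. A hypothetical tag set would be split roughly like this (the real code resolves tags to metric classes and only collects CPU metrics when requested):

# Hypothetical sets standing in for the PerfAnalyzer.get_*_metrics() lookups.
gpu_set = {"gpu_used_memory"}
perf_set = {"perf_throughput", "perf_latency_p99"}
llm_set = {"time_to_first_token_avg", "inter_token_latency_p99"}

gpu, perf, llm, cpu = [], [], [], []
for tag in ["perf_throughput", "time_to_first_token_avg", "gpu_used_memory", "cpu_used_ram"]:
    if tag in gpu_set:
        gpu.append(tag)
    elif tag in perf_set:
        perf.append(tag)
    elif tag in llm_set:
        llm.append(tag)
    else:  # simplified: the real code also checks collect_cpu_metrics
        cpu.append(tag)

print(gpu, perf, llm, cpu)
# ['gpu_used_memory'] ['perf_throughput'] ['time_to_first_token_avg'] ['cpu_used_ram']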

def profile_server(self):
"""
@@ -589,9 +611,10 @@ def _run_perf_analyzer(
max_retries=self._config.perf_analyzer_max_auto_adjusts,
timeout=self._config.perf_analyzer_timeout,
max_cpu_util=self._config.perf_analyzer_cpu_util,
model_type=self._config.model_type,
)

metrics_to_gather = self._perf_metrics + self._gpu_metrics
metrics_to_gather = self._perf_metrics + self._llm_metrics + self._gpu_metrics
status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env)

self._write_perf_analyzer_output(perf_output_writer, perf_analyzer)
@@ -601,6 +624,12 @@
return (None, None)

perf_records = perf_analyzer.get_perf_records()

if self._config.model_type == "LLM":
perf_records[run_config.models_name()].extend(
perf_analyzer.get_llm_records()[run_config.models_name()]
)

gpu_records = perf_analyzer.get_gpu_records()

aggregated_perf_records = self._aggregate_perf_records(perf_records)
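The effect of the LLM branch above is that the genai-perf records are folded into the per-model perf record list before aggregation. A toy illustration with placeholder values standing in for Record objects (both dicts are keyed by model name):

perf_records = {"my_model": ["perf_latency_avg"]}
llm_records = {"my_model": ["time_to_first_token_avg", "inter_token_latency_avg"]}

model_type = "LLM"  # would come from self._config.model_type
if model_type == "LLM":
    perf_records["my_model"].extend(llm_records["my_model"])

print(perf_records["my_model"])
# ['perf_latency_avg', 'time_to_first_token_avg', 'inter_token_latency_avg']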
@@ -824,6 +853,17 @@ def is_perf_analyzer_metric(tag):
metric = MetricsManager.get_metric_types([tag])[0]
return metric in PerfAnalyzer.get_perf_metrics()

@staticmethod
def is_llm_metric(tag):
"""
Returns
------
True if the given tag is a supported LLM metric
False otherwise
"""
metric = MetricsManager.get_metric_types([tag])[0]
return metric in PerfAnalyzer.get_llm_metrics()

@staticmethod
def is_cpu_metric(tag):
"""
2 changes: 1 addition & 1 deletion tests/test_perf_analyzer.py
@@ -509,7 +509,7 @@ def test_pa_llm_csv_output(self):
self.client.wait_for_server_ready(num_retries=1)

pa_llm_csv_mock = """Metric,avg,min,max,p99,p95,p90,p75,p50,p25\n"""
pa_llm_csv_mock += """Time to First Token (ns),4238735,3367978,6702240,6371118,5344958,5006259,4841394,4146648,3484484\n"""
pa_llm_csv_mock += """Time To First Token (ns),4238735,3367978,6702240,6371118,5344958,5006259,4841394,4146648,3484484\n"""
pa_llm_csv_mock += """Inter Token Latency (ns),27202264,3849435,138324924,28283424,27737593,27469154,27067290,26979956,26926962\n"""
pa_llm_csv_mock += """Request Latency (ns),3363927003,3367978,14238834483,14091273510,13740917508,13692672723,3752510140,4846258,3612270\n"""
pa_llm_csv_mock += """Num Output Token,126,0,584,562,509,505,135,0,0\n"""
Expand Down
Loading