diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py
index e5387f04e..d59e79279 100755
--- a/model_analyzer/perf_analyzer/perf_analyzer.py
+++ b/model_analyzer/perf_analyzer/perf_analyzer.py
@@ -115,15 +115,15 @@ class PerfAnalyzer:
     ]
 
     llm_metric_table = [
-        ["time_to_first_token_avg", "Time to First Token (ns) avg", TimeToFirstTokenAvg, "1000"],
-        ["time_to_first_token_min", "Time to First Token (ns) min", TimeToFirstTokenMin, "1000"],
-        ["time_to_first_token_max", "Time to First Token (ns) max", TimeToFirstTokenMax, "1000"],
-        ["time_to_first_token_p99", "Time to First Token (ns) p99", TimeToFirstTokenP99, "1000"],
-        ["time_to_first_token_p95", "Time to First Token (ns) p95", TimeToFirstTokenP95, "1000"],
-        ["time_to_first_token_p90", "Time to First Token (ns) p90", TimeToFirstTokenP90, "1000"],
-        ["time_to_first_token_p75", "Time to First Token (ns) p75", TimeToFirstTokenP75, "1000"],
-        ["time_to_first_token_p50", "Time to First Token (ns) p50", TimeToFirstTokenP50, "1000"],
-        ["time_to_first_token_p25", "Time to First Token (ns) p25", TimeToFirstTokenP25, "1000"],
+        ["time_to_first_token_avg", "Time To First Token (ns) avg", TimeToFirstTokenAvg, "1000"],
+        ["time_to_first_token_min", "Time To First Token (ns) min", TimeToFirstTokenMin, "1000"],
+        ["time_to_first_token_max", "Time To First Token (ns) max", TimeToFirstTokenMax, "1000"],
+        ["time_to_first_token_p99", "Time To First Token (ns) p99", TimeToFirstTokenP99, "1000"],
+        ["time_to_first_token_p95", "Time To First Token (ns) p95", TimeToFirstTokenP95, "1000"],
+        ["time_to_first_token_p90", "Time To First Token (ns) p90", TimeToFirstTokenP90, "1000"],
+        ["time_to_first_token_p75", "Time To First Token (ns) p75", TimeToFirstTokenP75, "1000"],
+        ["time_to_first_token_p50", "Time To First Token (ns) p50", TimeToFirstTokenP50, "1000"],
+        ["time_to_first_token_p25", "Time To First Token (ns) p25", TimeToFirstTokenP25, "1000"],
         ["inter_token_latency_avg", "Inter Token Latency (ns) avg", InterTokenLatencyAvg, "1000"],
         ["inter_token_latency_min", "Inter Token Latency (ns) min", InterTokenLatencyMin, "1000"],
         ["inter_token_latency_max", "Inter Token Latency (ns) max", InterTokenLatencyMax, "1000"],
@@ -323,14 +323,34 @@ def _get_cmd(self):
         return cmd
 
     def _get_single_model_cmd(self, index):
-        cmd = [self.bin_path]
-        if self._is_multi_model():
-            cmd += ["--enable-mpi"]
-        cmd += self._get_pa_cli_command(index).replace("=", " ").split()
+        # TODO: TMA-1771 - hook up the user defined CLI options
+        if self._model_type == "LLM":
+            cmd = [
+                "genai-perf",
+                "-m",
+                self._config.models_name(),
+                "--streaming",
+                "--",
+            ]
+            cmd += (
+                self._get_pa_cli_command(index, exclude_model_name=True)
+                .replace("=", " ")
+                .split()
+            )
+        else:
+            cmd = [self.bin_path]
+            if self._is_multi_model():
+                cmd += ["--enable-mpi"]
+            cmd += self._get_pa_cli_command(index).replace("=", " ").split()
+
         return cmd
 
-    def _get_pa_cli_command(self, index):
-        return self._config.model_run_configs()[index].perf_config().to_cli_string()
+    def _get_pa_cli_command(self, index, exclude_model_name=False):
+        return (
+            self._config.model_run_configs()[index]
+            .perf_config()
+            .to_cli_string(exclude_model_name)
+        )
 
     def _create_env(self, env):
         perf_analyzer_env = os.environ.copy()
@@ -552,16 +572,16 @@ def _parse_llm_outputs(self, metrics):
 
         perf_config = self._config.model_run_configs()[0].perf_config()
 
-        logger.debug(f"Reading PA results from {GENAI_PERF_CSV}")
+        logger.debug(f"Reading GENAI-PERF results from {GENAI_PERF_CSV}")
         with open(GENAI_PERF_CSV, mode="r") as f:
-            csv_reader = csv.DictReader(f, delimiter=",")
+            csv_reader = list(csv.DictReader(f, delimiter=","))
 
             # See test_perf_analyzer::test_pa_llm_csv_output() for CSV output example
            self._llm_records[perf_config["model-name"]] = self._extract_llm_records(
                 metrics, csv_reader
             )
 
-        os.remove(f)
+        os.remove(GENAI_PERF_CSV)
 
     def _extract_perf_records_from_row(
         self, requested_metrics: List[Record], row_metrics: Dict[str, str]
@@ -632,13 +652,14 @@ def _extract_llm_records(
 
         for requested_metric in requested_metrics:
             new_llm_record = self._get_llm_record_from_csv(requested_metric, csv_reader)
-            llm_records.append(new_llm_record)
+            if new_llm_record:
+                llm_records.append(new_llm_record)
 
         return llm_records
 
     def _get_llm_record_from_csv(
         self, requested_metric: Record, csv_reader: DictReader
-    ) -> Record:
+    ) -> Optional[Record]:
         for row in csv_reader:
             for key, value in row.items():
                 metric_string = f"{row['Metric']} {key}"
@@ -655,9 +676,7 @@
                    llm_record = llm_metric[PerfAnalyzer.RECORD_CLASS](adjusted_value)  # type: ignore
                    return llm_record
 
-        raise TritonModelAnalyzerException(
-            f"Did not find {requested_metric.tag} in genai-perf CSV file"
-        )
+        return None
 
     def _find_corresponding_llm_metric_row(self, metric_string: str) -> Optional[List]:
         for row in PerfAnalyzer.llm_metric_table:
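For reference, the new genai-perf lookup matches each requested metric tag by gluing a row's `Metric` cell to a column header (for example `Time To First Token (ns)` plus `p99`) and then scaling the raw value using the table's last column. The sketch below is a simplified, standalone version of that flow, not the module's actual code: plain floats stand in for the Record classes, the metric table is abbreviated, and the `1000` factor is assumed to act as a divisor.

```python
import csv
import io
from typing import List, Optional, Tuple

# Abbreviated stand-in for PerfAnalyzer.llm_metric_table:
# (metric tag, "<Metric cell> <column header>", assumed reduction factor).
LLM_METRIC_TABLE: List[Tuple[str, str, float]] = [
    ("time_to_first_token_p99", "Time To First Token (ns) p99", 1000.0),
    ("inter_token_latency_avg", "Inter Token Latency (ns) avg", 1000.0),
]

CSV_TEXT = (
    "Metric,avg,min,max,p99,p95,p90,p75,p50,p25\n"
    "Time To First Token (ns),4238735,3367978,6702240,6371118,5344958,5006259,4841394,4146648,3484484\n"
    "Inter Token Latency (ns),27202264,3849435,138324924,28283424,27737593,27469154,27067290,26979956,26926962\n"
)


def get_llm_value(requested_tag: str, rows: List[dict]) -> Optional[float]:
    """Match '<Metric> <column>' against the table and scale the cell value."""
    for row in rows:
        for column, value in row.items():
            metric_string = f"{row['Metric']} {column}"
            for tag, csv_name, reduction_factor in LLM_METRIC_TABLE:
                if tag == requested_tag and csv_name == metric_string:
                    return float(value) / reduction_factor
    return None  # mirrors the patch: unmatched metrics are now skipped, not fatal


# The DictReader is materialized into a list so it can be re-scanned per metric.
rows = list(csv.DictReader(io.StringIO(CSV_TEXT)))
print(get_llm_value("time_to_first_token_p99", rows))      # 6371.118
print(get_llm_value("output_token_throughput_avg", rows))  # None
```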
diff --git a/model_analyzer/perf_analyzer/perf_config.py b/model_analyzer/perf_analyzer/perf_config.py
index e9160a44a..521cc1629 100755
--- a/model_analyzer/perf_analyzer/perf_config.py
+++ b/model_analyzer/perf_analyzer/perf_config.py
@@ -14,6 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import List
+
 from model_analyzer.config.input.config_defaults import DEFAULT_MEASUREMENT_MODE
 from model_analyzer.constants import SECONDS_TO_MILLISECONDS_MULTIPLIER
 from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
@@ -325,7 +327,7 @@ def remove_mrc_from_cli_string(cls, cli_string):
 
         return " ".join(perf_str_tokens)
 
-    def to_cli_string(self):
+    def to_cli_string(self, exclude_model_name: bool = False) -> str:
         """
         Utility function to convert a config into a string
         of arguments to the perf_analyzer with CLI.
@@ -340,19 +342,22 @@ def to_cli_string(self):
 
         # single dashed options, then verbose flags, then main args
         args = []
-        args.extend(self._parse_short_options())
+        args.extend(self._parse_short_options(exclude_model_name))
         args.extend(self._parse_verbose_options())
         args.extend(self._parse_long_options())
 
         return " ".join(args)
 
-    def _parse_short_options(self):
+    def _parse_short_options(self, exclude_model_name: bool = False) -> List:
         """
         Parse the perf analyzer single dash options
         """
         temp_args = []
         for key, value in self._options.items():
             if value:
+                if exclude_model_name and key == "-m":
+                    continue
+
                 if key in self._additive_args:
                     for additive_value in value:
                         temp_args.append(f"{key} {additive_value}")
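The `exclude_model_name` flag exists so the LLM path can let genai-perf own the `-m <model>` argument while every other perf_analyzer option still flows through after `--`. Below is a minimal sketch of its effect on the generated short-option string, using a made-up option dict rather than the real `PerfAnalyzerConfig` (which also handles additive and long options):

```python
from typing import Dict, List


def parse_short_options(options: Dict[str, str], exclude_model_name: bool = False) -> List[str]:
    """Simplified mirror of _parse_short_options: emit '<flag> <value>' pairs."""
    args = []
    for key, value in options.items():
        if value:
            # New behavior: drop "-m <model>" when genai-perf supplies it itself.
            if exclude_model_name and key == "-m":
                continue
            args.append(f"{key} {value}")
    return args


# Hypothetical option values, for illustration only.
options = {"-m": "gpt2_vllm", "-b": "1", "-i": "grpc"}
print(" ".join(parse_short_options(options)))                           # -m gpt2_vllm -b 1 -i grpc
print(" ".join(parse_short_options(options, exclude_model_name=True)))  # -b 1 -i grpc
```

With that in place, the LLM branch of `_get_single_model_cmd` can emit `genai-perf -m <model> --streaming -- <remaining perf_analyzer args>` without the model name appearing twice.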
diff --git a/model_analyzer/record/metrics_manager.py b/model_analyzer/record/metrics_manager.py
index 581cae88b..849731935 100755
--- a/model_analyzer/record/metrics_manager.py
+++ b/model_analyzer/record/metrics_manager.py
@@ -69,6 +69,25 @@ class MetricsManager:
         "gpu_power_usage",
         "cpu_available_ram",
         "cpu_used_ram",
+        "time_to_first_token_avg",
+        "time_to_first_token_min",
+        "time_to_first_token_max",
+        "time_to_first_token_p99",
+        "time_to_first_token_p95",
+        "time_to_first_token_p90",
+        "time_to_first_token_p75",
+        "time_to_first_token_p50",
+        "time_to_first_token_p25",
+        "inter_token_latency_avg",
+        "inter_token_latency_min",
+        "inter_token_latency_max",
+        "inter_token_latency_p99",
+        "inter_token_latency_p95",
+        "inter_token_latency_p90",
+        "inter_token_latency_p75",
+        "inter_token_latency_p50",
+        "inter_token_latency_p25",
+        "output_token_throughput",
     ]
 
     def __init__(self, config, client, server, gpus, result_manager, state_manager):
@@ -115,6 +134,7 @@ def __init__(self, config, client, server, gpus, result_manager, state_manager):
         (
             self._gpu_metrics,
             self._perf_metrics,
+            self._llm_metrics,
             self._cpu_metrics,
         ) = self._categorize_metrics(self.metrics, self._config.collect_cpu_metrics)
         self._gpus = gpus
@@ -160,21 +180,23 @@ def _categorize_metrics(metric_tags, collect_cpu_metrics=False):
         Returns
         -------
-        (list,list,list)
-            tuple of three lists (DCGM, PerfAnalyzer, CPU) metrics
+        (list,list,list,list)
+            tuple of four lists (DCGM, PerfAnalyzer, LLM, CPU) metrics
         """
 
-        gpu_metrics, perf_metrics, cpu_metrics = [], [], []
+        gpu_metrics, perf_metrics, llm_metrics, cpu_metrics = [], [], [], []
 
         # Separates metrics and objectives into related lists
         for metric in MetricsManager.get_metric_types(metric_tags):
             if metric in PerfAnalyzer.get_gpu_metrics():
                 gpu_metrics.append(metric)
             elif metric in PerfAnalyzer.get_perf_metrics():
                 perf_metrics.append(metric)
+            elif metric in PerfAnalyzer.get_llm_metrics():
+                llm_metrics.append(metric)
             elif collect_cpu_metrics and (metric in CPUMonitor.cpu_metrics):
                 cpu_metrics.append(metric)
 
-        return gpu_metrics, perf_metrics, cpu_metrics
+        return gpu_metrics, perf_metrics, llm_metrics, cpu_metrics
 
     def profile_server(self):
         """
@@ -589,9 +611,10 @@ def _run_perf_analyzer(
             max_retries=self._config.perf_analyzer_max_auto_adjusts,
             timeout=self._config.perf_analyzer_timeout,
             max_cpu_util=self._config.perf_analyzer_cpu_util,
+            model_type=self._config.model_type,
         )
 
-        metrics_to_gather = self._perf_metrics + self._gpu_metrics
+        metrics_to_gather = self._perf_metrics + self._llm_metrics + self._gpu_metrics
         status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env)
 
         self._write_perf_analyzer_output(perf_output_writer, perf_analyzer)
@@ -601,6 +624,12 @@
             return (None, None)
 
         perf_records = perf_analyzer.get_perf_records()
+
+        if self._config.model_type == "LLM":
+            perf_records[run_config.models_name()].extend(
+                perf_analyzer.get_llm_records()[run_config.models_name()]
+            )
+
         gpu_records = perf_analyzer.get_gpu_records()
 
         aggregated_perf_records = self._aggregate_perf_records(perf_records)
@@ -824,6 +853,17 @@ def is_perf_analyzer_metric(tag):
         metric = MetricsManager.get_metric_types([tag])[0]
         return metric in PerfAnalyzer.get_perf_metrics()
 
+    @staticmethod
+    def is_llm_metric(tag):
+        """
+        Returns
+        -------
+        True if the given tag is a supported LLM metric
+        False otherwise
+        """
+        metric = MetricsManager.get_metric_types([tag])[0]
+        return metric in PerfAnalyzer.get_llm_metrics()
+
     @staticmethod
     def is_cpu_metric(tag):
         """
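Downstream, `_categorize_metrics` now splits the requested tags four ways so `_run_perf_analyzer` can add `self._llm_metrics` to `metrics_to_gather`. A rough, self-contained illustration of the bucketing follows; the tag sets below are placeholders rather than the authoritative lists, and the real code compares Record types returned by `MetricsManager.get_metric_types()`:

```python
# Simplified, standalone version of the four-way split done in _categorize_metrics.
GPU_TAGS = {"gpu_used_memory", "gpu_utilization", "gpu_power_usage"}
PERF_TAGS = {"perf_throughput", "perf_latency_p99"}
LLM_TAGS = {"time_to_first_token_p99", "inter_token_latency_avg", "output_token_throughput"}
CPU_TAGS = {"cpu_available_ram", "cpu_used_ram"}


def categorize(tags, collect_cpu_metrics=False):
    """Return (gpu, perf, llm, cpu) lists, dropping CPU tags unless requested."""
    gpu, perf, llm, cpu = [], [], [], []
    for tag in tags:
        if tag in GPU_TAGS:
            gpu.append(tag)
        elif tag in PERF_TAGS:
            perf.append(tag)
        elif tag in LLM_TAGS:
            llm.append(tag)
        elif collect_cpu_metrics and tag in CPU_TAGS:
            cpu.append(tag)
    return gpu, perf, llm, cpu


print(categorize(["perf_throughput", "time_to_first_token_p99", "gpu_utilization"]))
# (['gpu_utilization'], ['perf_throughput'], ['time_to_first_token_p99'], [])
```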
diff --git a/tests/test_perf_analyzer.py b/tests/test_perf_analyzer.py
index 0d063f81f..0b57701b8 100755
--- a/tests/test_perf_analyzer.py
+++ b/tests/test_perf_analyzer.py
@@ -509,7 +509,7 @@ def test_pa_llm_csv_output(self):
         self.client.wait_for_server_ready(num_retries=1)
 
         pa_llm_csv_mock = """Metric,avg,min,max,p99,p95,p90,p75,p50,p25\n"""
-        pa_llm_csv_mock += """Time to First Token (ns),4238735,3367978,6702240,6371118,5344958,5006259,4841394,4146648,3484484\n"""
+        pa_llm_csv_mock += """Time To First Token (ns),4238735,3367978,6702240,6371118,5344958,5006259,4841394,4146648,3484484\n"""
         pa_llm_csv_mock += """Inter Token Latency (ns),27202264,3849435,138324924,28283424,27737593,27469154,27067290,26979956,26926962\n"""
         pa_llm_csv_mock += """Request Latency (ns),3363927003,3367978,14238834483,14091273510,13740917508,13692672723,3752510140,4846258,3612270\n"""
         pa_llm_csv_mock += """Num Output Token,126,0,584,562,509,505,135,0,0\n"""
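One detail the test exercises indirectly: `_get_llm_record_from_csv` walks the CSV rows once per requested metric, which only works after this patch because a bare `csv.DictReader` is a one-shot iterator. A quick standalone demonstration with a trimmed copy of the mocked CSV:

```python
import csv
import io

PA_LLM_CSV_MOCK = (
    "Metric,avg,min,max,p99,p95,p90,p75,p50,p25\n"
    "Time To First Token (ns),4238735,3367978,6702240,6371118,5344958,5006259,4841394,4146648,3484484\n"
    "Inter Token Latency (ns),27202264,3849435,138324924,28283424,27737593,27469154,27067290,26979956,26926962\n"
)

# A bare DictReader is exhausted after one pass: the second scan sees no rows.
reader = csv.DictReader(io.StringIO(PA_LLM_CSV_MOCK))
print(sum(1 for _ in reader))  # 2
print(sum(1 for _ in reader))  # 0

# Materializing it up front, as _parse_llm_outputs now does with
# list(csv.DictReader(...)), allows one full scan per requested metric.
rows = list(csv.DictReader(io.StringIO(PA_LLM_CSV_MOCK)))
print(sum(1 for _ in rows), sum(1 for _ in rows))  # 2 2
```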