From 6063f179a5338b9033322a1e497f677c0bee8c83 Mon Sep 17 00:00:00 2001
From: Brian Raf <92820864+nv-braf@users.noreply.github.com>
Date: Mon, 25 Mar 2024 07:30:50 -0700
Subject: [PATCH] Adding new output fields for LLM (#846)

---
 .../config/input/config_command_profile.py |  7 ++++++
 .../config/input/config_defaults.py        | 14 +++++++++++
 tests/test_config.py                       | 25 +++++++++++++++++++
 3 files changed, 46 insertions(+)

diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py
index ff1fb5ea2..d4650a040 100755
--- a/model_analyzer/config/input/config_command_profile.py
+++ b/model_analyzer/config/input/config_command_profile.py
@@ -50,6 +50,7 @@
     DEFAULT_GPU_OUTPUT_FIELDS,
     DEFAULT_GPUS,
     DEFAULT_INFERENCE_OUTPUT_FIELDS,
+    DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS,
     DEFAULT_MAX_RETRIES,
     DEFAULT_MODEL_TYPE,
     DEFAULT_MODEL_WEIGHTING,
@@ -1449,6 +1450,12 @@ def _autofill_values(self):
             if not self._fields["gpu_output_fields"].is_set_by_user():
                 self.gpu_output_fields = DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS
 
+        # Switch default output fields if user specifies model type of LLM
+        # and the user didn't specify a custom output field
+        if self.model_type == "LLM":
+            if not self._fields["inference_output_fields"].is_set_by_user():
+                self.inference_output_fields = DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS
+
         new_profile_models = {}
         for i, model in enumerate(self.profile_models):
             new_model = {"cpu_only": (model.cpu_only() or cpu_only)}
diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py
index 07ecb59f7..8685537f7 100755
--- a/model_analyzer/config/input/config_defaults.py
+++ b/model_analyzer/config/input/config_defaults.py
@@ -119,6 +119,20 @@
     "perf_throughput",
     "perf_latency_p99",
 ]
+DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS = [
+    "model_name",
+    "batch_size",
+    "concurrency",
+    "model_config_path",
+    "instance_group",
+    "max_batch_size",
+    "satisfies_constraints",
+    "perf_throughput",
+    "perf_latency_p99",
+    "inter_token_latency_p99",
+    "time_to_first_token_p99",
+    "output_token_throughput",
+]
 DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS = [
     "model_name",
     "batch_size",
diff --git a/tests/test_config.py b/tests/test_config.py
index f056eb76a..190075dea 100755
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -24,6 +24,9 @@
 from model_analyzer.cli.cli import CLI
 from model_analyzer.config.input.config_command_profile import ConfigCommandProfile
 from model_analyzer.config.input.config_command_report import ConfigCommandReport
+from model_analyzer.config.input.config_defaults import (
+    DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS,
+)
 from model_analyzer.config.input.config_enum import ConfigEnum
 from model_analyzer.config.input.config_list_generic import ConfigListGeneric
 from model_analyzer.config.input.config_list_numeric import ConfigListNumeric
@@ -2356,6 +2359,28 @@ def test_mixing_request_rate_and_concurrency(self):
         with self.assertRaises(TritonModelAnalyzerException):
             self._evaluate_config(args, yaml_content, subcommand="profile")
 
+    def test_model_type_llm(self):
+        """
+        Test that model type of LLM chooses the correct inference outputs
+        """
+        args = [
+            "model-analyzer",
+            "profile",
+            "--model-repository",
+            "cli-repository",
+            "--profile-models",
+            "modelA",
+            "--model-type",
+            "LLM",
+        ]
+        yaml_content = ""
+
+        config = self._evaluate_config(args, yaml_content)
+
+        self.assertEqual(
+            config.inference_output_fields, DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS
+        )
+
     def _test_request_rate_config_conflicts(
         self, base_args: List[Any], yaml_content: str
     ) -> None:
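
For context, below is a small self-contained sketch of the default-selection behavior this patch introduces: when the model type is "LLM" and the user has not overridden the inference output fields, the LLM defaults (which append inter_token_latency_p99, time_to_first_token_p99, and output_token_throughput) are used instead of the standard set. The helper name select_inference_output_fields and the standalone lists here are illustrative only and not part of the Model Analyzer API; the non-LLM default list is assumed to match the first nine entries of the LLM list. In the real code this switch happens inside ConfigCommandProfile._autofill_values(), as shown in the diff above.

# Illustrative sketch only -- mirrors the default-switching logic added in this
# patch; not Model Analyzer's actual API.
from typing import List, Optional

# Assumed non-LLM defaults (the first nine entries of the LLM list).
DEFAULT_INFERENCE_OUTPUT_FIELDS = [
    "model_name",
    "batch_size",
    "concurrency",
    "model_config_path",
    "instance_group",
    "max_batch_size",
    "satisfies_constraints",
    "perf_throughput",
    "perf_latency_p99",
]

# LLM defaults from this patch: the standard fields plus token-level metrics.
DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS = DEFAULT_INFERENCE_OUTPUT_FIELDS + [
    "inter_token_latency_p99",
    "time_to_first_token_p99",
    "output_token_throughput",
]


def select_inference_output_fields(
    model_type: str, user_fields: Optional[List[str]] = None
) -> List[str]:
    """Return user-supplied fields if given, else the defaults for the model type."""
    if user_fields is not None:
        # A custom output field list always takes precedence.
        return user_fields
    if model_type == "LLM":
        return DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS
    return DEFAULT_INFERENCE_OUTPUT_FIELDS


if __name__ == "__main__":
    # With --model-type LLM and no custom fields, the LLM defaults apply.
    assert "output_token_throughput" in select_inference_output_fields("LLM")
    # A user-supplied list wins regardless of model type.
    assert select_inference_output_fields("LLM", ["model_name"]) == ["model_name"]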