diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py
index b1bb22591..59f478594 100755
--- a/model_analyzer/config/input/config_command_profile.py
+++ b/model_analyzer/config/input/config_command_profile.py
@@ -50,6 +50,7 @@
     DEFAULT_GPU_OUTPUT_FIELDS,
     DEFAULT_GPUS,
     DEFAULT_INFERENCE_OUTPUT_FIELDS,
+    DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS,
     DEFAULT_MAX_RETRIES,
     DEFAULT_MODEL_TYPE,
     DEFAULT_MODEL_WEIGHTING,
@@ -1354,6 +1355,12 @@ def _autofill_values(self):
             if not self._fields["gpu_output_fields"].is_set_by_user():
                 self.gpu_output_fields = DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS
 
+        # Switch default output fields if user specifies model type of LLM
+        # and the user didn't specify a custom output field
+        if self.model_type == "LLM":
+            if not self._fields["inference_output_fields"].is_set_by_user():
+                self.inference_output_fields = DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS
+
         new_profile_models = {}
         for i, model in enumerate(self.profile_models):
             new_model = {"cpu_only": (model.cpu_only() or cpu_only)}
diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py
index 7052af44b..34a4581c6 100755
--- a/model_analyzer/config/input/config_defaults.py
+++ b/model_analyzer/config/input/config_defaults.py
@@ -119,6 +119,20 @@
     "perf_throughput",
     "perf_latency_p99",
 ]
+DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS = [
+    "model_name",
+    "batch_size",
+    "concurrency",
+    "model_config_path",
+    "instance_group",
+    "max_batch_size",
+    "satisfies_constraints",
+    "perf_throughput",
+    "perf_latency_p99",
+    "inter_token_latency_p99",
+    "time_to_first_token_p99",
+    "output_token_throughput",
+]
 DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS = [
     "model_name",
     "batch_size",
diff --git a/tests/test_config.py b/tests/test_config.py
index f056eb76a..190075dea 100755
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -24,6 +24,9 @@
 from model_analyzer.cli.cli import CLI
 from model_analyzer.config.input.config_command_profile import ConfigCommandProfile
 from model_analyzer.config.input.config_command_report import ConfigCommandReport
+from model_analyzer.config.input.config_defaults import (
+    DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS,
+)
 from model_analyzer.config.input.config_enum import ConfigEnum
 from model_analyzer.config.input.config_list_generic import ConfigListGeneric
 from model_analyzer.config.input.config_list_numeric import ConfigListNumeric
@@ -2356,6 +2359,28 @@ def test_mixing_request_rate_and_concurrency(self):
         with self.assertRaises(TritonModelAnalyzerException):
             self._evaluate_config(args, yaml_content, subcommand="profile")
 
+    def test_model_type_llm(self):
+        """
+        Test that model type of LLM chooses the correct inference outputs
+        """
+        args = [
+            "model-analyzer",
+            "profile",
+            "--model-repository",
+            "cli-repository",
+            "--profile-models",
+            "modelA",
+            "--model-type",
+            "LLM",
+        ]
+        yaml_content = ""
+
+        config = self._evaluate_config(args, yaml_content)
+
+        self.assertEqual(
+            config.inference_output_fields, DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS
+        )
+
     def _test_request_rate_config_conflicts(
         self, base_args: List[Any], yaml_content: str
     ) -> None: