From c9d467f74717fd800b0ad6b60be0220bf9bcd74b Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Tue, 3 Oct 2023 08:05:04 -0700 Subject: [PATCH 01/12] Adding new options for LLM (#768) * Update README and versions for 23.09 branch (#761) (#767) * Adding new options for LLM * Fixing codeQL issues * Fixing codeQL issue --------- Co-authored-by: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> --- .../config/input/config_command_profile.py | 234 ++++++++++-------- .../config/input/config_defaults.py | 5 + tests/test_cli.py | 45 ++-- 3 files changed, 155 insertions(+), 129 deletions(-) diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py index 02d6def28..9c40f16ef 100755 --- a/model_analyzer/config/input/config_command_profile.py +++ b/model_analyzer/config/input/config_command_profile.py @@ -23,6 +23,7 @@ from google.protobuf.descriptor import FieldDescriptor from tritonclient.grpc.model_config_pb2 import ModelConfig +import model_analyzer.config.input.config_defaults as config_defaults from model_analyzer.config.input.config_utils import ( binary_path_validator, file_path_validator, @@ -36,62 +37,6 @@ from model_analyzer.triton.server.server_config import TritonServerConfig from .config_command import ConfigCommand -from .config_defaults import ( - DEFAULT_ALWAYS_REPORT_GPU_METRICS, - DEFAULT_BATCH_SIZES, - DEFAULT_CHECKPOINT_DIRECTORY, - DEFAULT_CLIENT_PROTOCOL, - DEFAULT_COLLECT_CPU_METRICS, - DEFAULT_DURATION_SECONDS, - DEFAULT_EXPORT_PATH, - DEFAULT_FILENAME_MODEL_GPU, - DEFAULT_FILENAME_MODEL_INFERENCE, - DEFAULT_FILENAME_SERVER_ONLY, - DEFAULT_GPU_OUTPUT_FIELDS, - DEFAULT_GPUS, - DEFAULT_INFERENCE_OUTPUT_FIELDS, - DEFAULT_MAX_RETRIES, - DEFAULT_MODEL_WEIGHTING, - DEFAULT_MONITORING_INTERVAL, - DEFAULT_NUM_CONFIGS_PER_MODEL, - DEFAULT_NUM_TOP_MODEL_CONFIGS, - DEFAULT_OFFLINE_OBJECTIVES, - DEFAULT_OFFLINE_PLOTS, - DEFAULT_ONLINE_OBJECTIVES, - DEFAULT_ONLINE_PLOTS, - DEFAULT_OUTPUT_MODEL_REPOSITORY, - DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG, - DEFAULT_PERF_ANALYZER_CPU_UTIL, - DEFAULT_PERF_ANALYZER_PATH, - DEFAULT_PERF_ANALYZER_TIMEOUT, - DEFAULT_PERF_MAX_AUTO_ADJUSTS, - DEFAULT_PERF_OUTPUT_FLAG, - DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS, - DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS, - DEFAULT_REQUEST_RATE_SEARCH_ENABLE, - DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS, - DEFAULT_RUN_CONFIG_MAX_CONCURRENCY, - DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT, - DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE, - DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE, - DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, - DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT, - DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE, - DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, - DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE, - DEFAULT_RUN_CONFIG_SEARCH_DISABLE, - DEFAULT_RUN_CONFIG_SEARCH_MODE, - DEFAULT_SERVER_OUTPUT_FIELDS, - DEFAULT_SKIP_DETAILED_REPORTS, - DEFAULT_SKIP_SUMMARY_REPORTS, - DEFAULT_TRITON_DOCKER_IMAGE, - DEFAULT_TRITON_GRPC_ENDPOINT, - DEFAULT_TRITON_HTTP_ENDPOINT, - DEFAULT_TRITON_INSTALL_PATH, - DEFAULT_TRITON_LAUNCH_MODE, - DEFAULT_TRITON_METRICS_URL, - DEFAULT_TRITON_SERVER_PATH, -) from .config_enum import ConfigEnum from .config_field import ConfigField from .config_list_generic import ConfigListGeneric @@ -224,7 +169,7 @@ def _fill_config(self): ConfigField( "checkpoint_directory", flags=["-s", "--checkpoint-directory"], - default_value=DEFAULT_CHECKPOINT_DIRECTORY, + default_value=config_defaults.DEFAULT_CHECKPOINT_DIRECTORY, 
field_type=ConfigPrimitive(str, validator=parent_path_validator), description="Full path to directory to which to read and write checkpoints and profile data.", ) @@ -234,7 +179,7 @@ def _fill_config(self): "monitoring_interval", flags=["-i", "--monitoring-interval"], field_type=ConfigPrimitive(float), - default_value=DEFAULT_MONITORING_INTERVAL, + default_value=config_defaults.DEFAULT_MONITORING_INTERVAL, description="Interval of time between metrics measurements in seconds", ) ) @@ -243,7 +188,7 @@ def _fill_config(self): "duration_seconds", field_type=ConfigPrimitive(int), flags=["-d", "--duration-seconds"], - default_value=DEFAULT_DURATION_SECONDS, + default_value=config_defaults.DEFAULT_DURATION_SECONDS, description="Specifies how long (seconds) to gather server-only metrics", ) ) @@ -253,7 +198,7 @@ def _fill_config(self): field_type=ConfigPrimitive(bool), flags=["--collect-cpu-metrics"], parser_args={"action": "store_true"}, - default_value=DEFAULT_COLLECT_CPU_METRICS, + default_value=config_defaults.DEFAULT_COLLECT_CPU_METRICS, description="Specify whether CPU metrics are collected or not", ) ) @@ -262,7 +207,7 @@ def _fill_config(self): "gpus", flags=["--gpus"], field_type=ConfigListString(), - default_value=DEFAULT_GPUS, + default_value=config_defaults.DEFAULT_GPUS, description="List of GPU UUIDs to be used for the profiling. " "Use 'all' to profile all the GPUs visible by CUDA.", ) @@ -273,7 +218,7 @@ def _fill_config(self): flags=["--always-report-gpu-metrics"], field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, - default_value=DEFAULT_ALWAYS_REPORT_GPU_METRICS, + default_value=config_defaults.DEFAULT_ALWAYS_REPORT_GPU_METRICS, description="Report GPU metrics, even when the model is `cpu_only`.", ) ) @@ -283,7 +228,7 @@ def _fill_config(self): flags=["--skip-summary-reports"], field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, - default_value=DEFAULT_SKIP_SUMMARY_REPORTS, + default_value=config_defaults.DEFAULT_SKIP_SUMMARY_REPORTS, description="Skips the generation of analysis summary reports and tables.", ) ) @@ -293,7 +238,7 @@ def _fill_config(self): flags=["--skip-detailed-reports"], field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, - default_value=DEFAULT_SKIP_DETAILED_REPORTS, + default_value=config_defaults.DEFAULT_SKIP_DETAILED_REPORTS, description="Skips the generation of detailed summary reports and tables.", ) ) @@ -325,7 +270,7 @@ def _add_repository_configs(self): ConfigField( "output_model_repository_path", field_type=ConfigPrimitive(str), - default_value=DEFAULT_OUTPUT_MODEL_REPOSITORY, + default_value=config_defaults.DEFAULT_OUTPUT_MODEL_REPOSITORY, flags=["--output-model-repository-path"], description="Output model repository path used by Model Analyzer." 
" This is the directory that will contain all the generated model configurations", @@ -336,7 +281,7 @@ def _add_repository_configs(self): "override_output_model_repository", field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, - default_value=DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG, + default_value=config_defaults.DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG, flags=["--override-output-model-repository"], description="Will override the contents of the output model repository" " and replace it with the new results.", @@ -520,7 +465,7 @@ def _add_profile_models_configs(self): ConfigField( "objectives", field_type=objectives_scheme, - default_value=DEFAULT_OFFLINE_OBJECTIVES, + default_value=config_defaults.DEFAULT_OFFLINE_OBJECTIVES, description="Model Analyzer uses the objectives described here to find the best configuration for each model.", ) ) @@ -602,7 +547,7 @@ def _add_profile_models_configs(self): "batch_sizes", flags=["-b", "--batch-sizes"], field_type=ConfigListNumeric(int), - default_value=DEFAULT_BATCH_SIZES, + default_value=config_defaults.DEFAULT_BATCH_SIZES, description="Comma-delimited list of batch sizes to use for the profiling", ) ) @@ -624,6 +569,24 @@ def _add_profile_models_configs(self): " to be used during profiling", ) ) + self._add_config( + ConfigField( + "prompt_length", + flags=["--prompt-length"], + field_type=ConfigListNumeric(int), + description="Comma-delimited list of prompt length values or ranges " + " to be used during profiling LLMs", + ) + ) + self._add_config( + ConfigField( + "max_token_count", + flags=["--max-token-count"], + field_type=ConfigListNumeric(int), + description="Comma-delimited list of max token values or ranges " + " to be used during profiling LLMs", + ) + ) self._add_config( ConfigField( "reload_model_disable", @@ -685,7 +648,7 @@ def _add_client_configs(self): "client_max_retries", flags=["-r", "--client-max-retries"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_MAX_RETRIES, + default_value=config_defaults.DEFAULT_MAX_RETRIES, description="Specifies the max number of retries for any requests to Triton server.", ) ) @@ -695,7 +658,7 @@ def _add_client_configs(self): flags=["--client-protocol"], choices=["http", "grpc"], field_type=ConfigPrimitive(str), - default_value=DEFAULT_CLIENT_PROTOCOL, + default_value=config_defaults.DEFAULT_CLIENT_PROTOCOL, description="The protocol used to communicate with the Triton Inference Server", ) ) @@ -721,7 +684,7 @@ def _add_run_search_configs(self): "run_config_search_max_concurrency", flags=["--run-config-search-max-concurrency"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MAX_CONCURRENCY, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_CONCURRENCY, description="Max concurrency value that run config search should not go beyond that.", ) ) @@ -730,7 +693,7 @@ def _add_run_search_configs(self): "run_config_search_min_concurrency", flags=["--run-config-search-min-concurrency"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, description="Min concurrency value that run config search should start with.", ) ) @@ -739,7 +702,7 @@ def _add_run_search_configs(self): "run_config_search_max_request_rate", flags=["--run-config-search-max-request-rate"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE, description="Max request rate value that run 
config search should not go beyond that.", ) ) @@ -748,7 +711,7 @@ def _add_run_search_configs(self): "run_config_search_min_request_rate", flags=["--run-config-search-min-request-rate"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, description="Min request rate value that run config search should start with.", ) ) @@ -757,7 +720,7 @@ def _add_run_search_configs(self): "run_config_search_max_instance_count", flags=["--run-config-search-max-instance-count"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT, description="Max instance count value that run config search should not go beyond that.", ) ) @@ -766,7 +729,7 @@ def _add_run_search_configs(self): "run_config_search_min_instance_count", flags=["--run-config-search-min-instance-count"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT, description="Min instance count value that run config search should start with.", ) ) @@ -775,7 +738,7 @@ def _add_run_search_configs(self): "run_config_search_max_model_batch_size", flags=["--run-config-search-max-model-batch-size"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE, description="Value for the model's max_batch_size that run config search will not go beyond.", ) ) @@ -784,7 +747,7 @@ def _add_run_search_configs(self): "run_config_search_min_model_batch_size", flags=["--run-config-search-min-model-batch-size"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE, description="Value for the model's max_batch_size that run config search will start from.", ) ) @@ -793,7 +756,7 @@ def _add_run_search_configs(self): "run_config_search_max_binary_search_steps", flags=["--run-config-search-max-binary-search-steps"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS, description="Maximum number of steps take during the binary concurrency search.", ) ) @@ -803,7 +766,7 @@ def _add_run_search_configs(self): flags=["--run-config-search-mode"], choices=["brute", "quick"], field_type=ConfigPrimitive(str), - default_value=DEFAULT_RUN_CONFIG_SEARCH_MODE, + default_value=config_defaults.DEFAULT_RUN_CONFIG_SEARCH_MODE, description="The search mode for Model Analyzer to find and evaluate" " model configurations. 'brute' will brute force all combinations of" " configuration options. 
'quick' will attempt to find a near-optimal" @@ -817,7 +780,7 @@ def _add_run_search_configs(self): flags=["--run-config-search-disable"], field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, - default_value=DEFAULT_RUN_CONFIG_SEARCH_DISABLE, + default_value=config_defaults.DEFAULT_RUN_CONFIG_SEARCH_DISABLE, description="Disable run config search.", ) ) @@ -827,7 +790,7 @@ def _add_run_search_configs(self): flags=["--run-config-profile-models-concurrently-enable"], field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, - default_value=DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE, + default_value=config_defaults.DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE, description="Enable the profiling of all supplied models concurrently.", ) ) @@ -837,10 +800,56 @@ def _add_run_search_configs(self): flags=["--request-rate-search-enable"], field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, - default_value=DEFAULT_REQUEST_RATE_SEARCH_ENABLE, + default_value=config_defaults.DEFAULT_REQUEST_RATE_SEARCH_ENABLE, description="Enables the searching of request rate (instead of concurrency).", ) ) + self._add_config( + ConfigField( + "llm_search_enable", + flags=["--llm-search-enable"], + field_type=ConfigPrimitive(bool), + parser_args={"action": "store_true"}, + default_value=config_defaults.DEFAULT_LLM_SEARCH_ENABLE, + description="Enables searching values are important to LLMs: prompt length, max token, etc...", + ) + ) + self._add_config( + ConfigField( + "run_config_search_min_prompt_length", + flags=["--run-config-search-min-prompt-length"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH, + description="Min prompt length that run config search should start with.", + ) + ) + self._add_config( + ConfigField( + "run_config_search_max_prompt_length", + flags=["--run-config-search-max-prompt-length"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH, + description="Max prompt length that run config search will not go beyond.", + ) + ) + self._add_config( + ConfigField( + "run_config_search_min_token_count", + flags=["--run-config-search-min-token-count"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT, + description="Min token count that run config search should start with.", + ) + ) + self._add_config( + ConfigField( + "run_config_search_max_token_count", + flags=["--run-config-search-max-token-count"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT, + description="Max token count that run config search will not go beyond.", + ) + ) def _add_triton_configs(self): """ @@ -853,7 +862,7 @@ def _add_triton_configs(self): "triton_launch_mode", field_type=ConfigPrimitive(str), flags=["--triton-launch-mode"], - default_value=DEFAULT_TRITON_LAUNCH_MODE, + default_value=config_defaults.DEFAULT_TRITON_LAUNCH_MODE, choices=["local", "docker", "remote", "c_api"], description="The method by which to launch Triton Server. " "'local' assumes tritonserver binary is available locally. 
" @@ -869,7 +878,7 @@ def _add_triton_configs(self): "triton_docker_image", flags=["--triton-docker-image"], field_type=ConfigPrimitive(str), - default_value=DEFAULT_TRITON_DOCKER_IMAGE, + default_value=config_defaults.DEFAULT_TRITON_DOCKER_IMAGE, description="Triton Server Docker image tag", ) ) @@ -878,7 +887,7 @@ def _add_triton_configs(self): "triton_http_endpoint", flags=["--triton-http-endpoint"], field_type=ConfigPrimitive(str), - default_value=DEFAULT_TRITON_HTTP_ENDPOINT, + default_value=config_defaults.DEFAULT_TRITON_HTTP_ENDPOINT, description="Triton Server HTTP endpoint url used by Model Analyzer client.", ) ) @@ -887,7 +896,7 @@ def _add_triton_configs(self): "triton_grpc_endpoint", flags=["--triton-grpc-endpoint"], field_type=ConfigPrimitive(str), - default_value=DEFAULT_TRITON_GRPC_ENDPOINT, + default_value=config_defaults.DEFAULT_TRITON_GRPC_ENDPOINT, description="Triton Server HTTP endpoint url used by Model Analyzer client.", ) ) @@ -896,7 +905,7 @@ def _add_triton_configs(self): "triton_metrics_url", field_type=ConfigPrimitive(str), flags=["--triton-metrics-url"], - default_value=DEFAULT_TRITON_METRICS_URL, + default_value=config_defaults.DEFAULT_TRITON_METRICS_URL, description="Triton Server Metrics endpoint url. ", ) ) @@ -905,7 +914,7 @@ def _add_triton_configs(self): "triton_server_path", field_type=ConfigPrimitive(str), flags=["--triton-server-path"], - default_value=DEFAULT_TRITON_SERVER_PATH, + default_value=config_defaults.DEFAULT_TRITON_SERVER_PATH, description="The full path to the tritonserver binary executable", ) ) @@ -953,7 +962,7 @@ def _add_triton_configs(self): ConfigField( "triton_install_path", field_type=ConfigPrimitive(str), - default_value=DEFAULT_TRITON_INSTALL_PATH, + default_value=config_defaults.DEFAULT_TRITON_INSTALL_PATH, flags=["--triton-install-path"], description=( "Path to Triton install directory i.e. the parent directory of 'lib/libtritonserver.so'." @@ -973,7 +982,7 @@ def _add_perf_analyzer_configs(self): "perf_analyzer_timeout", flags=["--perf-analyzer-timeout"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_PERF_ANALYZER_TIMEOUT, + default_value=config_defaults.DEFAULT_PERF_ANALYZER_TIMEOUT, description="Perf analyzer timeout value in seconds.", ) ) @@ -982,7 +991,8 @@ def _add_perf_analyzer_configs(self): "perf_analyzer_cpu_util", flags=["--perf-analyzer-cpu-util"], field_type=ConfigPrimitive(float), - default_value=psutil.cpu_count() * DEFAULT_PERF_ANALYZER_CPU_UTIL, + default_value=psutil.cpu_count() + * config_defaults.DEFAULT_PERF_ANALYZER_CPU_UTIL, description="Maximum CPU utilization value allowed for the perf_analyzer.", ) ) @@ -991,7 +1001,7 @@ def _add_perf_analyzer_configs(self): "perf_analyzer_path", flags=["--perf-analyzer-path"], field_type=ConfigPrimitive(str, validator=binary_path_validator), - default_value=DEFAULT_PERF_ANALYZER_PATH, + default_value=config_defaults.DEFAULT_PERF_ANALYZER_PATH, description="The full path to the perf_analyzer binary executable", ) ) @@ -1001,7 +1011,7 @@ def _add_perf_analyzer_configs(self): flags=["--perf-output"], parser_args={"action": "store_true"}, field_type=ConfigPrimitive(bool), - default_value=DEFAULT_PERF_OUTPUT_FLAG, + default_value=config_defaults.DEFAULT_PERF_OUTPUT_FLAG, description="Enables the output from the perf_analyzer to a file specified by" " perf_output_path. 
If perf_output_path is None, output will be" " written to stdout.", @@ -1020,7 +1030,7 @@ def _add_perf_analyzer_configs(self): "perf_analyzer_max_auto_adjusts", flags=["--perf-analyzer-max-auto-adjusts"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_PERF_MAX_AUTO_ADJUSTS, + default_value=config_defaults.DEFAULT_PERF_MAX_AUTO_ADJUSTS, description="Maximum number of times perf_analyzer is " "launched with auto adjusted parameters in an attempt to profile a model. ", ) @@ -1034,7 +1044,7 @@ def _add_export_configs(self): ConfigField( "export_path", flags=["-e", "--export-path"], - default_value=DEFAULT_EXPORT_PATH, + default_value=config_defaults.DEFAULT_EXPORT_PATH, field_type=ConfigPrimitive(str, validator=parent_path_validator), description="Full path to directory in which to store the results", ) @@ -1043,7 +1053,7 @@ def _add_export_configs(self): ConfigField( "filename_model_inference", flags=["--filename-model-inference"], - default_value=DEFAULT_FILENAME_MODEL_INFERENCE, + default_value=config_defaults.DEFAULT_FILENAME_MODEL_INFERENCE, field_type=ConfigPrimitive(str), description="Specifies filename for storing model inference metrics", ) @@ -1053,7 +1063,7 @@ def _add_export_configs(self): "filename_model_gpu", flags=["--filename-model-gpu"], field_type=ConfigPrimitive(str), - default_value=DEFAULT_FILENAME_MODEL_GPU, + default_value=config_defaults.DEFAULT_FILENAME_MODEL_GPU, description="Specifies filename for storing model GPU metrics", ) ) @@ -1062,7 +1072,7 @@ def _add_export_configs(self): "filename_server_only", flags=["--filename-server-only"], field_type=ConfigPrimitive(str), - default_value=DEFAULT_FILENAME_SERVER_ONLY, + default_value=config_defaults.DEFAULT_FILENAME_SERVER_ONLY, description="Specifies filename for server-only metrics", ) ) @@ -1076,7 +1086,7 @@ def _add_report_configs(self): "num_configs_per_model", flags=["--num-configs-per-model"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_NUM_CONFIGS_PER_MODEL, + default_value=config_defaults.DEFAULT_NUM_CONFIGS_PER_MODEL, description="The number of configurations to plot per model in the summary.", ) ) @@ -1085,7 +1095,7 @@ def _add_report_configs(self): "num_top_model_configs", flags=["--num-top-model-configs"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_NUM_TOP_MODEL_CONFIGS, + default_value=config_defaults.DEFAULT_NUM_TOP_MODEL_CONFIGS, description="Model Analyzer will compare this many of the top models configs across all models.", ) ) @@ -1100,7 +1110,7 @@ def _add_table_configs(self): "inference_output_fields", flags=["--inference-output-fields"], field_type=ConfigListString(), - default_value=DEFAULT_INFERENCE_OUTPUT_FIELDS, + default_value=config_defaults.DEFAULT_INFERENCE_OUTPUT_FIELDS, description="Specifies column keys for model inference metrics table", ) ) @@ -1109,7 +1119,7 @@ def _add_table_configs(self): "gpu_output_fields", flags=["--gpu-output-fields"], field_type=ConfigListString(), - default_value=DEFAULT_GPU_OUTPUT_FIELDS, + default_value=config_defaults.DEFAULT_GPU_OUTPUT_FIELDS, description="Specifies column keys for model gpu metrics table", ) ) @@ -1118,7 +1128,7 @@ def _add_table_configs(self): "server_output_fields", flags=["--server-output-fields"], field_type=ConfigListString(), - default_value=DEFAULT_SERVER_OUTPUT_FIELDS, + default_value=config_defaults.DEFAULT_SERVER_OUTPUT_FIELDS, description="Specifies column keys for server-only metrics table", ) ) @@ -1163,7 +1173,9 @@ def set_config_values(self, args: argparse.Namespace) -> None: this 
exception """ if args.mode == "online" and "latency_budget" not in args: - self._fields["objectives"].set_default_value(DEFAULT_ONLINE_OBJECTIVES) + self._fields["objectives"].set_default_value( + config_defaults.DEFAULT_ONLINE_OBJECTIVES + ) super().set_config_values(args) @@ -1171,9 +1183,9 @@ def set_config_values(self, args: argparse.Namespace) -> None: # able to edit these plots. self._add_plot_configs() if args.mode == "online": - self._fields["plots"].set_value(DEFAULT_ONLINE_PLOTS) + self._fields["plots"].set_value(config_defaults.DEFAULT_ONLINE_PLOTS) elif args.mode == "offline": - self._fields["plots"].set_value(DEFAULT_OFFLINE_PLOTS) + self._fields["plots"].set_value(config_defaults.DEFAULT_OFFLINE_PLOTS) def _add_plot_configs(self): """ @@ -1336,11 +1348,13 @@ def _autofill_values(self): if self._using_request_rate(): if not self._fields["inference_output_fields"].is_set_by_user(): self.inference_output_fields = ( - DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS + config_defaults.DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS ) if not self._fields["gpu_output_fields"].is_set_by_user(): - self.gpu_output_fields = DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS + self.gpu_output_fields = ( + config_defaults.DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS + ) new_profile_models = {} for i, model in enumerate(self.profile_models): @@ -1369,7 +1383,7 @@ def _autofill_values(self): "Weighting can not be specified as a global parameter. Please make this a model parameter." ) else: - new_model["weighting"] = DEFAULT_MODEL_WEIGHTING + new_model["weighting"] = config_defaults.DEFAULT_MODEL_WEIGHTING else: new_model["weighting"] = model.weighting() diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py index 67c62dca9..c2edd6e91 100755 --- a/model_analyzer/config/input/config_defaults.py +++ b/model_analyzer/config/input/config_defaults.py @@ -51,10 +51,15 @@ DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1 DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE = 128 DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5 +DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH = 1 +DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH = 1000 +DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT = 1 +DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT = 256 DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False DEFAULT_RUN_CONFIG_SEARCH_MODE = "brute" DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE = False DEFAULT_REQUEST_RATE_SEARCH_ENABLE = False +DEFAULT_LLM_SEARCH_ENABLE = False DEFAULT_TRITON_LAUNCH_MODE = "local" DEFAULT_TRITON_DOCKER_IMAGE = "nvcr.io/nvidia/tritonserver:23.09-py3" DEFAULT_TRITON_HTTP_ENDPOINT = "localhost:8000" diff --git a/tests/test_cli.py b/tests/test_cli.py index 98ec60237..75be15038 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -30,10 +30,10 @@ import psutil +import model_analyzer.config.input.config_defaults as config_defaults from model_analyzer.cli.cli import CLI from model_analyzer.config.input.config_command_profile import ConfigCommandProfile from model_analyzer.config.input.config_command_report import ConfigCommandReport -from model_analyzer.config.input.config_defaults import DEFAULT_TRITON_DOCKER_IMAGE from model_analyzer.config.input.config_status import ConfigStatus from model_analyzer.constants import CONFIG_PARSER_SUCCESS from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException @@ -60,6 +60,7 @@ def get_test_options(): OptionStruct("bool", "profile","--run-config-search-disable"), OptionStruct("bool", "profile","--run-config-profile-models-concurrently-enable"), OptionStruct("bool", 
"profile","--request-rate-search-enable"), + OptionStruct("bool", "profile","--llm-search-enable"), OptionStruct("bool", "profile","--reload-model-disable"), OptionStruct("bool", "profile","--early-exit-enable"), OptionStruct("bool", "profile","--skip-summary-reports"), @@ -71,23 +72,27 @@ def get_test_options(): # The following options can be None: # short_option # expected_default_value - OptionStruct("int", "profile", "--client-max-retries", "-r", "125", "50"), - OptionStruct("int", "profile", "--duration-seconds", "-d", "10", "3"), - OptionStruct("int", "profile", "--perf-analyzer-timeout", None, "100", "600"), - OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", "10"), - OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", "1"), - OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", "1024"), - OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", "16"), - OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", "8192"), - OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", "1"), - OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", "128"), - OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", "1"), - OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", "5"), - OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", "5"), - OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", "1.0"), - OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * 80.0)), - OptionStruct("int", "profile", "--num-configs-per-model", None, "10", "3"), - OptionStruct("int", "profile", "--num-top-model-configs", None, "10", "0"), + OptionStruct("int", "profile", "--client-max-retries", "-r", "125", str(config_defaults.DEFAULT_MAX_RETRIES)), + OptionStruct("int", "profile", "--duration-seconds", "-d", "10", str(config_defaults.DEFAULT_DURATION_SECONDS)), + OptionStruct("int", "profile", "--perf-analyzer-timeout", None, "100", str(config_defaults.DEFAULT_PERF_ANALYZER_TIMEOUT)), + OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", str(config_defaults.DEFAULT_PERF_MAX_AUTO_ADJUSTS)), + OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_CONCURRENCY)), + OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_CONCURRENCY)), + OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE)), + OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE)), + OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE)), + OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE)), + OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT)), + OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", 
str(config_defaults.DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT)), + OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS)), + OptionStruct("int", "profile", "--run-config-search-min-prompt-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH)), + OptionStruct("int", "profile", "--run-config-search-max-prompt-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH)), + OptionStruct("int", "profile", "--run-config-search-min-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT)), + OptionStruct("int", "profile", "--run-config-search-max-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT)), + OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", str(config_defaults.DEFAULT_MONITORING_INTERVAL)), + OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * config_defaults.DEFAULT_PERF_ANALYZER_CPU_UTIL)), + OptionStruct("int", "profile", "--num-configs-per-model", None, "10", str(config_defaults.DEFAULT_NUM_CONFIGS_PER_MODEL)), + OptionStruct("int", "profile", "--num-top-model-configs", None, "10", str(config_defaults.DEFAULT_NUM_TOP_MODEL_CONFIGS)), OptionStruct("int", "profile", "--latency-budget", None, "200", None), OptionStruct("int", "profile", "--min-throughput", None, "300", None), @@ -105,7 +110,7 @@ def get_test_options(): OptionStruct("string", "profile", "--client-protocol", None, ["http", "grpc"], "grpc", "SHOULD_FAIL"), OptionStruct("string", "profile", "--perf-analyzer-path", None, ".", "perf_analyzer", None), OptionStruct("string", "profile", "--perf-output-path", None, ".", None, None), - OptionStruct("string", "profile", "--triton-docker-image", None, "test_image", DEFAULT_TRITON_DOCKER_IMAGE, None), + OptionStruct("string", "profile", "--triton-docker-image", None, "test_image", config_defaults.DEFAULT_TRITON_DOCKER_IMAGE, None), OptionStruct("string", "profile", "--triton-http-endpoint", None, "localhost:4000", "localhost:8000", None), OptionStruct("string", "profile", "--triton-grpc-endpoint", None, "localhost:4001", "localhost:8001", None), OptionStruct("string", "profile", "--triton-metrics-url", None, "localhost:4002", "http://localhost:8002/metrics", None), @@ -135,6 +140,8 @@ def get_test_options(): OptionStruct("intlist", "profile", "--batch-sizes", "-b", "2, 4, 6", "1"), OptionStruct("intlist", "profile", "--concurrency", "-c", "1, 2, 3", None), OptionStruct("intlist", "profile", "--request-rate", None, "1, 2, 3", None), + OptionStruct("intlist", "profile", "--prompt-length", None, "1, 2, 3", None), + OptionStruct("intlist", "profile", "--max-token-count", None, "1, 2, 3", None), OptionStruct("stringlist", "profile", "--triton-docker-mounts", None, "a:b:c, d:e:f", None, extra_commands=["--triton-launch-mode", "docker"]), OptionStruct("stringlist", "profile", "--gpus", None, "a, b, c", "all"), OptionStruct("stringlist", "profile", "--inference-output-fields", None, "a, b, c", From 1b7aee0655bae9eaa750198470e63e811b225b7c Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Thu, 12 Oct 2023 08:54:28 -0700 Subject: [PATCH 02/12] Add LLM support to Brute Search (#769) * Initial coding complete * First unit test passing * Adding test for prompt length * Refactor PACG methods * Further refactoring * Ensure early exit isn't enabled for LLM models * Fix type checking errors * 
Attempt at fixing codeql issue * Revert "Attempt at fixing codeql issue" This reverts commit 2619b83c73d296a174229526c835fe31eeb1d3ca. * Attempt at codeQL fix * Adding deepcopy back in * Removing deepcopy in an attempt to fix codeQL errors * Update model_analyzer/config/input/config_command_profile.py Co-authored-by: Hyunjae Woo <107147848+nv-hwoo@users.noreply.github.com> * Update model_analyzer/config/generate/perf_analyzer_config_generator.py Co-authored-by: Hyunjae Woo <107147848+nv-hwoo@users.noreply.github.com> * Update model_analyzer/config/generate/perf_analyzer_config_generator.py Co-authored-by: Hyunjae Woo <107147848+nv-hwoo@users.noreply.github.com> * Update model_analyzer/config/generate/perf_analyzer_config_generator.py Co-authored-by: Hyunjae Woo <107147848+nv-hwoo@users.noreply.github.com> * Moving location of method * Changing parameter to inference load * Changing parameter to inference load * Changing prompt length to text input length * Changing max_tokens to use request-parameter * Fix input-data typo * Changing non-parameter to parameter --------- Co-authored-by: Hyunjae Woo <107147848+nv-hwoo@users.noreply.github.com> --- .../automatic_model_config_generator.py | 11 +- ...lus_binary_search_run_config_generator.py} | 28 +- .../generate/model_run_config_generator.py | 12 +- .../perf_analyzer_config_generator.py | 282 +++++++++++++----- ..._concurrency_sweep_run_config_generator.py | 12 +- .../generate/run_config_generator_factory.py | 10 +- .../config/input/config_command_profile.py | 64 +++- .../config/input/config_defaults.py | 5 +- model_analyzer/constants.py | 2 +- model_analyzer/perf_analyzer/perf_config.py | 4 + ...ter_search.py => inference_load_search.py} | 113 +++---- tests/common/test_utils.py | 13 + tests/test_cli.py | 6 +- tests/test_config.py | 114 ++++++- ...earch.py => test_inference_load_search.py} | 40 +-- tests/test_perf_analyzer_config_generator.py | 118 +++++++- 16 files changed, 620 insertions(+), 214 deletions(-) rename model_analyzer/config/generate/{brute_plus_binary_parameter_search_run_config_generator.py => brute_plus_binary_search_run_config_generator.py} (86%) rename model_analyzer/result/{parameter_search.py => inference_load_search.py} (63%) rename tests/{test_parameter_search.py => test_inference_load_search.py} (92%) diff --git a/model_analyzer/config/generate/automatic_model_config_generator.py b/model_analyzer/config/generate/automatic_model_config_generator.py index 79925cb7d..283f112d0 100755 --- a/model_analyzer/config/generate/automatic_model_config_generator.py +++ b/model_analyzer/config/generate/automatic_model_config_generator.py @@ -79,10 +79,7 @@ def __init__( logger.info("") AutomaticModelConfigGenerator._log_first_run = True - self._max_instance_count = config.run_config_search_max_instance_count - self._min_instance_count = config.run_config_search_min_instance_count - self._max_model_batch_size = config.run_config_search_max_model_batch_size - self._min_model_batch_size = config.run_config_search_min_model_batch_size + self._set_min_max_search_values(config) self._instance_kind = "KIND_CPU" if self._cpu_only else "KIND_GPU" @@ -162,3 +159,9 @@ def _get_curr_param_combo(self) -> Dict: config["dynamic_batching"] = {} return config + + def _set_min_max_search_values(self, config: ConfigCommandProfile) -> None: + self._max_instance_count = config.run_config_search_max_instance_count + self._min_instance_count = config.run_config_search_min_instance_count + self._max_model_batch_size = 
config.run_config_search_max_model_batch_size + self._min_model_batch_size = config.run_config_search_min_model_batch_size diff --git a/model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py b/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py similarity index 86% rename from model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py rename to model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py index b0a217274..78d55a1bc 100755 --- a/model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +++ b/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py @@ -29,7 +29,7 @@ from model_analyzer.config.run.run_config import RunConfig from model_analyzer.constants import LOGGER_NAME from model_analyzer.device.gpu_device import GPUDevice -from model_analyzer.result.parameter_search import ParameterSearch +from model_analyzer.result.inference_load_search import InferenceLoadSearch from model_analyzer.result.result_manager import ResultManager from model_analyzer.result.run_config_measurement import RunConfigMeasurement from model_analyzer.triton.client.client import TritonClient @@ -39,10 +39,10 @@ logger = logging.getLogger(LOGGER_NAME) -class BrutePlusBinaryParameterSearchRunConfigGenerator(ConfigGeneratorInterface): +class BrutePlusBinarySearchRunConfigGenerator(ConfigGeneratorInterface): """ First run BruteRunConfigGenerator for a brute search, then for - automatic searches use ParameterSearch to perform a binary search + automatic searches use InferenceLoadSearch to perform a binary search """ def __init__( @@ -132,17 +132,19 @@ def _binary_search_over_top_results(self) -> Generator[RunConfig, None, None]: for result in top_results: run_config = deepcopy(result.run_config()) model_parameters = self._get_model_parameters(model_name) - parameter_search = ParameterSearch( + inference_load_search = InferenceLoadSearch( config=self._config, model_parameters=model_parameters, - skip_parameter_sweep=True, + skip_inference_load_sweep=True, ) - for parameter in parameter_search.search_parameters(): - run_config = self._set_parameter( - run_config, model_parameters, parameter + for inference_load in inference_load_search.search_inference_loads(): + run_config = self._set_inference_load( + run_config, model_parameters, inference_load ) yield run_config - parameter_search.add_run_config_measurement(self._last_measurement) + inference_load_search.add_run_config_measurement( + self._last_measurement + ) def _get_model_parameters(self, model_name: str) -> Dict: for model in self._models: @@ -151,14 +153,14 @@ def _get_model_parameters(self, model_name: str) -> Dict: return {} - def _set_parameter( - self, run_config: RunConfig, model_parameters: Dict, parameter: int + def _set_inference_load( + self, run_config: RunConfig, model_parameters: Dict, inference_load: int ) -> RunConfig: for model_run_config in run_config.model_run_configs(): perf_config = model_run_config.perf_config() if self._config.is_request_rate_specified(model_parameters): - perf_config.update_config({"request-rate-range": parameter}) + perf_config.update_config({"request-rate-range": inference_load}) else: - perf_config.update_config({"concurrency-range": parameter}) + perf_config.update_config({"concurrency-range": inference_load}) return run_config diff --git a/model_analyzer/config/generate/model_run_config_generator.py 
b/model_analyzer/config/generate/model_run_config_generator.py index b068c7577..529fa5b83 100755 --- a/model_analyzer/config/generate/model_run_config_generator.py +++ b/model_analyzer/config/generate/model_run_config_generator.py @@ -150,5 +150,13 @@ def _determine_early_exit_enables( concurrency_specified = model.parameters()["concurrency"] config_parameters_exist = model.model_config_parameters() - self._pacg_early_exit_enable = early_exit_enable or not concurrency_specified - self._mcg_early_exit_enable = early_exit_enable or not config_parameters_exist + if config.is_llm_model(): + self._pacg_early_exit_enable = False + self._mcg_early_exit_enable = False + else: + self._pacg_early_exit_enable = ( + early_exit_enable or not concurrency_specified + ) + self._mcg_early_exit_enable = ( + early_exit_enable or not config_parameters_exist + ) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 985032564..f17c2bc18 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -14,13 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import logging -from typing import Generator, List, Optional +from itertools import repeat +from typing import Dict, Generator, List, Optional, Tuple from model_analyzer.config.input.config_command_profile import ConfigCommandProfile +from model_analyzer.config.input.config_defaults import DEFAULT_INPUT_JSON_PATH from model_analyzer.constants import ( LOGGER_NAME, - THROUGHPUT_MINIMUM_CONSECUTIVE_BATCH_SIZE_TRIES, + THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, THROUGHPUT_MINIMUM_GAIN, ) @@ -62,7 +65,7 @@ def __init__( custom perf analyzer configuration model_parameters: Dict - model constraints for batch_sizes, concurrency and/or request rate + model constraints for batch sizes, concurrency, request rate, text input length, etc.. 
early_exit_enable: Bool If true, this class can early exit during search of concurrency/request rate @@ -72,35 +75,53 @@ def __init__( # All configs are pregenerated in _configs[][] # Indexed as follows: - # _configs[_curr_batch_size_index][_curr_parameter_index] + # _configs[_curr_parameter_index][_curr_inference_load_index] + # + # Parameters are: batch size, text input length, max token size + # Inference load are: concurrency/periodic-concurrency, request-rate # self._curr_parameter_index = 0 - self._curr_batch_size_index = 0 + self._curr_inference_load_index = 0 self._configs: List[List[PerfAnalyzerConfig]] = [] - self._parameter_warning_printed = False + self._inference_load_warning_printed = False # Flag to indicate we have started to return results # self._generator_started = False self._last_results: List[RunConfigMeasurement] = [] + self._inference_load_results: List[Optional[RunConfigMeasurement]] = [] self._parameter_results: List[Optional[RunConfigMeasurement]] = [] - self._batch_size_results: List[Optional[RunConfigMeasurement]] = [] self._model_name = model_name - self._perf_analyzer_flags = model_perf_analyzer_flags - - self._batch_sizes = sorted(model_parameters["batch_sizes"]) self._cli_config = cli_config + self._llm_input_dict = self._create_input_dict(model_perf_analyzer_flags) + + self._perf_analyzer_flags = self._set_perf_analyzer_flags( + model_perf_analyzer_flags + ) + self._model_parameters = model_parameters - self._parameters = self._create_parameter_list() + self._inference_loads = self._create_inference_load_list() + + self._batch_sizes = sorted(model_parameters["batch_sizes"]) + self._text_input_lengths = self._create_text_input_length_list() + self._max_token_counts = self._create_max_token_count_list() + + self._perf_config_parameter_values = self._create_parameter_perf_config_values() + self._parameter_count = len( + utils.generate_parameter_combinations(self._perf_config_parameter_values) + ) + + self._input_json_filename = DEFAULT_INPUT_JSON_PATH + "/input-data.json" + self._generate_perf_configs() @staticmethod def throughput_gain_valid_helper( throughputs: List[Optional[RunConfigMeasurement]], - min_tries: int = THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, + min_tries: int = THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, min_gain: float = THROUGHPUT_MINIMUM_GAIN, ) -> bool: if len(throughputs) < min_tries: @@ -136,8 +157,8 @@ def get_configs(self) -> Generator[PerfAnalyzerConfig, None, None]: break self._generator_started = True - config = self._configs[self._curr_batch_size_index][ - self._curr_parameter_index + config = self._configs[self._curr_parameter_index][ + self._curr_inference_load_index ] yield (config) @@ -166,10 +187,28 @@ def set_last_results( measurement = [max(valid_measurements)] self._last_results = measurement - self._parameter_results.extend(measurement) + self._inference_load_results.extend(measurement) + + def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: Dict) -> Dict: + # For LLM models we will be creating custom input data based on text input length + perf_analyzer_flags = {k: v for k, v in model_perf_analyzer_flags.items()} + + if self._cli_config.is_llm_model(): + perf_analyzer_flags.pop("input-data") + + return perf_analyzer_flags + + def _create_input_dict(self, model_perf_analyzer_flags: Dict) -> Dict: + if self._cli_config.is_llm_model(): + with open(model_perf_analyzer_flags["input-data"], "r") as f: + input_dict = json.load(f) + + return input_dict + else: + return {} - def 
_create_parameter_list(self) -> List[int]: - # The two possible parameters are request rate or concurrency + def _create_inference_load_list(self) -> List[int]: + # The two possible inference loads are request rate or concurrency # Concurrency is the default and will be used unless the user specifies # request rate, either as a model parameter or a config option if self._cli_config.is_request_rate_specified(self._model_parameters): @@ -199,75 +238,176 @@ def _create_concurrency_list(self) -> List[int]: self._cli_config.run_config_search_max_concurrency, ) + def _create_text_input_length_list(self) -> List[int]: + if not self._cli_config.is_llm_model(): + return [] + + if self._model_parameters["text_input_length"]: + return sorted(self._model_parameters["text_input_length"]) + elif self._cli_config.run_config_search_disable: + return [1] + else: + return utils.generate_doubled_list( + self._cli_config.run_config_search_min_text_input_length, + self._cli_config.run_config_search_max_text_input_length, + ) + + def _create_max_token_count_list(self) -> List[int]: + if not self._cli_config.is_llm_model(): + return [] + + if self._model_parameters["max_token_count"]: + return sorted(self._model_parameters["max_token_count"]) + elif self._cli_config.run_config_search_disable: + return [1] + else: + return utils.generate_doubled_list( + self._cli_config.run_config_search_min_token_count, + self._cli_config.run_config_search_max_token_count, + ) + def _generate_perf_configs(self) -> None: - perf_config_non_parameter_values = ( - self._create_non_parameter_perf_config_values() + parameter_combinations = utils.generate_parameter_combinations( + self._perf_config_parameter_values ) + for parameter_combination in parameter_combinations: + perf_configs_for_a_given_combination = [] + for inference_load in self._inference_loads: + new_perf_config = self._create_new_perf_config( + inference_load, parameter_combination + ) + perf_configs_for_a_given_combination.append(new_perf_config) - for params in utils.generate_parameter_combinations( - perf_config_non_parameter_values - ): - configs_with_concurrency = [] - for parameter in self._parameters: - new_perf_config = PerfAnalyzerConfig() + self._configs.append(perf_configs_for_a_given_combination) - new_perf_config.update_config_from_profile_config( - self._model_name, self._cli_config - ) + def _create_new_perf_config( + self, inference_load: int, parameter_combination: Dict + ) -> PerfAnalyzerConfig: + perf_config = self._create_base_perf_config() - new_perf_config.update_config(params) + ( + text_input_length, + modified_parameter_combination, + ) = self._extract_text_input_length(parameter_combination) - if self._cli_config.is_request_rate_specified(self._model_parameters): - new_perf_config.update_config({"request-rate-range": parameter}) - else: - new_perf_config.update_config({"concurrency-range": parameter}) + self._update_perf_config_based_on_parameter_combination( + perf_config, modified_parameter_combination + ) + self._update_perf_config_based_on_inference_load(perf_config, inference_load) + self._update_perf_config_based_on_perf_analyzer_flags(perf_config) + self._update_perf_config_for_llm_model(perf_config, text_input_length) + + return perf_config + + def _create_base_perf_config(self) -> PerfAnalyzerConfig: + perf_config = PerfAnalyzerConfig() + perf_config.update_config_from_profile_config( + self._model_name, self._cli_config + ) + + return perf_config + + def _extract_text_input_length( + self, parameter_combination: Dict + ) -> 
Tuple[int, Dict]: + if not self._cli_config.is_llm_model(): + return 0, parameter_combination + + modified_parameter_combination = { + k: v for k, v in parameter_combination.items() + } + text_input_length = modified_parameter_combination.pop("text-input-length") + return text_input_length, modified_parameter_combination + + def _update_perf_config_based_on_parameter_combination( + self, perf_config: PerfAnalyzerConfig, parameter_combination: Dict + ) -> None: + perf_config.update_config(parameter_combination) + + def _update_perf_config_based_on_perf_analyzer_flags( + self, perf_config: PerfAnalyzerConfig + ) -> None: + perf_config.update_config(self._perf_analyzer_flags) + + def _update_perf_config_based_on_inference_load( + self, perf_config: PerfAnalyzerConfig, inference_load: int + ) -> None: + if self._cli_config.is_llm_model(): + perf_config.update_config({"periodic-concurrency-range": inference_load}) + elif self._cli_config.is_request_rate_specified(self._model_parameters): + perf_config.update_config({"request-rate-range": inference_load}) + else: + perf_config.update_config({"concurrency-range": inference_load}) + + def _update_perf_config_for_llm_model( + self, perf_config: PerfAnalyzerConfig, text_input_length: int + ) -> None: + if not self._cli_config.is_llm_model(): + return + + modified_input_dict = self._modify_text_in_input_dict(text_input_length) + self._write_modified_input_dict_to_file(modified_input_dict) - # User provided flags can override the search parameters - new_perf_config.update_config(self._perf_analyzer_flags) + perf_config.update_config({"input-data": self._input_json_filename}) - configs_with_concurrency.append(new_perf_config) - self._configs.append(configs_with_concurrency) + def _modify_text_in_input_dict(self, text_input_length: int) -> Dict: + modified_text = " ".join(repeat("Hello", text_input_length)) - def _create_non_parameter_perf_config_values(self) -> dict: + modified_input_dict = {k: v for k, v in self._llm_input_dict.items()} + modified_input_dict["data"][0]["text-input"] = modified_text + + return modified_input_dict + + def _write_modified_input_dict_to_file(self, modified_input_dict: Dict) -> None: + with open(self._input_json_filename, "w") as f: + json.dump(modified_input_dict, f) + + def _create_parameter_perf_config_values(self) -> dict: perf_config_values = { "batch-size": self._batch_sizes, } + if self._cli_config.is_llm_model(): + perf_config_values["request-parameter"] = [ + "max_token:" + str(mtc) + ":int" for mtc in self._max_token_counts + ] + perf_config_values["text-input-length"] = self._text_input_lengths + return perf_config_values def _step(self) -> None: - self._step_parameter() + self._step_inference_load() - if self._done_walking_parameters(): - self._add_best_throughput_to_batch_sizes() - self._reset_parameters() - self._step_batch_size() + if self._done_walking_inference_loads(): + self._add_best_throughput_to_parameter_results() + self._reset_inference_loads() + self._step_parameter() - def _add_best_throughput_to_batch_sizes(self) -> None: - if self._parameter_results: + def _add_best_throughput_to_parameter_results(self) -> None: + if self._inference_load_results: # type is List[Optional[RCM]] - best = max(self._parameter_results) # type: ignore - self._batch_size_results.append(best) + best = max(self._inference_load_results) # type: ignore + self._parameter_results.append(best) - def _reset_parameters(self) -> None: - self._curr_parameter_index = 0 - self._parameter_warning_printed = False - 
self._parameter_results = [] + def _reset_inference_loads(self) -> None: + self._curr_inference_load_index = 0 + self._inference_load_warning_printed = False + self._inference_load_results = [] + + def _step_inference_load(self) -> None: + self._curr_inference_load_index += 1 def _step_parameter(self) -> None: self._curr_parameter_index += 1 - def _step_batch_size(self) -> None: - self._curr_batch_size_index += 1 - def _done_walking(self) -> bool: - return self._done_walking_batch_sizes() + return self._done_walking_parameters() - def _done_walking_parameters(self) -> bool: - if len(self._parameters) == self._curr_parameter_index: + def _done_walking_inference_loads(self) -> bool: + if len(self._inference_loads) == self._curr_inference_load_index: return True - if self._early_exit_enable and not self._parameter_throughput_gain_valid(): - if not self._parameter_warning_printed: + if self._early_exit_enable and not self._inference_load_throughput_gain_valid(): + if not self._inference_load_warning_printed: if self._cli_config.is_request_rate_specified(self._model_parameters): logger.info( "No longer increasing request rate as throughput has plateaued" @@ -276,15 +416,15 @@ def _done_walking_parameters(self) -> bool: logger.info( "No longer increasing concurrency as throughput has plateaued" ) - self._parameter_warning_printed = True + self._inference_load_warning_printed = True return True return False - def _done_walking_batch_sizes(self) -> bool: - if len(self._batch_sizes) == self._curr_batch_size_index: + def _done_walking_parameters(self) -> bool: + if self._parameter_count == self._curr_parameter_index: return True - if self._early_exit_enable and not self._batch_size_throughput_gain_valid(): + if self._early_exit_enable and not self._parameter_throughput_gain_valid(): logger.info( "No longer increasing client batch size as throughput has plateaued" ) @@ -295,18 +435,18 @@ def _done_walking_batch_sizes(self) -> bool: def _last_results_erroneous(self) -> bool: return not self._last_results or self._last_results[-1] is None - def _parameter_throughput_gain_valid(self) -> bool: - """Check if any of the last X parameter results resulted in valid gain""" + def _inference_load_throughput_gain_valid(self) -> bool: + """Check if any of the last X inference load results resulted in valid gain""" return PerfAnalyzerConfigGenerator.throughput_gain_valid_helper( - throughputs=self._parameter_results, - min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, + throughputs=self._inference_load_results, + min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, min_gain=THROUGHPUT_MINIMUM_GAIN, ) - def _batch_size_throughput_gain_valid(self) -> bool: - """Check if any of the last X batch_size results resulted in valid gain""" + def _parameter_throughput_gain_valid(self) -> bool: + """Check if any of the last X non-parameter results resulted in valid gain""" return PerfAnalyzerConfigGenerator.throughput_gain_valid_helper( - throughputs=self._batch_size_results, - min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_BATCH_SIZE_TRIES, + throughputs=self._parameter_results, + min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, min_gain=THROUGHPUT_MINIMUM_GAIN, ) diff --git a/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py b/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py index b7adbef97..14a669438 100755 --- a/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +++ 
b/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py @@ -30,7 +30,7 @@ from model_analyzer.config.run.run_config import RunConfig from model_analyzer.constants import LOGGER_NAME from model_analyzer.device.gpu_device import GPUDevice -from model_analyzer.result.parameter_search import ParameterSearch +from model_analyzer.result.inference_load_search import InferenceLoadSearch from model_analyzer.result.result_manager import ResultManager from model_analyzer.result.run_config_measurement import RunConfigMeasurement from model_analyzer.triton.client.client import TritonClient @@ -43,7 +43,7 @@ class QuickPlusConcurrencySweepRunConfigGenerator(ConfigGeneratorInterface): """ First run QuickRunConfigGenerator for a hill climbing search, then use - ParameterSearch for a concurrency sweep + binary search of the default + InferenceLoadSearch for a concurrency sweep + binary search of the default and Top N results """ @@ -139,11 +139,13 @@ def _sweep_concurrency_over_top_results(self) -> Generator[RunConfig, None, None for result in top_results: run_config = deepcopy(result.run_config()) - parameter_search = ParameterSearch(self._config) - for concurrency in parameter_search.search_parameters(): + inference_load_search = InferenceLoadSearch(self._config) + for concurrency in inference_load_search.search_inference_loads(): run_config = self._set_concurrency(run_config, concurrency) yield run_config - parameter_search.add_run_config_measurement(self._last_measurement) + inference_load_search.add_run_config_measurement( + self._last_measurement + ) def _set_concurrency(self, run_config: RunConfig, concurrency: int) -> RunConfig: for model_run_config in run_config.model_run_configs(): diff --git a/model_analyzer/config/generate/run_config_generator_factory.py b/model_analyzer/config/generate/run_config_generator_factory.py index da3fc7a7a..0cdcddeb6 100755 --- a/model_analyzer/config/generate/run_config_generator_factory.py +++ b/model_analyzer/config/generate/run_config_generator_factory.py @@ -31,8 +31,8 @@ from model_analyzer.triton.client.client import TritonClient from model_analyzer.triton.model.model_config import ModelConfig -from .brute_plus_binary_parameter_search_run_config_generator import ( - BrutePlusBinaryParameterSearchRunConfigGenerator, +from .brute_plus_binary_search_run_config_generator import ( + BrutePlusBinarySearchRunConfigGenerator, ) from .config_generator_interface import ConfigGeneratorInterface from .quick_plus_concurrency_sweep_run_config_generator import ( @@ -96,7 +96,7 @@ def create_run_config_generator( model_variant_name_manager=model_variant_name_manager, ) elif command_config.run_config_search_mode == "brute": - return RunConfigGeneratorFactory._create_brute_plus_binary_parameter_search_run_config_generator( + return RunConfigGeneratorFactory._create_brute_plus_binary_search_run_config_generator( command_config=command_config, gpus=gpus, models=new_models, @@ -110,7 +110,7 @@ def create_run_config_generator( ) @staticmethod - def _create_brute_plus_binary_parameter_search_run_config_generator( + def _create_brute_plus_binary_search_run_config_generator( command_config: ConfigCommandProfile, gpus: List[GPUDevice], models: List[ModelProfileSpec], @@ -118,7 +118,7 @@ def _create_brute_plus_binary_parameter_search_run_config_generator( result_manager: ResultManager, model_variant_name_manager: ModelVariantNameManager, ) -> ConfigGeneratorInterface: - return BrutePlusBinaryParameterSearchRunConfigGenerator( + return 
BrutePlusBinarySearchRunConfigGenerator( config=command_config, gpus=gpus, models=models, diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py index 9c40f16ef..a215a2251 100755 --- a/model_analyzer/config/input/config_command_profile.py +++ b/model_analyzer/config/input/config_command_profile.py @@ -498,6 +498,8 @@ def _add_profile_models_configs(self): "batch_sizes": ConfigListNumeric(type_=int), "concurrency": ConfigListNumeric(type_=int), "request_rate": ConfigListNumeric(type_=int), + "text_input_length": ConfigListNumeric(type_=int), + "max_token_count": ConfigListNumeric(type_=int), } ), "objectives": objectives_scheme, @@ -571,10 +573,10 @@ def _add_profile_models_configs(self): ) self._add_config( ConfigField( - "prompt_length", - flags=["--prompt-length"], + "text_input_length", + flags=["--text-input-length"], field_type=ConfigListNumeric(int), - description="Comma-delimited list of prompt length values or ranges " + description="Comma-delimited list of text input length values or ranges " " to be used during profiling LLMs", ) ) @@ -811,25 +813,25 @@ def _add_run_search_configs(self): field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, default_value=config_defaults.DEFAULT_LLM_SEARCH_ENABLE, - description="Enables searching values are important to LLMs: prompt length, max token, etc...", + description="Enables searching values are important to LLMs: text input length, max token, etc...", ) ) self._add_config( ConfigField( - "run_config_search_min_prompt_length", - flags=["--run-config-search-min-prompt-length"], + "run_config_search_min_text_input_length", + flags=["--run-config-search-min-text-input-length"], field_type=ConfigPrimitive(int), - default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH, - description="Min prompt length that run config search should start with.", + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH, + description="Min text input length that run config search should start with.", ) ) self._add_config( ConfigField( - "run_config_search_max_prompt_length", - flags=["--run-config-search-max-prompt-length"], + "run_config_search_max_text_input_length", + flags=["--run-config-search-max-text-input-length"], field_type=ConfigPrimitive(int), - default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH, - description="Max prompt length that run config search will not go beyond.", + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH, + description="Max text input length that run config search will not go beyond.", ) ) self._add_config( @@ -1419,6 +1421,8 @@ def _autofill_values(self): "batch_sizes": self.batch_sizes, "concurrency": self.concurrency, "request_rate": self.request_rate, + "text_input_length": self.text_input_length, + "max_token_count": self.max_token_count, } else: new_model["parameters"] = {} @@ -1443,6 +1447,24 @@ def _autofill_values(self): else: new_model["parameters"].update({"request_rate": self.request_rate}) + if "text_input_length" in model.parameters(): + new_model["parameters"].update( + {"text_input_length": model.parameters()["text_input_length"]} + ) + else: + new_model["parameters"].update( + {"text_input_length": self.text_input_length} + ) + + if "max_token_count" in model.parameters(): + new_model["max_token_count"].update( + {"max_token_count": model.parameters()["max_token_count"]} + ) + else: + new_model["parameters"].update( + {"max_token_count": self.text_input_length} + ) + 
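The _autofill_values block above applies one fallback rule per model parameter: take the value from the model's own parameters section when it is set, otherwise fall back to the top-level CLI value. A minimal sketch of that rule follows; resolve_parameter and the sample values are illustrative only and not part of the patch.

def resolve_parameter(model_params: dict, key: str, global_value) -> dict:
    """Prefer the model-specific setting, else the global CLI setting."""
    if key in model_params:
        return {key: model_params[key]}
    return {key: global_value}

# Example: the model spec overrides max_token_count but not text_input_length,
# so the second value comes from the (assumed) top-level CLI defaults.
model_params = {"max_token_count": [16, 32]}
new_parameters = {}
new_parameters.update(resolve_parameter(model_params, "max_token_count", [1]))
new_parameters.update(resolve_parameter(model_params, "text_input_length", [128]))
print(new_parameters)  # {'max_token_count': [16, 32], 'text_input_length': [128]}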
if ( new_model["parameters"]["request_rate"] and new_model["parameters"]["concurrency"] @@ -1523,3 +1545,21 @@ def is_request_rate_specified(self, model_parameters: dict) -> bool: or self.get_config()["run_config_search_min_request_rate"].is_set_by_user() or self.get_config()["run_config_search_max_request_rate"].is_set_by_user() ) + + def is_llm_model(self) -> bool: + """ + Returns true if the user has enabled llm search or set any llm search value + """ + return ( + self.llm_search_enable + or self.get_config()[ + "run_config_search_min_text_input_length" + ].is_set_by_user() + or self.get_config()[ + "run_config_search_max_text_input_length" + ].is_set_by_user() + or self.get_config()["run_config_search_min_token_count"].is_set_by_user() + or self.get_config()["run_config_search_max_token_count"].is_set_by_user() + or self.get_config()["text_input_length"].is_set_by_user() + or self.get_config()["max_token_count"].is_set_by_user() + ) diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py index c2edd6e91..7e37f7c7d 100755 --- a/model_analyzer/config/input/config_defaults.py +++ b/model_analyzer/config/input/config_defaults.py @@ -38,6 +38,7 @@ DEFAULT_SKIP_SUMMARY_REPORTS = False DEFAULT_SKIP_DETAILED_REPORTS = False DEFAULT_OUTPUT_MODEL_REPOSITORY = os.path.join(os.getcwd(), "output_model_repository") +DEFAULT_INPUT_JSON_PATH = os.getcwd() DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG = False DEFAULT_BATCH_SIZES = 1 DEFAULT_MAX_RETRIES = 50 @@ -51,8 +52,8 @@ DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1 DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE = 128 DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5 -DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH = 1 -DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH = 1000 +DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH = 1 +DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH = 1024 DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT = 1 DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT = 256 DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False diff --git a/model_analyzer/constants.py b/model_analyzer/constants.py index 886360d34..09f581326 100755 --- a/model_analyzer/constants.py +++ b/model_analyzer/constants.py @@ -32,8 +32,8 @@ # Run Search THROUGHPUT_MINIMUM_GAIN = 0.05 +THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES = 4 THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES = 4 -THROUGHPUT_MINIMUM_CONSECUTIVE_BATCH_SIZE_TRIES = 4 # Quick search algorithm constants RADIUS = 3 diff --git a/model_analyzer/perf_analyzer/perf_config.py b/model_analyzer/perf_analyzer/perf_config.py index e9160a44a..7cab2dd3c 100755 --- a/model_analyzer/perf_analyzer/perf_config.py +++ b/model_analyzer/perf_analyzer/perf_config.py @@ -33,6 +33,7 @@ class PerfAnalyzerConfig: "measurement-interval", "concurrency-range", "request-rate-range", + "periodic-concurrency-range", "request-distribution", "request-intervals", "binary-search", @@ -71,6 +72,7 @@ class PerfAnalyzerConfig: "metrics-url", "metrics-interval", "bls-composing-models", + "request-parameter", ] input_to_options = [ @@ -273,6 +275,8 @@ def extract_model_specific_parameters(self): "batch-size": self._options["-b"], "concurrency-range": self._args["concurrency-range"], "request-rate-range": self._args["request-rate-range"], + "periodic-concurrency-range": self._args["periodic-concurrency-range"], + "max-tokens": self._args["request-parameter"], } @classmethod diff --git a/model_analyzer/result/parameter_search.py b/model_analyzer/result/inference_load_search.py similarity index 63% rename from model_analyzer/result/parameter_search.py rename to 
model_analyzer/result/inference_load_search.py index e716a5b7d..5c7c9598d 100755 --- a/model_analyzer/result/parameter_search.py +++ b/model_analyzer/result/inference_load_search.py @@ -21,7 +21,7 @@ from model_analyzer.config.input.config_command_profile import ConfigCommandProfile from model_analyzer.constants import ( LOGGER_NAME, - THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, + THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, THROUGHPUT_MINIMUM_GAIN, ) from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException @@ -30,11 +30,11 @@ logger = logging.getLogger(LOGGER_NAME) -class ParameterSearch: +class InferenceLoadSearch: """ - Generates the next parameter value to use when searching through + Generates the next inference load value to use when searching through RunConfigMeasurements for the best value (according to the users objective) - - Will sweep from by powers of two from min to max parameter + - Will sweep from by powers of two from min to max inference load - If the user specifies a constraint, the algorithm will perform a binary search around the boundary if the constraint is violated @@ -45,43 +45,43 @@ def __init__( self, config: ConfigCommandProfile, model_parameters: dict = {}, - skip_parameter_sweep: bool = False, + skip_inference_load_sweep: bool = False, ) -> None: """ Parameters ---------- config: ConfigCommandProfile Profile configuration information - skip_parameter_sweep: bool - If true, skips the parameter sweep and only does the binary search + skip_inference_load_sweep: bool + If true, skips the inference load sweep and only does the binary search """ - self._skip_parameter_sweep = skip_parameter_sweep - self._parameter_is_request_rate = config.is_request_rate_specified( + self._skip_inference_load_sweep = skip_inference_load_sweep + self._inference_load_is_request_rate = config.is_request_rate_specified( model_parameters ) - if self._parameter_is_request_rate: - self._min_parameter_index = int( + if self._inference_load_is_request_rate: + self._min_inference_load_index = int( log2(config.run_config_search_min_request_rate) ) - self._max_parameter_index = int( + self._max_inference_load_index = int( log2(config.run_config_search_max_request_rate) ) else: - self._min_parameter_index = int( + self._min_inference_load_index = int( log2(config.run_config_search_min_concurrency) ) - self._max_parameter_index = int( + self._max_inference_load_index = int( log2(config.run_config_search_max_concurrency) ) self._max_binary_search_steps = config.run_config_search_max_binary_search_steps self._run_config_measurements: List[Optional[RunConfigMeasurement]] = [] - self._parameters: List[int] = [] - self._last_failing_parameter = 0 - self._last_passing_parameter = 0 + self._inference_loads: List[int] = [] + self._last_failing_inference_load = 0 + self._last_passing_inference_load = 0 def add_run_config_measurement( self, run_config_measurement: Optional[RunConfigMeasurement] @@ -92,30 +92,31 @@ def add_run_config_measurement( """ self._run_config_measurements.append(run_config_measurement) - def search_parameters(self) -> Generator[int, None, None]: + def search_inference_loads(self) -> Generator[int, None, None]: """ - First performs a parameter sweep, and then, if necessary, perform - a binary parameter search around the point where the constraint - violated + First performs an inference load sweep, and then, if necessary, perform + a binary search around the point where the constraint was violated """ - yield from self._perform_parameter_sweep() + 
yield from self._perform_inference_load_sweep() if self._was_constraint_violated(): - yield from self._perform_binary_parameter_search() + yield from self._perform_binary_search() - def _perform_parameter_sweep(self) -> Generator[int, None, None]: - for parameter in ( + def _perform_inference_load_sweep(self) -> Generator[int, None, None]: + for inference_load in ( 2**i - for i in range(self._min_parameter_index, self._max_parameter_index + 1) + for i in range( + self._min_inference_load_index, self._max_inference_load_index + 1 + ) ): - if self._should_continue_parameter_sweep(): - self._parameters.append(parameter) - yield parameter + if self._should_continue_inference_load_sweep(): + self._inference_loads.append(inference_load) + yield inference_load else: # We can't actually skip the sweep because the results need to be added # but, we can suppress the logging messages - if not self._skip_parameter_sweep: - if self._parameter_is_request_rate: + if not self._skip_inference_load_sweep: + if self._inference_load_is_request_rate: logger.info( "Terminating request rate sweep - throughput is decreasing" ) @@ -125,7 +126,7 @@ def _perform_parameter_sweep(self) -> Generator[int, None, None]: ) return - def _should_continue_parameter_sweep(self) -> bool: + def _should_continue_inference_load_sweep(self) -> bool: self._check_measurement_count() if not self._are_minimum_tries_reached(): @@ -134,16 +135,16 @@ def _should_continue_parameter_sweep(self) -> bool: return not self._has_objective_gain_saturated() def _check_measurement_count(self) -> None: - if len(self._run_config_measurements) != len(self._parameters): + if len(self._run_config_measurements) != len(self._inference_loads): raise TritonModelAnalyzerException( - f"Internal Measurement count: {self._parameters}, doesn't match number " + f"Internal Measurement count: {self._inference_loads}, doesn't match number " f"of measurements added: {len(self._run_config_measurements)}." 
) def _are_minimum_tries_reached(self) -> bool: if ( len(self._run_config_measurements) - < THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES + < THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES ): return False else: @@ -155,7 +156,7 @@ def _has_objective_gain_saturated(self) -> bool: def _calculate_gain(self) -> float: first_rcm = self._run_config_measurements[ - -THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES + -THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES ] best_rcm = self._get_best_rcm() @@ -177,7 +178,7 @@ def _get_best_rcm(self) -> Optional[RunConfigMeasurement]: pruned_rcms = [ rcm for rcm in self._run_config_measurements[ - -THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES: + -THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES: ] if rcm ] @@ -188,16 +189,16 @@ def _get_best_rcm(self) -> Optional[RunConfigMeasurement]: def _was_constraint_violated(self) -> bool: for i in range(len(self._run_config_measurements) - 1, 1, -1): if self._at_constraint_failure_boundary(i): - self._last_failing_parameter = self._parameters[i] - self._last_passing_parameter = self._parameters[i - 1] + self._last_failing_inference_load = self._inference_loads[i] + self._last_passing_inference_load = self._inference_loads[i - 1] return True if ( self._run_config_measurements[0] and not self._run_config_measurements[0].is_passing_constraints() ): - self._last_failing_parameter = self._parameters[i] - self._last_passing_parameter = 0 + self._last_failing_inference_load = self._inference_loads[i] + self._last_passing_inference_load = 0 return True else: return False @@ -220,27 +221,31 @@ def _at_constraint_failure_boundary(self, index: int) -> bool: return at_failure_boundary - def _perform_binary_parameter_search(self) -> Generator[int, None, None]: + def _perform_binary_search(self) -> Generator[int, None, None]: # This is needed because we are going to restart the search from the - # parameter that failed - so we expect this to be at the end of the list - self._parameters.append(self._last_failing_parameter) + # inference_load that failed - so we expect this to be at the end of the list + self._inference_loads.append(self._last_failing_inference_load) for i in range(0, self._max_binary_search_steps): - parameter = self._determine_next_binary_parameter() + inference_load = self._determine_next_binary_inference_load() - if parameter != self._parameters[-1]: - self._parameters.append(parameter) - yield parameter + if inference_load != self._inference_loads[-1]: + self._inference_loads.append(inference_load) + yield inference_load - def _determine_next_binary_parameter(self) -> int: + def _determine_next_binary_inference_load(self) -> int: if not self._run_config_measurements[-1]: return 0 if self._run_config_measurements[-1].is_passing_constraints(): - self._last_passing_parameter = self._parameters[-1] - parameter = int((self._last_failing_parameter + self._parameters[-1]) / 2) + self._last_passing_inference_load = self._inference_loads[-1] + inference_load = int( + (self._last_failing_inference_load + self._inference_loads[-1]) / 2 + ) else: - self._last_failing_parameter = self._parameters[-1] - parameter = int((self._last_passing_parameter + self._parameters[-1]) / 2) + self._last_failing_inference_load = self._inference_loads[-1] + inference_load = int( + (self._last_passing_inference_load + self._inference_loads[-1]) / 2 + ) - return parameter + return inference_load diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index 9d418027f..caa9763ce 100755 --- a/tests/common/test_utils.py 
+++ b/tests/common/test_utils.py @@ -23,6 +23,7 @@ from model_analyzer.config.input.config_defaults import ( DEFAULT_BATCH_SIZES, DEFAULT_CLIENT_PROTOCOL, + DEFAULT_INPUT_JSON_PATH, DEFAULT_MEASUREMENT_MODE, DEFAULT_MONITORING_INTERVAL, DEFAULT_OUTPUT_MODEL_REPOSITORY, @@ -237,9 +238,11 @@ def construct_perf_analyzer_config( batch_size=DEFAULT_BATCH_SIZES, concurrency=1, request_rate=None, + max_token_count=1, launch_mode=DEFAULT_TRITON_LAUNCH_MODE, client_protocol=DEFAULT_CLIENT_PROTOCOL, perf_analyzer_flags=None, + llm_search_mode=False, ): """ Constructs a Perf Analyzer Config @@ -262,6 +265,8 @@ def construct_perf_analyzer_config( The client protocol for this PA configuration perf_analyzer_flags: dict A dict of any additional PA flags to be set + llm_search_mode: bool + Indicates we should use LLM search parameters Returns ------- @@ -276,9 +281,17 @@ def construct_perf_analyzer_config( if request_rate: pa_config._args["request-rate-range"] = request_rate + elif llm_search_mode: + pa_config._args["periodic-concurrency-range"] = concurrency else: pa_config._args["concurrency-range"] = concurrency + if llm_search_mode: + pa_config._args["request-parameter"] = ( + "max_token:" + str(max_token_count) + ":int" + ) + pa_config._args["input-data"] = DEFAULT_INPUT_JSON_PATH + "/input-data.json" + pa_config._args["measurement-mode"] = DEFAULT_MEASUREMENT_MODE pa_config.update_config(perf_analyzer_flags) diff --git a/tests/test_cli.py b/tests/test_cli.py index 75be15038..94dbf0b21 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -85,8 +85,8 @@ def get_test_options(): OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT)), OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT)), OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS)), - OptionStruct("int", "profile", "--run-config-search-min-prompt-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH)), - OptionStruct("int", "profile", "--run-config-search-max-prompt-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH)), + OptionStruct("int", "profile", "--run-config-search-min-text-input-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH)), + OptionStruct("int", "profile", "--run-config-search-max-text-input-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH)), OptionStruct("int", "profile", "--run-config-search-min-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT)), OptionStruct("int", "profile", "--run-config-search-max-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT)), OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", str(config_defaults.DEFAULT_MONITORING_INTERVAL)), @@ -140,7 +140,7 @@ def get_test_options(): OptionStruct("intlist", "profile", "--batch-sizes", "-b", "2, 4, 6", "1"), OptionStruct("intlist", "profile", "--concurrency", "-c", "1, 2, 3", None), OptionStruct("intlist", "profile", "--request-rate", None, "1, 2, 3", None), - OptionStruct("intlist", "profile", "--prompt-length", None, "1, 2, 3", None), + OptionStruct("intlist", "profile", "--text-input-length", None, "1, 2, 3", None), OptionStruct("intlist", "profile", "--max-token-count", None, "1, 2, 
3", None), OptionStruct("stringlist", "profile", "--triton-docker-mounts", None, "a:b:c, d:e:f", None, extra_commands=["--triton-launch-mode", "docker"]), OptionStruct("stringlist", "profile", "--gpus", None, "a, b, c", "all"), diff --git a/tests/test_config.py b/tests/test_config.py index ca9835cec..01dc739d8 100755 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -288,12 +288,24 @@ def test_range_and_list_values(self): expected_model_configs = [ ConfigModelProfileSpec( "model_1", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "model_2", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, ), ] @@ -430,12 +442,20 @@ def test_object(self): "batch_sizes": [1], "concurrency": [1, 2, 3, 4], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, ), ] @@ -489,6 +509,8 @@ def test_object(self): "batch_sizes": [1], "concurrency": [1, 2, 3, 4], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10}, ), @@ -498,6 +520,8 @@ def test_object(self): "concurrency": [1, 2, 3, 4], "batch_sizes": [2, 4, 6], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10}, ), @@ -569,6 +593,8 @@ def test_constraints(self): "batch_sizes": [1], "concurrency": [1, 2, 3, 4], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10, "gpu_used_memory": 5}, constraints={ @@ -579,7 +605,13 @@ def test_constraints(self): ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, ), ] @@ -697,7 +729,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -722,7 +760,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -758,7 +802,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( 
"vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [ @@ -801,7 +851,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [ @@ -831,7 +887,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "input": [ @@ -874,7 +936,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, perf_analyzer_flags={ "measurement-interval": 10000, @@ -900,7 +968,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, perf_analyzer_flags={ "measurement-interval": 10000, @@ -1171,7 +1245,13 @@ def test_autofill(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -1215,6 +1295,8 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [2, 4], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10, "gpu_used_memory": 5}, constraints={ @@ -1263,6 +1345,8 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [2, 4], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"gpu_used_memory": 10}, constraints={"perf_latency_p99": {"max": 8000}}, @@ -1307,6 +1391,8 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [2, 4], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"gpu_used_memory": 10}, constraints={"perf_latency_p99": {"max": 8000}}, @@ -1362,6 +1448,8 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [5, 6, 7], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"gpu_used_memory": 10}, constraints={ @@ -1375,6 +1463,8 @@ def test_autofill(self): "batch_sizes": [1, 2], "concurrency": [2, 4], "request_rate": [], + "text_input_length": [], + "max_token_count": 
[], }, objectives={"perf_throughput": 10, "perf_latency_p99": 5}, constraints={"perf_latency_p99": {"max": 8000}}, diff --git a/tests/test_parameter_search.py b/tests/test_inference_load_search.py similarity index 92% rename from tests/test_parameter_search.py rename to tests/test_inference_load_search.py index 7f410bb26..d8643ad66 100755 --- a/tests/test_parameter_search.py +++ b/tests/test_inference_load_search.py @@ -25,17 +25,17 @@ DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, ) -from model_analyzer.constants import THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES +from model_analyzer.constants import THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException from model_analyzer.result.constraint_manager import ConstraintManager -from model_analyzer.result.parameter_search import ParameterSearch +from model_analyzer.result.inference_load_search import InferenceLoadSearch from model_analyzer.result.run_config_measurement import RunConfigMeasurement from .common import test_result_collector as trc from .common.test_utils import construct_run_config_measurement, evaluate_mock_config -class TestParameterSearch(trc.TestResultCollector): +class TestInferenceLoadSearch(trc.TestResultCollector): def setUp(self): self._min_concurrency_index = int(log2(DEFAULT_RUN_CONFIG_MIN_CONCURRENCY)) self._max_concurrency_index = int(log2(DEFAULT_RUN_CONFIG_MAX_CONCURRENCY)) @@ -67,9 +67,9 @@ def test_concurrency_sweep(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) - for concurrency in concurrency_search.search_parameters(): + for concurrency in concurrency_search.search_inference_loads(): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -90,11 +90,11 @@ def test_request_rate_sweep(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch( + concurrency_search = InferenceLoadSearch( config, model_parameters={"request_rate": "True"} ) - for request_rate in concurrency_search.search_parameters(): + for request_rate in concurrency_search.search_inference_loads(): self._request_rates.append(request_rate) concurrency_search.add_run_config_measurement( @@ -115,7 +115,7 @@ def test_saturating_sweep(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) INCREASE_THROUGHPUT_COUNT = 4 # [100, 200, 400, 800, 1000, 1000,...] 
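The test above exercises the early-exit rule of InferenceLoadSearch: the inference load is doubled each step, and the sweep stops once the best result over the last THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES measurements no longer improves on the first of that window by THROUGHPUT_MINIMUM_GAIN. A minimal self-contained sketch of that rule, assuming throughput is the only objective; has_saturated and the synthetic throughput values are illustrative, not the library's API.

THROUGHPUT_MINIMUM_GAIN = 0.05
MIN_TRIES = 4  # mirrors THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES

def has_saturated(throughputs):
    """True once the last MIN_TRIES results show less than 5% relative gain."""
    if len(throughputs) < MIN_TRIES:
        return False
    window = throughputs[-MIN_TRIES:]
    first, best = window[0], max(window)
    return (best - first) / first < THROUGHPUT_MINIMUM_GAIN

# Sweep loads by powers of two; synthetic throughput climbs, then plateaus at 800.
loads, results = [], []
load = 1
while load <= 1024 and not has_saturated(results):
    loads.append(load)
    results.append(min(100 * load, 800))  # 100, 200, 400, 800, 800, ...
    load *= 2
print(loads)  # [1, 2, 4, 8, 16, 32, 64]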
@@ -124,7 +124,7 @@ def test_saturating_sweep(self): for c in range(self._min_concurrency_index, self._max_concurrency_index + 1) ] - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -140,7 +140,7 @@ def test_saturating_sweep(self): 2**c for c in range( INCREASE_THROUGHPUT_COUNT - + THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES + + THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES ) ] self.assertEqual(self._concurrencies, expected_concurrencies) @@ -152,12 +152,12 @@ def test_sweep_with_constraints_decreasing(self): """ config = self._create_single_model_with_constraints("95") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([12, 10, 9]) latencies = [10 * c for c in self._expected_concurrencies] - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -178,12 +178,12 @@ def test_sweep_with_constraints_decrease_then_increase(self): """ config = self._create_single_model_with_constraints("155") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([12, 14, 15]) latencies = [10 * c for c in self._expected_concurrencies] - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -204,14 +204,14 @@ def test_sweep_with_multiple_violation_areas(self): """ config = self._create_single_model_with_constraints("155") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([12, 14, 15]) latencies = [10 * c for c in self._expected_concurrencies] # this adds an early constraint violation which should be ignored latencies[1] = 200 - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -233,12 +233,12 @@ def test_sweep_with_constraints_hitting_limit(self): """ config = self._create_single_model_with_constraints("970") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([768, 896, 960, 992, 976]) latencies = self._expected_concurrencies - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -258,10 +258,10 @@ def test_not_adding_measurements(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) with 
self.assertRaises(TritonModelAnalyzerException): - for concurrency in concurrency_search.search_parameters(): + for concurrency in concurrency_search.search_inference_loads(): self._concurrencies.append(concurrency) if concurrency < 32: diff --git a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py index e9852356e..69e42ef8d 100755 --- a/tests/test_perf_analyzer_config_generator.py +++ b/tests/test_perf_analyzer_config_generator.py @@ -15,7 +15,7 @@ # limitations under the License. import unittest -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, mock_open, patch from model_analyzer.config.generate.generator_utils import GeneratorUtils as utils from model_analyzer.config.generate.perf_analyzer_config_generator import ( @@ -41,7 +41,11 @@ def __init__(self, methodname): super().__init__(methodname) self._perf_throughput = 1 - def test_set_last_results(self): + @patch( + "model_analyzer.config.input.config_command_profile.ConfigCommandProfile.is_llm_model", + return_value=False, + ) + def test_set_last_results(self, *args): """ Test set_last_results() with multi model @@ -60,8 +64,26 @@ def test_set_last_results(self): ["modelA", "modelB"], [{"perf_throughput": 10}, {"perf_throughput": 2}] ) + args = [ + "model-analyzer", + "profile", + "--model-repository", + "cli_repository", + "-f", + "path-to-config-file", + ] + + # yapf: disable + yaml_str = (""" + profile_models: + - my-model + """) + # yapf: enable + + config = evaluate_mock_config(args, yaml_str, subcommand="profile") + pacg = PerfAnalyzerConfigGenerator( - MagicMock(), MagicMock(), MagicMock(), MagicMock(), early_exit_enable=False + config, MagicMock(), MagicMock(), MagicMock(), early_exit_enable=False ) pacg.set_last_results([measurement1, measurement2, measurement3]) @@ -537,6 +559,74 @@ def test_perf_analyzer_flags(self): self._run_and_test_perf_analyzer_config_generator(yaml_str, expected_configs) + def test_llm_search_max_token_count(self): + """ + Test LLM Search: + - max token count 1->256 + + Concurrency and text input length max set to 1 + """ + + # yapf: disable + yaml_str = (""" + perf_analyzer_flags: + input-data: input-data.json + profile_models: + - my-model + """) + # yapf: enable + + max_token_counts = utils.generate_doubled_list(1, 256) + expected_configs = [ + construct_perf_analyzer_config(max_token_count=mtc, llm_search_mode=True) + for mtc in max_token_counts + ] + + pa_cli_args = [ + "--llm-search-enable", + "--run-config-search-max-concurrency", + "1", + "--run-config-search-max-text-input-length", + "1", + ] + self._run_and_test_perf_analyzer_config_generator( + yaml_str, expected_configs, pa_cli_args + ) + + def test_llm_search_text_input_length(self): + """ + Test LLM Search: + - Input length 1->1024 + + Concurrency and max token count set to 1 + """ + + # yapf: disable + yaml_str = (""" + perf_analyzer_flags: + input-data: input-data.json + profile_models: + - my-model + """) + # yapf: enable + + text_input_lengths = utils.generate_doubled_list(1, 1024) + expected_configs = [ + construct_perf_analyzer_config(llm_search_mode=True) + for pl in text_input_lengths + ] + + pa_cli_args = [ + "--llm-search-enable", + "--run-config-search-max-concurrency", + "1", + "--run-config-search-max-token-count", + "1", + ] + self._run_and_test_perf_analyzer_config_generator( + yaml_str, expected_configs, pa_cli_args + ) + def test_perf_analyzer_config_ssl_options(self): """ Test Perf Analyzer SSL options: @@ -754,13 +844,17 @@ def 
_run_and_test_perf_analyzer_config_generator( config = evaluate_mock_config(args, yaml_str, subcommand="profile") - pacg = PerfAnalyzerConfigGenerator( - config, - config.profile_models[0].model_name(), - config.profile_models[0].perf_analyzer_flags(), - config.profile_models[0].parameters(), - early_exit, - ) + with patch( + "model_analyzer.config.generate.perf_analyzer_config_generator.open", + mock_open(read_data=self._input_data), + ): + pacg = PerfAnalyzerConfigGenerator( + config, + config.profile_models[0].model_name(), + config.profile_models[0].perf_analyzer_flags(), + config.profile_models[0].parameters(), + early_exit, + ) perf_analyzer_configs = [] for perf_config in pacg.get_configs(): @@ -824,6 +918,10 @@ def setUp(self): ) self.mock_os.start() + self._input_data = """{ + "data": [{"text_input": ["Hello, my name is"], "stream": [true]}] + }""" + def tearDown(self): self.mock_os.stop() patch.stopall() From 14ea52809a301feae002b5fab535594795614a7f Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Thu, 12 Oct 2023 09:00:15 -0700 Subject: [PATCH 03/12] New LLM record types (#770) * New measurement fields created. * Fixing omission in llm_metric_table * Changing name to be avg_token_to_token... --- model_analyzer/perf_analyzer/perf_analyzer.py | 9 ++ .../record/types/avg_first_token_latency.py | 96 +++++++++++++++++++ .../types/avg_token_to_token_latency.py | 96 +++++++++++++++++++ tests/test_record_types.py | 2 + 4 files changed, 203 insertions(+) create mode 100755 model_analyzer/record/types/avg_first_token_latency.py create mode 100755 model_analyzer/record/types/avg_token_to_token_latency.py diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index c88f8e655..51ad64151 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -36,6 +36,10 @@ ) from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException from model_analyzer.record.record import Record +from model_analyzer.record.types.avg_first_token_latency import AvgFirstTokenLatency +from model_analyzer.record.types.avg_token_to_token_latency import ( + AvgTokenToTokenLatency, +) from model_analyzer.record.types.gpu_free_memory import GPUFreeMemory from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory @@ -91,6 +95,11 @@ class PerfAnalyzer: ["gpu_used_memory", "Max GPU Memory Usage", GPUUsedMemory, "1000000"], ["gpu_free_memory", "Total GPU Memory", GPUFreeMemory, "1000000"] ] + + llm_metric_table = [ + ["avg_first_latency", None, AvgFirstTokenLatency, "1000"], + ["avg_token_to_token_latency", None, AvgTokenToTokenLatency, "1000"] + ] # yapf: enable @staticmethod diff --git a/model_analyzer/record/types/avg_first_token_latency.py b/model_analyzer/record/types/avg_first_token_latency.py new file mode 100755 index 000000000..15badd92a --- /dev/null +++ b/model_analyzer/record/types/avg_first_token_latency.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.record import DecreasingRecord + + +@total_ordering +class AvgFirstTokenLatency(DecreasingRecord): + """ + A record for perf_analyzer avg first token to token latency metric + """ + + tag = "avg_first_token_latency" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed avg time for first token-to-token latency + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "avg first token-to-token latency (ms)" + + def __eq__(self, other): + """ + Allows checking for + equality between two records + """ + + return self.value() == other.value() + + def __lt__(self, other): + """ + Allows checking if + this record is less than + the other + """ + + return self.value() > other.value() + + def __add__(self, other): + """ + Allows adding two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() + other.value())) + + def __sub__(self, other): + """ + Allows subbing two records together + to produce a brand new record. + + ** Note this does reverse subtraction because + of the inverted nature of latency (lower is better) + """ + + return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/avg_token_to_token_latency.py b/model_analyzer/record/types/avg_token_to_token_latency.py new file mode 100755 index 000000000..2941da39b --- /dev/null +++ b/model_analyzer/record/types/avg_token_to_token_latency.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import total_ordering + +from model_analyzer.record.record import DecreasingRecord + + +@total_ordering +class AvgTokenToTokenLatency(DecreasingRecord): + """ + A record for perf_analyzer avg token-to-token latency metric + """ + + tag = "avg_token_to_token_latency" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed avg time for token-to-token latency + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "avg token-to-token latency (ms)" + + def __eq__(self, other): + """ + Allows checking for + equality between two records + """ + + return self.value() == other.value() + + def __lt__(self, other): + """ + Allows checking if + this record is less than + the other + """ + + return self.value() > other.value() + + def __add__(self, other): + """ + Allows adding two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() + other.value())) + + def __sub__(self, other): + """ + Allows subbing two records together + to produce a brand new record. + + ** Note this does reverse subtraction because + of the inverted nature of latency (lower is better) + """ + + return self.__class__(value=(other.value() - self.value())) diff --git a/tests/test_record_types.py b/tests/test_record_types.py index 4bd6d8b32..4d3482af1 100755 --- a/tests/test_record_types.py +++ b/tests/test_record_types.py @@ -59,6 +59,8 @@ def setUp(self): "perf_client_send_recv", "perf_server_compute_input", "gpu_power_usage", + "avg_first_token_latency", + "avg_token_to_token_latency", ] } self.more_is_better_types = { From 6e0fc2469a70784689ef425834d10dd0fcc37165 Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Tue, 17 Oct 2023 10:43:04 -0700 Subject: [PATCH 04/12] New config options based on live run (#775) * Added new config options and modified existing options * Refactoring model parameter setting * Removing magic numbers --- .../config/generate/generator_utils.py | 2 + .../perf_analyzer_config_generator.py | 36 ++- .../config/input/config_command_profile.py | 192 +++++++++++----- .../config/input/config_defaults.py | 10 +- tests/common/test_utils.py | 12 +- tests/test_cli.py | 12 +- tests/test_config.py | 206 +++++------------- tests/test_perf_analyzer_config_generator.py | 12 +- 8 files changed, 257 insertions(+), 225 deletions(-) diff --git a/model_analyzer/config/generate/generator_utils.py b/model_analyzer/config/generate/generator_utils.py index 1f0e9c5eb..ceef010ca 100755 --- a/model_analyzer/config/generate/generator_utils.py +++ b/model_analyzer/config/generate/generator_utils.py @@ -108,6 +108,8 @@ def generate_doubled_list(min_value: int, max_value: int) -> List[int]: The value that the generated list will not exceed """ + assert min_value <= max_value + list = [] val = 1 if min_value == 0 else min_value while val <= max_value: diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index f17c2bc18..771e895f1 100755 --- 
a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -20,7 +20,14 @@ from typing import Dict, Generator, List, Optional, Tuple from model_analyzer.config.input.config_command_profile import ConfigCommandProfile -from model_analyzer.config.input.config_defaults import DEFAULT_INPUT_JSON_PATH +from model_analyzer.config.input.config_defaults import ( + DEFAULT_INPUT_JSON_PATH, + DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, + DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, + DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY, + DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, + DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH, +) from model_analyzer.constants import ( LOGGER_NAME, THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, @@ -211,7 +218,9 @@ def _create_inference_load_list(self) -> List[int]: # The two possible inference loads are request rate or concurrency # Concurrency is the default and will be used unless the user specifies # request rate, either as a model parameter or a config option - if self._cli_config.is_request_rate_specified(self._model_parameters): + if self._cli_config.is_llm_model(): + return self._create_periodic_concurrency_list() + elif self._cli_config.is_request_rate_specified(self._model_parameters): return self._create_request_rate_list() else: return self._create_concurrency_list() @@ -220,7 +229,7 @@ def _create_request_rate_list(self) -> List[int]: if self._model_parameters["request_rate"]: return sorted(self._model_parameters["request_rate"]) elif self._cli_config.run_config_search_disable: - return [1] + return [DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE] else: return utils.generate_doubled_list( self._cli_config.run_config_search_min_request_rate, @@ -231,13 +240,24 @@ def _create_concurrency_list(self) -> List[int]: if self._model_parameters["concurrency"]: return sorted(self._model_parameters["concurrency"]) elif self._cli_config.run_config_search_disable: - return [1] + return [DEFAULT_RUN_CONFIG_MIN_CONCURRENCY] else: return utils.generate_doubled_list( self._cli_config.run_config_search_min_concurrency, self._cli_config.run_config_search_max_concurrency, ) + def _create_periodic_concurrency_list(self) -> List[int]: + if self._model_parameters["periodic_concurrency"]: + return sorted(self._model_parameters["periodic_concurrency"]) + elif self._cli_config.run_config_search_disable: + return [DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY] + else: + return utils.generate_doubled_list( + self._cli_config.run_config_search_min_periodic_concurrency, + self._cli_config.run_config_search_max_periodic_concurrency, + ) + def _create_text_input_length_list(self) -> List[int]: if not self._cli_config.is_llm_model(): return [] @@ -245,7 +265,7 @@ def _create_text_input_length_list(self) -> List[int]: if self._model_parameters["text_input_length"]: return sorted(self._model_parameters["text_input_length"]) elif self._cli_config.run_config_search_disable: - return [1] + return [DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH] else: return utils.generate_doubled_list( self._cli_config.run_config_search_min_text_input_length, @@ -259,11 +279,11 @@ def _create_max_token_count_list(self) -> List[int]: if self._model_parameters["max_token_count"]: return sorted(self._model_parameters["max_token_count"]) elif self._cli_config.run_config_search_disable: - return [1] + return [DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT] else: return utils.generate_doubled_list( - self._cli_config.run_config_search_min_token_count, - 
self._cli_config.run_config_search_max_token_count, + self._cli_config.run_config_search_min_max_token_count, + self._cli_config.run_config_search_max_max_token_count, ) def _generate_perf_configs(self) -> None: diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py index a215a2251..bdce45027 100755 --- a/model_analyzer/config/input/config_command_profile.py +++ b/model_analyzer/config/input/config_command_profile.py @@ -17,6 +17,7 @@ import argparse import logging import os +from typing import Dict import numba.cuda import psutil @@ -497,7 +498,9 @@ def _add_profile_models_configs(self): schema={ "batch_sizes": ConfigListNumeric(type_=int), "concurrency": ConfigListNumeric(type_=int), + "periodic_concurrency": ConfigListNumeric(type_=int), "request_rate": ConfigListNumeric(type_=int), + "request_period": ConfigListNumeric(type_=int), "text_input_length": ConfigListNumeric(type_=int), "max_token_count": ConfigListNumeric(type_=int), } @@ -562,6 +565,15 @@ def _add_profile_models_configs(self): " to be used during profiling", ) ) + self._add_config( + ConfigField( + "periodic_concurrency", + flags=["--periodic-concurrency"], + field_type=ConfigListNumeric(int), + description="Comma-delimited list of periodic concurrency values or ranges " + " to be used during profiling", + ) + ) self._add_config( ConfigField( "request_rate", @@ -571,6 +583,15 @@ def _add_profile_models_configs(self): " to be used during profiling", ) ) + self._add_config( + ConfigField( + "request_period", + flags=["--request-period"], + field_type=ConfigListNumeric(int), + description="Comma-delimited list of request period values or ranges " + " to be used during profiling", + ) + ) self._add_config( ConfigField( "text_input_length", @@ -687,7 +708,7 @@ def _add_run_search_configs(self): flags=["--run-config-search-max-concurrency"], field_type=ConfigPrimitive(int), default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_CONCURRENCY, - description="Max concurrency value that run config search should not go beyond that.", + description="Max concurrency value that run config search should not go beyond.", ) ) self._add_config( @@ -699,13 +720,49 @@ def _add_run_search_configs(self): description="Min concurrency value that run config search should start with.", ) ) + self._add_config( + ConfigField( + "run_config_search_max_periodic_concurrency", + flags=["--run-config-search-max-periodic-concurrency"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY, + description="Max periodic concurrency value that run config search should not go beyond.", + ) + ) + self._add_config( + ConfigField( + "run_config_search_min_periodic_concurrency", + flags=["--run-config-search-min-periodic-concurrency"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY, + description="Min periodic concurrency value that run config search should start with.", + ) + ) + self._add_config( + ConfigField( + "run_config_search_max_periodic_concurrency_step", + flags=["--run-config-search-max-periodic-concurrency-step"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY_STEP, + description="Max periodic concurrency step value that run config search should not go beyond.", + ) + ) + self._add_config( + ConfigField( + "run_config_search_min_periodic_concurrency_step", + 
flags=["--run-config-search-min-periodic-concurrency-step"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY_STEP, + description="Min periodic concurrency step value that run config search should start with.", + ) + ) self._add_config( ConfigField( "run_config_search_max_request_rate", flags=["--run-config-search-max-request-rate"], field_type=ConfigPrimitive(int), default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE, - description="Max request rate value that run config search should not go beyond that.", + description="Max request rate value that run config search should not go beyond.", ) ) self._add_config( @@ -717,13 +774,31 @@ def _add_run_search_configs(self): description="Min request rate value that run config search should start with.", ) ) + self._add_config( + ConfigField( + "run_config_search_max_request_period", + flags=["--run-config-search-max-request-period"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_PERIOD, + description="Max request period value that run config search should not go beyond.", + ) + ) + self._add_config( + ConfigField( + "run_config_search_min_request_period", + flags=["--run-config-search-min-request-period"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD, + description="Min request period value that run config search should start with.", + ) + ) self._add_config( ConfigField( "run_config_search_max_instance_count", flags=["--run-config-search-max-instance-count"], field_type=ConfigPrimitive(int), default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT, - description="Max instance count value that run config search should not go beyond that.", + description="Max instance count value that run config search should not go beyond.", ) ) self._add_config( @@ -836,20 +911,20 @@ def _add_run_search_configs(self): ) self._add_config( ConfigField( - "run_config_search_min_token_count", - flags=["--run-config-search-min-token-count"], + "run_config_search_min_max_token_count", + flags=["--run-config-search-min-max-token-count"], field_type=ConfigPrimitive(int), - default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT, - description="Min token count that run config search should start with.", + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, + description="Min max_token count that run config search should start with.", ) ) self._add_config( ConfigField( - "run_config_search_max_token_count", - flags=["--run-config-search-max-token-count"], + "run_config_search_max_max_token_count", + flags=["--run-config-search-max-max-token-count"], field_type=ConfigPrimitive(int), - default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT, - description="Max token count that run config search will not go beyond.", + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_MAX_TOKEN_COUNT, + description="Max max_token count that run config search will not go beyond.", ) ) @@ -1420,50 +1495,35 @@ def _autofill_values(self): new_model["parameters"] = { "batch_sizes": self.batch_sizes, "concurrency": self.concurrency, + "periodic_concurrency": self.periodic_concurrency, "request_rate": self.request_rate, + "request_period": self.request_period, "text_input_length": self.text_input_length, "max_token_count": self.max_token_count, } else: new_model["parameters"] = {} - if "batch_sizes" in model.parameters(): - new_model["parameters"].update( - 
{"batch_sizes": model.parameters()["batch_sizes"]} - ) - else: - new_model["parameters"].update({"batch_sizes": self.batch_sizes}) - - if "concurrency" in model.parameters(): - new_model["parameters"].update( - {"concurrency": model.parameters()["concurrency"]} - ) - else: - new_model["parameters"].update({"concurrency": self.concurrency}) - - if "request_rate" in model.parameters(): - new_model["parameters"].update( - {"request_rate": model.parameters()["request_rate"]} - ) - else: - new_model["parameters"].update({"request_rate": self.request_rate}) - - if "text_input_length" in model.parameters(): - new_model["parameters"].update( - {"text_input_length": model.parameters()["text_input_length"]} - ) - else: - new_model["parameters"].update( - {"text_input_length": self.text_input_length} - ) - - if "max_token_count" in model.parameters(): - new_model["max_token_count"].update( - {"max_token_count": model.parameters()["max_token_count"]} - ) - else: - new_model["parameters"].update( - {"max_token_count": self.text_input_length} - ) + new_model["parameters"].update( + self._set_model_parameter(model, "batch_sizes") + ) + new_model["parameters"].update( + self._set_model_parameter(model, "concurrency") + ) + new_model["parameters"].update( + self._set_model_parameter(model, "periodic_concurrency") + ) + new_model["parameters"].update( + self._set_model_parameter(model, "request_rate") + ) + new_model["parameters"].update( + self._set_model_parameter(model, "request_period") + ) + new_model["parameters"].update( + self._set_model_parameter(model, "max_token_count") + ) + new_model["parameters"].update( + self._set_model_parameter(model, "text_input_length") + ) if ( new_model["parameters"]["request_rate"] @@ -1506,6 +1566,14 @@ def _autofill_values(self): new_profile_models[model.model_name()] = new_model self._fields["profile_models"].set_value(new_profile_models) + def _set_model_parameter( + self, model: ConfigModelProfileSpec, parameter_name: str + ) -> Dict: + if parameter_name in model.parameters(): + return {parameter_name: model.parameters()[parameter_name]} + else: + return {parameter_name: getattr(self, parameter_name)} + def _using_request_rate(self) -> bool: if self.request_rate or self.request_rate_search_enable: return True @@ -1550,16 +1618,26 @@ def is_llm_model(self) -> bool: """ Returns true if the user has enabled llm search or set any llm search value """ + config = self.get_config() + return ( self.llm_search_enable - or self.get_config()[ - "run_config_search_min_text_input_length" + or config["run_config_search_min_text_input_length"].is_set_by_user() + or config["run_config_search_max_text_input_length"].is_set_by_user() + or config["run_config_search_min_max_token_count"].is_set_by_user() + or config["run_config_search_max_max_token_count"].is_set_by_user() + or config["run_config_search_min_periodic_concurrency"].is_set_by_user() + or config["run_config_search_max_periodic_concurrency"].is_set_by_user() + or config[ + "run_config_search_min_periodic_concurrency_step" ].is_set_by_user() - or self.get_config()[ - "run_config_search_max_text_input_length" + or config[ + "run_config_search_max_periodic_concurrency_step" ].is_set_by_user() - or self.get_config()["run_config_search_min_token_count"].is_set_by_user() - or self.get_config()["run_config_search_max_token_count"].is_set_by_user() - or self.get_config()["text_input_length"].is_set_by_user() - or self.get_config()["max_token_count"].is_set_by_user() + or 
config["run_config_search_min_request_period"].is_set_by_user() + or config["run_config_search_max_request_period"].is_set_by_user() + or config["text_input_length"].is_set_by_user() + or config["max_token_count"].is_set_by_user() + or config["periodic_concurrency"].is_set_by_user() + or config["request_period"].is_set_by_user() ) diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py index 7e37f7c7d..bab62a4fd 100755 --- a/model_analyzer/config/input/config_defaults.py +++ b/model_analyzer/config/input/config_defaults.py @@ -45,8 +45,14 @@ DEFAULT_CLIENT_PROTOCOL = "grpc" DEFAULT_RUN_CONFIG_MAX_CONCURRENCY = 1024 DEFAULT_RUN_CONFIG_MIN_CONCURRENCY = 1 +DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY = 1024 +DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY = 16 +DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY_STEP = 128 +DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY_STEP = 4 DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE = 8192 DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE = 16 +DEFAULT_RUN_CONFIG_MAX_REQUEST_PERIOD = 256 +DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD = 1 DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT = 5 DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT = 1 DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1 @@ -54,8 +60,8 @@ DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5 DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH = 1 DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH = 1024 -DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT = 1 -DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT = 256 +DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT = 1 +DEFAULT_RUN_CONFIG_MAX_MAX_TOKEN_COUNT = 256 DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False DEFAULT_RUN_CONFIG_SEARCH_MODE = "brute" DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE = False diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index caa9763ce..d6e42fadc 100755 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -27,6 +27,9 @@ DEFAULT_MEASUREMENT_MODE, DEFAULT_MONITORING_INTERVAL, DEFAULT_OUTPUT_MODEL_REPOSITORY, + DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, + DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, + DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY, DEFAULT_TRITON_GRPC_ENDPOINT, DEFAULT_TRITON_HTTP_ENDPOINT, DEFAULT_TRITON_INSTALL_PATH, @@ -236,9 +239,10 @@ def construct_perf_analyzer_config( model_name="my-model", output_file_name="my-model-results.csv", batch_size=DEFAULT_BATCH_SIZES, - concurrency=1, + concurrency=DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, + periodic_concurrency=DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY, request_rate=None, - max_token_count=1, + max_token_count=DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, launch_mode=DEFAULT_TRITON_LAUNCH_MODE, client_protocol=DEFAULT_CLIENT_PROTOCOL, perf_analyzer_flags=None, @@ -257,6 +261,8 @@ def construct_perf_analyzer_config( The batch size for this PA configuration concurrency: int The concurrency value for this PA configuration + periodic_concurrency: + The periodic concurrency value for this PA configuration request_rate: int The request rate value for this PA configuration launch_mode: str @@ -282,7 +288,7 @@ def construct_perf_analyzer_config( if request_rate: pa_config._args["request-rate-range"] = request_rate elif llm_search_mode: - pa_config._args["periodic-concurrency-range"] = concurrency + pa_config._args["periodic-concurrency-range"] = periodic_concurrency else: pa_config._args["concurrency-range"] = concurrency diff --git a/tests/test_cli.py b/tests/test_cli.py index 94dbf0b21..c6669b2c2 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -78,6 +78,12 @@ def get_test_options(): OptionStruct("int", 
"profile", "--perf-analyzer-max-auto-adjusts", None, "100", str(config_defaults.DEFAULT_PERF_MAX_AUTO_ADJUSTS)), OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_CONCURRENCY)), OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_CONCURRENCY)), + OptionStruct("int", "profile", "--run-config-search-min-periodic-concurrency", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY)), + OptionStruct("int", "profile", "--run-config-search-max-periodic-concurrency", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY)), + OptionStruct("int", "profile", "--run-config-search-min-periodic-concurrency-step", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY_STEP)), + OptionStruct("int", "profile", "--run-config-search-max-periodic-concurrency-step", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY_STEP)), + OptionStruct("int", "profile", "--run-config-search-min-request-period", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD)), + OptionStruct("int", "profile", "--run-config-search-max-request-period", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_PERIOD)), OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE)), OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE)), OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE)), @@ -87,8 +93,8 @@ def get_test_options(): OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS)), OptionStruct("int", "profile", "--run-config-search-min-text-input-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH)), OptionStruct("int", "profile", "--run-config-search-max-text-input-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH)), - OptionStruct("int", "profile", "--run-config-search-min-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT)), - OptionStruct("int", "profile", "--run-config-search-max-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT)), + OptionStruct("int", "profile", "--run-config-search-min-max-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT)), + OptionStruct("int", "profile", "--run-config-search-max-max-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_MAX_TOKEN_COUNT)), OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", str(config_defaults.DEFAULT_MONITORING_INTERVAL)), OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * config_defaults.DEFAULT_PERF_ANALYZER_CPU_UTIL)), OptionStruct("int", "profile", "--num-configs-per-model", None, "10", str(config_defaults.DEFAULT_NUM_CONFIGS_PER_MODEL)), @@ -139,7 +145,9 @@ def get_test_options(): # expected_default_value OptionStruct("intlist", "profile", "--batch-sizes", "-b", "2, 4, 6", "1"), OptionStruct("intlist", "profile", "--concurrency", "-c", "1, 2, 3", None), + OptionStruct("intlist", "profile", "--periodic-concurrency", None, 
"1, 2, 3", None), OptionStruct("intlist", "profile", "--request-rate", None, "1, 2, 3", None), + OptionStruct("intlist", "profile", "--request-period", None, "1, 2, 3", None), OptionStruct("intlist", "profile", "--text-input-length", None, "1, 2, 3", None), OptionStruct("intlist", "profile", "--max-token-count", None, "1, 2, 3", None), OptionStruct("stringlist", "profile", "--triton-docker-mounts", None, "a:b:c, d:e:f", None, extra_commands=["--triton-launch-mode", "docker"]), diff --git a/tests/test_config.py b/tests/test_config.py index 01dc739d8..72af999fe 100755 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -47,6 +47,26 @@ class TestConfig(trc.TestResultCollector): + def _create_parameters( + self, + batch_sizes: List = [], + concurrency: List = [], + periodic_concurrency: List = [], + request_rate: List = [], + request_period: List = [], + text_input_length: List = [], + max_token_count: List = [], + ) -> Dict: + return { + "batch_sizes": batch_sizes, + "concurrency": concurrency, + "periodic_concurrency": periodic_concurrency, + "request_rate": request_rate, + "request_period": request_period, + "text_input_length": text_input_length, + "max_token_count": max_token_count, + } + def _evaluate_config(self, args, yaml_content, subcommand="profile"): mock_numba = MockNumba( mock_paths=["model_analyzer.config.input.config_command_profile"] @@ -288,24 +308,12 @@ def test_range_and_list_values(self): expected_model_configs = [ ConfigModelProfileSpec( "model_1", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "model_2", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, ), ] @@ -438,24 +446,14 @@ def test_object(self): expected_model_objects = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [1, 2, 3, 4], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters( + batch_sizes=[1], concurrency=[1, 2, 3, 4] + ), objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, ), ] @@ -505,24 +503,16 @@ def test_object(self): expected_model_objects = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [1, 2, 3, 4], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters( + batch_sizes=[1], concurrency=[1, 2, 3, 4] + ), objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={ - "concurrency": [1, 2, 3, 4], - "batch_sizes": [2, 4, 6], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters( + batch_sizes=[2, 4, 6], concurrency=[1, 2, 3, 4] + ), objectives={"perf_throughput": 10}, ), ] @@ -589,13 +579,9 @@ def test_constraints(self): expected_model_objects = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [1, 
2, 3, 4], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters( + batch_sizes=[1], concurrency=[1, 2, 3, 4] + ), objectives={"perf_throughput": 10, "gpu_used_memory": 5}, constraints={ "gpu_used_memory": { @@ -605,13 +591,7 @@ def test_constraints(self): ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, ), ] @@ -729,13 +709,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -760,13 +734,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -802,13 +770,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [ @@ -851,13 +813,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [ @@ -887,13 +843,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "input": [ @@ -936,13 +886,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, perf_analyzer_flags={ "measurement-interval": 10000, @@ -968,13 +912,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, perf_analyzer_flags={ "measurement-interval": 10000, @@ -1245,13 +1183,7 @@ def test_autofill(self): expected_model_configs = [ ConfigModelProfileSpec( 
"vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -1291,13 +1223,9 @@ def test_autofill(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [16, 32], - "concurrency": [2, 4], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters( + batch_sizes=[16, 32], concurrency=[2, 4] + ), objectives={"perf_throughput": 10, "gpu_used_memory": 5}, constraints={ "gpu_used_memory": { @@ -1341,13 +1269,9 @@ def test_autofill(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [16, 32], - "concurrency": [2, 4], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters( + batch_sizes=[16, 32], concurrency=[2, 4] + ), objectives={"gpu_used_memory": 10}, constraints={"perf_latency_p99": {"max": 8000}}, model_config_parameters={ @@ -1387,13 +1311,9 @@ def test_autofill(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [16, 32], - "concurrency": [2, 4], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters( + batch_sizes=[16, 32], concurrency=[2, 4] + ), objectives={"gpu_used_memory": 10}, constraints={"perf_latency_p99": {"max": 8000}}, model_config_parameters={ @@ -1444,13 +1364,9 @@ def test_autofill(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [16, 32], - "concurrency": [5, 6, 7], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters( + batch_sizes=[16, 32], concurrency=[5, 6, 7] + ), objectives={"gpu_used_memory": 10}, constraints={ "perf_latency_p99": {"max": 8000}, @@ -1459,13 +1375,9 @@ def test_autofill(self): ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={ - "batch_sizes": [1, 2], - "concurrency": [2, 4], - "request_rate": [], - "text_input_length": [], - "max_token_count": [], - }, + parameters=self._create_parameters( + batch_sizes=[1, 2], concurrency=[2, 4] + ), objectives={"perf_throughput": 10, "perf_latency_p99": 5}, constraints={"perf_latency_p99": {"max": 8000}}, ), diff --git a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py index 69e42ef8d..f00084335 100755 --- a/tests/test_perf_analyzer_config_generator.py +++ b/tests/test_perf_analyzer_config_generator.py @@ -584,8 +584,8 @@ def test_llm_search_max_token_count(self): pa_cli_args = [ "--llm-search-enable", - "--run-config-search-max-concurrency", - "1", + "--run-config-search-max-periodic-concurrency", + "16", "--run-config-search-max-text-input-length", "1", ] @@ -598,7 +598,7 @@ def test_llm_search_text_input_length(self): Test LLM Search: - Input length 1->1024 - Concurrency and max token count set to 1 + Periodic Concurrency and max token count set to 1 """ # yapf: disable @@ -618,9 +618,9 @@ def test_llm_search_text_input_length(self): pa_cli_args = [ "--llm-search-enable", - "--run-config-search-max-concurrency", - "1", - "--run-config-search-max-token-count", + "--run-config-search-max-periodic-concurrency", + "16", + 
"--run-config-search-max-max-token-count", "1", ] self._run_and_test_perf_analyzer_config_generator( From e81a36963517ab2b8cb5bbe8aa940b62217a79d4 Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Tue, 17 Oct 2023 11:07:13 -0700 Subject: [PATCH 05/12] Capture LLM metrics from PA (#774) * Initial code for aggregation of new LLM metrics * New measurement fields created. * Fixing PA unit tests * Adding hooks in metrics to capture new LLM fields * Fixing codeQL errors * Fixing type checking errors * Changes needed post-merge from other branches * Revert naming mistake (due to merge). * Changes uncovered during live testing * Fixes based on hwoo review * Fixing typo * Change to use lists and mean() * Changes based on hwoo review --- model_analyzer/perf_analyzer/perf_analyzer.py | 103 +++++++++++-- model_analyzer/perf_analyzer/perf_config.py | 9 +- model_analyzer/record/metrics_manager.py | 20 ++- .../record/types/avg_first_token_latency.py | 2 +- .../types/avg_token_to_token_latency.py | 2 +- tests/common/test_utils.py | 6 + tests/test_perf_analyzer.py | 142 ++++++++++++++---- 7 files changed, 235 insertions(+), 49 deletions(-) diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index 51ad64151..49f15f5a2 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -16,6 +16,7 @@ import csv import glob +import json import logging import os import re @@ -25,6 +26,7 @@ from typing import Dict, List import psutil +from numpy import mean from model_analyzer.constants import ( INTERVAL_SLEEP_TIME, @@ -118,6 +120,14 @@ def get_gpu_metrics(): ] return gpu_metrics + @staticmethod + def get_llm_metrics(): + llm_metrics = [ + llm_metric[PerfAnalyzer.RECORD_CLASS] + for llm_metric in PerfAnalyzer.llm_metric_table + ] + return llm_metrics + def __init__(self, path, config, max_retries, timeout, max_cpu_util): """ Parameters @@ -143,6 +153,7 @@ def __init__(self, path, config, max_retries, timeout, max_cpu_util): self._output = "" self._perf_records = {} self._gpu_records = [] + self._llm_records = {} self._max_cpu_util = max_cpu_util def run(self, metrics, env=None): @@ -216,6 +227,19 @@ def get_gpu_records(self): return self._gpu_records + def get_llm_records(self): + """ + Returns + ------- + The LLM records from the last perf_analyzer run + """ + + if self._llm_records: + return self._llm_records + raise TritonModelAnalyzerException( + "Attempted to get perf_analyzer results without calling run first." 
+ ) + def output(self): """ Returns @@ -457,21 +481,82 @@ def _parse_outputs(self, metrics): logger.debug( f"Reading PA results from {perf_config['latency-report-file']}" ) - with open(perf_config["latency-report-file"], mode="r") as f: - csv_reader = csv.DictReader(f, delimiter=",") - - for row in csv_reader: - self._perf_records[ - perf_config["model-name"] - ] = self._extract_perf_records_from_row(metrics, row) - self._gpu_records = self._extract_gpu_records_from_row(metrics, row) + self._extract_gpu_records(perf_config, metrics) + self._extract_llm_records(perf_config, metrics) for perf_config in [ mrc.perf_config() for mrc in self._config.model_run_configs() ]: - # Remove the latency file and all associated composing model latency files + # Remove the latency/profile export files and all associated composing model latency files for f in glob.glob(f"*{perf_config['latency-report-file']}"): os.remove(f) + for f in glob.glob(f"*{perf_config['profile-export-file']}"): + os.remove(f) + + def _extract_gpu_records(self, perf_config, metrics): + if perf_config["profile-export-file"]: + return + + with open(perf_config["latency-report-file"], mode="r") as f: + csv_reader = csv.DictReader(f, delimiter=",") + + for row in csv_reader: + self._perf_records[ + perf_config["model-name"] + ] = self._extract_perf_records_from_row(metrics, row) + self._gpu_records = self._extract_gpu_records_from_row(metrics, row) + + def _extract_llm_records(self, perf_config, metrics): + if not perf_config["profile-export-file"]: + return + + self._llm_records[perf_config["model-name"]] = [] + + with open(perf_config["profile-export-file"], mode="r") as f: + llm_output = json.load(f) + + avg_first_token_latency = self._calculate_avg_first_token_latency( + llm_output + ) + record = PerfAnalyzer.llm_metric_table[0][PerfAnalyzer.RECORD_CLASS]( + value=avg_first_token_latency + ) # type: ignore + + self._llm_records[perf_config["model-name"]].append(record) + + avg_token_to_token_latency = self._calculate_avg_token_to_token_latency( + llm_output + ) + record = PerfAnalyzer.llm_metric_table[1][PerfAnalyzer.RECORD_CLASS]( + value=avg_token_to_token_latency + ) # type: ignore + self._llm_records[perf_config["model-name"]].append(record) + + def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float: + total_first_token_latencies = [] + for request in llm_output["experiments"][0]["requests"]: + total_first_token_latencies.append( + request["response_timestamps"][0] - request["timestamp"] + ) + + avg_first_token_latency = mean(total_first_token_latencies) + + return avg_first_token_latency + + def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float: + token_to_token_latencies = [] + for request in llm_output["experiments"][0]["requests"]: + response_to_response_latencies = [] + prev_response = request["response_timestamps"][0] + for response in request["response_timestamps"][1:]: + response_to_response_latencies.append(response - prev_response) + prev_response = response + + token_to_token_latencies.append(mean(response_to_response_latencies)) + + avg_token_to_token_latency = mean(token_to_token_latencies) + + return avg_token_to_token_latency def _extract_perf_records_from_row( self, requested_metrics: List[Record], row_metrics: Dict[str, str] diff --git a/model_analyzer/perf_analyzer/perf_config.py b/model_analyzer/perf_analyzer/perf_config.py index 7cab2dd3c..a72cdc3b1 100755 --- a/model_analyzer/perf_analyzer/perf_config.py +++ b/model_analyzer/perf_analyzer/perf_config.py @@ -73,6 +73,7 @@ 
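Note on the latency aggregation added to perf_analyzer.py above: the following is a minimal standalone sketch of the same arithmetic, not part of the patch. The sample export dict is invented (shaped like the "experiments"/"requests" structure parsed above) and statistics.mean stands in for the numpy mean used in the patch.

# Minimal sketch (illustrative only): how avg first-token and
# token-to-token latencies fall out of a profile export's request timestamps.
from statistics import mean  # stand-in for numpy's mean

llm_output = {  # invented example shaped like perf_analyzer's profile export
    "experiments": [
        {
            "requests": [
                {"timestamp": 1, "response_timestamps": [2, 3, 4]},
                {"timestamp": 4, "response_timestamps": [5, 6]},
                {"timestamp": 6, "response_timestamps": [7, 8, 9]},
            ]
        }
    ]
}

requests = llm_output["experiments"][0]["requests"]

# First-token latency: gap between sending the request and the first response.
avg_first_token_latency = mean(
    r["response_timestamps"][0] - r["timestamp"] for r in requests
)  # (1 + 1 + 1) / 3 = 1.0

# Token-to-token latency: mean gap between consecutive responses per request,
# then averaged across requests.
avg_token_to_token_latency = mean(
    mean(b - a for a, b in zip(r["response_timestamps"], r["response_timestamps"][1:]))
    for r in requests
)  # (1.0 + 1.0 + 1.0) / 3 = 1.0

In the patch these two values are then wrapped in the AvgFirstTokenLatency and AvgTokenToTokenLatency record types via PerfAnalyzer.llm_metric_table.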
class PerfAnalyzerConfig: "metrics-interval", "bls-composing-models", "request-parameter", + "request-period", ] input_to_options = [ @@ -82,6 +83,7 @@ class PerfAnalyzerConfig: "url", "protocol", "latency-report-file", + "profile-export-file", "http-header", ] @@ -112,6 +114,7 @@ def __init__(self): "-u": None, "-i": None, "-f": None, + "--profile-export-file": None, "-H": None, } self._verbose = {"-v": None, "-v -v": None, "--verbose-csv": None} @@ -123,6 +126,7 @@ def __init__(self): "url": "-u", "protocol": "-i", "latency-report-file": "-f", + "profile-export-file": "--profile-export-file", "http-header": "-H", } @@ -193,6 +197,9 @@ def update_config_from_profile_config(self, model_name, profile_config): "verbose-csv": "--verbose-csv", } + if profile_config.is_llm_model(): + params.update({"profile-export-file": model_name + "-results.json"}) + if profile_config.triton_launch_mode == "c_api": params.update( { @@ -307,7 +314,7 @@ def remove_url_from_cli_string(cls, cli_string): @classmethod def remove_mrc_from_cli_string(cls, cli_string): """ - utility function strips the measruement request count + utility function strips the measurement request count from a cli string representation Parameters diff --git a/model_analyzer/record/metrics_manager.py b/model_analyzer/record/metrics_manager.py index 176b632df..fe77f6eb8 100755 --- a/model_analyzer/record/metrics_manager.py +++ b/model_analyzer/record/metrics_manager.py @@ -69,6 +69,8 @@ class MetricsManager: "gpu_power_usage", "cpu_available_ram", "cpu_used_ram", + "avg_first_token_latency", + "avg_token_to_token_latency", ] def __init__(self, config, client, server, gpus, result_manager, state_manager): @@ -116,6 +118,7 @@ def __init__(self, config, client, server, gpus, result_manager, state_manager): self._gpu_metrics, self._perf_metrics, self._cpu_metrics, + self._llm_metrics, ) = self._categorize_metrics(self.metrics, self._config.collect_cpu_metrics) self._gpus = gpus self._init_state() @@ -160,21 +163,23 @@ def _categorize_metrics(metric_tags, collect_cpu_metrics=False): Returns ------- - (list,list,list) - tuple of three lists (DCGM, PerfAnalyzer, CPU) metrics + (list,list,list,list) + tuple of four lists (DCGM, PerfAnalyzer, CPU, LLM) metrics """ - gpu_metrics, perf_metrics, cpu_metrics = [], [], [] + gpu_metrics, perf_metrics, cpu_metrics, llm_metrics = [], [], [], [] # Separates metrics and objectives into related lists for metric in MetricsManager.get_metric_types(metric_tags): if metric in PerfAnalyzer.get_gpu_metrics(): gpu_metrics.append(metric) elif metric in PerfAnalyzer.get_perf_metrics(): perf_metrics.append(metric) + elif metric in PerfAnalyzer.get_llm_metrics(): + llm_metrics.append(metric) elif collect_cpu_metrics and (metric in CPUMonitor.cpu_metrics): cpu_metrics.append(metric) - return gpu_metrics, perf_metrics, cpu_metrics + return gpu_metrics, perf_metrics, cpu_metrics, llm_metrics def profile_server(self): """ @@ -556,6 +561,9 @@ def _run_perf_analyzer( ) metrics_to_gather = self._perf_metrics + self._gpu_metrics + if self._config.is_llm_model(): + metrics_to_gather += self._llm_metrics + status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env) self._write_perf_analyzer_output(perf_output_writer, perf_analyzer) @@ -564,7 +572,9 @@ def _run_perf_analyzer( self._handle_unsuccessful_perf_analyzer_run(perf_analyzer) return (None, None) - perf_records = perf_analyzer.get_perf_records() + perf_records = ( + perf_analyzer.get_perf_records() + perf_analyzer.get_llm_records() + ) gpu_records = 
perf_analyzer.get_gpu_records() aggregated_perf_records = self._aggregate_perf_records(perf_records) diff --git a/model_analyzer/record/types/avg_first_token_latency.py b/model_analyzer/record/types/avg_first_token_latency.py index 15badd92a..72d539633 100755 --- a/model_analyzer/record/types/avg_first_token_latency.py +++ b/model_analyzer/record/types/avg_first_token_latency.py @@ -22,7 +22,7 @@ @total_ordering class AvgFirstTokenLatency(DecreasingRecord): """ - A record for perf_analyzer avg first token to token latency metric + A record for perf_analyzer average first token latency metric """ tag = "avg_first_token_latency" diff --git a/model_analyzer/record/types/avg_token_to_token_latency.py b/model_analyzer/record/types/avg_token_to_token_latency.py index 2941da39b..66c93b6fc 100755 --- a/model_analyzer/record/types/avg_token_to_token_latency.py +++ b/model_analyzer/record/types/avg_token_to_token_latency.py @@ -22,7 +22,7 @@ @total_ordering class AvgTokenToTokenLatency(DecreasingRecord): """ - A record for perf_analyzer avg token-to-token latency metric + A record for perf_analyzer average token-to-token latency metric """ tag = "avg_token_to_token_latency" diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index d6e42fadc..380a5d404 100755 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -238,6 +238,7 @@ def convert_avg_gpu_metrics_to_data(avg_gpu_metric_values): def construct_perf_analyzer_config( model_name="my-model", output_file_name="my-model-results.csv", + export_file_name="my-model-results.json", batch_size=DEFAULT_BATCH_SIZES, concurrency=DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, periodic_concurrency=DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY, @@ -257,6 +258,8 @@ def construct_perf_analyzer_config( The name of the model output_file_name: str The name of the output file + export_file_name: str + The name of the export file batch_size: int The batch size for this PA configuration concurrency: int @@ -285,6 +288,9 @@ def construct_perf_analyzer_config( pa_config._options["-f"] = output_file_name pa_config._options["-b"] = batch_size + if llm_search_mode: + pa_config._options["--profile-export-file"] = export_file_name + if request_rate: pa_config._args["request-rate-range"] = request_rate elif llm_search_mode: diff --git a/tests/test_perf_analyzer.py b/tests/test_perf_analyzer.py index e95f0d4a1..a984279bd 100755 --- a/tests/test_perf_analyzer.py +++ b/tests/test_perf_analyzer.py @@ -49,6 +49,7 @@ from model_analyzer.triton.client.client_factory import TritonClientFactory from model_analyzer.triton.server.server_config import TritonServerConfig from model_analyzer.triton.server.server_factory import TritonServerFactory +from tests.common.test_utils import construct_perf_analyzer_config from .common import test_result_collector as trc from .mocks.mock_client import MockTritonClientMethods @@ -67,7 +68,56 @@ TEST_GRPC_URL = "test_hostname:test_port" -class TestPerfAnalyzerMethods(trc.TestResultCollector): +def mock_open_method(*args, **kwargs): + pa_csv_mock = """Concurrency,Inferences/Second,Client Send,Network+Server Send/Recv,Server Queue,Server Compute Input,Server Compute Infer,Server Compute Output,""" + pa_csv_mock += """Client Recv,p50 latency,p90 latency,p95 latency,p99 latency,Avg latency,request/response,response wait,""" + pa_csv_mock += """Avg GPU Utilization,Avg GPU Power Usage,Max GPU Memory Usage,Total GPU Memory\n""" + pa_csv_mock += """1,46.8,2,187,18,34,65,16,1,4600,4700,4800,4900,5000,3,314,""" + pa_csv_mock += 
"""GPU-aaf4fea0:0.809;GPU-aaf4fea1:0.901;GPU-aaf4fea2:0.745;,GPU-aaf4fea0:91.2;GPU-aaf4fea1:100;,GPU-aaf4fea0:1000000000;GPU-aaf4fea1:2000000000,GPU-aaf4fea0:1500000000;GPU-aaf4fea2:3000000000""" + + # yapf: disable + pa_json_mock = """ + { + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 4 + }, + "requests": [ + { + "timestamp": 1, + "sequence_id": 1, + "response_timestamps": [2,3,4] + }, + { + "timestamp": 4, + "sequence_id": 2, + "response_timestamps": [5,6] + }, + { + "timestamp": 6, + "sequence_id": 3, + "response_timestamps": [7,8,9] + } + ], + "window_boundaries": [1,5,6] + } + ], + "version": "1.2.3" + } + """ + # yapf: enable + + if args[0] == "my-model-results.csv": + return mock_open(read_data=pa_csv_mock)(*args, **kwargs) + elif args[0] == "my-model-llm-results.csv": + return mock_open(read_data=pa_json_mock)(*args, **kwargs) + else: + return mock_open(read_data=None)(*args, **kwargs) + + +class TestPerfAnalyzer(trc.TestResultCollector): def setUp(self): # Mocks self.server_local_mock = MockServerLocalMethods() @@ -80,7 +130,7 @@ def setUp(self): self.client_mock.start() # PerfAnalyzer config for all tests - self.config = PerfAnalyzerConfig() + self.config = construct_perf_analyzer_config() self.config["model-name"] = TEST_MODEL_NAME self.config["measurement-interval"] = 1000 self.config["measurement-request-count"] = 50 @@ -90,6 +140,16 @@ def setUp(self): ModelRunConfig("fake_name", MagicMock(), self.config) ) + self.llm_config = construct_perf_analyzer_config(llm_search_mode=True) + self.llm_config["model-name"] = TEST_MODEL_NAME + self.llm_config["measurement-interval"] = 1000 + self.llm_config["measurement-request-count"] = 50 + + self.llm_run_config = RunConfig({}) + self.llm_run_config.add_model_run_config( + ModelRunConfig("fake_name", MagicMock(), self.llm_config) + ) + self.gpus = [GPUDevice("TEST_DEVICE_NAME", 0, "TEST_PCI_BUS_ID", "TEST_UUID")] # Triton Server @@ -132,7 +192,7 @@ def test_perf_analyzer_config(self): def test_perf_analyzer_boolean_args(self): """Test that only positive boolean args get added""" - expected_cli_str = "-m test_model --measurement-interval=1000 --binary-search --measurement-request-count=50" + expected_cli_str = "-m test_model -b 1 -u localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 --concurrency-range=1 --binary-search --measurement-mode=count_windows --measurement-request-count=50 --collect-metrics --metrics-url=http://localhost:8002/metrics --metrics-interval=1000.0" self.config["async"] = "False" self.config["binary-search"] = "True" @@ -141,7 +201,7 @@ def test_perf_analyzer_boolean_args(self): def test_perf_analyzer_additive_args(self): shape = ["name1:1,2,3", "name2:4,5,6"] - expected_cli_str = "-m test_model --measurement-interval=1000 --shape=name1:1,2,3 --shape=name2:4,5,6 --measurement-request-count=50" + expected_cli_str = "-m test_model -b 1 -u localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 --concurrency-range=1 --shape=name1:1,2,3 --shape=name2:4,5,6 --measurement-mode=count_windows --measurement-request-count=50 --collect-metrics --metrics-url=http://localhost:8002/metrics --metrics-interval=1000.0" self.config["shape"] = shape[:] @@ -149,7 +209,7 @@ def test_perf_analyzer_additive_args(self): self.assertEqual(self.config.to_cli_string(), expected_cli_str) shape = "name1:1,2,3" - expected_cli_str = "-m test_model --measurement-interval=1000 --shape=name1:1,2,3 --measurement-request-count=50" + expected_cli_str = "-m test_model -b 1 -u 
localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 --concurrency-range=1 --shape=name1:1,2,3 --measurement-mode=count_windows --measurement-request-count=50 --collect-metrics --metrics-url=http://localhost:8002/metrics --metrics-interval=1000.0" self.config["shape"] = shape self.assertEqual(self.config.to_cli_string(), expected_cli_str) @@ -177,10 +237,13 @@ def test_perf_analyzer_ssl_args(self): ssl_https_private_key_file = "h" expected_cli_str = ( - f"-m test_model --measurement-interval=1000 --measurement-request-count=50 --ssl-grpc-use-ssl " + f"-m test_model -b 1 -u localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 " + f"--concurrency-range=1 --measurement-mode=count_windows --measurement-request-count=50 --ssl-grpc-use-ssl " f"--ssl-grpc-root-certifications-file=a --ssl-grpc-private-key-file=b --ssl-grpc-certificate-chain-file=c " - f"--ssl-https-verify-peer=1 --ssl-https-verify-host=2 --ssl-https-ca-certificates-file=d --ssl-https-client-certificate-type=e " - f"--ssl-https-client-certificate-file=f --ssl-https-private-key-type=g --ssl-https-private-key-file=h" + f"--ssl-https-verify-peer=1 --ssl-https-verify-host=2 --ssl-https-ca-certificates-file=d " + f"--ssl-https-client-certificate-type=e --ssl-https-client-certificate-file=f --ssl-https-private-key-type=g " + f"--ssl-https-private-key-file=h --collect-metrics --metrics-url=http://localhost:8002/metrics " + f"--metrics-interval=1000.0" ) self.config["ssl-grpc-use-ssl"] = ssl_grpc_use_ssl @@ -241,11 +304,15 @@ def test_perf_analyzer_ssl_args(self): self.config["ssl-grpc-use-ssl"] = ssl_grpc_use_ssl self.assertEqual(self.config["ssl-grpc-use-ssl"], ssl_grpc_use_ssl) expected_cli_str = ( - f"-m test_model --measurement-interval=1000 --measurement-request-count=50 " + f"-m test_model -b 1 -u localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 " + f"--concurrency-range=1 --measurement-mode=count_windows --measurement-request-count=50 " f"--ssl-grpc-root-certifications-file=a --ssl-grpc-private-key-file=b --ssl-grpc-certificate-chain-file=c " - f"--ssl-https-verify-peer=1 --ssl-https-verify-host=2 --ssl-https-ca-certificates-file=d --ssl-https-client-certificate-type=e " - f"--ssl-https-client-certificate-file=f --ssl-https-private-key-type=g --ssl-https-private-key-file=h" + f"--ssl-https-verify-peer=1 --ssl-https-verify-host=2 --ssl-https-ca-certificates-file=d " + f"--ssl-https-client-certificate-type=e --ssl-https-client-certificate-file=f " + f"--ssl-https-private-key-type=g --ssl-https-private-key-file=h --collect-metrics " + f"--metrics-url=http://localhost:8002/metrics --metrics-interval=1000.0" ) + self.assertEqual(self.config.to_cli_string(), expected_cli_str) def test_run(self): @@ -268,18 +335,12 @@ def test_run(self): self.server.start() self.client.wait_for_server_ready(num_retries=1) - pa_csv_mock = """Concurrency,Inferences/Second,Client Send,Network+Server Send/Recv,Server Queue,Server Compute Input,Server Compute Infer,Server Compute Output,""" - pa_csv_mock += """Client Recv,p50 latency,p90 latency,p95 latency,p99 latency,Avg latency,request/response,response wait,""" - pa_csv_mock += """Avg GPU Utilization,Avg GPU Power Usage,Max GPU Memory Usage,Total GPU Memory\n""" - pa_csv_mock += """1,46.8,2,187,18,34,65,16,1,4600,4700,4800,4900,5000,3,314,""" - pa_csv_mock += 
"""GPU-aaf4fea0:0.809;GPU-aaf4fea1:0.901;GPU-aaf4fea2:0.745;,GPU-aaf4fea0:91.2;GPU-aaf4fea1:100;,GPU-aaf4fea0:1000000000;GPU-aaf4fea1:2000000000,GPU-aaf4fea0:1500000000;GPU-aaf4fea2:3000000000""" - # Test avg latency parsing. GPU metric is ignored for get_perf_records() perf_metrics = [PerfLatencyAvg, GPUUtilization] with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -292,7 +353,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -305,7 +366,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -318,7 +379,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -331,7 +392,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -344,7 +405,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -357,7 +418,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -370,7 +431,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -383,7 +444,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -396,7 +457,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -409,7 +470,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(gpu_metrics) @@ -427,7 +488,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(gpu_metrics) @@ -443,7 +504,7 @@ def test_run(self): with 
patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(gpu_metrics) @@ -462,7 +523,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(gpu_metrics) @@ -487,7 +548,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -651,10 +712,27 @@ def test_get_cmd_single_model(self): "perf_analyzer", "-m", "test_model", + "-b", + "1", + "-u", + "localhost:8001", + "-i", + "grpc", + "-f", + "my-model-results.csv", "--measurement-interval", "1000", + "--concurrency-range", + "1", + "--measurement-mode", + "count_windows", "--measurement-request-count", "50", + "--collect-metrics", + "--metrics-url", + "http://localhost:8002/metrics", + "--metrics-interval", + "1000.0", ] self.assertEqual(pa._get_cmd(), expected_cmd) From efea104625464c969a3644f7e30123508dec70f6 Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Wed, 18 Oct 2023 09:29:04 -0700 Subject: [PATCH 06/12] Correct how periodic concurrency works in PACG (#777) * Created a new class ConfigRangeNumeric and using it for periodic-concurrency * Fixes and defaults for periodic concurrency * First unit test passing * PACG chagnes complete. Unit tests updated and passing * Removing uneeded class * Fixing codeQL and hwoo's review suggestions * Adding missing else --- .../perf_analyzer_config_generator.py | 57 +++++++-- .../config/input/config_command_profile.py | 7 +- .../config/input/config_defaults.py | 1 + .../config/input/config_list_numeric.py | 9 +- tests/common/test_utils.py | 6 +- tests/test_cli.py | 10 +- tests/test_perf_analyzer_config_generator.py | 118 ++++++++++++++++-- 7 files changed, 178 insertions(+), 30 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 771e895f1..104ed79e6 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -17,16 +17,16 @@ import json import logging from itertools import repeat -from typing import Dict, Generator, List, Optional, Tuple +from typing import Any, Dict, Generator, List, Optional, Tuple from model_analyzer.config.input.config_command_profile import ConfigCommandProfile from model_analyzer.config.input.config_defaults import ( DEFAULT_INPUT_JSON_PATH, DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, - DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY, DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH, + DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY, ) from model_analyzer.constants import ( LOGGER_NAME, @@ -214,9 +214,10 @@ def _create_input_dict(self, model_perf_analyzer_flags: Dict) -> Dict: else: return {} - def _create_inference_load_list(self) -> List[int]: - # The two possible inference loads are request rate or concurrency - # Concurrency is the default and will be used unless the user specifies + def _create_inference_load_list(self) -> List[Any]: + # The three possible 
inference loads are request rate, concurrency or periodic concurrency + # For LLM models periodic concurrency is used for non-LLM models + # concurrency is the default and will be used unless the user specifies # request rate, either as a model parameter or a config option if self._cli_config.is_llm_model(): return self._create_periodic_concurrency_list() @@ -247,16 +248,50 @@ def _create_concurrency_list(self) -> List[int]: self._cli_config.run_config_search_max_concurrency, ) - def _create_periodic_concurrency_list(self) -> List[int]: + def _create_periodic_concurrency_list(self) -> List[str]: if self._model_parameters["periodic_concurrency"]: return sorted(self._model_parameters["periodic_concurrency"]) elif self._cli_config.run_config_search_disable: - return [DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY] + return [DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY] + + periodic_concurrencies = self._generate_periodic_concurrencies() + return periodic_concurrencies + + def _generate_periodic_concurrencies(self) -> List[str]: + periodic_concurrencies = [] + + periodic_concurrency_doubled_list = utils.generate_doubled_list( + self._cli_config.run_config_search_min_periodic_concurrency, + self._cli_config.run_config_search_max_periodic_concurrency, + ) + + step_doubled_list = utils.generate_doubled_list( + self._cli_config.run_config_search_min_periodic_concurrency_step, + self._cli_config.run_config_search_max_periodic_concurrency_step, + ) + + for start in periodic_concurrency_doubled_list: + for end in periodic_concurrency_doubled_list: + for step in step_doubled_list: + if self._is_illegal_periodic_concurrency_combination( + start, end, step + ): + continue + + periodic_concurrencies.append(f"{start}:{end}:{step}") + return periodic_concurrencies + + def _is_illegal_periodic_concurrency_combination( + self, start: int, end: int, step: int + ) -> bool: + if start > end: + return True + elif start == end and step != 1: + return True + elif (end - start) % step: + return True else: - return utils.generate_doubled_list( - self._cli_config.run_config_search_min_periodic_concurrency, - self._cli_config.run_config_search_max_periodic_concurrency, - ) + return False def _create_text_input_length_list(self) -> List[int]: if not self._cli_config.is_llm_model(): diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py index bdce45027..9da3e7d31 100755 --- a/model_analyzer/config/input/config_command_profile.py +++ b/model_analyzer/config/input/config_command_profile.py @@ -498,7 +498,7 @@ def _add_profile_models_configs(self): schema={ "batch_sizes": ConfigListNumeric(type_=int), "concurrency": ConfigListNumeric(type_=int), - "periodic_concurrency": ConfigListNumeric(type_=int), + "periodic_concurrency": ConfigListString(), "request_rate": ConfigListNumeric(type_=int), "request_period": ConfigListNumeric(type_=int), "text_input_length": ConfigListNumeric(type_=int), @@ -569,9 +569,8 @@ def _add_profile_models_configs(self): ConfigField( "periodic_concurrency", flags=["--periodic-concurrency"], - field_type=ConfigListNumeric(int), - description="Comma-delimited list of periodic concurrency values or ranges " - " to be used during profiling", + field_type=ConfigListString(), + description="A list of ranges to be used during profiling", ) ) self._add_config( diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py index bab62a4fd..aad674838 100755 --- 
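For reference, a small self-contained sketch of the periodic-concurrency search space built by _generate_periodic_concurrencies above. doubled_list() and is_illegal() are illustrative stand-ins (not the patch's own helpers), and the 16-to-64 concurrency range with an 8-to-128 step range is an assumed configuration chosen to match the values exercised by the unit tests later in this patch.

# Illustrative sketch: enumerate the "start:end:step" periodic-concurrency
# ranges kept by the legality check above.
from typing import List


def doubled_list(minimum: int, maximum: int) -> List[int]:
    # Assumed stand-in for the utils.generate_doubled_list helper.
    values, value = [], minimum
    while value <= maximum:
        values.append(value)
        value *= 2
    return values  # e.g. doubled_list(16, 64) -> [16, 32, 64]


def is_illegal(start: int, end: int, step: int) -> bool:
    # Mirrors _is_illegal_periodic_concurrency_combination above.
    return start > end or (start == end and step != 1) or bool((end - start) % step)


concurrencies = doubled_list(16, 64)  # assumed min/max periodic concurrency
steps = doubled_list(8, 128)          # assumed min/max periodic concurrency step

combos = [
    f"{start}:{end}:{step}"
    for start in concurrencies
    for end in concurrencies
    for step in steps
    if not is_illegal(start, end, step)
]
# -> ['16:32:8', '16:32:16', '16:64:8', '16:64:16', '32:64:8', '32:64:16', '32:64:32']

With these assumed bounds the enumeration yields exactly the seven start:end:step strings that test_periodic_concurrency_search expects further down in this patch.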
a/model_analyzer/config/input/config_defaults.py +++ b/model_analyzer/config/input/config_defaults.py @@ -45,6 +45,7 @@ DEFAULT_CLIENT_PROTOCOL = "grpc" DEFAULT_RUN_CONFIG_MAX_CONCURRENCY = 1024 DEFAULT_RUN_CONFIG_MIN_CONCURRENCY = 1 +DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY = "1:1:1" DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY = 1024 DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY = 16 DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY_STEP = 128 diff --git a/model_analyzer/config/input/config_list_numeric.py b/model_analyzer/config/input/config_list_numeric.py index 799cbdf9e..b677bcdab 100755 --- a/model_analyzer/config/input/config_list_numeric.py +++ b/model_analyzer/config/input/config_list_numeric.py @@ -103,7 +103,14 @@ def set_value(self, value): try: if self._is_string(value): self._value = [] - value = value.split(",") + if "," in value: + value = value.split(",") + elif ":" in value: + value = value.split(":") + if len(value) == 2: + value = {"start": value[0], "stop": value[1], "step": 1} + else: + value = {"start": value[0], "stop": value[1], "step": value[2]} if self._is_list(value): new_value = self._process_list(value) diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index 380a5d404..e8448ae98 100755 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -29,7 +29,7 @@ DEFAULT_OUTPUT_MODEL_REPOSITORY, DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, - DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY, + DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY, DEFAULT_TRITON_GRPC_ENDPOINT, DEFAULT_TRITON_HTTP_ENDPOINT, DEFAULT_TRITON_INSTALL_PATH, @@ -241,7 +241,7 @@ def construct_perf_analyzer_config( export_file_name="my-model-results.json", batch_size=DEFAULT_BATCH_SIZES, concurrency=DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, - periodic_concurrency=DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY, + periodic_concurrency=DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY, request_rate=None, max_token_count=DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, launch_mode=DEFAULT_TRITON_LAUNCH_MODE, @@ -264,7 +264,7 @@ def construct_perf_analyzer_config( The batch size for this PA configuration concurrency: int The concurrency value for this PA configuration - periodic_concurrency: + periodic_concurrency: list The periodic concurrency value for this PA configuration request_rate: int The request rate value for this PA configuration diff --git a/tests/test_cli.py b/tests/test_cli.py index c6669b2c2..1a2fb84a2 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -145,7 +145,7 @@ def get_test_options(): # expected_default_value OptionStruct("intlist", "profile", "--batch-sizes", "-b", "2, 4, 6", "1"), OptionStruct("intlist", "profile", "--concurrency", "-c", "1, 2, 3", None), - OptionStruct("intlist", "profile", "--periodic-concurrency", None, "1, 2, 3", None), + OptionStruct("stringlist", "profile", "--periodic-concurrency", None, '"5:50:5", "10:100:10"', None, None), OptionStruct("intlist", "profile", "--request-rate", None, "1, 2, 3", None), OptionStruct("intlist", "profile", "--request-period", None, "1, 2, 3", None), OptionStruct("intlist", "profile", "--text-input-length", None, "1, 2, 3", None), @@ -603,9 +603,15 @@ def _convert_string_to_numeric(self, number): return float(number) if "." 
in number else int(number) def _convert_string_to_int_list(self, list_values): - ret_val = [int(x) for x in list_values.split(",")] + if ":" in list_values: + ret_val = [int(x) for x in list_values.split(":")] + ret_val = list(range(ret_val[0], ret_val[1] + 1, ret_val[2])) + else: + ret_val = [int(x) for x in list_values.split(",")] + if len(ret_val) == 1: return ret_val[0] + return ret_val def _convert_string_to_string_list(self, list_values): diff --git a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py index f00084335..a405e2df6 100755 --- a/tests/test_perf_analyzer_config_generator.py +++ b/tests/test_perf_analyzer_config_generator.py @@ -577,15 +577,23 @@ def test_llm_search_max_token_count(self): # yapf: enable max_token_counts = utils.generate_doubled_list(1, 256) - expected_configs = [ - construct_perf_analyzer_config(max_token_count=mtc, llm_search_mode=True) - for mtc in max_token_counts - ] + periodic_concurrencies = ["16:32:4", "16:32:8", "16:32:16"] + + expected_configs = [] + for mtc in max_token_counts: + for pc in periodic_concurrencies: + expected_configs.append( + construct_perf_analyzer_config( + max_token_count=mtc, + llm_search_mode=True, + periodic_concurrency=pc, + ) + ) pa_cli_args = [ "--llm-search-enable", "--run-config-search-max-periodic-concurrency", - "16", + "32", "--run-config-search-max-text-input-length", "1", ] @@ -611,17 +619,109 @@ def test_llm_search_text_input_length(self): # yapf: enable text_input_lengths = utils.generate_doubled_list(1, 1024) + periodic_concurrencies = ["16:32:4", "16:32:8", "16:32:16"] + + expected_configs = [] + for _ in text_input_lengths: + for pc in periodic_concurrencies: + expected_configs.append( + construct_perf_analyzer_config( + llm_search_mode=True, periodic_concurrency=pc + ) + ) + + pa_cli_args = [ + "--llm-search-enable", + "--run-config-search-max-periodic-concurrency", + "32", + "--run-config-search-max-max-token-count", + "1", + ] + self._run_and_test_perf_analyzer_config_generator( + yaml_str, expected_configs, pa_cli_args + ) + + def test_periodic_concurrency_parameter(self): + """ + Test LLM Search: + - periodic-concurrency: 10:100:10 + + Max token set to 1 + Text input set to 1 + """ + + # yapf: disable + yaml_str = (""" + perf_analyzer_flags: + input-data: input-data.json + profile_models: + - my-model + """) + # yapf: enable + expected_configs = [ - construct_perf_analyzer_config(llm_search_mode=True) - for pl in text_input_lengths + construct_perf_analyzer_config( + llm_search_mode=True, periodic_concurrency="10:100:10" + ) ] pa_cli_args = [ "--llm-search-enable", - "--run-config-search-max-periodic-concurrency", - "16", + "--periodic-concurrency", + "10:100:10", "--run-config-search-max-max-token-count", "1", + "--run-config-search-max-text-input-length", + "1", + ] + self._run_and_test_perf_analyzer_config_generator( + yaml_str, expected_configs, pa_cli_args + ) + + def test_periodic_concurrency_search(self): + """ + Test LLM Search: + - Period Concurrency using RCS values + + Max token set to 1 + Text input set to 1 + """ + + # yapf: disable + yaml_str = (""" + perf_analyzer_flags: + input-data: input-data.json + profile_models: + - my-model + """) + # yapf: enable + + periodic_concurrencies = [ + "16:32:8", + "16:32:16", + "16:64:8", + "16:64:16", + "32:64:8", + "32:64:16", + "32:64:32", + ] + expected_configs = [ + construct_perf_analyzer_config( + llm_search_mode=True, periodic_concurrency=pc + ) + for pc in periodic_concurrencies + ] + + pa_cli_args = [ + 
"--llm-search-enable", + "--run-config-search-max-max-token-count", + "1", + "--run-config-search-max-text-input-length", + "1", + "--run-config-search-max-periodic-concurrency", + "64", + "--run-config-search-min-periodic-concurrency-step", + "8", ] self._run_and_test_perf_analyzer_config_generator( yaml_str, expected_configs, pa_cli_args From b3aba30b092d4f092921e63ebf2a8bfffecc4649 Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Thu, 19 Oct 2023 10:03:38 -0700 Subject: [PATCH 07/12] Llm testing live run (#778) * Created a new class ConfigRangeNumeric and using it for periodic-concurrency * Fixes and defaults for periodic concurrency * First unit test passing * PACG chagnes complete. Unit tests updated and passing * Removing uneeded class * Changes to fix live run * Minor refactor and cleanup * Removing json files * Changing to use f-string * More cleanup from hwoo CR * Removing stale code for request period * Fix nit --- model_analyzer/analyzer.py | 5 +- .../automatic_model_config_generator.py | 2 +- ...plus_binary_search_run_config_generator.py | 6 +- .../generate/brute_run_config_generator.py | 2 +- .../perf_analyzer_config_generator.py | 55 ++++++++++++++++--- model_analyzer/perf_analyzer/perf_analyzer.py | 14 ++++- model_analyzer/record/metrics_manager.py | 5 +- model_analyzer/record/record.py | 2 +- tests/common/test_utils.py | 22 ++++++-- tests/test_perf_analyzer_config_generator.py | 14 ++++- 10 files changed, 105 insertions(+), 22 deletions(-) diff --git a/model_analyzer/analyzer.py b/model_analyzer/analyzer.py index c68acae3f..750c2a8ba 100755 --- a/model_analyzer/analyzer.py +++ b/model_analyzer/analyzer.py @@ -137,7 +137,10 @@ def profile( if not self._config.skip_summary_reports: self._create_summary_tables(verbose) self._create_summary_reports(mode) - self._create_detailed_reports(mode) + + # FIXME: need to figure out detailed reporting for LLMs + if not self._config.is_llm_model(): + self._create_detailed_reports(mode) self._check_for_perf_analyzer_errors() diff --git a/model_analyzer/config/generate/automatic_model_config_generator.py b/model_analyzer/config/generate/automatic_model_config_generator.py index 283f112d0..c4d7595b4 100755 --- a/model_analyzer/config/generate/automatic_model_config_generator.py +++ b/model_analyzer/config/generate/automatic_model_config_generator.py @@ -88,7 +88,7 @@ def __init__( self._reset_max_batch_size() - if not self._early_exit_enable: + if not self._early_exit_enable and not self._config.is_llm_model(): raise TritonModelAnalyzerException( "Early exit disable is not supported in automatic model config generator" ) diff --git a/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py b/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py index 78d55a1bc..efe403041 100755 --- a/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py +++ b/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py @@ -116,7 +116,11 @@ def _create_brute_run_config_generator(self) -> BruteRunConfigGenerator: def _can_binary_search_top_results(self) -> bool: for model in self._models: - if model.parameters()["concurrency"] or model.parameters()["request_rate"]: + if ( + model.parameters()["concurrency"] + or model.parameters()["request_rate"] + or self._config.is_llm_model() + ): return False return True diff --git a/model_analyzer/config/generate/brute_run_config_generator.py 
b/model_analyzer/config/generate/brute_run_config_generator.py index d226811aa..151e97fde 100755 --- a/model_analyzer/config/generate/brute_run_config_generator.py +++ b/model_analyzer/config/generate/brute_run_config_generator.py @@ -80,7 +80,7 @@ def __init__( self._curr_results: List = [[] for n in range(self._num_models)] self._curr_generators: Dict[int, ConfigGeneratorInterface] = {} - self._skip_default_config = skip_default_config + self._skip_default_config = skip_default_config or config.is_llm_model() def set_last_results( self, measurements: List[Optional[RunConfigMeasurement]] diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 104ed79e6..7e5154aee 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -24,6 +24,7 @@ DEFAULT_INPUT_JSON_PATH, DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, + DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD, DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH, DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY, @@ -115,13 +116,14 @@ def __init__( self._batch_sizes = sorted(model_parameters["batch_sizes"]) self._text_input_lengths = self._create_text_input_length_list() self._max_token_counts = self._create_max_token_count_list() + self._request_periods = self._create_request_period_list() self._perf_config_parameter_values = self._create_parameter_perf_config_values() self._parameter_count = len( utils.generate_parameter_combinations(self._perf_config_parameter_values) ) - self._input_json_filename = DEFAULT_INPUT_JSON_PATH + "/input-data.json" + self._input_json_base_filename = DEFAULT_INPUT_JSON_PATH + "/input-data-" self._generate_perf_configs() @@ -321,6 +323,20 @@ def _create_max_token_count_list(self) -> List[int]: self._cli_config.run_config_search_max_max_token_count, ) + def _create_request_period_list(self) -> List[int]: + if not self._cli_config.is_llm_model(): + return [] + + if self._model_parameters["request_period"]: + return sorted(self._model_parameters["period"]) + elif self._cli_config.run_config_search_disable: + return [DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD] + else: + return utils.generate_doubled_list( + self._cli_config.run_config_search_min_request_period, + self._cli_config.run_config_search_max_request_period, + ) + def _generate_perf_configs(self) -> None: parameter_combinations = utils.generate_parameter_combinations( self._perf_config_parameter_values @@ -377,8 +393,23 @@ def _extract_text_input_length( def _update_perf_config_based_on_parameter_combination( self, perf_config: PerfAnalyzerConfig, parameter_combination: Dict ) -> None: + if "request-parameter" in parameter_combination: + request_parameter = parameter_combination["request-parameter"] + max_tokens = self._extract_max_tokens_from_request_parameter( + request_parameter + ) + parameter_combination["request-period"] = ( + max_tokens + if max_tokens < parameter_combination["request-period"] + else parameter_combination["request-period"] + ) + perf_config.update_config(parameter_combination) + def _extract_max_tokens_from_request_parameter(self, request_parameter: str) -> int: + _, max_tokens, _ = request_parameter.split(":") + return int(max_tokens) + def _update_perf_config_based_on_perf_analyzer_flags( self, perf_config: PerfAnalyzerConfig ) -> None: @@ -389,6 +420,7 @@ def _update_perf_config_based_on_inference_load( ) -> None: if 
self._cli_config.is_llm_model(): perf_config.update_config({"periodic-concurrency-range": inference_load}) + perf_config.update_config({"streaming": "True"}) elif self._cli_config.is_request_rate_specified(self._model_parameters): perf_config.update_config({"request-rate-range": inference_load}) else: @@ -400,21 +432,29 @@ def _update_perf_config_for_llm_model( if not self._cli_config.is_llm_model(): return + input_json_filename = ( + self._input_json_base_filename + f"{text_input_length}.json" + ) modified_input_dict = self._modify_text_in_input_dict(text_input_length) - self._write_modified_input_dict_to_file(modified_input_dict) + self._write_modified_input_dict_to_file( + modified_input_dict, input_json_filename + ) - perf_config.update_config({"input-data": self._input_json_filename}) + perf_config.update_config({"input-data": input_json_filename}) def _modify_text_in_input_dict(self, text_input_length: int) -> Dict: modified_text = " ".join(repeat("Hello", text_input_length)) modified_input_dict = {k: v for k, v in self._llm_input_dict.items()} - modified_input_dict["data"][0]["text-input"] = modified_text + # FIXME: this needs to be updated once tritonserver/PA are updated TMA-1414 + modified_input_dict["data"][0]["PROMPT"] = [modified_text] return modified_input_dict - def _write_modified_input_dict_to_file(self, modified_input_dict: Dict) -> None: - with open(self._input_json_filename, "w") as f: + def _write_modified_input_dict_to_file( + self, modified_input_dict: Dict, input_json_filename: str + ) -> None: + with open(input_json_filename, "w") as f: json.dump(modified_input_dict, f) def _create_parameter_perf_config_values(self) -> dict: @@ -424,8 +464,9 @@ def _create_parameter_perf_config_values(self) -> dict: if self._cli_config.is_llm_model(): perf_config_values["request-parameter"] = [ - "max_token:" + str(mtc) + ":int" for mtc in self._max_token_counts + f"max_tokens:{str(mtc)}:int" for mtc in self._max_token_counts ] + perf_config_values["request-period"] = self._request_periods perf_config_values["text-input-length"] = self._text_input_lengths return perf_config_values diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index 49f15f5a2..2decbf6f2 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -99,7 +99,7 @@ class PerfAnalyzer: ] llm_metric_table = [ - ["avg_first_latency", None, AvgFirstTokenLatency, "1000"], + ["avg_first_token_latency", None, AvgFirstTokenLatency, "1000"], ["avg_token_to_token_latency", None, AvgTokenToTokenLatency, "1000"] ] # yapf: enable @@ -285,6 +285,14 @@ def _get_single_model_cmd(self, index): if self._is_multi_model(): cmd += ["--enable-mpi"] cmd += self._get_pa_cli_command(index).replace("=", " ").split() + + # OPTME: There should be a more elegant way of determining how to add EOS + # We have to do it here because we use a dictionary to create the PA command + # and it already contains `--request-parameter` + if "--periodic-concurrency-range" in cmd: + cmd.append("--request-parameter") + cmd.append("ignore_eos:true:bool") + return cmd def _get_pa_cli_command(self, index): @@ -539,7 +547,7 @@ def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float: request["response_timestamps"][0] - request["timestamp"] ) - avg_first_token_latency = mean(total_first_token_latencies) + avg_first_token_latency = float(mean(total_first_token_latencies)) return avg_first_token_latency @@ -554,7 +562,7 @@ def 
_calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float: token_to_token_latencies.append(mean(response_to_response_latencies)) - avg_token_to_token_latency = mean(token_to_token_latencies) + avg_token_to_token_latency = float(mean(token_to_token_latencies)) return avg_token_to_token_latency diff --git a/model_analyzer/record/metrics_manager.py b/model_analyzer/record/metrics_manager.py index fe77f6eb8..e703e19a2 100755 --- a/model_analyzer/record/metrics_manager.py +++ b/model_analyzer/record/metrics_manager.py @@ -572,8 +572,11 @@ def _run_perf_analyzer( self._handle_unsuccessful_perf_analyzer_run(perf_analyzer) return (None, None) + # FIXME: PA does not return a latency report file if an export report file is specified perf_records = ( - perf_analyzer.get_perf_records() + perf_analyzer.get_llm_records() + perf_analyzer.get_llm_records() + if self._config.is_llm_model() + else perf_analyzer.get_perf_records() ) gpu_records = perf_analyzer.get_gpu_records() diff --git a/model_analyzer/record/record.py b/model_analyzer/record/record.py index 23aa9e50f..8a55b6a88 100755 --- a/model_analyzer/record/record.py +++ b/model_analyzer/record/record.py @@ -101,7 +101,7 @@ def __init__(self, value, timestamp): Parameters ---------- value : float or int - The value of the GPU metrtic + The value of the GPU metric timestamp : int The timestamp for the record in nanoseconds """ diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index e8448ae98..64c7525ef 100755 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -29,6 +29,8 @@ DEFAULT_OUTPUT_MODEL_REPOSITORY, DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, + DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD, + DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH, DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY, DEFAULT_TRITON_GRPC_ENDPOINT, DEFAULT_TRITON_HTTP_ENDPOINT, @@ -244,6 +246,8 @@ def construct_perf_analyzer_config( periodic_concurrency=DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY, request_rate=None, max_token_count=DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, + text_input_length=DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH, + request_period=DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD, launch_mode=DEFAULT_TRITON_LAUNCH_MODE, client_protocol=DEFAULT_CLIENT_PROTOCOL, perf_analyzer_flags=None, @@ -266,6 +270,12 @@ def construct_perf_analyzer_config( The concurrency value for this PA configuration periodic_concurrency: list The periodic concurrency value for this PA configuration + max_token_count: int + The max token count for this PA configuration + text_input_length: int + The text input length for this PA configuration + request_period: int + The request period for this PA configuration request_rate: int The request rate value for this PA configuration launch_mode: str @@ -299,10 +309,14 @@ def construct_perf_analyzer_config( pa_config._args["concurrency-range"] = concurrency if llm_search_mode: - pa_config._args["request-parameter"] = ( - "max_token:" + str(max_token_count) + ":int" - ) - pa_config._args["input-data"] = DEFAULT_INPUT_JSON_PATH + "/input-data.json" + pa_config._args["request-parameter"] = f"max_tokens:{str(max_token_count)}:int" + + pa_config._args["request-period"] = request_period + pa_config._args[ + "input-data" + ] = f"{DEFAULT_INPUT_JSON_PATH}/input-data-{str(text_input_length)}.json" + + pa_config._args["streaming"] = "True" pa_config._args["measurement-mode"] = DEFAULT_MEASUREMENT_MODE diff --git a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py index 
a405e2df6..3a6f6795d 100755 --- a/tests/test_perf_analyzer_config_generator.py +++ b/tests/test_perf_analyzer_config_generator.py @@ -596,6 +596,8 @@ def test_llm_search_max_token_count(self): "32", "--run-config-search-max-text-input-length", "1", + "--run-config-search-max-request-period", + "1", ] self._run_and_test_perf_analyzer_config_generator( yaml_str, expected_configs, pa_cli_args @@ -622,11 +624,13 @@ def test_llm_search_text_input_length(self): periodic_concurrencies = ["16:32:4", "16:32:8", "16:32:16"] expected_configs = [] - for _ in text_input_lengths: + for til in text_input_lengths: for pc in periodic_concurrencies: expected_configs.append( construct_perf_analyzer_config( - llm_search_mode=True, periodic_concurrency=pc + llm_search_mode=True, + periodic_concurrency=pc, + text_input_length=til, ) ) @@ -636,6 +640,8 @@ def test_llm_search_text_input_length(self): "32", "--run-config-search-max-max-token-count", "1", + "--run-config-search-max-request-period", + "1", ] self._run_and_test_perf_analyzer_config_generator( yaml_str, expected_configs, pa_cli_args @@ -673,6 +679,8 @@ def test_periodic_concurrency_parameter(self): "1", "--run-config-search-max-text-input-length", "1", + "--run-config-search-max-request-period", + "1", ] self._run_and_test_perf_analyzer_config_generator( yaml_str, expected_configs, pa_cli_args @@ -722,6 +730,8 @@ def test_periodic_concurrency_search(self): "64", "--run-config-search-min-periodic-concurrency-step", "8", + "--run-config-search-max-request-period", + "1", ] self._run_and_test_perf_analyzer_config_generator( yaml_str, expected_configs, pa_cli_args From f508625efbf433b7875c73fbc4881aa623c185e6 Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Fri, 20 Oct 2023 14:27:37 -0700 Subject: [PATCH 08/12] Changes to get LLM summary reports working (#779) * Changes to get LLM summary reports working * Addressing hwoo's CR --- .../config/generate/generator_utils.py | 21 +++++- .../perf_analyzer_config_generator.py | 8 +- .../config/input/config_command_profile.py | 9 ++- .../config/input/config_defaults.py | 12 +++ model_analyzer/perf_analyzer/perf_analyzer.py | 14 +++- model_analyzer/perf_analyzer/perf_config.py | 9 ++- .../record/types/avg_first_token_latency.py | 2 +- .../types/avg_token_to_token_latency.py | 2 +- model_analyzer/result/result_table_manager.py | 75 ++++++++++++++++++- tests/test_result_table_manager.py | 4 + 10 files changed, 137 insertions(+), 19 deletions(-) diff --git a/model_analyzer/config/generate/generator_utils.py b/model_analyzer/config/generate/generator_utils.py index ceef010ca..f3b6a61d5 100755 --- a/model_analyzer/config/generate/generator_utils.py +++ b/model_analyzer/config/generate/generator_utils.py @@ -80,8 +80,8 @@ def generate_combinations(value: object) -> List: @staticmethod def generate_parameter_combinations(params: Dict) -> List[Dict]: """ - Generate a list of all possible subdictionaries - from given dictionary. The subdictionaries will + Generate a list of all possible sub-dictionaries + from given dictionary. The sub-dictionaries will have all the same keys, but only one value from each key. 
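# Editor's note: a minimal sketch of the combination behavior the docstring above
# describes, not the repository's implementation; the function name and the sample
# keys below are illustrative only.
from itertools import product

def sketch_parameter_combinations(params):
    # One combination per element of the Cartesian product of the value lists;
    # every resulting dict keeps all of the original keys.
    keys, value_lists = zip(*params.items())
    return [dict(zip(keys, combo)) for combo in product(*value_lists)]

# {"concurrency-range": [1, 2], "request-period": [32]} ->
# [{"concurrency-range": 1, "request-period": 32},
#  {"concurrency-range": 2, "request-period": 32}]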
@@ -116,3 +116,20 @@ def generate_doubled_list(min_value: int, max_value: int) -> List[int]: list.append(val) val *= 2 return list + + @staticmethod + def extract_value_from_request_parameter(request_parameter: str) -> int: + # Format is: :: + # Example: max_tokens:10:int + + _, value, _ = request_parameter.split(":") + + return int(value) + + @staticmethod + def extract_text_input_length_from_input_data(input_data: str) -> int: + # format is input-data-.json + _, _, text_input_length = input_data.split("-") + text_input_length, _ = text_input_length.split(".") + + return int(text_input_length) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 7e5154aee..7f134813a 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -395,9 +395,7 @@ def _update_perf_config_based_on_parameter_combination( ) -> None: if "request-parameter" in parameter_combination: request_parameter = parameter_combination["request-parameter"] - max_tokens = self._extract_max_tokens_from_request_parameter( - request_parameter - ) + max_tokens = utils.extract_value_from_request_parameter(request_parameter) parameter_combination["request-period"] = ( max_tokens if max_tokens < parameter_combination["request-period"] @@ -406,10 +404,6 @@ def _update_perf_config_based_on_parameter_combination( perf_config.update_config(parameter_combination) - def _extract_max_tokens_from_request_parameter(self, request_parameter: str) -> int: - _, max_tokens, _ = request_parameter.split(":") - return int(max_tokens) - def _update_perf_config_based_on_perf_analyzer_flags( self, perf_config: PerfAnalyzerConfig ) -> None: diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py index 9da3e7d31..da11ac967 100755 --- a/model_analyzer/config/input/config_command_profile.py +++ b/model_analyzer/config/input/config_command_profile.py @@ -1419,9 +1419,14 @@ def _autofill_values(self): {"perf_throughput": {"min": self.min_throughput}} ) - # Switch default output fields if request rate is being used + # Switch default output fields if LLM model or request rate is being used # and the user didn't specify a custom output field - if self._using_request_rate(): + if self.is_llm_model(): + if not self._fields["inference_output_fields"].is_set_by_user(): + self.inference_output_fields = ( + config_defaults.DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS + ) + elif self._using_request_rate(): if not self._fields["inference_output_fields"].is_set_by_user(): self.inference_output_fields = ( config_defaults.DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py index aad674838..fd48812b8 100755 --- a/model_analyzer/config/input/config_defaults.py +++ b/model_analyzer/config/input/config_defaults.py @@ -142,6 +142,18 @@ "perf_throughput", "perf_latency_p99", ] +DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS = [ + "model_name", + "batch_size", + "periodic_concurrency", + "request_period", + "text_input_length", + "max_tokens", + "model_config_path", + "instance_group", + "avg_first_token_latency", + "avg_token_to_token_latency", +] DEFAULT_GPU_OUTPUT_FIELDS = [ "model_name", "gpu_uuid", diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index 2decbf6f2..0a8f2b725 100755 --- 
a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -99,8 +99,8 @@ class PerfAnalyzer: ] llm_metric_table = [ - ["avg_first_token_latency", None, AvgFirstTokenLatency, "1000"], - ["avg_token_to_token_latency", None, AvgTokenToTokenLatency, "1000"] + ["avg_first_token_latency", None, AvgFirstTokenLatency, "1000000"], + ["avg_token_to_token_latency", None, AvgTokenToTokenLatency, "1000000"] ] # yapf: enable @@ -548,8 +548,11 @@ def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float: ) avg_first_token_latency = float(mean(total_first_token_latencies)) + reduction_factor = float( + PerfAnalyzer.llm_metric_table[0][PerfAnalyzer.REDUCTION_FACTOR] # type: ignore + ) - return avg_first_token_latency + return avg_first_token_latency / reduction_factor def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float: token_to_token_latencies = [] @@ -563,8 +566,11 @@ def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float: token_to_token_latencies.append(mean(response_to_response_latencies)) avg_token_to_token_latency = float(mean(token_to_token_latencies)) + reduction_factor = float( + PerfAnalyzer.llm_metric_table[1][PerfAnalyzer.REDUCTION_FACTOR] # type: ignore + ) - return avg_token_to_token_latency + return avg_token_to_token_latency / reduction_factor def _extract_perf_records_from_row( self, requested_metrics: List[Record], row_metrics: Dict[str, str] diff --git a/model_analyzer/perf_analyzer/perf_config.py b/model_analyzer/perf_analyzer/perf_config.py index a72cdc3b1..5f7579175 100755 --- a/model_analyzer/perf_analyzer/perf_config.py +++ b/model_analyzer/perf_analyzer/perf_config.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from model_analyzer.config.generate.generator_utils import GeneratorUtils as utils from model_analyzer.config.input.config_defaults import DEFAULT_MEASUREMENT_MODE from model_analyzer.constants import SECONDS_TO_MILLISECONDS_MULTIPLIER from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException @@ -283,7 +284,13 @@ def extract_model_specific_parameters(self): "concurrency-range": self._args["concurrency-range"], "request-rate-range": self._args["request-rate-range"], "periodic-concurrency-range": self._args["periodic-concurrency-range"], - "max-tokens": self._args["request-parameter"], + "max-tokens": utils.extract_value_from_request_parameter( + self._args["request-parameter"] + ), + "request-period": self._args["request-period"], + "text-input-length": utils.extract_text_input_length_from_input_data( + self._args["input-data"] + ), } @classmethod diff --git a/model_analyzer/record/types/avg_first_token_latency.py b/model_analyzer/record/types/avg_first_token_latency.py index 72d539633..fe862aad6 100755 --- a/model_analyzer/record/types/avg_first_token_latency.py +++ b/model_analyzer/record/types/avg_first_token_latency.py @@ -57,7 +57,7 @@ def header(cls, aggregation_tag=False): metric. """ - return "avg first token-to-token latency (ms)" + return "Avg First Token latency (ms)" def __eq__(self, other): """ diff --git a/model_analyzer/record/types/avg_token_to_token_latency.py b/model_analyzer/record/types/avg_token_to_token_latency.py index 66c93b6fc..72ccdfe5f 100755 --- a/model_analyzer/record/types/avg_token_to_token_latency.py +++ b/model_analyzer/record/types/avg_token_to_token_latency.py @@ -57,7 +57,7 @@ def header(cls, aggregation_tag=False): metric. 
""" - return "avg token-to-token latency (ms)" + return "Avg Token-to-Token latency (ms)" def __eq__(self, other): """ diff --git a/model_analyzer/result/result_table_manager.py b/model_analyzer/result/result_table_manager.py index 12a406e7c..83a6b8bd4 100755 --- a/model_analyzer/result/result_table_manager.py +++ b/model_analyzer/result/result_table_manager.py @@ -37,6 +37,10 @@ class ResultTableManager: "model_name": "Model", "batch_size": "Batch", "concurrency": "Concurrency", + "periodic_concurrency": "Periodic Concurrency", + "text_input_length": "Text Input Length", + "max_tokens": "Max Tokens", + "request_period": "Request Period", "request_rate": "Request Rate", "model_config_path": "Model Config Path", "instance_group": "Instance Group", @@ -456,6 +460,10 @@ def _tabulate_measurement( batch_sizes, concurrencies, request_rates, + periodic_concurrencies, + max_tokens, + request_period, + text_input_length, ) = self._tabulate_measurement_setup(run_config_measurement) satisfies = "Yes" if passes else "No" @@ -467,6 +475,10 @@ def _tabulate_measurement( batch_sizes, concurrencies, request_rates, + periodic_concurrencies, + max_tokens, + request_period, + text_input_length, satisfies, model_name, model_config_name, @@ -494,6 +506,10 @@ def _tabulate_measurement( batch_sizes, concurrencies, request_rates, + periodic_concurrencies, + max_tokens, + request_period, + text_input_length, satisfies, model_name, model_config_name, @@ -526,8 +542,37 @@ def _tabulate_measurement_setup(self, run_config_measurement): for pa_params in model_specific_pa_params if "request-rate-range" in pa_params ] + periodic_concurrencies = [ + pa_params["periodic-concurrency-range"] + for pa_params in model_specific_pa_params + if "periodic-concurrency-range" in pa_params + ] + max_tokens = [ + pa_params["max-tokens"] + for pa_params in model_specific_pa_params + if "max-tokens" in pa_params + ] + request_periods = [ + pa_params["request-period"] + for pa_params in model_specific_pa_params + if "request-period" in pa_params + ] + text_input_lengths = [ + pa_params["text-input-length"] + for pa_params in model_specific_pa_params + if "text-input-length" in pa_params + ] - return model_specific_pa_params, batch_sizes, concurrencies, request_rates + return ( + model_specific_pa_params, + batch_sizes, + concurrencies, + request_rates, + periodic_concurrencies, + max_tokens, + request_periods, + text_input_lengths, + ) def _populate_inference_rows( self, run_config_measurement, inference_fields, inference_row @@ -574,6 +619,10 @@ def _get_common_row_items( batch_sizes, concurrencies, request_rates, + periodic_concurrencies, + max_tokens, + request_period, + text_input_length, satisfies, model_name, model_config_path, @@ -604,6 +653,30 @@ def _get_common_row_items( if request_rate_index is not None: row[request_rate_index] = format_for_csv(request_rates) + # Periodic Concurrency + periodic_concurrency_index = self._find_index_for_field( + fields, "periodic_concurrency" + ) + if periodic_concurrency_index is not None: + row[periodic_concurrency_index] = format_for_csv(periodic_concurrencies) + + # Max Tokens + max_tokens_index = self._find_index_for_field(fields, "max_tokens") + if max_tokens_index is not None: + row[max_tokens_index] = format_for_csv(max_tokens) + + # Request Period + request_period_index = self._find_index_for_field(fields, "request_period") + if request_period_index is not None: + row[request_period_index] = format_for_csv(request_period) + + # Text Input Length + text_input_length_index = 
self._find_index_for_field( + fields, "text_input_length" + ) + if text_input_length_index is not None: + row[text_input_length_index] = format_for_csv(text_input_length) + # Satisfies satisfies_constraints_index = self._find_index_for_field( fields, "satisfies_constraints" diff --git a/tests/test_result_table_manager.py b/tests/test_result_table_manager.py index 278ceb5ea..854bb0543 100755 --- a/tests/test_result_table_manager.py +++ b/tests/test_result_table_manager.py @@ -304,6 +304,10 @@ def test_get_common_row_items_with_backend_parameters(self): dynamic_batchings=None, instance_groups=None, max_batch_sizes=None, + periodic_concurrencies=None, + max_tokens=None, + request_period=None, + text_input_length=None, backend_parameters=backend_parameters, ) self.assertEqual( From 88545899f35f594f83d3ada96a2f2315ad7f37c1 Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Tue, 31 Oct 2023 14:58:08 -0700 Subject: [PATCH 09/12] Adding illegal LLM checks w/ unit testing + some minor cleanup (#781) * Adding illegal LLM checks w/ unit testing + some minor cleanup * Updated with TMA --- .../perf_analyzer_config_generator.py | 2 +- model_analyzer/config/input/config_command.py | 48 ++++++++++++ .../config/input/config_command_report.py | 4 + model_analyzer/perf_analyzer/perf_config.py | 9 ++- tests/test_config.py | 74 +++++++++++++++++++ 5 files changed, 135 insertions(+), 2 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 7f134813a..8d10459ae 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -328,7 +328,7 @@ def _create_request_period_list(self) -> List[int]: return [] if self._model_parameters["request_period"]: - return sorted(self._model_parameters["period"]) + return sorted(self._model_parameters["request_period"]) elif self._cli_config.run_config_search_disable: return [DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD] else: diff --git a/model_analyzer/config/input/config_command.py b/model_analyzer/config/input/config_command.py index 23e4fc484..59d3e87ce 100755 --- a/model_analyzer/config/input/config_command.py +++ b/model_analyzer/config/input/config_command.py @@ -129,6 +129,7 @@ def _check_for_illegal_config_settings( self._check_for_bls_incompatibility(args, yaml_config) self._check_for_concurrency_rate_request_conflicts(args, yaml_config) self._check_for_config_search_rate_request_conflicts(args, yaml_config) + self._check_for_llm_incompatibility(args, yaml_config) def _set_field_values( self, args: Namespace, yaml_config: Optional[Dict[str, List]] @@ -398,6 +399,53 @@ def _check_for_config_search_rate_request_conflicts( f"\nCannot have both `run-config-search-max-request-rate` and `run-config-search-min/max-concurrency` specified in the config/CLI." ) + def _check_for_llm_incompatibility( + self, args: Namespace, yaml_config: Optional[Dict[str, List]] + ) -> None: + if not self._get_config_value("llm_search_enable", args, yaml_config): + return + + if ( + self._get_config_value("run_config_search_mode", args, yaml_config) + == "quick" + ): + raise TritonModelAnalyzerException( + f"\nLLM models are not supported in quick search. Please use brute search mode." 
+ ) + + self._check_for_illegal_llm_option( + args, yaml_config, "run_config_search_min_model_batch_size" + ) + self._check_for_illegal_llm_option( + args, yaml_config, "run_config_search_max_model_batch_size" + ) + self._check_for_illegal_llm_option( + args, yaml_config, "run_config_search_min_concurrency" + ) + self._check_for_illegal_llm_option( + args, yaml_config, "run_config_search_max_concurrency" + ) + self._check_for_illegal_llm_option( + args, yaml_config, "run_config_search_min_request_rate" + ) + self._check_for_illegal_llm_option( + args, yaml_config, "run_config_search_max_request_rate" + ) + self._check_for_illegal_llm_option( + args, yaml_config, "request_rate_search_enable" + ) + self._check_for_illegal_llm_option(args, yaml_config, "concurrency") + self._check_for_illegal_llm_option(args, yaml_config, "latency_budget") + self._check_for_illegal_llm_option(args, yaml_config, "min_throughput") + + def _check_for_illegal_llm_option( + self, args: Namespace, yaml_config: Optional[Dict[str, List]], option: str + ) -> None: + if self._get_config_value(option, args, yaml_config): + raise TritonModelAnalyzerException( + f"\nLLM models do not support setting the `{option}` option when profiling." + ) + def _preprocess_and_verify_arguments(self): """ Enforces some rules on the config. diff --git a/model_analyzer/config/input/config_command_report.py b/model_analyzer/config/input/config_command_report.py index 7d1eee7fb..d4cf9b536 100755 --- a/model_analyzer/config/input/config_command_report.py +++ b/model_analyzer/config/input/config_command_report.py @@ -209,6 +209,10 @@ def set_config_values(self, args): super().set_config_values(args) + # TODO TMA-1443: Update this when adding support for detailed reporting + def is_llm_model(self) -> bool: + return False + def _preprocess_and_verify_arguments(self): """ Enforces some rules on the config. 
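For illustration, the per-option guard that `_check_for_llm_incompatibility` applies above has roughly the following shape; this standalone sketch assumes a plain dict of resolved option values and a generic exception, whereas the real code looks options up through `_get_config_value()` against the CLI args and YAML and raises `TritonModelAnalyzerException`.

def check_illegal_llm_option(resolved_options: dict, option: str) -> None:
    # Any LLM-incompatible option that the user actually set aborts profiling.
    if resolved_options.get(option):
        raise ValueError(
            f"\nLLM models do not support setting the `{option}` option when profiling."
        )

# Unset options pass through; a set option raises.
check_illegal_llm_option({"latency_budget": None}, "latency_budget")   # no error
# check_illegal_llm_option({"latency_budget": 100}, "latency_budget")  # raises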
diff --git a/model_analyzer/perf_analyzer/perf_config.py b/model_analyzer/perf_analyzer/perf_config.py index 5f7579175..35df6b7a3 100755 --- a/model_analyzer/perf_analyzer/perf_config.py +++ b/model_analyzer/perf_analyzer/perf_config.py @@ -101,6 +101,8 @@ class PerfAnalyzerConfig: "collect-metrics", ] + llm_args = ["text-input-length", "max-tokens"] + def __init__(self): """ Construct a PerfAnalyzerConfig @@ -152,7 +154,12 @@ def allowed_keys(cls): passed into perf_analyzer """ - return cls.perf_analyzer_args + cls.input_to_options + cls.input_to_verbose + return ( + cls.perf_analyzer_args + + cls.input_to_options + + cls.input_to_verbose + + cls.llm_args + ) @classmethod def additive_keys(cls): diff --git a/tests/test_config.py b/tests/test_config.py index 72af999fe..2dd371787 100755 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -2340,6 +2340,80 @@ def _test_arg_conflict( with self.assertRaises(TritonModelAnalyzerException): self._evaluate_config(args, yaml_content) + def test_llm_mode_rcs(self): + """ + Test RCS options for an LLM model + """ + yaml_content = "" + + self._test_llm_mode_case( + yaml_content, + ["--run-config-search-mode", "brute"], + is_legal=True, + use_value=False, + use_list=False, + ) + self._test_llm_mode_case( + yaml_content, + ["--run-config-search-mode", "quick"], + use_value=False, + use_list=False, + ) + + self._test_llm_mode_case( + yaml_content, ["--run-config-search-min-model-batch-size"] + ) + self._test_llm_mode_case( + yaml_content, ["--run-config-search-max-model-batch-size"] + ) + self._test_llm_mode_case(yaml_content, ["--run-config-search-min-concurrency"]) + self._test_llm_mode_case(yaml_content, ["--run-config-search-max-concurrency"]) + self._test_llm_mode_case(yaml_content, ["--run-config-search-min-request-rate"]) + self._test_llm_mode_case(yaml_content, ["--run-config-search-max-request-rate"]) + self._test_llm_mode_case( + yaml_content, + ["--request-rate-search-enable"], + use_value=False, + use_list=False, + ) + self._test_llm_mode_case(yaml_content, ["--concurrency"]) + self._test_llm_mode_case(yaml_content, ["--latency-budget"]) + self._test_llm_mode_case(yaml_content, ["--min-throughput"]) + + def _test_llm_mode_case( + self, + yaml_content: Optional[Dict[str, List]], + options_string: str, + is_legal: bool = False, + use_value: bool = True, + use_list: bool = True, + ) -> None: + """ + Tests that options raise exceptions in LLM mode + """ + args = [ + "model-analyzer", + "profile", + "--model-repository", + "cli-repository", + "--profile-models", + "test_llm_modelA", + "--llm-search-enable", + ] + + args.extend(options_string) + + if use_value: + args.append("1") + elif use_list: + args.append(["1", "2", "4"]) + + if is_legal: + self._evaluate_config(args, yaml_content, subcommand="profile") + else: + with self.assertRaises(TritonModelAnalyzerException): + self._evaluate_config(args, yaml_content, subcommand="profile") + if __name__ == "__main__": unittest.main() From d9e075b8ebec41779e470962544c4bc459aa0870 Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Tue, 31 Oct 2023 15:47:02 -0700 Subject: [PATCH 10/12] Misc LLM cleanup (#782) * General cleanup * Add ticket nums to todos --- model_analyzer/analyzer.py | 7 +++++-- .../config/generate/perf_analyzer_config_generator.py | 4 ++++ model_analyzer/config/input/config_defaults.py | 2 +- model_analyzer/record/metrics_manager.py | 5 +++++ tests/test_model_manager.py | 4 +++- 5 files changed, 18 insertions(+), 4 deletions(-) diff --git 
a/model_analyzer/analyzer.py b/model_analyzer/analyzer.py index 750c2a8ba..f58bd1bdb 100755 --- a/model_analyzer/analyzer.py +++ b/model_analyzer/analyzer.py @@ -136,9 +136,12 @@ def profile( if not self._config.skip_summary_reports: self._create_summary_tables(verbose) - self._create_summary_reports(mode) - # FIXME: need to figure out detailed reporting for LLMs + # TODO TMA-1401: need to figure out summary reporting for LLMs + if not self._config.is_llm_model(): + self._create_summary_reports(mode) + + # TODO TMA-1443: need to figure out detailed reporting for LLMs if not self._config.is_llm_model(): self._create_detailed_reports(mode) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 8d10459ae..10c86e610 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -16,6 +16,7 @@ import json import logging +import os from itertools import repeat from typing import Any, Dict, Generator, List, Optional, Tuple @@ -448,6 +449,9 @@ def _modify_text_in_input_dict(self, text_input_length: int) -> Dict: def _write_modified_input_dict_to_file( self, modified_input_dict: Dict, input_json_filename: str ) -> None: + if not os.path.exists(DEFAULT_INPUT_JSON_PATH): + os.makedirs(DEFAULT_INPUT_JSON_PATH) + with open(input_json_filename, "w") as f: json.dump(modified_input_dict, f) diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py index fd48812b8..fb0b62ee8 100755 --- a/model_analyzer/config/input/config_defaults.py +++ b/model_analyzer/config/input/config_defaults.py @@ -38,7 +38,7 @@ DEFAULT_SKIP_SUMMARY_REPORTS = False DEFAULT_SKIP_DETAILED_REPORTS = False DEFAULT_OUTPUT_MODEL_REPOSITORY = os.path.join(os.getcwd(), "output_model_repository") -DEFAULT_INPUT_JSON_PATH = os.getcwd() +DEFAULT_INPUT_JSON_PATH = os.path.join(os.getcwd(), "input_json_dir") DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG = False DEFAULT_BATCH_SIZES = 1 DEFAULT_MAX_RETRIES = 50 diff --git a/model_analyzer/record/metrics_manager.py b/model_analyzer/record/metrics_manager.py index e703e19a2..10459a76f 100755 --- a/model_analyzer/record/metrics_manager.py +++ b/model_analyzer/record/metrics_manager.py @@ -16,6 +16,7 @@ import logging import os +import shutil import time from collections import defaultdict from typing import Dict, List, Optional, Tuple @@ -27,6 +28,7 @@ from model_analyzer.config.generate.base_model_config_generator import ( BaseModelConfigGenerator, ) +from model_analyzer.config.input.config_defaults import DEFAULT_INPUT_JSON_PATH from model_analyzer.config.run.run_config import RunConfig from model_analyzer.constants import LOGGER_NAME, PA_ERROR_LOG_FILENAME from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException @@ -309,6 +311,9 @@ def profile_models(self, run_config: RunConfig) -> Optional[RunConfigMeasurement def finalize(self): self._server.stop() + if os.path.exists(DEFAULT_INPUT_JSON_PATH): + shutil.rmtree(DEFAULT_INPUT_JSON_PATH) + def _create_model_variants(self, run_config: RunConfig) -> None: """ Creates and fills all model variant directories diff --git a/tests/test_model_manager.py b/tests/test_model_manager.py index 0370c6c77..fcbd8eee8 100755 --- a/tests/test_model_manager.py +++ b/tests/test_model_manager.py @@ -1294,7 +1294,9 @@ def _test_model_manager(self, yaml_content, expected_ranges, args=None): MagicMock(), ) - 
model_manager.run_models([config.profile_models[0]]) + with patch("shutil.rmtree"): + model_manager.run_models([config.profile_models[0]]) + self.mock_model_config.stop() self._check_results(model_manager, expected_ranges) From f229273f560e5c932dbac105815779ddf8275055 Mon Sep 17 00:00:00 2001 From: braf Date: Wed, 1 Nov 2023 14:48:19 +0000 Subject: [PATCH 11/12] Fix for non-LLM breaking bug introduced. --- model_analyzer/config/generate/generator_utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/model_analyzer/config/generate/generator_utils.py b/model_analyzer/config/generate/generator_utils.py index f3b6a61d5..e7c76099f 100755 --- a/model_analyzer/config/generate/generator_utils.py +++ b/model_analyzer/config/generate/generator_utils.py @@ -15,7 +15,7 @@ # limitations under the License. from itertools import product -from typing import Dict, List +from typing import Dict, List, Optional class GeneratorUtils: @@ -118,16 +118,21 @@ def generate_doubled_list(min_value: int, max_value: int) -> List[int]: return list @staticmethod - def extract_value_from_request_parameter(request_parameter: str) -> int: + def extract_value_from_request_parameter(request_parameter: Optional[str]) -> int: + if not request_parameter: + return 0 + # Format is: :: # Example: max_tokens:10:int - _, value, _ = request_parameter.split(":") return int(value) @staticmethod - def extract_text_input_length_from_input_data(input_data: str) -> int: + def extract_text_input_length_from_input_data(input_data: Optional[str]) -> int: + if not input_data: + return 0 + # format is input-data-.json _, _, text_input_length = input_data.split("-") text_input_length, _ = text_input_length.split(".") From 709531b16332bed84456ae757008c246de71a845 Mon Sep 17 00:00:00 2001 From: braf Date: Wed, 1 Nov 2023 15:29:23 +0000 Subject: [PATCH 12/12] summary table in progress --- model_analyzer/config/generate/generator_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/model_analyzer/config/generate/generator_utils.py b/model_analyzer/config/generate/generator_utils.py index e7c76099f..551fa0c28 100755 --- a/model_analyzer/config/generate/generator_utils.py +++ b/model_analyzer/config/generate/generator_utils.py @@ -126,6 +126,12 @@ def extract_value_from_request_parameter(request_parameter: Optional[str]) -> in # Example: max_tokens:10:int _, value, _ = request_parameter.split(":") + # this catches the case for non-LLM models where the user has specified request parameters + try: + int(value) + except ValueError as _: + return 0 + return int(value) @staticmethod @@ -137,4 +143,10 @@ def extract_text_input_length_from_input_data(input_data: Optional[str]) -> int: _, _, text_input_length = input_data.split("-") text_input_length, _ = text_input_length.split(".") + # this catches the case for non-LLM models where the user has specified input data + try: + int(text_input_length) + except ValueError as _: + return 0 + return int(text_input_length)
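
To close out, a quick usage illustration of the two guarded helpers after patches 11 and 12 are applied; the request-parameter strings and file names below are made-up examples in the formats the helpers expect, and running this assumes the patched model_analyzer package is importable.

from model_analyzer.config.generate.generator_utils import GeneratorUtils

GeneratorUtils.extract_value_from_request_parameter("max_tokens:10:int")         # -> 10
GeneratorUtils.extract_value_from_request_parameter(None)                        # -> 0
GeneratorUtils.extract_value_from_request_parameter("some_key:abc:string")       # -> 0, non-numeric user parameter
GeneratorUtils.extract_text_input_length_from_input_data("input-data-128.json")  # -> 128
GeneratorUtils.extract_text_input_length_from_input_data("my-own-data.json")     # -> 0, user-supplied input data
GeneratorUtils.extract_text_input_length_from_input_data(None)                   # -> 0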