
Llm testing live run #778

Merged 12 commits on Oct 19, 2023
5 changes: 4 additions & 1 deletion model_analyzer/analyzer.py
@@ -137,7 +137,10 @@ def profile(
if not self._config.skip_summary_reports:
self._create_summary_tables(verbose)
self._create_summary_reports(mode)
self._create_detailed_reports(mode)

# FIXME: need to figure out detailed reporting for LLMs
if not self._config.is_llm_model():
self._create_detailed_reports(mode)

self._check_for_perf_analyzer_errors()

@@ -88,7 +88,7 @@ def __init__(

self._reset_max_batch_size()

if not self._early_exit_enable:
if not self._early_exit_enable and not self._config.is_llm_model():
raise TritonModelAnalyzerException(
"Early exit disable is not supported in automatic model config generator"
)
@@ -116,7 +116,11 @@ def _create_brute_run_config_generator(self) -> BruteRunConfigGenerator:

def _can_binary_search_top_results(self) -> bool:
for model in self._models:
if model.parameters()["concurrency"] or model.parameters()["request_rate"]:
if (
model.parameters()["concurrency"]
or model.parameters()["request_rate"]
or self._config.is_llm_model()
):
return False

return True
@@ -80,7 +80,7 @@ def __init__(
self._curr_results: List = [[] for n in range(self._num_models)]
self._curr_generators: Dict[int, ConfigGeneratorInterface] = {}

self._skip_default_config = skip_default_config
self._skip_default_config = skip_default_config or config.is_llm_model()

def set_last_results(
self, measurements: List[Optional[RunConfigMeasurement]]
55 changes: 48 additions & 7 deletions model_analyzer/config/generate/perf_analyzer_config_generator.py
@@ -24,6 +24,7 @@
DEFAULT_INPUT_JSON_PATH,
DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD,
DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE,
DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY,
@@ -115,13 +116,14 @@ def __init__(
self._batch_sizes = sorted(model_parameters["batch_sizes"])
self._text_input_lengths = self._create_text_input_length_list()
self._max_token_counts = self._create_max_token_count_list()
self._request_periods = self._create_request_period_list()

self._perf_config_parameter_values = self._create_parameter_perf_config_values()
self._parameter_count = len(
utils.generate_parameter_combinations(self._perf_config_parameter_values)
)

self._input_json_filename = DEFAULT_INPUT_JSON_PATH + "/input-data.json"
self._input_json_base_filename = DEFAULT_INPUT_JSON_PATH + "/input-data-"

self._generate_perf_configs()

@@ -321,6 +323,20 @@ def _create_max_token_count_list(self) -> List[int]:
self._cli_config.run_config_search_max_max_token_count,
)

def _create_request_period_list(self) -> List[int]:
if not self._cli_config.is_llm_model():
return []

if self._model_parameters["request_period"]:
return sorted(self._model_parameters["request_period"])
elif self._cli_config.run_config_search_disable:
return [DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD]
else:
return utils.generate_doubled_list(
self._cli_config.run_config_search_min_request_period,
self._cli_config.run_config_search_max_request_period,
)
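
For readers unfamiliar with the search-range helpers, here is a minimal sketch of the doubling behavior assumed of `utils.generate_doubled_list` (this is not the repository's implementation), showing the request-period values that would result from the CLI min/max settings:

```python
# Sketch only: assumes the helper doubles from the minimum up to the maximum.
def generate_doubled_list(min_value: int, max_value: int) -> list:
    values = []
    current = min_value
    while current <= max_value:
        values.append(current)
        current *= 2
    return values

# e.g. --run-config-search-min-request-period 1 and
# --run-config-search-max-request-period 8 would sweep [1, 2, 4, 8].
print(generate_doubled_list(1, 8))
```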

def _generate_perf_configs(self) -> None:
parameter_combinations = utils.generate_parameter_combinations(
self._perf_config_parameter_values
@@ -377,8 +393,23 @@ def _extract_text_input_length(
def _update_perf_config_based_on_parameter_combination(
self, perf_config: PerfAnalyzerConfig, parameter_combination: Dict
) -> None:
if "request-parameter" in parameter_combination:
request_parameter = parameter_combination["request-parameter"]
max_tokens = self._extract_max_tokens_from_request_parameter(
request_parameter
)
parameter_combination["request-period"] = (
max_tokens
if max_tokens < parameter_combination["request-period"]
else parameter_combination["request-period"]
)

perf_config.update_config(parameter_combination)

def _extract_max_tokens_from_request_parameter(self, request_parameter: str) -> int:
_, max_tokens, _ = request_parameter.split(":")
return int(max_tokens)
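
To illustrate the two methods above, a small self-contained sketch of how a `request-parameter` string of the form `name:value:type` is parsed and used to cap the request period (the concrete values are made up for the example):

```python
def extract_max_tokens(request_parameter: str) -> int:
    # "max_tokens:256:int" -> 256
    _, max_tokens, _ = request_parameter.split(":")
    return int(max_tokens)

combination = {"request-parameter": "max_tokens:256:int", "request-period": 512}
max_tokens = extract_max_tokens(combination["request-parameter"])

# The request period is never allowed to exceed the number of generated tokens.
combination["request-period"] = min(max_tokens, combination["request-period"])
assert combination["request-period"] == 256
```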

def _update_perf_config_based_on_perf_analyzer_flags(
self, perf_config: PerfAnalyzerConfig
) -> None:
@@ -389,6 +420,7 @@ def _update_perf_config_based_on_inference_load(
) -> None:
if self._cli_config.is_llm_model():
perf_config.update_config({"periodic-concurrency-range": inference_load})
perf_config.update_config({"streaming": "True"})
elif self._cli_config.is_request_rate_specified(self._model_parameters):
perf_config.update_config({"request-rate-range": inference_load})
else:
@@ -400,21 +432,29 @@ def _update_perf_config_for_llm_model(
if not self._cli_config.is_llm_model():
return

input_json_filename = (
self._input_json_base_filename + f"{text_input_length}.json"
)
modified_input_dict = self._modify_text_in_input_dict(text_input_length)
self._write_modified_input_dict_to_file(modified_input_dict)
self._write_modified_input_dict_to_file(
modified_input_dict, input_json_filename
)

perf_config.update_config({"input-data": self._input_json_filename})
perf_config.update_config({"input-data": input_json_filename})

def _modify_text_in_input_dict(self, text_input_length: int) -> Dict:
modified_text = " ".join(repeat("Hello", text_input_length))

modified_input_dict = {k: v for k, v in self._llm_input_dict.items()}
modified_input_dict["data"][0]["text-input"] = modified_text
# FIXME: this needs to be updated once tritonserver/PA are updated TMA-1414
modified_input_dict["data"][0]["PROMPT"] = [modified_text]

return modified_input_dict

def _write_modified_input_dict_to_file(self, modified_input_dict: Dict) -> None:
with open(self._input_json_filename, "w") as f:
def _write_modified_input_dict_to_file(
self, modified_input_dict: Dict, input_json_filename: str
) -> None:
with open(input_json_filename, "w") as f:
json.dump(modified_input_dict, f)
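
As a rough illustration of the per-length input files written above, a file such as `input-data-3.json` would carry the prompt repeated to the requested text input length; the `PROMPT`/`text-input` field names mirror the FIXME'd schema in this diff and may change once tritonserver/PA are updated:

```python
import json
from itertools import repeat

text_input_length = 3
modified_text = " ".join(repeat("Hello", text_input_length))

# Hypothetical contents of input-data-3.json, following the fields used above.
example_input = {"data": [{"text-input": modified_text, "PROMPT": [modified_text]}]}

with open(f"input-data-{text_input_length}.json", "w") as f:
    json.dump(example_input, f)
```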

def _create_parameter_perf_config_values(self) -> dict:
@@ -424,8 +464,9 @@ def _create_parameter_perf_config_values(self) -> dict:

if self._cli_config.is_llm_model():
perf_config_values["request-parameter"] = [
"max_token:" + str(mtc) + ":int" for mtc in self._max_token_counts
f"max_tokens:{str(mtc)}:int" for mtc in self._max_token_counts
]
perf_config_values["request-period"] = self._request_periods
perf_config_values["text-input-length"] = self._text_input_lengths

return perf_config_values
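
A hedged sketch of the dictionary this method returns in LLM search mode; the keys follow the diff above, while the key for batch sizes and the concrete lists are illustrative assumptions that depend on the configured search ranges:

```python
# Illustrative shape of the swept perf_analyzer parameter values for an LLM model.
perf_config_values = {
    "batch-size": [1],
    "request-parameter": ["max_tokens:1:int", "max_tokens:2:int"],
    "request-period": [1, 2],
    "text-input-length": [1, 2],
}
# generate_parameter_combinations would then take the cross product of these
# lists, producing one perf_analyzer configuration per combination.
```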
14 changes: 11 additions & 3 deletions model_analyzer/perf_analyzer/perf_analyzer.py
@@ -99,7 +99,7 @@ class PerfAnalyzer:
]

llm_metric_table = [
["avg_first_latency", None, AvgFirstTokenLatency, "1000"],
["avg_first_token_latency", None, AvgFirstTokenLatency, "1000"],
["avg_token_to_token_latency", None, AvgTokenToTokenLatency, "1000"]
]
# yapf: enable
@@ -285,6 +285,14 @@ def _get_single_model_cmd(self, index):
if self._is_multi_model():
cmd += ["--enable-mpi"]
cmd += self._get_pa_cli_command(index).replace("=", " ").split()

# OPTME: There should be a more elegant way of determining how to add EOS
# We have to do it here because we use a dictionary to create the PA command
# and it already contains `--request-parameter`
if "--periodic-concurrency-range" in cmd:
cmd.append("--request-parameter")
cmd.append("ignore_eos:true:bool")

return cmd
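
For context, a small sketch of the resulting command assembly for an LLM (periodic-concurrency) run; the model name and flag values here are invented for illustration and are not taken from the PR:

```python
# Illustrative only: "my_llm_model" and the flag values are made up.
cmd = [
    "perf_analyzer", "-m", "my_llm_model",
    "--periodic-concurrency-range", "16:32:4",
    "--request-parameter", "max_tokens:256:int",
]

# Mirrors the EOS handling above: ignore_eos is appended whenever a
# periodic-concurrency sweep is present in the assembled command.
if "--periodic-concurrency-range" in cmd:
    cmd += ["--request-parameter", "ignore_eos:true:bool"]

print(" ".join(cmd))
```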

def _get_pa_cli_command(self, index):
@@ -539,7 +547,7 @@ def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float:
request["response_timestamps"][0] - request["timestamp"]
)

avg_first_token_latency = mean(total_first_token_latencies)
avg_first_token_latency = float(mean(total_first_token_latencies))

return avg_first_token_latency

@@ -554,7 +562,7 @@ def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float:

token_to_token_latencies.append(mean(response_to_response_latencies))

avg_token_to_token_latency = mean(token_to_token_latencies)
avg_token_to_token_latency = float(mean(token_to_token_latencies))

return avg_token_to_token_latency
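
To make the two LLM latency metrics concrete, a toy example of the same arithmetic on hand-written timestamps (nanoseconds); only the per-request fields referenced in the diff are modeled, not the full profile-export structure:

```python
from statistics import mean

# One request with three streamed responses, timestamps in nanoseconds.
requests = [
    {"timestamp": 0, "response_timestamps": [5_000_000, 9_000_000, 12_000_000]},
]

first_token_latencies = [
    r["response_timestamps"][0] - r["timestamp"] for r in requests
]
avg_first_token_latency = float(mean(first_token_latencies))  # 5000000.0

token_to_token_latencies = []
for r in requests:
    ts = r["response_timestamps"]
    token_to_token_latencies.append(mean(b - a for a, b in zip(ts, ts[1:])))
avg_token_to_token_latency = float(mean(token_to_token_latencies))  # 3500000.0
```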

5 changes: 4 additions & 1 deletion model_analyzer/record/metrics_manager.py
@@ -572,8 +572,11 @@ def _run_perf_analyzer(
self._handle_unsuccessful_perf_analyzer_run(perf_analyzer)
return (None, None)

# FIXME: PA does not return a latency report file if an export report file is specified
perf_records = (
perf_analyzer.get_perf_records() + perf_analyzer.get_llm_records()
perf_analyzer.get_llm_records()
if self._config.is_llm_model()
else perf_analyzer.get_perf_records()
)
gpu_records = perf_analyzer.get_gpu_records()

2 changes: 1 addition & 1 deletion model_analyzer/record/record.py
@@ -101,7 +101,7 @@ def __init__(self, value, timestamp):
Parameters
----------
value : float or int
The value of the GPU metrtic
The value of the GPU metric
timestamp : int
The timestamp for the record in nanoseconds
"""
22 changes: 18 additions & 4 deletions tests/common/test_utils.py
@@ -29,6 +29,8 @@
DEFAULT_OUTPUT_MODEL_REPOSITORY,
DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD,
DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY,
DEFAULT_TRITON_GRPC_ENDPOINT,
DEFAULT_TRITON_HTTP_ENDPOINT,
@@ -244,6 +246,8 @@ def construct_perf_analyzer_config(
periodic_concurrency=DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY,
request_rate=None,
max_token_count=DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
text_input_length=DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
request_period=DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD,
launch_mode=DEFAULT_TRITON_LAUNCH_MODE,
client_protocol=DEFAULT_CLIENT_PROTOCOL,
perf_analyzer_flags=None,
@@ -266,6 +270,12 @@ def construct_perf_analyzer_config(
The concurrency value for this PA configuration
periodic_concurrency: list
The periodic concurrency value for this PA configuration
max_token_count: int
The max token count for this PA configuration
text_input_length: int
The text input length for this PA configuration
request_period: int
The request period for this PA configuration
request_rate: int
The request rate value for this PA configuration
launch_mode: str
@@ -299,10 +309,14 @@ def construct_perf_analyzer_config(
pa_config._args["concurrency-range"] = concurrency

if llm_search_mode:
pa_config._args["request-parameter"] = (
"max_token:" + str(max_token_count) + ":int"
)
pa_config._args["input-data"] = DEFAULT_INPUT_JSON_PATH + "/input-data.json"
pa_config._args["request-parameter"] = f"max_tokens:{str(max_token_count)}:int"

pa_config._args["request-period"] = request_period
pa_config._args[
"input-data"
] = f"{DEFAULT_INPUT_JSON_PATH}/input-data-{str(text_input_length)}.json"

pa_config._args["streaming"] = "True"

pa_config._args["measurement-mode"] = DEFAULT_MEASUREMENT_MODE

14 changes: 12 additions & 2 deletions tests/test_perf_analyzer_config_generator.py
@@ -596,6 +596,8 @@ def test_llm_search_max_token_count(self):
"32",
"--run-config-search-max-text-input-length",
"1",
"--run-config-search-max-request-period",
"1",
]
self._run_and_test_perf_analyzer_config_generator(
yaml_str, expected_configs, pa_cli_args
@@ -622,11 +624,13 @@ def test_llm_search_text_input_length(self):
periodic_concurrencies = ["16:32:4", "16:32:8", "16:32:16"]

expected_configs = []
for _ in text_input_lengths:
for til in text_input_lengths:
for pc in periodic_concurrencies:
expected_configs.append(
construct_perf_analyzer_config(
llm_search_mode=True, periodic_concurrency=pc
llm_search_mode=True,
periodic_concurrency=pc,
text_input_length=til,
)
)

@@ -636,6 +640,8 @@
"32",
"--run-config-search-max-max-token-count",
"1",
"--run-config-search-max-request-period",
"1",
]
self._run_and_test_perf_analyzer_config_generator(
yaml_str, expected_configs, pa_cli_args
@@ -673,6 +679,8 @@ def test_periodic_concurrency_parameter(self):
"1",
"--run-config-search-max-text-input-length",
"1",
"--run-config-search-max-request-period",
"1",
]
self._run_and_test_perf_analyzer_config_generator(
yaml_str, expected_configs, pa_cli_args
@@ -722,6 +730,8 @@ def test_periodic_concurrency_search(self):
"64",
"--run-config-search-min-periodic-concurrency-step",
"8",
"--run-config-search-max-request-period",
"1",
]
self._run_and_test_perf_analyzer_config_generator(
yaml_str, expected_configs, pa_cli_args