Commit ee7565d: Changes to fix live run

nv-braf committed Oct 18, 2023
1 parent 2ec0743 commit ee7565d

Showing 10 changed files with 70 additions and 20 deletions.
5 changes: 4 additions & 1 deletion model_analyzer/analyzer.py
@@ -137,7 +137,10 @@ def profile(
         if not self._config.skip_summary_reports:
             self._create_summary_tables(verbose)
             self._create_summary_reports(mode)
-            self._create_detailed_reports(mode)
+
+            # FIXME: need to figure out detailed reporting for LLMs
+            if not self._config.is_llm_model():
+                self._create_detailed_reports(mode)

         self._check_for_perf_analyzer_errors()
@@ -88,7 +88,7 @@ def __init__(

         self._reset_max_batch_size()

-        if not self._early_exit_enable:
+        if not self._early_exit_enable and not self._config.is_llm_model():
             raise TritonModelAnalyzerException(
                 "Early exit disable is not supported in automatic model config generator"
             )
@@ -116,7 +116,11 @@ def _create_brute_run_config_generator(self) -> BruteRunConfigGenerator:

     def _can_binary_search_top_results(self) -> bool:
         for model in self._models:
-            if model.parameters()["concurrency"] or model.parameters()["request_rate"]:
+            if (
+                model.parameters()["concurrency"]
+                or model.parameters()["request_rate"]
+                or self._config.is_llm_model()
+            ):
                 return False

         return True
@@ -80,7 +80,7 @@ def __init__(
         self._curr_results: List = [[] for n in range(self._num_models)]
         self._curr_generators: Dict[int, ConfigGeneratorInterface] = {}

-        self._skip_default_config = skip_default_config
+        self._skip_default_config = skip_default_config or config.is_llm_model()

     def set_last_results(
         self, measurements: List[Optional[RunConfigMeasurement]]
31 changes: 24 additions & 7 deletions model_analyzer/config/generate/perf_analyzer_config_generator.py
@@ -121,7 +121,7 @@ def __init__(
             utils.generate_parameter_combinations(self._perf_config_parameter_values)
         )

-        self._input_json_filename = DEFAULT_INPUT_JSON_PATH + "/input-data.json"
+        self._input_json_base_filename = DEFAULT_INPUT_JSON_PATH + "/input-data-"

         self._generate_perf_configs()
@@ -377,6 +377,15 @@ def _extract_text_input_length(
     def _update_perf_config_based_on_parameter_combination(
         self, perf_config: PerfAnalyzerConfig, parameter_combination: Dict
     ) -> None:
+        if "request-parameter" in parameter_combination:
+            request_parameter = parameter_combination["request-parameter"]
+            max_token_start = request_parameter.find(":")
+            max_token_stop = request_parameter.find(":", max_token_start + 1)
+            max_token = int(request_parameter[max_token_start + 1 : max_token_stop])
+            parameter_combination["request-period"] = (
+                max_token if max_token < 10 else 10
+            )
+
         perf_config.update_config(parameter_combination)

     def _update_perf_config_based_on_perf_analyzer_flags(
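
Note: the request-parameter string follows Perf Analyzer's name:value:type format, so the slicing above extracts the value field and caps the derived request period at 10. A minimal standalone sketch of that derivation, using a hypothetical parameter string:

# Sketch only; the "max_tokens:256:int" sample string is an assumption.
parameter_combination = {"request-parameter": "max_tokens:256:int"}

request_parameter = parameter_combination["request-parameter"]
_, value, _ = request_parameter.split(":")  # ("max_tokens", "256", "int")
max_token = int(value)

# The request period is the max token count, capped at 10.
parameter_combination["request-period"] = max_token if max_token < 10 else 10
print(parameter_combination["request-period"])  # prints 10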
@@ -389,6 +398,7 @@ def _update_perf_config_based_on_inference_load(
     ) -> None:
         if self._cli_config.is_llm_model():
             perf_config.update_config({"periodic-concurrency-range": inference_load})
+            perf_config.update_config({"streaming": "True"})
         elif self._cli_config.is_request_rate_specified(self._model_parameters):
             perf_config.update_config({"request-rate-range": inference_load})
         else:
@@ -400,21 +410,28 @@ def _update_perf_config_for_llm_model(
         if not self._cli_config.is_llm_model():
             return

+        input_json_filename = (
+            self._input_json_base_filename + f"{text_input_length}.json"
+        )
         modified_input_dict = self._modify_text_in_input_dict(text_input_length)
-        self._write_modified_input_dict_to_file(modified_input_dict)
+        self._write_modified_input_dict_to_file(
+            modified_input_dict, input_json_filename
+        )

-        perf_config.update_config({"input-data": self._input_json_filename})
+        perf_config.update_config({"input-data": input_json_filename})

     def _modify_text_in_input_dict(self, text_input_length: int) -> Dict:
         modified_text = " ".join(repeat("Hello", text_input_length))

         modified_input_dict = {k: v for k, v in self._llm_input_dict.items()}
-        modified_input_dict["data"][0]["text-input"] = modified_text
+        modified_input_dict["data"][0]["PROMPT"] = [modified_text]

         return modified_input_dict

-    def _write_modified_input_dict_to_file(self, modified_input_dict: Dict) -> None:
-        with open(self._input_json_filename, "w") as f:
+    def _write_modified_input_dict_to_file(
+        self, modified_input_dict: Dict, input_json_filename: str
+    ) -> None:
+        with open(input_json_filename, "w") as f:
             json.dump(modified_input_dict, f)

     def _create_parameter_perf_config_values(self) -> dict:
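
Note: a sketch of the per-length input rewrite above. The base dict shape {"data": [{"PROMPT": [...]}]} is an assumption; only the PROMPT rewrite and the input-data-<length>.json naming come from this diff.

import json
from itertools import repeat

llm_input_dict = {"data": [{"PROMPT": ["placeholder"]}]}  # hypothetical base dict
text_input_length = 3

modified_text = " ".join(repeat("Hello", text_input_length))
modified_input_dict = {k: v for k, v in llm_input_dict.items()}
modified_input_dict["data"][0]["PROMPT"] = [modified_text]

with open(f"input-data-{text_input_length}.json", "w") as f:
    json.dump(modified_input_dict, f)
# input-data-3.json now holds {"data": [{"PROMPT": ["Hello Hello Hello"]}]}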
@@ -424,7 +441,7 @@

         if self._cli_config.is_llm_model():
             perf_config_values["request-parameter"] = [
-                "max_token:" + str(mtc) + ":int" for mtc in self._max_token_counts
+                "max_tokens:" + str(mtc) + ":int" for mtc in self._max_token_counts
             ]
             perf_config_values["text-input-length"] = self._text_input_lengths
14 changes: 11 additions & 3 deletions model_analyzer/perf_analyzer/perf_analyzer.py
@@ -99,7 +99,7 @@ class PerfAnalyzer:
     ]

     llm_metric_table = [
-        ["avg_first_latency", None, AvgFirstTokenLatency, "1000"],
+        ["avg_first_token_latency", None, AvgFirstTokenLatency, "1000"],
         ["avg_token_to_token_latency", None, AvgTokenToTokenLatency, "1000"]
     ]
     # yapf: enable
@@ -285,6 +285,14 @@ def _get_single_model_cmd(self, index):
         if self._is_multi_model():
             cmd += ["--enable-mpi"]
         cmd += self._get_pa_cli_command(index).replace("=", " ").split()
+
+        # OPTME: There should be a more elegant way of determining how to add EOS
+        # We have to do it here because we use a dictionary to create the PA command
+        # and it already contains `--request-parameter`
+        if "--periodic-concurrency-range" in cmd:
+            cmd.append("--request-parameter")
+            cmd.append("ignore_eos:true:bool")
+
         return cmd

     def _get_pa_cli_command(self, index):
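
Note: the ignore_eos request parameter appears intended to make generation run to the full max token count instead of stopping early at an end-of-sequence token (that rationale is our reading, not stated in the commit). A small sketch on a hypothetical pre-built command list:

# Hypothetical command; only the periodic-concurrency check and the
# ignore_eos request parameter come from this diff.
cmd = ["perf_analyzer", "-m", "my_llm",
       "--periodic-concurrency-range", "16:32:4",
       "--request-parameter", "max_tokens:256:int"]

if "--periodic-concurrency-range" in cmd:
    cmd.append("--request-parameter")
    cmd.append("ignore_eos:true:bool")
# cmd now carries two --request-parameter flags, one per parameter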
@@ -539,7 +547,7 @@ def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float:
                 request["response_timestamps"][0] - request["timestamp"]
             )

-        avg_first_token_latency = mean(total_first_token_latencies)
+        avg_first_token_latency = float(mean(total_first_token_latencies))

         return avg_first_token_latency

@@ -554,7 +562,7 @@ def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float:

             token_to_token_latencies.append(mean(response_to_response_latencies))

-        avg_token_to_token_latency = mean(token_to_token_latencies)
+        avg_token_to_token_latency = float(mean(token_to_token_latencies))

         return avg_token_to_token_latency
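
Note: both metrics are computed from per-request timestamps in the profile export, and the new float() casts normalize the return type. A worked sketch on hypothetical timestamps:

from statistics import mean

# Hypothetical export record; field names follow the code above.
request = {
    "timestamp": 100,                        # request send time (ns)
    "response_timestamps": [140, 150, 170],  # one per streamed response (ns)
}

# First-token latency: first response minus request send time -> 40
first_token_latency = request["response_timestamps"][0] - request["timestamp"]

# Token-to-token latency: mean gap between consecutive responses -> 15.0
ts = request["response_timestamps"]
avg_token_to_token = float(mean(b - a for a, b in zip(ts, ts[1:])))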

5 changes: 4 additions & 1 deletion model_analyzer/record/metrics_manager.py
@@ -572,8 +572,11 @@ def _run_perf_analyzer(
             self._handle_unsuccessful_perf_analyzer_run(perf_analyzer)
             return (None, None)

+        # FIXME: PA does not return a latency report file if an export report file is specified
         perf_records = (
-            perf_analyzer.get_perf_records() + perf_analyzer.get_llm_records()
+            perf_analyzer.get_llm_records()
+            if self._config.is_llm_model()
+            else perf_analyzer.get_perf_records()
         )
         gpu_records = perf_analyzer.get_gpu_records()

2 changes: 1 addition & 1 deletion model_analyzer/record/record.py
@@ -101,7 +101,7 @@ def __init__(self, value, timestamp):
         Parameters
         ----------
         value : float or int
-            The value of the GPU metrtic
+            The value of the GPU metric
         timestamp : int
             The timestamp for the record in nanoseconds
         """
16 changes: 14 additions & 2 deletions tests/common/test_utils.py
@@ -29,6 +29,7 @@
     DEFAULT_OUTPUT_MODEL_REPOSITORY,
     DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
     DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
+    DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
     DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY,
     DEFAULT_TRITON_GRPC_ENDPOINT,
     DEFAULT_TRITON_HTTP_ENDPOINT,
@@ -244,6 +245,7 @@ def construct_perf_analyzer_config(
     periodic_concurrency=DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY,
     request_rate=None,
     max_token_count=DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
+    text_input_length=DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
     launch_mode=DEFAULT_TRITON_LAUNCH_MODE,
     client_protocol=DEFAULT_CLIENT_PROTOCOL,
     perf_analyzer_flags=None,
@@ -266,6 +268,10 @@
         The concurrency value for this PA configuration
     periodic_concurrency: list
         The periodic concurrency value for this PA configuration
+    max_token_count: int
+        The max token count for this PA configuration
+    text_input_length: int
+        The text input length for this PA configuration
     request_rate: int
         The request rate value for this PA configuration
     launch_mode: str
@@ -300,9 +306,15 @@

     if llm_search_mode:
         pa_config._args["request-parameter"] = (
-            "max_token:" + str(max_token_count) + ":int"
+            "max_tokens:" + str(max_token_count) + ":int"
         )
-        pa_config._args["input-data"] = DEFAULT_INPUT_JSON_PATH + "/input-data.json"
+        pa_config._args["request-period"] = (
+            max_token_count if max_token_count < 10 else 10
+        )
+        pa_config._args["input-data"] = (
+            DEFAULT_INPUT_JSON_PATH + "/input-data-" + str(text_input_length) + ".json"
+        )
+        pa_config._args["streaming"] = "True"

     pa_config._args["measurement-mode"] = DEFAULT_MEASUREMENT_MODE

7 changes: 5 additions & 2 deletions tests/test_perf_analyzer_config_generator.py
@@ -622,11 +622,13 @@ def test_llm_search_text_input_length(self):
         periodic_concurrencies = ["16:32:4", "16:32:8", "16:32:16"]

         expected_configs = []
-        for _ in text_input_lengths:
+        for til in text_input_lengths:
             for pc in periodic_concurrencies:
                 expected_configs.append(
                     construct_perf_analyzer_config(
-                        llm_search_mode=True, periodic_concurrency=pc
+                        llm_search_mode=True,
+                        periodic_concurrency=pc,
+                        text_input_length=til,
                     )
                 )

@@ -1005,6 +1007,7 @@ def _run_and_test_perf_analyzer_config_generator(
             self.assertEqual(
                 expected_configs[i]._options, perf_analyzer_configs[i]._options
             )
+            self.maxDiff = None
             self.assertEqual(expected_configs[i]._args, perf_analyzer_configs[i]._args)
             self.assertEqual(
                 expected_configs[i]._additive_args,
