From ee7565d7f81edc293d9ab1f13b045e7b5f9cb3b2 Mon Sep 17 00:00:00 2001
From: braf
Date: Wed, 18 Oct 2023 15:44:16 +0000
Subject: [PATCH] Changes to fix live run

---
 model_analyzer/analyzer.py                     |  5 ++-
 .../automatic_model_config_generator.py        |  2 +-
 ...plus_binary_search_run_config_generator.py  |  6 +++-
 .../generate/brute_run_config_generator.py     |  2 +-
 .../perf_analyzer_config_generator.py          | 31 ++++++++++++++-----
 model_analyzer/perf_analyzer/perf_analyzer.py  | 14 +++++++--
 model_analyzer/record/metrics_manager.py       |  5 ++-
 model_analyzer/record/record.py                |  2 +-
 tests/common/test_utils.py                     | 16 ++++++++--
 tests/test_perf_analyzer_config_generator.py   |  7 +++--
 10 files changed, 70 insertions(+), 20 deletions(-)

diff --git a/model_analyzer/analyzer.py b/model_analyzer/analyzer.py
index c68acae3f..750c2a8ba 100755
--- a/model_analyzer/analyzer.py
+++ b/model_analyzer/analyzer.py
@@ -137,7 +137,10 @@ def profile(
         if not self._config.skip_summary_reports:
             self._create_summary_tables(verbose)
             self._create_summary_reports(mode)
-            self._create_detailed_reports(mode)
+
+            # FIXME: need to figure out detailed reporting for LLMs
+            if not self._config.is_llm_model():
+                self._create_detailed_reports(mode)
 
         self._check_for_perf_analyzer_errors()
 
diff --git a/model_analyzer/config/generate/automatic_model_config_generator.py b/model_analyzer/config/generate/automatic_model_config_generator.py
index 283f112d0..c4d7595b4 100755
--- a/model_analyzer/config/generate/automatic_model_config_generator.py
+++ b/model_analyzer/config/generate/automatic_model_config_generator.py
@@ -88,7 +88,7 @@ def __init__(
 
         self._reset_max_batch_size()
 
-        if not self._early_exit_enable:
+        if not self._early_exit_enable and not self._config.is_llm_model():
             raise TritonModelAnalyzerException(
                 "Early exit disable is not supported in automatic model config generator"
             )
diff --git a/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py b/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py
index 78d55a1bc..efe403041 100755
--- a/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py
+++ b/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py
@@ -116,7 +116,11 @@ def _create_brute_run_config_generator(self) -> BruteRunConfigGenerator:
 
     def _can_binary_search_top_results(self) -> bool:
         for model in self._models:
-            if model.parameters()["concurrency"] or model.parameters()["request_rate"]:
+            if (
+                model.parameters()["concurrency"]
+                or model.parameters()["request_rate"]
+                or self._config.is_llm_model()
+            ):
                 return False
 
         return True
diff --git a/model_analyzer/config/generate/brute_run_config_generator.py b/model_analyzer/config/generate/brute_run_config_generator.py
index d226811aa..151e97fde 100755
--- a/model_analyzer/config/generate/brute_run_config_generator.py
+++ b/model_analyzer/config/generate/brute_run_config_generator.py
@@ -80,7 +80,7 @@ def __init__(
         self._curr_results: List = [[] for n in range(self._num_models)]
         self._curr_generators: Dict[int, ConfigGeneratorInterface] = {}
 
-        self._skip_default_config = skip_default_config
+        self._skip_default_config = skip_default_config or config.is_llm_model()
 
     def set_last_results(
         self, measurements: List[Optional[RunConfigMeasurement]]
diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py
index 104ed79e6..c3219257c 100755
--- a/model_analyzer/config/generate/perf_analyzer_config_generator.py
+++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py
@@ -121,7 +121,7 @@ def __init__(
             utils.generate_parameter_combinations(self._perf_config_parameter_values)
         )
 
-        self._input_json_filename = DEFAULT_INPUT_JSON_PATH + "/input-data.json"
+        self._input_json_base_filename = DEFAULT_INPUT_JSON_PATH + "/input-data-"
 
         self._generate_perf_configs()
 
@@ -377,6 +377,15 @@ def _extract_text_input_length(
     def _update_perf_config_based_on_parameter_combination(
         self, perf_config: PerfAnalyzerConfig, parameter_combination: Dict
     ) -> None:
+        if "request-parameter" in parameter_combination:
+            request_parameter = parameter_combination["request-parameter"]
+            max_token_start = request_parameter.find(":")
+            max_token_stop = request_parameter.find(":", max_token_start + 1)
+            max_token = int(request_parameter[max_token_start + 1 : max_token_stop])
+            parameter_combination["request-period"] = (
+                max_token if max_token < 10 else 10
+            )
+
         perf_config.update_config(parameter_combination)
 
     def _update_perf_config_based_on_perf_analyzer_flags(
@@ -389,6 +398,7 @@ def _update_perf_config_based_on_inference_load(
     ) -> None:
         if self._cli_config.is_llm_model():
             perf_config.update_config({"periodic-concurrency-range": inference_load})
+            perf_config.update_config({"streaming": "True"})
         elif self._cli_config.is_request_rate_specified(self._model_parameters):
             perf_config.update_config({"request-rate-range": inference_load})
         else:
@@ -400,21 +410,28 @@ def _update_perf_config_for_llm_model(
         if not self._cli_config.is_llm_model():
             return
 
+        input_json_filename = (
+            self._input_json_base_filename + f"{text_input_length}.json"
+        )
         modified_input_dict = self._modify_text_in_input_dict(text_input_length)
-        self._write_modified_input_dict_to_file(modified_input_dict)
+        self._write_modified_input_dict_to_file(
+            modified_input_dict, input_json_filename
+        )
 
-        perf_config.update_config({"input-data": self._input_json_filename})
+        perf_config.update_config({"input-data": input_json_filename})
 
     def _modify_text_in_input_dict(self, text_input_length: int) -> Dict:
         modified_text = " ".join(repeat("Hello", text_input_length))
 
         modified_input_dict = {k: v for k, v in self._llm_input_dict.items()}
-        modified_input_dict["data"][0]["text-input"] = modified_text
+        modified_input_dict["data"][0]["PROMPT"] = [modified_text]
 
         return modified_input_dict
 
-    def _write_modified_input_dict_to_file(self, modified_input_dict: Dict) -> None:
-        with open(self._input_json_filename, "w") as f:
+    def _write_modified_input_dict_to_file(
+        self, modified_input_dict: Dict, input_json_filename: str
+    ) -> None:
+        with open(input_json_filename, "w") as f:
             json.dump(modified_input_dict, f)
 
     def _create_parameter_perf_config_values(self) -> dict:
@@ -424,7 +441,7 @@ def _create_parameter_perf_config_values(self) -> dict:
 
         if self._cli_config.is_llm_model():
             perf_config_values["request-parameter"] = [
-                "max_token:" + str(mtc) + ":int" for mtc in self._max_token_counts
+                "max_tokens:" + str(mtc) + ":int" for mtc in self._max_token_counts
             ]
             perf_config_values["text-input-length"] = self._text_input_lengths
 
diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py
index 49f15f5a2..2decbf6f2 100755
--- a/model_analyzer/perf_analyzer/perf_analyzer.py
+++ b/model_analyzer/perf_analyzer/perf_analyzer.py
@@ -99,7 +99,7 @@ class PerfAnalyzer:
     ]
 
     llm_metric_table = [
-        ["avg_first_latency", None, AvgFirstTokenLatency, "1000"],
+        ["avg_first_token_latency", None, AvgFirstTokenLatency, "1000"],
         ["avg_token_to_token_latency", None, AvgTokenToTokenLatency, "1000"]
     ]
     # yapf: enable
@@ -285,6 +285,14 @@ def _get_single_model_cmd(self, index):
         if self._is_multi_model():
             cmd += ["--enable-mpi"]
         cmd += self._get_pa_cli_command(index).replace("=", " ").split()
+
+        # OPTME: There should be a more elegant way of determining how to add EOS
+        # We have to do it here because we use a dictionary to create the PA command
+        # and it already contains `--request-parameter`
+        if "--periodic-concurrency-range" in cmd:
+            cmd.append("--request-parameter")
+            cmd.append("ignore_eos:true:bool")
+
         return cmd
 
     def _get_pa_cli_command(self, index):
@@ -539,7 +547,7 @@ def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float:
                 request["response_timestamps"][0] - request["timestamp"]
             )
 
-        avg_first_token_latency = mean(total_first_token_latencies)
+        avg_first_token_latency = float(mean(total_first_token_latencies))
 
         return avg_first_token_latency
 
@@ -554,7 +562,7 @@ def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float:
 
             token_to_token_latencies.append(mean(response_to_response_latencies))
 
-        avg_token_to_token_latency = mean(token_to_token_latencies)
+        avg_token_to_token_latency = float(mean(token_to_token_latencies))
 
         return avg_token_to_token_latency
 
diff --git a/model_analyzer/record/metrics_manager.py b/model_analyzer/record/metrics_manager.py
index fe77f6eb8..e703e19a2 100755
--- a/model_analyzer/record/metrics_manager.py
+++ b/model_analyzer/record/metrics_manager.py
@@ -572,8 +572,11 @@ def _run_perf_analyzer(
             self._handle_unsuccessful_perf_analyzer_run(perf_analyzer)
            return (None, None)
 
+        # FIXME: PA does not return a latency report file if an export report file is specified
         perf_records = (
-            perf_analyzer.get_perf_records() + perf_analyzer.get_llm_records()
+            perf_analyzer.get_llm_records()
+            if self._config.is_llm_model()
+            else perf_analyzer.get_perf_records()
         )
         gpu_records = perf_analyzer.get_gpu_records()
 
diff --git a/model_analyzer/record/record.py b/model_analyzer/record/record.py
index 23aa9e50f..8a55b6a88 100755
--- a/model_analyzer/record/record.py
+++ b/model_analyzer/record/record.py
@@ -101,7 +101,7 @@ def __init__(self, value, timestamp):
         Parameters
         ----------
         value : float or int
-            The value of the GPU metrtic
+            The value of the GPU metric
         timestamp : int
             The timestamp for the record in nanoseconds
         """
diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py
index e8448ae98..7e27824cf 100755
--- a/tests/common/test_utils.py
+++ b/tests/common/test_utils.py
@@ -29,6 +29,7 @@
     DEFAULT_OUTPUT_MODEL_REPOSITORY,
     DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
     DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
+    DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
     DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY,
     DEFAULT_TRITON_GRPC_ENDPOINT,
     DEFAULT_TRITON_HTTP_ENDPOINT,
@@ -244,6 +245,7 @@ def construct_perf_analyzer_config(
     periodic_concurrency=DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY,
     request_rate=None,
     max_token_count=DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
+    text_input_length=DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
     launch_mode=DEFAULT_TRITON_LAUNCH_MODE,
     client_protocol=DEFAULT_CLIENT_PROTOCOL,
     perf_analyzer_flags=None,
@@ -266,6 +268,10 @@ def construct_perf_analyzer_config(
         The concurrency value for this PA configuration
     periodic_concurrency: list
         The periodic concurrency value for this PA configuration
+    max_token_count: int
+        The max token count for this PA configuration
+    text_input_length: int
+        The text input length for this PA configuration
     request_rate: int
         The request rate value for this PA configuration
     launch_mode: str
@@ -300,9 +306,15 @@ def construct_perf_analyzer_config(
 
     if llm_search_mode:
         pa_config._args["request-parameter"] = (
-            "max_token:" + str(max_token_count) + ":int"
+            "max_tokens:" + str(max_token_count) + ":int"
         )
-        pa_config._args["input-data"] = DEFAULT_INPUT_JSON_PATH + "/input-data.json"
+        pa_config._args["request-period"] = (
+            max_token_count if max_token_count < 10 else 10
+        )
+        pa_config._args["input-data"] = (
+            DEFAULT_INPUT_JSON_PATH + "/input-data-" + str(text_input_length) + ".json"
+        )
+        pa_config._args["streaming"] = "True"
 
     pa_config._args["measurement-mode"] = DEFAULT_MEASUREMENT_MODE
 
diff --git a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py
index a405e2df6..818fb2564 100755
--- a/tests/test_perf_analyzer_config_generator.py
+++ b/tests/test_perf_analyzer_config_generator.py
@@ -622,11 +622,13 @@ def test_llm_search_text_input_length(self):
         periodic_concurrencies = ["16:32:4", "16:32:8", "16:32:16"]
 
         expected_configs = []
-        for _ in text_input_lengths:
+        for til in text_input_lengths:
             for pc in periodic_concurrencies:
                 expected_configs.append(
                     construct_perf_analyzer_config(
-                        llm_search_mode=True, periodic_concurrency=pc
+                        llm_search_mode=True,
+                        periodic_concurrency=pc,
+                        text_input_length=til,
                     )
                 )
 
@@ -1005,6 +1007,7 @@ def _run_and_test_perf_analyzer_config_generator(
             self.assertEqual(
                 expected_configs[i]._options, perf_analyzer_configs[i]._options
            )
+            self.maxDiff = None
             self.assertEqual(expected_configs[i]._args, perf_analyzer_configs[i]._args)
             self.assertEqual(
                 expected_configs[i]._additive_args,
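
Notes on the changes above. The request-period handling in _update_perf_config_based_on_parameter_combination parses the count out of the "max_tokens:<count>:int" request parameter and caps the period at 10 responses. A minimal standalone sketch of that parsing, assuming the same dictionary shape as parameter_combination; the helper name derive_request_period is illustrative and not part of the patch:

from typing import Dict


def derive_request_period(parameter_combination: Dict) -> None:
    # Pull the middle field out of e.g. "max_tokens:256:int" and cap the
    # request period at 10, mirroring the generator change in the patch.
    if "request-parameter" in parameter_combination:
        request_parameter = parameter_combination["request-parameter"]
        start = request_parameter.find(":")
        stop = request_parameter.find(":", start + 1)
        max_token = int(request_parameter[start + 1 : stop])
        parameter_combination["request-period"] = max_token if max_token < 10 else 10


combo = {"request-parameter": "max_tokens:256:int"}
derive_request_period(combo)
print(combo["request-period"])  # 10, because 256 >= 10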
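
Per-length input files: _update_perf_config_for_llm_model now writes one input-data-<length>.json per text-input length and points --input-data at it, with the prompt stored under PROMPT as a single-element list. A sketch of that flow, assuming a base dictionary shaped like {"data": [{"PROMPT": [...]}]}; the base dictionary contents and output directory below are placeholders, not values from the patch:

import json
from itertools import repeat
from pathlib import Path

# Assumed stand-in for self._llm_input_dict; the real contents come from the config.
BASE_INPUT_DICT = {"data": [{"PROMPT": [""]}]}


def write_input_json(text_input_length: int, output_dir: str = ".") -> str:
    # Build a prompt of <length> repeated words, as the generator does.
    prompt = " ".join(repeat("Hello", text_input_length))

    # Shallow copy, mirroring the patch; the nested "data" entries are shared.
    input_dict = {k: v for k, v in BASE_INPUT_DICT.items()}
    input_dict["data"][0]["PROMPT"] = [prompt]

    filename = str(Path(output_dir) / f"input-data-{text_input_length}.json")
    with open(filename, "w") as f:
        json.dump(input_dict, f)
    return filename


for length in (1, 128, 1024):
    print(write_input_json(length))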
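
EOS handling in _get_single_model_cmd: when the assembled command already uses --periodic-concurrency-range, a second --request-parameter with ignore_eos:true:bool is appended, because the dictionary that builds the command already carries the max_tokens request parameter. A small sketch of that post-processing step; the surrounding command list is illustrative only:

from typing import List


def add_ignore_eos(cmd: List[str]) -> List[str]:
    # Only LLM runs use periodic concurrency, so key off that flag.
    if "--periodic-concurrency-range" in cmd:
        cmd.append("--request-parameter")
        cmd.append("ignore_eos:true:bool")
    return cmd


cmd = ["perf_analyzer", "-m", "my_model", "--streaming",
       "--periodic-concurrency-range", "16:32:4"]
print(add_ignore_eos(cmd))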
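
The two LLM records are plain averages over the profile export: time from request start to the first response, and the mean gap between consecutive responses within a request. A sketch of both calculations, assuming an export layout of experiments[0]["requests"][*] with "timestamp" and "response_timestamps" fields (the patch context only shows the per-request fields); the example numbers are made up:

from statistics import mean
from typing import Dict


def avg_first_token_latency(llm_output: Dict) -> float:
    # Average of (first response timestamp - request start) across requests.
    latencies = [
        request["response_timestamps"][0] - request["timestamp"]
        for request in llm_output["experiments"][0]["requests"]
    ]
    return float(mean(latencies))


def avg_token_to_token_latency(llm_output: Dict) -> float:
    # Average, over requests, of the mean gap between consecutive responses.
    per_request_means = []
    for request in llm_output["experiments"][0]["requests"]:
        ts = request["response_timestamps"]
        gaps = [b - a for a, b in zip(ts, ts[1:])]
        per_request_means.append(mean(gaps))
    return float(mean(per_request_means))


example = {
    "experiments": [
        {"requests": [{"timestamp": 0, "response_timestamps": [5, 9, 15]}]}
    ]
}
print(avg_first_token_latency(example))     # 5.0
print(avg_token_to_token_latency(example))  # 5.0, i.e. (4 + 6) / 2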