From 49b4e15fefbb860cbe233771efeec3a418b10ccf Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 5 Oct 2023 02:20:19 +0000 Subject: [PATCH 01/23] Initial coding complete --- .../automatic_model_config_generator.py | 11 +- .../perf_analyzer_config_generator.py | 109 ++++++++++++++++- .../config/input/config_command_profile.py | 36 ++++++ model_analyzer/perf_analyzer/perf_config.py | 4 + tests/test_config.py | 114 ++++++++++++++++-- tests/test_perf_analyzer_config_generator.py | 26 +++- 6 files changed, 276 insertions(+), 24 deletions(-) diff --git a/model_analyzer/config/generate/automatic_model_config_generator.py b/model_analyzer/config/generate/automatic_model_config_generator.py index 79925cb7d..283f112d0 100755 --- a/model_analyzer/config/generate/automatic_model_config_generator.py +++ b/model_analyzer/config/generate/automatic_model_config_generator.py @@ -79,10 +79,7 @@ def __init__( logger.info("") AutomaticModelConfigGenerator._log_first_run = True - self._max_instance_count = config.run_config_search_max_instance_count - self._min_instance_count = config.run_config_search_min_instance_count - self._max_model_batch_size = config.run_config_search_max_model_batch_size - self._min_model_batch_size = config.run_config_search_min_model_batch_size + self._set_min_max_search_values(config) self._instance_kind = "KIND_CPU" if self._cpu_only else "KIND_GPU" @@ -162,3 +159,9 @@ def _get_curr_param_combo(self) -> Dict: config["dynamic_batching"] = {} return config + + def _set_min_max_search_values(self, config: ConfigCommandProfile) -> None: + self._max_instance_count = config.run_config_search_max_instance_count + self._min_instance_count = config.run_config_search_min_instance_count + self._max_model_batch_size = config.run_config_search_max_model_batch_size + self._min_model_batch_size = config.run_config_search_min_model_batch_size diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 985032564..0536b4abd 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -14,7 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import json import logging +import tempfile +from copy import deepcopy from typing import Generator, List, Optional from model_analyzer.config.input.config_command_profile import ConfigCommandProfile @@ -88,13 +91,21 @@ def __init__( self._batch_size_results: List[Optional[RunConfigMeasurement]] = [] self._model_name = model_name - self._perf_analyzer_flags = model_perf_analyzer_flags self._batch_sizes = sorted(model_parameters["batch_sizes"]) self._cli_config = cli_config + self._perf_analyzer_flags = self._set_perf_analyzer_flags( + model_perf_analyzer_flags + ) + + self._llm_input_dict = self._create_input_dict(model_perf_analyzer_flags) self._model_parameters = model_parameters self._parameters = self._create_parameter_list() + + self._prompt_lengths = self._create_prompt_length_list() + self._max_token_counts = self._create_max_token_count_list() + self._generate_perf_configs() @staticmethod @@ -168,6 +179,32 @@ def set_last_results( self._last_results = measurement self._parameter_results.extend(measurement) + def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: dict) -> dict: + # For LLM models we will be creating custom input data based on prompt length + if self._cli_config.is_llm_model(): + perf_analyzer_flags = deepcopy(model_perf_analyzer_flags) + perf_analyzer_flags.pop("input-data") + return perf_analyzer_flags + else: + return model_perf_analyzer_flags + + def _create_input_dict(self, model_perf_analyzer_flags: dict) -> dict: + if self._cli_config.is_llm_model(): + with open(model_perf_analyzer_flags["input-data"], "r") as f: + input_dict = json.load(f) + + return input_dict + else: + return {} + + def _modify_prompt_in_input_dict(self, prompt_length: int) -> dict: + modified_input_dict = deepcopy(self._llm_input_dict) + + modified_prompt = ["hi"] * prompt_length + modified_input_dict["data"][0]["PROMPT"] = modified_prompt + + return modified_input_dict + def _create_parameter_list(self) -> List[int]: # The two possible parameters are request rate or concurrency # Concurrency is the default and will be used unless the user specifies @@ -199,40 +236,100 @@ def _create_concurrency_list(self) -> List[int]: self._cli_config.run_config_search_max_concurrency, ) + def _create_prompt_length_list(self) -> List[int]: + if not self._cli_config.is_llm_model(): + return [] + + if self._model_parameters["prompt_length"]: + return sorted(self._model_parameters["prompt_length"]) + elif self._cli_config.run_config_search_disable: + return [1] + else: + return utils.generate_doubled_list( + self._cli_config.run_config_search_min_prompt_length, + self._cli_config.run_config_search_max_prompt_length, + ) + + def _create_max_token_count_list(self) -> List[int]: + if not self._cli_config.is_llm_model(): + return [] + + if self._model_parameters["max_token_count"]: + return sorted(self._model_parameters["max_token_count"]) + elif self._cli_config.run_config_search_disable: + return [1] + else: + return utils.generate_doubled_list( + self._cli_config.run_config_search_min_token_count, + self._cli_config.run_config_search_max_token_count, + ) + def _generate_perf_configs(self) -> None: perf_config_non_parameter_values = ( self._create_non_parameter_perf_config_values() ) - for params in utils.generate_parameter_combinations( + for unmodified_params in utils.generate_parameter_combinations( perf_config_non_parameter_values ): configs_with_concurrency = [] for parameter in self._parameters: + params = deepcopy(unmodified_params) new_perf_config = PerfAnalyzerConfig() 
new_perf_config.update_config_from_profile_config( self._model_name, self._cli_config ) + if self._cli_config.is_llm_model(): + prompt_length = params.pop("prompt-length") + new_perf_config.update_config(params) - if self._cli_config.is_request_rate_specified(self._model_parameters): - new_perf_config.update_config({"request-rate-range": parameter}) - else: - new_perf_config.update_config({"concurrency-range": parameter}) + new_perf_config = self._update_config_based_on_parameter( + new_perf_config, parameter + ) # User provided flags can override the search parameters new_perf_config.update_config(self._perf_analyzer_flags) + if self._cli_config.is_llm_model(): + modified_input_dict = self._modify_prompt_in_input_dict( + prompt_length + ) + + # Write new input dict to temp file + temp_input_data_path = "./temp-input-data.json" + temp_input_data = open(temp_input_data_path, "w") + json.dump(modified_input_dict, temp_input_data) + temp_input_data.close() + + new_perf_config.update_config({"input-data": temp_input_data_path}) + configs_with_concurrency.append(new_perf_config) self._configs.append(configs_with_concurrency) + def _update_config_based_on_parameter( + self, perf_config: PerfAnalyzerConfig, parameter: int + ) -> PerfAnalyzerConfig: + if self._cli_config.is_llm_model(): + perf_config.update_config({"periodic-concurrency-range": parameter}) + elif self._cli_config.is_request_rate_specified(self._model_parameters): + perf_config.update_config({"request-rate-range": parameter}) + else: + perf_config.update_config({"concurrency-range": parameter}) + + return perf_config + def _create_non_parameter_perf_config_values(self) -> dict: perf_config_values = { "batch-size": self._batch_sizes, } + if self._cli_config.is_llm_model(): + perf_config_values["max-token-count"] = self._max_token_counts + perf_config_values["prompt-length"] = self._prompt_lengths + return perf_config_values def _step(self) -> None: diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py index 9c40f16ef..89aecbefe 100755 --- a/model_analyzer/config/input/config_command_profile.py +++ b/model_analyzer/config/input/config_command_profile.py @@ -498,6 +498,8 @@ def _add_profile_models_configs(self): "batch_sizes": ConfigListNumeric(type_=int), "concurrency": ConfigListNumeric(type_=int), "request_rate": ConfigListNumeric(type_=int), + "prompt_length": ConfigListNumeric(type_=int), + "max_token_count": ConfigListNumeric(type_=int), } ), "objectives": objectives_scheme, @@ -1419,6 +1421,8 @@ def _autofill_values(self): "batch_sizes": self.batch_sizes, "concurrency": self.concurrency, "request_rate": self.request_rate, + "prompt_length": self.prompt_length, + "max_token_count": self.max_token_count, } else: new_model["parameters"] = {} @@ -1443,6 +1447,24 @@ def _autofill_values(self): else: new_model["parameters"].update({"request_rate": self.request_rate}) + if "prompt_length" in model.parameters(): + new_model["parameters"].update( + {"prompt_length": model.parameters()["prompt_length"]} + ) + else: + new_model["parameters"].update( + {"prompt_length": self.prompt_length} + ) + + if "max_token_count" in model.parameters(): + new_model["max_token_count"].update( + {"max_token_count": model.parameters()["max_token_count"]} + ) + else: + new_model["parameters"].update( + {"max_token_count": self.prompt_length} + ) + if ( new_model["parameters"]["request_rate"] and new_model["parameters"]["concurrency"] @@ -1523,3 +1545,17 @@ def 
is_request_rate_specified(self, model_parameters: dict) -> bool: or self.get_config()["run_config_search_min_request_rate"].is_set_by_user() or self.get_config()["run_config_search_max_request_rate"].is_set_by_user() ) + + def is_llm_model(self) -> bool: + """ + Returns true the user has enabled llm search or set any llm search value + """ + return ( + self.llm_search_enable + or self.get_config()["run_config_search_min_prompt_length"].is_set_by_user() + or self.get_config()["run_config_search_max_prompt_length"].is_set_by_user() + or self.get_config()["run_config_search_min_token_count"].is_set_by_user() + or self.get_config()["run_config_search_max_token_count"].is_set_by_user() + or self.get_config()["prompt_length"].is_set_by_user() + or self.get_config()["max_token_count"].is_set_by_user() + ) diff --git a/model_analyzer/perf_analyzer/perf_config.py b/model_analyzer/perf_analyzer/perf_config.py index e9160a44a..c9865f515 100755 --- a/model_analyzer/perf_analyzer/perf_config.py +++ b/model_analyzer/perf_analyzer/perf_config.py @@ -33,6 +33,7 @@ class PerfAnalyzerConfig: "measurement-interval", "concurrency-range", "request-rate-range", + "periodic-concurrency-range", "request-distribution", "request-intervals", "binary-search", @@ -71,6 +72,7 @@ class PerfAnalyzerConfig: "metrics-url", "metrics-interval", "bls-composing-models", + "max-token-count", ] input_to_options = [ @@ -273,6 +275,8 @@ def extract_model_specific_parameters(self): "batch-size": self._options["-b"], "concurrency-range": self._args["concurrency-range"], "request-rate-range": self._args["request-rate-range"], + "periodic-concurrency-range": self._args["periodic-concurrency-range"], + "max-token-count": self._args["max-token-count"], } @classmethod diff --git a/tests/test_config.py b/tests/test_config.py index ca9835cec..2e95d3d4d 100755 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -288,12 +288,24 @@ def test_range_and_list_values(self): expected_model_configs = [ ConfigModelProfileSpec( "model_1", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "prompt_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "model_2", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "prompt_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, ), ] @@ -430,12 +442,20 @@ def test_object(self): "batch_sizes": [1], "concurrency": [1, 2, 3, 4], "request_rate": [], + "prompt_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "prompt_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, ), ] @@ -489,6 +509,8 @@ def test_object(self): "batch_sizes": [1], "concurrency": [1, 2, 3, 4], "request_rate": [], + "prompt_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10}, ), @@ -498,6 +520,8 @@ def test_object(self): "concurrency": [1, 2, 3, 4], "batch_sizes": [2, 4, 6], "request_rate": [], + "prompt_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10}, ), @@ -569,6 +593,8 @@ def test_constraints(self): "batch_sizes": [1], "concurrency": [1, 2, 3, 4], 
"request_rate": [], + "prompt_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10, "gpu_used_memory": 5}, constraints={ @@ -579,7 +605,13 @@ def test_constraints(self): ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "prompt_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, ), ] @@ -697,7 +729,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "prompt_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -722,7 +760,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "prompt_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -758,7 +802,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "prompt_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [ @@ -801,7 +851,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "prompt_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [ @@ -831,7 +887,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "prompt_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "input": [ @@ -874,7 +936,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "prompt_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, perf_analyzer_flags={ "measurement-interval": 10000, @@ -900,7 +968,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "prompt_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, perf_analyzer_flags={ "measurement-interval": 10000, @@ -1171,7 +1245,13 @@ def test_autofill(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - 
parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "prompt_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -1215,6 +1295,8 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [2, 4], "request_rate": [], + "prompt_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10, "gpu_used_memory": 5}, constraints={ @@ -1263,6 +1345,8 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [2, 4], "request_rate": [], + "prompt_length": [], + "max_token_count": [], }, objectives={"gpu_used_memory": 10}, constraints={"perf_latency_p99": {"max": 8000}}, @@ -1307,6 +1391,8 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [2, 4], "request_rate": [], + "prompt_length": [], + "max_token_count": [], }, objectives={"gpu_used_memory": 10}, constraints={"perf_latency_p99": {"max": 8000}}, @@ -1362,6 +1448,8 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [5, 6, 7], "request_rate": [], + "prompt_length": [], + "max_token_count": [], }, objectives={"gpu_used_memory": 10}, constraints={ @@ -1375,6 +1463,8 @@ def test_autofill(self): "batch_sizes": [1, 2], "concurrency": [2, 4], "request_rate": [], + "prompt_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10, "perf_latency_p99": 5}, constraints={"perf_latency_p99": {"max": 8000}}, diff --git a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py index e9852356e..d65590bf2 100755 --- a/tests/test_perf_analyzer_config_generator.py +++ b/tests/test_perf_analyzer_config_generator.py @@ -41,7 +41,11 @@ def __init__(self, methodname): super().__init__(methodname) self._perf_throughput = 1 - def test_set_last_results(self): + @patch( + "model_analyzer.config.input.config_command_profile.ConfigCommandProfile.is_llm_model", + return_value=False, + ) + def test_set_last_results(self, *args): """ Test set_last_results() with multi model @@ -60,8 +64,26 @@ def test_set_last_results(self): ["modelA", "modelB"], [{"perf_throughput": 10}, {"perf_throughput": 2}] ) + args = [ + "model-analyzer", + "profile", + "--model-repository", + "cli_repository", + "-f", + "path-to-config-file", + ] + + # yapf: disable + yaml_str = (""" + profile_models: + - my-model + """) + # yapf: enable + + config = evaluate_mock_config(args, yaml_str, subcommand="profile") + pacg = PerfAnalyzerConfigGenerator( - MagicMock(), MagicMock(), MagicMock(), MagicMock(), early_exit_enable=False + config, MagicMock(), MagicMock(), MagicMock(), early_exit_enable=False ) pacg.set_last_results([measurement1, measurement2, measurement3]) From 43d5c1d5a8b78107706b5d6bfe31bb2d036d8f4f Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 5 Oct 2023 18:15:31 +0000 Subject: [PATCH 02/23] First unit test passing --- .../perf_analyzer_config_generator.py | 58 ++++++++++--------- model_analyzer/constants.py | 2 +- tests/common/test_utils.py | 10 ++++ tests/test_perf_analyzer_config_generator.py | 58 ++++++++++++++++--- 4 files changed, 92 insertions(+), 36 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 0536b4abd..e0266ee13 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ 
b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -23,7 +23,7 @@ from model_analyzer.config.input.config_command_profile import ConfigCommandProfile from model_analyzer.constants import ( LOGGER_NAME, - THROUGHPUT_MINIMUM_CONSECUTIVE_BATCH_SIZE_TRIES, + THROUGHPUT_MINIMUM_CONSECUTIVE_NON_PARAMETER_TRIES, THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, THROUGHPUT_MINIMUM_GAIN, ) @@ -65,7 +65,7 @@ def __init__( custom perf analyzer configuration model_parameters: Dict - model constraints for batch_sizes, concurrency and/or request rate + model constraints for batch sizes, concurrency, request rate, prompt length, etc.. early_exit_enable: Bool If true, this class can early exit during search of concurrency/request rate @@ -75,10 +75,10 @@ def __init__( # All configs are pregenerated in _configs[][] # Indexed as follows: - # _configs[_curr_batch_size_index][_curr_parameter_index] + # _configs[_curr_non_parameter_index][_curr_parameter_index] # + self._curr_non_parameter_index = 0 self._curr_parameter_index = 0 - self._curr_batch_size_index = 0 self._configs: List[List[PerfAnalyzerConfig]] = [] self._parameter_warning_printed = False @@ -88,11 +88,9 @@ def __init__( self._last_results: List[RunConfigMeasurement] = [] self._parameter_results: List[Optional[RunConfigMeasurement]] = [] - self._batch_size_results: List[Optional[RunConfigMeasurement]] = [] + self._non_parameter_results: List[Optional[RunConfigMeasurement]] = [] self._model_name = model_name - - self._batch_sizes = sorted(model_parameters["batch_sizes"]) self._cli_config = cli_config self._perf_analyzer_flags = self._set_perf_analyzer_flags( @@ -103,9 +101,19 @@ def __init__( self._model_parameters = model_parameters self._parameters = self._create_parameter_list() + self._batch_sizes = sorted(model_parameters["batch_sizes"]) self._prompt_lengths = self._create_prompt_length_list() self._max_token_counts = self._create_max_token_count_list() + self._perf_config_non_parameter_values = ( + self._create_non_parameter_perf_config_values() + ) + self._non_parameter_count = len( + utils.generate_parameter_combinations( + self._perf_config_non_parameter_values + ) + ) + self._generate_perf_configs() @staticmethod @@ -147,7 +155,7 @@ def get_configs(self) -> Generator[PerfAnalyzerConfig, None, None]: break self._generator_started = True - config = self._configs[self._curr_batch_size_index][ + config = self._configs[self._curr_non_parameter_index][ self._curr_parameter_index ] yield (config) @@ -265,12 +273,8 @@ def _create_max_token_count_list(self) -> List[int]: ) def _generate_perf_configs(self) -> None: - perf_config_non_parameter_values = ( - self._create_non_parameter_perf_config_values() - ) - for unmodified_params in utils.generate_parameter_combinations( - perf_config_non_parameter_values + self._perf_config_non_parameter_values ): configs_with_concurrency = [] for parameter in self._parameters: @@ -336,15 +340,15 @@ def _step(self) -> None: self._step_parameter() if self._done_walking_parameters(): - self._add_best_throughput_to_batch_sizes() + self._add_best_throughput_to_non_parameter_results() self._reset_parameters() - self._step_batch_size() + self._step_non_parameter() - def _add_best_throughput_to_batch_sizes(self) -> None: + def _add_best_throughput_to_non_parameter_results(self) -> None: if self._parameter_results: # type is List[Optional[RCM]] best = max(self._parameter_results) # type: ignore - self._batch_size_results.append(best) + self._non_parameter_results.append(best) def _reset_parameters(self) 
-> None: self._curr_parameter_index = 0 @@ -354,11 +358,11 @@ def _reset_parameters(self) -> None: def _step_parameter(self) -> None: self._curr_parameter_index += 1 - def _step_batch_size(self) -> None: - self._curr_batch_size_index += 1 + def _step_non_parameter(self) -> None: + self._curr_non_parameter_index += 1 def _done_walking(self) -> bool: - return self._done_walking_batch_sizes() + return self._done_walking_non_parameters() def _done_walking_parameters(self) -> bool: if len(self._parameters) == self._curr_parameter_index: @@ -377,11 +381,11 @@ def _done_walking_parameters(self) -> bool: return True return False - def _done_walking_batch_sizes(self) -> bool: - if len(self._batch_sizes) == self._curr_batch_size_index: + def _done_walking_non_parameters(self) -> bool: + if self._non_parameter_count == self._curr_non_parameter_index: return True - if self._early_exit_enable and not self._batch_size_throughput_gain_valid(): + if self._early_exit_enable and not self._non_parameter_throughput_gain_valid(): logger.info( "No longer increasing client batch size as throughput has plateaued" ) @@ -400,10 +404,10 @@ def _parameter_throughput_gain_valid(self) -> bool: min_gain=THROUGHPUT_MINIMUM_GAIN, ) - def _batch_size_throughput_gain_valid(self) -> bool: - """Check if any of the last X batch_size results resulted in valid gain""" + def _non_parameter_throughput_gain_valid(self) -> bool: + """Check if any of the last X non-parameter results resulted in valid gain""" return PerfAnalyzerConfigGenerator.throughput_gain_valid_helper( - throughputs=self._batch_size_results, - min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_BATCH_SIZE_TRIES, + throughputs=self._non_parameter_results, + min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_NON_PARAMETER_TRIES, min_gain=THROUGHPUT_MINIMUM_GAIN, ) diff --git a/model_analyzer/constants.py b/model_analyzer/constants.py index 886360d34..65554f287 100755 --- a/model_analyzer/constants.py +++ b/model_analyzer/constants.py @@ -33,7 +33,7 @@ # Run Search THROUGHPUT_MINIMUM_GAIN = 0.05 THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES = 4 -THROUGHPUT_MINIMUM_CONSECUTIVE_BATCH_SIZE_TRIES = 4 +THROUGHPUT_MINIMUM_CONSECUTIVE_NON_PARAMETER_TRIES = 4 # Quick search algorithm constants RADIUS = 3 diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index 9d418027f..be7d51fd0 100755 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -237,9 +237,11 @@ def construct_perf_analyzer_config( batch_size=DEFAULT_BATCH_SIZES, concurrency=1, request_rate=None, + max_token_count=1, launch_mode=DEFAULT_TRITON_LAUNCH_MODE, client_protocol=DEFAULT_CLIENT_PROTOCOL, perf_analyzer_flags=None, + llm_search_mode=False, ): """ Constructs a Perf Analyzer Config @@ -262,6 +264,8 @@ def construct_perf_analyzer_config( The client protocol for this PA configuration perf_analyzer_flags: dict A dict of any additional PA flags to be set + llm_search_mode: bool + Indicates we should use LLM search parameters Returns ------- @@ -276,9 +280,15 @@ def construct_perf_analyzer_config( if request_rate: pa_config._args["request-rate-range"] = request_rate + elif llm_search_mode: + pa_config._args["periodic-concurrency-range"] = concurrency else: pa_config._args["concurrency-range"] = concurrency + if llm_search_mode: + pa_config._args["max-token-count"] = max_token_count + pa_config._args["input-data"] = "./temp-input-data.json" + pa_config._args["measurement-mode"] = DEFAULT_MEASUREMENT_MODE pa_config.update_config(perf_analyzer_flags) diff --git 
a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py index d65590bf2..8249044d0 100755 --- a/tests/test_perf_analyzer_config_generator.py +++ b/tests/test_perf_analyzer_config_generator.py @@ -15,7 +15,7 @@ # limitations under the License. import unittest -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, mock_open, patch from model_analyzer.config.generate.generator_utils import GeneratorUtils as utils from model_analyzer.config.generate.perf_analyzer_config_generator import ( @@ -559,6 +559,40 @@ def test_perf_analyzer_flags(self): self._run_and_test_perf_analyzer_config_generator(yaml_str, expected_configs) + def test_llm_search_max_token_count(self): + """ + Test LLM Search: + - max token count 1->256 + + Concurrency and prompt length max set to 1 + """ + + # yapf: disable + yaml_str = (""" + perf_analyzer_flags: + input-data: input_data.json + profile_models: + - my-model + """) + # yapf: enable + + max_token_counts = utils.generate_doubled_list(1, 256) + expected_configs = [ + construct_perf_analyzer_config(max_token_count=mtc, llm_search_mode=True) + for mtc in max_token_counts + ] + + pa_cli_args = [ + "--llm-search-enable", + "--run-config-search-max-concurrency", + "1", + "--run-config-search-max-prompt-length", + "1", + ] + self._run_and_test_perf_analyzer_config_generator( + yaml_str, expected_configs, pa_cli_args + ) + def test_perf_analyzer_config_ssl_options(self): """ Test Perf Analyzer SSL options: @@ -776,13 +810,17 @@ def _run_and_test_perf_analyzer_config_generator( config = evaluate_mock_config(args, yaml_str, subcommand="profile") - pacg = PerfAnalyzerConfigGenerator( - config, - config.profile_models[0].model_name(), - config.profile_models[0].perf_analyzer_flags(), - config.profile_models[0].parameters(), - early_exit, - ) + with patch( + "model_analyzer.config.generate.perf_analyzer_config_generator.open", + mock_open(read_data=self._input_data), + ): + pacg = PerfAnalyzerConfigGenerator( + config, + config.profile_models[0].model_name(), + config.profile_models[0].perf_analyzer_flags(), + config.profile_models[0].parameters(), + early_exit, + ) perf_analyzer_configs = [] for perf_config in pacg.get_configs(): @@ -846,6 +884,10 @@ def setUp(self): ) self.mock_os.start() + self._input_data = """{ + "data": [{"PROMPT": ["Hello, my name is"], "STREAM": [true]}] + }""" + def tearDown(self): self.mock_os.stop() patch.stopall() From d765027d98bd39d3a66aa4513e48c98c2e934000 Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 5 Oct 2023 19:26:06 +0000 Subject: [PATCH 03/23] Adding test for prompt length --- .../config/input/config_defaults.py | 2 +- tests/test_perf_analyzer_config_generator.py | 34 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py index c2edd6e91..a10a7de0c 100755 --- a/model_analyzer/config/input/config_defaults.py +++ b/model_analyzer/config/input/config_defaults.py @@ -52,7 +52,7 @@ DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE = 128 DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5 DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH = 1 -DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH = 1000 +DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH = 1024 DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT = 1 DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT = 256 DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False diff --git a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py index 8249044d0..619f6e23f 100755 --- 
a/tests/test_perf_analyzer_config_generator.py +++ b/tests/test_perf_analyzer_config_generator.py @@ -593,6 +593,40 @@ def test_llm_search_max_token_count(self): yaml_str, expected_configs, pa_cli_args ) + def test_llm_search_prompt_length(self): + """ + Test LLM Search: + - Prompt length 1->1024 + + Concurrency and max token count set to 1 + """ + + # yapf: disable + yaml_str = (""" + perf_analyzer_flags: + input-data: input_data.json + profile_models: + - my-model + """) + # yapf: enable + + prompt_lengths = utils.generate_doubled_list(1, 1024) + expected_configs = [ + construct_perf_analyzer_config(llm_search_mode=True) + for pl in prompt_lengths + ] + + pa_cli_args = [ + "--llm-search-enable", + "--run-config-search-max-concurrency", + "1", + "--run-config-search-max-token-count", + "1", + ] + self._run_and_test_perf_analyzer_config_generator( + yaml_str, expected_configs, pa_cli_args + ) + def test_perf_analyzer_config_ssl_options(self): """ Test Perf Analyzer SSL options: From c198e5a019057cbf8373bf8989ef8bf99c769d3d Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 5 Oct 2023 21:09:55 +0000 Subject: [PATCH 04/23] Refactor PACG methods --- .../perf_analyzer_config_generator.py | 94 ++++++++++++------- .../config/input/config_defaults.py | 1 + tests/common/test_utils.py | 3 +- 3 files changed, 65 insertions(+), 33 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index e0266ee13..7a8937504 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -18,9 +18,10 @@ import logging import tempfile from copy import deepcopy -from typing import Generator, List, Optional +from typing import Dict, Generator, List, Optional, Tuple from model_analyzer.config.input.config_command_profile import ConfigCommandProfile +from model_analyzer.config.input.config_defaults import DEFAULT_INPUT_JSON_PATH from model_analyzer.constants import ( LOGGER_NAME, THROUGHPUT_MINIMUM_CONSECUTIVE_NON_PARAMETER_TRIES, @@ -114,6 +115,8 @@ def __init__( ) ) + self._input_json_filename = DEFAULT_INPUT_JSON_PATH + "/input-data.json" + self._generate_perf_configs() @staticmethod @@ -205,7 +208,7 @@ def _create_input_dict(self, model_perf_analyzer_flags: dict) -> dict: else: return {} - def _modify_prompt_in_input_dict(self, prompt_length: int) -> dict: + def _modify_prompt_in_input_dict(self, prompt_length: int) -> Dict: modified_input_dict = deepcopy(self._llm_input_dict) modified_prompt = ["hi"] * prompt_length @@ -273,49 +276,62 @@ def _create_max_token_count_list(self) -> List[int]: ) def _generate_perf_configs(self) -> None: - for unmodified_params in utils.generate_parameter_combinations( + for ( + unmodified_non_parameter_combination + ) in utils.generate_parameter_combinations( self._perf_config_non_parameter_values ): - configs_with_concurrency = [] + all_perf_configs_for_a_given_parameter = [] for parameter in self._parameters: - params = deepcopy(unmodified_params) - new_perf_config = PerfAnalyzerConfig() + perf_config = self._create_base_perf_config() - new_perf_config.update_config_from_profile_config( - self._model_name, self._cli_config - ) + ( + prompt_length, + modified_non_parameter_combination, + ) = self._extract_prompt_length(unmodified_non_parameter_combination) - if self._cli_config.is_llm_model(): - prompt_length = params.pop("prompt-length") + self._update_perf_config_based_on_non_parameter_combination( 
+ perf_config, modified_non_parameter_combination + ) + self._update_perf_config_based_on_parameter(perf_config, parameter) + self._update_perf_config_based_on_perf_analyzer_flags(perf_config) + self._update_perf_config_for_llm_model(perf_config, prompt_length) - new_perf_config.update_config(params) + all_perf_configs_for_a_given_parameter.append(perf_config) + self._configs.append(all_perf_configs_for_a_given_parameter) - new_perf_config = self._update_config_based_on_parameter( - new_perf_config, parameter - ) + def _create_base_perf_config(self) -> PerfAnalyzerConfig: + perf_config = PerfAnalyzerConfig() + perf_config.update_config_from_profile_config( + self._model_name, self._cli_config + ) - # User provided flags can override the search parameters - new_perf_config.update_config(self._perf_analyzer_flags) + return perf_config - if self._cli_config.is_llm_model(): - modified_input_dict = self._modify_prompt_in_input_dict( - prompt_length - ) + def _extract_prompt_length( + self, unmodified_parameter_combination: List[Dict] + ) -> Tuple[int, List[Dict]]: + if self._cli_config.is_llm_model(): + modified_parameter_combination = deepcopy(unmodified_parameter_combination) + prompt_length = modified_parameter_combination.pop("prompt-length") - # Write new input dict to temp file - temp_input_data_path = "./temp-input-data.json" - temp_input_data = open(temp_input_data_path, "w") - json.dump(modified_input_dict, temp_input_data) - temp_input_data.close() + return prompt_length, modified_parameter_combination + else: + return None, unmodified_parameter_combination - new_perf_config.update_config({"input-data": temp_input_data_path}) + def _update_perf_config_based_on_non_parameter_combination( + self, perf_config: PerfAnalyzerConfig, non_parameter_combination: Dict + ) -> None: + perf_config.update_config(non_parameter_combination) - configs_with_concurrency.append(new_perf_config) - self._configs.append(configs_with_concurrency) + def _update_perf_config_based_on_perf_analyzer_flags( + self, perf_config: PerfAnalyzerConfig + ) -> None: + perf_config.update_config(self._perf_analyzer_flags) - def _update_config_based_on_parameter( + def _update_perf_config_based_on_parameter( self, perf_config: PerfAnalyzerConfig, parameter: int - ) -> PerfAnalyzerConfig: + ) -> None: if self._cli_config.is_llm_model(): perf_config.update_config({"periodic-concurrency-range": parameter}) elif self._cli_config.is_request_rate_specified(self._model_parameters): @@ -323,7 +339,21 @@ def _update_config_based_on_parameter( else: perf_config.update_config({"concurrency-range": parameter}) - return perf_config + def _update_perf_config_for_llm_model( + self, perf_config: PerfAnalyzerConfig, prompt_length: int + ) -> None: + if not self._cli_config.is_llm_model(): + return + + modified_input_dict = self._modify_prompt_in_input_dict(prompt_length) + self._write_modified_input_dict_to_file(modified_input_dict) + + perf_config.update_config({"input-data": self._input_json_filename}) + + def _write_modified_input_dict_to_file(self, modified_input_dict: Dict) -> None: + temp_input_data = open(self._input_json_filename, "w") + json.dump(modified_input_dict, temp_input_data) + temp_input_data.close() def _create_non_parameter_perf_config_values(self) -> dict: perf_config_values = { diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py index a10a7de0c..4be24fd2b 100755 --- a/model_analyzer/config/input/config_defaults.py +++ 
b/model_analyzer/config/input/config_defaults.py @@ -38,6 +38,7 @@ DEFAULT_SKIP_SUMMARY_REPORTS = False DEFAULT_SKIP_DETAILED_REPORTS = False DEFAULT_OUTPUT_MODEL_REPOSITORY = os.path.join(os.getcwd(), "output_model_repository") +DEFAULT_INPUT_JSON_PATH = os.getcwd() DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG = False DEFAULT_BATCH_SIZES = 1 DEFAULT_MAX_RETRIES = 50 diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index be7d51fd0..8a6c76ed3 100755 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -23,6 +23,7 @@ from model_analyzer.config.input.config_defaults import ( DEFAULT_BATCH_SIZES, DEFAULT_CLIENT_PROTOCOL, + DEFAULT_INPUT_JSON_PATH, DEFAULT_MEASUREMENT_MODE, DEFAULT_MONITORING_INTERVAL, DEFAULT_OUTPUT_MODEL_REPOSITORY, @@ -287,7 +288,7 @@ def construct_perf_analyzer_config( if llm_search_mode: pa_config._args["max-token-count"] = max_token_count - pa_config._args["input-data"] = "./temp-input-data.json" + pa_config._args["input-data"] = DEFAULT_INPUT_JSON_PATH + "/input-data.json" pa_config._args["measurement-mode"] = DEFAULT_MEASUREMENT_MODE From 79aa02a3de6ce65b7e6fdce8ab331bf655197845 Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 5 Oct 2023 21:18:12 +0000 Subject: [PATCH 05/23] Further refactoring --- .../perf_analyzer_config_generator.py | 42 +++++++++++-------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 7a8937504..52a4eebfd 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -276,30 +276,38 @@ def _create_max_token_count_list(self) -> List[int]: ) def _generate_perf_configs(self) -> None: - for ( - unmodified_non_parameter_combination - ) in utils.generate_parameter_combinations( + all_non_parameter_combinations = utils.generate_parameter_combinations( self._perf_config_non_parameter_values - ): + ) + for unmodified_non_parameter_combination in all_non_parameter_combinations: all_perf_configs_for_a_given_parameter = [] for parameter in self._parameters: - perf_config = self._create_base_perf_config() - - ( - prompt_length, - modified_non_parameter_combination, - ) = self._extract_prompt_length(unmodified_non_parameter_combination) - - self._update_perf_config_based_on_non_parameter_combination( - perf_config, modified_non_parameter_combination + new_perf_config = self._create_new_perf_config( + parameter, unmodified_non_parameter_combination ) - self._update_perf_config_based_on_parameter(perf_config, parameter) - self._update_perf_config_based_on_perf_analyzer_flags(perf_config) - self._update_perf_config_for_llm_model(perf_config, prompt_length) + all_perf_configs_for_a_given_parameter.append(new_perf_config) - all_perf_configs_for_a_given_parameter.append(perf_config) self._configs.append(all_perf_configs_for_a_given_parameter) + def _create_new_perf_config( + self, parameter: int, unmodified_non_parameter_combination: List[Dict] + ) -> PerfAnalyzerConfig: + perf_config = self._create_base_perf_config() + + ( + prompt_length, + modified_non_parameter_combination, + ) = self._extract_prompt_length(unmodified_non_parameter_combination) + + self._update_perf_config_based_on_non_parameter_combination( + perf_config, modified_non_parameter_combination + ) + self._update_perf_config_based_on_parameter(perf_config, parameter) + self._update_perf_config_based_on_perf_analyzer_flags(perf_config) 
+ self._update_perf_config_for_llm_model(perf_config, prompt_length) + + return perf_config + def _create_base_perf_config(self) -> PerfAnalyzerConfig: perf_config = PerfAnalyzerConfig() perf_config.update_config_from_profile_config( From ac81a6b275e9b66b2438a140466cef8e17cb666a Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 5 Oct 2023 22:41:12 +0000 Subject: [PATCH 06/23] Ensure early exit isn't enabled for LLM models --- .../config/generate/model_run_config_generator.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/model_analyzer/config/generate/model_run_config_generator.py b/model_analyzer/config/generate/model_run_config_generator.py index b068c7577..529fa5b83 100755 --- a/model_analyzer/config/generate/model_run_config_generator.py +++ b/model_analyzer/config/generate/model_run_config_generator.py @@ -150,5 +150,13 @@ def _determine_early_exit_enables( concurrency_specified = model.parameters()["concurrency"] config_parameters_exist = model.model_config_parameters() - self._pacg_early_exit_enable = early_exit_enable or not concurrency_specified - self._mcg_early_exit_enable = early_exit_enable or not config_parameters_exist + if config.is_llm_model(): + self._pacg_early_exit_enable = False + self._mcg_early_exit_enable = False + else: + self._pacg_early_exit_enable = ( + early_exit_enable or not concurrency_specified + ) + self._mcg_early_exit_enable = ( + early_exit_enable or not config_parameters_exist + ) From 015a2c2cbce49c49b24887ce9c2efa95ee3c4bc1 Mon Sep 17 00:00:00 2001 From: braf Date: Fri, 6 Oct 2023 22:44:21 +0000 Subject: [PATCH 07/23] Fix type checking errors --- .../config/generate/perf_analyzer_config_generator.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 52a4eebfd..7700e130c 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -16,7 +16,6 @@ import json import logging -import tempfile from copy import deepcopy from typing import Dict, Generator, List, Optional, Tuple @@ -290,7 +289,7 @@ def _generate_perf_configs(self) -> None: self._configs.append(all_perf_configs_for_a_given_parameter) def _create_new_perf_config( - self, parameter: int, unmodified_non_parameter_combination: List[Dict] + self, parameter: int, unmodified_non_parameter_combination: Dict ) -> PerfAnalyzerConfig: perf_config = self._create_base_perf_config() @@ -317,15 +316,15 @@ def _create_base_perf_config(self) -> PerfAnalyzerConfig: return perf_config def _extract_prompt_length( - self, unmodified_parameter_combination: List[Dict] - ) -> Tuple[int, List[Dict]]: + self, unmodified_parameter_combination: Dict + ) -> Tuple[int, Dict]: if self._cli_config.is_llm_model(): modified_parameter_combination = deepcopy(unmodified_parameter_combination) prompt_length = modified_parameter_combination.pop("prompt-length") return prompt_length, modified_parameter_combination else: - return None, unmodified_parameter_combination + return 0, unmodified_parameter_combination def _update_perf_config_based_on_non_parameter_combination( self, perf_config: PerfAnalyzerConfig, non_parameter_combination: Dict From 2619b83c73d296a174229526c835fe31eeb1d3ca Mon Sep 17 00:00:00 2001 From: braf Date: Sat, 7 Oct 2023 12:13:01 +0000 Subject: [PATCH 08/23] Attempt at fixing codeql issue --- .../perf_analyzer_config_generator.py | 21 
+++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 7700e130c..901e87f7d 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -189,8 +189,13 @@ def set_last_results( self._last_results = measurement self._parameter_results.extend(measurement) - def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: dict) -> dict: - # For LLM models we will be creating custom input data based on prompt length + def _set_perf_analyzer_flags( + self, model_perf_analyzer_flags: Optional[dict] = None + ) -> dict: + if model_perf_analyzer_flags is None: + model_perf_analyzer_flags = {} + + # For LLM models we will be creating custom input-data based on prompt length if self._cli_config.is_llm_model(): perf_analyzer_flags = deepcopy(model_perf_analyzer_flags) perf_analyzer_flags.pop("input-data") @@ -198,7 +203,12 @@ def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: dict) -> dict: else: return model_perf_analyzer_flags - def _create_input_dict(self, model_perf_analyzer_flags: dict) -> dict: + def _create_input_dict( + self, model_perf_analyzer_flags: Optional[dict] = None + ) -> dict: + if model_perf_analyzer_flags is None: + model_perf_analyzer_flags = {} + if self._cli_config.is_llm_model(): with open(model_perf_analyzer_flags["input-data"], "r") as f: input_dict = json.load(f) @@ -316,8 +326,11 @@ def _create_base_perf_config(self) -> PerfAnalyzerConfig: return perf_config def _extract_prompt_length( - self, unmodified_parameter_combination: Dict + self, unmodified_parameter_combination: Optional[Dict] = None ) -> Tuple[int, Dict]: + if unmodified_parameter_combination is None: + unmodified_parameter_combination = {} + if self._cli_config.is_llm_model(): modified_parameter_combination = deepcopy(unmodified_parameter_combination) prompt_length = modified_parameter_combination.pop("prompt-length") From 9f2a0654de7eea9b21abfc5e8611477efdac5d1f Mon Sep 17 00:00:00 2001 From: braf Date: Tue, 10 Oct 2023 19:17:54 +0000 Subject: [PATCH 09/23] Revert "Attempt at fixing codeql issue" This reverts commit 2619b83c73d296a174229526c835fe31eeb1d3ca. 
--- .../perf_analyzer_config_generator.py | 21 ++++--------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 901e87f7d..7700e130c 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -189,13 +189,8 @@ def set_last_results( self._last_results = measurement self._parameter_results.extend(measurement) - def _set_perf_analyzer_flags( - self, model_perf_analyzer_flags: Optional[dict] = None - ) -> dict: - if model_perf_analyzer_flags is None: - model_perf_analyzer_flags = {} - - # For LLM models we will be creating custom input-data based on prompt length + def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: dict) -> dict: + # For LLM models we will be creating custom input data based on prompt length if self._cli_config.is_llm_model(): perf_analyzer_flags = deepcopy(model_perf_analyzer_flags) perf_analyzer_flags.pop("input-data") @@ -203,12 +198,7 @@ def _set_perf_analyzer_flags( else: return model_perf_analyzer_flags - def _create_input_dict( - self, model_perf_analyzer_flags: Optional[dict] = None - ) -> dict: - if model_perf_analyzer_flags is None: - model_perf_analyzer_flags = {} - + def _create_input_dict(self, model_perf_analyzer_flags: dict) -> dict: if self._cli_config.is_llm_model(): with open(model_perf_analyzer_flags["input-data"], "r") as f: input_dict = json.load(f) @@ -326,11 +316,8 @@ def _create_base_perf_config(self) -> PerfAnalyzerConfig: return perf_config def _extract_prompt_length( - self, unmodified_parameter_combination: Optional[Dict] = None + self, unmodified_parameter_combination: Dict ) -> Tuple[int, Dict]: - if unmodified_parameter_combination is None: - unmodified_parameter_combination = {} - if self._cli_config.is_llm_model(): modified_parameter_combination = deepcopy(unmodified_parameter_combination) prompt_length = modified_parameter_combination.pop("prompt-length") From c5b702e8a0419071a678684eec02c4a1f849c7da Mon Sep 17 00:00:00 2001 From: braf Date: Tue, 10 Oct 2023 19:33:49 +0000 Subject: [PATCH 10/23] Attempt at codeQL fix --- .../generate/perf_analyzer_config_generator.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 7700e130c..85981c3a5 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -93,11 +93,12 @@ def __init__( self._model_name = model_name self._cli_config = cli_config + self._llm_input_dict = self._create_input_dict(model_perf_analyzer_flags) + self._perf_analyzer_flags = self._set_perf_analyzer_flags( model_perf_analyzer_flags ) - self._llm_input_dict = self._create_input_dict(model_perf_analyzer_flags) self._model_parameters = model_parameters self._parameters = self._create_parameter_list() @@ -191,12 +192,15 @@ def set_last_results( def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: dict) -> dict: # For LLM models we will be creating custom input data based on prompt length + perf_analyzer_flags = { + key: value for key, value in model_perf_analyzer_flags.items() + } + if self._cli_config.is_llm_model(): - perf_analyzer_flags = deepcopy(model_perf_analyzer_flags) + # perf_analyzer_flags = 
deepcopy(model_perf_analyzer_flags) perf_analyzer_flags.pop("input-data") - return perf_analyzer_flags - else: - return model_perf_analyzer_flags + + return perf_analyzer_flags def _create_input_dict(self, model_perf_analyzer_flags: dict) -> dict: if self._cli_config.is_llm_model(): From cbdc7465fb88f1e109715cfc2d1daa769c6c6735 Mon Sep 17 00:00:00 2001 From: braf Date: Tue, 10 Oct 2023 20:30:08 +0000 Subject: [PATCH 11/23] Adding deepcopy back in --- .../config/generate/perf_analyzer_config_generator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 85981c3a5..876be5c91 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -192,12 +192,12 @@ def set_last_results( def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: dict) -> dict: # For LLM models we will be creating custom input data based on prompt length - perf_analyzer_flags = { - key: value for key, value in model_perf_analyzer_flags.items() - } + perf_analyzer_flags = deepcopy(model_perf_analyzer_flags) + # perf_analyzer_flags = { + # key: value for key, value in model_perf_analyzer_flags.items() + # } if self._cli_config.is_llm_model(): - # perf_analyzer_flags = deepcopy(model_perf_analyzer_flags) perf_analyzer_flags.pop("input-data") return perf_analyzer_flags From 0c909eaf216e7f610310d4f7cac9ed85ac64aafa Mon Sep 17 00:00:00 2001 From: braf Date: Tue, 10 Oct 2023 20:59:10 +0000 Subject: [PATCH 12/23] Removing deepcopy in an attempt to fix codeQL errors --- .../generate/perf_analyzer_config_generator.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 876be5c91..2ae96862d 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -16,7 +16,6 @@ import json import logging -from copy import deepcopy from typing import Dict, Generator, List, Optional, Tuple from model_analyzer.config.input.config_command_profile import ConfigCommandProfile @@ -192,10 +191,7 @@ def set_last_results( def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: dict) -> dict: # For LLM models we will be creating custom input data based on prompt length - perf_analyzer_flags = deepcopy(model_perf_analyzer_flags) - # perf_analyzer_flags = { - # key: value for key, value in model_perf_analyzer_flags.items() - # } + perf_analyzer_flags = {k: v for k, v in model_perf_analyzer_flags.items()} if self._cli_config.is_llm_model(): perf_analyzer_flags.pop("input-data") @@ -212,9 +208,9 @@ def _create_input_dict(self, model_perf_analyzer_flags: dict) -> dict: return {} def _modify_prompt_in_input_dict(self, prompt_length: int) -> Dict: - modified_input_dict = deepcopy(self._llm_input_dict) - modified_prompt = ["hi"] * prompt_length + + modified_input_dict = {k: v for k, v in self._llm_input_dict.items()} modified_input_dict["data"][0]["PROMPT"] = modified_prompt return modified_input_dict @@ -322,10 +318,12 @@ def _create_base_perf_config(self) -> PerfAnalyzerConfig: def _extract_prompt_length( self, unmodified_parameter_combination: Dict ) -> Tuple[int, Dict]: + modified_parameter_combination = { + k: v for k, v in 
unmodified_parameter_combination.items() + } + if self._cli_config.is_llm_model(): - modified_parameter_combination = deepcopy(unmodified_parameter_combination) prompt_length = modified_parameter_combination.pop("prompt-length") - return prompt_length, modified_parameter_combination else: return 0, unmodified_parameter_combination From 3f4450a81ea917826972d38faf965910a23d6571 Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Wed, 11 Oct 2023 08:26:15 -0700 Subject: [PATCH 13/23] Update model_analyzer/config/input/config_command_profile.py Co-authored-by: Hyunjae Woo <107147848+nv-hwoo@users.noreply.github.com> --- model_analyzer/config/input/config_command_profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py index 89aecbefe..317987732 100755 --- a/model_analyzer/config/input/config_command_profile.py +++ b/model_analyzer/config/input/config_command_profile.py @@ -1548,7 +1548,7 @@ def is_request_rate_specified(self, model_parameters: dict) -> bool: def is_llm_model(self) -> bool: """ - Returns true the user has enabled llm search or set any llm search value + Returns true if the user has enabled llm search or set any llm search value """ return ( self.llm_search_enable From c69b5772e53c2fe24b8e1d055fc17e918eb878b0 Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Wed, 11 Oct 2023 08:28:48 -0700 Subject: [PATCH 14/23] Update model_analyzer/config/generate/perf_analyzer_config_generator.py Co-authored-by: Hyunjae Woo <107147848+nv-hwoo@users.noreply.github.com> --- .../config/generate/perf_analyzer_config_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 2ae96862d..88ee3224d 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -198,7 +198,7 @@ def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: dict) -> dict: return perf_analyzer_flags - def _create_input_dict(self, model_perf_analyzer_flags: dict) -> dict: + def _create_input_dict(self, model_perf_analyzer_flags: Dict) -> Dict: if self._cli_config.is_llm_model(): with open(model_perf_analyzer_flags["input-data"], "r") as f: input_dict = json.load(f) From b1eed54a5df08e0266c90bf2f4bd6136f1a6b11b Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Wed, 11 Oct 2023 08:31:20 -0700 Subject: [PATCH 15/23] Update model_analyzer/config/generate/perf_analyzer_config_generator.py Co-authored-by: Hyunjae Woo <107147848+nv-hwoo@users.noreply.github.com> --- .../config/generate/perf_analyzer_config_generator.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 88ee3224d..1efc473a4 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -360,9 +360,8 @@ def _update_perf_config_for_llm_model( perf_config.update_config({"input-data": self._input_json_filename}) def _write_modified_input_dict_to_file(self, modified_input_dict: Dict) -> None: - temp_input_data = 
open(self._input_json_filename, "w") - json.dump(modified_input_dict, temp_input_data) - temp_input_data.close() + with open(self._input_json_filename, "w") as f: + json.dump(modified_input_dict, f) def _create_non_parameter_perf_config_values(self) -> dict: perf_config_values = { From a2fa148c40a5e9e85e1f331601bbd7080a5eff2b Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Wed, 11 Oct 2023 08:32:46 -0700 Subject: [PATCH 16/23] Update model_analyzer/config/generate/perf_analyzer_config_generator.py Co-authored-by: Hyunjae Woo <107147848+nv-hwoo@users.noreply.github.com> --- .../config/generate/perf_analyzer_config_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 1efc473a4..a2d6ea7ec 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -189,7 +189,7 @@ def set_last_results( self._last_results = measurement self._parameter_results.extend(measurement) - def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: dict) -> dict: + def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: Dict) -> Dict: # For LLM models we will be creating custom input data based on prompt length perf_analyzer_flags = {k: v for k, v in model_perf_analyzer_flags.items()} From c96d897d589c1366f7eda66e5d476171ef36368b Mon Sep 17 00:00:00 2001 From: braf Date: Wed, 11 Oct 2023 15:36:09 +0000 Subject: [PATCH 17/23] Moving location of method --- .../generate/perf_analyzer_config_generator.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index a2d6ea7ec..2555d6985 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -207,14 +207,6 @@ def _create_input_dict(self, model_perf_analyzer_flags: Dict) -> Dict: else: return {} - def _modify_prompt_in_input_dict(self, prompt_length: int) -> Dict: - modified_prompt = ["hi"] * prompt_length - - modified_input_dict = {k: v for k, v in self._llm_input_dict.items()} - modified_input_dict["data"][0]["PROMPT"] = modified_prompt - - return modified_input_dict - def _create_parameter_list(self) -> List[int]: # The two possible parameters are request rate or concurrency # Concurrency is the default and will be used unless the user specifies @@ -359,6 +351,14 @@ def _update_perf_config_for_llm_model( perf_config.update_config({"input-data": self._input_json_filename}) + def _modify_prompt_in_input_dict(self, prompt_length: int) -> Dict: + modified_prompt = ["hi"] * prompt_length + + modified_input_dict = {k: v for k, v in self._llm_input_dict.items()} + modified_input_dict["data"][0]["PROMPT"] = modified_prompt + + return modified_input_dict + def _write_modified_input_dict_to_file(self, modified_input_dict: Dict) -> None: with open(self._input_json_filename, "w") as f: json.dump(modified_input_dict, f) From daee4cc244f7d931047fb069af33331f583cdc13 Mon Sep 17 00:00:00 2001 From: braf Date: Wed, 11 Oct 2023 16:23:28 +0000 Subject: [PATCH 18/23] Changing parameter to inference load --- .../perf_analyzer_config_generator.py | 109 +++++++++--------- model_analyzer/constants.py | 1 + 2 files changed, 55 insertions(+), 55 
deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 2555d6985..e7596a9af 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -22,8 +22,8 @@ from model_analyzer.config.input.config_defaults import DEFAULT_INPUT_JSON_PATH from model_analyzer.constants import ( LOGGER_NAME, + THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, THROUGHPUT_MINIMUM_CONSECUTIVE_NON_PARAMETER_TRIES, - THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, THROUGHPUT_MINIMUM_GAIN, ) from model_analyzer.perf_analyzer.perf_config import PerfAnalyzerConfig @@ -74,19 +74,19 @@ def __init__( # All configs are pregenerated in _configs[][] # Indexed as follows: - # _configs[_curr_non_parameter_index][_curr_parameter_index] + # _configs[_curr_non_parameter_index][_curr_inference_load_index] # self._curr_non_parameter_index = 0 - self._curr_parameter_index = 0 + self._curr_inference_load_index = 0 self._configs: List[List[PerfAnalyzerConfig]] = [] - self._parameter_warning_printed = False + self._inference_load_warning_printed = False # Flag to indicate we have started to return results # self._generator_started = False self._last_results: List[RunConfigMeasurement] = [] - self._parameter_results: List[Optional[RunConfigMeasurement]] = [] + self._inference_load_results: List[Optional[RunConfigMeasurement]] = [] self._non_parameter_results: List[Optional[RunConfigMeasurement]] = [] self._model_name = model_name @@ -99,7 +99,7 @@ def __init__( ) self._model_parameters = model_parameters - self._parameters = self._create_parameter_list() + self._inference_loads = self._create_inference_load_list() self._batch_sizes = sorted(model_parameters["batch_sizes"]) self._prompt_lengths = self._create_prompt_length_list() @@ -121,7 +121,7 @@ def __init__( @staticmethod def throughput_gain_valid_helper( throughputs: List[Optional[RunConfigMeasurement]], - min_tries: int = THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, + min_tries: int = THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, min_gain: float = THROUGHPUT_MINIMUM_GAIN, ) -> bool: if len(throughputs) < min_tries: @@ -158,7 +158,7 @@ def get_configs(self) -> Generator[PerfAnalyzerConfig, None, None]: self._generator_started = True config = self._configs[self._curr_non_parameter_index][ - self._curr_parameter_index + self._curr_inference_load_index ] yield (config) @@ -187,7 +187,7 @@ def set_last_results( measurement = [max(valid_measurements)] self._last_results = measurement - self._parameter_results.extend(measurement) + self._inference_load_results.extend(measurement) def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: Dict) -> Dict: # For LLM models we will be creating custom input data based on prompt length @@ -207,8 +207,8 @@ def _create_input_dict(self, model_perf_analyzer_flags: Dict) -> Dict: else: return {} - def _create_parameter_list(self) -> List[int]: - # The two possible parameters are request rate or concurrency + def _create_inference_load_list(self) -> List[int]: + # The two possible inference loads are request rate or concurrency # Concurrency is the default and will be used unless the user specifies # request rate, either as a model parameter or a config option if self._cli_config.is_request_rate_specified(self._model_parameters): @@ -267,33 +267,33 @@ def _create_max_token_count_list(self) -> List[int]: ) def _generate_perf_configs(self) -> 
None: - all_non_parameter_combinations = utils.generate_parameter_combinations( + non_parameter_combinations = utils.generate_parameter_combinations( self._perf_config_non_parameter_values ) - for unmodified_non_parameter_combination in all_non_parameter_combinations: - all_perf_configs_for_a_given_parameter = [] - for parameter in self._parameters: + for non_parameter_combination in non_parameter_combinations: + perf_configs_for_a_given_combination = [] + for inference_load in self._inference_loads: new_perf_config = self._create_new_perf_config( - parameter, unmodified_non_parameter_combination + inference_load, non_parameter_combination ) - all_perf_configs_for_a_given_parameter.append(new_perf_config) + perf_configs_for_a_given_combination.append(new_perf_config) - self._configs.append(all_perf_configs_for_a_given_parameter) + self._configs.append(perf_configs_for_a_given_combination) def _create_new_perf_config( - self, parameter: int, unmodified_non_parameter_combination: Dict + self, inference_load: int, non_parameter_combination: Dict ) -> PerfAnalyzerConfig: perf_config = self._create_base_perf_config() ( prompt_length, modified_non_parameter_combination, - ) = self._extract_prompt_length(unmodified_non_parameter_combination) + ) = self._extract_prompt_length(non_parameter_combination) self._update_perf_config_based_on_non_parameter_combination( perf_config, modified_non_parameter_combination ) - self._update_perf_config_based_on_parameter(perf_config, parameter) + self._update_perf_config_based_on_inference_load(perf_config, inference_load) self._update_perf_config_based_on_perf_analyzer_flags(perf_config) self._update_perf_config_for_llm_model(perf_config, prompt_length) @@ -308,17 +308,16 @@ def _create_base_perf_config(self) -> PerfAnalyzerConfig: return perf_config def _extract_prompt_length( - self, unmodified_parameter_combination: Dict + self, non_parameter_combination: Dict ) -> Tuple[int, Dict]: - modified_parameter_combination = { - k: v for k, v in unmodified_parameter_combination.items() - } + if not self._cli_config.is_llm_model(): + return 0, non_parameter_combination - if self._cli_config.is_llm_model(): - prompt_length = modified_parameter_combination.pop("prompt-length") - return prompt_length, modified_parameter_combination - else: - return 0, unmodified_parameter_combination + modified_non_parameter_combination = { + k: v for k, v in non_parameter_combination.items() + } + prompt_length = modified_non_parameter_combination.pop("prompt-length") + return prompt_length, modified_non_parameter_combination def _update_perf_config_based_on_non_parameter_combination( self, perf_config: PerfAnalyzerConfig, non_parameter_combination: Dict @@ -330,15 +329,15 @@ def _update_perf_config_based_on_perf_analyzer_flags( ) -> None: perf_config.update_config(self._perf_analyzer_flags) - def _update_perf_config_based_on_parameter( - self, perf_config: PerfAnalyzerConfig, parameter: int + def _update_perf_config_based_on_inference_load( + self, perf_config: PerfAnalyzerConfig, inference_load: int ) -> None: if self._cli_config.is_llm_model(): - perf_config.update_config({"periodic-concurrency-range": parameter}) + perf_config.update_config({"periodic-concurrency-range": inference_load}) elif self._cli_config.is_request_rate_specified(self._model_parameters): - perf_config.update_config({"request-rate-range": parameter}) + perf_config.update_config({"request-rate-range": inference_load}) else: - perf_config.update_config({"concurrency-range": parameter}) + 
perf_config.update_config({"concurrency-range": inference_load}) def _update_perf_config_for_llm_model( self, perf_config: PerfAnalyzerConfig, prompt_length: int @@ -375,26 +374,26 @@ def _create_non_parameter_perf_config_values(self) -> dict: return perf_config_values def _step(self) -> None: - self._step_parameter() + self._step_inference_load() - if self._done_walking_parameters(): + if self._done_walking_inference_loads(): self._add_best_throughput_to_non_parameter_results() - self._reset_parameters() + self._reset_inference_loads() self._step_non_parameter() def _add_best_throughput_to_non_parameter_results(self) -> None: - if self._parameter_results: + if self._inference_load_results: # type is List[Optional[RCM]] - best = max(self._parameter_results) # type: ignore + best = max(self._inference_load_results) # type: ignore self._non_parameter_results.append(best) - def _reset_parameters(self) -> None: - self._curr_parameter_index = 0 - self._parameter_warning_printed = False - self._parameter_results = [] + def _reset_inference_loads(self) -> None: + self._curr_inference_load_index = 0 + self._inference_load_warning_printed = False + self._inference_load_results = [] - def _step_parameter(self) -> None: - self._curr_parameter_index += 1 + def _step_inference_load(self) -> None: + self._curr_inference_load_index += 1 def _step_non_parameter(self) -> None: self._curr_non_parameter_index += 1 @@ -402,11 +401,11 @@ def _step_non_parameter(self) -> None: def _done_walking(self) -> bool: return self._done_walking_non_parameters() - def _done_walking_parameters(self) -> bool: - if len(self._parameters) == self._curr_parameter_index: + def _done_walking_inference_loads(self) -> bool: + if len(self._inference_loads) == self._curr_inference_load_index: return True - if self._early_exit_enable and not self._parameter_throughput_gain_valid(): - if not self._parameter_warning_printed: + if self._early_exit_enable and not self._inference_load_throughput_gain_valid(): + if not self._inference_load_warning_printed: if self._cli_config.is_request_rate_specified(self._model_parameters): logger.info( "No longer increasing request rate as throughput has plateaued" @@ -415,7 +414,7 @@ def _done_walking_parameters(self) -> bool: logger.info( "No longer increasing concurrency as throughput has plateaued" ) - self._parameter_warning_printed = True + self._inference_load_warning_printed = True return True return False @@ -434,11 +433,11 @@ def _done_walking_non_parameters(self) -> bool: def _last_results_erroneous(self) -> bool: return not self._last_results or self._last_results[-1] is None - def _parameter_throughput_gain_valid(self) -> bool: - """Check if any of the last X parameter results resulted in valid gain""" + def _inference_load_throughput_gain_valid(self) -> bool: + """Check if any of the last X inference load results resulted in valid gain""" return PerfAnalyzerConfigGenerator.throughput_gain_valid_helper( - throughputs=self._parameter_results, - min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, + throughputs=self._inference_load_results, + min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, min_gain=THROUGHPUT_MINIMUM_GAIN, ) diff --git a/model_analyzer/constants.py b/model_analyzer/constants.py index 65554f287..b98078722 100755 --- a/model_analyzer/constants.py +++ b/model_analyzer/constants.py @@ -33,6 +33,7 @@ # Run Search THROUGHPUT_MINIMUM_GAIN = 0.05 THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES = 4 +THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES = 4 
THROUGHPUT_MINIMUM_CONSECUTIVE_NON_PARAMETER_TRIES = 4 # Quick search algorithm constants From 3966c6cf7f30af4d5b6d4ed7fd8f946279f69d49 Mon Sep 17 00:00:00 2001 From: braf Date: Wed, 11 Oct 2023 16:49:17 +0000 Subject: [PATCH 19/23] Changing parameter to inference load --- ...lus_binary_search_run_config_generator.py} | 28 +++-- ..._concurrency_sweep_run_config_generator.py | 12 +- .../generate/run_config_generator_factory.py | 10 +- model_analyzer/constants.py | 1 - ...ter_search.py => inference_load_search.py} | 113 +++++++++--------- ...earch.py => test_inference_load_search.py} | 40 +++---- 6 files changed, 106 insertions(+), 98 deletions(-) rename model_analyzer/config/generate/{brute_plus_binary_parameter_search_run_config_generator.py => brute_plus_binary_search_run_config_generator.py} (86%) rename model_analyzer/result/{parameter_search.py => inference_load_search.py} (63%) rename tests/{test_parameter_search.py => test_inference_load_search.py} (92%) diff --git a/model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py b/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py similarity index 86% rename from model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py rename to model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py index b0a217274..78d55a1bc 100755 --- a/model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +++ b/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py @@ -29,7 +29,7 @@ from model_analyzer.config.run.run_config import RunConfig from model_analyzer.constants import LOGGER_NAME from model_analyzer.device.gpu_device import GPUDevice -from model_analyzer.result.parameter_search import ParameterSearch +from model_analyzer.result.inference_load_search import InferenceLoadSearch from model_analyzer.result.result_manager import ResultManager from model_analyzer.result.run_config_measurement import RunConfigMeasurement from model_analyzer.triton.client.client import TritonClient @@ -39,10 +39,10 @@ logger = logging.getLogger(LOGGER_NAME) -class BrutePlusBinaryParameterSearchRunConfigGenerator(ConfigGeneratorInterface): +class BrutePlusBinarySearchRunConfigGenerator(ConfigGeneratorInterface): """ First run BruteRunConfigGenerator for a brute search, then for - automatic searches use ParameterSearch to perform a binary search + automatic searches use InferenceLoadSearch to perform a binary search """ def __init__( @@ -132,17 +132,19 @@ def _binary_search_over_top_results(self) -> Generator[RunConfig, None, None]: for result in top_results: run_config = deepcopy(result.run_config()) model_parameters = self._get_model_parameters(model_name) - parameter_search = ParameterSearch( + inference_load_search = InferenceLoadSearch( config=self._config, model_parameters=model_parameters, - skip_parameter_sweep=True, + skip_inference_load_sweep=True, ) - for parameter in parameter_search.search_parameters(): - run_config = self._set_parameter( - run_config, model_parameters, parameter + for inference_load in inference_load_search.search_inference_loads(): + run_config = self._set_inference_load( + run_config, model_parameters, inference_load ) yield run_config - parameter_search.add_run_config_measurement(self._last_measurement) + inference_load_search.add_run_config_measurement( + self._last_measurement + ) def _get_model_parameters(self, model_name: str) -> Dict: for model in self._models: @@ -151,14 
+153,14 @@ def _get_model_parameters(self, model_name: str) -> Dict: return {} - def _set_parameter( - self, run_config: RunConfig, model_parameters: Dict, parameter: int + def _set_inference_load( + self, run_config: RunConfig, model_parameters: Dict, inference_load: int ) -> RunConfig: for model_run_config in run_config.model_run_configs(): perf_config = model_run_config.perf_config() if self._config.is_request_rate_specified(model_parameters): - perf_config.update_config({"request-rate-range": parameter}) + perf_config.update_config({"request-rate-range": inference_load}) else: - perf_config.update_config({"concurrency-range": parameter}) + perf_config.update_config({"concurrency-range": inference_load}) return run_config diff --git a/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py b/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py index b7adbef97..14a669438 100755 --- a/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +++ b/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py @@ -30,7 +30,7 @@ from model_analyzer.config.run.run_config import RunConfig from model_analyzer.constants import LOGGER_NAME from model_analyzer.device.gpu_device import GPUDevice -from model_analyzer.result.parameter_search import ParameterSearch +from model_analyzer.result.inference_load_search import InferenceLoadSearch from model_analyzer.result.result_manager import ResultManager from model_analyzer.result.run_config_measurement import RunConfigMeasurement from model_analyzer.triton.client.client import TritonClient @@ -43,7 +43,7 @@ class QuickPlusConcurrencySweepRunConfigGenerator(ConfigGeneratorInterface): """ First run QuickRunConfigGenerator for a hill climbing search, then use - ParameterSearch for a concurrency sweep + binary search of the default + InferenceLoadSearch for a concurrency sweep + binary search of the default and Top N results """ @@ -139,11 +139,13 @@ def _sweep_concurrency_over_top_results(self) -> Generator[RunConfig, None, None for result in top_results: run_config = deepcopy(result.run_config()) - parameter_search = ParameterSearch(self._config) - for concurrency in parameter_search.search_parameters(): + inference_load_search = InferenceLoadSearch(self._config) + for concurrency in inference_load_search.search_inference_loads(): run_config = self._set_concurrency(run_config, concurrency) yield run_config - parameter_search.add_run_config_measurement(self._last_measurement) + inference_load_search.add_run_config_measurement( + self._last_measurement + ) def _set_concurrency(self, run_config: RunConfig, concurrency: int) -> RunConfig: for model_run_config in run_config.model_run_configs(): diff --git a/model_analyzer/config/generate/run_config_generator_factory.py b/model_analyzer/config/generate/run_config_generator_factory.py index da3fc7a7a..0cdcddeb6 100755 --- a/model_analyzer/config/generate/run_config_generator_factory.py +++ b/model_analyzer/config/generate/run_config_generator_factory.py @@ -31,8 +31,8 @@ from model_analyzer.triton.client.client import TritonClient from model_analyzer.triton.model.model_config import ModelConfig -from .brute_plus_binary_parameter_search_run_config_generator import ( - BrutePlusBinaryParameterSearchRunConfigGenerator, +from .brute_plus_binary_search_run_config_generator import ( + BrutePlusBinarySearchRunConfigGenerator, ) from .config_generator_interface import ConfigGeneratorInterface from 
.quick_plus_concurrency_sweep_run_config_generator import (
@@ -96,7 +96,7 @@ def create_run_config_generator(
             model_variant_name_manager=model_variant_name_manager,
         )
     elif command_config.run_config_search_mode == "brute":
-        return RunConfigGeneratorFactory._create_brute_plus_binary_parameter_search_run_config_generator(
+        return RunConfigGeneratorFactory._create_brute_plus_binary_search_run_config_generator(
             command_config=command_config,
             gpus=gpus,
             models=new_models,
@@ -110,7 +110,7 @@ def create_run_config_generator(
         )

     @staticmethod
-    def _create_brute_plus_binary_parameter_search_run_config_generator(
+    def _create_brute_plus_binary_search_run_config_generator(
         command_config: ConfigCommandProfile,
         gpus: List[GPUDevice],
         models: List[ModelProfileSpec],
@@ -118,7 +118,7 @@ def _create_brute_plus_binary_parameter_search_run_config_generator(
         result_manager: ResultManager,
         model_variant_name_manager: ModelVariantNameManager,
     ) -> ConfigGeneratorInterface:
-        return BrutePlusBinaryParameterSearchRunConfigGenerator(
+        return BrutePlusBinarySearchRunConfigGenerator(
             config=command_config,
             gpus=gpus,
             models=models,
diff --git a/model_analyzer/constants.py b/model_analyzer/constants.py
index b98078722..e1bccbb13 100755
--- a/model_analyzer/constants.py
+++ b/model_analyzer/constants.py
@@ -32,7 +32,6 @@

 # Run Search
 THROUGHPUT_MINIMUM_GAIN = 0.05
-THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES = 4
 THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES = 4
 THROUGHPUT_MINIMUM_CONSECUTIVE_NON_PARAMETER_TRIES = 4

diff --git a/model_analyzer/result/parameter_search.py b/model_analyzer/result/inference_load_search.py
similarity index 63%
rename from model_analyzer/result/parameter_search.py
rename to model_analyzer/result/inference_load_search.py
index e716a5b7d..5c7c9598d 100755
--- a/model_analyzer/result/parameter_search.py
+++ b/model_analyzer/result/inference_load_search.py
@@ -21,7 +21,7 @@
 from model_analyzer.config.input.config_command_profile import ConfigCommandProfile
 from model_analyzer.constants import (
     LOGGER_NAME,
-    THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES,
+    THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES,
     THROUGHPUT_MINIMUM_GAIN,
 )
 from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
@@ -30,11 +30,11 @@
 logger = logging.getLogger(LOGGER_NAME)


-class ParameterSearch:
+class InferenceLoadSearch:
     """
-    Generates the next parameter value to use when searching through
+    Generates the next inference load value to use when searching through
     RunConfigMeasurements for the best value (according to the users objective)
-    - Will sweep from by powers of two from min to max parameter
+    - Will sweep by powers of two from min to max inference load

     - If the user specifies a constraint, the algorithm will perform a binary search
     around the boundary if the constraint is violated
@@ -45,43 +45,43 @@ def __init__(
         self,
         config: ConfigCommandProfile,
         model_parameters: dict = {},
-        skip_parameter_sweep: bool = False,
+        skip_inference_load_sweep: bool = False,
     ) -> None:
         """
         Parameters
         ----------
         config: ConfigCommandProfile
             Profile configuration information
-        skip_parameter_sweep: bool
-            If true, skips the parameter sweep and only does the binary search
+        skip_inference_load_sweep: bool
+            If true, skips the inference load sweep and only does the binary search
         """
-        self._skip_parameter_sweep = skip_parameter_sweep
-        self._parameter_is_request_rate = config.is_request_rate_specified(
+        self._skip_inference_load_sweep = skip_inference_load_sweep
+        
self._inference_load_is_request_rate = config.is_request_rate_specified( model_parameters ) - if self._parameter_is_request_rate: - self._min_parameter_index = int( + if self._inference_load_is_request_rate: + self._min_inference_load_index = int( log2(config.run_config_search_min_request_rate) ) - self._max_parameter_index = int( + self._max_inference_load_index = int( log2(config.run_config_search_max_request_rate) ) else: - self._min_parameter_index = int( + self._min_inference_load_index = int( log2(config.run_config_search_min_concurrency) ) - self._max_parameter_index = int( + self._max_inference_load_index = int( log2(config.run_config_search_max_concurrency) ) self._max_binary_search_steps = config.run_config_search_max_binary_search_steps self._run_config_measurements: List[Optional[RunConfigMeasurement]] = [] - self._parameters: List[int] = [] - self._last_failing_parameter = 0 - self._last_passing_parameter = 0 + self._inference_loads: List[int] = [] + self._last_failing_inference_load = 0 + self._last_passing_inference_load = 0 def add_run_config_measurement( self, run_config_measurement: Optional[RunConfigMeasurement] @@ -92,30 +92,31 @@ def add_run_config_measurement( """ self._run_config_measurements.append(run_config_measurement) - def search_parameters(self) -> Generator[int, None, None]: + def search_inference_loads(self) -> Generator[int, None, None]: """ - First performs a parameter sweep, and then, if necessary, perform - a binary parameter search around the point where the constraint - violated + First performs an inference load sweep, and then, if necessary, perform + a binary search around the point where the constraint was violated """ - yield from self._perform_parameter_sweep() + yield from self._perform_inference_load_sweep() if self._was_constraint_violated(): - yield from self._perform_binary_parameter_search() + yield from self._perform_binary_search() - def _perform_parameter_sweep(self) -> Generator[int, None, None]: - for parameter in ( + def _perform_inference_load_sweep(self) -> Generator[int, None, None]: + for inference_load in ( 2**i - for i in range(self._min_parameter_index, self._max_parameter_index + 1) + for i in range( + self._min_inference_load_index, self._max_inference_load_index + 1 + ) ): - if self._should_continue_parameter_sweep(): - self._parameters.append(parameter) - yield parameter + if self._should_continue_inference_load_sweep(): + self._inference_loads.append(inference_load) + yield inference_load else: # We can't actually skip the sweep because the results need to be added # but, we can suppress the logging messages - if not self._skip_parameter_sweep: - if self._parameter_is_request_rate: + if not self._skip_inference_load_sweep: + if self._inference_load_is_request_rate: logger.info( "Terminating request rate sweep - throughput is decreasing" ) @@ -125,7 +126,7 @@ def _perform_parameter_sweep(self) -> Generator[int, None, None]: ) return - def _should_continue_parameter_sweep(self) -> bool: + def _should_continue_inference_load_sweep(self) -> bool: self._check_measurement_count() if not self._are_minimum_tries_reached(): @@ -134,16 +135,16 @@ def _should_continue_parameter_sweep(self) -> bool: return not self._has_objective_gain_saturated() def _check_measurement_count(self) -> None: - if len(self._run_config_measurements) != len(self._parameters): + if len(self._run_config_measurements) != len(self._inference_loads): raise TritonModelAnalyzerException( - f"Internal Measurement count: {self._parameters}, doesn't match number " + 
f"Internal Measurement count: {self._inference_loads}, doesn't match number " f"of measurements added: {len(self._run_config_measurements)}." ) def _are_minimum_tries_reached(self) -> bool: if ( len(self._run_config_measurements) - < THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES + < THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES ): return False else: @@ -155,7 +156,7 @@ def _has_objective_gain_saturated(self) -> bool: def _calculate_gain(self) -> float: first_rcm = self._run_config_measurements[ - -THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES + -THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES ] best_rcm = self._get_best_rcm() @@ -177,7 +178,7 @@ def _get_best_rcm(self) -> Optional[RunConfigMeasurement]: pruned_rcms = [ rcm for rcm in self._run_config_measurements[ - -THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES: + -THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES: ] if rcm ] @@ -188,16 +189,16 @@ def _get_best_rcm(self) -> Optional[RunConfigMeasurement]: def _was_constraint_violated(self) -> bool: for i in range(len(self._run_config_measurements) - 1, 1, -1): if self._at_constraint_failure_boundary(i): - self._last_failing_parameter = self._parameters[i] - self._last_passing_parameter = self._parameters[i - 1] + self._last_failing_inference_load = self._inference_loads[i] + self._last_passing_inference_load = self._inference_loads[i - 1] return True if ( self._run_config_measurements[0] and not self._run_config_measurements[0].is_passing_constraints() ): - self._last_failing_parameter = self._parameters[i] - self._last_passing_parameter = 0 + self._last_failing_inference_load = self._inference_loads[i] + self._last_passing_inference_load = 0 return True else: return False @@ -220,27 +221,31 @@ def _at_constraint_failure_boundary(self, index: int) -> bool: return at_failure_boundary - def _perform_binary_parameter_search(self) -> Generator[int, None, None]: + def _perform_binary_search(self) -> Generator[int, None, None]: # This is needed because we are going to restart the search from the - # parameter that failed - so we expect this to be at the end of the list - self._parameters.append(self._last_failing_parameter) + # inference_load that failed - so we expect this to be at the end of the list + self._inference_loads.append(self._last_failing_inference_load) for i in range(0, self._max_binary_search_steps): - parameter = self._determine_next_binary_parameter() + inference_load = self._determine_next_binary_inference_load() - if parameter != self._parameters[-1]: - self._parameters.append(parameter) - yield parameter + if inference_load != self._inference_loads[-1]: + self._inference_loads.append(inference_load) + yield inference_load - def _determine_next_binary_parameter(self) -> int: + def _determine_next_binary_inference_load(self) -> int: if not self._run_config_measurements[-1]: return 0 if self._run_config_measurements[-1].is_passing_constraints(): - self._last_passing_parameter = self._parameters[-1] - parameter = int((self._last_failing_parameter + self._parameters[-1]) / 2) + self._last_passing_inference_load = self._inference_loads[-1] + inference_load = int( + (self._last_failing_inference_load + self._inference_loads[-1]) / 2 + ) else: - self._last_failing_parameter = self._parameters[-1] - parameter = int((self._last_passing_parameter + self._parameters[-1]) / 2) + self._last_failing_inference_load = self._inference_loads[-1] + inference_load = int( + (self._last_passing_inference_load + self._inference_loads[-1]) / 2 + ) - return parameter + return 
inference_load diff --git a/tests/test_parameter_search.py b/tests/test_inference_load_search.py similarity index 92% rename from tests/test_parameter_search.py rename to tests/test_inference_load_search.py index 7f410bb26..d8643ad66 100755 --- a/tests/test_parameter_search.py +++ b/tests/test_inference_load_search.py @@ -25,17 +25,17 @@ DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, ) -from model_analyzer.constants import THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES +from model_analyzer.constants import THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException from model_analyzer.result.constraint_manager import ConstraintManager -from model_analyzer.result.parameter_search import ParameterSearch +from model_analyzer.result.inference_load_search import InferenceLoadSearch from model_analyzer.result.run_config_measurement import RunConfigMeasurement from .common import test_result_collector as trc from .common.test_utils import construct_run_config_measurement, evaluate_mock_config -class TestParameterSearch(trc.TestResultCollector): +class TestInferenceLoadSearch(trc.TestResultCollector): def setUp(self): self._min_concurrency_index = int(log2(DEFAULT_RUN_CONFIG_MIN_CONCURRENCY)) self._max_concurrency_index = int(log2(DEFAULT_RUN_CONFIG_MAX_CONCURRENCY)) @@ -67,9 +67,9 @@ def test_concurrency_sweep(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) - for concurrency in concurrency_search.search_parameters(): + for concurrency in concurrency_search.search_inference_loads(): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -90,11 +90,11 @@ def test_request_rate_sweep(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch( + concurrency_search = InferenceLoadSearch( config, model_parameters={"request_rate": "True"} ) - for request_rate in concurrency_search.search_parameters(): + for request_rate in concurrency_search.search_inference_loads(): self._request_rates.append(request_rate) concurrency_search.add_run_config_measurement( @@ -115,7 +115,7 @@ def test_saturating_sweep(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) INCREASE_THROUGHPUT_COUNT = 4 # [100, 200, 400, 800, 1000, 1000,...] 
@@ -124,7 +124,7 @@ def test_saturating_sweep(self): for c in range(self._min_concurrency_index, self._max_concurrency_index + 1) ] - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -140,7 +140,7 @@ def test_saturating_sweep(self): 2**c for c in range( INCREASE_THROUGHPUT_COUNT - + THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES + + THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES ) ] self.assertEqual(self._concurrencies, expected_concurrencies) @@ -152,12 +152,12 @@ def test_sweep_with_constraints_decreasing(self): """ config = self._create_single_model_with_constraints("95") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([12, 10, 9]) latencies = [10 * c for c in self._expected_concurrencies] - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -178,12 +178,12 @@ def test_sweep_with_constraints_decrease_then_increase(self): """ config = self._create_single_model_with_constraints("155") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([12, 14, 15]) latencies = [10 * c for c in self._expected_concurrencies] - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -204,14 +204,14 @@ def test_sweep_with_multiple_violation_areas(self): """ config = self._create_single_model_with_constraints("155") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([12, 14, 15]) latencies = [10 * c for c in self._expected_concurrencies] # this adds an early constraint violation which should be ignored latencies[1] = 200 - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -233,12 +233,12 @@ def test_sweep_with_constraints_hitting_limit(self): """ config = self._create_single_model_with_constraints("970") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([768, 896, 960, 992, 976]) latencies = self._expected_concurrencies - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -258,10 +258,10 @@ def test_not_adding_measurements(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) with 
self.assertRaises(TritonModelAnalyzerException): - for concurrency in concurrency_search.search_parameters(): + for concurrency in concurrency_search.search_inference_loads(): self._concurrencies.append(concurrency) if concurrency < 32: From fbe1abf7595035e46e41100546ff8ca35c9b61b8 Mon Sep 17 00:00:00 2001 From: braf Date: Wed, 11 Oct 2023 19:52:25 +0000 Subject: [PATCH 20/23] Changing prompt length to text input length --- .../perf_analyzer_config_generator.py | 41 +++++++++-------- .../config/input/config_command_profile.py | 46 ++++++++++--------- .../config/input/config_defaults.py | 4 +- tests/test_cli.py | 6 +-- tests/test_config.py | 42 ++++++++--------- tests/test_perf_analyzer_config_generator.py | 14 +++--- 6 files changed, 79 insertions(+), 74 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index e7596a9af..aa8981393 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -16,6 +16,7 @@ import json import logging +from itertools import repeat from typing import Dict, Generator, List, Optional, Tuple from model_analyzer.config.input.config_command_profile import ConfigCommandProfile @@ -64,7 +65,7 @@ def __init__( custom perf analyzer configuration model_parameters: Dict - model constraints for batch sizes, concurrency, request rate, prompt length, etc.. + model constraints for batch sizes, concurrency, request rate, text input length, etc.. early_exit_enable: Bool If true, this class can early exit during search of concurrency/request rate @@ -102,7 +103,7 @@ def __init__( self._inference_loads = self._create_inference_load_list() self._batch_sizes = sorted(model_parameters["batch_sizes"]) - self._prompt_lengths = self._create_prompt_length_list() + self._text_input_lengths = self._create_text_input_length_list() self._max_token_counts = self._create_max_token_count_list() self._perf_config_non_parameter_values = ( @@ -190,7 +191,7 @@ def set_last_results( self._inference_load_results.extend(measurement) def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: Dict) -> Dict: - # For LLM models we will be creating custom input data based on prompt length + # For LLM models we will be creating custom input data based on text input length perf_analyzer_flags = {k: v for k, v in model_perf_analyzer_flags.items()} if self._cli_config.is_llm_model(): @@ -238,18 +239,18 @@ def _create_concurrency_list(self) -> List[int]: self._cli_config.run_config_search_max_concurrency, ) - def _create_prompt_length_list(self) -> List[int]: + def _create_text_input_length_list(self) -> List[int]: if not self._cli_config.is_llm_model(): return [] - if self._model_parameters["prompt_length"]: - return sorted(self._model_parameters["prompt_length"]) + if self._model_parameters["text_input_length"]: + return sorted(self._model_parameters["text_input_length"]) elif self._cli_config.run_config_search_disable: return [1] else: return utils.generate_doubled_list( - self._cli_config.run_config_search_min_prompt_length, - self._cli_config.run_config_search_max_prompt_length, + self._cli_config.run_config_search_min_text_input_length, + self._cli_config.run_config_search_max_text_input_length, ) def _create_max_token_count_list(self) -> List[int]: @@ -286,16 +287,16 @@ def _create_new_perf_config( perf_config = self._create_base_perf_config() ( - prompt_length, + text_input_length, 
modified_non_parameter_combination, - ) = self._extract_prompt_length(non_parameter_combination) + ) = self._extract_text_input_length(non_parameter_combination) self._update_perf_config_based_on_non_parameter_combination( perf_config, modified_non_parameter_combination ) self._update_perf_config_based_on_inference_load(perf_config, inference_load) self._update_perf_config_based_on_perf_analyzer_flags(perf_config) - self._update_perf_config_for_llm_model(perf_config, prompt_length) + self._update_perf_config_for_llm_model(perf_config, text_input_length) return perf_config @@ -307,7 +308,7 @@ def _create_base_perf_config(self) -> PerfAnalyzerConfig: return perf_config - def _extract_prompt_length( + def _extract_text_input_length( self, non_parameter_combination: Dict ) -> Tuple[int, Dict]: if not self._cli_config.is_llm_model(): @@ -316,8 +317,8 @@ def _extract_prompt_length( modified_non_parameter_combination = { k: v for k, v in non_parameter_combination.items() } - prompt_length = modified_non_parameter_combination.pop("prompt-length") - return prompt_length, modified_non_parameter_combination + text_input_length = modified_non_parameter_combination.pop("text-input-length") + return text_input_length, modified_non_parameter_combination def _update_perf_config_based_on_non_parameter_combination( self, perf_config: PerfAnalyzerConfig, non_parameter_combination: Dict @@ -340,21 +341,21 @@ def _update_perf_config_based_on_inference_load( perf_config.update_config({"concurrency-range": inference_load}) def _update_perf_config_for_llm_model( - self, perf_config: PerfAnalyzerConfig, prompt_length: int + self, perf_config: PerfAnalyzerConfig, text_input_length: int ) -> None: if not self._cli_config.is_llm_model(): return - modified_input_dict = self._modify_prompt_in_input_dict(prompt_length) + modified_input_dict = self._modify_text_in_input_dict(text_input_length) self._write_modified_input_dict_to_file(modified_input_dict) perf_config.update_config({"input-data": self._input_json_filename}) - def _modify_prompt_in_input_dict(self, prompt_length: int) -> Dict: - modified_prompt = ["hi"] * prompt_length + def _modify_text_in_input_dict(self, text_input_length: int) -> Dict: + modified_text = " ".join(repeat("Hello", text_input_length)) modified_input_dict = {k: v for k, v in self._llm_input_dict.items()} - modified_input_dict["data"][0]["PROMPT"] = modified_prompt + modified_input_dict["data"][0]["text-input"] = modified_text return modified_input_dict @@ -369,7 +370,7 @@ def _create_non_parameter_perf_config_values(self) -> dict: if self._cli_config.is_llm_model(): perf_config_values["max-token-count"] = self._max_token_counts - perf_config_values["prompt-length"] = self._prompt_lengths + perf_config_values["text-input-length"] = self._text_input_lengths return perf_config_values diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py index 317987732..a215a2251 100755 --- a/model_analyzer/config/input/config_command_profile.py +++ b/model_analyzer/config/input/config_command_profile.py @@ -498,7 +498,7 @@ def _add_profile_models_configs(self): "batch_sizes": ConfigListNumeric(type_=int), "concurrency": ConfigListNumeric(type_=int), "request_rate": ConfigListNumeric(type_=int), - "prompt_length": ConfigListNumeric(type_=int), + "text_input_length": ConfigListNumeric(type_=int), "max_token_count": ConfigListNumeric(type_=int), } ), @@ -573,10 +573,10 @@ def _add_profile_models_configs(self): ) self._add_config( ConfigField( - 
"prompt_length", - flags=["--prompt-length"], + "text_input_length", + flags=["--text-input-length"], field_type=ConfigListNumeric(int), - description="Comma-delimited list of prompt length values or ranges " + description="Comma-delimited list of text input length values or ranges " " to be used during profiling LLMs", ) ) @@ -813,25 +813,25 @@ def _add_run_search_configs(self): field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, default_value=config_defaults.DEFAULT_LLM_SEARCH_ENABLE, - description="Enables searching values are important to LLMs: prompt length, max token, etc...", + description="Enables searching values are important to LLMs: text input length, max token, etc...", ) ) self._add_config( ConfigField( - "run_config_search_min_prompt_length", - flags=["--run-config-search-min-prompt-length"], + "run_config_search_min_text_input_length", + flags=["--run-config-search-min-text-input-length"], field_type=ConfigPrimitive(int), - default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH, - description="Min prompt length that run config search should start with.", + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH, + description="Min text input length that run config search should start with.", ) ) self._add_config( ConfigField( - "run_config_search_max_prompt_length", - flags=["--run-config-search-max-prompt-length"], + "run_config_search_max_text_input_length", + flags=["--run-config-search-max-text-input-length"], field_type=ConfigPrimitive(int), - default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH, - description="Max prompt length that run config search will not go beyond.", + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH, + description="Max text input length that run config search will not go beyond.", ) ) self._add_config( @@ -1421,7 +1421,7 @@ def _autofill_values(self): "batch_sizes": self.batch_sizes, "concurrency": self.concurrency, "request_rate": self.request_rate, - "prompt_length": self.prompt_length, + "text_input_length": self.text_input_length, "max_token_count": self.max_token_count, } else: @@ -1447,13 +1447,13 @@ def _autofill_values(self): else: new_model["parameters"].update({"request_rate": self.request_rate}) - if "prompt_length" in model.parameters(): + if "text_input_length" in model.parameters(): new_model["parameters"].update( - {"prompt_length": model.parameters()["prompt_length"]} + {"text_input_length": model.parameters()["text_input_length"]} ) else: new_model["parameters"].update( - {"prompt_length": self.prompt_length} + {"text_input_length": self.text_input_length} ) if "max_token_count" in model.parameters(): @@ -1462,7 +1462,7 @@ def _autofill_values(self): ) else: new_model["parameters"].update( - {"max_token_count": self.prompt_length} + {"max_token_count": self.text_input_length} ) if ( @@ -1552,10 +1552,14 @@ def is_llm_model(self) -> bool: """ return ( self.llm_search_enable - or self.get_config()["run_config_search_min_prompt_length"].is_set_by_user() - or self.get_config()["run_config_search_max_prompt_length"].is_set_by_user() + or self.get_config()[ + "run_config_search_min_text_input_length" + ].is_set_by_user() + or self.get_config()[ + "run_config_search_max_text_input_length" + ].is_set_by_user() or self.get_config()["run_config_search_min_token_count"].is_set_by_user() or self.get_config()["run_config_search_max_token_count"].is_set_by_user() - or self.get_config()["prompt_length"].is_set_by_user() + or 
self.get_config()["text_input_length"].is_set_by_user() or self.get_config()["max_token_count"].is_set_by_user() ) diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py index 4be24fd2b..7e37f7c7d 100755 --- a/model_analyzer/config/input/config_defaults.py +++ b/model_analyzer/config/input/config_defaults.py @@ -52,8 +52,8 @@ DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1 DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE = 128 DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5 -DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH = 1 -DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH = 1024 +DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH = 1 +DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH = 1024 DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT = 1 DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT = 256 DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False diff --git a/tests/test_cli.py b/tests/test_cli.py index 75be15038..94dbf0b21 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -85,8 +85,8 @@ def get_test_options(): OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT)), OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT)), OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS)), - OptionStruct("int", "profile", "--run-config-search-min-prompt-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH)), - OptionStruct("int", "profile", "--run-config-search-max-prompt-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH)), + OptionStruct("int", "profile", "--run-config-search-min-text-input-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH)), + OptionStruct("int", "profile", "--run-config-search-max-text-input-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH)), OptionStruct("int", "profile", "--run-config-search-min-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT)), OptionStruct("int", "profile", "--run-config-search-max-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT)), OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", str(config_defaults.DEFAULT_MONITORING_INTERVAL)), @@ -140,7 +140,7 @@ def get_test_options(): OptionStruct("intlist", "profile", "--batch-sizes", "-b", "2, 4, 6", "1"), OptionStruct("intlist", "profile", "--concurrency", "-c", "1, 2, 3", None), OptionStruct("intlist", "profile", "--request-rate", None, "1, 2, 3", None), - OptionStruct("intlist", "profile", "--prompt-length", None, "1, 2, 3", None), + OptionStruct("intlist", "profile", "--text-input-length", None, "1, 2, 3", None), OptionStruct("intlist", "profile", "--max-token-count", None, "1, 2, 3", None), OptionStruct("stringlist", "profile", "--triton-docker-mounts", None, "a:b:c, d:e:f", None, extra_commands=["--triton-launch-mode", "docker"]), OptionStruct("stringlist", "profile", "--gpus", None, "a, b, c", "all"), diff --git a/tests/test_config.py b/tests/test_config.py index 2e95d3d4d..01dc739d8 100755 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -292,7 +292,7 @@ def test_range_and_list_values(self): "batch_sizes": [1], "concurrency": [], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, 
@@ -303,7 +303,7 @@ def test_range_and_list_values(self): "batch_sizes": [1], "concurrency": [], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, @@ -442,7 +442,7 @@ def test_object(self): "batch_sizes": [1], "concurrency": [1, 2, 3, 4], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, @@ -453,7 +453,7 @@ def test_object(self): "batch_sizes": [1], "concurrency": [], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, @@ -509,7 +509,7 @@ def test_object(self): "batch_sizes": [1], "concurrency": [1, 2, 3, 4], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, @@ -520,7 +520,7 @@ def test_object(self): "concurrency": [1, 2, 3, 4], "batch_sizes": [2, 4, 6], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, @@ -593,7 +593,7 @@ def test_constraints(self): "batch_sizes": [1], "concurrency": [1, 2, 3, 4], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10, "gpu_used_memory": 5}, @@ -609,7 +609,7 @@ def test_constraints(self): "batch_sizes": [1], "concurrency": [], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, @@ -733,7 +733,7 @@ def test_config_model(self): "batch_sizes": [1], "concurrency": [], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, @@ -764,7 +764,7 @@ def test_config_model(self): "batch_sizes": [1], "concurrency": [], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, @@ -806,7 +806,7 @@ def test_config_model(self): "batch_sizes": [1], "concurrency": [], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, @@ -855,7 +855,7 @@ def test_config_model(self): "batch_sizes": [1], "concurrency": [], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, @@ -891,7 +891,7 @@ def test_config_model(self): "batch_sizes": [1], "concurrency": [], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, @@ -940,7 +940,7 @@ def test_config_model(self): "batch_sizes": [1], "concurrency": [], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, @@ -972,7 +972,7 @@ def test_config_model(self): "batch_sizes": [1], "concurrency": [], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, @@ -1249,7 +1249,7 @@ def test_autofill(self): "batch_sizes": [1], "concurrency": [], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10}, @@ -1295,7 +1295,7 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [2, 4], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, 
objectives={"perf_throughput": 10, "gpu_used_memory": 5}, @@ -1345,7 +1345,7 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [2, 4], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"gpu_used_memory": 10}, @@ -1391,7 +1391,7 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [2, 4], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"gpu_used_memory": 10}, @@ -1448,7 +1448,7 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [5, 6, 7], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"gpu_used_memory": 10}, @@ -1463,7 +1463,7 @@ def test_autofill(self): "batch_sizes": [1, 2], "concurrency": [2, 4], "request_rate": [], - "prompt_length": [], + "text_input_length": [], "max_token_count": [], }, objectives={"perf_throughput": 10, "perf_latency_p99": 5}, diff --git a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py index 619f6e23f..79cce3de0 100755 --- a/tests/test_perf_analyzer_config_generator.py +++ b/tests/test_perf_analyzer_config_generator.py @@ -564,7 +564,7 @@ def test_llm_search_max_token_count(self): Test LLM Search: - max token count 1->256 - Concurrency and prompt length max set to 1 + Concurrency and text input length max set to 1 """ # yapf: disable @@ -586,17 +586,17 @@ def test_llm_search_max_token_count(self): "--llm-search-enable", "--run-config-search-max-concurrency", "1", - "--run-config-search-max-prompt-length", + "--run-config-search-max-text-input-length", "1", ] self._run_and_test_perf_analyzer_config_generator( yaml_str, expected_configs, pa_cli_args ) - def test_llm_search_prompt_length(self): + def test_llm_search_text_input_length(self): """ Test LLM Search: - - Prompt length 1->1024 + - Input length 1->1024 Concurrency and max token count set to 1 """ @@ -610,10 +610,10 @@ def test_llm_search_prompt_length(self): """) # yapf: enable - prompt_lengths = utils.generate_doubled_list(1, 1024) + text_input_lengths = utils.generate_doubled_list(1, 1024) expected_configs = [ construct_perf_analyzer_config(llm_search_mode=True) - for pl in prompt_lengths + for pl in text_input_lengths ] pa_cli_args = [ @@ -919,7 +919,7 @@ def setUp(self): self.mock_os.start() self._input_data = """{ - "data": [{"PROMPT": ["Hello, my name is"], "STREAM": [true]}] + "data": [{"text_input": ["Hello, my name is"], "stream": [true]}] }""" def tearDown(self): From abec25d515a4e7989a07e16d96fc9a5e4f31e968 Mon Sep 17 00:00:00 2001 From: braf Date: Wed, 11 Oct 2023 20:05:28 +0000 Subject: [PATCH 21/23] Changing max_tokens to use request-parameter --- .../config/generate/perf_analyzer_config_generator.py | 4 +++- model_analyzer/perf_analyzer/perf_config.py | 4 ++-- tests/common/test_utils.py | 4 +++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index aa8981393..4b759bee7 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -369,7 +369,9 @@ def _create_non_parameter_perf_config_values(self) -> dict: } if self._cli_config.is_llm_model(): - perf_config_values["max-token-count"] = self._max_token_counts + perf_config_values["request-parameter"] = [ + "max_token:" + str(mtc) + ":int" for mtc 
in self._max_token_counts + ] perf_config_values["text-input-length"] = self._text_input_lengths return perf_config_values diff --git a/model_analyzer/perf_analyzer/perf_config.py b/model_analyzer/perf_analyzer/perf_config.py index c9865f515..7cab2dd3c 100755 --- a/model_analyzer/perf_analyzer/perf_config.py +++ b/model_analyzer/perf_analyzer/perf_config.py @@ -72,7 +72,7 @@ class PerfAnalyzerConfig: "metrics-url", "metrics-interval", "bls-composing-models", - "max-token-count", + "request-parameter", ] input_to_options = [ @@ -276,7 +276,7 @@ def extract_model_specific_parameters(self): "concurrency-range": self._args["concurrency-range"], "request-rate-range": self._args["request-rate-range"], "periodic-concurrency-range": self._args["periodic-concurrency-range"], - "max-token-count": self._args["max-token-count"], + "max-tokens": self._args["request-parameter"], } @classmethod diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index 8a6c76ed3..caa9763ce 100755 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -287,7 +287,9 @@ def construct_perf_analyzer_config( pa_config._args["concurrency-range"] = concurrency if llm_search_mode: - pa_config._args["max-token-count"] = max_token_count + pa_config._args["request-parameter"] = ( + "max_token:" + str(max_token_count) + ":int" + ) pa_config._args["input-data"] = DEFAULT_INPUT_JSON_PATH + "/input-data.json" pa_config._args["measurement-mode"] = DEFAULT_MEASUREMENT_MODE From f8729dbbb7b47fb5c2f571d21a447beb52ba2627 Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 12 Oct 2023 15:28:21 +0000 Subject: [PATCH 22/23] Fix input-data typo --- tests/test_perf_analyzer_config_generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py index 79cce3de0..69e42ef8d 100755 --- a/tests/test_perf_analyzer_config_generator.py +++ b/tests/test_perf_analyzer_config_generator.py @@ -570,7 +570,7 @@ def test_llm_search_max_token_count(self): # yapf: disable yaml_str = (""" perf_analyzer_flags: - input-data: input_data.json + input-data: input-data.json profile_models: - my-model """) @@ -604,7 +604,7 @@ def test_llm_search_text_input_length(self): # yapf: disable yaml_str = (""" perf_analyzer_flags: - input-data: input_data.json + input-data: input-data.json profile_models: - my-model """) From 2cda3df079a0fb76682f14c5a7230570c7d8a756 Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 12 Oct 2023 15:34:13 +0000 Subject: [PATCH 23/23] Changing non-parameter to parameter --- .../perf_analyzer_config_generator.py | 87 +++++++++---------- model_analyzer/constants.py | 2 +- 2 files changed, 44 insertions(+), 45 deletions(-) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 4b759bee7..f17c2bc18 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -24,7 +24,7 @@ from model_analyzer.constants import ( LOGGER_NAME, THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, - THROUGHPUT_MINIMUM_CONSECUTIVE_NON_PARAMETER_TRIES, + THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, THROUGHPUT_MINIMUM_GAIN, ) from model_analyzer.perf_analyzer.perf_config import PerfAnalyzerConfig @@ -75,9 +75,12 @@ def __init__( # All configs are pregenerated in _configs[][] # Indexed as follows: - # _configs[_curr_non_parameter_index][_curr_inference_load_index] 
+ # _configs[_curr_parameter_index][_curr_inference_load_index] # - self._curr_non_parameter_index = 0 + # Parameters are: batch size, text input length, max token size + # Inference load are: concurrency/periodic-concurrency, request-rate + # + self._curr_parameter_index = 0 self._curr_inference_load_index = 0 self._configs: List[List[PerfAnalyzerConfig]] = [] self._inference_load_warning_printed = False @@ -88,7 +91,7 @@ def __init__( self._last_results: List[RunConfigMeasurement] = [] self._inference_load_results: List[Optional[RunConfigMeasurement]] = [] - self._non_parameter_results: List[Optional[RunConfigMeasurement]] = [] + self._parameter_results: List[Optional[RunConfigMeasurement]] = [] self._model_name = model_name self._cli_config = cli_config @@ -106,13 +109,9 @@ def __init__( self._text_input_lengths = self._create_text_input_length_list() self._max_token_counts = self._create_max_token_count_list() - self._perf_config_non_parameter_values = ( - self._create_non_parameter_perf_config_values() - ) - self._non_parameter_count = len( - utils.generate_parameter_combinations( - self._perf_config_non_parameter_values - ) + self._perf_config_parameter_values = self._create_parameter_perf_config_values() + self._parameter_count = len( + utils.generate_parameter_combinations(self._perf_config_parameter_values) ) self._input_json_filename = DEFAULT_INPUT_JSON_PATH + "/input-data.json" @@ -158,7 +157,7 @@ def get_configs(self) -> Generator[PerfAnalyzerConfig, None, None]: break self._generator_started = True - config = self._configs[self._curr_non_parameter_index][ + config = self._configs[self._curr_parameter_index][ self._curr_inference_load_index ] yield (config) @@ -268,31 +267,31 @@ def _create_max_token_count_list(self) -> List[int]: ) def _generate_perf_configs(self) -> None: - non_parameter_combinations = utils.generate_parameter_combinations( - self._perf_config_non_parameter_values + parameter_combinations = utils.generate_parameter_combinations( + self._perf_config_parameter_values ) - for non_parameter_combination in non_parameter_combinations: + for parameter_combination in parameter_combinations: perf_configs_for_a_given_combination = [] for inference_load in self._inference_loads: new_perf_config = self._create_new_perf_config( - inference_load, non_parameter_combination + inference_load, parameter_combination ) perf_configs_for_a_given_combination.append(new_perf_config) self._configs.append(perf_configs_for_a_given_combination) def _create_new_perf_config( - self, inference_load: int, non_parameter_combination: Dict + self, inference_load: int, parameter_combination: Dict ) -> PerfAnalyzerConfig: perf_config = self._create_base_perf_config() ( text_input_length, - modified_non_parameter_combination, - ) = self._extract_text_input_length(non_parameter_combination) + modified_parameter_combination, + ) = self._extract_text_input_length(parameter_combination) - self._update_perf_config_based_on_non_parameter_combination( - perf_config, modified_non_parameter_combination + self._update_perf_config_based_on_parameter_combination( + perf_config, modified_parameter_combination ) self._update_perf_config_based_on_inference_load(perf_config, inference_load) self._update_perf_config_based_on_perf_analyzer_flags(perf_config) @@ -309,21 +308,21 @@ def _create_base_perf_config(self) -> PerfAnalyzerConfig: return perf_config def _extract_text_input_length( - self, non_parameter_combination: Dict + self, parameter_combination: Dict ) -> Tuple[int, Dict]: if not 
self._cli_config.is_llm_model(): - return 0, non_parameter_combination + return 0, parameter_combination - modified_non_parameter_combination = { - k: v for k, v in non_parameter_combination.items() + modified_parameter_combination = { + k: v for k, v in parameter_combination.items() } - text_input_length = modified_non_parameter_combination.pop("text-input-length") - return text_input_length, modified_non_parameter_combination + text_input_length = modified_parameter_combination.pop("text-input-length") + return text_input_length, modified_parameter_combination - def _update_perf_config_based_on_non_parameter_combination( - self, perf_config: PerfAnalyzerConfig, non_parameter_combination: Dict + def _update_perf_config_based_on_parameter_combination( + self, perf_config: PerfAnalyzerConfig, parameter_combination: Dict ) -> None: - perf_config.update_config(non_parameter_combination) + perf_config.update_config(parameter_combination) def _update_perf_config_based_on_perf_analyzer_flags( self, perf_config: PerfAnalyzerConfig @@ -363,7 +362,7 @@ def _write_modified_input_dict_to_file(self, modified_input_dict: Dict) -> None: with open(self._input_json_filename, "w") as f: json.dump(modified_input_dict, f) - def _create_non_parameter_perf_config_values(self) -> dict: + def _create_parameter_perf_config_values(self) -> dict: perf_config_values = { "batch-size": self._batch_sizes, } @@ -380,15 +379,15 @@ def _step(self) -> None: self._step_inference_load() if self._done_walking_inference_loads(): - self._add_best_throughput_to_non_parameter_results() + self._add_best_throughput_to_parameter_results() self._reset_inference_loads() - self._step_non_parameter() + self._step_parameter() - def _add_best_throughput_to_non_parameter_results(self) -> None: + def _add_best_throughput_to_parameter_results(self) -> None: if self._inference_load_results: # type is List[Optional[RCM]] best = max(self._inference_load_results) # type: ignore - self._non_parameter_results.append(best) + self._parameter_results.append(best) def _reset_inference_loads(self) -> None: self._curr_inference_load_index = 0 @@ -398,11 +397,11 @@ def _reset_inference_loads(self) -> None: def _step_inference_load(self) -> None: self._curr_inference_load_index += 1 - def _step_non_parameter(self) -> None: - self._curr_non_parameter_index += 1 + def _step_parameter(self) -> None: + self._curr_parameter_index += 1 def _done_walking(self) -> bool: - return self._done_walking_non_parameters() + return self._done_walking_parameters() def _done_walking_inference_loads(self) -> bool: if len(self._inference_loads) == self._curr_inference_load_index: @@ -421,11 +420,11 @@ def _done_walking_inference_loads(self) -> bool: return True return False - def _done_walking_non_parameters(self) -> bool: - if self._non_parameter_count == self._curr_non_parameter_index: + def _done_walking_parameters(self) -> bool: + if self._parameter_count == self._curr_parameter_index: return True - if self._early_exit_enable and not self._non_parameter_throughput_gain_valid(): + if self._early_exit_enable and not self._parameter_throughput_gain_valid(): logger.info( "No longer increasing client batch size as throughput has plateaued" ) @@ -444,10 +443,10 @@ def _inference_load_throughput_gain_valid(self) -> bool: min_gain=THROUGHPUT_MINIMUM_GAIN, ) - def _non_parameter_throughput_gain_valid(self) -> bool: + def _parameter_throughput_gain_valid(self) -> bool: """Check if any of the last X non-parameter results resulted in valid gain""" return 
PerfAnalyzerConfigGenerator.throughput_gain_valid_helper( - throughputs=self._non_parameter_results, - min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_NON_PARAMETER_TRIES, + throughputs=self._parameter_results, + min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, min_gain=THROUGHPUT_MINIMUM_GAIN, ) diff --git a/model_analyzer/constants.py b/model_analyzer/constants.py index e1bccbb13..09f581326 100755 --- a/model_analyzer/constants.py +++ b/model_analyzer/constants.py @@ -33,7 +33,7 @@ # Run Search THROUGHPUT_MINIMUM_GAIN = 0.05 THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES = 4 -THROUGHPUT_MINIMUM_CONSECUTIVE_NON_PARAMETER_TRIES = 4 +THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES = 4 # Quick search algorithm constants RADIUS = 3
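
An illustrative sketch (Python; not part of the patch series): patch 21 above replaces the dedicated max-token-count flag with perf_analyzer's generic request-parameter option, constructed as "max_token:" + str(mtc) + ":int". The snippet below shows, under stated assumptions, how those swept values could surface as perf_analyzer invocations. generate_doubled_list is re-implemented here from its observed usage in the tests (doubling from the minimum up to the maximum); the model name "my-model" comes from the test YAML; the final command-line rendering is an assumption for illustration only, not taken from the patches.

def generate_doubled_list(min_value: int, max_value: int) -> list:
    """Assumed behavior: double from min_value until max_value is exceeded."""
    values = []
    current = min_value
    while current <= max_value:
        values.append(current)
        current *= 2
    return values


# Candidate max token counts for the LLM search (1 -> 256, as in the test case).
max_token_counts = generate_doubled_list(1, 256)

# Mirror the construction added in patch 21 (in the method later renamed to
# _create_parameter_perf_config_values by patch 23): each count becomes one
# "max_token:<value>:int" request parameter.
request_parameters = ["max_token:" + str(mtc) + ":int" for mtc in max_token_counts]

# Hypothetical rendering of the resulting perf_analyzer invocations.
for rp in request_parameters:
    print("perf_analyzer -m my-model --request-parameter " + rp)

Running the sketch prints one invocation per candidate max token value, which is roughly how the max token sweep is expected to reach perf_analyzer once the generated configs are executed.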