diff --git a/model_analyzer/config/generate/automatic_model_config_generator.py b/model_analyzer/config/generate/automatic_model_config_generator.py index 79925cb7d..283f112d0 100755 --- a/model_analyzer/config/generate/automatic_model_config_generator.py +++ b/model_analyzer/config/generate/automatic_model_config_generator.py @@ -79,10 +79,7 @@ def __init__( logger.info("") AutomaticModelConfigGenerator._log_first_run = True - self._max_instance_count = config.run_config_search_max_instance_count - self._min_instance_count = config.run_config_search_min_instance_count - self._max_model_batch_size = config.run_config_search_max_model_batch_size - self._min_model_batch_size = config.run_config_search_min_model_batch_size + self._set_min_max_search_values(config) self._instance_kind = "KIND_CPU" if self._cpu_only else "KIND_GPU" @@ -162,3 +159,9 @@ def _get_curr_param_combo(self) -> Dict: config["dynamic_batching"] = {} return config + + def _set_min_max_search_values(self, config: ConfigCommandProfile) -> None: + self._max_instance_count = config.run_config_search_max_instance_count + self._min_instance_count = config.run_config_search_min_instance_count + self._max_model_batch_size = config.run_config_search_max_model_batch_size + self._min_model_batch_size = config.run_config_search_min_model_batch_size diff --git a/model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py b/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py similarity index 86% rename from model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py rename to model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py index b0a217274..78d55a1bc 100755 --- a/model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +++ b/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py @@ -29,7 +29,7 @@ from model_analyzer.config.run.run_config import RunConfig from model_analyzer.constants import LOGGER_NAME from model_analyzer.device.gpu_device import GPUDevice -from model_analyzer.result.parameter_search import ParameterSearch +from model_analyzer.result.inference_load_search import InferenceLoadSearch from model_analyzer.result.result_manager import ResultManager from model_analyzer.result.run_config_measurement import RunConfigMeasurement from model_analyzer.triton.client.client import TritonClient @@ -39,10 +39,10 @@ logger = logging.getLogger(LOGGER_NAME) -class BrutePlusBinaryParameterSearchRunConfigGenerator(ConfigGeneratorInterface): +class BrutePlusBinarySearchRunConfigGenerator(ConfigGeneratorInterface): """ First run BruteRunConfigGenerator for a brute search, then for - automatic searches use ParameterSearch to perform a binary search + automatic searches use InferenceLoadSearch to perform a binary search """ def __init__( @@ -132,17 +132,19 @@ def _binary_search_over_top_results(self) -> Generator[RunConfig, None, None]: for result in top_results: run_config = deepcopy(result.run_config()) model_parameters = self._get_model_parameters(model_name) - parameter_search = ParameterSearch( + inference_load_search = InferenceLoadSearch( config=self._config, model_parameters=model_parameters, - skip_parameter_sweep=True, + skip_inference_load_sweep=True, ) - for parameter in parameter_search.search_parameters(): - run_config = self._set_parameter( - run_config, model_parameters, parameter + for inference_load in inference_load_search.search_inference_loads(): + 
run_config = self._set_inference_load( + run_config, model_parameters, inference_load ) yield run_config - parameter_search.add_run_config_measurement(self._last_measurement) + inference_load_search.add_run_config_measurement( + self._last_measurement + ) def _get_model_parameters(self, model_name: str) -> Dict: for model in self._models: @@ -151,14 +153,14 @@ def _get_model_parameters(self, model_name: str) -> Dict: return {} - def _set_parameter( - self, run_config: RunConfig, model_parameters: Dict, parameter: int + def _set_inference_load( + self, run_config: RunConfig, model_parameters: Dict, inference_load: int ) -> RunConfig: for model_run_config in run_config.model_run_configs(): perf_config = model_run_config.perf_config() if self._config.is_request_rate_specified(model_parameters): - perf_config.update_config({"request-rate-range": parameter}) + perf_config.update_config({"request-rate-range": inference_load}) else: - perf_config.update_config({"concurrency-range": parameter}) + perf_config.update_config({"concurrency-range": inference_load}) return run_config diff --git a/model_analyzer/config/generate/model_run_config_generator.py b/model_analyzer/config/generate/model_run_config_generator.py index b068c7577..529fa5b83 100755 --- a/model_analyzer/config/generate/model_run_config_generator.py +++ b/model_analyzer/config/generate/model_run_config_generator.py @@ -150,5 +150,13 @@ def _determine_early_exit_enables( concurrency_specified = model.parameters()["concurrency"] config_parameters_exist = model.model_config_parameters() - self._pacg_early_exit_enable = early_exit_enable or not concurrency_specified - self._mcg_early_exit_enable = early_exit_enable or not config_parameters_exist + if config.is_llm_model(): + self._pacg_early_exit_enable = False + self._mcg_early_exit_enable = False + else: + self._pacg_early_exit_enable = ( + early_exit_enable or not concurrency_specified + ) + self._mcg_early_exit_enable = ( + early_exit_enable or not config_parameters_exist + ) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 985032564..f17c2bc18 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -14,13 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import logging -from typing import Generator, List, Optional +from itertools import repeat +from typing import Dict, Generator, List, Optional, Tuple from model_analyzer.config.input.config_command_profile import ConfigCommandProfile +from model_analyzer.config.input.config_defaults import DEFAULT_INPUT_JSON_PATH from model_analyzer.constants import ( LOGGER_NAME, - THROUGHPUT_MINIMUM_CONSECUTIVE_BATCH_SIZE_TRIES, + THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, THROUGHPUT_MINIMUM_GAIN, ) @@ -62,7 +65,7 @@ def __init__( custom perf analyzer configuration model_parameters: Dict - model constraints for batch_sizes, concurrency and/or request rate + model constraints for batch sizes, concurrency, request rate, text input length, etc.. 
early_exit_enable: Bool If true, this class can early exit during search of concurrency/request rate @@ -72,35 +75,53 @@ def __init__( # All configs are pregenerated in _configs[][] # Indexed as follows: - # _configs[_curr_batch_size_index][_curr_parameter_index] + # _configs[_curr_parameter_index][_curr_inference_load_index] + # + # Parameters are: batch size, text input length, max token size + # Inference load are: concurrency/periodic-concurrency, request-rate # self._curr_parameter_index = 0 - self._curr_batch_size_index = 0 + self._curr_inference_load_index = 0 self._configs: List[List[PerfAnalyzerConfig]] = [] - self._parameter_warning_printed = False + self._inference_load_warning_printed = False # Flag to indicate we have started to return results # self._generator_started = False self._last_results: List[RunConfigMeasurement] = [] + self._inference_load_results: List[Optional[RunConfigMeasurement]] = [] self._parameter_results: List[Optional[RunConfigMeasurement]] = [] - self._batch_size_results: List[Optional[RunConfigMeasurement]] = [] self._model_name = model_name - self._perf_analyzer_flags = model_perf_analyzer_flags - - self._batch_sizes = sorted(model_parameters["batch_sizes"]) self._cli_config = cli_config + self._llm_input_dict = self._create_input_dict(model_perf_analyzer_flags) + + self._perf_analyzer_flags = self._set_perf_analyzer_flags( + model_perf_analyzer_flags + ) + self._model_parameters = model_parameters - self._parameters = self._create_parameter_list() + self._inference_loads = self._create_inference_load_list() + + self._batch_sizes = sorted(model_parameters["batch_sizes"]) + self._text_input_lengths = self._create_text_input_length_list() + self._max_token_counts = self._create_max_token_count_list() + + self._perf_config_parameter_values = self._create_parameter_perf_config_values() + self._parameter_count = len( + utils.generate_parameter_combinations(self._perf_config_parameter_values) + ) + + self._input_json_filename = DEFAULT_INPUT_JSON_PATH + "/input-data.json" + self._generate_perf_configs() @staticmethod def throughput_gain_valid_helper( throughputs: List[Optional[RunConfigMeasurement]], - min_tries: int = THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, + min_tries: int = THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, min_gain: float = THROUGHPUT_MINIMUM_GAIN, ) -> bool: if len(throughputs) < min_tries: @@ -136,8 +157,8 @@ def get_configs(self) -> Generator[PerfAnalyzerConfig, None, None]: break self._generator_started = True - config = self._configs[self._curr_batch_size_index][ - self._curr_parameter_index + config = self._configs[self._curr_parameter_index][ + self._curr_inference_load_index ] yield (config) @@ -166,10 +187,28 @@ def set_last_results( measurement = [max(valid_measurements)] self._last_results = measurement - self._parameter_results.extend(measurement) + self._inference_load_results.extend(measurement) + + def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: Dict) -> Dict: + # For LLM models we will be creating custom input data based on text input length + perf_analyzer_flags = {k: v for k, v in model_perf_analyzer_flags.items()} + + if self._cli_config.is_llm_model(): + perf_analyzer_flags.pop("input-data") + + return perf_analyzer_flags + + def _create_input_dict(self, model_perf_analyzer_flags: Dict) -> Dict: + if self._cli_config.is_llm_model(): + with open(model_perf_analyzer_flags["input-data"], "r") as f: + input_dict = json.load(f) + + return input_dict + else: + return {} - def 
_create_parameter_list(self) -> List[int]: - # The two possible parameters are request rate or concurrency + def _create_inference_load_list(self) -> List[int]: + # The two possible inference loads are request rate or concurrency # Concurrency is the default and will be used unless the user specifies # request rate, either as a model parameter or a config option if self._cli_config.is_request_rate_specified(self._model_parameters): @@ -199,75 +238,176 @@ def _create_concurrency_list(self) -> List[int]: self._cli_config.run_config_search_max_concurrency, ) + def _create_text_input_length_list(self) -> List[int]: + if not self._cli_config.is_llm_model(): + return [] + + if self._model_parameters["text_input_length"]: + return sorted(self._model_parameters["text_input_length"]) + elif self._cli_config.run_config_search_disable: + return [1] + else: + return utils.generate_doubled_list( + self._cli_config.run_config_search_min_text_input_length, + self._cli_config.run_config_search_max_text_input_length, + ) + + def _create_max_token_count_list(self) -> List[int]: + if not self._cli_config.is_llm_model(): + return [] + + if self._model_parameters["max_token_count"]: + return sorted(self._model_parameters["max_token_count"]) + elif self._cli_config.run_config_search_disable: + return [1] + else: + return utils.generate_doubled_list( + self._cli_config.run_config_search_min_token_count, + self._cli_config.run_config_search_max_token_count, + ) + def _generate_perf_configs(self) -> None: - perf_config_non_parameter_values = ( - self._create_non_parameter_perf_config_values() + parameter_combinations = utils.generate_parameter_combinations( + self._perf_config_parameter_values ) + for parameter_combination in parameter_combinations: + perf_configs_for_a_given_combination = [] + for inference_load in self._inference_loads: + new_perf_config = self._create_new_perf_config( + inference_load, parameter_combination + ) + perf_configs_for_a_given_combination.append(new_perf_config) - for params in utils.generate_parameter_combinations( - perf_config_non_parameter_values - ): - configs_with_concurrency = [] - for parameter in self._parameters: - new_perf_config = PerfAnalyzerConfig() + self._configs.append(perf_configs_for_a_given_combination) - new_perf_config.update_config_from_profile_config( - self._model_name, self._cli_config - ) + def _create_new_perf_config( + self, inference_load: int, parameter_combination: Dict + ) -> PerfAnalyzerConfig: + perf_config = self._create_base_perf_config() - new_perf_config.update_config(params) + ( + text_input_length, + modified_parameter_combination, + ) = self._extract_text_input_length(parameter_combination) - if self._cli_config.is_request_rate_specified(self._model_parameters): - new_perf_config.update_config({"request-rate-range": parameter}) - else: - new_perf_config.update_config({"concurrency-range": parameter}) + self._update_perf_config_based_on_parameter_combination( + perf_config, modified_parameter_combination + ) + self._update_perf_config_based_on_inference_load(perf_config, inference_load) + self._update_perf_config_based_on_perf_analyzer_flags(perf_config) + self._update_perf_config_for_llm_model(perf_config, text_input_length) + + return perf_config + + def _create_base_perf_config(self) -> PerfAnalyzerConfig: + perf_config = PerfAnalyzerConfig() + perf_config.update_config_from_profile_config( + self._model_name, self._cli_config + ) + + return perf_config + + def _extract_text_input_length( + self, parameter_combination: Dict + ) -> 
Tuple[int, Dict]: + if not self._cli_config.is_llm_model(): + return 0, parameter_combination + + modified_parameter_combination = { + k: v for k, v in parameter_combination.items() + } + text_input_length = modified_parameter_combination.pop("text-input-length") + return text_input_length, modified_parameter_combination + + def _update_perf_config_based_on_parameter_combination( + self, perf_config: PerfAnalyzerConfig, parameter_combination: Dict + ) -> None: + perf_config.update_config(parameter_combination) + + def _update_perf_config_based_on_perf_analyzer_flags( + self, perf_config: PerfAnalyzerConfig + ) -> None: + perf_config.update_config(self._perf_analyzer_flags) + + def _update_perf_config_based_on_inference_load( + self, perf_config: PerfAnalyzerConfig, inference_load: int + ) -> None: + if self._cli_config.is_llm_model(): + perf_config.update_config({"periodic-concurrency-range": inference_load}) + elif self._cli_config.is_request_rate_specified(self._model_parameters): + perf_config.update_config({"request-rate-range": inference_load}) + else: + perf_config.update_config({"concurrency-range": inference_load}) + + def _update_perf_config_for_llm_model( + self, perf_config: PerfAnalyzerConfig, text_input_length: int + ) -> None: + if not self._cli_config.is_llm_model(): + return + + modified_input_dict = self._modify_text_in_input_dict(text_input_length) + self._write_modified_input_dict_to_file(modified_input_dict) - # User provided flags can override the search parameters - new_perf_config.update_config(self._perf_analyzer_flags) + perf_config.update_config({"input-data": self._input_json_filename}) - configs_with_concurrency.append(new_perf_config) - self._configs.append(configs_with_concurrency) + def _modify_text_in_input_dict(self, text_input_length: int) -> Dict: + modified_text = " ".join(repeat("Hello", text_input_length)) - def _create_non_parameter_perf_config_values(self) -> dict: + modified_input_dict = {k: v for k, v in self._llm_input_dict.items()} + modified_input_dict["data"][0]["text_input"] = modified_text + + return modified_input_dict + + def _write_modified_input_dict_to_file(self, modified_input_dict: Dict) -> None: + with open(self._input_json_filename, "w") as f: + json.dump(modified_input_dict, f) + + def _create_parameter_perf_config_values(self) -> dict: perf_config_values = { "batch-size": self._batch_sizes, } + if self._cli_config.is_llm_model(): + perf_config_values["request-parameter"] = [ + "max_token:" + str(mtc) + ":int" for mtc in self._max_token_counts + ] + perf_config_values["text-input-length"] = self._text_input_lengths + return perf_config_values def _step(self) -> None: - self._step_parameter() + self._step_inference_load() - if self._done_walking_parameters(): - self._add_best_throughput_to_batch_sizes() - self._reset_parameters() - self._step_batch_size() + if self._done_walking_inference_loads(): + self._add_best_throughput_to_parameter_results() + self._reset_inference_loads() + self._step_parameter() - def _add_best_throughput_to_batch_sizes(self) -> None: - if self._parameter_results: + def _add_best_throughput_to_parameter_results(self) -> None: + if self._inference_load_results: # type is List[Optional[RCM]] - best = max(self._parameter_results) # type: ignore - self._batch_size_results.append(best) + best = max(self._inference_load_results) # type: ignore + self._parameter_results.append(best) - def _reset_parameters(self) -> None: - self._curr_parameter_index = 0 - self._parameter_warning_printed = False - 
self._parameter_results = [] + def _reset_inference_loads(self) -> None: + self._curr_inference_load_index = 0 + self._inference_load_warning_printed = False + self._inference_load_results = [] + + def _step_inference_load(self) -> None: + self._curr_inference_load_index += 1 def _step_parameter(self) -> None: self._curr_parameter_index += 1 - def _step_batch_size(self) -> None: - self._curr_batch_size_index += 1 - def _done_walking(self) -> bool: - return self._done_walking_batch_sizes() + return self._done_walking_parameters() - def _done_walking_parameters(self) -> bool: - if len(self._parameters) == self._curr_parameter_index: + def _done_walking_inference_loads(self) -> bool: + if len(self._inference_loads) == self._curr_inference_load_index: return True - if self._early_exit_enable and not self._parameter_throughput_gain_valid(): - if not self._parameter_warning_printed: + if self._early_exit_enable and not self._inference_load_throughput_gain_valid(): + if not self._inference_load_warning_printed: if self._cli_config.is_request_rate_specified(self._model_parameters): logger.info( "No longer increasing request rate as throughput has plateaued" @@ -276,15 +416,15 @@ def _done_walking_parameters(self) -> bool: logger.info( "No longer increasing concurrency as throughput has plateaued" ) - self._parameter_warning_printed = True + self._inference_load_warning_printed = True return True return False - def _done_walking_batch_sizes(self) -> bool: - if len(self._batch_sizes) == self._curr_batch_size_index: + def _done_walking_parameters(self) -> bool: + if self._parameter_count == self._curr_parameter_index: return True - if self._early_exit_enable and not self._batch_size_throughput_gain_valid(): + if self._early_exit_enable and not self._parameter_throughput_gain_valid(): logger.info( "No longer increasing client batch size as throughput has plateaued" ) @@ -295,18 +435,18 @@ def _done_walking_batch_sizes(self) -> bool: def _last_results_erroneous(self) -> bool: return not self._last_results or self._last_results[-1] is None - def _parameter_throughput_gain_valid(self) -> bool: - """Check if any of the last X parameter results resulted in valid gain""" + def _inference_load_throughput_gain_valid(self) -> bool: + """Check if any of the last X inference load results resulted in valid gain""" return PerfAnalyzerConfigGenerator.throughput_gain_valid_helper( - throughputs=self._parameter_results, - min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, + throughputs=self._inference_load_results, + min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, min_gain=THROUGHPUT_MINIMUM_GAIN, ) - def _batch_size_throughput_gain_valid(self) -> bool: - """Check if any of the last X batch_size results resulted in valid gain""" + def _parameter_throughput_gain_valid(self) -> bool: + """Check if any of the last X non-parameter results resulted in valid gain""" return PerfAnalyzerConfigGenerator.throughput_gain_valid_helper( - throughputs=self._batch_size_results, - min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_BATCH_SIZE_TRIES, + throughputs=self._parameter_results, + min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, min_gain=THROUGHPUT_MINIMUM_GAIN, ) diff --git a/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py b/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py index b7adbef97..14a669438 100755 --- a/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +++ 
b/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py @@ -30,7 +30,7 @@ from model_analyzer.config.run.run_config import RunConfig from model_analyzer.constants import LOGGER_NAME from model_analyzer.device.gpu_device import GPUDevice -from model_analyzer.result.parameter_search import ParameterSearch +from model_analyzer.result.inference_load_search import InferenceLoadSearch from model_analyzer.result.result_manager import ResultManager from model_analyzer.result.run_config_measurement import RunConfigMeasurement from model_analyzer.triton.client.client import TritonClient @@ -43,7 +43,7 @@ class QuickPlusConcurrencySweepRunConfigGenerator(ConfigGeneratorInterface): """ First run QuickRunConfigGenerator for a hill climbing search, then use - ParameterSearch for a concurrency sweep + binary search of the default + InferenceLoadSearch for a concurrency sweep + binary search of the default and Top N results """ @@ -139,11 +139,13 @@ def _sweep_concurrency_over_top_results(self) -> Generator[RunConfig, None, None for result in top_results: run_config = deepcopy(result.run_config()) - parameter_search = ParameterSearch(self._config) - for concurrency in parameter_search.search_parameters(): + inference_load_search = InferenceLoadSearch(self._config) + for concurrency in inference_load_search.search_inference_loads(): run_config = self._set_concurrency(run_config, concurrency) yield run_config - parameter_search.add_run_config_measurement(self._last_measurement) + inference_load_search.add_run_config_measurement( + self._last_measurement + ) def _set_concurrency(self, run_config: RunConfig, concurrency: int) -> RunConfig: for model_run_config in run_config.model_run_configs(): diff --git a/model_analyzer/config/generate/run_config_generator_factory.py b/model_analyzer/config/generate/run_config_generator_factory.py index da3fc7a7a..0cdcddeb6 100755 --- a/model_analyzer/config/generate/run_config_generator_factory.py +++ b/model_analyzer/config/generate/run_config_generator_factory.py @@ -31,8 +31,8 @@ from model_analyzer.triton.client.client import TritonClient from model_analyzer.triton.model.model_config import ModelConfig -from .brute_plus_binary_parameter_search_run_config_generator import ( - BrutePlusBinaryParameterSearchRunConfigGenerator, +from .brute_plus_binary_search_run_config_generator import ( + BrutePlusBinarySearchRunConfigGenerator, ) from .config_generator_interface import ConfigGeneratorInterface from .quick_plus_concurrency_sweep_run_config_generator import ( @@ -96,7 +96,7 @@ def create_run_config_generator( model_variant_name_manager=model_variant_name_manager, ) elif command_config.run_config_search_mode == "brute": - return RunConfigGeneratorFactory._create_brute_plus_binary_parameter_search_run_config_generator( + return RunConfigGeneratorFactory._create_brute_plus_binary_search_run_config_generator( command_config=command_config, gpus=gpus, models=new_models, @@ -110,7 +110,7 @@ def create_run_config_generator( ) @staticmethod - def _create_brute_plus_binary_parameter_search_run_config_generator( + def _create_brute_plus_binary_search_run_config_generator( command_config: ConfigCommandProfile, gpus: List[GPUDevice], models: List[ModelProfileSpec], @@ -118,7 +118,7 @@ def _create_brute_plus_binary_parameter_search_run_config_generator( result_manager: ResultManager, model_variant_name_manager: ModelVariantNameManager, ) -> ConfigGeneratorInterface: - return BrutePlusBinaryParameterSearchRunConfigGenerator( + return 
BrutePlusBinarySearchRunConfigGenerator( config=command_config, gpus=gpus, models=models, diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py index 9c40f16ef..a215a2251 100755 --- a/model_analyzer/config/input/config_command_profile.py +++ b/model_analyzer/config/input/config_command_profile.py @@ -498,6 +498,8 @@ def _add_profile_models_configs(self): "batch_sizes": ConfigListNumeric(type_=int), "concurrency": ConfigListNumeric(type_=int), "request_rate": ConfigListNumeric(type_=int), + "text_input_length": ConfigListNumeric(type_=int), + "max_token_count": ConfigListNumeric(type_=int), } ), "objectives": objectives_scheme, @@ -571,10 +573,10 @@ def _add_profile_models_configs(self): ) self._add_config( ConfigField( - "prompt_length", - flags=["--prompt-length"], + "text_input_length", + flags=["--text-input-length"], field_type=ConfigListNumeric(int), - description="Comma-delimited list of prompt length values or ranges " + description="Comma-delimited list of text input length values or ranges " " to be used during profiling LLMs", ) ) @@ -811,25 +813,25 @@ def _add_run_search_configs(self): field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, default_value=config_defaults.DEFAULT_LLM_SEARCH_ENABLE, - description="Enables searching values are important to LLMs: prompt length, max token, etc...", + description="Enables searching values that are important to LLMs: text input length, max token count, etc.", ) ) self._add_config( ConfigField( - "run_config_search_min_prompt_length", - flags=["--run-config-search-min-prompt-length"], + "run_config_search_min_text_input_length", + flags=["--run-config-search-min-text-input-length"], field_type=ConfigPrimitive(int), - default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH, - description="Min prompt length that run config search should start with.", + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH, + description="Min text input length that run config search should start with.", ) ) self._add_config( ConfigField( - "run_config_search_max_prompt_length", - flags=["--run-config-search-max-prompt-length"], + "run_config_search_max_text_input_length", + flags=["--run-config-search-max-text-input-length"], field_type=ConfigPrimitive(int), - default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH, - description="Max prompt length that run config search will not go beyond.", + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH, + description="Max text input length that run config search will not go beyond.", ) ) self._add_config( @@ -1419,6 +1421,8 @@ def _autofill_values(self): "batch_sizes": self.batch_sizes, "concurrency": self.concurrency, "request_rate": self.request_rate, + "text_input_length": self.text_input_length, + "max_token_count": self.max_token_count, } else: new_model["parameters"] = {} @@ -1443,6 +1447,24 @@ def _autofill_values(self): else: new_model["parameters"].update({"request_rate": self.request_rate}) + if "text_input_length" in model.parameters(): + new_model["parameters"].update( + {"text_input_length": model.parameters()["text_input_length"]} + ) + else: + new_model["parameters"].update( + {"text_input_length": self.text_input_length} + ) + + if "max_token_count" in model.parameters(): + new_model["parameters"].update( + {"max_token_count": model.parameters()["max_token_count"]} + ) + else: + new_model["parameters"].update( + {"max_token_count": self.max_token_count} + ) + 
if ( new_model["parameters"]["request_rate"] and new_model["parameters"]["concurrency"] @@ -1523,3 +1545,21 @@ def is_request_rate_specified(self, model_parameters: dict) -> bool: or self.get_config()["run_config_search_min_request_rate"].is_set_by_user() or self.get_config()["run_config_search_max_request_rate"].is_set_by_user() ) + + def is_llm_model(self) -> bool: + """ + Returns true if the user has enabled llm search or set any llm search value + """ + return ( + self.llm_search_enable + or self.get_config()[ + "run_config_search_min_text_input_length" + ].is_set_by_user() + or self.get_config()[ + "run_config_search_max_text_input_length" + ].is_set_by_user() + or self.get_config()["run_config_search_min_token_count"].is_set_by_user() + or self.get_config()["run_config_search_max_token_count"].is_set_by_user() + or self.get_config()["text_input_length"].is_set_by_user() + or self.get_config()["max_token_count"].is_set_by_user() + ) diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py index c2edd6e91..7e37f7c7d 100755 --- a/model_analyzer/config/input/config_defaults.py +++ b/model_analyzer/config/input/config_defaults.py @@ -38,6 +38,7 @@ DEFAULT_SKIP_SUMMARY_REPORTS = False DEFAULT_SKIP_DETAILED_REPORTS = False DEFAULT_OUTPUT_MODEL_REPOSITORY = os.path.join(os.getcwd(), "output_model_repository") +DEFAULT_INPUT_JSON_PATH = os.getcwd() DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG = False DEFAULT_BATCH_SIZES = 1 DEFAULT_MAX_RETRIES = 50 @@ -51,8 +52,8 @@ DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1 DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE = 128 DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5 -DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH = 1 -DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH = 1000 +DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH = 1 +DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH = 1024 DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT = 1 DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT = 256 DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False diff --git a/model_analyzer/constants.py b/model_analyzer/constants.py index 886360d34..09f581326 100755 --- a/model_analyzer/constants.py +++ b/model_analyzer/constants.py @@ -32,8 +32,8 @@ # Run Search THROUGHPUT_MINIMUM_GAIN = 0.05 +THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES = 4 THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES = 4 -THROUGHPUT_MINIMUM_CONSECUTIVE_BATCH_SIZE_TRIES = 4 # Quick search algorithm constants RADIUS = 3 diff --git a/model_analyzer/perf_analyzer/perf_config.py b/model_analyzer/perf_analyzer/perf_config.py index e9160a44a..7cab2dd3c 100755 --- a/model_analyzer/perf_analyzer/perf_config.py +++ b/model_analyzer/perf_analyzer/perf_config.py @@ -33,6 +33,7 @@ class PerfAnalyzerConfig: "measurement-interval", "concurrency-range", "request-rate-range", + "periodic-concurrency-range", "request-distribution", "request-intervals", "binary-search", @@ -71,6 +72,7 @@ class PerfAnalyzerConfig: "metrics-url", "metrics-interval", "bls-composing-models", + "request-parameter", ] input_to_options = [ @@ -273,6 +275,8 @@ def extract_model_specific_parameters(self): "batch-size": self._options["-b"], "concurrency-range": self._args["concurrency-range"], "request-rate-range": self._args["request-rate-range"], + "periodic-concurrency-range": self._args["periodic-concurrency-range"], + "max-tokens": self._args["request-parameter"], } @classmethod diff --git a/model_analyzer/result/parameter_search.py b/model_analyzer/result/inference_load_search.py similarity index 63% rename from model_analyzer/result/parameter_search.py rename to 
model_analyzer/result/inference_load_search.py index e716a5b7d..5c7c9598d 100755 --- a/model_analyzer/result/parameter_search.py +++ b/model_analyzer/result/inference_load_search.py @@ -21,7 +21,7 @@ from model_analyzer.config.input.config_command_profile import ConfigCommandProfile from model_analyzer.constants import ( LOGGER_NAME, - THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, + THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, THROUGHPUT_MINIMUM_GAIN, ) from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException @@ -30,11 +30,11 @@ logger = logging.getLogger(LOGGER_NAME) -class ParameterSearch: +class InferenceLoadSearch: """ - Generates the next parameter value to use when searching through + Generates the next inference load value to use when searching through RunConfigMeasurements for the best value (according to the users objective) - - Will sweep from by powers of two from min to max parameter + - Will sweep by powers of two from min to max inference load - If the user specifies a constraint, the algorithm will perform a binary search around the boundary if the constraint is violated @@ -45,43 +45,43 @@ def __init__( self, config: ConfigCommandProfile, model_parameters: dict = {}, - skip_parameter_sweep: bool = False, + skip_inference_load_sweep: bool = False, ) -> None: """ Parameters ---------- config: ConfigCommandProfile Profile configuration information - skip_parameter_sweep: bool - If true, skips the parameter sweep and only does the binary search + skip_inference_load_sweep: bool + If true, skips the inference load sweep and only does the binary search """ - self._skip_parameter_sweep = skip_parameter_sweep - self._parameter_is_request_rate = config.is_request_rate_specified( + self._skip_inference_load_sweep = skip_inference_load_sweep + self._inference_load_is_request_rate = config.is_request_rate_specified( model_parameters ) - if self._parameter_is_request_rate: - self._min_parameter_index = int( + if self._inference_load_is_request_rate: + self._min_inference_load_index = int( log2(config.run_config_search_min_request_rate) ) - self._max_parameter_index = int( + self._max_inference_load_index = int( log2(config.run_config_search_max_request_rate) ) else: - self._min_parameter_index = int( + self._min_inference_load_index = int( log2(config.run_config_search_min_concurrency) ) - self._max_parameter_index = int( + self._max_inference_load_index = int( log2(config.run_config_search_max_concurrency) ) self._max_binary_search_steps = config.run_config_search_max_binary_search_steps self._run_config_measurements: List[Optional[RunConfigMeasurement]] = [] - self._parameters: List[int] = [] - self._last_failing_parameter = 0 - self._last_passing_parameter = 0 + self._inference_loads: List[int] = [] + self._last_failing_inference_load = 0 + self._last_passing_inference_load = 0 def add_run_config_measurement( self, run_config_measurement: Optional[RunConfigMeasurement] @@ -92,30 +92,31 @@ def add_run_config_measurement( """ self._run_config_measurements.append(run_config_measurement) - def search_parameters(self) -> Generator[int, None, None]: + def search_inference_loads(self) -> Generator[int, None, None]: """ - First performs a parameter sweep, and then, if necessary, perform - a binary parameter search around the point where the constraint - violated + First performs an inference load sweep, and then, if necessary, performs + a binary search around the point where the constraint was violated """ - yield from self._perform_parameter_sweep() + 
yield from self._perform_inference_load_sweep() if self._was_constraint_violated(): - yield from self._perform_binary_parameter_search() + yield from self._perform_binary_search() - def _perform_parameter_sweep(self) -> Generator[int, None, None]: - for parameter in ( + def _perform_inference_load_sweep(self) -> Generator[int, None, None]: + for inference_load in ( 2**i - for i in range(self._min_parameter_index, self._max_parameter_index + 1) + for i in range( + self._min_inference_load_index, self._max_inference_load_index + 1 + ) ): - if self._should_continue_parameter_sweep(): - self._parameters.append(parameter) - yield parameter + if self._should_continue_inference_load_sweep(): + self._inference_loads.append(inference_load) + yield inference_load else: # We can't actually skip the sweep because the results need to be added # but, we can suppress the logging messages - if not self._skip_parameter_sweep: - if self._parameter_is_request_rate: + if not self._skip_inference_load_sweep: + if self._inference_load_is_request_rate: logger.info( "Terminating request rate sweep - throughput is decreasing" ) @@ -125,7 +126,7 @@ def _perform_parameter_sweep(self) -> Generator[int, None, None]: ) return - def _should_continue_parameter_sweep(self) -> bool: + def _should_continue_inference_load_sweep(self) -> bool: self._check_measurement_count() if not self._are_minimum_tries_reached(): @@ -134,16 +135,16 @@ def _should_continue_parameter_sweep(self) -> bool: return not self._has_objective_gain_saturated() def _check_measurement_count(self) -> None: - if len(self._run_config_measurements) != len(self._parameters): + if len(self._run_config_measurements) != len(self._inference_loads): raise TritonModelAnalyzerException( - f"Internal Measurement count: {self._parameters}, doesn't match number " + f"Internal Measurement count: {self._inference_loads}, doesn't match number " f"of measurements added: {len(self._run_config_measurements)}." 
) def _are_minimum_tries_reached(self) -> bool: if ( len(self._run_config_measurements) - < THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES + < THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES ): return False else: @@ -155,7 +156,7 @@ def _has_objective_gain_saturated(self) -> bool: def _calculate_gain(self) -> float: first_rcm = self._run_config_measurements[ - -THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES + -THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES ] best_rcm = self._get_best_rcm() @@ -177,7 +178,7 @@ def _get_best_rcm(self) -> Optional[RunConfigMeasurement]: pruned_rcms = [ rcm for rcm in self._run_config_measurements[ - -THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES: + -THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES: ] if rcm ] @@ -188,16 +189,16 @@ def _get_best_rcm(self) -> Optional[RunConfigMeasurement]: def _was_constraint_violated(self) -> bool: for i in range(len(self._run_config_measurements) - 1, 1, -1): if self._at_constraint_failure_boundary(i): - self._last_failing_parameter = self._parameters[i] - self._last_passing_parameter = self._parameters[i - 1] + self._last_failing_inference_load = self._inference_loads[i] + self._last_passing_inference_load = self._inference_loads[i - 1] return True if ( self._run_config_measurements[0] and not self._run_config_measurements[0].is_passing_constraints() ): - self._last_failing_parameter = self._parameters[i] - self._last_passing_parameter = 0 + self._last_failing_inference_load = self._inference_loads[i] + self._last_passing_inference_load = 0 return True else: return False @@ -220,27 +221,31 @@ def _at_constraint_failure_boundary(self, index: int) -> bool: return at_failure_boundary - def _perform_binary_parameter_search(self) -> Generator[int, None, None]: + def _perform_binary_search(self) -> Generator[int, None, None]: # This is needed because we are going to restart the search from the - # parameter that failed - so we expect this to be at the end of the list - self._parameters.append(self._last_failing_parameter) + # inference_load that failed - so we expect this to be at the end of the list + self._inference_loads.append(self._last_failing_inference_load) for i in range(0, self._max_binary_search_steps): - parameter = self._determine_next_binary_parameter() + inference_load = self._determine_next_binary_inference_load() - if parameter != self._parameters[-1]: - self._parameters.append(parameter) - yield parameter + if inference_load != self._inference_loads[-1]: + self._inference_loads.append(inference_load) + yield inference_load - def _determine_next_binary_parameter(self) -> int: + def _determine_next_binary_inference_load(self) -> int: if not self._run_config_measurements[-1]: return 0 if self._run_config_measurements[-1].is_passing_constraints(): - self._last_passing_parameter = self._parameters[-1] - parameter = int((self._last_failing_parameter + self._parameters[-1]) / 2) + self._last_passing_inference_load = self._inference_loads[-1] + inference_load = int( + (self._last_failing_inference_load + self._inference_loads[-1]) / 2 + ) else: - self._last_failing_parameter = self._parameters[-1] - parameter = int((self._last_passing_parameter + self._parameters[-1]) / 2) + self._last_failing_inference_load = self._inference_loads[-1] + inference_load = int( + (self._last_passing_inference_load + self._inference_loads[-1]) / 2 + ) - return parameter + return inference_load diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index 9d418027f..caa9763ce 100755 --- a/tests/common/test_utils.py 
+++ b/tests/common/test_utils.py @@ -23,6 +23,7 @@ from model_analyzer.config.input.config_defaults import ( DEFAULT_BATCH_SIZES, DEFAULT_CLIENT_PROTOCOL, + DEFAULT_INPUT_JSON_PATH, DEFAULT_MEASUREMENT_MODE, DEFAULT_MONITORING_INTERVAL, DEFAULT_OUTPUT_MODEL_REPOSITORY, @@ -237,9 +238,11 @@ def construct_perf_analyzer_config( batch_size=DEFAULT_BATCH_SIZES, concurrency=1, request_rate=None, + max_token_count=1, launch_mode=DEFAULT_TRITON_LAUNCH_MODE, client_protocol=DEFAULT_CLIENT_PROTOCOL, perf_analyzer_flags=None, + llm_search_mode=False, ): """ Constructs a Perf Analyzer Config @@ -262,6 +265,8 @@ def construct_perf_analyzer_config( The client protocol for this PA configuration perf_analyzer_flags: dict A dict of any additional PA flags to be set + llm_search_mode: bool + Indicates we should use LLM search parameters Returns ------- @@ -276,9 +281,17 @@ def construct_perf_analyzer_config( if request_rate: pa_config._args["request-rate-range"] = request_rate + elif llm_search_mode: + pa_config._args["periodic-concurrency-range"] = concurrency else: pa_config._args["concurrency-range"] = concurrency + if llm_search_mode: + pa_config._args["request-parameter"] = ( + "max_token:" + str(max_token_count) + ":int" + ) + pa_config._args["input-data"] = DEFAULT_INPUT_JSON_PATH + "/input-data.json" + pa_config._args["measurement-mode"] = DEFAULT_MEASUREMENT_MODE pa_config.update_config(perf_analyzer_flags) diff --git a/tests/test_cli.py b/tests/test_cli.py index 75be15038..94dbf0b21 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -85,8 +85,8 @@ def get_test_options(): OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT)), OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT)), OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS)), - OptionStruct("int", "profile", "--run-config-search-min-prompt-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH)), - OptionStruct("int", "profile", "--run-config-search-max-prompt-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH)), + OptionStruct("int", "profile", "--run-config-search-min-text-input-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH)), + OptionStruct("int", "profile", "--run-config-search-max-text-input-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH)), OptionStruct("int", "profile", "--run-config-search-min-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT)), OptionStruct("int", "profile", "--run-config-search-max-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT)), OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", str(config_defaults.DEFAULT_MONITORING_INTERVAL)), @@ -140,7 +140,7 @@ def get_test_options(): OptionStruct("intlist", "profile", "--batch-sizes", "-b", "2, 4, 6", "1"), OptionStruct("intlist", "profile", "--concurrency", "-c", "1, 2, 3", None), OptionStruct("intlist", "profile", "--request-rate", None, "1, 2, 3", None), - OptionStruct("intlist", "profile", "--prompt-length", None, "1, 2, 3", None), + OptionStruct("intlist", "profile", "--text-input-length", None, "1, 2, 3", None), OptionStruct("intlist", "profile", "--max-token-count", None, "1, 2, 
3", None), OptionStruct("stringlist", "profile", "--triton-docker-mounts", None, "a:b:c, d:e:f", None, extra_commands=["--triton-launch-mode", "docker"]), OptionStruct("stringlist", "profile", "--gpus", None, "a, b, c", "all"), diff --git a/tests/test_config.py b/tests/test_config.py index ca9835cec..01dc739d8 100755 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -288,12 +288,24 @@ def test_range_and_list_values(self): expected_model_configs = [ ConfigModelProfileSpec( "model_1", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "model_2", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, ), ] @@ -430,12 +442,20 @@ def test_object(self): "batch_sizes": [1], "concurrency": [1, 2, 3, 4], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, ), ] @@ -489,6 +509,8 @@ def test_object(self): "batch_sizes": [1], "concurrency": [1, 2, 3, 4], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10}, ), @@ -498,6 +520,8 @@ def test_object(self): "concurrency": [1, 2, 3, 4], "batch_sizes": [2, 4, 6], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10}, ), @@ -569,6 +593,8 @@ def test_constraints(self): "batch_sizes": [1], "concurrency": [1, 2, 3, 4], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10, "gpu_used_memory": 5}, constraints={ @@ -579,7 +605,13 @@ def test_constraints(self): ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, ), ] @@ -697,7 +729,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -722,7 +760,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -758,7 +802,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( 
"vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [ @@ -801,7 +851,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [ @@ -831,7 +887,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "input": [ @@ -874,7 +936,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, perf_analyzer_flags={ "measurement-interval": 10000, @@ -900,7 +968,13 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, perf_analyzer_flags={ "measurement-interval": 10000, @@ -1171,7 +1245,13 @@ def test_autofill(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters={ + "batch_sizes": [1], + "concurrency": [], + "request_rate": [], + "text_input_length": [], + "max_token_count": [], + }, objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -1215,6 +1295,8 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [2, 4], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"perf_throughput": 10, "gpu_used_memory": 5}, constraints={ @@ -1263,6 +1345,8 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [2, 4], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"gpu_used_memory": 10}, constraints={"perf_latency_p99": {"max": 8000}}, @@ -1307,6 +1391,8 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [2, 4], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"gpu_used_memory": 10}, constraints={"perf_latency_p99": {"max": 8000}}, @@ -1362,6 +1448,8 @@ def test_autofill(self): "batch_sizes": [16, 32], "concurrency": [5, 6, 7], "request_rate": [], + "text_input_length": [], + "max_token_count": [], }, objectives={"gpu_used_memory": 10}, constraints={ @@ -1375,6 +1463,8 @@ def test_autofill(self): "batch_sizes": [1, 2], "concurrency": [2, 4], "request_rate": [], + "text_input_length": [], + "max_token_count": 
[], }, objectives={"perf_throughput": 10, "perf_latency_p99": 5}, constraints={"perf_latency_p99": {"max": 8000}}, diff --git a/tests/test_parameter_search.py b/tests/test_inference_load_search.py similarity index 92% rename from tests/test_parameter_search.py rename to tests/test_inference_load_search.py index 7f410bb26..d8643ad66 100755 --- a/tests/test_parameter_search.py +++ b/tests/test_inference_load_search.py @@ -25,17 +25,17 @@ DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, ) -from model_analyzer.constants import THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES +from model_analyzer.constants import THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException from model_analyzer.result.constraint_manager import ConstraintManager -from model_analyzer.result.parameter_search import ParameterSearch +from model_analyzer.result.inference_load_search import InferenceLoadSearch from model_analyzer.result.run_config_measurement import RunConfigMeasurement from .common import test_result_collector as trc from .common.test_utils import construct_run_config_measurement, evaluate_mock_config -class TestParameterSearch(trc.TestResultCollector): +class TestInferenceLoadSearch(trc.TestResultCollector): def setUp(self): self._min_concurrency_index = int(log2(DEFAULT_RUN_CONFIG_MIN_CONCURRENCY)) self._max_concurrency_index = int(log2(DEFAULT_RUN_CONFIG_MAX_CONCURRENCY)) @@ -67,9 +67,9 @@ def test_concurrency_sweep(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) - for concurrency in concurrency_search.search_parameters(): + for concurrency in concurrency_search.search_inference_loads(): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -90,11 +90,11 @@ def test_request_rate_sweep(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch( + concurrency_search = InferenceLoadSearch( config, model_parameters={"request_rate": "True"} ) - for request_rate in concurrency_search.search_parameters(): + for request_rate in concurrency_search.search_inference_loads(): self._request_rates.append(request_rate) concurrency_search.add_run_config_measurement( @@ -115,7 +115,7 @@ def test_saturating_sweep(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) INCREASE_THROUGHPUT_COUNT = 4 # [100, 200, 400, 800, 1000, 1000,...] 
@@ -124,7 +124,7 @@ def test_saturating_sweep(self): for c in range(self._min_concurrency_index, self._max_concurrency_index + 1) ] - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -140,7 +140,7 @@ def test_saturating_sweep(self): 2**c for c in range( INCREASE_THROUGHPUT_COUNT - + THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES + + THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES ) ] self.assertEqual(self._concurrencies, expected_concurrencies) @@ -152,12 +152,12 @@ def test_sweep_with_constraints_decreasing(self): """ config = self._create_single_model_with_constraints("95") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([12, 10, 9]) latencies = [10 * c for c in self._expected_concurrencies] - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -178,12 +178,12 @@ def test_sweep_with_constraints_decrease_then_increase(self): """ config = self._create_single_model_with_constraints("155") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([12, 14, 15]) latencies = [10 * c for c in self._expected_concurrencies] - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -204,14 +204,14 @@ def test_sweep_with_multiple_violation_areas(self): """ config = self._create_single_model_with_constraints("155") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([12, 14, 15]) latencies = [10 * c for c in self._expected_concurrencies] # this adds an early constraint violation which should be ignored latencies[1] = 200 - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -233,12 +233,12 @@ def test_sweep_with_constraints_hitting_limit(self): """ config = self._create_single_model_with_constraints("970") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([768, 896, 960, 992, 976]) latencies = self._expected_concurrencies - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -258,10 +258,10 @@ def test_not_adding_measurements(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) with 
self.assertRaises(TritonModelAnalyzerException): - for concurrency in concurrency_search.search_parameters(): + for concurrency in concurrency_search.search_inference_loads(): self._concurrencies.append(concurrency) if concurrency < 32: diff --git a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py index e9852356e..69e42ef8d 100755 --- a/tests/test_perf_analyzer_config_generator.py +++ b/tests/test_perf_analyzer_config_generator.py @@ -15,7 +15,7 @@ # limitations under the License. import unittest -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, mock_open, patch from model_analyzer.config.generate.generator_utils import GeneratorUtils as utils from model_analyzer.config.generate.perf_analyzer_config_generator import ( @@ -41,7 +41,11 @@ def __init__(self, methodname): super().__init__(methodname) self._perf_throughput = 1 - def test_set_last_results(self): + @patch( + "model_analyzer.config.input.config_command_profile.ConfigCommandProfile.is_llm_model", + return_value=False, + ) + def test_set_last_results(self, *args): """ Test set_last_results() with multi model @@ -60,8 +64,26 @@ def test_set_last_results(self): ["modelA", "modelB"], [{"perf_throughput": 10}, {"perf_throughput": 2}] ) + args = [ + "model-analyzer", + "profile", + "--model-repository", + "cli_repository", + "-f", + "path-to-config-file", + ] + + # yapf: disable + yaml_str = (""" + profile_models: + - my-model + """) + # yapf: enable + + config = evaluate_mock_config(args, yaml_str, subcommand="profile") + pacg = PerfAnalyzerConfigGenerator( - MagicMock(), MagicMock(), MagicMock(), MagicMock(), early_exit_enable=False + config, MagicMock(), MagicMock(), MagicMock(), early_exit_enable=False ) pacg.set_last_results([measurement1, measurement2, measurement3]) @@ -537,6 +559,74 @@ def test_perf_analyzer_flags(self): self._run_and_test_perf_analyzer_config_generator(yaml_str, expected_configs) + def test_llm_search_max_token_count(self): + """ + Test LLM Search: + - max token count 1->256 + + Concurrency and text input length max set to 1 + """ + + # yapf: disable + yaml_str = (""" + perf_analyzer_flags: + input-data: input-data.json + profile_models: + - my-model + """) + # yapf: enable + + max_token_counts = utils.generate_doubled_list(1, 256) + expected_configs = [ + construct_perf_analyzer_config(max_token_count=mtc, llm_search_mode=True) + for mtc in max_token_counts + ] + + pa_cli_args = [ + "--llm-search-enable", + "--run-config-search-max-concurrency", + "1", + "--run-config-search-max-text-input-length", + "1", + ] + self._run_and_test_perf_analyzer_config_generator( + yaml_str, expected_configs, pa_cli_args + ) + + def test_llm_search_text_input_length(self): + """ + Test LLM Search: + - Input length 1->1024 + + Concurrency and max token count set to 1 + """ + + # yapf: disable + yaml_str = (""" + perf_analyzer_flags: + input-data: input-data.json + profile_models: + - my-model + """) + # yapf: enable + + text_input_lengths = utils.generate_doubled_list(1, 1024) + expected_configs = [ + construct_perf_analyzer_config(llm_search_mode=True) + for pl in text_input_lengths + ] + + pa_cli_args = [ + "--llm-search-enable", + "--run-config-search-max-concurrency", + "1", + "--run-config-search-max-token-count", + "1", + ] + self._run_and_test_perf_analyzer_config_generator( + yaml_str, expected_configs, pa_cli_args + ) + def test_perf_analyzer_config_ssl_options(self): """ Test Perf Analyzer SSL options: @@ -754,13 +844,17 @@ def 
_run_and_test_perf_analyzer_config_generator( config = evaluate_mock_config(args, yaml_str, subcommand="profile") - pacg = PerfAnalyzerConfigGenerator( - config, - config.profile_models[0].model_name(), - config.profile_models[0].perf_analyzer_flags(), - config.profile_models[0].parameters(), - early_exit, - ) + with patch( + "model_analyzer.config.generate.perf_analyzer_config_generator.open", + mock_open(read_data=self._input_data), + ): + pacg = PerfAnalyzerConfigGenerator( + config, + config.profile_models[0].model_name(), + config.profile_models[0].perf_analyzer_flags(), + config.profile_models[0].parameters(), + early_exit, + ) perf_analyzer_configs = [] for perf_config in pacg.get_configs(): @@ -824,6 +918,10 @@ def setUp(self): ) self.mock_os.start() + self._input_data = """{ + "data": [{"text_input": ["Hello, my name is"], "stream": [true]}] + }""" + def tearDown(self): self.mock_os.stop() patch.stopall()
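For reference, the renamed InferenceLoadSearch keeps the same driving contract ParameterSearch had: the caller iterates the generator returned by search_inference_loads() and must report a RunConfigMeasurement (or None) back through add_run_config_measurement() after every yielded load, otherwise the internal count check raises TritonModelAnalyzerException (see test_not_adding_measurements above). A minimal sketch of that loop, assuming a hypothetical measure() callable that profiles a single concurrency or request-rate value and returns a RunConfigMeasurement or None:

from model_analyzer.result.inference_load_search import InferenceLoadSearch


def sweep_inference_loads(config, measure):
    # `config` is a ConfigCommandProfile; `measure` is a stand-in for whatever
    # runs perf_analyzer at one inference load and returns a RunConfigMeasurement.
    inference_load_search = InferenceLoadSearch(config)

    # The generator first sweeps loads by powers of two (concurrency by default,
    # request rate if specified), then binary-searches around the boundary where
    # a constraint was violated, if one was.
    for inference_load in inference_load_search.search_inference_loads():
        measurement = measure(inference_load)

        # Every yielded load must be answered before the next one is produced;
        # skipping this call trips the internal measurement-count check.
        inference_load_search.add_run_config_measurement(measurement)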