diff --git a/model_analyzer/analyzer.py b/model_analyzer/analyzer.py index c68acae3f..f58bd1bdb 100755 --- a/model_analyzer/analyzer.py +++ b/model_analyzer/analyzer.py @@ -136,8 +136,14 @@ def profile( if not self._config.skip_summary_reports: self._create_summary_tables(verbose) - self._create_summary_reports(mode) - self._create_detailed_reports(mode) + + # TODO TMA-1401: need to figure out summary reporting for LLMs + if not self._config.is_llm_model(): + self._create_summary_reports(mode) + + # TODO TMA-1443: need to figure out detailed reporting for LLMs + if not self._config.is_llm_model(): + self._create_detailed_reports(mode) self._check_for_perf_analyzer_errors() diff --git a/model_analyzer/config/generate/automatic_model_config_generator.py b/model_analyzer/config/generate/automatic_model_config_generator.py index 79925cb7d..c4d7595b4 100755 --- a/model_analyzer/config/generate/automatic_model_config_generator.py +++ b/model_analyzer/config/generate/automatic_model_config_generator.py @@ -79,10 +79,7 @@ def __init__( logger.info("") AutomaticModelConfigGenerator._log_first_run = True - self._max_instance_count = config.run_config_search_max_instance_count - self._min_instance_count = config.run_config_search_min_instance_count - self._max_model_batch_size = config.run_config_search_max_model_batch_size - self._min_model_batch_size = config.run_config_search_min_model_batch_size + self._set_min_max_search_values(config) self._instance_kind = "KIND_CPU" if self._cpu_only else "KIND_GPU" @@ -91,7 +88,7 @@ def __init__( self._reset_max_batch_size() - if not self._early_exit_enable: + if not self._early_exit_enable and not self._config.is_llm_model(): raise TritonModelAnalyzerException( "Early exit disable is not supported in automatic model config generator" ) @@ -162,3 +159,9 @@ def _get_curr_param_combo(self) -> Dict: config["dynamic_batching"] = {} return config + + def _set_min_max_search_values(self, config: ConfigCommandProfile) -> None: + self._max_instance_count = config.run_config_search_max_instance_count + self._min_instance_count = config.run_config_search_min_instance_count + self._max_model_batch_size = config.run_config_search_max_model_batch_size + self._min_model_batch_size = config.run_config_search_min_model_batch_size diff --git a/model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py b/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py similarity index 84% rename from model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py rename to model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py index b0a217274..efe403041 100755 --- a/model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +++ b/model_analyzer/config/generate/brute_plus_binary_search_run_config_generator.py @@ -29,7 +29,7 @@ from model_analyzer.config.run.run_config import RunConfig from model_analyzer.constants import LOGGER_NAME from model_analyzer.device.gpu_device import GPUDevice -from model_analyzer.result.parameter_search import ParameterSearch +from model_analyzer.result.inference_load_search import InferenceLoadSearch from model_analyzer.result.result_manager import ResultManager from model_analyzer.result.run_config_measurement import RunConfigMeasurement from model_analyzer.triton.client.client import TritonClient @@ -39,10 +39,10 @@ logger = logging.getLogger(LOGGER_NAME) -class BrutePlusBinaryParameterSearchRunConfigGenerator(ConfigGeneratorInterface): +class BrutePlusBinarySearchRunConfigGenerator(ConfigGeneratorInterface): """ First run BruteRunConfigGenerator for a brute search, then for - automatic searches use ParameterSearch to perform a binary search + automatic searches use InferenceLoadSearch to perform a binary search """ def __init__( @@ -116,7 +116,11 @@ def _create_brute_run_config_generator(self) -> BruteRunConfigGenerator: def _can_binary_search_top_results(self) -> bool: for model in self._models: - if model.parameters()["concurrency"] or model.parameters()["request_rate"]: + if ( + model.parameters()["concurrency"] + or model.parameters()["request_rate"] + or self._config.is_llm_model() + ): return False return True @@ -132,17 +136,19 @@ def _binary_search_over_top_results(self) -> Generator[RunConfig, None, None]: for result in top_results: run_config = deepcopy(result.run_config()) model_parameters = self._get_model_parameters(model_name) - parameter_search = ParameterSearch( + inference_load_search = InferenceLoadSearch( config=self._config, model_parameters=model_parameters, - skip_parameter_sweep=True, + skip_inference_load_sweep=True, ) - for parameter in parameter_search.search_parameters(): - run_config = self._set_parameter( - run_config, model_parameters, parameter + for inference_load in inference_load_search.search_inference_loads(): + run_config = self._set_inference_load( + run_config, model_parameters, inference_load ) yield run_config - parameter_search.add_run_config_measurement(self._last_measurement) + inference_load_search.add_run_config_measurement( + self._last_measurement + ) def _get_model_parameters(self, model_name: str) -> Dict: for model in self._models: @@ -151,14 +157,14 @@ def _get_model_parameters(self, model_name: str) -> Dict: return {} - def _set_parameter( - self, run_config: RunConfig, model_parameters: Dict, parameter: int + def _set_inference_load( + self, run_config: RunConfig, model_parameters: Dict, inference_load: int ) -> RunConfig: for model_run_config in run_config.model_run_configs(): perf_config = model_run_config.perf_config() if self._config.is_request_rate_specified(model_parameters): - perf_config.update_config({"request-rate-range": parameter}) + perf_config.update_config({"request-rate-range": inference_load}) else: - perf_config.update_config({"concurrency-range": parameter}) + perf_config.update_config({"concurrency-range": inference_load}) return run_config diff --git a/model_analyzer/config/generate/brute_run_config_generator.py b/model_analyzer/config/generate/brute_run_config_generator.py index d226811aa..151e97fde 100755 --- a/model_analyzer/config/generate/brute_run_config_generator.py +++ b/model_analyzer/config/generate/brute_run_config_generator.py @@ -80,7 +80,7 @@ def __init__( self._curr_results: List = [[] for n in range(self._num_models)] self._curr_generators: Dict[int, ConfigGeneratorInterface] = {} - self._skip_default_config = skip_default_config + self._skip_default_config = skip_default_config or config.is_llm_model() def set_last_results( self, measurements: List[Optional[RunConfigMeasurement]] diff --git a/model_analyzer/config/generate/generator_utils.py b/model_analyzer/config/generate/generator_utils.py index 1f0e9c5eb..551fa0c28 100755 --- a/model_analyzer/config/generate/generator_utils.py +++ b/model_analyzer/config/generate/generator_utils.py @@ -15,7 +15,7 @@ # limitations under the License. from itertools import product -from typing import Dict, List +from typing import Dict, List, Optional class GeneratorUtils: @@ -80,8 +80,8 @@ def generate_combinations(value: object) -> List: @staticmethod def generate_parameter_combinations(params: Dict) -> List[Dict]: """ - Generate a list of all possible subdictionaries - from given dictionary. The subdictionaries will + Generate a list of all possible sub-dictionaries + from given dictionary. The sub-dictionaries will have all the same keys, but only one value from each key. @@ -108,9 +108,45 @@ def generate_doubled_list(min_value: int, max_value: int) -> List[int]: The value that the generated list will not exceed """ + assert min_value <= max_value + list = [] val = 1 if min_value == 0 else min_value while val <= max_value: list.append(val) val *= 2 return list + + @staticmethod + def extract_value_from_request_parameter(request_parameter: Optional[str]) -> int: + if not request_parameter: + return 0 + + # Format is: :: + # Example: max_tokens:10:int + _, value, _ = request_parameter.split(":") + + # this catches the case for non-LLM models where the user has specified request parameters + try: + int(value) + except ValueError as _: + return 0 + + return int(value) + + @staticmethod + def extract_text_input_length_from_input_data(input_data: Optional[str]) -> int: + if not input_data: + return 0 + + # format is input-data-.json + _, _, text_input_length = input_data.split("-") + text_input_length, _ = text_input_length.split(".") + + # this catches the case for non-LLM models where the user has specified input data + try: + int(text_input_length) + except ValueError as _: + return 0 + + return int(text_input_length) diff --git a/model_analyzer/config/generate/model_run_config_generator.py b/model_analyzer/config/generate/model_run_config_generator.py index b068c7577..529fa5b83 100755 --- a/model_analyzer/config/generate/model_run_config_generator.py +++ b/model_analyzer/config/generate/model_run_config_generator.py @@ -150,5 +150,13 @@ def _determine_early_exit_enables( concurrency_specified = model.parameters()["concurrency"] config_parameters_exist = model.model_config_parameters() - self._pacg_early_exit_enable = early_exit_enable or not concurrency_specified - self._mcg_early_exit_enable = early_exit_enable or not config_parameters_exist + if config.is_llm_model(): + self._pacg_early_exit_enable = False + self._mcg_early_exit_enable = False + else: + self._pacg_early_exit_enable = ( + early_exit_enable or not concurrency_specified + ) + self._mcg_early_exit_enable = ( + early_exit_enable or not config_parameters_exist + ) diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py index 985032564..10c86e610 100755 --- a/model_analyzer/config/generate/perf_analyzer_config_generator.py +++ b/model_analyzer/config/generate/perf_analyzer_config_generator.py @@ -14,13 +14,25 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import logging -from typing import Generator, List, Optional +import os +from itertools import repeat +from typing import Any, Dict, Generator, List, Optional, Tuple from model_analyzer.config.input.config_command_profile import ConfigCommandProfile +from model_analyzer.config.input.config_defaults import ( + DEFAULT_INPUT_JSON_PATH, + DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, + DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, + DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD, + DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, + DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH, + DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY, +) from model_analyzer.constants import ( LOGGER_NAME, - THROUGHPUT_MINIMUM_CONSECUTIVE_BATCH_SIZE_TRIES, + THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, THROUGHPUT_MINIMUM_GAIN, ) @@ -62,7 +74,7 @@ def __init__( custom perf analyzer configuration model_parameters: Dict - model constraints for batch_sizes, concurrency and/or request rate + model constraints for batch sizes, concurrency, request rate, text input length, etc.. early_exit_enable: Bool If true, this class can early exit during search of concurrency/request rate @@ -72,35 +84,54 @@ def __init__( # All configs are pregenerated in _configs[][] # Indexed as follows: - # _configs[_curr_batch_size_index][_curr_parameter_index] + # _configs[_curr_parameter_index][_curr_inference_load_index] + # + # Parameters are: batch size, text input length, max token size + # Inference load are: concurrency/periodic-concurrency, request-rate # self._curr_parameter_index = 0 - self._curr_batch_size_index = 0 + self._curr_inference_load_index = 0 self._configs: List[List[PerfAnalyzerConfig]] = [] - self._parameter_warning_printed = False + self._inference_load_warning_printed = False # Flag to indicate we have started to return results # self._generator_started = False self._last_results: List[RunConfigMeasurement] = [] + self._inference_load_results: List[Optional[RunConfigMeasurement]] = [] self._parameter_results: List[Optional[RunConfigMeasurement]] = [] - self._batch_size_results: List[Optional[RunConfigMeasurement]] = [] self._model_name = model_name - self._perf_analyzer_flags = model_perf_analyzer_flags - - self._batch_sizes = sorted(model_parameters["batch_sizes"]) self._cli_config = cli_config + self._llm_input_dict = self._create_input_dict(model_perf_analyzer_flags) + + self._perf_analyzer_flags = self._set_perf_analyzer_flags( + model_perf_analyzer_flags + ) + self._model_parameters = model_parameters - self._parameters = self._create_parameter_list() + self._inference_loads = self._create_inference_load_list() + + self._batch_sizes = sorted(model_parameters["batch_sizes"]) + self._text_input_lengths = self._create_text_input_length_list() + self._max_token_counts = self._create_max_token_count_list() + self._request_periods = self._create_request_period_list() + + self._perf_config_parameter_values = self._create_parameter_perf_config_values() + self._parameter_count = len( + utils.generate_parameter_combinations(self._perf_config_parameter_values) + ) + + self._input_json_base_filename = DEFAULT_INPUT_JSON_PATH + "/input-data-" + self._generate_perf_configs() @staticmethod def throughput_gain_valid_helper( throughputs: List[Optional[RunConfigMeasurement]], - min_tries: int = THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, + min_tries: int = THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, min_gain: float = THROUGHPUT_MINIMUM_GAIN, ) -> bool: if len(throughputs) < min_tries: @@ -136,8 +167,8 @@ def get_configs(self) -> Generator[PerfAnalyzerConfig, None, None]: break self._generator_started = True - config = self._configs[self._curr_batch_size_index][ - self._curr_parameter_index + config = self._configs[self._curr_parameter_index][ + self._curr_inference_load_index ] yield (config) @@ -166,13 +197,34 @@ def set_last_results( measurement = [max(valid_measurements)] self._last_results = measurement - self._parameter_results.extend(measurement) + self._inference_load_results.extend(measurement) - def _create_parameter_list(self) -> List[int]: - # The two possible parameters are request rate or concurrency - # Concurrency is the default and will be used unless the user specifies + def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: Dict) -> Dict: + # For LLM models we will be creating custom input data based on text input length + perf_analyzer_flags = {k: v for k, v in model_perf_analyzer_flags.items()} + + if self._cli_config.is_llm_model(): + perf_analyzer_flags.pop("input-data") + + return perf_analyzer_flags + + def _create_input_dict(self, model_perf_analyzer_flags: Dict) -> Dict: + if self._cli_config.is_llm_model(): + with open(model_perf_analyzer_flags["input-data"], "r") as f: + input_dict = json.load(f) + + return input_dict + else: + return {} + + def _create_inference_load_list(self) -> List[Any]: + # The three possible inference loads are request rate, concurrency or periodic concurrency + # For LLM models periodic concurrency is used for non-LLM models + # concurrency is the default and will be used unless the user specifies # request rate, either as a model parameter or a config option - if self._cli_config.is_request_rate_specified(self._model_parameters): + if self._cli_config.is_llm_model(): + return self._create_periodic_concurrency_list() + elif self._cli_config.is_request_rate_specified(self._model_parameters): return self._create_request_rate_list() else: return self._create_concurrency_list() @@ -181,7 +233,7 @@ def _create_request_rate_list(self) -> List[int]: if self._model_parameters["request_rate"]: return sorted(self._model_parameters["request_rate"]) elif self._cli_config.run_config_search_disable: - return [1] + return [DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE] else: return utils.generate_doubled_list( self._cli_config.run_config_search_min_request_rate, @@ -192,82 +244,264 @@ def _create_concurrency_list(self) -> List[int]: if self._model_parameters["concurrency"]: return sorted(self._model_parameters["concurrency"]) elif self._cli_config.run_config_search_disable: - return [1] + return [DEFAULT_RUN_CONFIG_MIN_CONCURRENCY] else: return utils.generate_doubled_list( self._cli_config.run_config_search_min_concurrency, self._cli_config.run_config_search_max_concurrency, ) - def _generate_perf_configs(self) -> None: - perf_config_non_parameter_values = ( - self._create_non_parameter_perf_config_values() + def _create_periodic_concurrency_list(self) -> List[str]: + if self._model_parameters["periodic_concurrency"]: + return sorted(self._model_parameters["periodic_concurrency"]) + elif self._cli_config.run_config_search_disable: + return [DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY] + + periodic_concurrencies = self._generate_periodic_concurrencies() + return periodic_concurrencies + + def _generate_periodic_concurrencies(self) -> List[str]: + periodic_concurrencies = [] + + periodic_concurrency_doubled_list = utils.generate_doubled_list( + self._cli_config.run_config_search_min_periodic_concurrency, + self._cli_config.run_config_search_max_periodic_concurrency, + ) + + step_doubled_list = utils.generate_doubled_list( + self._cli_config.run_config_search_min_periodic_concurrency_step, + self._cli_config.run_config_search_max_periodic_concurrency_step, ) - for params in utils.generate_parameter_combinations( - perf_config_non_parameter_values - ): - configs_with_concurrency = [] - for parameter in self._parameters: - new_perf_config = PerfAnalyzerConfig() + for start in periodic_concurrency_doubled_list: + for end in periodic_concurrency_doubled_list: + for step in step_doubled_list: + if self._is_illegal_periodic_concurrency_combination( + start, end, step + ): + continue + + periodic_concurrencies.append(f"{start}:{end}:{step}") + return periodic_concurrencies - new_perf_config.update_config_from_profile_config( - self._model_name, self._cli_config + def _is_illegal_periodic_concurrency_combination( + self, start: int, end: int, step: int + ) -> bool: + if start > end: + return True + elif start == end and step != 1: + return True + elif (end - start) % step: + return True + else: + return False + + def _create_text_input_length_list(self) -> List[int]: + if not self._cli_config.is_llm_model(): + return [] + + if self._model_parameters["text_input_length"]: + return sorted(self._model_parameters["text_input_length"]) + elif self._cli_config.run_config_search_disable: + return [DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH] + else: + return utils.generate_doubled_list( + self._cli_config.run_config_search_min_text_input_length, + self._cli_config.run_config_search_max_text_input_length, + ) + + def _create_max_token_count_list(self) -> List[int]: + if not self._cli_config.is_llm_model(): + return [] + + if self._model_parameters["max_token_count"]: + return sorted(self._model_parameters["max_token_count"]) + elif self._cli_config.run_config_search_disable: + return [DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT] + else: + return utils.generate_doubled_list( + self._cli_config.run_config_search_min_max_token_count, + self._cli_config.run_config_search_max_max_token_count, + ) + + def _create_request_period_list(self) -> List[int]: + if not self._cli_config.is_llm_model(): + return [] + + if self._model_parameters["request_period"]: + return sorted(self._model_parameters["request_period"]) + elif self._cli_config.run_config_search_disable: + return [DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD] + else: + return utils.generate_doubled_list( + self._cli_config.run_config_search_min_request_period, + self._cli_config.run_config_search_max_request_period, + ) + + def _generate_perf_configs(self) -> None: + parameter_combinations = utils.generate_parameter_combinations( + self._perf_config_parameter_values + ) + for parameter_combination in parameter_combinations: + perf_configs_for_a_given_combination = [] + for inference_load in self._inference_loads: + new_perf_config = self._create_new_perf_config( + inference_load, parameter_combination ) + perf_configs_for_a_given_combination.append(new_perf_config) - new_perf_config.update_config(params) + self._configs.append(perf_configs_for_a_given_combination) - if self._cli_config.is_request_rate_specified(self._model_parameters): - new_perf_config.update_config({"request-rate-range": parameter}) - else: - new_perf_config.update_config({"concurrency-range": parameter}) + def _create_new_perf_config( + self, inference_load: int, parameter_combination: Dict + ) -> PerfAnalyzerConfig: + perf_config = self._create_base_perf_config() + + ( + text_input_length, + modified_parameter_combination, + ) = self._extract_text_input_length(parameter_combination) - # User provided flags can override the search parameters - new_perf_config.update_config(self._perf_analyzer_flags) + self._update_perf_config_based_on_parameter_combination( + perf_config, modified_parameter_combination + ) + self._update_perf_config_based_on_inference_load(perf_config, inference_load) + self._update_perf_config_based_on_perf_analyzer_flags(perf_config) + self._update_perf_config_for_llm_model(perf_config, text_input_length) - configs_with_concurrency.append(new_perf_config) - self._configs.append(configs_with_concurrency) + return perf_config + + def _create_base_perf_config(self) -> PerfAnalyzerConfig: + perf_config = PerfAnalyzerConfig() + perf_config.update_config_from_profile_config( + self._model_name, self._cli_config + ) + + return perf_config + + def _extract_text_input_length( + self, parameter_combination: Dict + ) -> Tuple[int, Dict]: + if not self._cli_config.is_llm_model(): + return 0, parameter_combination + + modified_parameter_combination = { + k: v for k, v in parameter_combination.items() + } + text_input_length = modified_parameter_combination.pop("text-input-length") + return text_input_length, modified_parameter_combination - def _create_non_parameter_perf_config_values(self) -> dict: + def _update_perf_config_based_on_parameter_combination( + self, perf_config: PerfAnalyzerConfig, parameter_combination: Dict + ) -> None: + if "request-parameter" in parameter_combination: + request_parameter = parameter_combination["request-parameter"] + max_tokens = utils.extract_value_from_request_parameter(request_parameter) + parameter_combination["request-period"] = ( + max_tokens + if max_tokens < parameter_combination["request-period"] + else parameter_combination["request-period"] + ) + + perf_config.update_config(parameter_combination) + + def _update_perf_config_based_on_perf_analyzer_flags( + self, perf_config: PerfAnalyzerConfig + ) -> None: + perf_config.update_config(self._perf_analyzer_flags) + + def _update_perf_config_based_on_inference_load( + self, perf_config: PerfAnalyzerConfig, inference_load: int + ) -> None: + if self._cli_config.is_llm_model(): + perf_config.update_config({"periodic-concurrency-range": inference_load}) + perf_config.update_config({"streaming": "True"}) + elif self._cli_config.is_request_rate_specified(self._model_parameters): + perf_config.update_config({"request-rate-range": inference_load}) + else: + perf_config.update_config({"concurrency-range": inference_load}) + + def _update_perf_config_for_llm_model( + self, perf_config: PerfAnalyzerConfig, text_input_length: int + ) -> None: + if not self._cli_config.is_llm_model(): + return + + input_json_filename = ( + self._input_json_base_filename + f"{text_input_length}.json" + ) + modified_input_dict = self._modify_text_in_input_dict(text_input_length) + self._write_modified_input_dict_to_file( + modified_input_dict, input_json_filename + ) + + perf_config.update_config({"input-data": input_json_filename}) + + def _modify_text_in_input_dict(self, text_input_length: int) -> Dict: + modified_text = " ".join(repeat("Hello", text_input_length)) + + modified_input_dict = {k: v for k, v in self._llm_input_dict.items()} + # FIXME: this needs to be updated once tritonserver/PA are updated TMA-1414 + modified_input_dict["data"][0]["PROMPT"] = [modified_text] + + return modified_input_dict + + def _write_modified_input_dict_to_file( + self, modified_input_dict: Dict, input_json_filename: str + ) -> None: + if not os.path.exists(DEFAULT_INPUT_JSON_PATH): + os.makedirs(DEFAULT_INPUT_JSON_PATH) + + with open(input_json_filename, "w") as f: + json.dump(modified_input_dict, f) + + def _create_parameter_perf_config_values(self) -> dict: perf_config_values = { "batch-size": self._batch_sizes, } + if self._cli_config.is_llm_model(): + perf_config_values["request-parameter"] = [ + f"max_tokens:{str(mtc)}:int" for mtc in self._max_token_counts + ] + perf_config_values["request-period"] = self._request_periods + perf_config_values["text-input-length"] = self._text_input_lengths + return perf_config_values def _step(self) -> None: - self._step_parameter() + self._step_inference_load() - if self._done_walking_parameters(): - self._add_best_throughput_to_batch_sizes() - self._reset_parameters() - self._step_batch_size() + if self._done_walking_inference_loads(): + self._add_best_throughput_to_parameter_results() + self._reset_inference_loads() + self._step_parameter() - def _add_best_throughput_to_batch_sizes(self) -> None: - if self._parameter_results: + def _add_best_throughput_to_parameter_results(self) -> None: + if self._inference_load_results: # type is List[Optional[RCM]] - best = max(self._parameter_results) # type: ignore - self._batch_size_results.append(best) + best = max(self._inference_load_results) # type: ignore + self._parameter_results.append(best) - def _reset_parameters(self) -> None: - self._curr_parameter_index = 0 - self._parameter_warning_printed = False - self._parameter_results = [] + def _reset_inference_loads(self) -> None: + self._curr_inference_load_index = 0 + self._inference_load_warning_printed = False + self._inference_load_results = [] + + def _step_inference_load(self) -> None: + self._curr_inference_load_index += 1 def _step_parameter(self) -> None: self._curr_parameter_index += 1 - def _step_batch_size(self) -> None: - self._curr_batch_size_index += 1 - def _done_walking(self) -> bool: - return self._done_walking_batch_sizes() + return self._done_walking_parameters() - def _done_walking_parameters(self) -> bool: - if len(self._parameters) == self._curr_parameter_index: + def _done_walking_inference_loads(self) -> bool: + if len(self._inference_loads) == self._curr_inference_load_index: return True - if self._early_exit_enable and not self._parameter_throughput_gain_valid(): - if not self._parameter_warning_printed: + if self._early_exit_enable and not self._inference_load_throughput_gain_valid(): + if not self._inference_load_warning_printed: if self._cli_config.is_request_rate_specified(self._model_parameters): logger.info( "No longer increasing request rate as throughput has plateaued" @@ -276,15 +510,15 @@ def _done_walking_parameters(self) -> bool: logger.info( "No longer increasing concurrency as throughput has plateaued" ) - self._parameter_warning_printed = True + self._inference_load_warning_printed = True return True return False - def _done_walking_batch_sizes(self) -> bool: - if len(self._batch_sizes) == self._curr_batch_size_index: + def _done_walking_parameters(self) -> bool: + if self._parameter_count == self._curr_parameter_index: return True - if self._early_exit_enable and not self._batch_size_throughput_gain_valid(): + if self._early_exit_enable and not self._parameter_throughput_gain_valid(): logger.info( "No longer increasing client batch size as throughput has plateaued" ) @@ -295,18 +529,18 @@ def _done_walking_batch_sizes(self) -> bool: def _last_results_erroneous(self) -> bool: return not self._last_results or self._last_results[-1] is None - def _parameter_throughput_gain_valid(self) -> bool: - """Check if any of the last X parameter results resulted in valid gain""" + def _inference_load_throughput_gain_valid(self) -> bool: + """Check if any of the last X inference load results resulted in valid gain""" return PerfAnalyzerConfigGenerator.throughput_gain_valid_helper( - throughputs=self._parameter_results, - min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, + throughputs=self._inference_load_results, + min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, min_gain=THROUGHPUT_MINIMUM_GAIN, ) - def _batch_size_throughput_gain_valid(self) -> bool: - """Check if any of the last X batch_size results resulted in valid gain""" + def _parameter_throughput_gain_valid(self) -> bool: + """Check if any of the last X non-parameter results resulted in valid gain""" return PerfAnalyzerConfigGenerator.throughput_gain_valid_helper( - throughputs=self._batch_size_results, - min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_BATCH_SIZE_TRIES, + throughputs=self._parameter_results, + min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, min_gain=THROUGHPUT_MINIMUM_GAIN, ) diff --git a/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py b/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py index b7adbef97..14a669438 100755 --- a/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +++ b/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py @@ -30,7 +30,7 @@ from model_analyzer.config.run.run_config import RunConfig from model_analyzer.constants import LOGGER_NAME from model_analyzer.device.gpu_device import GPUDevice -from model_analyzer.result.parameter_search import ParameterSearch +from model_analyzer.result.inference_load_search import InferenceLoadSearch from model_analyzer.result.result_manager import ResultManager from model_analyzer.result.run_config_measurement import RunConfigMeasurement from model_analyzer.triton.client.client import TritonClient @@ -43,7 +43,7 @@ class QuickPlusConcurrencySweepRunConfigGenerator(ConfigGeneratorInterface): """ First run QuickRunConfigGenerator for a hill climbing search, then use - ParameterSearch for a concurrency sweep + binary search of the default + InferenceLoadSearch for a concurrency sweep + binary search of the default and Top N results """ @@ -139,11 +139,13 @@ def _sweep_concurrency_over_top_results(self) -> Generator[RunConfig, None, None for result in top_results: run_config = deepcopy(result.run_config()) - parameter_search = ParameterSearch(self._config) - for concurrency in parameter_search.search_parameters(): + inference_load_search = InferenceLoadSearch(self._config) + for concurrency in inference_load_search.search_inference_loads(): run_config = self._set_concurrency(run_config, concurrency) yield run_config - parameter_search.add_run_config_measurement(self._last_measurement) + inference_load_search.add_run_config_measurement( + self._last_measurement + ) def _set_concurrency(self, run_config: RunConfig, concurrency: int) -> RunConfig: for model_run_config in run_config.model_run_configs(): diff --git a/model_analyzer/config/generate/run_config_generator_factory.py b/model_analyzer/config/generate/run_config_generator_factory.py index da3fc7a7a..0cdcddeb6 100755 --- a/model_analyzer/config/generate/run_config_generator_factory.py +++ b/model_analyzer/config/generate/run_config_generator_factory.py @@ -31,8 +31,8 @@ from model_analyzer.triton.client.client import TritonClient from model_analyzer.triton.model.model_config import ModelConfig -from .brute_plus_binary_parameter_search_run_config_generator import ( - BrutePlusBinaryParameterSearchRunConfigGenerator, +from .brute_plus_binary_search_run_config_generator import ( + BrutePlusBinarySearchRunConfigGenerator, ) from .config_generator_interface import ConfigGeneratorInterface from .quick_plus_concurrency_sweep_run_config_generator import ( @@ -96,7 +96,7 @@ def create_run_config_generator( model_variant_name_manager=model_variant_name_manager, ) elif command_config.run_config_search_mode == "brute": - return RunConfigGeneratorFactory._create_brute_plus_binary_parameter_search_run_config_generator( + return RunConfigGeneratorFactory._create_brute_plus_binary_search_run_config_generator( command_config=command_config, gpus=gpus, models=new_models, @@ -110,7 +110,7 @@ def create_run_config_generator( ) @staticmethod - def _create_brute_plus_binary_parameter_search_run_config_generator( + def _create_brute_plus_binary_search_run_config_generator( command_config: ConfigCommandProfile, gpus: List[GPUDevice], models: List[ModelProfileSpec], @@ -118,7 +118,7 @@ def _create_brute_plus_binary_parameter_search_run_config_generator( result_manager: ResultManager, model_variant_name_manager: ModelVariantNameManager, ) -> ConfigGeneratorInterface: - return BrutePlusBinaryParameterSearchRunConfigGenerator( + return BrutePlusBinarySearchRunConfigGenerator( config=command_config, gpus=gpus, models=models, diff --git a/model_analyzer/config/input/config_command.py b/model_analyzer/config/input/config_command.py index 23e4fc484..59d3e87ce 100755 --- a/model_analyzer/config/input/config_command.py +++ b/model_analyzer/config/input/config_command.py @@ -129,6 +129,7 @@ def _check_for_illegal_config_settings( self._check_for_bls_incompatibility(args, yaml_config) self._check_for_concurrency_rate_request_conflicts(args, yaml_config) self._check_for_config_search_rate_request_conflicts(args, yaml_config) + self._check_for_llm_incompatibility(args, yaml_config) def _set_field_values( self, args: Namespace, yaml_config: Optional[Dict[str, List]] @@ -398,6 +399,53 @@ def _check_for_config_search_rate_request_conflicts( f"\nCannot have both `run-config-search-max-request-rate` and `run-config-search-min/max-concurrency` specified in the config/CLI." ) + def _check_for_llm_incompatibility( + self, args: Namespace, yaml_config: Optional[Dict[str, List]] + ) -> None: + if not self._get_config_value("llm_search_enable", args, yaml_config): + return + + if ( + self._get_config_value("run_config_search_mode", args, yaml_config) + == "quick" + ): + raise TritonModelAnalyzerException( + f"\nLLM models are not supported in quick search. Please use brute search mode." + ) + + self._check_for_illegal_llm_option( + args, yaml_config, "run_config_search_min_model_batch_size" + ) + self._check_for_illegal_llm_option( + args, yaml_config, "run_config_search_max_model_batch_size" + ) + self._check_for_illegal_llm_option( + args, yaml_config, "run_config_search_min_concurrency" + ) + self._check_for_illegal_llm_option( + args, yaml_config, "run_config_search_max_concurrency" + ) + self._check_for_illegal_llm_option( + args, yaml_config, "run_config_search_min_request_rate" + ) + self._check_for_illegal_llm_option( + args, yaml_config, "run_config_search_max_request_rate" + ) + self._check_for_illegal_llm_option( + args, yaml_config, "request_rate_search_enable" + ) + self._check_for_illegal_llm_option(args, yaml_config, "concurrency") + self._check_for_illegal_llm_option(args, yaml_config, "latency_budget") + self._check_for_illegal_llm_option(args, yaml_config, "min_throughput") + + def _check_for_illegal_llm_option( + self, args: Namespace, yaml_config: Optional[Dict[str, List]], option: str + ) -> None: + if self._get_config_value(option, args, yaml_config): + raise TritonModelAnalyzerException( + f"\nLLM models do not support setting the `{option}` option when profiling." + ) + def _preprocess_and_verify_arguments(self): """ Enforces some rules on the config. diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py index 02d6def28..da11ac967 100755 --- a/model_analyzer/config/input/config_command_profile.py +++ b/model_analyzer/config/input/config_command_profile.py @@ -17,12 +17,14 @@ import argparse import logging import os +from typing import Dict import numba.cuda import psutil from google.protobuf.descriptor import FieldDescriptor from tritonclient.grpc.model_config_pb2 import ModelConfig +import model_analyzer.config.input.config_defaults as config_defaults from model_analyzer.config.input.config_utils import ( binary_path_validator, file_path_validator, @@ -36,62 +38,6 @@ from model_analyzer.triton.server.server_config import TritonServerConfig from .config_command import ConfigCommand -from .config_defaults import ( - DEFAULT_ALWAYS_REPORT_GPU_METRICS, - DEFAULT_BATCH_SIZES, - DEFAULT_CHECKPOINT_DIRECTORY, - DEFAULT_CLIENT_PROTOCOL, - DEFAULT_COLLECT_CPU_METRICS, - DEFAULT_DURATION_SECONDS, - DEFAULT_EXPORT_PATH, - DEFAULT_FILENAME_MODEL_GPU, - DEFAULT_FILENAME_MODEL_INFERENCE, - DEFAULT_FILENAME_SERVER_ONLY, - DEFAULT_GPU_OUTPUT_FIELDS, - DEFAULT_GPUS, - DEFAULT_INFERENCE_OUTPUT_FIELDS, - DEFAULT_MAX_RETRIES, - DEFAULT_MODEL_WEIGHTING, - DEFAULT_MONITORING_INTERVAL, - DEFAULT_NUM_CONFIGS_PER_MODEL, - DEFAULT_NUM_TOP_MODEL_CONFIGS, - DEFAULT_OFFLINE_OBJECTIVES, - DEFAULT_OFFLINE_PLOTS, - DEFAULT_ONLINE_OBJECTIVES, - DEFAULT_ONLINE_PLOTS, - DEFAULT_OUTPUT_MODEL_REPOSITORY, - DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG, - DEFAULT_PERF_ANALYZER_CPU_UTIL, - DEFAULT_PERF_ANALYZER_PATH, - DEFAULT_PERF_ANALYZER_TIMEOUT, - DEFAULT_PERF_MAX_AUTO_ADJUSTS, - DEFAULT_PERF_OUTPUT_FLAG, - DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS, - DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS, - DEFAULT_REQUEST_RATE_SEARCH_ENABLE, - DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS, - DEFAULT_RUN_CONFIG_MAX_CONCURRENCY, - DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT, - DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE, - DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE, - DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, - DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT, - DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE, - DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, - DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE, - DEFAULT_RUN_CONFIG_SEARCH_DISABLE, - DEFAULT_RUN_CONFIG_SEARCH_MODE, - DEFAULT_SERVER_OUTPUT_FIELDS, - DEFAULT_SKIP_DETAILED_REPORTS, - DEFAULT_SKIP_SUMMARY_REPORTS, - DEFAULT_TRITON_DOCKER_IMAGE, - DEFAULT_TRITON_GRPC_ENDPOINT, - DEFAULT_TRITON_HTTP_ENDPOINT, - DEFAULT_TRITON_INSTALL_PATH, - DEFAULT_TRITON_LAUNCH_MODE, - DEFAULT_TRITON_METRICS_URL, - DEFAULT_TRITON_SERVER_PATH, -) from .config_enum import ConfigEnum from .config_field import ConfigField from .config_list_generic import ConfigListGeneric @@ -224,7 +170,7 @@ def _fill_config(self): ConfigField( "checkpoint_directory", flags=["-s", "--checkpoint-directory"], - default_value=DEFAULT_CHECKPOINT_DIRECTORY, + default_value=config_defaults.DEFAULT_CHECKPOINT_DIRECTORY, field_type=ConfigPrimitive(str, validator=parent_path_validator), description="Full path to directory to which to read and write checkpoints and profile data.", ) @@ -234,7 +180,7 @@ def _fill_config(self): "monitoring_interval", flags=["-i", "--monitoring-interval"], field_type=ConfigPrimitive(float), - default_value=DEFAULT_MONITORING_INTERVAL, + default_value=config_defaults.DEFAULT_MONITORING_INTERVAL, description="Interval of time between metrics measurements in seconds", ) ) @@ -243,7 +189,7 @@ def _fill_config(self): "duration_seconds", field_type=ConfigPrimitive(int), flags=["-d", "--duration-seconds"], - default_value=DEFAULT_DURATION_SECONDS, + default_value=config_defaults.DEFAULT_DURATION_SECONDS, description="Specifies how long (seconds) to gather server-only metrics", ) ) @@ -253,7 +199,7 @@ def _fill_config(self): field_type=ConfigPrimitive(bool), flags=["--collect-cpu-metrics"], parser_args={"action": "store_true"}, - default_value=DEFAULT_COLLECT_CPU_METRICS, + default_value=config_defaults.DEFAULT_COLLECT_CPU_METRICS, description="Specify whether CPU metrics are collected or not", ) ) @@ -262,7 +208,7 @@ def _fill_config(self): "gpus", flags=["--gpus"], field_type=ConfigListString(), - default_value=DEFAULT_GPUS, + default_value=config_defaults.DEFAULT_GPUS, description="List of GPU UUIDs to be used for the profiling. " "Use 'all' to profile all the GPUs visible by CUDA.", ) @@ -273,7 +219,7 @@ def _fill_config(self): flags=["--always-report-gpu-metrics"], field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, - default_value=DEFAULT_ALWAYS_REPORT_GPU_METRICS, + default_value=config_defaults.DEFAULT_ALWAYS_REPORT_GPU_METRICS, description="Report GPU metrics, even when the model is `cpu_only`.", ) ) @@ -283,7 +229,7 @@ def _fill_config(self): flags=["--skip-summary-reports"], field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, - default_value=DEFAULT_SKIP_SUMMARY_REPORTS, + default_value=config_defaults.DEFAULT_SKIP_SUMMARY_REPORTS, description="Skips the generation of analysis summary reports and tables.", ) ) @@ -293,7 +239,7 @@ def _fill_config(self): flags=["--skip-detailed-reports"], field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, - default_value=DEFAULT_SKIP_DETAILED_REPORTS, + default_value=config_defaults.DEFAULT_SKIP_DETAILED_REPORTS, description="Skips the generation of detailed summary reports and tables.", ) ) @@ -325,7 +271,7 @@ def _add_repository_configs(self): ConfigField( "output_model_repository_path", field_type=ConfigPrimitive(str), - default_value=DEFAULT_OUTPUT_MODEL_REPOSITORY, + default_value=config_defaults.DEFAULT_OUTPUT_MODEL_REPOSITORY, flags=["--output-model-repository-path"], description="Output model repository path used by Model Analyzer." " This is the directory that will contain all the generated model configurations", @@ -336,7 +282,7 @@ def _add_repository_configs(self): "override_output_model_repository", field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, - default_value=DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG, + default_value=config_defaults.DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG, flags=["--override-output-model-repository"], description="Will override the contents of the output model repository" " and replace it with the new results.", @@ -520,7 +466,7 @@ def _add_profile_models_configs(self): ConfigField( "objectives", field_type=objectives_scheme, - default_value=DEFAULT_OFFLINE_OBJECTIVES, + default_value=config_defaults.DEFAULT_OFFLINE_OBJECTIVES, description="Model Analyzer uses the objectives described here to find the best configuration for each model.", ) ) @@ -552,7 +498,11 @@ def _add_profile_models_configs(self): schema={ "batch_sizes": ConfigListNumeric(type_=int), "concurrency": ConfigListNumeric(type_=int), + "periodic_concurrency": ConfigListString(), "request_rate": ConfigListNumeric(type_=int), + "request_period": ConfigListNumeric(type_=int), + "text_input_length": ConfigListNumeric(type_=int), + "max_token_count": ConfigListNumeric(type_=int), } ), "objectives": objectives_scheme, @@ -602,7 +552,7 @@ def _add_profile_models_configs(self): "batch_sizes", flags=["-b", "--batch-sizes"], field_type=ConfigListNumeric(int), - default_value=DEFAULT_BATCH_SIZES, + default_value=config_defaults.DEFAULT_BATCH_SIZES, description="Comma-delimited list of batch sizes to use for the profiling", ) ) @@ -615,6 +565,14 @@ def _add_profile_models_configs(self): " to be used during profiling", ) ) + self._add_config( + ConfigField( + "periodic_concurrency", + flags=["--periodic-concurrency"], + field_type=ConfigListString(), + description="A list of ranges to be used during profiling", + ) + ) self._add_config( ConfigField( "request_rate", @@ -624,6 +582,33 @@ def _add_profile_models_configs(self): " to be used during profiling", ) ) + self._add_config( + ConfigField( + "request_period", + flags=["--request-period"], + field_type=ConfigListNumeric(int), + description="Comma-delimited list of request period values or ranges " + " to be used during profiling", + ) + ) + self._add_config( + ConfigField( + "text_input_length", + flags=["--text-input-length"], + field_type=ConfigListNumeric(int), + description="Comma-delimited list of text input length values or ranges " + " to be used during profiling LLMs", + ) + ) + self._add_config( + ConfigField( + "max_token_count", + flags=["--max-token-count"], + field_type=ConfigListNumeric(int), + description="Comma-delimited list of max token values or ranges " + " to be used during profiling LLMs", + ) + ) self._add_config( ConfigField( "reload_model_disable", @@ -685,7 +670,7 @@ def _add_client_configs(self): "client_max_retries", flags=["-r", "--client-max-retries"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_MAX_RETRIES, + default_value=config_defaults.DEFAULT_MAX_RETRIES, description="Specifies the max number of retries for any requests to Triton server.", ) ) @@ -695,7 +680,7 @@ def _add_client_configs(self): flags=["--client-protocol"], choices=["http", "grpc"], field_type=ConfigPrimitive(str), - default_value=DEFAULT_CLIENT_PROTOCOL, + default_value=config_defaults.DEFAULT_CLIENT_PROTOCOL, description="The protocol used to communicate with the Triton Inference Server", ) ) @@ -721,8 +706,8 @@ def _add_run_search_configs(self): "run_config_search_max_concurrency", flags=["--run-config-search-max-concurrency"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MAX_CONCURRENCY, - description="Max concurrency value that run config search should not go beyond that.", + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_CONCURRENCY, + description="Max concurrency value that run config search should not go beyond.", ) ) self._add_config( @@ -730,17 +715,53 @@ def _add_run_search_configs(self): "run_config_search_min_concurrency", flags=["--run-config-search-min-concurrency"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, description="Min concurrency value that run config search should start with.", ) ) + self._add_config( + ConfigField( + "run_config_search_max_periodic_concurrency", + flags=["--run-config-search-max-periodic-concurrency"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY, + description="Max periodic concurrency value that run config search should not go beyond.", + ) + ) + self._add_config( + ConfigField( + "run_config_search_min_periodic_concurrency", + flags=["--run-config-search-min-periodic-concurrency"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY, + description="Min periodic concurrency value that run config search should start with.", + ) + ) + self._add_config( + ConfigField( + "run_config_search_max_periodic_concurrency_step", + flags=["--run-config-search-max-periodic-concurrency-step"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY_STEP, + description="Max periodic concurrency step value that run config search should not go beyond.", + ) + ) + self._add_config( + ConfigField( + "run_config_search_min_periodic_concurrency_step", + flags=["--run-config-search-min-periodic-concurrency-step"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY_STEP, + description="Min periodic concurrency step value that run config search should start with.", + ) + ) self._add_config( ConfigField( "run_config_search_max_request_rate", flags=["--run-config-search-max-request-rate"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE, - description="Max request rate value that run config search should not go beyond that.", + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE, + description="Max request rate value that run config search should not go beyond.", ) ) self._add_config( @@ -748,17 +769,35 @@ def _add_run_search_configs(self): "run_config_search_min_request_rate", flags=["--run-config-search-min-request-rate"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, description="Min request rate value that run config search should start with.", ) ) + self._add_config( + ConfigField( + "run_config_search_max_request_period", + flags=["--run-config-search-max-request-period"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_PERIOD, + description="Max request period value that run config search should not go beyond.", + ) + ) + self._add_config( + ConfigField( + "run_config_search_min_request_period", + flags=["--run-config-search-min-request-period"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD, + description="Min request period value that run config search should start with.", + ) + ) self._add_config( ConfigField( "run_config_search_max_instance_count", flags=["--run-config-search-max-instance-count"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT, - description="Max instance count value that run config search should not go beyond that.", + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT, + description="Max instance count value that run config search should not go beyond.", ) ) self._add_config( @@ -766,7 +805,7 @@ def _add_run_search_configs(self): "run_config_search_min_instance_count", flags=["--run-config-search-min-instance-count"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT, description="Min instance count value that run config search should start with.", ) ) @@ -775,7 +814,7 @@ def _add_run_search_configs(self): "run_config_search_max_model_batch_size", flags=["--run-config-search-max-model-batch-size"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE, description="Value for the model's max_batch_size that run config search will not go beyond.", ) ) @@ -784,7 +823,7 @@ def _add_run_search_configs(self): "run_config_search_min_model_batch_size", flags=["--run-config-search-min-model-batch-size"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE, description="Value for the model's max_batch_size that run config search will start from.", ) ) @@ -793,7 +832,7 @@ def _add_run_search_configs(self): "run_config_search_max_binary_search_steps", flags=["--run-config-search-max-binary-search-steps"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS, + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS, description="Maximum number of steps take during the binary concurrency search.", ) ) @@ -803,7 +842,7 @@ def _add_run_search_configs(self): flags=["--run-config-search-mode"], choices=["brute", "quick"], field_type=ConfigPrimitive(str), - default_value=DEFAULT_RUN_CONFIG_SEARCH_MODE, + default_value=config_defaults.DEFAULT_RUN_CONFIG_SEARCH_MODE, description="The search mode for Model Analyzer to find and evaluate" " model configurations. 'brute' will brute force all combinations of" " configuration options. 'quick' will attempt to find a near-optimal" @@ -817,7 +856,7 @@ def _add_run_search_configs(self): flags=["--run-config-search-disable"], field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, - default_value=DEFAULT_RUN_CONFIG_SEARCH_DISABLE, + default_value=config_defaults.DEFAULT_RUN_CONFIG_SEARCH_DISABLE, description="Disable run config search.", ) ) @@ -827,7 +866,7 @@ def _add_run_search_configs(self): flags=["--run-config-profile-models-concurrently-enable"], field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, - default_value=DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE, + default_value=config_defaults.DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE, description="Enable the profiling of all supplied models concurrently.", ) ) @@ -837,10 +876,56 @@ def _add_run_search_configs(self): flags=["--request-rate-search-enable"], field_type=ConfigPrimitive(bool), parser_args={"action": "store_true"}, - default_value=DEFAULT_REQUEST_RATE_SEARCH_ENABLE, + default_value=config_defaults.DEFAULT_REQUEST_RATE_SEARCH_ENABLE, description="Enables the searching of request rate (instead of concurrency).", ) ) + self._add_config( + ConfigField( + "llm_search_enable", + flags=["--llm-search-enable"], + field_type=ConfigPrimitive(bool), + parser_args={"action": "store_true"}, + default_value=config_defaults.DEFAULT_LLM_SEARCH_ENABLE, + description="Enables searching values are important to LLMs: text input length, max token, etc...", + ) + ) + self._add_config( + ConfigField( + "run_config_search_min_text_input_length", + flags=["--run-config-search-min-text-input-length"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH, + description="Min text input length that run config search should start with.", + ) + ) + self._add_config( + ConfigField( + "run_config_search_max_text_input_length", + flags=["--run-config-search-max-text-input-length"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH, + description="Max text input length that run config search will not go beyond.", + ) + ) + self._add_config( + ConfigField( + "run_config_search_min_max_token_count", + flags=["--run-config-search-min-max-token-count"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, + description="Min max_token count that run config search should start with.", + ) + ) + self._add_config( + ConfigField( + "run_config_search_max_max_token_count", + flags=["--run-config-search-max-max-token-count"], + field_type=ConfigPrimitive(int), + default_value=config_defaults.DEFAULT_RUN_CONFIG_MAX_MAX_TOKEN_COUNT, + description="Max max_token count that run config search will not go beyond.", + ) + ) def _add_triton_configs(self): """ @@ -853,7 +938,7 @@ def _add_triton_configs(self): "triton_launch_mode", field_type=ConfigPrimitive(str), flags=["--triton-launch-mode"], - default_value=DEFAULT_TRITON_LAUNCH_MODE, + default_value=config_defaults.DEFAULT_TRITON_LAUNCH_MODE, choices=["local", "docker", "remote", "c_api"], description="The method by which to launch Triton Server. " "'local' assumes tritonserver binary is available locally. " @@ -869,7 +954,7 @@ def _add_triton_configs(self): "triton_docker_image", flags=["--triton-docker-image"], field_type=ConfigPrimitive(str), - default_value=DEFAULT_TRITON_DOCKER_IMAGE, + default_value=config_defaults.DEFAULT_TRITON_DOCKER_IMAGE, description="Triton Server Docker image tag", ) ) @@ -878,7 +963,7 @@ def _add_triton_configs(self): "triton_http_endpoint", flags=["--triton-http-endpoint"], field_type=ConfigPrimitive(str), - default_value=DEFAULT_TRITON_HTTP_ENDPOINT, + default_value=config_defaults.DEFAULT_TRITON_HTTP_ENDPOINT, description="Triton Server HTTP endpoint url used by Model Analyzer client.", ) ) @@ -887,7 +972,7 @@ def _add_triton_configs(self): "triton_grpc_endpoint", flags=["--triton-grpc-endpoint"], field_type=ConfigPrimitive(str), - default_value=DEFAULT_TRITON_GRPC_ENDPOINT, + default_value=config_defaults.DEFAULT_TRITON_GRPC_ENDPOINT, description="Triton Server HTTP endpoint url used by Model Analyzer client.", ) ) @@ -896,7 +981,7 @@ def _add_triton_configs(self): "triton_metrics_url", field_type=ConfigPrimitive(str), flags=["--triton-metrics-url"], - default_value=DEFAULT_TRITON_METRICS_URL, + default_value=config_defaults.DEFAULT_TRITON_METRICS_URL, description="Triton Server Metrics endpoint url. ", ) ) @@ -905,7 +990,7 @@ def _add_triton_configs(self): "triton_server_path", field_type=ConfigPrimitive(str), flags=["--triton-server-path"], - default_value=DEFAULT_TRITON_SERVER_PATH, + default_value=config_defaults.DEFAULT_TRITON_SERVER_PATH, description="The full path to the tritonserver binary executable", ) ) @@ -953,7 +1038,7 @@ def _add_triton_configs(self): ConfigField( "triton_install_path", field_type=ConfigPrimitive(str), - default_value=DEFAULT_TRITON_INSTALL_PATH, + default_value=config_defaults.DEFAULT_TRITON_INSTALL_PATH, flags=["--triton-install-path"], description=( "Path to Triton install directory i.e. the parent directory of 'lib/libtritonserver.so'." @@ -973,7 +1058,7 @@ def _add_perf_analyzer_configs(self): "perf_analyzer_timeout", flags=["--perf-analyzer-timeout"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_PERF_ANALYZER_TIMEOUT, + default_value=config_defaults.DEFAULT_PERF_ANALYZER_TIMEOUT, description="Perf analyzer timeout value in seconds.", ) ) @@ -982,7 +1067,8 @@ def _add_perf_analyzer_configs(self): "perf_analyzer_cpu_util", flags=["--perf-analyzer-cpu-util"], field_type=ConfigPrimitive(float), - default_value=psutil.cpu_count() * DEFAULT_PERF_ANALYZER_CPU_UTIL, + default_value=psutil.cpu_count() + * config_defaults.DEFAULT_PERF_ANALYZER_CPU_UTIL, description="Maximum CPU utilization value allowed for the perf_analyzer.", ) ) @@ -991,7 +1077,7 @@ def _add_perf_analyzer_configs(self): "perf_analyzer_path", flags=["--perf-analyzer-path"], field_type=ConfigPrimitive(str, validator=binary_path_validator), - default_value=DEFAULT_PERF_ANALYZER_PATH, + default_value=config_defaults.DEFAULT_PERF_ANALYZER_PATH, description="The full path to the perf_analyzer binary executable", ) ) @@ -1001,7 +1087,7 @@ def _add_perf_analyzer_configs(self): flags=["--perf-output"], parser_args={"action": "store_true"}, field_type=ConfigPrimitive(bool), - default_value=DEFAULT_PERF_OUTPUT_FLAG, + default_value=config_defaults.DEFAULT_PERF_OUTPUT_FLAG, description="Enables the output from the perf_analyzer to a file specified by" " perf_output_path. If perf_output_path is None, output will be" " written to stdout.", @@ -1020,7 +1106,7 @@ def _add_perf_analyzer_configs(self): "perf_analyzer_max_auto_adjusts", flags=["--perf-analyzer-max-auto-adjusts"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_PERF_MAX_AUTO_ADJUSTS, + default_value=config_defaults.DEFAULT_PERF_MAX_AUTO_ADJUSTS, description="Maximum number of times perf_analyzer is " "launched with auto adjusted parameters in an attempt to profile a model. ", ) @@ -1034,7 +1120,7 @@ def _add_export_configs(self): ConfigField( "export_path", flags=["-e", "--export-path"], - default_value=DEFAULT_EXPORT_PATH, + default_value=config_defaults.DEFAULT_EXPORT_PATH, field_type=ConfigPrimitive(str, validator=parent_path_validator), description="Full path to directory in which to store the results", ) @@ -1043,7 +1129,7 @@ def _add_export_configs(self): ConfigField( "filename_model_inference", flags=["--filename-model-inference"], - default_value=DEFAULT_FILENAME_MODEL_INFERENCE, + default_value=config_defaults.DEFAULT_FILENAME_MODEL_INFERENCE, field_type=ConfigPrimitive(str), description="Specifies filename for storing model inference metrics", ) @@ -1053,7 +1139,7 @@ def _add_export_configs(self): "filename_model_gpu", flags=["--filename-model-gpu"], field_type=ConfigPrimitive(str), - default_value=DEFAULT_FILENAME_MODEL_GPU, + default_value=config_defaults.DEFAULT_FILENAME_MODEL_GPU, description="Specifies filename for storing model GPU metrics", ) ) @@ -1062,7 +1148,7 @@ def _add_export_configs(self): "filename_server_only", flags=["--filename-server-only"], field_type=ConfigPrimitive(str), - default_value=DEFAULT_FILENAME_SERVER_ONLY, + default_value=config_defaults.DEFAULT_FILENAME_SERVER_ONLY, description="Specifies filename for server-only metrics", ) ) @@ -1076,7 +1162,7 @@ def _add_report_configs(self): "num_configs_per_model", flags=["--num-configs-per-model"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_NUM_CONFIGS_PER_MODEL, + default_value=config_defaults.DEFAULT_NUM_CONFIGS_PER_MODEL, description="The number of configurations to plot per model in the summary.", ) ) @@ -1085,7 +1171,7 @@ def _add_report_configs(self): "num_top_model_configs", flags=["--num-top-model-configs"], field_type=ConfigPrimitive(int), - default_value=DEFAULT_NUM_TOP_MODEL_CONFIGS, + default_value=config_defaults.DEFAULT_NUM_TOP_MODEL_CONFIGS, description="Model Analyzer will compare this many of the top models configs across all models.", ) ) @@ -1100,7 +1186,7 @@ def _add_table_configs(self): "inference_output_fields", flags=["--inference-output-fields"], field_type=ConfigListString(), - default_value=DEFAULT_INFERENCE_OUTPUT_FIELDS, + default_value=config_defaults.DEFAULT_INFERENCE_OUTPUT_FIELDS, description="Specifies column keys for model inference metrics table", ) ) @@ -1109,7 +1195,7 @@ def _add_table_configs(self): "gpu_output_fields", flags=["--gpu-output-fields"], field_type=ConfigListString(), - default_value=DEFAULT_GPU_OUTPUT_FIELDS, + default_value=config_defaults.DEFAULT_GPU_OUTPUT_FIELDS, description="Specifies column keys for model gpu metrics table", ) ) @@ -1118,7 +1204,7 @@ def _add_table_configs(self): "server_output_fields", flags=["--server-output-fields"], field_type=ConfigListString(), - default_value=DEFAULT_SERVER_OUTPUT_FIELDS, + default_value=config_defaults.DEFAULT_SERVER_OUTPUT_FIELDS, description="Specifies column keys for server-only metrics table", ) ) @@ -1163,7 +1249,9 @@ def set_config_values(self, args: argparse.Namespace) -> None: this exception """ if args.mode == "online" and "latency_budget" not in args: - self._fields["objectives"].set_default_value(DEFAULT_ONLINE_OBJECTIVES) + self._fields["objectives"].set_default_value( + config_defaults.DEFAULT_ONLINE_OBJECTIVES + ) super().set_config_values(args) @@ -1171,9 +1259,9 @@ def set_config_values(self, args: argparse.Namespace) -> None: # able to edit these plots. self._add_plot_configs() if args.mode == "online": - self._fields["plots"].set_value(DEFAULT_ONLINE_PLOTS) + self._fields["plots"].set_value(config_defaults.DEFAULT_ONLINE_PLOTS) elif args.mode == "offline": - self._fields["plots"].set_value(DEFAULT_OFFLINE_PLOTS) + self._fields["plots"].set_value(config_defaults.DEFAULT_OFFLINE_PLOTS) def _add_plot_configs(self): """ @@ -1331,16 +1419,23 @@ def _autofill_values(self): {"perf_throughput": {"min": self.min_throughput}} ) - # Switch default output fields if request rate is being used + # Switch default output fields if LLM model or request rate is being used # and the user didn't specify a custom output field - if self._using_request_rate(): + if self.is_llm_model(): if not self._fields["inference_output_fields"].is_set_by_user(): self.inference_output_fields = ( - DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS + config_defaults.DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS + ) + elif self._using_request_rate(): + if not self._fields["inference_output_fields"].is_set_by_user(): + self.inference_output_fields = ( + config_defaults.DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS ) if not self._fields["gpu_output_fields"].is_set_by_user(): - self.gpu_output_fields = DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS + self.gpu_output_fields = ( + config_defaults.DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS + ) new_profile_models = {} for i, model in enumerate(self.profile_models): @@ -1369,7 +1464,7 @@ def _autofill_values(self): "Weighting can not be specified as a global parameter. Please make this a model parameter." ) else: - new_model["weighting"] = DEFAULT_MODEL_WEIGHTING + new_model["weighting"] = config_defaults.DEFAULT_MODEL_WEIGHTING else: new_model["weighting"] = model.weighting() @@ -1404,30 +1499,35 @@ def _autofill_values(self): new_model["parameters"] = { "batch_sizes": self.batch_sizes, "concurrency": self.concurrency, + "periodic_concurrency": self.periodic_concurrency, "request_rate": self.request_rate, + "request_period": self.request_period, + "text_input_length": self.text_input_length, + "max_token_count": self.max_token_count, } else: new_model["parameters"] = {} - if "batch_sizes" in model.parameters(): - new_model["parameters"].update( - {"batch_sizes": model.parameters()["batch_sizes"]} - ) - else: - new_model["parameters"].update({"batch_sizes": self.batch_sizes}) - - if "concurrency" in model.parameters(): - new_model["parameters"].update( - {"concurrency": model.parameters()["concurrency"]} - ) - else: - new_model["parameters"].update({"concurrency": self.concurrency}) - - if "request_rate" in model.parameters(): - new_model["parameters"].update( - {"request_rate": model.parameters()["request_rate"]} - ) - else: - new_model["parameters"].update({"request_rate": self.request_rate}) + new_model["parameters"].update( + self._set_model_parameter(model, "batch_sizes") + ) + new_model["parameters"].update( + self._set_model_parameter(model, "concurrency") + ) + new_model["parameters"].update( + self._set_model_parameter(model, "periodic_concurrency") + ) + new_model["parameters"].update( + self._set_model_parameter(model, "request_rate") + ) + new_model["parameters"].update( + self._set_model_parameter(model, "request_period") + ) + new_model["parameters"].update( + self._set_model_parameter(model, "max_token_count") + ) + new_model["parameters"].update( + self._set_model_parameter(model, "text_input_length") + ) if ( new_model["parameters"]["request_rate"] @@ -1470,6 +1570,14 @@ def _autofill_values(self): new_profile_models[model.model_name()] = new_model self._fields["profile_models"].set_value(new_profile_models) + def _set_model_parameter( + self, model: ConfigModelProfileSpec, parameter_name: str + ) -> Dict: + if parameter_name in model.parameters(): + return {parameter_name: model.parameters()[parameter_name]} + else: + return {parameter_name: getattr(self, parameter_name)} + def _using_request_rate(self) -> bool: if self.request_rate or self.request_rate_search_enable: return True @@ -1509,3 +1617,31 @@ def is_request_rate_specified(self, model_parameters: dict) -> bool: or self.get_config()["run_config_search_min_request_rate"].is_set_by_user() or self.get_config()["run_config_search_max_request_rate"].is_set_by_user() ) + + def is_llm_model(self) -> bool: + """ + Returns true if the user has enabled llm search or set any llm search value + """ + config = self.get_config() + + return ( + self.llm_search_enable + or config["run_config_search_min_text_input_length"].is_set_by_user() + or config["run_config_search_max_text_input_length"].is_set_by_user() + or config["run_config_search_min_max_token_count"].is_set_by_user() + or config["run_config_search_max_max_token_count"].is_set_by_user() + or config["run_config_search_min_periodic_concurrency"].is_set_by_user() + or config["run_config_search_max_periodic_concurrency"].is_set_by_user() + or config[ + "run_config_search_min_periodic_concurrency_step" + ].is_set_by_user() + or config[ + "run_config_search_max_periodic_concurrency_step" + ].is_set_by_user() + or config["run_config_search_min_request_period"].is_set_by_user() + or config["run_config_search_max_request_period"].is_set_by_user() + or config["text_input_length"].is_set_by_user() + or config["max_token_count"].is_set_by_user() + or config["periodic_concurrency"].is_set_by_user() + or config["request_period"].is_set_by_user() + ) diff --git a/model_analyzer/config/input/config_command_report.py b/model_analyzer/config/input/config_command_report.py index 7d1eee7fb..d4cf9b536 100755 --- a/model_analyzer/config/input/config_command_report.py +++ b/model_analyzer/config/input/config_command_report.py @@ -209,6 +209,10 @@ def set_config_values(self, args): super().set_config_values(args) + # TODO TMA-1443: Update this when adding support for detailed reporting + def is_llm_model(self) -> bool: + return False + def _preprocess_and_verify_arguments(self): """ Enforces some rules on the config. diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py index 67c62dca9..fb0b62ee8 100755 --- a/model_analyzer/config/input/config_defaults.py +++ b/model_analyzer/config/input/config_defaults.py @@ -38,23 +38,36 @@ DEFAULT_SKIP_SUMMARY_REPORTS = False DEFAULT_SKIP_DETAILED_REPORTS = False DEFAULT_OUTPUT_MODEL_REPOSITORY = os.path.join(os.getcwd(), "output_model_repository") +DEFAULT_INPUT_JSON_PATH = os.path.join(os.getcwd(), "input_json_dir") DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG = False DEFAULT_BATCH_SIZES = 1 DEFAULT_MAX_RETRIES = 50 DEFAULT_CLIENT_PROTOCOL = "grpc" DEFAULT_RUN_CONFIG_MAX_CONCURRENCY = 1024 DEFAULT_RUN_CONFIG_MIN_CONCURRENCY = 1 +DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY = "1:1:1" +DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY = 1024 +DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY = 16 +DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY_STEP = 128 +DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY_STEP = 4 DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE = 8192 DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE = 16 +DEFAULT_RUN_CONFIG_MAX_REQUEST_PERIOD = 256 +DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD = 1 DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT = 5 DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT = 1 DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1 DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE = 128 DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5 +DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH = 1 +DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH = 1024 +DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT = 1 +DEFAULT_RUN_CONFIG_MAX_MAX_TOKEN_COUNT = 256 DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False DEFAULT_RUN_CONFIG_SEARCH_MODE = "brute" DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE = False DEFAULT_REQUEST_RATE_SEARCH_ENABLE = False +DEFAULT_LLM_SEARCH_ENABLE = False DEFAULT_TRITON_LAUNCH_MODE = "local" DEFAULT_TRITON_DOCKER_IMAGE = "nvcr.io/nvidia/tritonserver:23.09-py3" DEFAULT_TRITON_HTTP_ENDPOINT = "localhost:8000" @@ -129,6 +142,18 @@ "perf_throughput", "perf_latency_p99", ] +DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS = [ + "model_name", + "batch_size", + "periodic_concurrency", + "request_period", + "text_input_length", + "max_tokens", + "model_config_path", + "instance_group", + "avg_first_token_latency", + "avg_token_to_token_latency", +] DEFAULT_GPU_OUTPUT_FIELDS = [ "model_name", "gpu_uuid", diff --git a/model_analyzer/config/input/config_list_numeric.py b/model_analyzer/config/input/config_list_numeric.py index 799cbdf9e..b677bcdab 100755 --- a/model_analyzer/config/input/config_list_numeric.py +++ b/model_analyzer/config/input/config_list_numeric.py @@ -103,7 +103,14 @@ def set_value(self, value): try: if self._is_string(value): self._value = [] - value = value.split(",") + if "," in value: + value = value.split(",") + elif ":" in value: + value = value.split(":") + if len(value) == 2: + value = {"start": value[0], "stop": value[1], "step": 1} + else: + value = {"start": value[0], "stop": value[1], "step": value[2]} if self._is_list(value): new_value = self._process_list(value) diff --git a/model_analyzer/constants.py b/model_analyzer/constants.py index 886360d34..09f581326 100755 --- a/model_analyzer/constants.py +++ b/model_analyzer/constants.py @@ -32,8 +32,8 @@ # Run Search THROUGHPUT_MINIMUM_GAIN = 0.05 +THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES = 4 THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES = 4 -THROUGHPUT_MINIMUM_CONSECUTIVE_BATCH_SIZE_TRIES = 4 # Quick search algorithm constants RADIUS = 3 diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index c88f8e655..0a8f2b725 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -16,6 +16,7 @@ import csv import glob +import json import logging import os import re @@ -25,6 +26,7 @@ from typing import Dict, List import psutil +from numpy import mean from model_analyzer.constants import ( INTERVAL_SLEEP_TIME, @@ -36,6 +38,10 @@ ) from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException from model_analyzer.record.record import Record +from model_analyzer.record.types.avg_first_token_latency import AvgFirstTokenLatency +from model_analyzer.record.types.avg_token_to_token_latency import ( + AvgTokenToTokenLatency, +) from model_analyzer.record.types.gpu_free_memory import GPUFreeMemory from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory @@ -91,6 +97,11 @@ class PerfAnalyzer: ["gpu_used_memory", "Max GPU Memory Usage", GPUUsedMemory, "1000000"], ["gpu_free_memory", "Total GPU Memory", GPUFreeMemory, "1000000"] ] + + llm_metric_table = [ + ["avg_first_token_latency", None, AvgFirstTokenLatency, "1000000"], + ["avg_token_to_token_latency", None, AvgTokenToTokenLatency, "1000000"] + ] # yapf: enable @staticmethod @@ -109,6 +120,14 @@ def get_gpu_metrics(): ] return gpu_metrics + @staticmethod + def get_llm_metrics(): + llm_metrics = [ + llm_metric[PerfAnalyzer.RECORD_CLASS] + for llm_metric in PerfAnalyzer.llm_metric_table + ] + return llm_metrics + def __init__(self, path, config, max_retries, timeout, max_cpu_util): """ Parameters @@ -134,6 +153,7 @@ def __init__(self, path, config, max_retries, timeout, max_cpu_util): self._output = "" self._perf_records = {} self._gpu_records = [] + self._llm_records = {} self._max_cpu_util = max_cpu_util def run(self, metrics, env=None): @@ -207,6 +227,19 @@ def get_gpu_records(self): return self._gpu_records + def get_llm_records(self): + """ + Returns + ------- + The LLM records from the last perf_analyzer run + """ + + if self._llm_records: + return self._llm_records + raise TritonModelAnalyzerException( + "Attempted to get perf_analyzer results without calling run first." + ) + def output(self): """ Returns @@ -252,6 +285,14 @@ def _get_single_model_cmd(self, index): if self._is_multi_model(): cmd += ["--enable-mpi"] cmd += self._get_pa_cli_command(index).replace("=", " ").split() + + # OPTME: There should be a more elegant way of determining how to add EOS + # We have to do it here because we use a dictionary to create the PA command + # and it already contains `--request-parameter` + if "--periodic-concurrency-range" in cmd: + cmd.append("--request-parameter") + cmd.append("ignore_eos:true:bool") + return cmd def _get_pa_cli_command(self, index): @@ -448,21 +489,88 @@ def _parse_outputs(self, metrics): logger.debug( f"Reading PA results from {perf_config['latency-report-file']}" ) - with open(perf_config["latency-report-file"], mode="r") as f: - csv_reader = csv.DictReader(f, delimiter=",") - - for row in csv_reader: - self._perf_records[ - perf_config["model-name"] - ] = self._extract_perf_records_from_row(metrics, row) - self._gpu_records = self._extract_gpu_records_from_row(metrics, row) + self._extract_gpu_records(perf_config, metrics) + self._extract_llm_records(perf_config, metrics) for perf_config in [ mrc.perf_config() for mrc in self._config.model_run_configs() ]: - # Remove the latency file and all associated composing model latency files + # Remove the latency/profile export files and all associated composing model latency files for f in glob.glob(f"*{perf_config['latency-report-file']}"): os.remove(f) + for f in glob.glob(f"*{perf_config['profile-export-file']}"): + os.remove(f) + + def _extract_gpu_records(self, perf_config, metrics): + if perf_config["profile-export-file"]: + return + + with open(perf_config["latency-report-file"], mode="r") as f: + csv_reader = csv.DictReader(f, delimiter=",") + + for row in csv_reader: + self._perf_records[ + perf_config["model-name"] + ] = self._extract_perf_records_from_row(metrics, row) + self._gpu_records = self._extract_gpu_records_from_row(metrics, row) + + def _extract_llm_records(self, perf_config, metrics): + if not perf_config["profile-export-file"]: + return + + self._llm_records[perf_config["model-name"]] = [] + + with open(perf_config["profile-export-file"], mode="r") as f: + llm_output = json.load(f) + + avg_first_token_latency = self._calculate_avg_first_token_latency( + llm_output + ) + record = PerfAnalyzer.llm_metric_table[0][PerfAnalyzer.RECORD_CLASS]( + value=avg_first_token_latency + ) # type: ignore + + self._llm_records[perf_config["model-name"]].append(record) + + avg_token_to_token_latency = self._calculate_avg_token_to_token_latency( + llm_output + ) + record = PerfAnalyzer.llm_metric_table[1][PerfAnalyzer.RECORD_CLASS]( + value=avg_token_to_token_latency + ) # type: ignore + self._llm_records[perf_config["model-name"]].append(record) + + def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float: + total_first_token_latencies = [] + for request in llm_output["experiments"][0]["requests"]: + total_first_token_latencies.append( + request["response_timestamps"][0] - request["timestamp"] + ) + + avg_first_token_latency = float(mean(total_first_token_latencies)) + reduction_factor = float( + PerfAnalyzer.llm_metric_table[0][PerfAnalyzer.REDUCTION_FACTOR] # type: ignore + ) + + return avg_first_token_latency / reduction_factor + + def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float: + token_to_token_latencies = [] + for request in llm_output["experiments"][0]["requests"]: + response_to_response_latencies = [] + prev_response = request["response_timestamps"][0] + for response in request["response_timestamps"][1:]: + response_to_response_latencies.append(response - prev_response) + prev_response = response + + token_to_token_latencies.append(mean(response_to_response_latencies)) + + avg_token_to_token_latency = float(mean(token_to_token_latencies)) + reduction_factor = float( + PerfAnalyzer.llm_metric_table[1][PerfAnalyzer.REDUCTION_FACTOR] # type: ignore + ) + + return avg_token_to_token_latency / reduction_factor def _extract_perf_records_from_row( self, requested_metrics: List[Record], row_metrics: Dict[str, str] diff --git a/model_analyzer/perf_analyzer/perf_config.py b/model_analyzer/perf_analyzer/perf_config.py index e9160a44a..35df6b7a3 100755 --- a/model_analyzer/perf_analyzer/perf_config.py +++ b/model_analyzer/perf_analyzer/perf_config.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from model_analyzer.config.generate.generator_utils import GeneratorUtils as utils from model_analyzer.config.input.config_defaults import DEFAULT_MEASUREMENT_MODE from model_analyzer.constants import SECONDS_TO_MILLISECONDS_MULTIPLIER from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException @@ -33,6 +34,7 @@ class PerfAnalyzerConfig: "measurement-interval", "concurrency-range", "request-rate-range", + "periodic-concurrency-range", "request-distribution", "request-intervals", "binary-search", @@ -71,6 +73,8 @@ class PerfAnalyzerConfig: "metrics-url", "metrics-interval", "bls-composing-models", + "request-parameter", + "request-period", ] input_to_options = [ @@ -80,6 +84,7 @@ class PerfAnalyzerConfig: "url", "protocol", "latency-report-file", + "profile-export-file", "http-header", ] @@ -96,6 +101,8 @@ class PerfAnalyzerConfig: "collect-metrics", ] + llm_args = ["text-input-length", "max-tokens"] + def __init__(self): """ Construct a PerfAnalyzerConfig @@ -110,6 +117,7 @@ def __init__(self): "-u": None, "-i": None, "-f": None, + "--profile-export-file": None, "-H": None, } self._verbose = {"-v": None, "-v -v": None, "--verbose-csv": None} @@ -121,6 +129,7 @@ def __init__(self): "url": "-u", "protocol": "-i", "latency-report-file": "-f", + "profile-export-file": "--profile-export-file", "http-header": "-H", } @@ -145,7 +154,12 @@ def allowed_keys(cls): passed into perf_analyzer """ - return cls.perf_analyzer_args + cls.input_to_options + cls.input_to_verbose + return ( + cls.perf_analyzer_args + + cls.input_to_options + + cls.input_to_verbose + + cls.llm_args + ) @classmethod def additive_keys(cls): @@ -191,6 +205,9 @@ def update_config_from_profile_config(self, model_name, profile_config): "verbose-csv": "--verbose-csv", } + if profile_config.is_llm_model(): + params.update({"profile-export-file": model_name + "-results.json"}) + if profile_config.triton_launch_mode == "c_api": params.update( { @@ -273,6 +290,14 @@ def extract_model_specific_parameters(self): "batch-size": self._options["-b"], "concurrency-range": self._args["concurrency-range"], "request-rate-range": self._args["request-rate-range"], + "periodic-concurrency-range": self._args["periodic-concurrency-range"], + "max-tokens": utils.extract_value_from_request_parameter( + self._args["request-parameter"] + ), + "request-period": self._args["request-period"], + "text-input-length": utils.extract_text_input_length_from_input_data( + self._args["input-data"] + ), } @classmethod @@ -303,7 +328,7 @@ def remove_url_from_cli_string(cls, cli_string): @classmethod def remove_mrc_from_cli_string(cls, cli_string): """ - utility function strips the measruement request count + utility function strips the measurement request count from a cli string representation Parameters diff --git a/model_analyzer/record/metrics_manager.py b/model_analyzer/record/metrics_manager.py index 176b632df..10459a76f 100755 --- a/model_analyzer/record/metrics_manager.py +++ b/model_analyzer/record/metrics_manager.py @@ -16,6 +16,7 @@ import logging import os +import shutil import time from collections import defaultdict from typing import Dict, List, Optional, Tuple @@ -27,6 +28,7 @@ from model_analyzer.config.generate.base_model_config_generator import ( BaseModelConfigGenerator, ) +from model_analyzer.config.input.config_defaults import DEFAULT_INPUT_JSON_PATH from model_analyzer.config.run.run_config import RunConfig from model_analyzer.constants import LOGGER_NAME, PA_ERROR_LOG_FILENAME from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException @@ -69,6 +71,8 @@ class MetricsManager: "gpu_power_usage", "cpu_available_ram", "cpu_used_ram", + "avg_first_token_latency", + "avg_token_to_token_latency", ] def __init__(self, config, client, server, gpus, result_manager, state_manager): @@ -116,6 +120,7 @@ def __init__(self, config, client, server, gpus, result_manager, state_manager): self._gpu_metrics, self._perf_metrics, self._cpu_metrics, + self._llm_metrics, ) = self._categorize_metrics(self.metrics, self._config.collect_cpu_metrics) self._gpus = gpus self._init_state() @@ -160,21 +165,23 @@ def _categorize_metrics(metric_tags, collect_cpu_metrics=False): Returns ------- - (list,list,list) - tuple of three lists (DCGM, PerfAnalyzer, CPU) metrics + (list,list,list,list) + tuple of four lists (DCGM, PerfAnalyzer, CPU, LLM) metrics """ - gpu_metrics, perf_metrics, cpu_metrics = [], [], [] + gpu_metrics, perf_metrics, cpu_metrics, llm_metrics = [], [], [], [] # Separates metrics and objectives into related lists for metric in MetricsManager.get_metric_types(metric_tags): if metric in PerfAnalyzer.get_gpu_metrics(): gpu_metrics.append(metric) elif metric in PerfAnalyzer.get_perf_metrics(): perf_metrics.append(metric) + elif metric in PerfAnalyzer.get_llm_metrics(): + llm_metrics.append(metric) elif collect_cpu_metrics and (metric in CPUMonitor.cpu_metrics): cpu_metrics.append(metric) - return gpu_metrics, perf_metrics, cpu_metrics + return gpu_metrics, perf_metrics, cpu_metrics, llm_metrics def profile_server(self): """ @@ -304,6 +311,9 @@ def profile_models(self, run_config: RunConfig) -> Optional[RunConfigMeasurement def finalize(self): self._server.stop() + if os.path.exists(DEFAULT_INPUT_JSON_PATH): + shutil.rmtree(DEFAULT_INPUT_JSON_PATH) + def _create_model_variants(self, run_config: RunConfig) -> None: """ Creates and fills all model variant directories @@ -556,6 +566,9 @@ def _run_perf_analyzer( ) metrics_to_gather = self._perf_metrics + self._gpu_metrics + if self._config.is_llm_model(): + metrics_to_gather += self._llm_metrics + status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env) self._write_perf_analyzer_output(perf_output_writer, perf_analyzer) @@ -564,7 +577,12 @@ def _run_perf_analyzer( self._handle_unsuccessful_perf_analyzer_run(perf_analyzer) return (None, None) - perf_records = perf_analyzer.get_perf_records() + # FIXME: PA does not return a latency report file if an export report file is specified + perf_records = ( + perf_analyzer.get_llm_records() + if self._config.is_llm_model() + else perf_analyzer.get_perf_records() + ) gpu_records = perf_analyzer.get_gpu_records() aggregated_perf_records = self._aggregate_perf_records(perf_records) diff --git a/model_analyzer/record/record.py b/model_analyzer/record/record.py index 23aa9e50f..8a55b6a88 100755 --- a/model_analyzer/record/record.py +++ b/model_analyzer/record/record.py @@ -101,7 +101,7 @@ def __init__(self, value, timestamp): Parameters ---------- value : float or int - The value of the GPU metrtic + The value of the GPU metric timestamp : int The timestamp for the record in nanoseconds """ diff --git a/model_analyzer/record/types/avg_first_token_latency.py b/model_analyzer/record/types/avg_first_token_latency.py new file mode 100755 index 000000000..fe862aad6 --- /dev/null +++ b/model_analyzer/record/types/avg_first_token_latency.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.record import DecreasingRecord + + +@total_ordering +class AvgFirstTokenLatency(DecreasingRecord): + """ + A record for perf_analyzer average first token latency metric + """ + + tag = "avg_first_token_latency" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed avg time for first token-to-token latency + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "Avg First Token latency (ms)" + + def __eq__(self, other): + """ + Allows checking for + equality between two records + """ + + return self.value() == other.value() + + def __lt__(self, other): + """ + Allows checking if + this record is less than + the other + """ + + return self.value() > other.value() + + def __add__(self, other): + """ + Allows adding two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() + other.value())) + + def __sub__(self, other): + """ + Allows subbing two records together + to produce a brand new record. + + ** Note this does reverse subtraction because + of the inverted nature of latency (lower is better) + """ + + return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/avg_token_to_token_latency.py b/model_analyzer/record/types/avg_token_to_token_latency.py new file mode 100755 index 000000000..72ccdfe5f --- /dev/null +++ b/model_analyzer/record/types/avg_token_to_token_latency.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.record import DecreasingRecord + + +@total_ordering +class AvgTokenToTokenLatency(DecreasingRecord): + """ + A record for perf_analyzer average token-to-token latency metric + """ + + tag = "avg_token_to_token_latency" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed avg time for token-to-token latency + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "Avg Token-to-Token latency (ms)" + + def __eq__(self, other): + """ + Allows checking for + equality between two records + """ + + return self.value() == other.value() + + def __lt__(self, other): + """ + Allows checking if + this record is less than + the other + """ + + return self.value() > other.value() + + def __add__(self, other): + """ + Allows adding two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() + other.value())) + + def __sub__(self, other): + """ + Allows subbing two records together + to produce a brand new record. + + ** Note this does reverse subtraction because + of the inverted nature of latency (lower is better) + """ + + return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/result/parameter_search.py b/model_analyzer/result/inference_load_search.py similarity index 63% rename from model_analyzer/result/parameter_search.py rename to model_analyzer/result/inference_load_search.py index e716a5b7d..5c7c9598d 100755 --- a/model_analyzer/result/parameter_search.py +++ b/model_analyzer/result/inference_load_search.py @@ -21,7 +21,7 @@ from model_analyzer.config.input.config_command_profile import ConfigCommandProfile from model_analyzer.constants import ( LOGGER_NAME, - THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES, + THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES, THROUGHPUT_MINIMUM_GAIN, ) from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException @@ -30,11 +30,11 @@ logger = logging.getLogger(LOGGER_NAME) -class ParameterSearch: +class InferenceLoadSearch: """ - Generates the next parameter value to use when searching through + Generates the next inference load value to use when searching through RunConfigMeasurements for the best value (according to the users objective) - - Will sweep from by powers of two from min to max parameter + - Will sweep from by powers of two from min to max inference load - If the user specifies a constraint, the algorithm will perform a binary search around the boundary if the constraint is violated @@ -45,43 +45,43 @@ def __init__( self, config: ConfigCommandProfile, model_parameters: dict = {}, - skip_parameter_sweep: bool = False, + skip_inference_load_sweep: bool = False, ) -> None: """ Parameters ---------- config: ConfigCommandProfile Profile configuration information - skip_parameter_sweep: bool - If true, skips the parameter sweep and only does the binary search + skip_inference_load_sweep: bool + If true, skips the inference load sweep and only does the binary search """ - self._skip_parameter_sweep = skip_parameter_sweep - self._parameter_is_request_rate = config.is_request_rate_specified( + self._skip_inference_load_sweep = skip_inference_load_sweep + self._inference_load_is_request_rate = config.is_request_rate_specified( model_parameters ) - if self._parameter_is_request_rate: - self._min_parameter_index = int( + if self._inference_load_is_request_rate: + self._min_inference_load_index = int( log2(config.run_config_search_min_request_rate) ) - self._max_parameter_index = int( + self._max_inference_load_index = int( log2(config.run_config_search_max_request_rate) ) else: - self._min_parameter_index = int( + self._min_inference_load_index = int( log2(config.run_config_search_min_concurrency) ) - self._max_parameter_index = int( + self._max_inference_load_index = int( log2(config.run_config_search_max_concurrency) ) self._max_binary_search_steps = config.run_config_search_max_binary_search_steps self._run_config_measurements: List[Optional[RunConfigMeasurement]] = [] - self._parameters: List[int] = [] - self._last_failing_parameter = 0 - self._last_passing_parameter = 0 + self._inference_loads: List[int] = [] + self._last_failing_inference_load = 0 + self._last_passing_inference_load = 0 def add_run_config_measurement( self, run_config_measurement: Optional[RunConfigMeasurement] @@ -92,30 +92,31 @@ def add_run_config_measurement( """ self._run_config_measurements.append(run_config_measurement) - def search_parameters(self) -> Generator[int, None, None]: + def search_inference_loads(self) -> Generator[int, None, None]: """ - First performs a parameter sweep, and then, if necessary, perform - a binary parameter search around the point where the constraint - violated + First performs an inference load sweep, and then, if necessary, perform + a binary search around the point where the constraint was violated """ - yield from self._perform_parameter_sweep() + yield from self._perform_inference_load_sweep() if self._was_constraint_violated(): - yield from self._perform_binary_parameter_search() + yield from self._perform_binary_search() - def _perform_parameter_sweep(self) -> Generator[int, None, None]: - for parameter in ( + def _perform_inference_load_sweep(self) -> Generator[int, None, None]: + for inference_load in ( 2**i - for i in range(self._min_parameter_index, self._max_parameter_index + 1) + for i in range( + self._min_inference_load_index, self._max_inference_load_index + 1 + ) ): - if self._should_continue_parameter_sweep(): - self._parameters.append(parameter) - yield parameter + if self._should_continue_inference_load_sweep(): + self._inference_loads.append(inference_load) + yield inference_load else: # We can't actually skip the sweep because the results need to be added # but, we can suppress the logging messages - if not self._skip_parameter_sweep: - if self._parameter_is_request_rate: + if not self._skip_inference_load_sweep: + if self._inference_load_is_request_rate: logger.info( "Terminating request rate sweep - throughput is decreasing" ) @@ -125,7 +126,7 @@ def _perform_parameter_sweep(self) -> Generator[int, None, None]: ) return - def _should_continue_parameter_sweep(self) -> bool: + def _should_continue_inference_load_sweep(self) -> bool: self._check_measurement_count() if not self._are_minimum_tries_reached(): @@ -134,16 +135,16 @@ def _should_continue_parameter_sweep(self) -> bool: return not self._has_objective_gain_saturated() def _check_measurement_count(self) -> None: - if len(self._run_config_measurements) != len(self._parameters): + if len(self._run_config_measurements) != len(self._inference_loads): raise TritonModelAnalyzerException( - f"Internal Measurement count: {self._parameters}, doesn't match number " + f"Internal Measurement count: {self._inference_loads}, doesn't match number " f"of measurements added: {len(self._run_config_measurements)}." ) def _are_minimum_tries_reached(self) -> bool: if ( len(self._run_config_measurements) - < THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES + < THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES ): return False else: @@ -155,7 +156,7 @@ def _has_objective_gain_saturated(self) -> bool: def _calculate_gain(self) -> float: first_rcm = self._run_config_measurements[ - -THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES + -THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES ] best_rcm = self._get_best_rcm() @@ -177,7 +178,7 @@ def _get_best_rcm(self) -> Optional[RunConfigMeasurement]: pruned_rcms = [ rcm for rcm in self._run_config_measurements[ - -THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES: + -THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES: ] if rcm ] @@ -188,16 +189,16 @@ def _get_best_rcm(self) -> Optional[RunConfigMeasurement]: def _was_constraint_violated(self) -> bool: for i in range(len(self._run_config_measurements) - 1, 1, -1): if self._at_constraint_failure_boundary(i): - self._last_failing_parameter = self._parameters[i] - self._last_passing_parameter = self._parameters[i - 1] + self._last_failing_inference_load = self._inference_loads[i] + self._last_passing_inference_load = self._inference_loads[i - 1] return True if ( self._run_config_measurements[0] and not self._run_config_measurements[0].is_passing_constraints() ): - self._last_failing_parameter = self._parameters[i] - self._last_passing_parameter = 0 + self._last_failing_inference_load = self._inference_loads[i] + self._last_passing_inference_load = 0 return True else: return False @@ -220,27 +221,31 @@ def _at_constraint_failure_boundary(self, index: int) -> bool: return at_failure_boundary - def _perform_binary_parameter_search(self) -> Generator[int, None, None]: + def _perform_binary_search(self) -> Generator[int, None, None]: # This is needed because we are going to restart the search from the - # parameter that failed - so we expect this to be at the end of the list - self._parameters.append(self._last_failing_parameter) + # inference_load that failed - so we expect this to be at the end of the list + self._inference_loads.append(self._last_failing_inference_load) for i in range(0, self._max_binary_search_steps): - parameter = self._determine_next_binary_parameter() + inference_load = self._determine_next_binary_inference_load() - if parameter != self._parameters[-1]: - self._parameters.append(parameter) - yield parameter + if inference_load != self._inference_loads[-1]: + self._inference_loads.append(inference_load) + yield inference_load - def _determine_next_binary_parameter(self) -> int: + def _determine_next_binary_inference_load(self) -> int: if not self._run_config_measurements[-1]: return 0 if self._run_config_measurements[-1].is_passing_constraints(): - self._last_passing_parameter = self._parameters[-1] - parameter = int((self._last_failing_parameter + self._parameters[-1]) / 2) + self._last_passing_inference_load = self._inference_loads[-1] + inference_load = int( + (self._last_failing_inference_load + self._inference_loads[-1]) / 2 + ) else: - self._last_failing_parameter = self._parameters[-1] - parameter = int((self._last_passing_parameter + self._parameters[-1]) / 2) + self._last_failing_inference_load = self._inference_loads[-1] + inference_load = int( + (self._last_passing_inference_load + self._inference_loads[-1]) / 2 + ) - return parameter + return inference_load diff --git a/model_analyzer/result/result_table_manager.py b/model_analyzer/result/result_table_manager.py index 12a406e7c..83a6b8bd4 100755 --- a/model_analyzer/result/result_table_manager.py +++ b/model_analyzer/result/result_table_manager.py @@ -37,6 +37,10 @@ class ResultTableManager: "model_name": "Model", "batch_size": "Batch", "concurrency": "Concurrency", + "periodic_concurrency": "Periodic Concurrency", + "text_input_length": "Text Input Length", + "max_tokens": "Max Tokens", + "request_period": "Request Period", "request_rate": "Request Rate", "model_config_path": "Model Config Path", "instance_group": "Instance Group", @@ -456,6 +460,10 @@ def _tabulate_measurement( batch_sizes, concurrencies, request_rates, + periodic_concurrencies, + max_tokens, + request_period, + text_input_length, ) = self._tabulate_measurement_setup(run_config_measurement) satisfies = "Yes" if passes else "No" @@ -467,6 +475,10 @@ def _tabulate_measurement( batch_sizes, concurrencies, request_rates, + periodic_concurrencies, + max_tokens, + request_period, + text_input_length, satisfies, model_name, model_config_name, @@ -494,6 +506,10 @@ def _tabulate_measurement( batch_sizes, concurrencies, request_rates, + periodic_concurrencies, + max_tokens, + request_period, + text_input_length, satisfies, model_name, model_config_name, @@ -526,8 +542,37 @@ def _tabulate_measurement_setup(self, run_config_measurement): for pa_params in model_specific_pa_params if "request-rate-range" in pa_params ] + periodic_concurrencies = [ + pa_params["periodic-concurrency-range"] + for pa_params in model_specific_pa_params + if "periodic-concurrency-range" in pa_params + ] + max_tokens = [ + pa_params["max-tokens"] + for pa_params in model_specific_pa_params + if "max-tokens" in pa_params + ] + request_periods = [ + pa_params["request-period"] + for pa_params in model_specific_pa_params + if "request-period" in pa_params + ] + text_input_lengths = [ + pa_params["text-input-length"] + for pa_params in model_specific_pa_params + if "text-input-length" in pa_params + ] - return model_specific_pa_params, batch_sizes, concurrencies, request_rates + return ( + model_specific_pa_params, + batch_sizes, + concurrencies, + request_rates, + periodic_concurrencies, + max_tokens, + request_periods, + text_input_lengths, + ) def _populate_inference_rows( self, run_config_measurement, inference_fields, inference_row @@ -574,6 +619,10 @@ def _get_common_row_items( batch_sizes, concurrencies, request_rates, + periodic_concurrencies, + max_tokens, + request_period, + text_input_length, satisfies, model_name, model_config_path, @@ -604,6 +653,30 @@ def _get_common_row_items( if request_rate_index is not None: row[request_rate_index] = format_for_csv(request_rates) + # Periodic Concurrency + periodic_concurrency_index = self._find_index_for_field( + fields, "periodic_concurrency" + ) + if periodic_concurrency_index is not None: + row[periodic_concurrency_index] = format_for_csv(periodic_concurrencies) + + # Max Tokens + max_tokens_index = self._find_index_for_field(fields, "max_tokens") + if max_tokens_index is not None: + row[max_tokens_index] = format_for_csv(max_tokens) + + # Request Period + request_period_index = self._find_index_for_field(fields, "request_period") + if request_period_index is not None: + row[request_period_index] = format_for_csv(request_period) + + # Text Input Length + text_input_length_index = self._find_index_for_field( + fields, "text_input_length" + ) + if text_input_length_index is not None: + row[text_input_length_index] = format_for_csv(text_input_length) + # Satisfies satisfies_constraints_index = self._find_index_for_field( fields, "satisfies_constraints" diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index 9d418027f..64c7525ef 100755 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -23,9 +23,15 @@ from model_analyzer.config.input.config_defaults import ( DEFAULT_BATCH_SIZES, DEFAULT_CLIENT_PROTOCOL, + DEFAULT_INPUT_JSON_PATH, DEFAULT_MEASUREMENT_MODE, DEFAULT_MONITORING_INTERVAL, DEFAULT_OUTPUT_MODEL_REPOSITORY, + DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, + DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, + DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD, + DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH, + DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY, DEFAULT_TRITON_GRPC_ENDPOINT, DEFAULT_TRITON_HTTP_ENDPOINT, DEFAULT_TRITON_INSTALL_PATH, @@ -234,12 +240,18 @@ def convert_avg_gpu_metrics_to_data(avg_gpu_metric_values): def construct_perf_analyzer_config( model_name="my-model", output_file_name="my-model-results.csv", + export_file_name="my-model-results.json", batch_size=DEFAULT_BATCH_SIZES, - concurrency=1, + concurrency=DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, + periodic_concurrency=DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY, request_rate=None, + max_token_count=DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT, + text_input_length=DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH, + request_period=DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD, launch_mode=DEFAULT_TRITON_LAUNCH_MODE, client_protocol=DEFAULT_CLIENT_PROTOCOL, perf_analyzer_flags=None, + llm_search_mode=False, ): """ Constructs a Perf Analyzer Config @@ -250,10 +262,20 @@ def construct_perf_analyzer_config( The name of the model output_file_name: str The name of the output file + export_file_name: str + The name of the export file batch_size: int The batch size for this PA configuration concurrency: int The concurrency value for this PA configuration + periodic_concurrency: list + The periodic concurrency value for this PA configuration + max_token_count: int + The max token count for this PA configuration + text_input_length: int + The text input length for this PA configuration + request_period: int + The request period for this PA configuration request_rate: int The request rate value for this PA configuration launch_mode: str @@ -262,6 +284,8 @@ def construct_perf_analyzer_config( The client protocol for this PA configuration perf_analyzer_flags: dict A dict of any additional PA flags to be set + llm_search_mode: bool + Indicates we should use LLM search parameters Returns ------- @@ -274,11 +298,26 @@ def construct_perf_analyzer_config( pa_config._options["-f"] = output_file_name pa_config._options["-b"] = batch_size + if llm_search_mode: + pa_config._options["--profile-export-file"] = export_file_name + if request_rate: pa_config._args["request-rate-range"] = request_rate + elif llm_search_mode: + pa_config._args["periodic-concurrency-range"] = periodic_concurrency else: pa_config._args["concurrency-range"] = concurrency + if llm_search_mode: + pa_config._args["request-parameter"] = f"max_tokens:{str(max_token_count)}:int" + + pa_config._args["request-period"] = request_period + pa_config._args[ + "input-data" + ] = f"{DEFAULT_INPUT_JSON_PATH}/input-data-{str(text_input_length)}.json" + + pa_config._args["streaming"] = "True" + pa_config._args["measurement-mode"] = DEFAULT_MEASUREMENT_MODE pa_config.update_config(perf_analyzer_flags) diff --git a/tests/test_cli.py b/tests/test_cli.py index 98ec60237..1a2fb84a2 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -30,10 +30,10 @@ import psutil +import model_analyzer.config.input.config_defaults as config_defaults from model_analyzer.cli.cli import CLI from model_analyzer.config.input.config_command_profile import ConfigCommandProfile from model_analyzer.config.input.config_command_report import ConfigCommandReport -from model_analyzer.config.input.config_defaults import DEFAULT_TRITON_DOCKER_IMAGE from model_analyzer.config.input.config_status import ConfigStatus from model_analyzer.constants import CONFIG_PARSER_SUCCESS from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException @@ -60,6 +60,7 @@ def get_test_options(): OptionStruct("bool", "profile","--run-config-search-disable"), OptionStruct("bool", "profile","--run-config-profile-models-concurrently-enable"), OptionStruct("bool", "profile","--request-rate-search-enable"), + OptionStruct("bool", "profile","--llm-search-enable"), OptionStruct("bool", "profile","--reload-model-disable"), OptionStruct("bool", "profile","--early-exit-enable"), OptionStruct("bool", "profile","--skip-summary-reports"), @@ -71,23 +72,33 @@ def get_test_options(): # The following options can be None: # short_option # expected_default_value - OptionStruct("int", "profile", "--client-max-retries", "-r", "125", "50"), - OptionStruct("int", "profile", "--duration-seconds", "-d", "10", "3"), - OptionStruct("int", "profile", "--perf-analyzer-timeout", None, "100", "600"), - OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", "10"), - OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", "1"), - OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", "1024"), - OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", "16"), - OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", "8192"), - OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", "1"), - OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", "128"), - OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", "1"), - OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", "5"), - OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", "5"), - OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", "1.0"), - OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * 80.0)), - OptionStruct("int", "profile", "--num-configs-per-model", None, "10", "3"), - OptionStruct("int", "profile", "--num-top-model-configs", None, "10", "0"), + OptionStruct("int", "profile", "--client-max-retries", "-r", "125", str(config_defaults.DEFAULT_MAX_RETRIES)), + OptionStruct("int", "profile", "--duration-seconds", "-d", "10", str(config_defaults.DEFAULT_DURATION_SECONDS)), + OptionStruct("int", "profile", "--perf-analyzer-timeout", None, "100", str(config_defaults.DEFAULT_PERF_ANALYZER_TIMEOUT)), + OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", str(config_defaults.DEFAULT_PERF_MAX_AUTO_ADJUSTS)), + OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_CONCURRENCY)), + OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_CONCURRENCY)), + OptionStruct("int", "profile", "--run-config-search-min-periodic-concurrency", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY)), + OptionStruct("int", "profile", "--run-config-search-max-periodic-concurrency", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY)), + OptionStruct("int", "profile", "--run-config-search-min-periodic-concurrency-step", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_PERIODIC_CONCURRENCY_STEP)), + OptionStruct("int", "profile", "--run-config-search-max-periodic-concurrency-step", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_PERIODIC_CONCURRENCY_STEP)), + OptionStruct("int", "profile", "--run-config-search-min-request-period", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD)), + OptionStruct("int", "profile", "--run-config-search-max-request-period", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_PERIOD)), + OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE)), + OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE)), + OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE)), + OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE)), + OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT)), + OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT)), + OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS)), + OptionStruct("int", "profile", "--run-config-search-min-text-input-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH)), + OptionStruct("int", "profile", "--run-config-search-max-text-input-length", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_TEXT_INPUT_LENGTH)), + OptionStruct("int", "profile", "--run-config-search-min-max-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT)), + OptionStruct("int", "profile", "--run-config-search-max-max-token-count", None, "10", str(config_defaults.DEFAULT_RUN_CONFIG_MAX_MAX_TOKEN_COUNT)), + OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", str(config_defaults.DEFAULT_MONITORING_INTERVAL)), + OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * config_defaults.DEFAULT_PERF_ANALYZER_CPU_UTIL)), + OptionStruct("int", "profile", "--num-configs-per-model", None, "10", str(config_defaults.DEFAULT_NUM_CONFIGS_PER_MODEL)), + OptionStruct("int", "profile", "--num-top-model-configs", None, "10", str(config_defaults.DEFAULT_NUM_TOP_MODEL_CONFIGS)), OptionStruct("int", "profile", "--latency-budget", None, "200", None), OptionStruct("int", "profile", "--min-throughput", None, "300", None), @@ -105,7 +116,7 @@ def get_test_options(): OptionStruct("string", "profile", "--client-protocol", None, ["http", "grpc"], "grpc", "SHOULD_FAIL"), OptionStruct("string", "profile", "--perf-analyzer-path", None, ".", "perf_analyzer", None), OptionStruct("string", "profile", "--perf-output-path", None, ".", None, None), - OptionStruct("string", "profile", "--triton-docker-image", None, "test_image", DEFAULT_TRITON_DOCKER_IMAGE, None), + OptionStruct("string", "profile", "--triton-docker-image", None, "test_image", config_defaults.DEFAULT_TRITON_DOCKER_IMAGE, None), OptionStruct("string", "profile", "--triton-http-endpoint", None, "localhost:4000", "localhost:8000", None), OptionStruct("string", "profile", "--triton-grpc-endpoint", None, "localhost:4001", "localhost:8001", None), OptionStruct("string", "profile", "--triton-metrics-url", None, "localhost:4002", "http://localhost:8002/metrics", None), @@ -134,7 +145,11 @@ def get_test_options(): # expected_default_value OptionStruct("intlist", "profile", "--batch-sizes", "-b", "2, 4, 6", "1"), OptionStruct("intlist", "profile", "--concurrency", "-c", "1, 2, 3", None), + OptionStruct("stringlist", "profile", "--periodic-concurrency", None, '"5:50:5", "10:100:10"', None, None), OptionStruct("intlist", "profile", "--request-rate", None, "1, 2, 3", None), + OptionStruct("intlist", "profile", "--request-period", None, "1, 2, 3", None), + OptionStruct("intlist", "profile", "--text-input-length", None, "1, 2, 3", None), + OptionStruct("intlist", "profile", "--max-token-count", None, "1, 2, 3", None), OptionStruct("stringlist", "profile", "--triton-docker-mounts", None, "a:b:c, d:e:f", None, extra_commands=["--triton-launch-mode", "docker"]), OptionStruct("stringlist", "profile", "--gpus", None, "a, b, c", "all"), OptionStruct("stringlist", "profile", "--inference-output-fields", None, "a, b, c", @@ -588,9 +603,15 @@ def _convert_string_to_numeric(self, number): return float(number) if "." in number else int(number) def _convert_string_to_int_list(self, list_values): - ret_val = [int(x) for x in list_values.split(",")] + if ":" in list_values: + ret_val = [int(x) for x in list_values.split(":")] + ret_val = list(range(ret_val[0], ret_val[1] + 1, ret_val[2])) + else: + ret_val = [int(x) for x in list_values.split(",")] + if len(ret_val) == 1: return ret_val[0] + return ret_val def _convert_string_to_string_list(self, list_values): diff --git a/tests/test_config.py b/tests/test_config.py index ca9835cec..2dd371787 100755 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -47,6 +47,26 @@ class TestConfig(trc.TestResultCollector): + def _create_parameters( + self, + batch_sizes: List = [], + concurrency: List = [], + periodic_concurrency: List = [], + request_rate: List = [], + request_period: List = [], + text_input_length: List = [], + max_token_count: List = [], + ) -> Dict: + return { + "batch_sizes": batch_sizes, + "concurrency": concurrency, + "periodic_concurrency": periodic_concurrency, + "request_rate": request_rate, + "request_period": request_period, + "text_input_length": text_input_length, + "max_token_count": max_token_count, + } + def _evaluate_config(self, args, yaml_content, subcommand="profile"): mock_numba = MockNumba( mock_paths=["model_analyzer.config.input.config_command_profile"] @@ -288,12 +308,12 @@ def test_range_and_list_values(self): expected_model_configs = [ ConfigModelProfileSpec( "model_1", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "model_2", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, ), ] @@ -426,16 +446,14 @@ def test_object(self): expected_model_objects = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [1, 2, 3, 4], - "request_rate": [], - }, + parameters=self._create_parameters( + batch_sizes=[1], concurrency=[1, 2, 3, 4] + ), objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, ), ] @@ -485,20 +503,16 @@ def test_object(self): expected_model_objects = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [1, 2, 3, 4], - "request_rate": [], - }, + parameters=self._create_parameters( + batch_sizes=[1], concurrency=[1, 2, 3, 4] + ), objectives={"perf_throughput": 10}, ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={ - "concurrency": [1, 2, 3, 4], - "batch_sizes": [2, 4, 6], - "request_rate": [], - }, + parameters=self._create_parameters( + batch_sizes=[2, 4, 6], concurrency=[1, 2, 3, 4] + ), objectives={"perf_throughput": 10}, ), ] @@ -565,11 +579,9 @@ def test_constraints(self): expected_model_objects = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [1], - "concurrency": [1, 2, 3, 4], - "request_rate": [], - }, + parameters=self._create_parameters( + batch_sizes=[1], concurrency=[1, 2, 3, 4] + ), objectives={"perf_throughput": 10, "gpu_used_memory": 5}, constraints={ "gpu_used_memory": { @@ -579,7 +591,7 @@ def test_constraints(self): ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, ), ] @@ -697,7 +709,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -722,7 +734,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -758,7 +770,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [ @@ -801,7 +813,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [ @@ -831,7 +843,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "input": [ @@ -874,7 +886,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, perf_analyzer_flags={ "measurement-interval": 10000, @@ -900,7 +912,7 @@ def test_config_model(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, perf_analyzer_flags={ "measurement-interval": 10000, @@ -1171,7 +1183,7 @@ def test_autofill(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={"batch_sizes": [1], "concurrency": [], "request_rate": []}, + parameters=self._create_parameters(batch_sizes=[1]), objectives={"perf_throughput": 10}, model_config_parameters={ "instance_group": [[{"kind": ["KIND_GPU"], "count": [1]}]] @@ -1211,11 +1223,9 @@ def test_autofill(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [16, 32], - "concurrency": [2, 4], - "request_rate": [], - }, + parameters=self._create_parameters( + batch_sizes=[16, 32], concurrency=[2, 4] + ), objectives={"perf_throughput": 10, "gpu_used_memory": 5}, constraints={ "gpu_used_memory": { @@ -1259,11 +1269,9 @@ def test_autofill(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [16, 32], - "concurrency": [2, 4], - "request_rate": [], - }, + parameters=self._create_parameters( + batch_sizes=[16, 32], concurrency=[2, 4] + ), objectives={"gpu_used_memory": 10}, constraints={"perf_latency_p99": {"max": 8000}}, model_config_parameters={ @@ -1303,11 +1311,9 @@ def test_autofill(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [16, 32], - "concurrency": [2, 4], - "request_rate": [], - }, + parameters=self._create_parameters( + batch_sizes=[16, 32], concurrency=[2, 4] + ), objectives={"gpu_used_memory": 10}, constraints={"perf_latency_p99": {"max": 8000}}, model_config_parameters={ @@ -1358,11 +1364,9 @@ def test_autofill(self): expected_model_configs = [ ConfigModelProfileSpec( "vgg_16_graphdef", - parameters={ - "batch_sizes": [16, 32], - "concurrency": [5, 6, 7], - "request_rate": [], - }, + parameters=self._create_parameters( + batch_sizes=[16, 32], concurrency=[5, 6, 7] + ), objectives={"gpu_used_memory": 10}, constraints={ "perf_latency_p99": {"max": 8000}, @@ -1371,11 +1375,9 @@ def test_autofill(self): ), ConfigModelProfileSpec( "vgg_19_graphdef", - parameters={ - "batch_sizes": [1, 2], - "concurrency": [2, 4], - "request_rate": [], - }, + parameters=self._create_parameters( + batch_sizes=[1, 2], concurrency=[2, 4] + ), objectives={"perf_throughput": 10, "perf_latency_p99": 5}, constraints={"perf_latency_p99": {"max": 8000}}, ), @@ -2338,6 +2340,80 @@ def _test_arg_conflict( with self.assertRaises(TritonModelAnalyzerException): self._evaluate_config(args, yaml_content) + def test_llm_mode_rcs(self): + """ + Test RCS options for an LLM model + """ + yaml_content = "" + + self._test_llm_mode_case( + yaml_content, + ["--run-config-search-mode", "brute"], + is_legal=True, + use_value=False, + use_list=False, + ) + self._test_llm_mode_case( + yaml_content, + ["--run-config-search-mode", "quick"], + use_value=False, + use_list=False, + ) + + self._test_llm_mode_case( + yaml_content, ["--run-config-search-min-model-batch-size"] + ) + self._test_llm_mode_case( + yaml_content, ["--run-config-search-max-model-batch-size"] + ) + self._test_llm_mode_case(yaml_content, ["--run-config-search-min-concurrency"]) + self._test_llm_mode_case(yaml_content, ["--run-config-search-max-concurrency"]) + self._test_llm_mode_case(yaml_content, ["--run-config-search-min-request-rate"]) + self._test_llm_mode_case(yaml_content, ["--run-config-search-max-request-rate"]) + self._test_llm_mode_case( + yaml_content, + ["--request-rate-search-enable"], + use_value=False, + use_list=False, + ) + self._test_llm_mode_case(yaml_content, ["--concurrency"]) + self._test_llm_mode_case(yaml_content, ["--latency-budget"]) + self._test_llm_mode_case(yaml_content, ["--min-throughput"]) + + def _test_llm_mode_case( + self, + yaml_content: Optional[Dict[str, List]], + options_string: str, + is_legal: bool = False, + use_value: bool = True, + use_list: bool = True, + ) -> None: + """ + Tests that options raise exceptions in LLM mode + """ + args = [ + "model-analyzer", + "profile", + "--model-repository", + "cli-repository", + "--profile-models", + "test_llm_modelA", + "--llm-search-enable", + ] + + args.extend(options_string) + + if use_value: + args.append("1") + elif use_list: + args.append(["1", "2", "4"]) + + if is_legal: + self._evaluate_config(args, yaml_content, subcommand="profile") + else: + with self.assertRaises(TritonModelAnalyzerException): + self._evaluate_config(args, yaml_content, subcommand="profile") + if __name__ == "__main__": unittest.main() diff --git a/tests/test_parameter_search.py b/tests/test_inference_load_search.py similarity index 92% rename from tests/test_parameter_search.py rename to tests/test_inference_load_search.py index 7f410bb26..d8643ad66 100755 --- a/tests/test_parameter_search.py +++ b/tests/test_inference_load_search.py @@ -25,17 +25,17 @@ DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, ) -from model_analyzer.constants import THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES +from model_analyzer.constants import THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException from model_analyzer.result.constraint_manager import ConstraintManager -from model_analyzer.result.parameter_search import ParameterSearch +from model_analyzer.result.inference_load_search import InferenceLoadSearch from model_analyzer.result.run_config_measurement import RunConfigMeasurement from .common import test_result_collector as trc from .common.test_utils import construct_run_config_measurement, evaluate_mock_config -class TestParameterSearch(trc.TestResultCollector): +class TestInferenceLoadSearch(trc.TestResultCollector): def setUp(self): self._min_concurrency_index = int(log2(DEFAULT_RUN_CONFIG_MIN_CONCURRENCY)) self._max_concurrency_index = int(log2(DEFAULT_RUN_CONFIG_MAX_CONCURRENCY)) @@ -67,9 +67,9 @@ def test_concurrency_sweep(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) - for concurrency in concurrency_search.search_parameters(): + for concurrency in concurrency_search.search_inference_loads(): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -90,11 +90,11 @@ def test_request_rate_sweep(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch( + concurrency_search = InferenceLoadSearch( config, model_parameters={"request_rate": "True"} ) - for request_rate in concurrency_search.search_parameters(): + for request_rate in concurrency_search.search_inference_loads(): self._request_rates.append(request_rate) concurrency_search.add_run_config_measurement( @@ -115,7 +115,7 @@ def test_saturating_sweep(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) INCREASE_THROUGHPUT_COUNT = 4 # [100, 200, 400, 800, 1000, 1000,...] @@ -124,7 +124,7 @@ def test_saturating_sweep(self): for c in range(self._min_concurrency_index, self._max_concurrency_index + 1) ] - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -140,7 +140,7 @@ def test_saturating_sweep(self): 2**c for c in range( INCREASE_THROUGHPUT_COUNT - + THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES + + THROUGHPUT_MINIMUM_CONSECUTIVE_INFERENCE_LOAD_TRIES ) ] self.assertEqual(self._concurrencies, expected_concurrencies) @@ -152,12 +152,12 @@ def test_sweep_with_constraints_decreasing(self): """ config = self._create_single_model_with_constraints("95") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([12, 10, 9]) latencies = [10 * c for c in self._expected_concurrencies] - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -178,12 +178,12 @@ def test_sweep_with_constraints_decrease_then_increase(self): """ config = self._create_single_model_with_constraints("155") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([12, 14, 15]) latencies = [10 * c for c in self._expected_concurrencies] - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -204,14 +204,14 @@ def test_sweep_with_multiple_violation_areas(self): """ config = self._create_single_model_with_constraints("155") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([12, 14, 15]) latencies = [10 * c for c in self._expected_concurrencies] # this adds an early constraint violation which should be ignored latencies[1] = 200 - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -233,12 +233,12 @@ def test_sweep_with_constraints_hitting_limit(self): """ config = self._create_single_model_with_constraints("970") constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) self._expected_concurrencies.extend([768, 896, 960, 992, 976]) latencies = self._expected_concurrencies - for i, concurrency in enumerate(concurrency_search.search_parameters()): + for i, concurrency in enumerate(concurrency_search.search_inference_loads()): self._concurrencies.append(concurrency) concurrency_search.add_run_config_measurement( @@ -258,10 +258,10 @@ def test_not_adding_measurements(self): """ config = self._create_single_model_no_constraints() constraint_manager = ConstraintManager(config) - concurrency_search = ParameterSearch(config) + concurrency_search = InferenceLoadSearch(config) with self.assertRaises(TritonModelAnalyzerException): - for concurrency in concurrency_search.search_parameters(): + for concurrency in concurrency_search.search_inference_loads(): self._concurrencies.append(concurrency) if concurrency < 32: diff --git a/tests/test_model_manager.py b/tests/test_model_manager.py index 0370c6c77..fcbd8eee8 100755 --- a/tests/test_model_manager.py +++ b/tests/test_model_manager.py @@ -1294,7 +1294,9 @@ def _test_model_manager(self, yaml_content, expected_ranges, args=None): MagicMock(), ) - model_manager.run_models([config.profile_models[0]]) + with patch("shutil.rmtree"): + model_manager.run_models([config.profile_models[0]]) + self.mock_model_config.stop() self._check_results(model_manager, expected_ranges) diff --git a/tests/test_perf_analyzer.py b/tests/test_perf_analyzer.py index e95f0d4a1..a984279bd 100755 --- a/tests/test_perf_analyzer.py +++ b/tests/test_perf_analyzer.py @@ -49,6 +49,7 @@ from model_analyzer.triton.client.client_factory import TritonClientFactory from model_analyzer.triton.server.server_config import TritonServerConfig from model_analyzer.triton.server.server_factory import TritonServerFactory +from tests.common.test_utils import construct_perf_analyzer_config from .common import test_result_collector as trc from .mocks.mock_client import MockTritonClientMethods @@ -67,7 +68,56 @@ TEST_GRPC_URL = "test_hostname:test_port" -class TestPerfAnalyzerMethods(trc.TestResultCollector): +def mock_open_method(*args, **kwargs): + pa_csv_mock = """Concurrency,Inferences/Second,Client Send,Network+Server Send/Recv,Server Queue,Server Compute Input,Server Compute Infer,Server Compute Output,""" + pa_csv_mock += """Client Recv,p50 latency,p90 latency,p95 latency,p99 latency,Avg latency,request/response,response wait,""" + pa_csv_mock += """Avg GPU Utilization,Avg GPU Power Usage,Max GPU Memory Usage,Total GPU Memory\n""" + pa_csv_mock += """1,46.8,2,187,18,34,65,16,1,4600,4700,4800,4900,5000,3,314,""" + pa_csv_mock += """GPU-aaf4fea0:0.809;GPU-aaf4fea1:0.901;GPU-aaf4fea2:0.745;,GPU-aaf4fea0:91.2;GPU-aaf4fea1:100;,GPU-aaf4fea0:1000000000;GPU-aaf4fea1:2000000000,GPU-aaf4fea0:1500000000;GPU-aaf4fea2:3000000000""" + + # yapf: disable + pa_json_mock = """ + { + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 4 + }, + "requests": [ + { + "timestamp": 1, + "sequence_id": 1, + "response_timestamps": [2,3,4] + }, + { + "timestamp": 4, + "sequence_id": 2, + "response_timestamps": [5,6] + }, + { + "timestamp": 6, + "sequence_id": 3, + "response_timestamps": [7,8,9] + } + ], + "window_boundaries": [1,5,6] + } + ], + "version": "1.2.3" + } + """ + # yapf: enable + + if args[0] == "my-model-results.csv": + return mock_open(read_data=pa_csv_mock)(*args, **kwargs) + elif args[0] == "my-model-llm-results.csv": + return mock_open(read_data=pa_json_mock)(*args, **kwargs) + else: + return mock_open(read_data=None)(*args, **kwargs) + + +class TestPerfAnalyzer(trc.TestResultCollector): def setUp(self): # Mocks self.server_local_mock = MockServerLocalMethods() @@ -80,7 +130,7 @@ def setUp(self): self.client_mock.start() # PerfAnalyzer config for all tests - self.config = PerfAnalyzerConfig() + self.config = construct_perf_analyzer_config() self.config["model-name"] = TEST_MODEL_NAME self.config["measurement-interval"] = 1000 self.config["measurement-request-count"] = 50 @@ -90,6 +140,16 @@ def setUp(self): ModelRunConfig("fake_name", MagicMock(), self.config) ) + self.llm_config = construct_perf_analyzer_config(llm_search_mode=True) + self.llm_config["model-name"] = TEST_MODEL_NAME + self.llm_config["measurement-interval"] = 1000 + self.llm_config["measurement-request-count"] = 50 + + self.llm_run_config = RunConfig({}) + self.llm_run_config.add_model_run_config( + ModelRunConfig("fake_name", MagicMock(), self.llm_config) + ) + self.gpus = [GPUDevice("TEST_DEVICE_NAME", 0, "TEST_PCI_BUS_ID", "TEST_UUID")] # Triton Server @@ -132,7 +192,7 @@ def test_perf_analyzer_config(self): def test_perf_analyzer_boolean_args(self): """Test that only positive boolean args get added""" - expected_cli_str = "-m test_model --measurement-interval=1000 --binary-search --measurement-request-count=50" + expected_cli_str = "-m test_model -b 1 -u localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 --concurrency-range=1 --binary-search --measurement-mode=count_windows --measurement-request-count=50 --collect-metrics --metrics-url=http://localhost:8002/metrics --metrics-interval=1000.0" self.config["async"] = "False" self.config["binary-search"] = "True" @@ -141,7 +201,7 @@ def test_perf_analyzer_boolean_args(self): def test_perf_analyzer_additive_args(self): shape = ["name1:1,2,3", "name2:4,5,6"] - expected_cli_str = "-m test_model --measurement-interval=1000 --shape=name1:1,2,3 --shape=name2:4,5,6 --measurement-request-count=50" + expected_cli_str = "-m test_model -b 1 -u localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 --concurrency-range=1 --shape=name1:1,2,3 --shape=name2:4,5,6 --measurement-mode=count_windows --measurement-request-count=50 --collect-metrics --metrics-url=http://localhost:8002/metrics --metrics-interval=1000.0" self.config["shape"] = shape[:] @@ -149,7 +209,7 @@ def test_perf_analyzer_additive_args(self): self.assertEqual(self.config.to_cli_string(), expected_cli_str) shape = "name1:1,2,3" - expected_cli_str = "-m test_model --measurement-interval=1000 --shape=name1:1,2,3 --measurement-request-count=50" + expected_cli_str = "-m test_model -b 1 -u localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 --concurrency-range=1 --shape=name1:1,2,3 --measurement-mode=count_windows --measurement-request-count=50 --collect-metrics --metrics-url=http://localhost:8002/metrics --metrics-interval=1000.0" self.config["shape"] = shape self.assertEqual(self.config.to_cli_string(), expected_cli_str) @@ -177,10 +237,13 @@ def test_perf_analyzer_ssl_args(self): ssl_https_private_key_file = "h" expected_cli_str = ( - f"-m test_model --measurement-interval=1000 --measurement-request-count=50 --ssl-grpc-use-ssl " + f"-m test_model -b 1 -u localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 " + f"--concurrency-range=1 --measurement-mode=count_windows --measurement-request-count=50 --ssl-grpc-use-ssl " f"--ssl-grpc-root-certifications-file=a --ssl-grpc-private-key-file=b --ssl-grpc-certificate-chain-file=c " - f"--ssl-https-verify-peer=1 --ssl-https-verify-host=2 --ssl-https-ca-certificates-file=d --ssl-https-client-certificate-type=e " - f"--ssl-https-client-certificate-file=f --ssl-https-private-key-type=g --ssl-https-private-key-file=h" + f"--ssl-https-verify-peer=1 --ssl-https-verify-host=2 --ssl-https-ca-certificates-file=d " + f"--ssl-https-client-certificate-type=e --ssl-https-client-certificate-file=f --ssl-https-private-key-type=g " + f"--ssl-https-private-key-file=h --collect-metrics --metrics-url=http://localhost:8002/metrics " + f"--metrics-interval=1000.0" ) self.config["ssl-grpc-use-ssl"] = ssl_grpc_use_ssl @@ -241,11 +304,15 @@ def test_perf_analyzer_ssl_args(self): self.config["ssl-grpc-use-ssl"] = ssl_grpc_use_ssl self.assertEqual(self.config["ssl-grpc-use-ssl"], ssl_grpc_use_ssl) expected_cli_str = ( - f"-m test_model --measurement-interval=1000 --measurement-request-count=50 " + f"-m test_model -b 1 -u localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 " + f"--concurrency-range=1 --measurement-mode=count_windows --measurement-request-count=50 " f"--ssl-grpc-root-certifications-file=a --ssl-grpc-private-key-file=b --ssl-grpc-certificate-chain-file=c " - f"--ssl-https-verify-peer=1 --ssl-https-verify-host=2 --ssl-https-ca-certificates-file=d --ssl-https-client-certificate-type=e " - f"--ssl-https-client-certificate-file=f --ssl-https-private-key-type=g --ssl-https-private-key-file=h" + f"--ssl-https-verify-peer=1 --ssl-https-verify-host=2 --ssl-https-ca-certificates-file=d " + f"--ssl-https-client-certificate-type=e --ssl-https-client-certificate-file=f " + f"--ssl-https-private-key-type=g --ssl-https-private-key-file=h --collect-metrics " + f"--metrics-url=http://localhost:8002/metrics --metrics-interval=1000.0" ) + self.assertEqual(self.config.to_cli_string(), expected_cli_str) def test_run(self): @@ -268,18 +335,12 @@ def test_run(self): self.server.start() self.client.wait_for_server_ready(num_retries=1) - pa_csv_mock = """Concurrency,Inferences/Second,Client Send,Network+Server Send/Recv,Server Queue,Server Compute Input,Server Compute Infer,Server Compute Output,""" - pa_csv_mock += """Client Recv,p50 latency,p90 latency,p95 latency,p99 latency,Avg latency,request/response,response wait,""" - pa_csv_mock += """Avg GPU Utilization,Avg GPU Power Usage,Max GPU Memory Usage,Total GPU Memory\n""" - pa_csv_mock += """1,46.8,2,187,18,34,65,16,1,4600,4700,4800,4900,5000,3,314,""" - pa_csv_mock += """GPU-aaf4fea0:0.809;GPU-aaf4fea1:0.901;GPU-aaf4fea2:0.745;,GPU-aaf4fea0:91.2;GPU-aaf4fea1:100;,GPU-aaf4fea0:1000000000;GPU-aaf4fea1:2000000000,GPU-aaf4fea0:1500000000;GPU-aaf4fea2:3000000000""" - # Test avg latency parsing. GPU metric is ignored for get_perf_records() perf_metrics = [PerfLatencyAvg, GPUUtilization] with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -292,7 +353,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -305,7 +366,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -318,7 +379,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -331,7 +392,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -344,7 +405,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -357,7 +418,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -370,7 +431,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -383,7 +444,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -396,7 +457,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -409,7 +470,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(gpu_metrics) @@ -427,7 +488,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(gpu_metrics) @@ -443,7 +504,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(gpu_metrics) @@ -462,7 +523,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(gpu_metrics) @@ -487,7 +548,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -651,10 +712,27 @@ def test_get_cmd_single_model(self): "perf_analyzer", "-m", "test_model", + "-b", + "1", + "-u", + "localhost:8001", + "-i", + "grpc", + "-f", + "my-model-results.csv", "--measurement-interval", "1000", + "--concurrency-range", + "1", + "--measurement-mode", + "count_windows", "--measurement-request-count", "50", + "--collect-metrics", + "--metrics-url", + "http://localhost:8002/metrics", + "--metrics-interval", + "1000.0", ] self.assertEqual(pa._get_cmd(), expected_cmd) diff --git a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py index e9852356e..3a6f6795d 100755 --- a/tests/test_perf_analyzer_config_generator.py +++ b/tests/test_perf_analyzer_config_generator.py @@ -15,7 +15,7 @@ # limitations under the License. import unittest -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, mock_open, patch from model_analyzer.config.generate.generator_utils import GeneratorUtils as utils from model_analyzer.config.generate.perf_analyzer_config_generator import ( @@ -41,7 +41,11 @@ def __init__(self, methodname): super().__init__(methodname) self._perf_throughput = 1 - def test_set_last_results(self): + @patch( + "model_analyzer.config.input.config_command_profile.ConfigCommandProfile.is_llm_model", + return_value=False, + ) + def test_set_last_results(self, *args): """ Test set_last_results() with multi model @@ -60,8 +64,26 @@ def test_set_last_results(self): ["modelA", "modelB"], [{"perf_throughput": 10}, {"perf_throughput": 2}] ) + args = [ + "model-analyzer", + "profile", + "--model-repository", + "cli_repository", + "-f", + "path-to-config-file", + ] + + # yapf: disable + yaml_str = (""" + profile_models: + - my-model + """) + # yapf: enable + + config = evaluate_mock_config(args, yaml_str, subcommand="profile") + pacg = PerfAnalyzerConfigGenerator( - MagicMock(), MagicMock(), MagicMock(), MagicMock(), early_exit_enable=False + config, MagicMock(), MagicMock(), MagicMock(), early_exit_enable=False ) pacg.set_last_results([measurement1, measurement2, measurement3]) @@ -537,6 +559,184 @@ def test_perf_analyzer_flags(self): self._run_and_test_perf_analyzer_config_generator(yaml_str, expected_configs) + def test_llm_search_max_token_count(self): + """ + Test LLM Search: + - max token count 1->256 + + Concurrency and text input length max set to 1 + """ + + # yapf: disable + yaml_str = (""" + perf_analyzer_flags: + input-data: input-data.json + profile_models: + - my-model + """) + # yapf: enable + + max_token_counts = utils.generate_doubled_list(1, 256) + periodic_concurrencies = ["16:32:4", "16:32:8", "16:32:16"] + + expected_configs = [] + for mtc in max_token_counts: + for pc in periodic_concurrencies: + expected_configs.append( + construct_perf_analyzer_config( + max_token_count=mtc, + llm_search_mode=True, + periodic_concurrency=pc, + ) + ) + + pa_cli_args = [ + "--llm-search-enable", + "--run-config-search-max-periodic-concurrency", + "32", + "--run-config-search-max-text-input-length", + "1", + "--run-config-search-max-request-period", + "1", + ] + self._run_and_test_perf_analyzer_config_generator( + yaml_str, expected_configs, pa_cli_args + ) + + def test_llm_search_text_input_length(self): + """ + Test LLM Search: + - Input length 1->1024 + + Periodic Concurrency and max token count set to 1 + """ + + # yapf: disable + yaml_str = (""" + perf_analyzer_flags: + input-data: input-data.json + profile_models: + - my-model + """) + # yapf: enable + + text_input_lengths = utils.generate_doubled_list(1, 1024) + periodic_concurrencies = ["16:32:4", "16:32:8", "16:32:16"] + + expected_configs = [] + for til in text_input_lengths: + for pc in periodic_concurrencies: + expected_configs.append( + construct_perf_analyzer_config( + llm_search_mode=True, + periodic_concurrency=pc, + text_input_length=til, + ) + ) + + pa_cli_args = [ + "--llm-search-enable", + "--run-config-search-max-periodic-concurrency", + "32", + "--run-config-search-max-max-token-count", + "1", + "--run-config-search-max-request-period", + "1", + ] + self._run_and_test_perf_analyzer_config_generator( + yaml_str, expected_configs, pa_cli_args + ) + + def test_periodic_concurrency_parameter(self): + """ + Test LLM Search: + - periodic-concurrency: 10:100:10 + + Max token set to 1 + Text input set to 1 + """ + + # yapf: disable + yaml_str = (""" + perf_analyzer_flags: + input-data: input-data.json + profile_models: + - my-model + """) + # yapf: enable + + expected_configs = [ + construct_perf_analyzer_config( + llm_search_mode=True, periodic_concurrency="10:100:10" + ) + ] + + pa_cli_args = [ + "--llm-search-enable", + "--periodic-concurrency", + "10:100:10", + "--run-config-search-max-max-token-count", + "1", + "--run-config-search-max-text-input-length", + "1", + "--run-config-search-max-request-period", + "1", + ] + self._run_and_test_perf_analyzer_config_generator( + yaml_str, expected_configs, pa_cli_args + ) + + def test_periodic_concurrency_search(self): + """ + Test LLM Search: + - Period Concurrency using RCS values + + Max token set to 1 + Text input set to 1 + """ + + # yapf: disable + yaml_str = (""" + perf_analyzer_flags: + input-data: input-data.json + profile_models: + - my-model + """) + # yapf: enable + + periodic_concurrencies = [ + "16:32:8", + "16:32:16", + "16:64:8", + "16:64:16", + "32:64:8", + "32:64:16", + "32:64:32", + ] + expected_configs = [ + construct_perf_analyzer_config( + llm_search_mode=True, periodic_concurrency=pc + ) + for pc in periodic_concurrencies + ] + + pa_cli_args = [ + "--llm-search-enable", + "--run-config-search-max-max-token-count", + "1", + "--run-config-search-max-text-input-length", + "1", + "--run-config-search-max-periodic-concurrency", + "64", + "--run-config-search-min-periodic-concurrency-step", + "8", + "--run-config-search-max-request-period", + "1", + ] + self._run_and_test_perf_analyzer_config_generator( + yaml_str, expected_configs, pa_cli_args + ) + def test_perf_analyzer_config_ssl_options(self): """ Test Perf Analyzer SSL options: @@ -754,13 +954,17 @@ def _run_and_test_perf_analyzer_config_generator( config = evaluate_mock_config(args, yaml_str, subcommand="profile") - pacg = PerfAnalyzerConfigGenerator( - config, - config.profile_models[0].model_name(), - config.profile_models[0].perf_analyzer_flags(), - config.profile_models[0].parameters(), - early_exit, - ) + with patch( + "model_analyzer.config.generate.perf_analyzer_config_generator.open", + mock_open(read_data=self._input_data), + ): + pacg = PerfAnalyzerConfigGenerator( + config, + config.profile_models[0].model_name(), + config.profile_models[0].perf_analyzer_flags(), + config.profile_models[0].parameters(), + early_exit, + ) perf_analyzer_configs = [] for perf_config in pacg.get_configs(): @@ -824,6 +1028,10 @@ def setUp(self): ) self.mock_os.start() + self._input_data = """{ + "data": [{"text_input": ["Hello, my name is"], "stream": [true]}] + }""" + def tearDown(self): self.mock_os.stop() patch.stopall() diff --git a/tests/test_record_types.py b/tests/test_record_types.py index 4bd6d8b32..4d3482af1 100755 --- a/tests/test_record_types.py +++ b/tests/test_record_types.py @@ -59,6 +59,8 @@ def setUp(self): "perf_client_send_recv", "perf_server_compute_input", "gpu_power_usage", + "avg_first_token_latency", + "avg_token_to_token_latency", ] } self.more_is_better_types = { diff --git a/tests/test_result_table_manager.py b/tests/test_result_table_manager.py index 278ceb5ea..854bb0543 100755 --- a/tests/test_result_table_manager.py +++ b/tests/test_result_table_manager.py @@ -304,6 +304,10 @@ def test_get_common_row_items_with_backend_parameters(self): dynamic_batchings=None, instance_groups=None, max_batch_sizes=None, + periodic_concurrencies=None, + max_tokens=None, + request_period=None, + text_input_length=None, backend_parameters=backend_parameters, ) self.assertEqual(