Add LLM support to Brute Search #769

Merged

merged 23 commits on Oct 12, 2023
Changes from 11 commits
Commits (23)
49b4e15
Initial coding complete
nv-braf Oct 5, 2023
43d5c1d
First unit test passing
nv-braf Oct 5, 2023
d765027
Adding test for prompt length
nv-braf Oct 5, 2023
c198e5a
Refactor PACG methods
nv-braf Oct 5, 2023
79aa02a
Further refactoring
nv-braf Oct 5, 2023
ac81a6b
Ensure early exit isn't enabled for LLM models
nv-braf Oct 5, 2023
015a2c2
Fix type checking errors
nv-braf Oct 6, 2023
2619b83
Attempt at fixing codeql issue
nv-braf Oct 7, 2023
9f2a065
Revert "Attempt at fixing codeql issue"
nv-braf Oct 10, 2023
c5b702e
Attempt at codeQL fix
nv-braf Oct 10, 2023
cbdc746
Adding deepcopy back in
nv-braf Oct 10, 2023
0c909ea
Removing deepcopy in an attempt to fix codeQL errors
nv-braf Oct 10, 2023
3f4450a
Update model_analyzer/config/input/config_command_profile.py
nv-braf Oct 11, 2023
c69b577
Update model_analyzer/config/generate/perf_analyzer_config_generator.py
nv-braf Oct 11, 2023
b1eed54
Update model_analyzer/config/generate/perf_analyzer_config_generator.py
nv-braf Oct 11, 2023
a2fa148
Update model_analyzer/config/generate/perf_analyzer_config_generator.py
nv-braf Oct 11, 2023
c96d897
Moving location of method
nv-braf Oct 11, 2023
daee4cc
Changing parameter to inference load
nv-braf Oct 11, 2023
3966c6c
Changing parameter to inference load
nv-braf Oct 11, 2023
fbe1abf
Changing prompt length to text input length
nv-braf Oct 11, 2023
abec25d
Changing max_tokens to use request-parameter
nv-braf Oct 11, 2023
f8729db
Fix input-data typo
nv-braf Oct 12, 2023
2cda3df
Changing non-parameter to parameter
nv-braf Oct 12, 2023
model_analyzer/config/generate/automatic_model_config_generator.py
@@ -79,10 +79,7 @@ def __init__(
logger.info("")
AutomaticModelConfigGenerator._log_first_run = True

self._max_instance_count = config.run_config_search_max_instance_count
self._min_instance_count = config.run_config_search_min_instance_count
self._max_model_batch_size = config.run_config_search_max_model_batch_size
self._min_model_batch_size = config.run_config_search_min_model_batch_size
self._set_min_max_search_values(config)

self._instance_kind = "KIND_CPU" if self._cpu_only else "KIND_GPU"

@@ -162,3 +159,9 @@ def _get_curr_param_combo(self) -> Dict:
config["dynamic_batching"] = {}

return config

def _set_min_max_search_values(self, config: ConfigCommandProfile) -> None:
self._max_instance_count = config.run_config_search_max_instance_count
self._min_instance_count = config.run_config_search_min_instance_count
self._max_model_batch_size = config.run_config_search_max_model_batch_size
self._min_model_batch_size = config.run_config_search_min_model_batch_size
12 changes: 10 additions & 2 deletions model_analyzer/config/generate/model_run_config_generator.py
@@ -150,5 +150,13 @@ def _determine_early_exit_enables(
concurrency_specified = model.parameters()["concurrency"]
config_parameters_exist = model.model_config_parameters()

self._pacg_early_exit_enable = early_exit_enable or not concurrency_specified
self._mcg_early_exit_enable = early_exit_enable or not config_parameters_exist
if config.is_llm_model():
self._pacg_early_exit_enable = False
self._mcg_early_exit_enable = False
else:
self._pacg_early_exit_enable = (
early_exit_enable or not concurrency_specified
)
self._mcg_early_exit_enable = (
early_exit_enable or not config_parameters_exist
)
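
For orientation, a minimal standalone sketch of the early-exit decision added above, with plain booleans standing in for the config and model objects (illustrative only, not the Model Analyzer API):

# Illustrative sketch, not the Model Analyzer code.
def determine_early_exit_enables(is_llm_model, early_exit_enable,
                                 concurrency_specified, config_parameters_exist):
    # Returns (pacg_early_exit_enable, mcg_early_exit_enable).
    # LLM models never early exit; otherwise the pre-existing rules apply.
    if is_llm_model:
        return False, False
    return (early_exit_enable or not concurrency_specified,
            early_exit_enable or not config_parameters_exist)

# e.g. determine_early_exit_enables(True, True, True, True)    -> (False, False)
#      determine_early_exit_enables(False, False, False, True) -> (True, False)
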
230 changes: 186 additions & 44 deletions model_analyzer/config/generate/perf_analyzer_config_generator.py
@@ -14,13 +14,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
from typing import Generator, List, Optional
from copy import deepcopy
from typing import Dict, Generator, List, Optional, Tuple

from model_analyzer.config.input.config_command_profile import ConfigCommandProfile
from model_analyzer.config.input.config_defaults import DEFAULT_INPUT_JSON_PATH
from model_analyzer.constants import (
LOGGER_NAME,
THROUGHPUT_MINIMUM_CONSECUTIVE_BATCH_SIZE_TRIES,
THROUGHPUT_MINIMUM_CONSECUTIVE_NON_PARAMETER_TRIES,
THROUGHPUT_MINIMUM_CONSECUTIVE_PARAMETER_TRIES,
THROUGHPUT_MINIMUM_GAIN,
)
@@ -62,7 +65,7 @@
custom perf analyzer configuration

model_parameters: Dict
model constraints for batch_sizes, concurrency and/or request rate
model constraints for batch sizes, concurrency, request rate, prompt length, etc.

early_exit_enable: Bool
If true, this class can early exit during search of concurrency/request rate
@@ -72,10 +75,10 @@

# All configs are pregenerated in _configs[][]
# Indexed as follows:
# _configs[_curr_batch_size_index][_curr_parameter_index]
# _configs[_curr_non_parameter_index][_curr_parameter_index]
#
self._curr_non_parameter_index = 0
self._curr_parameter_index = 0
self._curr_batch_size_index = 0
self._configs: List[List[PerfAnalyzerConfig]] = []
self._parameter_warning_printed = False

@@ -85,16 +88,35 @@

self._last_results: List[RunConfigMeasurement] = []
self._parameter_results: List[Optional[RunConfigMeasurement]] = []
self._batch_size_results: List[Optional[RunConfigMeasurement]] = []
self._non_parameter_results: List[Optional[RunConfigMeasurement]] = []

self._model_name = model_name
self._perf_analyzer_flags = model_perf_analyzer_flags

self._batch_sizes = sorted(model_parameters["batch_sizes"])
self._cli_config = cli_config

self._llm_input_dict = self._create_input_dict(model_perf_analyzer_flags)

self._perf_analyzer_flags = self._set_perf_analyzer_flags(
model_perf_analyzer_flags
)

self._model_parameters = model_parameters
self._parameters = self._create_parameter_list()

self._batch_sizes = sorted(model_parameters["batch_sizes"])
self._prompt_lengths = self._create_prompt_length_list()
self._max_token_counts = self._create_max_token_count_list()

self._perf_config_non_parameter_values = (
self._create_non_parameter_perf_config_values()
)
self._non_parameter_count = len(
utils.generate_parameter_combinations(
self._perf_config_non_parameter_values
)
)

self._input_json_filename = DEFAULT_INPUT_JSON_PATH + "/input-data.json"

self._generate_perf_configs()

@staticmethod
@@ -136,7 +158,7 @@
break

self._generator_started = True
config = self._configs[self._curr_batch_size_index][
config = self._configs[self._curr_non_parameter_index][
self._curr_parameter_index
]
yield (config)
@@ -168,6 +190,35 @@
self._last_results = measurement
self._parameter_results.extend(measurement)

def _set_perf_analyzer_flags(self, model_perf_analyzer_flags: dict) -> dict:
# For LLM models we will be creating custom input data based on prompt length
perf_analyzer_flags = deepcopy(model_perf_analyzer_flags)
# perf_analyzer_flags = {
# key: value for key, value in model_perf_analyzer_flags.items()
# }

if self._cli_config.is_llm_model():
perf_analyzer_flags.pop("input-data")

return perf_analyzer_flags

def _create_input_dict(self, model_perf_analyzer_flags: dict) -> dict:
if self._cli_config.is_llm_model():
with open(model_perf_analyzer_flags["input-data"], "r") as f:
input_dict = json.load(f)

return input_dict
else:
return {}

def _modify_prompt_in_input_dict(self, prompt_length: int) -> Dict:
modified_input_dict = deepcopy(self._llm_input_dict)

modified_prompt = ["hi"] * prompt_length
modified_input_dict["data"][0]["PROMPT"] = modified_prompt

return modified_input_dict
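# Illustrative note, not part of this change: with prompt_length=3 and an
# input-data JSON of the shape consumed above, the modified copy would look
# roughly like {"data": [{"PROMPT": ["hi", "hi", "hi"], ...}]}.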

def _create_parameter_list(self) -> List[int]:
# The two possible parameters are request rate or concurrency
# Concurrency is the default and will be used unless the user specifies
@@ -199,55 +250,146 @@
self._cli_config.run_config_search_max_concurrency,
)

def _create_prompt_length_list(self) -> List[int]:
if not self._cli_config.is_llm_model():
return []

if self._model_parameters["prompt_length"]:
return sorted(self._model_parameters["prompt_length"])
elif self._cli_config.run_config_search_disable:
return [1]
else:
return utils.generate_doubled_list(
self._cli_config.run_config_search_min_prompt_length,
self._cli_config.run_config_search_max_prompt_length,
)

def _create_max_token_count_list(self) -> List[int]:
if not self._cli_config.is_llm_model():
return []

if self._model_parameters["max_token_count"]:
return sorted(self._model_parameters["max_token_count"])
elif self._cli_config.run_config_search_disable:
return [1]
else:
return utils.generate_doubled_list(
self._cli_config.run_config_search_min_token_count,
self._cli_config.run_config_search_max_token_count,
)
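# Illustrative note, not part of this change: generate_doubled_list is assumed
# to sweep by doubling from the min to the max value, e.g. a min token count of
# 1 and a max of 256 would yield [1, 2, 4, 8, 16, 32, 64, 128, 256].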

def _generate_perf_configs(self) -> None:
perf_config_non_parameter_values = (
self._create_non_parameter_perf_config_values()
all_non_parameter_combinations = utils.generate_parameter_combinations(
self._perf_config_non_parameter_values
)

for params in utils.generate_parameter_combinations(
perf_config_non_parameter_values
):
configs_with_concurrency = []
for unmodified_non_parameter_combination in all_non_parameter_combinations:
all_perf_configs_for_a_given_parameter = []
for parameter in self._parameters:
new_perf_config = PerfAnalyzerConfig()

new_perf_config.update_config_from_profile_config(
self._model_name, self._cli_config
new_perf_config = self._create_new_perf_config(
parameter, unmodified_non_parameter_combination
)
all_perf_configs_for_a_given_parameter.append(new_perf_config)

new_perf_config.update_config(params)
self._configs.append(all_perf_configs_for_a_given_parameter)

if self._cli_config.is_request_rate_specified(self._model_parameters):
new_perf_config.update_config({"request-rate-range": parameter})
else:
new_perf_config.update_config({"concurrency-range": parameter})
def _create_new_perf_config(
self, parameter: int, unmodified_non_parameter_combination: Dict
) -> PerfAnalyzerConfig:
perf_config = self._create_base_perf_config()

(
prompt_length,
modified_non_parameter_combination,
) = self._extract_prompt_length(unmodified_non_parameter_combination)

self._update_perf_config_based_on_non_parameter_combination(
perf_config, modified_non_parameter_combination
)
self._update_perf_config_based_on_parameter(perf_config, parameter)
self._update_perf_config_based_on_perf_analyzer_flags(perf_config)
self._update_perf_config_for_llm_model(perf_config, prompt_length)

return perf_config

def _create_base_perf_config(self) -> PerfAnalyzerConfig:
perf_config = PerfAnalyzerConfig()
perf_config.update_config_from_profile_config(
self._model_name, self._cli_config
)

# User provided flags can override the search parameters
new_perf_config.update_config(self._perf_analyzer_flags)
return perf_config

configs_with_concurrency.append(new_perf_config)
self._configs.append(configs_with_concurrency)
def _extract_prompt_length(
self, unmodified_parameter_combination: Dict
) -> Tuple[int, Dict]:
if self._cli_config.is_llm_model():
modified_parameter_combination = deepcopy(unmodified_parameter_combination)
prompt_length = modified_parameter_combination.pop("prompt-length")

return prompt_length, modified_parameter_combination
else:
return 0, unmodified_parameter_combination

def _update_perf_config_based_on_non_parameter_combination(
self, perf_config: PerfAnalyzerConfig, non_parameter_combination: Dict
) -> None:
perf_config.update_config(non_parameter_combination)

def _update_perf_config_based_on_perf_analyzer_flags(
self, perf_config: PerfAnalyzerConfig
) -> None:
perf_config.update_config(self._perf_analyzer_flags)

def _update_perf_config_based_on_parameter(
self, perf_config: PerfAnalyzerConfig, parameter: int
) -> None:
if self._cli_config.is_llm_model():
perf_config.update_config({"periodic-concurrency-range": parameter})
elif self._cli_config.is_request_rate_specified(self._model_parameters):
perf_config.update_config({"request-rate-range": parameter})
else:
perf_config.update_config({"concurrency-range": parameter})

def _update_perf_config_for_llm_model(
self, perf_config: PerfAnalyzerConfig, prompt_length: int
) -> None:
if not self._cli_config.is_llm_model():
return

modified_input_dict = self._modify_prompt_in_input_dict(prompt_length)
self._write_modified_input_dict_to_file(modified_input_dict)

perf_config.update_config({"input-data": self._input_json_filename})

def _write_modified_input_dict_to_file(self, modified_input_dict: Dict) -> None:
temp_input_data = open(self._input_json_filename, "w")
json.dump(modified_input_dict, temp_input_data)
temp_input_data.close()

def _create_non_parameter_perf_config_values(self) -> dict:
perf_config_values = {
"batch-size": self._batch_sizes,
}

if self._cli_config.is_llm_model():
perf_config_values["max-token-count"] = self._max_token_counts
perf_config_values["prompt-length"] = self._prompt_lengths

return perf_config_values

def _step(self) -> None:
self._step_parameter()

if self._done_walking_parameters():
self._add_best_throughput_to_batch_sizes()
self._add_best_throughput_to_non_parameter_results()
self._reset_parameters()
self._step_batch_size()
self._step_non_parameter()

def _add_best_throughput_to_batch_sizes(self) -> None:
def _add_best_throughput_to_non_parameter_results(self) -> None:
if self._parameter_results:
# type is List[Optional[RCM]]
best = max(self._parameter_results) # type: ignore
self._batch_size_results.append(best)
self._non_parameter_results.append(best)

def _reset_parameters(self) -> None:
self._curr_parameter_index = 0
@@ -257,11 +399,11 @@
def _step_parameter(self) -> None:
self._curr_parameter_index += 1

def _step_batch_size(self) -> None:
self._curr_batch_size_index += 1
def _step_non_parameter(self) -> None:
self._curr_non_parameter_index += 1

def _done_walking(self) -> bool:
return self._done_walking_batch_sizes()
return self._done_walking_non_parameters()

def _done_walking_parameters(self) -> bool:
if len(self._parameters) == self._curr_parameter_index:
@@ -280,11 +422,11 @@
return True
return False

def _done_walking_batch_sizes(self) -> bool:
if len(self._batch_sizes) == self._curr_batch_size_index:
def _done_walking_non_parameters(self) -> bool:
if self._non_parameter_count == self._curr_non_parameter_index:
return True

if self._early_exit_enable and not self._batch_size_throughput_gain_valid():
if self._early_exit_enable and not self._non_parameter_throughput_gain_valid():
logger.info(
"No longer increasing client batch size as throughput has plateaued"
)
@@ -303,10 +445,10 @@
min_gain=THROUGHPUT_MINIMUM_GAIN,
)

def _batch_size_throughput_gain_valid(self) -> bool:
"""Check if any of the last X batch_size results resulted in valid gain"""
def _non_parameter_throughput_gain_valid(self) -> bool:
"""Check if any of the last X non-parameter results resulted in valid gain"""
return PerfAnalyzerConfigGenerator.throughput_gain_valid_helper(
throughputs=self._batch_size_results,
min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_BATCH_SIZE_TRIES,
throughputs=self._non_parameter_results,
min_tries=THROUGHPUT_MINIMUM_CONSECUTIVE_NON_PARAMETER_TRIES,
min_gain=THROUGHPUT_MINIMUM_GAIN,
)
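
For orientation, a minimal standalone sketch of how the pregenerated config grid above is laid out. itertools.product stands in for utils.generate_parameter_combinations, plain dicts stand in for PerfAnalyzerConfig, and the sweep values are made up for illustration; this is not the Model Analyzer implementation.

from itertools import product

# Illustrative sketch; option names mirror the diff above, but this is not
# the Model Analyzer code.

# Stand-in for utils.generate_parameter_combinations: expand a dict of lists
# into one dict per combination.
def generate_parameter_combinations(values):
    keys = list(values.keys())
    return [dict(zip(keys, combo)) for combo in product(*values.values())]

# Hypothetical LLM sweep values.
non_parameter_values = {
    "batch-size": [1],
    "max-token-count": [1, 2],
    "prompt-length": [16, 32],
}
parameters = [1, 2, 4]  # periodic-concurrency values for an LLM model

# _configs[non_parameter_index][parameter_index], as in the class comment.
configs = []
for combo in generate_parameter_combinations(non_parameter_values):
    prompt_length = combo.pop("prompt-length")  # consumed via the input JSON
    row = []
    for parameter in parameters:
        config = dict(combo)
        config["periodic-concurrency-range"] = parameter
        config["input-data"] = "input-data.json"  # rewritten with ["hi"] * prompt_length
        row.append(config)
    configs.append(row)

print(len(configs), "x", len(configs[0]))  # 4 x 3 grid of perf configs

The measurement walk then moves along a row (_step_parameter) and, once done, advances to the next row (_step_non_parameter); the early-exit checks above can cut either walk short when throughput plateaus, except for LLM models, where early exit is disabled.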