Add MVP LLM support to MA #783

Merged · 12 commits · Nov 3, 2023
10 changes: 8 additions & 2 deletions model_analyzer/analyzer.py
@@ -136,8 +136,14 @@ def profile(

if not self._config.skip_summary_reports:
self._create_summary_tables(verbose)
self._create_summary_reports(mode)
self._create_detailed_reports(mode)

# TODO TMA-1401: need to figure out summary reporting for LLMs
if not self._config.is_llm_model():
self._create_summary_reports(mode)

# TODO TMA-1443: need to figure out detailed reporting for LLMs
if not self._config.is_llm_model():
self._create_detailed_reports(mode)

self._check_for_perf_analyzer_errors()

@@ -79,10 +79,7 @@ def __init__(
logger.info("")
AutomaticModelConfigGenerator._log_first_run = True

self._max_instance_count = config.run_config_search_max_instance_count
self._min_instance_count = config.run_config_search_min_instance_count
self._max_model_batch_size = config.run_config_search_max_model_batch_size
self._min_model_batch_size = config.run_config_search_min_model_batch_size
self._set_min_max_search_values(config)

self._instance_kind = "KIND_CPU" if self._cpu_only else "KIND_GPU"

@@ -91,7 +88,7 @@ def __init__(

self._reset_max_batch_size()

if not self._early_exit_enable:
if not self._early_exit_enable and not self._config.is_llm_model():
raise TritonModelAnalyzerException(
"Early exit disable is not supported in automatic model config generator"
)
@@ -162,3 +159,9 @@ def _get_curr_param_combo(self) -> Dict:
config["dynamic_batching"] = {}

return config

def _set_min_max_search_values(self, config: ConfigCommandProfile) -> None:
self._max_instance_count = config.run_config_search_max_instance_count
self._min_instance_count = config.run_config_search_min_instance_count
self._max_model_batch_size = config.run_config_search_max_model_batch_size
self._min_model_batch_size = config.run_config_search_min_model_batch_size
@@ -29,7 +29,7 @@
from model_analyzer.config.run.run_config import RunConfig
from model_analyzer.constants import LOGGER_NAME
from model_analyzer.device.gpu_device import GPUDevice
from model_analyzer.result.parameter_search import ParameterSearch
from model_analyzer.result.inference_load_search import InferenceLoadSearch
from model_analyzer.result.result_manager import ResultManager
from model_analyzer.result.run_config_measurement import RunConfigMeasurement
from model_analyzer.triton.client.client import TritonClient
@@ -39,10 +39,10 @@
logger = logging.getLogger(LOGGER_NAME)


class BrutePlusBinaryParameterSearchRunConfigGenerator(ConfigGeneratorInterface):
class BrutePlusBinarySearchRunConfigGenerator(ConfigGeneratorInterface):
"""
First run BruteRunConfigGenerator for a brute search, then for
automatic searches use ParameterSearch to perform a binary search
automatic searches use InferenceLoadSearch to perform a binary search
"""

def __init__(
@@ -116,7 +116,11 @@ def _create_brute_run_config_generator(self) -> BruteRunConfigGenerator:

def _can_binary_search_top_results(self) -> bool:
for model in self._models:
if model.parameters()["concurrency"] or model.parameters()["request_rate"]:
if (
model.parameters()["concurrency"]
or model.parameters()["request_rate"]
or self._config.is_llm_model()
):
return False

return True
@@ -132,17 +136,19 @@ def _binary_search_over_top_results(self) -> Generator[RunConfig, None, None]:
for result in top_results:
run_config = deepcopy(result.run_config())
model_parameters = self._get_model_parameters(model_name)
parameter_search = ParameterSearch(
inference_load_search = InferenceLoadSearch(
config=self._config,
model_parameters=model_parameters,
skip_parameter_sweep=True,
skip_inference_load_sweep=True,
)
for parameter in parameter_search.search_parameters():
run_config = self._set_parameter(
run_config, model_parameters, parameter
for inference_load in inference_load_search.search_inference_loads():
run_config = self._set_inference_load(
run_config, model_parameters, inference_load
)
yield run_config
parameter_search.add_run_config_measurement(self._last_measurement)
inference_load_search.add_run_config_measurement(
self._last_measurement
)

def _get_model_parameters(self, model_name: str) -> Dict:
for model in self._models:
@@ -151,14 +157,14 @@ def _get_model_parameters(self, model_name: str) -> Dict:

return {}

def _set_parameter(
self, run_config: RunConfig, model_parameters: Dict, parameter: int
def _set_inference_load(
self, run_config: RunConfig, model_parameters: Dict, inference_load: int
) -> RunConfig:
for model_run_config in run_config.model_run_configs():
perf_config = model_run_config.perf_config()
if self._config.is_request_rate_specified(model_parameters):
perf_config.update_config({"request-rate-range": parameter})
perf_config.update_config({"request-rate-range": inference_load})
else:
perf_config.update_config({"concurrency-range": parameter})
perf_config.update_config({"concurrency-range": inference_load})

return run_config
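The renamed generator drives the search as a yield/measure loop: for each top result it asks InferenceLoadSearch for the next inference load (concurrency or request rate), applies it to a copy of the run config, yields that config for profiling, and feeds the resulting measurement back before requesting the next load. The standalone sketch below mimics that interaction with a hypothetical SimpleLoadSearch stand-in (a plain doubling sweep; the real InferenceLoadSearch additionally refines the result with a binary search), so the names and stopping rule here are illustrative only, not Model Analyzer's API.

# Illustrative sketch only (assumed names, not the Model Analyzer API):
# a stand-in search object that doubles the load until throughput stops improving.
from typing import Generator, Optional


class SimpleLoadSearch:
    def __init__(self, max_load: int = 1024) -> None:
        self._max_load = max_load
        self._last_throughput: Optional[float] = None
        self._improving = True

    def search_loads(self) -> Generator[int, None, None]:
        load = 1
        while load <= self._max_load and self._improving:
            yield load
            load *= 2

    def add_measurement(self, throughput: float) -> None:
        if self._last_throughput is not None and throughput <= self._last_throughput:
            self._improving = False
        self._last_throughput = throughput


def fake_measure(load: int) -> float:
    # Stand-in for a perf_analyzer run; throughput saturates near a load of 32
    return float(min(load, 32)) * 100.0


search = SimpleLoadSearch()
for load in search.search_loads():
    throughput = fake_measure(load)
    print(f"concurrency-range={load} -> {throughput} infer/sec")
    search.add_measurement(throughput)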
@@ -80,7 +80,7 @@ def __init__(
self._curr_results: List = [[] for n in range(self._num_models)]
self._curr_generators: Dict[int, ConfigGeneratorInterface] = {}

self._skip_default_config = skip_default_config
self._skip_default_config = skip_default_config or config.is_llm_model()

def set_last_results(
self, measurements: List[Optional[RunConfigMeasurement]]
42 changes: 39 additions & 3 deletions model_analyzer/config/generate/generator_utils.py
@@ -15,7 +15,7 @@
# limitations under the License.

from itertools import product
from typing import Dict, List
from typing import Dict, List, Optional


class GeneratorUtils:
@@ -80,8 +80,8 @@ def generate_combinations(value: object) -> List:
@staticmethod
def generate_parameter_combinations(params: Dict) -> List[Dict]:
"""
Generate a list of all possible subdictionaries
from given dictionary. The subdictionaries will
Generate a list of all possible sub-dictionaries
from given dictionary. The sub-dictionaries will
have all the same keys, but only one value from
each key.

@@ -108,9 +108,45 @@ def generate_doubled_list(min_value: int, max_value: int) -> List[int]:
The value that the generated list will not exceed
"""

assert min_value <= max_value

list = []
val = 1 if min_value == 0 else min_value
while val <= max_value:
list.append(val)
val *= 2
return list

@staticmethod
def extract_value_from_request_parameter(request_parameter: Optional[str]) -> int:
if not request_parameter:
return 0

# Format is: <parameter>:<value>:<type>
# Example: max_tokens:10:int
_, value, _ = request_parameter.split(":")

# this catches the case for non-LLM models where the user has specified request parameters
try:
int(value)
except ValueError as _:
return 0

return int(value)

@staticmethod
def extract_text_input_length_from_input_data(input_data: Optional[str]) -> int:
if not input_data:
return 0

# format is input-data-<num>.json
_, _, text_input_length = input_data.split("-")
text_input_length, _ = text_input_length.split(".")

# this catches the case for non-LLM models where the user has specified input data
try:
int(text_input_length)
except ValueError as _:
return 0

return int(text_input_length)
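The two new helpers pull a numeric value out of the string formats noted in their comments ("<parameter>:<value>:<type>" request parameters and "input-data-<num>.json" file names), falling back to 0 when the value is absent or non-numeric. A quick usage illustration, assuming the import path shown in the file header above:

from model_analyzer.config.generate.generator_utils import GeneratorUtils

# Request parameter in "<parameter>:<value>:<type>" form
GeneratorUtils.extract_value_from_request_parameter("max_tokens:10:int")        # -> 10
GeneratorUtils.extract_value_from_request_parameter("name:resnet:string")       # -> 0 (non-numeric value)
GeneratorUtils.extract_value_from_request_parameter(None)                       # -> 0

# Input-data file name in "input-data-<num>.json" form
GeneratorUtils.extract_text_input_length_from_input_data("input-data-128.json") # -> 128
GeneratorUtils.extract_text_input_length_from_input_data(None)                  # -> 0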
12 changes: 10 additions & 2 deletions model_analyzer/config/generate/model_run_config_generator.py
@@ -150,5 +150,13 @@ def _determine_early_exit_enables(
concurrency_specified = model.parameters()["concurrency"]
config_parameters_exist = model.model_config_parameters()

self._pacg_early_exit_enable = early_exit_enable or not concurrency_specified
self._mcg_early_exit_enable = early_exit_enable or not config_parameters_exist
if config.is_llm_model():
self._pacg_early_exit_enable = False
self._mcg_early_exit_enable = False
else:
self._pacg_early_exit_enable = (
early_exit_enable or not concurrency_specified
)
self._mcg_early_exit_enable = (
early_exit_enable or not config_parameters_exist
)