Skip to content

Commit

Permalink
Add Debug info to Optuna (#889)
Browse files Browse the repository at this point in the history
* Adding debug info + bug fixes

* Fixes based on PR
  • Loading branch information
nv-braf committed Jun 6, 2024
1 parent f32d186 commit b399d10
Show file tree
Hide file tree
Showing 3 changed files with 119 additions and 22 deletions.
98 changes: 82 additions & 16 deletions model_analyzer/config/generate/optuna_run_config_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from model_analyzer.config.run.model_run_config import ModelRunConfig
from model_analyzer.config.run.run_config import RunConfig
from model_analyzer.constants import LOGGER_NAME
from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
from model_analyzer.perf_analyzer.perf_config import PerfAnalyzerConfig
from model_analyzer.result.run_config_measurement import RunConfigMeasurement
from model_analyzer.triton.model.model_config import ModelConfig
Expand Down Expand Up @@ -69,6 +70,9 @@ class OptunaRunConfigGenerator(ConfigGeneratorInterface):
"max_queue_delay_microseconds",
]

# TODO: TMA-1927: Figure out the correct value for this
NO_MEASUREMENT_SCORE = -1

def __init__(
self,
config: ConfigCommandProfile,
Expand Down Expand Up @@ -102,6 +106,8 @@ def __init__(

self._num_models = len(models)
self._last_measurement: Optional[RunConfigMeasurement] = None
self._best_config_name = ""
self._best_config_score: Optional[float] = None

self._c_api_mode = config.triton_launch_mode == "c_api"

Expand Down Expand Up @@ -137,21 +143,48 @@ def get_configs(self) -> Generator[RunConfig, None, None]:
RunConfig
The next RunConfig generated by this class
"""
logger.info(
"Measuring default configuration to establish a baseline measurement"
)
default_run_config = self._create_default_run_config()
yield default_run_config
self._default_measurement = self._last_measurement

max_configs_to_search = self._determine_maximum_number_of_configs_to_search()
self._capture_default_measurement(default_run_config)
self._set_best_measurement(default_run_config)

if logging.DEBUG:
self._print_debug_search_space_info()

max_configs_to_search = self._determine_maximum_number_of_configs_to_search()
# TODO: TMA-1885: Need an early exit strategy
for _ in range(max_configs_to_search):
for trial_count in range(max_configs_to_search):
trial = self._study.ask()
trial_objectives = self._create_trial_objectives(trial)
logger.debug(f"Trial {trial_count+1} of {max_configs_to_search}:")
run_config = self._create_objective_based_run_config(trial_objectives)
yield run_config

score = self._calculate_score()
self._set_best_measurement(run_config, score)

if logging.DEBUG:
self._print_debug_score_info(run_config, score)

self._study.tell(trial, score)

def _capture_default_measurement(self, default_run_config: RunConfig) -> None:
    """
    Stores the most recent measurement as the default (baseline) measurement

    Parameters
    ----------
    default_run_config: RunConfig
        The default configuration. It is not read by this method; the
        measurement itself is taken from self._last_measurement.

    Raises
    ------
    TritonModelAnalyzerException
        If self._last_measurement is falsy, i.e. nothing was recorded
    """
    baseline = self._last_measurement
    if baseline:
        self._default_measurement = baseline
        return

    raise TritonModelAnalyzerException(
        "Default configuration did not return a measurement. Please check PA/Tritonserver log files."
    )

def _set_best_measurement(self, run_config: RunConfig, score: float = 0) -> None:
    """
    Remembers run_config as the best configuration when its score beats
    the best score recorded so far

    Parameters
    ----------
    run_config: RunConfig
        The configuration whose score is being considered
    score: float
        The score of run_config (defaults to 0)
    """
    current_best = self._best_config_score

    # Keep the current best unless none is recorded yet or the new score
    # is strictly greater
    if current_best is not None and not score > current_best:
        return

    self._best_config_name = run_config.model_variants_name()
    self._best_config_score = score

def _determine_maximum_number_of_configs_to_search(self) -> int:
max_trials_based_on_percentage_of_search_space = (
self._determine_trials_based_on_max_percentage_of_search_space()
Expand Down Expand Up @@ -193,14 +226,30 @@ def _decide_between_percentage_and_trial_count(
self._config.optuna_max_trials
< max_trials_based_on_percentage_of_search_space
):
logger.debug(
f"Maximum number of trials: {self._config.optuna_max_trials} (optuna_max_trials)"
)
max_configs_to_search = self._config.optuna_max_trials
else:
logger.debug(
f"Maximum number of trials: {max_trials_based_on_percentage_of_search_space} "
f"({self._config.max_percentage_of_search_space}% of search space)"
)
max_configs_to_search = max_trials_based_on_percentage_of_search_space
elif max_trials_set_by_user:
logger.debug(
f"Maximum number of trials: {self._config.optuna_max_trials} (set by max. trials)"
)
max_configs_to_search = self._config.optuna_max_trials
else:
logger.debug(
f"Maximum number of trials: {max_trials_based_on_percentage_of_search_space} "
f"({self._config.max_percentage_of_search_space}% of search space)"
)
max_configs_to_search = max_trials_based_on_percentage_of_search_space

if logging.DEBUG:
logger.info("")
return max_configs_to_search

def _create_trial_objectives(self, trial: optuna.Trial) -> TrialObjectives:
Expand Down Expand Up @@ -239,15 +288,15 @@ def _create_trial_objective(
return objective

def _get_objective_concurrency(self, trial_objectives: TrialObjectives) -> int:
    """
    Derives the concurrency to profile with from the trial's objectives

    The value is 2 * instance count * batch size, capped at the
    configured run_config_search_max_concurrency.

    Parameters
    ----------
    trial_objectives: TrialObjectives
        The objectives for this trial; "instance_group" and "batch_sizes"
        are read and converted with int()

    Returns
    -------
    int
        The concurrency, never larger than
        self._config.run_config_search_max_concurrency
    """
    concurrency_formula = (
        2
        * int(trial_objectives["instance_group"])
        * int(trial_objectives["batch_sizes"])
    )

    # Formula first so that a tie returns the formula value, as before
    concurrency = min(
        concurrency_formula, self._config.run_config_search_max_concurrency
    )

    return concurrency
Expand Down Expand Up @@ -286,7 +335,7 @@ def _create_parameter_combo(
param_combo["dynamic_batching"] = []

# TODO: TMA-1927: Add support for multi-model
if trial_objectives["instance_group"]:
if "instance_group" in trial_objectives:
kind = "KIND_CPU" if self._models[0].cpu_only() else "KIND_GPU"
param_combo["instance_group"] = [
{
Expand All @@ -295,10 +344,10 @@ def _create_parameter_combo(
}
]

if trial_objectives["batch_sizes"]:
if "batch_sizes" in trial_objectives:
param_combo["max_batch_size"] = trial_objectives["batch_sizes"]

if trial_objectives["max_queue_delay_microseconds"]:
if "max_queue_delay_microseconds" in trial_objectives:
param_combo["dynamic_batching"] = {
"max_queue_delay_microseconds": trial_objectives[
"max_queue_delay_microseconds"
Expand All @@ -313,8 +362,7 @@ def _calculate_score(self) -> float:
self._last_measurement
)
else:
# TODO: TMA-1927: Figure out the correct value for this (and make it a constant)
score = -1
score = OptunaRunConfigGenerator.NO_MEASUREMENT_SCORE

return score

Expand Down Expand Up @@ -416,8 +464,26 @@ def _create_perf_analyzer_config(
perf_analyzer_config.update_config(model.perf_analyzer_flags())
return perf_analyzer_config

def _print_debug_logs(
self, measurements: List[Union[RunConfigMeasurement, None]]
def _print_debug_search_space_info(self) -> None:
    """
    Logs the size of the search space followed by one line per search
    parameter, framed by blank info lines for vertical spacing
    """
    search_parameters = self._search_parameters

    logger.info("")
    total_configs = search_parameters.number_of_total_possible_configurations()
    logger.debug(f"Number of configs in search space: {total_configs}")

    for parameter_name in search_parameters.get_search_parameters():
        parameter_info = search_parameters.print_info(parameter_name)
        logger.debug(parameter_info)

    logger.info("")

def _print_debug_score_info(
    self,
    run_config: RunConfig,
    score: float,
) -> None:
    """
    Logs the objective score of a configuration alongside the best
    configuration recorded so far, then a blank info line for spacing

    The score line is skipped when score equals NO_MEASUREMENT_SCORE.
    Scores are shown multiplied by 100 and truncated with int().

    Parameters
    ----------
    run_config: RunConfig
        The configuration that was just scored
    score: float
        The objective score of run_config
    """
    if score != OptunaRunConfigGenerator.NO_MEASUREMENT_SCORE:
        logger.debug(
            f"Objective score for {run_config.model_variants_name()}: {int(score * 100)} --- "  # type: ignore
            f"Best: {self._best_config_name} ({int(self._best_config_score * 100)})"  # type: ignore
        )

    logger.info("")
21 changes: 21 additions & 0 deletions model_analyzer/config/generate/search_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ def __init__(

self._populate_search_parameters()

def get_search_parameters(self) -> Dict[str, SearchParameter]:
    """
    Returns
    -------
    Dict[str, SearchParameter]
        The dictionary of search parameters held by this object, keyed by
        parameter name (the internal dictionary itself, not a copy)
    """
    search_parameters = self._search_parameters
    return search_parameters

def get_parameter(self, name: str) -> Optional[SearchParameter]:
    """
    Parameters
    ----------
    name: str
        The name of the search parameter to look up

    Returns
    -------
    Optional[SearchParameter]
        The parameter stored under name, or None if there is none
    """
    known_parameters = self._search_parameters
    return known_parameters.get(name, None)

Expand Down Expand Up @@ -76,6 +79,24 @@ def number_of_total_possible_configurations(self) -> int:

return total_number_of_configs

def print_info(self, name: str) -> str:
    """
    Builds a one-line, human-readable summary of a search parameter

    Parameters
    ----------
    name: str
        The name of the search parameter; it must be a key of
        self._search_parameters

    Returns
    -------
    str
        The name, then the range ("min to max", as powers of two for
        exponential parameters) or the enumerated list, then the number
        of configurations for the parameter in parentheses
    """
    parameter = self._search_parameters[name]
    category = parameter.category

    if category is ParameterCategory.INTEGER:
        values = f"{parameter.min_range} to {parameter.max_range}"
    elif category is ParameterCategory.EXPONENTIAL:
        values = f"{2**parameter.min_range} to {2**parameter.max_range}"  # type: ignore
    elif (
        category is ParameterCategory.INT_LIST
        or category is ParameterCategory.STR_LIST
    ):
        values = f"{parameter.enumerated_list}"
    else:
        # Any other category contributes no value text, only the count
        values = ""

    config_count = self._number_of_configurations_for_parameter(parameter)

    return f" {name}: {values} ({config_count})"

def _number_of_configurations_for_parameter(
self, parameter: SearchParameter
) -> int:
Expand Down
22 changes: 16 additions & 6 deletions model_analyzer/record/metrics_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,13 +783,23 @@ def _print_run_config_info(self, run_config):
for model_run_config in run_config.model_run_configs():
perf_config = model_run_config.perf_config()
if perf_config["request-rate-range"]:
logger.info(
f"Profiling {model_run_config.model_variant_name()}: client batch size={perf_config['batch-size']}, request-rate-range={perf_config['request-rate-range']}"
)
if perf_config["batch-size"] != 1:
logger.info(
f"Profiling {model_run_config.model_variant_name()}: client batch size={perf_config['batch-size']}, request-rate-range={perf_config['request-rate-range']}"
)
else:
logger.info(
f"Profiling {model_run_config.model_variant_name()}: request-rate-range={perf_config['request-rate-range']}"
)
else:
logger.info(
f"Profiling {model_run_config.model_variant_name()}: client batch size={perf_config['batch-size']}, concurrency={perf_config['concurrency-range']}"
)
if perf_config["batch-size"] != 1:
logger.info(
f"Profiling {model_run_config.model_variant_name()}: client batch size={perf_config['batch-size']}, concurrency={perf_config['concurrency-range']}"
)
else:
logger.info(
f"Profiling {model_run_config.model_variant_name()}: concurrency={perf_config['concurrency-range']}"
)

# Vertical spacing when running multiple models at a time
if len(run_config.model_run_configs()) > 1:
Expand Down

0 comments on commit b399d10

Please sign in to comment.