From 4e96d21798efae73be0ed0ef9a80d8b8b21019bc Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Thu, 6 Jun 2024 22:04:22 -0700 Subject: [PATCH] Documentation update for Optuna (alpha release) (#895) * Documentation update for Optuna (alpha release) * More fixes based on PR --- README.md | 2 + docs/config.md | 39 ++++++++++--- docs/config_search.md | 106 +++++++++++++++++++++++++++++++++++ docs/ensemble_quick_start.md | 2 +- 4 files changed, 139 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 1cbe97e37..29d9a07e9 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ Triton Model Analyzer is a CLI tool which can help you find a more optimal confi ### Search Modes +- [Optuna Search](docs/config_search.md#optuna-search-mode) **_-ALPHA RELEASE-_** allows you to search for every parameter that can be specified in the model configuration, using a hyperparameter optimization framework. Please see the [Optuna](https://optuna.org/) website if you are interested in specific details on how the algorithm functions. + - [Quick Search](docs/config_search.md#quick-search-mode) will **sparsely** search the [Max Batch Size](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#maximum-batch-size), [Dynamic Batching](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher), and [Instance Group](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) spaces by utilizing a heuristic hill-climbing algorithm to help you quickly find a more optimal configuration diff --git a/docs/config.md b/docs/config.md index 875d15936..69a91e7b3 100644 --- a/docs/config.md +++ b/docs/config.md @@ -188,31 +188,31 @@ cpu_only_composing_models: # List of GPU UUIDs to be used for the profiling. 
Use 'all' to profile all the GPUs visible by CUDA [ gpus: | default: 'all' ] -# Search mode. Options are "brute" and "quick" +# Search mode. Options are "brute", "quick", and "optuna" [ run_config_search_mode: | default: brute] -# Minimum concurrency used for the automatic/quick config search +# Minimum concurrency used for the automatic/quick/optuna config search [ run_config_search_min_concurrency: | default: 1 ] -# Maximum concurrency used for the automatic/quick config search +# Maximum concurrency used for the automatic/quick/optuna config search [ run_config_search_max_concurrency: | default: 1024 ] -# Minimum max_batch_size used for the automatic/quick config search +# Minimum max_batch_size used for the automatic/quick/optuna config search [ run_config_search_min_model_batch_size: | default: 1 ] -# Maximum max_batch_size used for the automatic/quick config search +# Maximum max_batch_size used for the automatic/quick/optuna config search [ run_config_search_max_model_batch_size: | default: 128 ] -# Minimum instance group count used for the automatic/quick config search +# Minimum instance group count used for the automatic/quick/optuna config search [ run_config_search_min_instance_count: | default: 1 ] -# Maximum instance group count used for the automatic/quick config search +# Maximum instance group count used for the automatic/quick/optuna config search [ run_config_search_max_instance_count: | default: 5 ] -# Minimum request rate used for the automatic/quick config search +# Minimum request rate used for the automatic/quick/optuna config search [ run_config_search_min_request_rate: | default: 16 ] -# Maximum request rate used for the automatic/quick config search +# Maximum request rate used for the automatic/quick/optuna config search [ run_config_search_max_request_rate: | default: 8092 ] # Maximum number of steps taken during a binary search @@ -227,6 +227,27 @@ cpu_only_composing_models: # Enables the searching of request rate (instead of 
concurrency) [ request_rate_search_enable: | default: false] +# Minimum percentage of the search space to profile when using Optuna +[ min_percentage_of_search_space: | default: 5] + +# Maximum percentage of the search space to profile when using Optuna +[ max_percentage_of_search_space: | default: 10] + +# Minimum number of trials to profile when using Optuna +[ optuna_min_trials: | default: None] + +# Maximum number of trials to profile when using Optuna +[ optuna_max_trials: | default: None] + +# Number of trials without improvement before triggering early exit when using Optuna +[ optuna_early_exit_threshold: | default: 10] + +# Use the concurrency formula instead of searching the concurrency space in Optuna search mode +[ use_concurrency_formula: | default: false] + +# Disables the sweeping of concurrencies for the top-N models after quick/optuna search completion +[ concurrency_sweep_disable: | default: false] + # Always report GPU metrics, even if the model(s) is cpu_only [ always_report_gpu_metrics: | default: false] diff --git a/docs/config_search.md b/docs/config_search.md index b11c7296c..67bf47c6b 100644 --- a/docs/config_search.md +++ b/docs/config_search.md @@ -22,6 +22,7 @@ limitations under the License. 
- [Automatic Brute Search](#automatic-brute-search) - [Manual Brute Search](#manual-brute-search) - [Quick Search Mode](#quick-search-mode) +- [Optuna Search Mode](#optuna-search-mode) - [Ensemble Model Search](#ensemble-model-search) - [BLS Model Search](#bls-model-search) - [LLM Search](#llm-search) @@ -48,6 +49,9 @@ Model Analyzer's `profile` subcommand supports multiple modes when searching to - Single BLS models - Multiple models being profiled concurrently - **Command:** `--run-config-search-mode quick` +- [Optuna Search](config_search.md#optuna-search-mode) **-ALPHA RELEASE-** + - **Search type:** Heuristic sweep using a hyperparameter optimization framework to find an optimal configuration + - **Command:** `--run-config-search-mode optuna` --- @@ -276,6 +280,108 @@ profile_models: --- +## Optuna Search Mode + +**-ALPHA RELEASE-** + +_This mode has the following limitations:_ + +- **Ensemble, BLS or concurrent multi-model profiling is not supported** +- **Profiling with request rate is not supported** + +This mode uses a hyperparameter optimization framework to search the configuration +space, looking for the maximal objective value within the specified constraints. +Please see the [Optuna](https://optuna.org/) website if you are interested in specific details on how the algorithm functions. + +Optuna allows you to search for every parameter that can be specified in the model configuration. Parameters can be specified +with a min/max range (using the run-config-search options) or a list of parameters to test against can be set in the +parameters/model_config_parameters field. + +After Optuna search has found the best config(s), it will then sweep the top-N configurations found (specified by `--num-configs-per-model`) over the default concurrency range before generation of the summary reports. 
+ +--- + +_An example model analyzer YAML config that performs an Optuna Search:_ + +```yaml +model_repository: /path/to/model/repository/ + +run_config_search_mode: optuna +profile_models: + - model_A +``` + +--- + +A number of new configuration options were added to support tailoring the Optuna search to your needs: + +- `--min/max-percentage-of-search-space`: sets the percentage of the space you want Optuna to search +- `--optuna-min/max-trials`: sets the number of trials Optuna will attempt +- `--optuna-early-exit-threshold`: sets the number of trials without improvement before triggering early exit +- `--use-concurrency-formula`: uses a formula (2 \* batch size \* instance group count), rather than sweeping concurrency + +--- + +_An example that performs an Optuna Search using these new configuration options:_ + +```yaml +model_repository: /path/to/model/repository/ + +run_config_search_mode: optuna +run_config_search_max_instance_count: 8 +run_config_search_min_concurrency: 32 +run_config_search_max_concurrency: 256 + +use_concurrency_formula: True +min_percentage_of_search_space: 10 +optuna_max_trials: 200 +optuna_early_exit_threshold: 15 + +profile_models: + model_A: + model_config_parameters: + max_batch_size: [1, 4, 8, 32, 64, 128] + dynamic_batching: + max_queue_delay_microseconds: [100, 200, 300] + parameters: + batch_sizes: 1, 2, 4, 8, 16 +``` + +_The debug output showing how the space will be searched:_ + +```yaml +Number of configs in search space: 720 + batch_sizes: [1, 2, 4, 8, 16] (5) + max_batch_size: [1, 4, 8, 32, 64, 128] (6) + instance_group: 1 to 8 (8) + max_queue_delay_microseconds: [100, 200, 300] (3) + +Minimum number of trials: 72 (10% of search space) +Maximum number of trials: 200 (set by max trials) +``` + +--- + +### Optuna Search in Detail + +When performing an Optuna Search, Model Analyzer's goal is to maximize the configuration's `objective score`. 
First, +MA profiles the default configuration and assigns it an `objective score` of zero. All future configurations +are also assigned an `objective score`, with positive values indicating that the configuration is better than the default +configuration and negative values indicating that it performs worse. + +_Here is an example debug output:_ + +```yaml +Trial 7 of 200: + Creating model config: model_A_config_6 + Setting dynamic_batching to {'max_queue_delay_microseconds': 200} + Setting instance_group to [{'count': 4, 'kind': 'KIND_GPU'}] + Setting max_batch_size to 64 + + Profiling model_A_config_6: client batch size=4, concurrency=256 + Objective score for model_A_config_6: 57 --- Best: model_A_config_4 (83) +``` + ## Ensemble Model Search _This mode has the following limitations:_ diff --git a/docs/ensemble_quick_start.md b/docs/ensemble_quick_start.md index 18cbcb398..a8a5e79b7 100644 --- a/docs/ensemble_quick_start.md +++ b/docs/ensemble_quick_start.md @@ -45,7 +45,7 @@ git pull origin main **3. Add a version directory to ensemble_add_sub** ``` -mkdir examples/quick/ensemble_add_sub/1 +mkdir examples/quick-start/ensemble_add_sub/1 ``` ## `Step 2:` Pull and Run the SDK Container