From 4e96d21798efae73be0ed0ef9a80d8b8b21019bc Mon Sep 17 00:00:00 2001
From: Brian Raf <92820864+nv-braf@users.noreply.github.com>
Date: Thu, 6 Jun 2024 22:04:22 -0700
Subject: [PATCH] Documentation update for Optuna (alpha release) (#895)

* Documentation update for Optuna (alpha release)

* More fixes based on PR
---
 README.md                    |   2 +
 docs/config.md               |  39 ++++++++++---
 docs/config_search.md        | 106 +++++++++++++++++++++++++++++++++++
 docs/ensemble_quick_start.md |   2 +-
 4 files changed, 139 insertions(+), 10 deletions(-)
diff --git a/README.md b/README.md
index 1cbe97e37..29d9a07e9 100644
--- a/README.md
+++ b/README.md
@@ -33,6 +33,8 @@ Triton Model Analyzer is a CLI tool which can help you find a more optimal confi
 
 ### Search Modes
 
+- [Optuna Search](docs/config_search.md#optuna-search-mode) **_-ALPHA RELEASE-_** allows you to search for every parameter that can be specified in the model configuration, using a hyperparameter optimization framework. Please see the [Optuna](https://optuna.org/) website if you are interested in specific details on how the algorithm functions.
+
 - [Quick Search](docs/config_search.md#quick-search-mode) will **sparsely** search the [Max Batch Size](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#maximum-batch-size),
   [Dynamic Batching](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher), and
   [Instance Group](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) spaces by utilizing a heuristic hill-climbing algorithm to help you quickly find a more optimal configuration
diff --git a/docs/config.md b/docs/config.md
index 875d15936..69a91e7b3 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -188,31 +188,31 @@ cpu_only_composing_models: <comma-delimited-string-list>
 # List of GPU UUIDs to be used for the profiling. Use 'all' to profile all the GPUs visible by CUDA
 [ gpus: <string|comma-delimited-list-string> | default: 'all' ]
 
-# Search mode. Options are "brute" and "quick"
+# Search mode. Options are "brute", "quick", and "optuna"
 [ run_config_search_mode: <string> | default: brute]
 
-# Minimum concurrency used for the automatic/quick config search
+# Minimum concurrency used for the automatic/quick/optuna config search
 [ run_config_search_min_concurrency: <int> | default: 1 ]
 
-# Maximum concurrency used for the automatic/quick config search
+# Maximum concurrency used for the automatic/quick/optuna config search
 [ run_config_search_max_concurrency: <int> | default: 1024 ]
 
-# Minimum max_batch_size used for the automatic/quick config search
+# Minimum max_batch_size used for the automatic/quick/optuna config search
 [ run_config_search_min_model_batch_size: <int> | default: 1 ]
 
-# Maximum max_batch_size used for the automatic/quick config search
+# Maximum max_batch_size used for the automatic/quick/optuna config search
 [ run_config_search_max_model_batch_size: <int> | default: 128 ]
 
-# Minimum instance group count used for the automatic/quick config search
+# Minimum instance group count used for the automatic/quick/optuna config search
 [ run_config_search_min_instance_count: <int> | default: 1 ]
 
-# Maximum instance group count used for the automatic/quick config search
+# Maximum instance group count used for the automatic/quick/optuna config search
 [ run_config_search_max_instance_count: <int> | default: 5 ]
 
-# Minimum request rate used for the automatic/quick config search
+# Minimum request rate used for the automatic/quick/optuna config search
 [ run_config_search_min_request_rate: <int> | default: 16 ]
 
-# Maximum request rate used for the automatic/quick config search
+# Maximum request rate used for the automatic/quick/optuna config search
 [ run_config_search_max_request_rate: <int> | default: 8092 ]
 
 # Maximum number of steps taken during a binary search
@@ -227,6 +227,27 @@ cpu_only_composing_models: <comma-delimited-string-list>
 # Enables the searching of request rate (instead of concurrency)
 [ request_rate_search_enable: <bool> | default: false]
 
+# Minimum percentage of the search space to profile when using Optuna
+[ min_percentage_of_search_space: <int> | default: 5]
+
+# Maximum percentage of the search space to profile when using Optuna
+[ max_percentage_of_search_space: <int> | default: 10]
+
+# Minimum number of trials to profile when using Optuna
+[ optuna_min_trials: <int> | default: None]
+
+# Maximum number of trials to profile when using Optuna
+[ optuna_max_trials: <int> | default: None]
+
+# Number of trials without improvement before triggering early exit when using Optuna
+[ optuna_early_exit_threshold: <int> | default: 10]
+
+# Use the concurrency formula instead of searching the concurrency space in Optuna search mode
+[ use_concurrency_formula: <bool> | default: false]
+
+# Disables the sweeping of concurrencies for the top-N models after quick/optuna search completion
+[ concurrency_sweep_disable: <bool> | default: false]
+
 # Always report GPU metrics, even if the model(s) is cpu_only
 [ always_report_gpu_metrics: <bool> | default: false]
 
diff --git a/docs/config_search.md b/docs/config_search.md
index b11c7296c..67bf47c6b 100644
--- a/docs/config_search.md
+++ b/docs/config_search.md
@@ -22,6 +22,7 @@ limitations under the License.
   - [Automatic Brute Search](#automatic-brute-search)
   - [Manual Brute Search](#manual-brute-search)
 - [Quick Search Mode](#quick-search-mode)
+- [Optuna Search Mode](#optuna-search-mode)
 - [Ensemble Model Search](#ensemble-model-search)
 - [BLS Model Search](#bls-model-search)
 - [LLM Search](#llm-search)
@@ -48,6 +49,9 @@ Model Analyzer's `profile` subcommand supports multiple modes when searching to
     - Single BLS models
     - Multiple models being profiled concurrently
   - **Command:** `--run-config-search-mode quick`
+- [Optuna Search](config_search.md#optuna-search-mode) **-ALPHA RELEASE-**
+  - **Search type:** Heuristic sweep using a hyperparameter optimization framework to find an optimal configuration
+  - **Command:** `--run-config-search-mode optuna`
 
 ---
 
@@ -276,6 +280,108 @@ profile_models:
 
 ---
 
+## Optuna Search Mode
+
+**-ALPHA RELEASE-**
+
+_This mode has the following limitations:_
+
+- **Ensemble, BLS or concurrent multi-model profiling is not supported**
+- **Profiling with request rate is not supported**
+
+This mode uses a hyperparameter optimization framework to search the configuration
+space, looking for the maximal objective value within the specified constraints.
+Please see the [Optuna](https://optuna.org/) website if you are interested in specific details on how the algorithm functions.
+
+Optuna allows you to search for every parameter that can be specified in the model configuration. Each parameter can be specified
+either with a min/max range (using the run-config-search options) or with an explicit list of values to test, set in the
+parameters/model_config_parameters field.
+
+After the Optuna search has found the best config(s), it will then sweep the top-N configurations found (specified by `--num-configs-per-model`) over the default concurrency range before generating the summary reports.
+
+---
+
+_An example model analyzer YAML config that performs an Optuna Search:_
+
+```yaml
+model_repository: /path/to/model/repository/
+
+run_config_search_mode: optuna
+profile_models:
+  - model_A
+```
+
+---
+
+A number of new configuration options were added to support tailoring the Optuna search to your needs:
+
+- `--min/max-percentage-of-search-space`: sets the percentage of the space you want Optuna to search
+- `--optuna-min/max-trials`: sets the number of trials Optuna will attempt
+- `--optuna-early-exit-threshold`: sets the number of trials without improvement before triggering early exit
+- `--use-concurrency-formula`: uses a formula (2 \* batch size \* instance group count), rather than sweeping concurrency
+
+---
+
+_An example that performs an Optuna Search using these new configuration options:_
+
+```yaml
+model_repository: /path/to/model/repository/
+
+run_config_search_mode: optuna
+run_config_search_max_instance_count: 8
+run_config_search_min_concurrency: 32
+run_config_search_max_concurrency: 256
+
+use_concurrency_formula: True
+min_percentage_of_search_space: 10
+optuna_max_trials: 200
+optuna_early_exit_threshold: 15
+
+profile_models:
+  model_A:
+    model_config_parameters:
+      max_batch_size: [1, 4, 8, 32, 64, 128]
+      dynamic_batching:
+        max_queue_delay_microseconds: [100, 200, 300]
+    parameters:
+      batch_sizes: 1, 2, 4, 8, 16
+```
+
+_The debug output showing how the space will be searched:_
+
+```yaml
+Number of configs in search space: 720
+   batch_sizes: [1, 2, 4, 8, 16] (5)
+   max_batch_size: [1, 4, 8, 32, 64, 128] (6)
+   instance_group: 1 to 8 (8)
+   max_queue_delay_microseconds: [100, 200, 300] (3)
+
+Minimum number of trials: 72 (10% of search space)
+Maximum number of trials: 200 (set by max trials)
+```
+
+---
+
+### Optuna Search in Detail
+
+When performing an Optuna Search, Model Analyzer's goal is to maximize the configuration's `objective score`. First,
+MA profiles the default configuration and assigns it an `objective score` of zero. All future configurations
+are also assigned an `objective score`, with positive values indicating the configuration is better than the default
+configuration and negative values indicating it performs worse.
+
+_Here is an example debug output:_
+
+```yaml
+Trial 7 of 200:
+  Creating model config: model_A_config_6
+  Setting dynamic_batching to {'max_queue_delay_microseconds': 200}
+  Setting instance_group to [{'count': 4, 'kind': 'KIND_GPU'}]
+  Setting max_batch_size to 64
+
+  Profiling model_A_config_6: client batch size=4, concurrency=256
+  Objective score for model_A_config_6: 57 --- Best: model_A_config_4 (83)
+```
+
 ## Ensemble Model Search
 
 _This mode has the following limitations:_
diff --git a/docs/ensemble_quick_start.md b/docs/ensemble_quick_start.md
index 18cbcb398..a8a5e79b7 100644
--- a/docs/ensemble_quick_start.md
+++ b/docs/ensemble_quick_start.md
@@ -45,7 +45,7 @@ git pull origin main
 **3. Add a version directory to ensemble_add_sub**
 
 ```
-mkdir examples/quick/ensemble_add_sub/1
+mkdir examples/quick-start/ensemble_add_sub/1
 ```
 
 ## `Step 2:` Pull and Run the SDK Container