Adding new options for LLM
nv-braf committed Oct 2, 2023
1 parent a387c11 commit e000bb4
Showing 4 changed files with 96 additions and 74 deletions.
121 changes: 65 additions & 56 deletions model_analyzer/config/input/config_command_profile.py
@@ -36,62 +36,7 @@
from model_analyzer.triton.server.server_config import TritonServerConfig

from .config_command import ConfigCommand
-from .config_defaults import (
-    DEFAULT_ALWAYS_REPORT_GPU_METRICS,
-    DEFAULT_BATCH_SIZES,
-    DEFAULT_CHECKPOINT_DIRECTORY,
-    DEFAULT_CLIENT_PROTOCOL,
-    DEFAULT_COLLECT_CPU_METRICS,
-    DEFAULT_DURATION_SECONDS,
-    DEFAULT_EXPORT_PATH,
-    DEFAULT_FILENAME_MODEL_GPU,
-    DEFAULT_FILENAME_MODEL_INFERENCE,
-    DEFAULT_FILENAME_SERVER_ONLY,
-    DEFAULT_GPU_OUTPUT_FIELDS,
-    DEFAULT_GPUS,
-    DEFAULT_INFERENCE_OUTPUT_FIELDS,
-    DEFAULT_MAX_RETRIES,
-    DEFAULT_MODEL_WEIGHTING,
-    DEFAULT_MONITORING_INTERVAL,
-    DEFAULT_NUM_CONFIGS_PER_MODEL,
-    DEFAULT_NUM_TOP_MODEL_CONFIGS,
-    DEFAULT_OFFLINE_OBJECTIVES,
-    DEFAULT_OFFLINE_PLOTS,
-    DEFAULT_ONLINE_OBJECTIVES,
-    DEFAULT_ONLINE_PLOTS,
-    DEFAULT_OUTPUT_MODEL_REPOSITORY,
-    DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG,
-    DEFAULT_PERF_ANALYZER_CPU_UTIL,
-    DEFAULT_PERF_ANALYZER_PATH,
-    DEFAULT_PERF_ANALYZER_TIMEOUT,
-    DEFAULT_PERF_MAX_AUTO_ADJUSTS,
-    DEFAULT_PERF_OUTPUT_FLAG,
-    DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS,
-    DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS,
-    DEFAULT_REQUEST_RATE_SEARCH_ENABLE,
-    DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS,
-    DEFAULT_RUN_CONFIG_MAX_CONCURRENCY,
-    DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT,
-    DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE,
-    DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE,
-    DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
-    DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT,
-    DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE,
-    DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE,
-    DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE,
-    DEFAULT_RUN_CONFIG_SEARCH_DISABLE,
-    DEFAULT_RUN_CONFIG_SEARCH_MODE,
-    DEFAULT_SERVER_OUTPUT_FIELDS,
-    DEFAULT_SKIP_DETAILED_REPORTS,
-    DEFAULT_SKIP_SUMMARY_REPORTS,
-    DEFAULT_TRITON_DOCKER_IMAGE,
-    DEFAULT_TRITON_GRPC_ENDPOINT,
-    DEFAULT_TRITON_HTTP_ENDPOINT,
-    DEFAULT_TRITON_INSTALL_PATH,
-    DEFAULT_TRITON_LAUNCH_MODE,
-    DEFAULT_TRITON_METRICS_URL,
-    DEFAULT_TRITON_SERVER_PATH,
-)
+from .config_defaults import *
from .config_enum import ConfigEnum
from .config_field import ConfigField
from .config_list_generic import ConfigListGeneric
@@ -624,6 +569,24 @@ def _add_profile_models_configs(self):
" to be used during profiling",
)
)
self._add_config(
ConfigField(
"prompt_length",
flags=["--prompt-length"],
field_type=ConfigListNumeric(int),
description="Comma-delimited list of prompt length values or ranges <start:end:step>"
" to be used during profiling LLMs",
)
)
self._add_config(
ConfigField(
"max_token_count",
flags=["--max-token-count"],
field_type=ConfigListNumeric(int),
description="Comma-delimited list of max token values or ranges <start:end:step>"
" to be used during profiling LLMs",
)
)
self._add_config(
ConfigField(
"reload_model_disable",
@@ -841,6 +804,52 @@ def _add_run_search_configs(self):
description="Enables the searching of request rate (instead of concurrency).",
)
)
self._add_config(
ConfigField(
"llm_search_enable",
flags=["--llm-search-enable"],
field_type=ConfigPrimitive(bool),
parser_args={"action": "store_true"},
default_value=DEFAULT_LLM_SEARCH_ENABLE,
description="Enables searching values are important to LLMs: prompt length, max token, etc...",
)
)
self._add_config(
ConfigField(
"run_config_search_min_prompt_length",
flags=["--run-config-search-min-prompt-length"],
field_type=ConfigPrimitive(int),
default_value=DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH,
description="Min prompt length that run config search should start with.",
)
)
self._add_config(
ConfigField(
"run_config_search_max_prompt_length",
flags=["--run-config-search-max-prompt-length"],
field_type=ConfigPrimitive(int),
default_value=DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH,
description="Max prompt length that run config search will not go beyond.",
)
)
self._add_config(
ConfigField(
"run_config_search_min_token_count",
flags=["--run-config-search-min-token-count"],
field_type=ConfigPrimitive(int),
default_value=DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT,
description="Min token count that run config search should start with.",
)
)
self._add_config(
ConfigField(
"run_config_search_max_token_count",
flags=["--run-config-search-max-token-count"],
field_type=ConfigPrimitive(int),
default_value=DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT,
description="Max token count that run config search will not go beyond.",
)
)

    def _add_triton_configs(self):
        """
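
Both new flags accept a comma-delimited list in which each token is either a single value or a <start:end:step> range. ConfigListNumeric's actual parser is not part of this diff; the sketch below is only an illustrative approximation, assuming ranges expand inclusively with the given step:

    # Illustrative approximation only; the real parsing lives in
    # ConfigListNumeric and is not shown in this commit.
    def expand_numeric_list(spec: str) -> list[int]:
        values = []
        for token in spec.split(","):
            token = token.strip()
            if ":" in token:  # "<start:end:step>" range, assumed end-inclusive
                start, end, step = (int(part) for part in token.split(":"))
                values.extend(range(start, end + 1, step))
            else:  # plain single value
                values.append(int(token))
        return values

    # e.g. --prompt-length "16,128:512:128" -> [16, 128, 256, 384, 512]
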
5 changes: 5 additions & 0 deletions model_analyzer/config/input/config_defaults.py
@@ -51,10 +51,15 @@
DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1
DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE = 128
DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5
+DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH = 1
+DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH = 1000
+DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT = 1
+DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT = 256
DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False
DEFAULT_RUN_CONFIG_SEARCH_MODE = "brute"
DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE = False
DEFAULT_REQUEST_RATE_SEARCH_ENABLE = False
+DEFAULT_LLM_SEARCH_ENABLE = False
DEFAULT_TRITON_LAUNCH_MODE = "local"
DEFAULT_TRITON_DOCKER_IMAGE = "nvcr.io/nvidia/tritonserver:23.09-py3"
DEFAULT_TRITON_HTTP_ENDPOINT = "localhost:8000"
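
The new defaults bound how far an automatic search may sweep the LLM parameters. The sweep logic itself is not in this commit; assuming it mirrors the existing concurrency search, which doubles from the minimum until the maximum is exceeded, the candidate values would look like this hypothetical sketch:

    # Hypothetical sweep, assuming the LLM search doubles from min to max
    # like the existing concurrency search; the real logic is not shown here.
    def doubling_sweep(minimum: int, maximum: int) -> list[int]:
        value, values = minimum, []
        while value <= maximum:
            values.append(value)
            value *= 2
        return values

    doubling_sweep(1, 256)   # token counts: [1, 2, 4, 8, 16, 32, 64, 128, 256]
    doubling_sweep(1, 1000)  # prompt lengths: [1, 2, 4, ..., 512]
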
42 changes: 25 additions & 17 deletions tests/test_cli.py
Expand Up @@ -33,6 +33,7 @@
from model_analyzer.cli.cli import CLI
from model_analyzer.config.input.config_command_profile import ConfigCommandProfile
from model_analyzer.config.input.config_command_report import ConfigCommandReport
+from model_analyzer.config.input.config_defaults import *
from model_analyzer.config.input.config_defaults import DEFAULT_TRITON_DOCKER_IMAGE
from model_analyzer.config.input.config_status import ConfigStatus
from model_analyzer.constants import CONFIG_PARSER_SUCCESS
@@ -60,6 +61,7 @@ def get_test_options():
OptionStruct("bool", "profile","--run-config-search-disable"),
OptionStruct("bool", "profile","--run-config-profile-models-concurrently-enable"),
OptionStruct("bool", "profile","--request-rate-search-enable"),
OptionStruct("bool", "profile","--llm-search-enable"),
OptionStruct("bool", "profile","--reload-model-disable"),
OptionStruct("bool", "profile","--early-exit-enable"),
OptionStruct("bool", "profile","--skip-summary-reports"),
@@ -71,23 +73,27 @@
        # The following options can be None:
        #   short_option
        #   expected_default_value
-        OptionStruct("int", "profile", "--client-max-retries", "-r", "125", "50"),
-        OptionStruct("int", "profile", "--duration-seconds", "-d", "10", "3"),
-        OptionStruct("int", "profile", "--perf-analyzer-timeout", None, "100", "600"),
-        OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", "10"),
-        OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", "1"),
-        OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", "1024"),
-        OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", "16"),
-        OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", "8192"),
-        OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", "1"),
-        OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", "128"),
-        OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", "1"),
-        OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", "5"),
-        OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", "5"),
-        OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", "1.0"),
-        OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * 80.0)),
-        OptionStruct("int", "profile", "--num-configs-per-model", None, "10", "3"),
-        OptionStruct("int", "profile", "--num-top-model-configs", None, "10", "0"),
+        OptionStruct("int", "profile", "--client-max-retries", "-r", "125", str(DEFAULT_MAX_RETRIES)),
+        OptionStruct("int", "profile", "--duration-seconds", "-d", "10", str(DEFAULT_DURATION_SECONDS)),
+        OptionStruct("int", "profile", "--perf-analyzer-timeout", None, "100", str(DEFAULT_PERF_ANALYZER_TIMEOUT)),
+        OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", str(DEFAULT_PERF_MAX_AUTO_ADJUSTS)),
+        OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", str(DEFAULT_RUN_CONFIG_MIN_CONCURRENCY)),
+        OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", str(DEFAULT_RUN_CONFIG_MAX_CONCURRENCY)),
+        OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", str(DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE)),
+        OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", str(DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE)),
+        OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", str(DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE)),
+        OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", str(DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE)),
+        OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", str(DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT)),
+        OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", str(DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT)),
+        OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", str(DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS)),
+        OptionStruct("int", "profile", "--run-config-search-min-prompt-length", None, "10", str(DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH)),
+        OptionStruct("int", "profile", "--run-config-search-max-prompt-length", None, "10", str(DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH)),
+        OptionStruct("int", "profile", "--run-config-search-min-token-count", None, "10", str(DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT)),
+        OptionStruct("int", "profile", "--run-config-search-max-token-count", None, "10", str(DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT)),
+        OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", str(DEFAULT_MONITORING_INTERVAL)),
+        OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * DEFAULT_PERF_ANALYZER_CPU_UTIL)),
+        OptionStruct("int", "profile", "--num-configs-per-model", None, "10", str(DEFAULT_NUM_CONFIGS_PER_MODEL)),
+        OptionStruct("int", "profile", "--num-top-model-configs", None, "10", str(DEFAULT_NUM_TOP_MODEL_CONFIGS)),
        OptionStruct("int", "profile", "--latency-budget", None, "200", None),
        OptionStruct("int", "profile", "--min-throughput", None, "300", None),

@@ -135,6 +141,8 @@ def get_test_options():
OptionStruct("intlist", "profile", "--batch-sizes", "-b", "2, 4, 6", "1"),
OptionStruct("intlist", "profile", "--concurrency", "-c", "1, 2, 3", None),
OptionStruct("intlist", "profile", "--request-rate", None, "1, 2, 3", None),
OptionStruct("intlist", "profile", "--prompt-length", None, "1, 2, 3", None),
OptionStruct("intlist", "profile", "--max-token-count", None, "1, 2, 3", None),
OptionStruct("stringlist", "profile", "--triton-docker-mounts", None, "a:b:c, d:e:f", None, extra_commands=["--triton-launch-mode", "docker"]),
OptionStruct("stringlist", "profile", "--gpus", None, "a, b, c", "all"),
OptionStruct("stringlist", "profile", "--inference-output-fields", None, "a, b, c",
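
Replacing the hardcoded expected defaults with the DEFAULT_* constants (pulled in by the new wildcard import) keeps these tests in sync with config_defaults.py instead of breaking whenever a default changes. The OptionStruct harness itself is not shown in this diff; the argparse sketch below is a rough, hypothetical stand-in for the default/override check it presumably performs on one of the new flags:

    # Rough, hypothetical stand-in for the OptionStruct default/override
    # check; the real assertions live in the shared test utilities.
    import argparse

    from model_analyzer.config.input.config_defaults import (
        DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT,
    )

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--run-config-search-max-token-count",
        type=int,
        default=DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT,
    )

    # Omitting the flag falls back to the default (256 per config_defaults.py).
    assert parser.parse_args([]).run_config_search_max_token_count == 256

    # Passing the flag overrides the default.
    args = parser.parse_args(["--run-config-search-max-token-count", "10"])
    assert args.run_config_search_max_token_count == 10
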
2 changes: 1 addition & 1 deletion tests/test_model_config_measurement.py
@@ -16,7 +16,7 @@

import json
import unittest
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch

from model_analyzer.result.model_config_measurement import ModelConfigMeasurement
from tests.common.test_utils import convert_non_gpu_metrics_to_data, default_encode
