From e000bb4219ca20bd611eb833e20ac2eeb567fea2 Mon Sep 17 00:00:00 2001
From: braf
Date: Fri, 29 Sep 2023 21:38:52 +0000
Subject: [PATCH] Adding new options for LLM

---
 .../config/input/config_command_profile.py | 121 ++++++++++--------
 .../config/input/config_defaults.py        |   5 +
 tests/test_cli.py                          |  42 +++---
 tests/test_model_config_measurement.py     |   2 +-
 4 files changed, 96 insertions(+), 74 deletions(-)

diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py
index 02d6def28..6b9fdb147 100755
--- a/model_analyzer/config/input/config_command_profile.py
+++ b/model_analyzer/config/input/config_command_profile.py
@@ -36,62 +36,7 @@
 from model_analyzer.triton.server.server_config import TritonServerConfig
 
 from .config_command import ConfigCommand
-from .config_defaults import (
-    DEFAULT_ALWAYS_REPORT_GPU_METRICS,
-    DEFAULT_BATCH_SIZES,
-    DEFAULT_CHECKPOINT_DIRECTORY,
-    DEFAULT_CLIENT_PROTOCOL,
-    DEFAULT_COLLECT_CPU_METRICS,
-    DEFAULT_DURATION_SECONDS,
-    DEFAULT_EXPORT_PATH,
-    DEFAULT_FILENAME_MODEL_GPU,
-    DEFAULT_FILENAME_MODEL_INFERENCE,
-    DEFAULT_FILENAME_SERVER_ONLY,
-    DEFAULT_GPU_OUTPUT_FIELDS,
-    DEFAULT_GPUS,
-    DEFAULT_INFERENCE_OUTPUT_FIELDS,
-    DEFAULT_MAX_RETRIES,
-    DEFAULT_MODEL_WEIGHTING,
-    DEFAULT_MONITORING_INTERVAL,
-    DEFAULT_NUM_CONFIGS_PER_MODEL,
-    DEFAULT_NUM_TOP_MODEL_CONFIGS,
-    DEFAULT_OFFLINE_OBJECTIVES,
-    DEFAULT_OFFLINE_PLOTS,
-    DEFAULT_ONLINE_OBJECTIVES,
-    DEFAULT_ONLINE_PLOTS,
-    DEFAULT_OUTPUT_MODEL_REPOSITORY,
-    DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG,
-    DEFAULT_PERF_ANALYZER_CPU_UTIL,
-    DEFAULT_PERF_ANALYZER_PATH,
-    DEFAULT_PERF_ANALYZER_TIMEOUT,
-    DEFAULT_PERF_MAX_AUTO_ADJUSTS,
-    DEFAULT_PERF_OUTPUT_FLAG,
-    DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS,
-    DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS,
-    DEFAULT_REQUEST_RATE_SEARCH_ENABLE,
-    DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS,
-    DEFAULT_RUN_CONFIG_MAX_CONCURRENCY,
-    DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT,
-    DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE,
-    DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE,
-    DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
-    DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT,
-    DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE,
-    DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE,
-    DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE,
-    DEFAULT_RUN_CONFIG_SEARCH_DISABLE,
-    DEFAULT_RUN_CONFIG_SEARCH_MODE,
-    DEFAULT_SERVER_OUTPUT_FIELDS,
-    DEFAULT_SKIP_DETAILED_REPORTS,
-    DEFAULT_SKIP_SUMMARY_REPORTS,
-    DEFAULT_TRITON_DOCKER_IMAGE,
-    DEFAULT_TRITON_GRPC_ENDPOINT,
-    DEFAULT_TRITON_HTTP_ENDPOINT,
-    DEFAULT_TRITON_INSTALL_PATH,
-    DEFAULT_TRITON_LAUNCH_MODE,
-    DEFAULT_TRITON_METRICS_URL,
-    DEFAULT_TRITON_SERVER_PATH,
-)
+from .config_defaults import *
 from .config_enum import ConfigEnum
 from .config_field import ConfigField
 from .config_list_generic import ConfigListGeneric
@@ -624,6 +569,24 @@ def _add_profile_models_configs(self):
                 " to be used during profiling",
             )
         )
+        self._add_config(
+            ConfigField(
+                "prompt_length",
+                flags=["--prompt-length"],
+                field_type=ConfigListNumeric(int),
+                description="Comma-delimited list of prompt length values or ranges "
+                " to be used when profiling LLMs",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "max_token_count",
+                flags=["--max-token-count"],
+                field_type=ConfigListNumeric(int),
+                description="Comma-delimited list of max token count values or ranges "
+                " to be used when profiling LLMs",
+            )
+        )
         self._add_config(
             ConfigField(
                 "reload_model_disable",
@@ -841,6 +804,52 @@ def _add_run_search_configs(self):
                 description="Enables the searching of request rate (instead of concurrency).",
             )
         )
+        self._add_config(
+            ConfigField(
+                "llm_search_enable",
+                flags=["--llm-search-enable"],
+                field_type=ConfigPrimitive(bool),
+                parser_args={"action": "store_true"},
+                default_value=DEFAULT_LLM_SEARCH_ENABLE,
+                description="Enables searching over values that are important to LLMs: prompt length, max token count, etc.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_min_prompt_length",
+                flags=["--run-config-search-min-prompt-length"],
+                field_type=ConfigPrimitive(int),
+                default_value=DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH,
+                description="Min prompt length that run config search should start with.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_max_prompt_length",
+                flags=["--run-config-search-max-prompt-length"],
+                field_type=ConfigPrimitive(int),
+                default_value=DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH,
+                description="Max prompt length that run config search will not go beyond.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_min_token_count",
+                flags=["--run-config-search-min-token-count"],
+                field_type=ConfigPrimitive(int),
+                default_value=DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT,
+                description="Min token count that run config search should start with.",
+            )
+        )
+        self._add_config(
+            ConfigField(
+                "run_config_search_max_token_count",
+                flags=["--run-config-search-max-token-count"],
+                field_type=ConfigPrimitive(int),
+                default_value=DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT,
+                description="Max token count that run config search will not go beyond.",
+            )
+        )
 
     def _add_triton_configs(self):
         """
diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py
index 67c62dca9..c2edd6e91 100755
--- a/model_analyzer/config/input/config_defaults.py
+++ b/model_analyzer/config/input/config_defaults.py
@@ -51,10 +51,15 @@
 DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1
 DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE = 128
 DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5
+DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH = 1
+DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH = 1000
+DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT = 1
+DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT = 256
 DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False
 DEFAULT_RUN_CONFIG_SEARCH_MODE = "brute"
 DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE = False
 DEFAULT_REQUEST_RATE_SEARCH_ENABLE = False
+DEFAULT_LLM_SEARCH_ENABLE = False
 DEFAULT_TRITON_LAUNCH_MODE = "local"
 DEFAULT_TRITON_DOCKER_IMAGE = "nvcr.io/nvidia/tritonserver:23.09-py3"
 DEFAULT_TRITON_HTTP_ENDPOINT = "localhost:8000"
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 98ec60237..35eecae13 100755
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -33,6 +33,7 @@
 from model_analyzer.cli.cli import CLI
 from model_analyzer.config.input.config_command_profile import ConfigCommandProfile
 from model_analyzer.config.input.config_command_report import ConfigCommandReport
+from model_analyzer.config.input.config_defaults import *
 from model_analyzer.config.input.config_defaults import DEFAULT_TRITON_DOCKER_IMAGE
 from model_analyzer.config.input.config_status import ConfigStatus
 from model_analyzer.constants import CONFIG_PARSER_SUCCESS
@@ -60,6 +61,7 @@ def get_test_options():
         OptionStruct("bool", "profile","--run-config-search-disable"),
         OptionStruct("bool", "profile","--run-config-profile-models-concurrently-enable"),
         OptionStruct("bool", "profile","--request-rate-search-enable"),
+        OptionStruct("bool", "profile","--llm-search-enable"),
         OptionStruct("bool", "profile","--reload-model-disable"),
"profile","--early-exit-enable"), OptionStruct("bool", "profile","--skip-summary-reports"), @@ -71,23 +73,27 @@ def get_test_options(): # The following options can be None: # short_option # expected_default_value - OptionStruct("int", "profile", "--client-max-retries", "-r", "125", "50"), - OptionStruct("int", "profile", "--duration-seconds", "-d", "10", "3"), - OptionStruct("int", "profile", "--perf-analyzer-timeout", None, "100", "600"), - OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", "10"), - OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", "1"), - OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", "1024"), - OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", "16"), - OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", "8192"), - OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", "1"), - OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", "128"), - OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", "1"), - OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", "5"), - OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", "5"), - OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", "1.0"), - OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * 80.0)), - OptionStruct("int", "profile", "--num-configs-per-model", None, "10", "3"), - OptionStruct("int", "profile", "--num-top-model-configs", None, "10", "0"), + OptionStruct("int", "profile", "--client-max-retries", "-r", "125", str(DEFAULT_MAX_RETRIES)), + OptionStruct("int", "profile", "--duration-seconds", "-d", "10", str(DEFAULT_DURATION_SECONDS)), + OptionStruct("int", "profile", "--perf-analyzer-timeout", None, "100", str(DEFAULT_PERF_ANALYZER_TIMEOUT)), + OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", str(DEFAULT_PERF_MAX_AUTO_ADJUSTS)), + OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", str(DEFAULT_RUN_CONFIG_MIN_CONCURRENCY)), + OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", str(DEFAULT_RUN_CONFIG_MAX_CONCURRENCY)), + OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", str(DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE)), + OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", str(DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE)), + OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", str(DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE)), + OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", str(DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE)), + OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", str(DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT)), + OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", str(DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT)), + OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", str(DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS)), + OptionStruct("int", "profile", "--run-config-search-min-prompt-length", None, "10", str(DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH)), + OptionStruct("int", "profile", "--run-config-search-max-prompt-length", None, "10", 
str(DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH)), + OptionStruct("int", "profile", "--run-config-search-min-token-count", None, "10", str(DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT)), + OptionStruct("int", "profile", "--run-config-search-max-token-count", None, "10", str(DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT)), + OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", str(DEFAULT_MONITORING_INTERVAL)), + OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * DEFAULT_PERF_ANALYZER_CPU_UTIL)), + OptionStruct("int", "profile", "--num-configs-per-model", None, "10", str(DEFAULT_NUM_CONFIGS_PER_MODEL)), + OptionStruct("int", "profile", "--num-top-model-configs", None, "10", str(DEFAULT_NUM_TOP_MODEL_CONFIGS)), OptionStruct("int", "profile", "--latency-budget", None, "200", None), OptionStruct("int", "profile", "--min-throughput", None, "300", None), @@ -135,6 +141,8 @@ def get_test_options(): OptionStruct("intlist", "profile", "--batch-sizes", "-b", "2, 4, 6", "1"), OptionStruct("intlist", "profile", "--concurrency", "-c", "1, 2, 3", None), OptionStruct("intlist", "profile", "--request-rate", None, "1, 2, 3", None), + OptionStruct("intlist", "profile", "--prompt-length", None, "1, 2, 3", None), + OptionStruct("intlist", "profile", "--max-token-count", None, "1, 2, 3", None), OptionStruct("stringlist", "profile", "--triton-docker-mounts", None, "a:b:c, d:e:f", None, extra_commands=["--triton-launch-mode", "docker"]), OptionStruct("stringlist", "profile", "--gpus", None, "a, b, c", "all"), OptionStruct("stringlist", "profile", "--inference-output-fields", None, "a, b, c", diff --git a/tests/test_model_config_measurement.py b/tests/test_model_config_measurement.py index e760f1efa..98f6e0732 100755 --- a/tests/test_model_config_measurement.py +++ b/tests/test_model_config_measurement.py @@ -16,7 +16,7 @@ import json import unittest -from unittest.mock import patch +from unittest.mock import MagicMock, patch from model_analyzer.result.model_config_measurement import ModelConfigMeasurement from tests.common.test_utils import convert_non_gpu_metrics_to_data, default_encode
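
Note on the new search bounds: the sketch below shows one plausible way the min/max
defaults added above could expand into a list of candidate values, assuming the same
power-of-two stepping Model Analyzer already applies between its concurrency bounds.
generate_search_values is a hypothetical helper written for illustration only; this
patch defines the options and defaults but not the sweep logic itself.

# Illustrative sketch (not part of this patch): expand the new LLM defaults
# into candidate sweep values, assuming power-of-two stepping as used for
# the existing concurrency search. generate_search_values is hypothetical.
from model_analyzer.config.input.config_defaults import (
    DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH,
    DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT,
    DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH,
    DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT,
)


def generate_search_values(min_value: int, max_value: int) -> list:
    """Double from min_value upward, always ending exactly at max_value."""
    values = []
    value = min_value
    while value < max_value:
        values.append(value)
        value *= 2
    values.append(max_value)
    return values


# [1, 2, 4, ..., 512, 1000]
print(generate_search_values(DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH,
                             DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH))
# [1, 2, 4, ..., 128, 256]
print(generate_search_values(DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT,
                             DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT))

With the patch applied, the same bounds can be pinned per run from the CLI, e.g.:
model-analyzer profile --profile-models <model> --llm-search-enable --run-config-search-max-token-count 512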