Adding new options for LLM
nv-braf committed Oct 2, 2023
1 parent a387c11 commit e000bb4
Showing 4 changed files with 96 additions and 74 deletions.
121 changes: 65 additions & 56 deletions model_analyzer/config/input/config_command_profile.py
@@ -36,62 +36,7 @@
from model_analyzer.triton.server.server_config import TritonServerConfig

from .config_command import ConfigCommand
-from .config_defaults import (
-    DEFAULT_ALWAYS_REPORT_GPU_METRICS,
-    DEFAULT_BATCH_SIZES,
-    DEFAULT_CHECKPOINT_DIRECTORY,
-    DEFAULT_CLIENT_PROTOCOL,
-    DEFAULT_COLLECT_CPU_METRICS,
-    DEFAULT_DURATION_SECONDS,
-    DEFAULT_EXPORT_PATH,
-    DEFAULT_FILENAME_MODEL_GPU,
-    DEFAULT_FILENAME_MODEL_INFERENCE,
-    DEFAULT_FILENAME_SERVER_ONLY,
-    DEFAULT_GPU_OUTPUT_FIELDS,
-    DEFAULT_GPUS,
-    DEFAULT_INFERENCE_OUTPUT_FIELDS,
-    DEFAULT_MAX_RETRIES,
-    DEFAULT_MODEL_WEIGHTING,
-    DEFAULT_MONITORING_INTERVAL,
-    DEFAULT_NUM_CONFIGS_PER_MODEL,
-    DEFAULT_NUM_TOP_MODEL_CONFIGS,
-    DEFAULT_OFFLINE_OBJECTIVES,
-    DEFAULT_OFFLINE_PLOTS,
-    DEFAULT_ONLINE_OBJECTIVES,
-    DEFAULT_ONLINE_PLOTS,
-    DEFAULT_OUTPUT_MODEL_REPOSITORY,
-    DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG,
-    DEFAULT_PERF_ANALYZER_CPU_UTIL,
-    DEFAULT_PERF_ANALYZER_PATH,
-    DEFAULT_PERF_ANALYZER_TIMEOUT,
-    DEFAULT_PERF_MAX_AUTO_ADJUSTS,
-    DEFAULT_PERF_OUTPUT_FLAG,
-    DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS,
-    DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS,
-    DEFAULT_REQUEST_RATE_SEARCH_ENABLE,
-    DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS,
-    DEFAULT_RUN_CONFIG_MAX_CONCURRENCY,
-    DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT,
-    DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE,
-    DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE,
-    DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
-    DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT,
-    DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE,
-    DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE,
-    DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE,
-    DEFAULT_RUN_CONFIG_SEARCH_DISABLE,
-    DEFAULT_RUN_CONFIG_SEARCH_MODE,
-    DEFAULT_SERVER_OUTPUT_FIELDS,
-    DEFAULT_SKIP_DETAILED_REPORTS,
-    DEFAULT_SKIP_SUMMARY_REPORTS,
-    DEFAULT_TRITON_DOCKER_IMAGE,
-    DEFAULT_TRITON_GRPC_ENDPOINT,
-    DEFAULT_TRITON_HTTP_ENDPOINT,
-    DEFAULT_TRITON_INSTALL_PATH,
-    DEFAULT_TRITON_LAUNCH_MODE,
-    DEFAULT_TRITON_METRICS_URL,
-    DEFAULT_TRITON_SERVER_PATH,
-)
+from .config_defaults import *
from .config_enum import ConfigEnum
from .config_field import ConfigField
from .config_list_generic import ConfigListGeneric
@@ -624,6 +569,24 @@ def _add_profile_models_configs(self):
" to be used during profiling",
)
)
self._add_config(
ConfigField(
"prompt_length",
flags=["--prompt-length"],
field_type=ConfigListNumeric(int),
description="Comma-delimited list of prompt length values or ranges <start:end:step>"
" to be used during profiling LLMs",
)
)
self._add_config(
ConfigField(
"max_token_count",
flags=["--max-token-count"],
field_type=ConfigListNumeric(int),
description="Comma-delimited list of max token values or ranges <start:end:step>"
" to be used during profiling LLMs",
)
)
self._add_config(
ConfigField(
"reload_model_disable",
@@ -841,6 +804,52 @@ def _add_run_search_configs(self):
description="Enables the searching of request rate (instead of concurrency).",
)
)
self._add_config(
ConfigField(
"llm_search_enable",
flags=["--llm-search-enable"],
field_type=ConfigPrimitive(bool),
parser_args={"action": "store_true"},
default_value=DEFAULT_LLM_SEARCH_ENABLE,
description="Enables searching values are important to LLMs: prompt length, max token, etc...",
)
)
self._add_config(
ConfigField(
"run_config_search_min_prompt_length",
flags=["--run-config-search-min-prompt-length"],
field_type=ConfigPrimitive(int),
default_value=DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH,
description="Min prompt length that run config search should start with.",
)
)
self._add_config(
ConfigField(
"run_config_search_max_prompt_length",
flags=["--run-config-search-max-prompt-length"],
field_type=ConfigPrimitive(int),
default_value=DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH,
description="Max prompt length that run config search will not go beyond.",
)
)
self._add_config(
ConfigField(
"run_config_search_min_token_count",
flags=["--run-config-search-min-token-count"],
field_type=ConfigPrimitive(int),
default_value=DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT,
description="Min token count that run config search should start with.",
)
)
self._add_config(
ConfigField(
"run_config_search_max_token_count",
flags=["--run-config-search-max-token-count"],
field_type=ConfigPrimitive(int),
default_value=DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT,
description="Max token count that run config search will not go beyond.",
)
)

    def _add_triton_configs(self):
        """
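
Both new flags accept a comma-delimited list in which each token is either a single value or a <start:end:step> range. ConfigListNumeric's actual parser is not part of this diff; the sketch below is only an illustrative approximation, assuming ranges expand inclusively with the given step:

    # Illustrative approximation only; the real parsing lives in
    # ConfigListNumeric and is not shown in this commit.
    def expand_numeric_list(spec: str) -> list[int]:
        values = []
        for token in spec.split(","):
            token = token.strip()
            if ":" in token:  # "<start:end:step>" range, assumed end-inclusive
                start, end, step = (int(part) for part in token.split(":"))
                values.extend(range(start, end + 1, step))
            else:  # plain single value
                values.append(int(token))
        return values

    # e.g. --prompt-length "16,128:512:128" -> [16, 128, 256, 384, 512]
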
5 changes: 5 additions & 0 deletions model_analyzer/config/input/config_defaults.py
@@ -51,10 +51,15 @@
DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1
DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE = 128
DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5
+DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH = 1
+DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH = 1000
+DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT = 1
+DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT = 256
DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False
DEFAULT_RUN_CONFIG_SEARCH_MODE = "brute"
DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE = False
DEFAULT_REQUEST_RATE_SEARCH_ENABLE = False
+DEFAULT_LLM_SEARCH_ENABLE = False
DEFAULT_TRITON_LAUNCH_MODE = "local"
DEFAULT_TRITON_DOCKER_IMAGE = "nvcr.io/nvidia/tritonserver:23.09-py3"
DEFAULT_TRITON_HTTP_ENDPOINT = "localhost:8000"
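
The new defaults bound how far an automatic search may sweep the LLM parameters. The sweep logic itself is not in this commit; assuming it mirrors the existing concurrency search, which doubles from the minimum until the maximum is exceeded, the candidate values would look like this hypothetical sketch:

    # Hypothetical sweep, assuming the LLM search doubles from min to max
    # like the existing concurrency search; the real logic is not shown here.
    def doubling_sweep(minimum: int, maximum: int) -> list[int]:
        value, values = minimum, []
        while value <= maximum:
            values.append(value)
            value *= 2
        return values

    doubling_sweep(1, 256)   # token counts: [1, 2, 4, 8, 16, 32, 64, 128, 256]
    doubling_sweep(1, 1000)  # prompt lengths: [1, 2, 4, ..., 512]
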
42 changes: 25 additions & 17 deletions tests/test_cli.py
Expand Up @@ -33,6 +33,7 @@
from model_analyzer.cli.cli import CLI
from model_analyzer.config.input.config_command_profile import ConfigCommandProfile
from model_analyzer.config.input.config_command_report import ConfigCommandReport
+from model_analyzer.config.input.config_defaults import *
from model_analyzer.config.input.config_defaults import DEFAULT_TRITON_DOCKER_IMAGE
from model_analyzer.config.input.config_status import ConfigStatus
from model_analyzer.constants import CONFIG_PARSER_SUCCESS
@@ -60,6 +61,7 @@ def get_test_options():
OptionStruct("bool", "profile","--run-config-search-disable"),
OptionStruct("bool", "profile","--run-config-profile-models-concurrently-enable"),
OptionStruct("bool", "profile","--request-rate-search-enable"),
OptionStruct("bool", "profile","--llm-search-enable"),
OptionStruct("bool", "profile","--reload-model-disable"),
OptionStruct("bool", "profile","--early-exit-enable"),
OptionStruct("bool", "profile","--skip-summary-reports"),
@@ -71,23 +73,27 @@
        # The following options can be None:
        #   short_option
        #   expected_default_value
-        OptionStruct("int", "profile", "--client-max-retries", "-r", "125", "50"),
-        OptionStruct("int", "profile", "--duration-seconds", "-d", "10", "3"),
-        OptionStruct("int", "profile", "--perf-analyzer-timeout", None, "100", "600"),
-        OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", "10"),
-        OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", "1"),
-        OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", "1024"),
-        OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", "16"),
-        OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", "8192"),
-        OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", "1"),
-        OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", "128"),
-        OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", "1"),
-        OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", "5"),
-        OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", "5"),
-        OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", "1.0"),
-        OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * 80.0)),
-        OptionStruct("int", "profile", "--num-configs-per-model", None, "10", "3"),
-        OptionStruct("int", "profile", "--num-top-model-configs", None, "10", "0"),
+        OptionStruct("int", "profile", "--client-max-retries", "-r", "125", str(DEFAULT_MAX_RETRIES)),
+        OptionStruct("int", "profile", "--duration-seconds", "-d", "10", str(DEFAULT_DURATION_SECONDS)),
+        OptionStruct("int", "profile", "--perf-analyzer-timeout", None, "100", str(DEFAULT_PERF_ANALYZER_TIMEOUT)),
+        OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", str(DEFAULT_PERF_MAX_AUTO_ADJUSTS)),
+        OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", str(DEFAULT_RUN_CONFIG_MIN_CONCURRENCY)),
+        OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", str(DEFAULT_RUN_CONFIG_MAX_CONCURRENCY)),
+        OptionStruct("int", "profile", "--run-config-search-min-request-rate", None, "2", str(DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE)),
+        OptionStruct("int", "profile", "--run-config-search-max-request-rate", None, "100", str(DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE)),
+        OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", str(DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE)),
+        OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", str(DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE)),
+        OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", str(DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT)),
+        OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", str(DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT)),
+        OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", str(DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS)),
+        OptionStruct("int", "profile", "--run-config-search-min-prompt-length", None, "10", str(DEFAULT_RUN_CONFIG_MIN_PROMPT_LENGTH)),
+        OptionStruct("int", "profile", "--run-config-search-max-prompt-length", None, "10", str(DEFAULT_RUN_CONFIG_MAX_PROMPT_LENGTH)),
+        OptionStruct("int", "profile", "--run-config-search-min-token-count", None, "10", str(DEFAULT_RUN_CONFIG_MIN_TOKEN_COUNT)),
+        OptionStruct("int", "profile", "--run-config-search-max-token-count", None, "10", str(DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT)),
+        OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", str(DEFAULT_MONITORING_INTERVAL)),
+        OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * DEFAULT_PERF_ANALYZER_CPU_UTIL)),
+        OptionStruct("int", "profile", "--num-configs-per-model", None, "10", str(DEFAULT_NUM_CONFIGS_PER_MODEL)),
+        OptionStruct("int", "profile", "--num-top-model-configs", None, "10", str(DEFAULT_NUM_TOP_MODEL_CONFIGS)),
        OptionStruct("int", "profile", "--latency-budget", None, "200", None),
        OptionStruct("int", "profile", "--min-throughput", None, "300", None),

@@ -135,6 +141,8 @@ def get_test_options():
OptionStruct("intlist", "profile", "--batch-sizes", "-b", "2, 4, 6", "1"),
OptionStruct("intlist", "profile", "--concurrency", "-c", "1, 2, 3", None),
OptionStruct("intlist", "profile", "--request-rate", None, "1, 2, 3", None),
OptionStruct("intlist", "profile", "--prompt-length", None, "1, 2, 3", None),
OptionStruct("intlist", "profile", "--max-token-count", None, "1, 2, 3", None),
OptionStruct("stringlist", "profile", "--triton-docker-mounts", None, "a:b:c, d:e:f", None, extra_commands=["--triton-launch-mode", "docker"]),
OptionStruct("stringlist", "profile", "--gpus", None, "a, b, c", "all"),
OptionStruct("stringlist", "profile", "--inference-output-fields", None, "a, b, c",
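
Replacing the hardcoded expected defaults with the DEFAULT_* constants (pulled in by the new wildcard import) keeps these tests in sync with config_defaults.py instead of breaking whenever a default changes. The OptionStruct harness itself is not shown in this diff; the argparse sketch below is a rough, hypothetical stand-in for the default/override check it presumably performs on one of the new flags:

    # Rough, hypothetical stand-in for the OptionStruct default/override
    # check; the real assertions live in the shared test utilities.
    import argparse

    from model_analyzer.config.input.config_defaults import (
        DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT,
    )

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--run-config-search-max-token-count",
        type=int,
        default=DEFAULT_RUN_CONFIG_MAX_TOKEN_COUNT,
    )

    # Omitting the flag falls back to the default (256 per config_defaults.py).
    assert parser.parse_args([]).run_config_search_max_token_count == 256

    # Passing the flag overrides the default.
    args = parser.parse_args(["--run-config-search-max-token-count", "10"])
    assert args.run_config_search_max_token_count == 10
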
2 changes: 1 addition & 1 deletion tests/test_model_config_measurement.py
@@ -16,7 +16,7 @@

import json
import unittest
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch

from model_analyzer.result.model_config_measurement import ModelConfigMeasurement
from tests.common.test_utils import convert_non_gpu_metrics_to_data, default_encode
