Merge LLM Metrics changes to main #855

Merged · 7 commits · Apr 8, 2024
22 changes: 14 additions & 8 deletions docs/config.md
@@ -236,6 +236,9 @@ cpu_only_composing_models: <comma-delimited-string-list>
# Skips the generation of detailed reports and tables
[ skip_detailed_reports: <bool> | default: false]

# Type of model being profiled: generic or LLM
[ model_type: <string> | default: generic]

# Number of top configs to show in summary plots
[ num_configs_per_model: <int> | default: 3]

@@ -364,14 +367,17 @@ Before proceeding, it will be helpful to see the documentation on [Model Analyze

### `<constraint>`

A constraint, specifies the bounds that determine a successful run. There are
three constraints allowed:

| Option Name | Units | Constraint | Description |
| :----------------- | :-------: | :--------: | :--------------------------------------------------- |
| `perf_throughput` | inf / sec | min | Specify minimum desired throughput. |
| `perf_latency_p99` | ms | max | Specify maximum tolerable latency or latency budget. |
| `gpu_used_memory` | MB | max | Specify maximum GPU memory used by model. |
A constraint specifies the bounds that determine a successful run. The table below shows examples
of the types of constraints allowed:

| Option Name | Units | Constraint | Description |
| :------------------------ | :-------: | :--------: | :----------------------------------------------------- |
| `perf_throughput` | inf / sec | min | Specify minimum desired throughput. |
| `perf_latency_p99` | ms | max | Specify maximum tolerable latency or latency budget. |
| `output_token_throughput` | tok / sec | min | Specify minimum desired output token throughput. |
| `inter_token_latency_p99` | ms        | max        | Specify maximum tolerable inter-token latency.          |
| `time_to_first_token_p99` | ms | max | Specify maximum tolerable time to first token latency. |
| `gpu_used_memory` | MB | max | Specify maximum GPU memory used by model. |

<br>
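For illustration, here is a minimal sketch of a profiling config exercising the new options above; the model name and bound values are hypothetical (equivalently, `model_type` can be set with the new `--model-type` command-line flag):

```yaml
# Hypothetical config; the model name and constraint values are illustrative only.
model_repository: /path/to/model/repository
model_type: LLM

profile_models:
  my_llm:
    constraints:
      time_to_first_token_p99:
        max: 200          # ms
      inter_token_latency_p99:
        max: 50           # ms
      output_token_throughput:
        min: 100          # tok / sec
```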

@@ -129,7 +129,7 @@ def _generate_subset(
self._send_results_to_generator(index)

def _make_run_config(self) -> RunConfig:
run_config = RunConfig(self._triton_env)
run_config = RunConfig(self._triton_env, self._models[0].genai_perf_flags())
for index in range(len(self._models)):
run_config.add_model_run_config(self._curr_model_run_configs[index])
return run_config
131 changes: 131 additions & 0 deletions model_analyzer/config/input/config_command_profile.py
@@ -31,6 +31,7 @@
)
from model_analyzer.constants import LOGGER_NAME
from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
from model_analyzer.perf_analyzer.genai_perf_config import GenaiPerfConfig
from model_analyzer.perf_analyzer.perf_config import PerfAnalyzerConfig
from model_analyzer.record.record import RecordType
from model_analyzer.triton.server.server_config import TritonServerConfig
@@ -50,7 +51,9 @@
DEFAULT_GPU_OUTPUT_FIELDS,
DEFAULT_GPUS,
DEFAULT_INFERENCE_OUTPUT_FIELDS,
DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS,
DEFAULT_MAX_RETRIES,
DEFAULT_MODEL_TYPE,
DEFAULT_MODEL_WEIGHTING,
DEFAULT_MONITORING_INTERVAL,
DEFAULT_NUM_CONFIGS_PER_MODEL,
@@ -297,6 +300,15 @@ def _fill_config(self):
description="Skips the generation of detailed summary reports and tables.",
)
)
self._add_config(
ConfigField(
"model_type",
flags=["--model-type"],
field_type=ConfigPrimitive(str),
default_value=DEFAULT_MODEL_TYPE,
description="Type of model being profiled: generic or LLM",
)
)

self._add_repository_configs()
self._add_client_configs()
@@ -364,6 +376,10 @@ def _add_profile_models_configs(self):
}
)

genai_perf_flags_scheme = ConfigObject(
schema={k: ConfigPrimitive(str) for k in GenaiPerfConfig.allowed_keys()}
)

triton_server_environment_scheme = ConfigObject(
schema={"*": ConfigPrimitive(str)}
)
@@ -444,6 +460,13 @@ def _add_profile_models_configs(self):
description="Allows custom configuration of the perf analyzer instances used by model analyzer.",
)
)
self._add_config(
ConfigField(
"genai_perf_flags",
field_type=genai_perf_flags_scheme,
description="Allows custom configuration of the GenAI Perf instances used by model analyzer.",
)
)
self._add_config(
ConfigField(
"triton_server_flags",
@@ -484,6 +507,11 @@ def _add_profile_models_configs(self):
"min": ConfigPrimitive(int),
}
),
"output_token_throughput": ConfigObject(
schema={
"min": ConfigPrimitive(int),
}
),
"perf_latency_avg": ConfigObject(
schema={
"max": ConfigPrimitive(int),
@@ -514,6 +542,96 @@ def _add_profile_models_configs(self):
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p99": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p95": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p90": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p75": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p50": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p25": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_min": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_max": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_avg": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p99": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p95": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p90": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p75": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p50": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p25": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_min": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_max": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_avg": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
}
)
self._add_config(
@@ -560,6 +678,7 @@ def _add_profile_models_configs(self):
"weighting": ConfigPrimitive(type_=int),
"model_config_parameters": model_config_fields,
"perf_analyzer_flags": perf_analyzer_flags_scheme,
"genai_perf_flags": genai_perf_flags_scheme,
"triton_server_flags": triton_server_flags_scheme,
"triton_server_environment": triton_server_environment_scheme,
"triton_docker_args": triton_docker_args_scheme,
@@ -1344,6 +1463,12 @@ def _autofill_values(self):
if not self._fields["gpu_output_fields"].is_set_by_user():
self.gpu_output_fields = DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS

# Switch default output fields if user specifies model type of LLM
# and the user didn't specify a custom output field
if self.model_type == "LLM":
if not self._fields["inference_output_fields"].is_set_by_user():
self.inference_output_fields = DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS

new_profile_models = {}
for i, model in enumerate(self.profile_models):
new_model = {"cpu_only": (model.cpu_only() or cpu_only)}
@@ -1447,6 +1572,12 @@ def _autofill_values(self):
else:
new_model["perf_analyzer_flags"] = model.perf_analyzer_flags()

# GenAI Perf flags
if not model.genai_perf_flags():
new_model["genai_perf_flags"] = self.genai_perf_flags
else:
new_model["genai_perf_flags"] = model.genai_perf_flags()

# triton server flags
if not model.triton_server_flags():
new_model["triton_server_flags"] = self.triton_server_flags
15 changes: 15 additions & 0 deletions model_analyzer/config/input/config_defaults.py
@@ -68,6 +68,7 @@
DEFAULT_PERF_OUTPUT_FLAG = False
DEFAULT_PERF_MAX_AUTO_ADJUSTS = 10
DEFAULT_MEASUREMENT_MODE = "count_windows"
DEFAULT_MODEL_TYPE = "generic"

DEFAULT_ONLINE_PLOTS = {
"throughput_v_latency": {
@@ -118,6 +119,20 @@
"perf_throughput",
"perf_latency_p99",
]
DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS = [
"model_name",
"batch_size",
"concurrency",
"model_config_path",
"instance_group",
"max_batch_size",
"satisfies_constraints",
"perf_throughput",
"perf_latency_p99",
"inter_token_latency_p99",
"time_to_first_token_p99",
"output_token_throughput",
]
DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS = [
"model_name",
"batch_size",
18 changes: 18 additions & 0 deletions model_analyzer/config/input/objects/config_model_profile_spec.py
@@ -33,6 +33,7 @@ def __init__(
parameters=None,
model_config_parameters=None,
perf_analyzer_flags=None,
genai_perf_flags=None,
triton_server_flags=None,
triton_server_environment=None,
triton_docker_args=None,
@@ -58,6 +59,9 @@
perf_analyzer_flags : dict
The custom perf analyzer configuration
for this model
genai_perf_flags : dict
The custom GenAI perf configuration
for this model
triton_server_flags : dict
The configuration for the triton server instance launched
for this model
@@ -78,6 +82,7 @@
self._parameters = parameters
self._model_config_parameters = model_config_parameters
self._perf_analyzer_flags = perf_analyzer_flags
self._genai_perf_flags = genai_perf_flags
self._triton_server_flags = triton_server_flags
self._triton_server_environment = triton_server_environment
self._triton_docker_args = triton_docker_args
@@ -162,6 +167,16 @@ def perf_analyzer_flags(self):

return self._perf_analyzer_flags

def genai_perf_flags(self):
"""
Returns
-------
dict:
the genai_perf_flags
"""

return self._genai_perf_flags

def triton_server_flags(self):
"""
Returns
@@ -304,4 +319,7 @@ def __repr__(self):
if self._perf_analyzer_flags:
model_object["perf_analyzer_flags"] = self._perf_analyzer_flags

if self._genai_perf_flags:
model_object["genai_perf_flags"] = self._genai_perf_flags

return str(model_object)
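A minimal sketch of the new accessor in use; it assumes the first positional constructor argument is the model name (elided in the diff above), and the flag key/value is hypothetical:

```python
from model_analyzer.config.input.objects.config_model_profile_spec import (
    ConfigModelProfileSpec,
)

# "my_llm" stands in for the elided first positional argument (the model name);
# the genai_perf_flags dict uses a hypothetical key.
spec = ConfigModelProfileSpec(
    "my_llm",
    genai_perf_flags={"streaming": "true"},
)

print(spec.genai_perf_flags())  # -> {'streaming': 'true'}
print(spec)                     # __repr__ now includes genai_perf_flags when set
```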
11 changes: 10 additions & 1 deletion model_analyzer/config/run/run_config.py
@@ -17,6 +17,7 @@
from typing import List

from model_analyzer.config.run.model_run_config import ModelRunConfig
from model_analyzer.perf_analyzer.genai_perf_config import GenaiPerfConfig


class RunConfig:
@@ -25,16 +26,21 @@ class RunConfig:
at the same time in Perf Analyzer
"""

def __init__(self, triton_env):
def __init__(self, triton_env, genai_perf_flags=None):
"""
Parameters
----------
triton_env : dict
A dictionary of environment variables to set
when launching tritonserver

genai_perf_flags: dict
The set of flags used when calling genai_perf for LLM models
"""

self._triton_env = triton_env
self._genai_perf_config = GenaiPerfConfig()
self._genai_perf_config.update_config(genai_perf_flags)
self._model_run_configs: List[ModelRunConfig] = []

def add_model_run_config(self, model_run_config):
@@ -103,6 +109,9 @@ def triton_environment(self):

return self._triton_env

def genai_perf_config(self):
return self._genai_perf_config

def models_name(self):
"""Returns a single comma-joined name of the original model names"""
return ",".join([mrc.model_name() for mrc in self.model_run_configs()])
4 changes: 4 additions & 0 deletions model_analyzer/constants.py
@@ -70,3 +70,7 @@

# Model analyzer package name
PACKAGE_NAME = "triton-model-analyzer"

# GENAI-PERF
GENAI_PERF_CSV = "profile_export_genai_perf.csv"
GENAI_PERF_COLLATERAL = ["llm_inputs.json", "profile_export.json"]
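A hypothetical helper showing one way these constants could be used to clean up genai-perf output files after a run; the function and directory layout are illustrative, not part of this PR:

```python
import os

from model_analyzer.constants import GENAI_PERF_COLLATERAL, GENAI_PERF_CSV


def cleanup_genai_perf_artifacts(output_dir: str) -> None:
    """Remove the genai-perf CSV and its collateral files, if present."""
    for filename in (GENAI_PERF_CSV, *GENAI_PERF_COLLATERAL):
        path = os.path.join(output_dir, filename)
        if os.path.exists(path):
            os.remove(path)
```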