Merge LLM Metrics changes to main (#855)
* New Records for LLM metrics (#839)

* Adding new LLM metrics

* Adding base class for perf, inter_token, and time_to_first latency records

* Add --llm-mode option (#842)

* Adding CLI hook for LLM

* Changing to use --model-type

* Capture LLM metrics from genai-perf in MA (#844)

* Successfully reading from LLM CSV

* General cleanup

* All unit tests passing

* Fixing metric table typos

* Fixing typos

* Update constraints for LLMs (#845)

* Adding LLM values to list of possible constraints

* Fixing typo

* Adding new output fields for LLM (#846)

* Profiling model using genai-perf (#849)

* Initial changes to run genai-perf in MA

* Gating call to get LLM records

* Fixing capitalization issue

* Removing debug

* Adding TODO

---------

Co-authored-by: root <[email protected]>

* Add genai_perf CLI options to MA (#854)

* Added support for genai_perf CLI

* Remove dead code

* Removing genai_perf collateral

* Fixing codeQL issue

* Adding streaming to genai_perf_config

---------

Co-authored-by: root <[email protected]>
nv-braf and root authored Apr 8, 2024
1 parent 792f2a4 commit 8298d83
Showing 41 changed files with 2,329 additions and 285 deletions.
22 changes: 14 additions & 8 deletions docs/config.md
@@ -236,6 +236,9 @@ cpu_only_composing_models: <comma-delimited-string-list>
# Skips the generation of detailed reports and tables
[ skip_detailed_reports: <bool> | default: false]
# Type of model being profiled: generic or LLM
[ model_type: <string> | default: generic]
# Number of top configs to show in summary plots
[ num_configs_per_model: <int> | default: 3]
@@ -364,14 +367,17 @@ Before proceeding, it will be helpful to see the documentation on [Model Analyze

### `<constraint>`

A constraint specifies the bounds that determine a successful run. There are
three constraints allowed:

| Option Name | Units | Constraint | Description |
| :----------------- | :-------: | :--------: | :--------------------------------------------------- |
| `perf_throughput` | inf / sec | min | Specify minimum desired throughput. |
| `perf_latency_p99` | ms | max | Specify maximum tolerable latency or latency budget. |
| `gpu_used_memory` | MB | max | Specify maximum GPU memory used by model. |
A constraint specifies the bounds that determine a successful run. The table below shows examples
of the types of constraints allowed:

| Option Name | Units | Constraint | Description |
| :------------------------ | :-------: | :--------: | :----------------------------------------------------- |
| `perf_throughput` | inf / sec | min | Specify minimum desired throughput. |
| `perf_latency_p99` | ms | max | Specify maximum tolerable latency or latency budget. |
| `output_token_throughput` | tok / sec | min | Specify minimum desired output token throughput. |
| `inter_token_latency_p99` | ms        | max        | Specify maximum tolerable inter-token latency.          |
| `time_to_first_token_p99` | ms | max | Specify maximum tolerable time to first token latency. |
| `gpu_used_memory` | MB | max | Specify maximum GPU memory used by model. |
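
For example, a config could bound an LLM sweep with a mix of these constraints. This is a minimal sketch; the threshold values are illustrative, not recommendations:

```yaml
constraints:
  output_token_throughput:
    min: 500
  inter_token_latency_p99:
    max: 20
  time_to_first_token_p99:
    max: 150
```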

<br>

@@ -129,7 +129,7 @@ def _generate_subset(
self._send_results_to_generator(index)

def _make_run_config(self) -> RunConfig:
run_config = RunConfig(self._triton_env)
run_config = RunConfig(self._triton_env, self._models[0].genai_perf_flags())
for index in range(len(self._models)):
run_config.add_model_run_config(self._curr_model_run_configs[index])
return run_config
131 changes: 131 additions & 0 deletions model_analyzer/config/input/config_command_profile.py
@@ -31,6 +31,7 @@
)
from model_analyzer.constants import LOGGER_NAME
from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
from model_analyzer.perf_analyzer.genai_perf_config import GenaiPerfConfig
from model_analyzer.perf_analyzer.perf_config import PerfAnalyzerConfig
from model_analyzer.record.record import RecordType
from model_analyzer.triton.server.server_config import TritonServerConfig
@@ -50,7 +51,9 @@
DEFAULT_GPU_OUTPUT_FIELDS,
DEFAULT_GPUS,
DEFAULT_INFERENCE_OUTPUT_FIELDS,
DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS,
DEFAULT_MAX_RETRIES,
DEFAULT_MODEL_TYPE,
DEFAULT_MODEL_WEIGHTING,
DEFAULT_MONITORING_INTERVAL,
DEFAULT_NUM_CONFIGS_PER_MODEL,
@@ -297,6 +300,15 @@ def _fill_config(self):
description="Skips the generation of detailed summary reports and tables.",
)
)
self._add_config(
ConfigField(
"model_type",
flags=["--model-type"],
field_type=ConfigPrimitive(str),
default_value=DEFAULT_MODEL_TYPE,
description="Type of model being profiled: generic or LLM",
)
)

self._add_repository_configs()
self._add_client_configs()
@@ -364,6 +376,10 @@ def _add_profile_models_configs(self):
}
)

genai_perf_flags_scheme = ConfigObject(
schema={k: ConfigPrimitive(str) for k in GenaiPerfConfig.allowed_keys()}
)

triton_server_environment_scheme = ConfigObject(
schema={"*": ConfigPrimitive(str)}
)
@@ -444,6 +460,13 @@ def _add_profile_models_configs(self):
description="Allows custom configuration of the perf analyzer instances used by model analyzer.",
)
)
self._add_config(
ConfigField(
"genai_perf_flags",
field_type=genai_perf_flags_scheme,
description="Allows custom configuration of the GenAI Perf instances used by model analyzer.",
)
)
self._add_config(
ConfigField(
"triton_server_flags",
@@ -484,6 +507,11 @@ def _add_profile_models_configs(self):
"min": ConfigPrimitive(int),
}
),
"output_token_throughput": ConfigObject(
schema={
"min": ConfigPrimitive(int),
}
),
"perf_latency_avg": ConfigObject(
schema={
"max": ConfigPrimitive(int),
@@ -514,6 +542,96 @@ def _add_profile_models_configs(self):
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p99": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p95": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p90": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p75": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p50": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p25": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_min": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_max": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_avg": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p99": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p95": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p90": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p75": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p50": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p25": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_min": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_max": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_avg": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
}
)
self._add_config(
@@ -560,6 +678,7 @@ def _add_profile_models_configs(self):
"weighting": ConfigPrimitive(type_=int),
"model_config_parameters": model_config_fields,
"perf_analyzer_flags": perf_analyzer_flags_scheme,
"genai_perf_flags": genai_perf_flags_scheme,
"triton_server_flags": triton_server_flags_scheme,
"triton_server_environment": triton_server_environment_scheme,
"triton_docker_args": triton_docker_args_scheme,
@@ -1344,6 +1463,12 @@ def _autofill_values(self):
if not self._fields["gpu_output_fields"].is_set_by_user():
self.gpu_output_fields = DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS

# Switch the default output fields if the user specified a model type of LLM
# and did not specify custom output fields
if self.model_type == "LLM":
if not self._fields["inference_output_fields"].is_set_by_user():
self.inference_output_fields = DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS

new_profile_models = {}
for i, model in enumerate(self.profile_models):
new_model = {"cpu_only": (model.cpu_only() or cpu_only)}
@@ -1447,6 +1572,12 @@ def _autofill_values(self):
else:
new_model["perf_analyzer_flags"] = model.perf_analyzer_flags()

# GenAI Perf flags
if not model.genai_perf_flags():
new_model["genai_perf_flags"] = self.genai_perf_flags
else:
new_model["genai_perf_flags"] = model.genai_perf_flags()

# triton server flags
if not model.triton_server_flags():
new_model["triton_server_flags"] = self.triton_server_flags
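Taken together, these fields allow a profile config to opt into LLM mode and pass per-model flags through to genai-perf. A hedged sketch follows: flag values are parsed as strings per the schema above, `streaming` is the only key this change confirms, and the full set comes from `GenaiPerfConfig.allowed_keys()`:

```yaml
model_type: LLM
profile_models:
  my_llm_model:
    genai_perf_flags:
      streaming: "true"
```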
15 changes: 15 additions & 0 deletions model_analyzer/config/input/config_defaults.py
@@ -68,6 +68,7 @@
DEFAULT_PERF_OUTPUT_FLAG = False
DEFAULT_PERF_MAX_AUTO_ADJUSTS = 10
DEFAULT_MEASUREMENT_MODE = "count_windows"
DEFAULT_MODEL_TYPE = "generic"

DEFAULT_ONLINE_PLOTS = {
"throughput_v_latency": {
@@ -118,6 +119,20 @@
"perf_throughput",
"perf_latency_p99",
]
DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS = [
"model_name",
"batch_size",
"concurrency",
"model_config_path",
"instance_group",
"max_batch_size",
"satisfies_constraints",
"perf_throughput",
"perf_latency_p99",
"inter_token_latency_p99",
"time_to_first_token_p99",
"output_token_throughput",
]
DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS = [
"model_name",
"batch_size",
18 changes: 18 additions & 0 deletions model_analyzer/config/input/objects/config_model_profile_spec.py
@@ -33,6 +33,7 @@ def __init__(
parameters=None,
model_config_parameters=None,
perf_analyzer_flags=None,
genai_perf_flags=None,
triton_server_flags=None,
triton_server_environment=None,
triton_docker_args=None,
@@ -58,6 +59,9 @@
perf_analyzer_flags : dict
The custom perf analyzer configuration
for this model
genai_perf_flags : dict
The custom GenAI perf configuration
for this model
triton_server_flags : dict
The configuration for the triton server instance launched
for this model
@@ -78,6 +82,7 @@
self._parameters = parameters
self._model_config_parameters = model_config_parameters
self._perf_analyzer_flags = perf_analyzer_flags
self._genai_perf_flags = genai_perf_flags
self._triton_server_flags = triton_server_flags
self._triton_server_environment = triton_server_environment
self._triton_docker_args = triton_docker_args
@@ -162,6 +167,16 @@ def perf_analyzer_flags(self):

return self._perf_analyzer_flags

def genai_perf_flags(self):
"""
Returns
-------
dict:
the genai_perf_flags
"""

return self._genai_perf_flags

def triton_server_flags(self):
"""
Returns
@@ -304,4 +319,7 @@ def __repr__(self):
if self._perf_analyzer_flags:
model_object["perf_analyzer_flags"] = self._perf_analyzer_flags

if self._genai_perf_flags:
model_object["genai_perf_flags"] = self._genai_perf_flags

return str(model_object)
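
A short usage sketch of the new accessor; the leading positional model-name argument is assumed here, since it is not visible in this hunk:

```python
# Hypothetical: build a profile spec carrying custom genai-perf flags.
spec = ConfigModelProfileSpec(
    "my_llm_model",  # assumed positional model-name argument
    genai_perf_flags={"streaming": "true"},
)

print(spec.genai_perf_flags())  # -> {'streaming': 'true'}
print(spec)  # __repr__ now includes genai_perf_flags when set
```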
11 changes: 10 additions & 1 deletion model_analyzer/config/run/run_config.py
@@ -17,6 +17,7 @@
from typing import List

from model_analyzer.config.run.model_run_config import ModelRunConfig
from model_analyzer.perf_analyzer.genai_perf_config import GenaiPerfConfig


class RunConfig:
@@ -25,16 +26,21 @@ class RunConfig:
at the same time in Perf Analyzer
"""

def __init__(self, triton_env):
def __init__(self, triton_env, genai_perf_flags=None):
"""
Parameters
----------
triton_env : dict
A dictionary of environment variables to set
when launching tritonserver
genai_perf_flags: dict
The set of flags used when calling genai_perf for LLM models
"""

self._triton_env = triton_env
self._genai_perf_config = GenaiPerfConfig()
self._genai_perf_config.update_config(genai_perf_flags)
self._model_run_configs: List[ModelRunConfig] = []

def add_model_run_config(self, model_run_config):
@@ -103,6 +109,9 @@ def triton_environment(self):

return self._triton_env

def genai_perf_config(self):
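"""Returns the GenaiPerfConfig for this RunConfig"""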
return self._genai_perf_config

def models_name(self):
"""Returns a single comma-joined name of the original model names"""
return ",".join([mrc.model_name() for mrc in self.model_run_configs()])
4 changes: 4 additions & 0 deletions model_analyzer/constants.py
@@ -70,3 +70,7 @@

# Model analyzer package name
PACKAGE_NAME = "triton-model-analyzer"

# GENAI-PERF
GENAI_PERF_CSV = "profile_export_genai_perf.csv"
GENAI_PERF_COLLATERAL = ["llm_inputs.json", "profile_export.json"]
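
A sketch of how these constants might be consumed when gathering LLM records; the artifact-directory layout is an assumption, not confirmed by this diff:

```python
import csv
import os

from model_analyzer.constants import GENAI_PERF_CSV


def read_genai_perf_rows(artifact_dir):
    """Return metric rows from genai-perf's CSV export, or [] if absent."""
    csv_path = os.path.join(artifact_dir, GENAI_PERF_CSV)
    if not os.path.exists(csv_path):
        return []
    with open(csv_path, newline="") as f:
        return list(csv.DictReader(f))
```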