diff --git a/docs/config.md b/docs/config.md index 2fccc32f5..1eacfbdbd 100644 --- a/docs/config.md +++ b/docs/config.md @@ -236,6 +236,9 @@ cpu_only_composing_models: # Skips the generation of detailed reports and tables [ skip_detailed_reports: | default: false] +# Type of model being profiled: generic or LLM +[ model_type: | default: generic] + # Number of top configs to show in summary plots [ num_configs_per_model: | default: 3] @@ -364,14 +367,17 @@ Before proceeding, it will be helpful to see the documentation on [Model Analyze ### `` -A constraint, specifies the bounds that determine a successful run. There are -three constraints allowed: - -| Option Name | Units | Constraint | Description | | :----------------- | :-------: | :--------: | :--------------------------------------------------- | -| `perf_throughput` | inf / sec | min | Specify minimum desired throughput. | -| `perf_latency_p99` | ms | max | Specify maximum tolerable latency or latency budget. | -| `gpu_used_memory` | MB | max | Specify maximum GPU memory used by model. | +A constraint specifies the bounds that determine a successful run. The table below shows examples +of the types of constraints allowed: + +| Option Name | Units | Constraint | Description | +| :------------------------ | :-------: | :--------: | :----------------------------------------------------- | +| `perf_throughput` | inf / sec | min | Specify minimum desired throughput. | +| `perf_latency_p99` | ms | max | Specify maximum tolerable latency or latency budget. | +| `output_token_throughput` | tok / sec | min | Specify minimum desired output token throughput. | +| `inter_token_latency_p99` | ms | max | Specify maximum tolerable inter-token latency. | +| `time_to_first_token_p99` | ms | max | Specify maximum tolerable time to first token latency. | +| `gpu_used_memory` | MB | max | Specify maximum GPU memory used by model. |
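For reference, here is a minimal sketch of a profile config that exercises the new LLM options documented above. The model name, tokenizer, and threshold values are hypothetical; the option spellings follow the schema added in this change (`model_type`, a global `genai_perf_flags` section whose keys come from `GenaiPerfConfig.allowed_keys()`, and the new LLM constraint keys):

```yaml
# Hypothetical example, not part of this change
model_repository: /path/to/model/repository
model_type: LLM

profile_models:
  - my_llm_model

# Passed through to genai-perf; the schema stores values as strings,
# so the streaming flag is enabled with the string "true"
genai_perf_flags:
  streaming: "true"
  tokenizer: hf-internal-testing/llama-tokenizer

# LLM-aware constraints (see the table above)
constraints:
  time_to_first_token_p99:
    max: 200
  inter_token_latency_p99:
    max: 50
  output_token_throughput:
    min: 100
```

With `model_type: LLM`, the default inference output fields switch to `DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS` and perf_analyzer is launched through `genai-perf`, as wired up in the code changes below.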
diff --git a/model_analyzer/config/generate/brute_run_config_generator.py b/model_analyzer/config/generate/brute_run_config_generator.py index d226811aa..61d1accd4 100755 --- a/model_analyzer/config/generate/brute_run_config_generator.py +++ b/model_analyzer/config/generate/brute_run_config_generator.py @@ -129,7 +129,7 @@ def _generate_subset( self._send_results_to_generator(index) def _make_run_config(self) -> RunConfig: - run_config = RunConfig(self._triton_env) + run_config = RunConfig(self._triton_env, self._models[0].genai_perf_flags()) for index in range(len(self._models)): run_config.add_model_run_config(self._curr_model_run_configs[index]) return run_config diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py index ed6cc9d5b..fc13cdb08 100755 --- a/model_analyzer/config/input/config_command_profile.py +++ b/model_analyzer/config/input/config_command_profile.py @@ -31,6 +31,7 @@ ) from model_analyzer.constants import LOGGER_NAME from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException +from model_analyzer.perf_analyzer.genai_perf_config import GenaiPerfConfig from model_analyzer.perf_analyzer.perf_config import PerfAnalyzerConfig from model_analyzer.record.record import RecordType from model_analyzer.triton.server.server_config import TritonServerConfig @@ -50,7 +51,9 @@ DEFAULT_GPU_OUTPUT_FIELDS, DEFAULT_GPUS, DEFAULT_INFERENCE_OUTPUT_FIELDS, + DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS, DEFAULT_MAX_RETRIES, + DEFAULT_MODEL_TYPE, DEFAULT_MODEL_WEIGHTING, DEFAULT_MONITORING_INTERVAL, DEFAULT_NUM_CONFIGS_PER_MODEL, @@ -297,6 +300,15 @@ def _fill_config(self): description="Skips the generation of detailed summary reports and tables.", ) ) + self._add_config( + ConfigField( + "model_type", + flags=["--model-type"], + field_type=ConfigPrimitive(str), + default_value=DEFAULT_MODEL_TYPE, + description="Type of model being profiled: generic or LLM", + ) + ) self._add_repository_configs() self._add_client_configs() @@ -364,6 +376,10 @@ def _add_profile_models_configs(self): } ) + genai_perf_flags_scheme = ConfigObject( + schema={k: ConfigPrimitive(str) for k in GenaiPerfConfig.allowed_keys()} + ) + triton_server_environment_scheme = ConfigObject( schema={"*": ConfigPrimitive(str)} ) @@ -444,6 +460,13 @@ def _add_profile_models_configs(self): description="Allows custom configuration of the perf analyzer instances used by model analyzer.", ) ) + self._add_config( + ConfigField( + "genai_perf_flags", + field_type=genai_perf_flags_scheme, + description="Allows custom configuration of the GenAI Perf instances used by model analyzer.", + ) + ) self._add_config( ConfigField( "triton_server_flags", @@ -484,6 +507,11 @@ def _add_profile_models_configs(self): "min": ConfigPrimitive(int), } ), + "output_token_throughput": ConfigObject( + schema={ + "min": ConfigPrimitive(int), + } + ), "perf_latency_avg": ConfigObject( schema={ "max": ConfigPrimitive(int), @@ -514,6 +542,96 @@ def _add_profile_models_configs(self): "max": ConfigPrimitive(int), } ), + "inter_token_latency_p99": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "inter_token_latency_p95": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "inter_token_latency_p90": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "inter_token_latency_p75": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "inter_token_latency_p50": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + 
"inter_token_latency_p25": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "inter_token_latency_min": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "inter_token_latency_max": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "inter_token_latency_avg": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_p99": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_p95": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_p90": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_p75": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_p50": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_p25": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_min": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_max": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_avg": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), } ) self._add_config( @@ -560,6 +678,7 @@ def _add_profile_models_configs(self): "weighting": ConfigPrimitive(type_=int), "model_config_parameters": model_config_fields, "perf_analyzer_flags": perf_analyzer_flags_scheme, + "genai_perf_flags": genai_perf_flags_scheme, "triton_server_flags": triton_server_flags_scheme, "triton_server_environment": triton_server_environment_scheme, "triton_docker_args": triton_docker_args_scheme, @@ -1344,6 +1463,12 @@ def _autofill_values(self): if not self._fields["gpu_output_fields"].is_set_by_user(): self.gpu_output_fields = DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS + # Switch default output fields if user specifies model type of LLM + # and the user didn't specify a custom output field + if self.model_type == "LLM": + if not self._fields["inference_output_fields"].is_set_by_user(): + self.inference_output_fields = DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS + new_profile_models = {} for i, model in enumerate(self.profile_models): new_model = {"cpu_only": (model.cpu_only() or cpu_only)} @@ -1447,6 +1572,12 @@ def _autofill_values(self): else: new_model["perf_analyzer_flags"] = model.perf_analyzer_flags() + # GenAI Perf flags + if not model.genai_perf_flags(): + new_model["genai_perf_flags"] = self.genai_perf_flags + else: + new_model["genai_perf_flags"] = model.genai_perf_flags() + # triton server flags if not model.triton_server_flags(): new_model["triton_server_flags"] = self.triton_server_flags diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py index 368aed008..8685537f7 100755 --- a/model_analyzer/config/input/config_defaults.py +++ b/model_analyzer/config/input/config_defaults.py @@ -68,6 +68,7 @@ DEFAULT_PERF_OUTPUT_FLAG = False DEFAULT_PERF_MAX_AUTO_ADJUSTS = 10 DEFAULT_MEASUREMENT_MODE = "count_windows" +DEFAULT_MODEL_TYPE = "generic" DEFAULT_ONLINE_PLOTS = { "throughput_v_latency": { @@ -118,6 +119,20 @@ "perf_throughput", "perf_latency_p99", ] +DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS = [ + "model_name", + "batch_size", + "concurrency", + "model_config_path", + "instance_group", + "max_batch_size", + "satisfies_constraints", + "perf_throughput", + "perf_latency_p99", + "inter_token_latency_p99", + "time_to_first_token_p99", + "output_token_throughput", +] DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS = [ 
"model_name", "batch_size", diff --git a/model_analyzer/config/input/objects/config_model_profile_spec.py b/model_analyzer/config/input/objects/config_model_profile_spec.py index d45e68d41..250cc4980 100755 --- a/model_analyzer/config/input/objects/config_model_profile_spec.py +++ b/model_analyzer/config/input/objects/config_model_profile_spec.py @@ -33,6 +33,7 @@ def __init__( parameters=None, model_config_parameters=None, perf_analyzer_flags=None, + genai_perf_flags=None, triton_server_flags=None, triton_server_environment=None, triton_docker_args=None, @@ -58,6 +59,9 @@ def __init__( perf_analyzer_flags : dict The custom perf analyzer configuration for this model + genai_perf_flags : dict + The custom GenAI perf configuration + for this model triton_server_flags : dict The configuration for the triton server instance launched for this model @@ -78,6 +82,7 @@ def __init__( self._parameters = parameters self._model_config_parameters = model_config_parameters self._perf_analyzer_flags = perf_analyzer_flags + self._genai_perf_flags = genai_perf_flags self._triton_server_flags = triton_server_flags self._triton_server_environment = triton_server_environment self._triton_docker_args = triton_docker_args @@ -162,6 +167,16 @@ def perf_analyzer_flags(self): return self._perf_analyzer_flags + def genai_perf_flags(self): + """ + Returns + ------- + dict: + the genai_perf_flags + """ + + return self._genai_perf_flags + def triton_server_flags(self): """ Returns @@ -304,4 +319,7 @@ def __repr__(self): if self._perf_analyzer_flags: model_object["perf_analyzer_flags"] = self._perf_analyzer_flags + if self._genai_perf_flags: + model_object["genai_perf_flags"] = self._genai_perf_flags + return str(model_object) diff --git a/model_analyzer/config/run/run_config.py b/model_analyzer/config/run/run_config.py index 29efcaf08..9b53d8266 100755 --- a/model_analyzer/config/run/run_config.py +++ b/model_analyzer/config/run/run_config.py @@ -17,6 +17,7 @@ from typing import List from model_analyzer.config.run.model_run_config import ModelRunConfig +from model_analyzer.perf_analyzer.genai_perf_config import GenaiPerfConfig class RunConfig: @@ -25,16 +26,21 @@ class RunConfig: at the same time in Perf Analyzer """ - def __init__(self, triton_env): + def __init__(self, triton_env, genai_perf_flags=None): """ Parameters ---------- triton_env : dict A dictionary of environment variables to set when launching tritonserver + + genai_perf_flags: dict + The set of flags used when calling genai_perf for LLM models """ self._triton_env = triton_env + self._genai_perf_config = GenaiPerfConfig() + self._genai_perf_config.update_config(genai_perf_flags) self._model_run_configs: List[ModelRunConfig] = [] def add_model_run_config(self, model_run_config): @@ -103,6 +109,9 @@ def triton_environment(self): return self._triton_env + def genai_perf_config(self): + return self._genai_perf_config + def models_name(self): """Returns a single comma-joined name of the original model names""" return ",".join([mrc.model_name() for mrc in self.model_run_configs()]) diff --git a/model_analyzer/constants.py b/model_analyzer/constants.py index 886360d34..6d7682515 100755 --- a/model_analyzer/constants.py +++ b/model_analyzer/constants.py @@ -70,3 +70,7 @@ # Model analyzer package name PACKAGE_NAME = "triton-model-analyzer" + +# GENAI-PERF +GENAI_PERF_CSV = "profile_export_genai_perf.csv" +GENAI_PERF_COLLATERAL = ["llm_inputs.json", "profile_export.json"] diff --git a/model_analyzer/perf_analyzer/genai_perf_config.py 
b/model_analyzer/perf_analyzer/genai_perf_config.py new file mode 100755 index 000000000..9e5a77201 --- /dev/null +++ b/model_analyzer/perf_analyzer/genai_perf_config.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException + + +class GenaiPerfConfig: + """ + A config class to set arguments to the genai_perf. + An argument set to None will use the genai_perf's default. + """ + + genai_perf_args = [ + "backend", + "endpoint", + "service-kind", + "url", + "expected-output-tokens", + "input-dataset", + "input-tokens-mean", + "input-tokens-stddev", + "input-type", + "num-of-output-prompts", + "random-seed", + "streaming", + "tokenizer", + ] + + boolean_args = ["streaming"] + + def __init__(self): + """ + Construct a GenaiPerfConfig + """ + + self._args = {k: None for k in self.genai_perf_args} + + @classmethod + def allowed_keys(cls): + """ + Returns + ------- + list of str + The keys that are allowed to be + passed into genai_perf + """ + + return cls.genai_perf_args + + def update_config(self, params=None): + """ + Allows setting values from a params dict + + Parameters + ---------- + params: dict + keys are allowed args to genai_perf + """ + + if params and type(params) is dict: + for key in params: + self[key] = params[key] + + @classmethod + def from_dict(cls, genai_perf_config_dict): + genai_perf_config = GenaiPerfConfig() + for key in [ + "_args", + ]: + if key in genai_perf_config_dict: + setattr(genai_perf_config, key, genai_perf_config_dict[key]) + return genai_perf_config + + def representation(self): + """ + Returns + ------- + str + a string representation of the Genai Perf config + that removes values which can vary between + runs, but should be ignored when determining + if a previous (checkpointed) run can be used + """ + cli_string = self.to_cli_string() + + return cli_string + + def to_cli_string(self) -> str: + """ + Utility function to convert a config into a + string of arguments to genai_perf with CLI. + + Returns + ------- + str + cli command string consisting of all arguments + to genai_perf set in the config, without + the executable name.
+ """ + + # single dashed options, then verbose flags, then main args + args = [] + args.extend(self._parse_options()) + + return " ".join(args) + + def _parse_options(self): + """ + Parse the genai perf args + """ + temp_args = [] + for key, value in self._args.items(): + if key in self.boolean_args: + temp_args = self._parse_boolean_args(key, value, temp_args) + elif value: + temp_args.append(f"--{key}={value}") + return temp_args + + def _parse_boolean_args(self, key, value, temp_args): + """ + Parse genai perf args that should not add a value to the cli string + """ + assert type(value) in [ + str, + type(None), + ], f"Data type for arg {key} must be a (boolean) string instead of {type(value)}" + if value != None and value.lower() == "true": + temp_args.append(f"--{key}") + return temp_args + + def __getitem__(self, key): + """ + Gets an arguments value in config + + Parameters + ---------- + key : str + The name of the argument to the genai perf config + + Returns + ------- + object + The value that the argument is set to in this config + + Raises + ------ + KeyError + If argument not found in the config + """ + + if key in self._args: + return self._args[key] + else: + raise TritonModelAnalyzerException( + f"Key {key} does not exist in genai_perf_flags." + ) + + def __setitem__(self, key, value): + """ + Sets an arguments value in config + after checking if defined/supported. + + Parameters + ---------- + key : str + The name of the argument in genai_perf + value : (any) + The value to which the argument is being set + + Raises + ------ + TritonModelAnalyzerException + If key is unsupported or undefined in the + config class + """ + + if key in self._args: + self._args[key] = value + else: + raise TritonModelAnalyzerException( + f"The argument '{key}' to the genai_perf " + "is not supported by model analyzer." + ) + + def __contains__(self, key): + """ + Returns + ------- + True if key is in perf_config i.e. 
the key is a + genai perf config argument + """ + + return key in GenaiPerfConfig.allowed_keys() diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index c88f8e655..b301ee97e 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -21,12 +21,16 @@ import re import signal import tempfile +from csv import DictReader from subprocess import STDOUT, Popen -from typing import Dict, List +from typing import Dict, List, Optional import psutil +from model_analyzer.config.input.config_defaults import DEFAULT_MODEL_TYPE from model_analyzer.constants import ( + GENAI_PERF_COLLATERAL, + GENAI_PERF_CSV, INTERVAL_SLEEP_TIME, LOGGER_NAME, MEASUREMENT_REQUEST_COUNT_STEP, @@ -40,6 +44,16 @@ from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory from model_analyzer.record.types.gpu_utilization import GPUUtilization +from model_analyzer.record.types.inter_token_latency_avg import InterTokenLatencyAvg +from model_analyzer.record.types.inter_token_latency_max import InterTokenLatencyMax +from model_analyzer.record.types.inter_token_latency_min import InterTokenLatencyMin +from model_analyzer.record.types.inter_token_latency_p25 import InterTokenLatencyP25 +from model_analyzer.record.types.inter_token_latency_p50 import InterTokenLatencyP50 +from model_analyzer.record.types.inter_token_latency_p75 import InterTokenLatencyP75 +from model_analyzer.record.types.inter_token_latency_p90 import InterTokenLatencyP90 +from model_analyzer.record.types.inter_token_latency_p95 import InterTokenLatencyP95 +from model_analyzer.record.types.inter_token_latency_p99 import InterTokenLatencyP99 +from model_analyzer.record.types.output_token_throughput import OutputTokenThroughput from model_analyzer.record.types.perf_client_response_wait import PerfClientResponseWait from model_analyzer.record.types.perf_client_send_recv import PerfClientSendRecv from model_analyzer.record.types.perf_latency_avg import PerfLatencyAvg @@ -53,6 +67,15 @@ ) from model_analyzer.record.types.perf_server_queue import PerfServerQueue from model_analyzer.record.types.perf_throughput import PerfThroughput +from model_analyzer.record.types.time_to_first_token_avg import TimeToFirstTokenAvg +from model_analyzer.record.types.time_to_first_token_max import TimeToFirstTokenMax +from model_analyzer.record.types.time_to_first_token_min import TimeToFirstTokenMin +from model_analyzer.record.types.time_to_first_token_p25 import TimeToFirstTokenP25 +from model_analyzer.record.types.time_to_first_token_p50 import TimeToFirstTokenP50 +from model_analyzer.record.types.time_to_first_token_p75 import TimeToFirstTokenP75 +from model_analyzer.record.types.time_to_first_token_p90 import TimeToFirstTokenP90 +from model_analyzer.record.types.time_to_first_token_p95 import TimeToFirstTokenP95 +from model_analyzer.record.types.time_to_first_token_p99 import TimeToFirstTokenP99 logger = logging.getLogger(LOGGER_NAME) @@ -91,6 +114,28 @@ class PerfAnalyzer: ["gpu_used_memory", "Max GPU Memory Usage", GPUUsedMemory, "1000000"], ["gpu_free_memory", "Total GPU Memory", GPUFreeMemory, "1000000"] ] + + llm_metric_table = [ + ["time_to_first_token_avg", "Time To First Token (ns) avg", TimeToFirstTokenAvg, "1000"], + ["time_to_first_token_min", "Time To First Token (ns) min", TimeToFirstTokenMin, "1000"], + ["time_to_first_token_max", "Time To First Token (ns) max", TimeToFirstTokenMax, 
"1000"], + ["time_to_first_token_p99", "Time To First Token (ns) p99", TimeToFirstTokenP99, "1000"], + ["time_to_first_token_p95", "Time To First Token (ns) p95", TimeToFirstTokenP95, "1000"], + ["time_to_first_token_p90", "Time To First Token (ns) p90", TimeToFirstTokenP90, "1000"], + ["time_to_first_token_p75", "Time To First Token (ns) p75", TimeToFirstTokenP75, "1000"], + ["time_to_first_token_p50", "Time To First Token (ns) p50", TimeToFirstTokenP50, "1000"], + ["time_to_first_token_p25", "Time To First Token (ns) p25", TimeToFirstTokenP25, "1000"], + ["inter_token_latency_avg", "Inter Token Latency (ns) avg", InterTokenLatencyAvg, "1000"], + ["inter_token_latency_min", "Inter Token Latency (ns) min", InterTokenLatencyMin, "1000"], + ["inter_token_latency_max", "Inter Token Latency (ns) max", InterTokenLatencyMax, "1000"], + ["inter_token_latency_p99", "Inter Token Latency (ns) p99", InterTokenLatencyP99, "1000"], + ["inter_token_latency_p95", "Inter Token Latency (ns) p95", InterTokenLatencyP95, "1000"], + ["inter_token_latency_p90", "Inter Token Latency (ns) p90", InterTokenLatencyP90, "1000"], + ["inter_token_latency_p75", "Inter Token Latency (ns) p75", InterTokenLatencyP75, "1000"], + ["inter_token_latency_p50", "Inter Token Latency (ns) p50", InterTokenLatencyP50, "1000"], + ["inter_token_latency_p25", "Inter Token Latency (ns) p25", InterTokenLatencyP25, "1000"], + ["output_token_throughput", "Output Token Throughput (per sec) avg", OutputTokenThroughput, "1"] + ] # yapf: enable @staticmethod @@ -109,7 +154,23 @@ def get_gpu_metrics(): ] return gpu_metrics - def __init__(self, path, config, max_retries, timeout, max_cpu_util): + @staticmethod + def get_llm_metrics(): + llm_metrics = [ + llm_metric[PerfAnalyzer.RECORD_CLASS] + for llm_metric in PerfAnalyzer.llm_metric_table + ] + return llm_metrics + + def __init__( + self, + path, + config, + max_retries, + timeout, + max_cpu_util, + model_type=DEFAULT_MODEL_TYPE, + ): """ Parameters ---------- @@ -133,8 +194,10 @@ def __init__(self, path, config, max_retries, timeout, max_cpu_util): self._timeout = timeout self._output = "" self._perf_records = {} + self._llm_records = {} self._gpu_records = [] self._max_cpu_util = max_cpu_util + self._model_type = model_type def run(self, metrics, env=None): """ @@ -195,7 +258,20 @@ def get_perf_records(self): if self._perf_records: return self._perf_records raise TritonModelAnalyzerException( - "Attempted to get perf_analyzer results" "without calling run first." + "Attempted to get perf_analyzer results without calling run first." + ) + + def get_llm_records(self): + """ + Returns + ------- + The LLM records from the last perf_analyzer run + """ + + if self._llm_records: + return self._llm_records + raise TritonModelAnalyzerException( + "Attempted to get perf_analyzer results without calling run first." 
) def get_gpu_records(self): @@ -248,14 +324,32 @@ def _get_cmd(self): return cmd def _get_single_model_cmd(self, index): - cmd = [self.bin_path] - if self._is_multi_model(): - cmd += ["--enable-mpi"] - cmd += self._get_pa_cli_command(index).replace("=", " ").split() + if self._model_type == "LLM": + cmd = ["genai-perf", "-m", self._config.models_name()] + cmd += self._get_genai_perf_cli_command(index).replace("=", " ").split() + cmd += ["--"] + cmd += ( + self._get_pa_cli_command(index, exclude_model_name=True) + .replace("=", " ") + .split() + ) + else: + cmd = [self.bin_path] + if self._is_multi_model(): + cmd += ["--enable-mpi"] + cmd += self._get_pa_cli_command(index).replace("=", " ").split() + return cmd - def _get_pa_cli_command(self, index): - return self._config.model_run_configs()[index].perf_config().to_cli_string() + def _get_pa_cli_command(self, index, exclude_model_name=False): + return ( + self._config.model_run_configs()[index] + .perf_config() + .to_cli_string(exclude_model_name) + ) + + def _get_genai_perf_cli_command(self, index): + return self._config.genai_perf_config().to_cli_string() def _create_env(self, env): perf_analyzer_env = os.environ.copy() @@ -438,6 +532,12 @@ def _is_multi_model(self): return len(self._config.model_run_configs()) > 1 def _parse_outputs(self, metrics): + self._parse_generic_outputs(metrics) + + if self._model_type == "LLM": + self._parse_llm_outputs(metrics) + + def _parse_generic_outputs(self, metrics): """ Extract records from the Perf Analyzer run for each model """ @@ -464,6 +564,26 @@ def _parse_outputs(self, metrics): for f in glob.glob(f"*{perf_config['latency-report-file']}"): os.remove(f) + def _parse_llm_outputs(self, metrics): + """ + Extract records from the Perf Analyzer run for each model + """ + + perf_config = self._config.model_run_configs()[0].perf_config() + + logger.debug(f"Reading GENAI-PERF results from {GENAI_PERF_CSV}") + with open(GENAI_PERF_CSV, mode="r") as f: + csv_reader = list(csv.DictReader(f, delimiter=",")) + + # See test_perf_analyzer::test_pa_llm_csv_output() for CSV output example + self._llm_records[perf_config["model-name"]] = self._extract_llm_records( + metrics, csv_reader + ) + + os.remove(GENAI_PERF_CSV) + for filename in GENAI_PERF_COLLATERAL: + os.remove(filename) + def _extract_perf_records_from_row( self, requested_metrics: List[Record], row_metrics: Dict[str, str] ) -> List[Record]: @@ -526,6 +646,46 @@ def _extract_gpu_records_from_row( self._cleanup_gpu_records(gpu_records) return gpu_records + def _extract_llm_records( + self, requested_metrics: List[Record], csv_reader: DictReader + ) -> List[Record]: + llm_records: List[Record] = [] + + for requested_metric in requested_metrics: + new_llm_record = self._get_llm_record_from_csv(requested_metric, csv_reader) + if new_llm_record: + llm_records.append(new_llm_record) + + return llm_records + + def _get_llm_record_from_csv( + self, requested_metric: Record, csv_reader: DictReader + ) -> Optional[Record]: + for row in csv_reader: + for key, value in row.items(): + metric_string = f"{row['Metric']} {key}" + llm_metric = self._find_corresponding_llm_metric_row(metric_string) + + if ( + llm_metric + and llm_metric[PerfAnalyzer.METRIC_TAG] == requested_metric.tag + ): + adjusted_value = float(value) / float( + llm_metric[PerfAnalyzer.REDUCTION_FACTOR] + ) + + llm_record = llm_metric[PerfAnalyzer.RECORD_CLASS](adjusted_value) # type: ignore + return llm_record + + return None + + def _find_corresponding_llm_metric_row(self, metric_string: str) -> 
Optional[List]: + for row in PerfAnalyzer.llm_metric_table: + if metric_string == row[PerfAnalyzer.CSV_STRING]: + return row + + return None + def _cleanup_gpu_records(self, gpu_records): # Recalculate GPUFreeMemory by removing the value of the associated GPUUsedMemory # Remove any GPUFreeMemory records that don't have a matching GPUUsedMemory diff --git a/model_analyzer/perf_analyzer/perf_config.py b/model_analyzer/perf_analyzer/perf_config.py index e9160a44a..521cc1629 100755 --- a/model_analyzer/perf_analyzer/perf_config.py +++ b/model_analyzer/perf_analyzer/perf_config.py @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import List + from model_analyzer.config.input.config_defaults import DEFAULT_MEASUREMENT_MODE from model_analyzer.constants import SECONDS_TO_MILLISECONDS_MULTIPLIER from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException @@ -325,7 +327,7 @@ def remove_mrc_from_cli_string(cls, cli_string): return " ".join(perf_str_tokens) - def to_cli_string(self): + def to_cli_string(self, exclude_model_name: bool = False) -> str: """ Utility function to convert a config into a string of arguments to the perf_analyzer with CLI. @@ -340,19 +342,22 @@ def to_cli_string(self): # single dashed options, then verbose flags, then main args args = [] - args.extend(self._parse_short_options()) + args.extend(self._parse_short_options(exclude_model_name)) args.extend(self._parse_verbose_options()) args.extend(self._parse_long_options()) return " ".join(args) - def _parse_short_options(self): + def _parse_short_options(self, exclude_model_name: bool = False) -> List: """ Parse the perf analyzer single dash options """ temp_args = [] for key, value in self._options.items(): if value: + if exclude_model_name and key == "-m": + continue + if key in self._additive_args: for additive_value in value: temp_args.append(f"{key} {additive_value}") diff --git a/model_analyzer/record/metrics_manager.py b/model_analyzer/record/metrics_manager.py index 581cae88b..849731935 100755 --- a/model_analyzer/record/metrics_manager.py +++ b/model_analyzer/record/metrics_manager.py @@ -69,6 +69,25 @@ class MetricsManager: "gpu_power_usage", "cpu_available_ram", "cpu_used_ram", + "time_to_first_token_avg", + "time_to_first_token_min", + "time_to_first_token_max", + "time_to_first_token_p99", + "time_to_first_token_p95", + "time_to_first_token_p90", + "time_to_first_token_p75", + "time_to_first_token_p50", + "time_to_first_token_p25", + "inter_token_latency_avg", + "inter_token_latency_min", + "inter_token_latency_max", + "inter_token_latency_p99", + "inter_token_latency_p95", + "inter_token_latency_p90", + "inter_token_latency_p75", + "inter_token_latency_p50", + "inter_token_latency_p25", + "output_token_throughput", ] def __init__(self, config, client, server, gpus, result_manager, state_manager): @@ -115,6 +134,7 @@ def __init__(self, config, client, server, gpus, result_manager, state_manager): ( self._gpu_metrics, self._perf_metrics, + self._llm_metrics, self._cpu_metrics, ) = self._categorize_metrics(self.metrics, self._config.collect_cpu_metrics) self._gpus = gpus @@ -160,21 +180,23 @@ def _categorize_metrics(metric_tags, collect_cpu_metrics=False): Returns ------- - (list,list,list) - tuple of three lists (DCGM, PerfAnalyzer, CPU) metrics + (list,list,list,list) + tuple of four lists (DCGM, PerfAnalyzer, LLM, CPU) metrics """ - gpu_metrics, perf_metrics, cpu_metrics = [], [], [] + gpu_metrics, 
perf_metrics, llm_metrics, cpu_metrics = [], [], [], [] # Separates metrics and objectives into related lists for metric in MetricsManager.get_metric_types(metric_tags): if metric in PerfAnalyzer.get_gpu_metrics(): gpu_metrics.append(metric) elif metric in PerfAnalyzer.get_perf_metrics(): perf_metrics.append(metric) + elif metric in PerfAnalyzer.get_llm_metrics(): + llm_metrics.append(metric) elif collect_cpu_metrics and (metric in CPUMonitor.cpu_metrics): cpu_metrics.append(metric) - return gpu_metrics, perf_metrics, cpu_metrics + return gpu_metrics, perf_metrics, llm_metrics, cpu_metrics def profile_server(self): """ @@ -589,9 +611,10 @@ def _run_perf_analyzer( max_retries=self._config.perf_analyzer_max_auto_adjusts, timeout=self._config.perf_analyzer_timeout, max_cpu_util=self._config.perf_analyzer_cpu_util, + model_type=self._config.model_type, ) - metrics_to_gather = self._perf_metrics + self._gpu_metrics + metrics_to_gather = self._perf_metrics + self._llm_metrics + self._gpu_metrics status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env) self._write_perf_analyzer_output(perf_output_writer, perf_analyzer) @@ -601,6 +624,12 @@ def _run_perf_analyzer( return (None, None) perf_records = perf_analyzer.get_perf_records() + + if self._config.model_type == "LLM": + perf_records[run_config.models_name()].extend( + perf_analyzer.get_llm_records()[run_config.models_name()] + ) + gpu_records = perf_analyzer.get_gpu_records() aggregated_perf_records = self._aggregate_perf_records(perf_records) @@ -824,6 +853,17 @@ def is_perf_analyzer_metric(tag): metric = MetricsManager.get_metric_types([tag])[0] return metric in PerfAnalyzer.get_perf_metrics() + @staticmethod + def is_llm_metric(tag): + """ + Returns + ------ + True if the given tag is a supported LLM metric + False otherwise + """ + metric = MetricsManager.get_metric_types([tag])[0] + return metric in PerfAnalyzer.get_llm_metrics() + @staticmethod def is_cpu_metric(tag): """ diff --git a/model_analyzer/record/types/inter_token_latency_avg.py b/model_analyzer/record/types/inter_token_latency_avg.py new file mode 100755 index 000000000..fe1dc7dfb --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_avg.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyAvg(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_avg" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "Avg Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_base.py b/model_analyzer/record/types/inter_token_latency_base.py new file mode 100755 index 000000000..dda70cefa --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_base.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.record import DecreasingRecord + + +@total_ordering +class InterTokenLatencyBase(DecreasingRecord): + """ + A record for perf_analyzer Inter token latency metric + """ + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + def __eq__(self, other): + """ + Allows checking for + equality between two records + """ + + return self.value() == other.value() + + def __lt__(self, other): + """ + Allows checking if + this record is less than + the other + """ + + return self.value() > other.value() + + def __add__(self, other): + """ + Allows adding two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() + other.value())) + + def __sub__(self, other): + """ + Allows subbing two records together + to produce a brand new record. + + ** Note this does reverse subtraction because + of the inverted nature of latency (lower is better) + """ + + return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/inter_token_latency_max.py b/model_analyzer/record/types/inter_token_latency_max.py new file mode 100755 index 000000000..ce2484144 --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_max.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyMax(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_max" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "Max Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_min.py b/model_analyzer/record/types/inter_token_latency_min.py new file mode 100755 index 000000000..21e44883b --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_min.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyMin(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_min" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. 
+ """ + + return "Min Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p25.py b/model_analyzer/record/types/inter_token_latency_p25.py new file mode 100755 index 000000000..8a0c80173 --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p25.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP25(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p25" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p25 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p50.py b/model_analyzer/record/types/inter_token_latency_p50.py new file mode 100755 index 000000000..190920874 --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p50.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP50(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p50" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p50 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p75.py b/model_analyzer/record/types/inter_token_latency_p75.py new file mode 100755 index 000000000..1234306fd --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p75.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP75(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p75" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p75 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p90.py b/model_analyzer/record/types/inter_token_latency_p90.py new file mode 100755 index 000000000..60019088a --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p90.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP90(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p90" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p90 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p95.py b/model_analyzer/record/types/inter_token_latency_p95.py new file mode 100755 index 000000000..b77fd9118 --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p95.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP95(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p95" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p95 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p99.py b/model_analyzer/record/types/inter_token_latency_p99.py new file mode 100755 index 000000000..d9f722772 --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p99.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP99(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p99" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p99 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/output_token_throughput.py b/model_analyzer/record/types/output_token_throughput.py new file mode 100755 index 000000000..f7edf7cb8 --- /dev/null +++ b/model_analyzer/record/types/output_token_throughput.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.record import IncreasingRecord + + +@total_ordering +class OutputTokenThroughput(IncreasingRecord): + """ + A record for perf_analyzer + metric 'Output Token Throughput' + """ + + tag = "output_token_throughput" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + The throughput from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @staticmethod + def value_function(): + """ + Returns the total value from a list + + Returns + ------- + Total value of the list + """ + return sum + + @staticmethod + def header(aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. 
+ """ + + return "Output Token Throughput (infer/sec)" + + def __eq__(self, other): + """ + Allows checking for + equality between two records + """ + + return self.value() == other.value() + + def __lt__(self, other): + """ + Allows checking if + this record is less than + the other + """ + + return self.value() < other.value() + + def __add__(self, other): + """ + Allows adding two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() + other.value())) + + def __sub__(self, other): + """ + Allows subtracting two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() - other.value())) diff --git a/model_analyzer/record/types/perf_latency_avg.py b/model_analyzer/record/types/perf_latency_avg.py index 5452c0b79..aafbcbeb2 100755 --- a/model_analyzer/record/types/perf_latency_avg.py +++ b/model_analyzer/record/types/perf_latency_avg.py @@ -16,11 +16,11 @@ from functools import total_ordering -from model_analyzer.record.record import DecreasingRecord +from model_analyzer.record.types.perf_latency_base import PerfLatencyBase @total_ordering -class PerfLatencyAvg(DecreasingRecord): +class PerfLatencyAvg(PerfLatencyBase): """ A record for perf_analyzer latency metric """ @@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False): """ return "Avg Latency (ms)" - - def __eq__(self, other): - """ - Allows checking for - equality between two records - """ - - return self.value() == other.value() - - def __lt__(self, other): - """ - Allows checking if - this record is less than - the other - """ - - return self.value() > other.value() - - def __add__(self, other): - """ - Allows adding two records together - to produce a brand new record. - """ - - return self.__class__(value=(self.value() + other.value())) - - def __sub__(self, other): - """ - Allows subbing two records together - to produce a brand new record. - - ** Note this does reverse subtraction because - of the inverted nature of latency (lower is better) - """ - - return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/perf_latency_base.py b/model_analyzer/record/types/perf_latency_base.py new file mode 100755 index 000000000..3c3e76cac --- /dev/null +++ b/model_analyzer/record/types/perf_latency_base.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import total_ordering + +from model_analyzer.record.record import DecreasingRecord + + +@total_ordering +class PerfLatencyBase(DecreasingRecord): + """ + A base class for perf_analyzer latency metric + """ + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + def __eq__(self, other): + """ + Allows checking for + equality between two records + """ + + return self.value() == other.value() + + def __lt__(self, other): + """ + Allows checking if + this record is less than + the other + """ + + return self.value() > other.value() + + def __add__(self, other): + """ + Allows adding two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() + other.value())) + + def __sub__(self, other): + """ + Allows subbing two records together + to produce a brand new record. + + ** Note this does reverse subtraction because + of the inverted nature of latency (lower is better) + """ + + return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/perf_latency_p90.py b/model_analyzer/record/types/perf_latency_p90.py index c6718fe40..7eafa3b28 100755 --- a/model_analyzer/record/types/perf_latency_p90.py +++ b/model_analyzer/record/types/perf_latency_p90.py @@ -16,11 +16,11 @@ from functools import total_ordering -from model_analyzer.record.record import DecreasingRecord +from model_analyzer.record.types.perf_latency_base import PerfLatencyBase @total_ordering -class PerfLatencyP90(DecreasingRecord): +class PerfLatencyP90(PerfLatencyBase): """ A record for perf_analyzer latency metric """ @@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False): """ return "p90 Latency (ms)" - - def __eq__(self, other): - """ - Allows checking for - equality between two records - """ - - return self.value() == other.value() - - def __lt__(self, other): - """ - Allows checking if - this record is less than - the other - """ - - return self.value() > other.value() - - def __add__(self, other): - """ - Allows adding two records together - to produce a brand new record. - """ - - return self.__class__(value=(self.value() + other.value())) - - def __sub__(self, other): - """ - Allows subbing two records together - to produce a brand new record. 
- - ** Note this does reverse subtraction because - of the inverted nature of latency (lower is better) - """ - - return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/perf_latency_p95.py b/model_analyzer/record/types/perf_latency_p95.py index 84ed9e648..ccb9f8c01 100755 --- a/model_analyzer/record/types/perf_latency_p95.py +++ b/model_analyzer/record/types/perf_latency_p95.py @@ -16,11 +16,11 @@ from functools import total_ordering -from model_analyzer.record.record import DecreasingRecord +from model_analyzer.record.types.perf_latency_base import PerfLatencyBase @total_ordering -class PerfLatencyP95(DecreasingRecord): +class PerfLatencyP95(PerfLatencyBase): """ A record for perf_analyzer latency metric """ @@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False): """ return "p95 Latency (ms)" - - def __eq__(self, other): - """ - Allows checking for - equality between two records - """ - - return self.value() == other.value() - - def __lt__(self, other): - """ - Allows checking if - this record is less than - the other - """ - - return self.value() > other.value() - - def __add__(self, other): - """ - Allows adding two records together - to produce a brand new record. - """ - - return self.__class__(value=(self.value() + other.value())) - - def __sub__(self, other): - """ - Allows subbing two records together - to produce a brand new record. - - ** Note this does reverse subtraction because - of the inverted nature of latency (lower is better) - """ - - return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/perf_latency_p99.py b/model_analyzer/record/types/perf_latency_p99.py index af4d06da4..46d352021 100755 --- a/model_analyzer/record/types/perf_latency_p99.py +++ b/model_analyzer/record/types/perf_latency_p99.py @@ -16,11 +16,11 @@ from functools import total_ordering -from model_analyzer.record.record import DecreasingRecord +from model_analyzer.record.types.perf_latency_base import PerfLatencyBase @total_ordering -class PerfLatencyP99(DecreasingRecord): +class PerfLatencyP99(PerfLatencyBase): """ A record for perf_analyzer latency metric """ @@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False): """ return "p99 Latency (ms)" - - def __eq__(self, other): - """ - Allows checking for - equality between two records - """ - - return self.value() == other.value() - - def __lt__(self, other): - """ - Allows checking if - this record is less than - the other - """ - - return self.value() > other.value() - - def __add__(self, other): - """ - Allows adding two records together - to produce a brand new record. - """ - - return self.__class__(value=(self.value() + other.value())) - - def __sub__(self, other): - """ - Allows subbing two records together - to produce a brand new record. - - ** Note this does reverse subtraction because - of the inverted nature of latency (lower is better) - """ - - return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/time_to_first_token_avg.py b/model_analyzer/record/types/time_to_first_token_avg.py new file mode 100755 index 000000000..28da5d294 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_avg.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenAvg(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_avg" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "Avg Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_base.py b/model_analyzer/record/types/time_to_first_token_base.py new file mode 100755 index 000000000..5ef6e9070 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_base.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.record import DecreasingRecord + + +@total_ordering +class TimeToFirstTokenBase(DecreasingRecord): + """ + A base class record for perf_analyzer time to first token metric + """ + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + def __eq__(self, other): + """ + Allows checking for + equality between two records + """ + + return self.value() == other.value() + + def __lt__(self, other): + """ + Allows checking if + this record is less than + the other + """ + + return self.value() > other.value() + + def __add__(self, other): + """ + Allows adding two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() + other.value())) + + def __sub__(self, other): + """ + Allows subbing two records together + to produce a brand new record. 
+ + ** Note this does reverse subtraction because + of the inverted nature of latency (lower is better) + """ + + return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/time_to_first_token_max.py b/model_analyzer/record/types/time_to_first_token_max.py new file mode 100755 index 000000000..f9ccc0a52 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_max.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenMax(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_max" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "Max Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_min.py b/model_analyzer/record/types/time_to_first_token_min.py new file mode 100755 index 000000000..4cc563c86 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_min.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenMin(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_min" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "Min Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p25.py b/model_analyzer/record/types/time_to_first_token_p25.py new file mode 100755 index 000000000..5938ca3eb --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p25.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP25(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p25" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p25 Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p50.py b/model_analyzer/record/types/time_to_first_token_p50.py new file mode 100755 index 000000000..a3440b456 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p50.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP50(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p50" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p50 Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p75.py b/model_analyzer/record/types/time_to_first_token_p75.py new file mode 100755 index 000000000..042972368 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p75.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP75(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p75" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p75 Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p90.py b/model_analyzer/record/types/time_to_first_token_p90.py new file mode 100755 index 000000000..853adbdb4 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p90.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP90(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p90" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p90 Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p95.py b/model_analyzer/record/types/time_to_first_token_p95.py new file mode 100755 index 000000000..6e466c4e2 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p95.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP95(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p95" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. 
+ """ + + return "p95 Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p99.py b/model_analyzer/record/types/time_to_first_token_p99.py new file mode 100755 index 000000000..24f2ff088 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p99.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP99(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p99" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. 
+ """ + + return "p99 Time To First Token (ms)" diff --git a/tests/test_cli.py b/tests/test_cli.py index 98ec60237..33a0dd4e0 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -65,6 +65,7 @@ def get_test_options(): OptionStruct("bool", "profile","--skip-summary-reports"), OptionStruct("bool", "profile","--skip-detailed-reports"), OptionStruct("bool", "profile","--always-report-gpu-metrics"), + #Int/Float options # Options format: # (int/float, MA step, long_option, short_option, test_value, expected_default_value) @@ -125,6 +126,7 @@ def get_test_options(): OptionStruct("string", "report", "--config-file", "-f", "baz", None, None), OptionStruct("string", "profile", "--triton-docker-shm-size", None, "1G", None, extra_commands=["--triton-launch-mode", "docker"]), OptionStruct("string", "profile","--run-config-search-mode", None, ["quick", "brute"], "brute", "SHOULD_FAIL"), + OptionStruct("string", "profile", "--model-type", None, ["generic", "LLM"], "generic", None), #List Options: # Options format: @@ -163,6 +165,7 @@ def get_test_options(): OptionStruct("noop", "yaml_profile", "weighting"), OptionStruct("noop", "yaml_profile", "triton_server_flags"), OptionStruct("noop", "yaml_profile", "perf_analyzer_flags"), + OptionStruct("noop", "yaml_profile", "genai_perf_flags"), OptionStruct("noop", "yaml_profile", "triton_docker_labels"), OptionStruct("noop", "yaml_profile", "triton_server_environment"), OptionStruct("noop", "yaml_profile", "triton_docker_args"), diff --git a/tests/test_config.py b/tests/test_config.py index f056eb76a..190075dea 100755 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -24,6 +24,9 @@ from model_analyzer.cli.cli import CLI from model_analyzer.config.input.config_command_profile import ConfigCommandProfile from model_analyzer.config.input.config_command_report import ConfigCommandReport +from model_analyzer.config.input.config_defaults import ( + DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS, +) from model_analyzer.config.input.config_enum import ConfigEnum from model_analyzer.config.input.config_list_generic import ConfigListGeneric from model_analyzer.config.input.config_list_numeric import ConfigListNumeric @@ -2356,6 +2359,28 @@ def test_mixing_request_rate_and_concurrency(self): with self.assertRaises(TritonModelAnalyzerException): self._evaluate_config(args, yaml_content, subcommand="profile") + def test_model_type_llm(self): + """ + Test that model type of LLM chooses the correct inference outputs + """ + args = [ + "model-analyzer", + "profile", + "--model-repository", + "cli-repository", + "--profile-models", + "modelA", + "--model-type", + "LLM", + ] + yaml_content = "" + + config = self._evaluate_config(args, yaml_content) + + self.assertEqual( + config.inference_output_fields, DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS + ) + def _test_request_rate_config_conflicts( self, base_args: List[Any], yaml_content: str ) -> None: diff --git a/tests/test_perf_analyzer.py b/tests/test_perf_analyzer.py index e95f0d4a1..0b57701b8 100755 --- a/tests/test_perf_analyzer.py +++ b/tests/test_perf_analyzer.py @@ -33,6 +33,16 @@ from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory from model_analyzer.record.types.gpu_utilization import GPUUtilization +from model_analyzer.record.types.inter_token_latency_avg import InterTokenLatencyAvg +from model_analyzer.record.types.inter_token_latency_max import InterTokenLatencyMax +from model_analyzer.record.types.inter_token_latency_min import 
InterTokenLatencyMin +from model_analyzer.record.types.inter_token_latency_p25 import InterTokenLatencyP25 +from model_analyzer.record.types.inter_token_latency_p50 import InterTokenLatencyP50 +from model_analyzer.record.types.inter_token_latency_p75 import InterTokenLatencyP75 +from model_analyzer.record.types.inter_token_latency_p90 import InterTokenLatencyP90 +from model_analyzer.record.types.inter_token_latency_p95 import InterTokenLatencyP95 +from model_analyzer.record.types.inter_token_latency_p99 import InterTokenLatencyP99 +from model_analyzer.record.types.output_token_throughput import OutputTokenThroughput from model_analyzer.record.types.perf_client_response_wait import PerfClientResponseWait from model_analyzer.record.types.perf_client_send_recv import PerfClientSendRecv from model_analyzer.record.types.perf_latency_avg import PerfLatencyAvg @@ -46,6 +56,15 @@ ) from model_analyzer.record.types.perf_server_queue import PerfServerQueue from model_analyzer.record.types.perf_throughput import PerfThroughput +from model_analyzer.record.types.time_to_first_token_avg import TimeToFirstTokenAvg +from model_analyzer.record.types.time_to_first_token_max import TimeToFirstTokenMax +from model_analyzer.record.types.time_to_first_token_min import TimeToFirstTokenMin +from model_analyzer.record.types.time_to_first_token_p25 import TimeToFirstTokenP25 +from model_analyzer.record.types.time_to_first_token_p50 import TimeToFirstTokenP50 +from model_analyzer.record.types.time_to_first_token_p75 import TimeToFirstTokenP75 +from model_analyzer.record.types.time_to_first_token_p90 import TimeToFirstTokenP90 +from model_analyzer.record.types.time_to_first_token_p95 import TimeToFirstTokenP95 +from model_analyzer.record.types.time_to_first_token_p99 import TimeToFirstTokenP99 from model_analyzer.triton.client.client_factory import TritonClientFactory from model_analyzer.triton.server.server_config import TritonServerConfig from model_analyzer.triton.server.server_factory import TritonServerFactory @@ -248,7 +267,10 @@ def test_perf_analyzer_ssl_args(self): ) self.assertEqual(self.config.to_cli_string(), expected_cli_str) - def test_run(self): + def test_pa_csv_output(self): + """ + Tests the ability to read PA's CSV output + """ server_config = TritonServerConfig() server_config["model-repository"] = MODEL_REPOSITORY_PATH @@ -287,122 +309,40 @@ def test_run(self): self.assertEqual(len(records[TEST_MODEL_NAME]), 1) self.assertEqual(records[TEST_MODEL_NAME][0].value(), 5) - # Test p90 latency parsing - perf_metrics = [PerfLatencyP90] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 4.7) - - # Test p95 latency parsing - perf_metrics = [PerfLatencyP95] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 4.8) - - # Test p99 latency parsing - perf_metrics = [PerfLatencyP99] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), 
patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 4.9) + # Test latency parsing + self._test_metrics_from_csv(perf_analyzer, pa_csv_mock, [PerfLatencyP90], [4.7]) + self._test_metrics_from_csv(perf_analyzer, pa_csv_mock, [PerfLatencyP95], [4.8]) + self._test_metrics_from_csv(perf_analyzer, pa_csv_mock, [PerfLatencyP99], [4.9]) # Test throughput parsing - perf_metrics = [PerfThroughput] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 46.8) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfThroughput], [46.8] + ) # Test client response wait - perf_metrics = [PerfClientResponseWait] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.314) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfClientResponseWait], [0.314] + ) # Test server queue - perf_metrics = [PerfServerQueue] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.018) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerQueue], [0.018] + ) # Test server compute infer - perf_metrics = [PerfServerComputeInfer] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.065) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerComputeInfer], [0.065] + ) # Test server compute input - perf_metrics = [PerfServerComputeInput] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.034) - - # Test server compute infer - perf_metrics = [PerfServerComputeOutput] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerComputeInput], [0.034] + ) - records = perf_analyzer.get_perf_records() - 
self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.016) + # Test server compute output + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerComputeOutput], [0.016] + ) # Test Avg GPU Utilizations. Perf metric is ignored for get_gpu_records() gpu_metrics = [GPUUtilization, PerfLatencyAvg] @@ -544,6 +484,206 @@ def test_run(self): self.assertTrue(perf_analyzer.run(perf_metrics)) self.server.stop() + def test_pa_llm_csv_output(self): + """ + Tests the ability to read PA's LLM CSV output + """ + server_config = TritonServerConfig() + server_config["model-repository"] = MODEL_REPOSITORY_PATH + + # Create server, client, PerfAnalyzer, and wait for server ready + self.server = TritonServerFactory.create_server_local( + path=TRITON_LOCAL_BIN_PATH, config=server_config, gpus=self.gpus + ) + + perf_analyzer = PerfAnalyzer( + path=PERF_BIN_PATH, + config=self.run_config, + max_retries=10, + timeout=100, + max_cpu_util=50, + model_type="LLM", + ) + self.client = TritonClientFactory.create_grpc_client(server_url=TEST_GRPC_URL) + self.server.start() + self.client.wait_for_server_ready(num_retries=1) + + pa_llm_csv_mock = """Metric,avg,min,max,p99,p95,p90,p75,p50,p25\n""" + pa_llm_csv_mock += """Time To First Token (ns),4238735,3367978,6702240,6371118,5344958,5006259,4841394,4146648,3484484\n""" + pa_llm_csv_mock += """Inter Token Latency (ns),27202264,3849435,138324924,28283424,27737593,27469154,27067290,26979956,26926962\n""" + pa_llm_csv_mock += """Request Latency (ns),3363927003,3367978,14238834483,14091273510,13740917508,13692672723,3752510140,4846258,3612270\n""" + pa_llm_csv_mock += """Num Output Token,126,0,584,562,509,505,135,0,0\n""" + pa_llm_csv_mock += """\n""" + pa_llm_csv_mock += """Metric,Value\n""" + pa_llm_csv_mock += """Output Token Throughput (per sec),36.37\n""" + pa_llm_csv_mock += """Request Throughput (per sec),0.29""" + + # Test all Time to first token values + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenAvg], + [4238.735], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenMin], + [3367.978], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenMax], + [6702.240], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP99], + [6371.118], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP95], + [5344.958], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP90], + [5006.259], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP75], + [4841.394], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP50], + [4146.648], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP25], + [3484.484], + is_llm=True, + ) + + # Test all Inter token latency values + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyAvg], + [27202.264], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyMin], + [3849.435], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyMax], + [138324.924], + is_llm=True, + ) + 
self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP99], + [28283.424], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP95], + [27737.593], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP90], + [27469.154], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP75], + [27067.290], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP50], + [26979.956], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP25], + [26926.962], + is_llm=True, + ) + + # Test output token throughput + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [OutputTokenThroughput], + [36.37], + is_llm=True, + ) + + # Test combination + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP90, InterTokenLatencyP50, OutputTokenThroughput], + [5006.259, 26979.956, 36.37], + is_llm=True, + ) + + def _test_metrics_from_csv( + self, perf_analyzer, read_data, metrics, expected_values, is_llm=False + ): + with patch( + "model_analyzer.perf_analyzer.perf_analyzer.open", + mock_open(read_data=read_data), + ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): + perf_analyzer.run(metrics) + + if is_llm: + records = perf_analyzer.get_llm_records() + else: + records = perf_analyzer.get_perf_records() + + self.assertEqual(len(records[TEST_MODEL_NAME]), len(expected_values)) + for i, expected_value in enumerate(expected_values): + self.assertEqual(records[TEST_MODEL_NAME][i].value(), expected_value) + def test_measurement_interval_increase(self): server_config = TritonServerConfig() server_config["model-repository"] = MODEL_REPOSITORY_PATH diff --git a/tests/test_record_types.py b/tests/test_record_types.py index 4bd6d8b32..1279e06df 100755 --- a/tests/test_record_types.py +++ b/tests/test_record_types.py @@ -49,6 +49,24 @@ def setUp(self): "perf_latency_p90", "perf_latency_p95", "perf_latency_p99", + "inter_token_latency_min", + "inter_token_latency_max", + "inter_token_latency_avg", + "inter_token_latency_p25", + "inter_token_latency_p50", + "inter_token_latency_p75", + "inter_token_latency_p90", + "inter_token_latency_p95", + "inter_token_latency_p99", + "time_to_first_token_min", + "time_to_first_token_max", + "time_to_first_token_avg", + "time_to_first_token_p25", + "time_to_first_token_p50", + "time_to_first_token_p75", + "time_to_first_token_p90", + "time_to_first_token_p95", + "time_to_first_token_p99", "gpu_used_memory", "cpu_used_ram", "perf_server_compute_infer", @@ -65,6 +83,7 @@ def setUp(self): record_types[k] for k in [ "perf_throughput", + "output_token_throughput", "gpu_free_memory", "gpu_utilization", "cpu_available_ram",
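The new time-to-first-token and inter-token-latency record classes added above all inherit the inverted comparison and reverse-subtraction behaviour of `DecreasingRecord` (via `TimeToFirstTokenBase` / `PerfLatencyBase`), which is what lets lower latencies rank as "better" when results are sorted or aggregated. Below is a minimal sketch of that behaviour, assuming the `model_analyzer` package built from this change set is importable; the values are illustrative and not taken from the PR's tests.

```python
# Sketch only: demonstrates the DecreasingRecord semantics of the new
# LLM latency records introduced in this change set.
from model_analyzer.record.types.time_to_first_token_p99 import TimeToFirstTokenP99

fast = TimeToFirstTokenP99(value=3500.0)  # lower time to first token (better)
slow = TimeToFirstTokenP99(value=6400.0)  # higher time to first token (worse)

# For DecreasingRecord subclasses, __lt__ compares with ">", so the record
# with the larger latency value sorts as "less than" the faster one.
assert slow < fast

# Subtraction is reversed for the same reason: fast - slow returns a new
# record holding other.value() - self.value(), i.e. a positive improvement.
improvement = fast - slow
assert improvement.value() == 2900.0

# The header() classmethod supplies the column title used in reports.
print(TimeToFirstTokenP99.header())  # "p99 Time To First Token (ms)"
```

The same ordering rules apply to every `inter_token_latency_*` and `time_to_first_token_*` tag registered in `tests/test_record_types.py`, while `output_token_throughput` joins the increasing-record group alongside `perf_throughput`.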