Merge LLM Metrics changes to main (#855)
* New Records for LLM metrics (#839)

* Adding new LLM metrics

* Adding base class for perf, inter_token, and time_to_first latency records

* Add --llm-mode option (#842)

* Adding CLI hook for LLM

* Changing to use --model-type

* Capture LLM metrics from genai-perf in MA (#844)

* Successfully reading from LLM CSV

* General cleanup

* All unit tests passing

* Fixing metric table typos

* Fixing typos

* Update constraints for LLMs (#845)

* Adding LLM values to list of possible constraints

* Fixing typo

* Adding new output fields for LLM (#846)

* Profiling model using genai-perf (#849)

* Initial changes to run genai-perf in MA

* Gating call to get LLM records

* Fixing capitalization issue

* Removing debug

* Adding TODO

---------

Co-authored-by: root <[email protected]>

* Add genai_perf CLI options to MA (#854)

* Added support for genai_perf CLI

* Remove dead code

* Removing genai_perf collateral

* Fixing codeQL issue

* Adding streaming to genai_perf_config

---------

Co-authored-by: root <[email protected]>
nv-braf and root authored Apr 8, 2024
1 parent 792f2a4 commit 8298d83
Showing 41 changed files with 2,329 additions and 285 deletions.
22 changes: 14 additions & 8 deletions docs/config.md
@@ -236,6 +236,9 @@ cpu_only_composing_models: <comma-delimited-string-list>
# Skips the generation of detailed reports and tables
[ skip_detailed_reports: <bool> | default: false]
# Type of model being profiled: generic or LLM
[ model_type: <string> | default: generic]
# Number of top configs to show in summary plots
[ num_configs_per_model: <int> | default: 3]
@@ -364,14 +367,17 @@ Before proceeding, it will be helpful to see the documentation on [Model Analyze

### `<constraint>`

A constraint specifies the bounds that determine a successful run. There are
three constraints allowed:

| Option Name | Units | Constraint | Description |
| :----------------- | :-------: | :--------: | :--------------------------------------------------- |
| `perf_throughput` | inf / sec | min | Specify minimum desired throughput. |
| `perf_latency_p99` | ms | max | Specify maximum tolerable latency or latency budget. |
| `gpu_used_memory` | MB | max | Specify maximum GPU memory used by model. |
A constraint specifies the bounds that determine a successful run. The table below shows examples
of the types of constraints allowed:

| Option Name | Units | Constraint | Description |
| :------------------------ | :-------: | :--------: | :----------------------------------------------------- |
| `perf_throughput` | inf / sec | min | Specify minimum desired throughput. |
| `perf_latency_p99` | ms | max | Specify maximum tolerable latency or latency budget. |
| `output_token_throughput` | tok / sec | min | Specify minimum desired output token throughput. |
| `inter_token_latency_p99` | ms        | max        | Specify maximum tolerable inter-token latency.          |
| `time_to_first_token_p99` | ms | max | Specify maximum tolerable time to first token latency. |
| `gpu_used_memory` | MB | max | Specify maximum GPU memory used by model. |
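
For example, a config could bound an LLM sweep with a mix of these constraints. This is a minimal sketch; the threshold values are illustrative, not recommendations:

```yaml
constraints:
  output_token_throughput:
    min: 500
  inter_token_latency_p99:
    max: 20
  time_to_first_token_p99:
    max: 150
```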

<br>

@@ -129,7 +129,7 @@ def _generate_subset(
self._send_results_to_generator(index)

def _make_run_config(self) -> RunConfig:
run_config = RunConfig(self._triton_env)
run_config = RunConfig(self._triton_env, self._models[0].genai_perf_flags())
for index in range(len(self._models)):
run_config.add_model_run_config(self._curr_model_run_configs[index])
return run_config
131 changes: 131 additions & 0 deletions model_analyzer/config/input/config_command_profile.py
@@ -31,6 +31,7 @@
)
from model_analyzer.constants import LOGGER_NAME
from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
from model_analyzer.perf_analyzer.genai_perf_config import GenaiPerfConfig
from model_analyzer.perf_analyzer.perf_config import PerfAnalyzerConfig
from model_analyzer.record.record import RecordType
from model_analyzer.triton.server.server_config import TritonServerConfig
@@ -50,7 +51,9 @@
DEFAULT_GPU_OUTPUT_FIELDS,
DEFAULT_GPUS,
DEFAULT_INFERENCE_OUTPUT_FIELDS,
DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS,
DEFAULT_MAX_RETRIES,
DEFAULT_MODEL_TYPE,
DEFAULT_MODEL_WEIGHTING,
DEFAULT_MONITORING_INTERVAL,
DEFAULT_NUM_CONFIGS_PER_MODEL,
@@ -297,6 +300,15 @@ def _fill_config(self):
description="Skips the generation of detailed summary reports and tables.",
)
)
self._add_config(
ConfigField(
"model_type",
flags=["--model-type"],
field_type=ConfigPrimitive(str),
default_value=DEFAULT_MODEL_TYPE,
description="Type of model being profiled: generic or LLM",
)
)

self._add_repository_configs()
self._add_client_configs()
@@ -364,6 +376,10 @@ def _add_profile_models_configs(self):
}
)

genai_perf_flags_scheme = ConfigObject(
schema={k: ConfigPrimitive(str) for k in GenaiPerfConfig.allowed_keys()}
)

triton_server_environment_scheme = ConfigObject(
schema={"*": ConfigPrimitive(str)}
)
@@ -444,6 +460,13 @@ def _add_profile_models_configs(self):
description="Allows custom configuration of the perf analyzer instances used by model analyzer.",
)
)
self._add_config(
ConfigField(
"genai_perf_flags",
field_type=genai_perf_flags_scheme,
description="Allows custom configuration of the GenAI Perf instances used by model analyzer.",
)
)
self._add_config(
ConfigField(
"triton_server_flags",
@@ -484,6 +507,11 @@ def _add_profile_models_configs(self):
"min": ConfigPrimitive(int),
}
),
"output_token_throughput": ConfigObject(
schema={
"min": ConfigPrimitive(int),
}
),
"perf_latency_avg": ConfigObject(
schema={
"max": ConfigPrimitive(int),
@@ -514,6 +542,96 @@ def _add_profile_models_configs(self):
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p99": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p95": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p90": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p75": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p50": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_p25": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_min": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_max": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"inter_token_latency_avg": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p99": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p95": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p90": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p75": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p50": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_p25": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_min": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_max": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
"time_to_first_token_avg": ConfigObject(
schema={
"max": ConfigPrimitive(int),
}
),
}
)
self._add_config(
@@ -560,6 +678,7 @@ def _add_profile_models_configs(self):
"weighting": ConfigPrimitive(type_=int),
"model_config_parameters": model_config_fields,
"perf_analyzer_flags": perf_analyzer_flags_scheme,
"genai_perf_flags": genai_perf_flags_scheme,
"triton_server_flags": triton_server_flags_scheme,
"triton_server_environment": triton_server_environment_scheme,
"triton_docker_args": triton_docker_args_scheme,
@@ -1344,6 +1463,12 @@ def _autofill_values(self):
if not self._fields["gpu_output_fields"].is_set_by_user():
self.gpu_output_fields = DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS

# Switch the default output fields if the user specified a model type of LLM
# and did not specify custom output fields
if self.model_type == "LLM":
if not self._fields["inference_output_fields"].is_set_by_user():
self.inference_output_fields = DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS

new_profile_models = {}
for i, model in enumerate(self.profile_models):
new_model = {"cpu_only": (model.cpu_only() or cpu_only)}
@@ -1447,6 +1572,12 @@ def _autofill_values(self):
else:
new_model["perf_analyzer_flags"] = model.perf_analyzer_flags()

# GenAI Perf flags
if not model.genai_perf_flags():
new_model["genai_perf_flags"] = self.genai_perf_flags
else:
new_model["genai_perf_flags"] = model.genai_perf_flags()

# triton server flags
if not model.triton_server_flags():
new_model["triton_server_flags"] = self.triton_server_flags
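Taken together, these fields allow a profile config to opt into LLM mode and pass per-model flags through to genai-perf. A hedged sketch follows: flag values are parsed as strings per the schema above, `streaming` is the only key this change confirms, and the full set comes from `GenaiPerfConfig.allowed_keys()`:

```yaml
model_type: LLM
profile_models:
  my_llm_model:
    genai_perf_flags:
      streaming: "true"
```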
15 changes: 15 additions & 0 deletions model_analyzer/config/input/config_defaults.py
@@ -68,6 +68,7 @@
DEFAULT_PERF_OUTPUT_FLAG = False
DEFAULT_PERF_MAX_AUTO_ADJUSTS = 10
DEFAULT_MEASUREMENT_MODE = "count_windows"
DEFAULT_MODEL_TYPE = "generic"

DEFAULT_ONLINE_PLOTS = {
"throughput_v_latency": {
@@ -118,6 +119,20 @@
"perf_throughput",
"perf_latency_p99",
]
DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS = [
"model_name",
"batch_size",
"concurrency",
"model_config_path",
"instance_group",
"max_batch_size",
"satisfies_constraints",
"perf_throughput",
"perf_latency_p99",
"inter_token_latency_p99",
"time_to_first_token_p99",
"output_token_throughput",
]
DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS = [
"model_name",
"batch_size",
18 changes: 18 additions & 0 deletions model_analyzer/config/input/objects/config_model_profile_spec.py
@@ -33,6 +33,7 @@ def __init__(
parameters=None,
model_config_parameters=None,
perf_analyzer_flags=None,
genai_perf_flags=None,
triton_server_flags=None,
triton_server_environment=None,
triton_docker_args=None,
@@ -58,6 +59,9 @@
perf_analyzer_flags : dict
The custom perf analyzer configuration
for this model
genai_perf_flags : dict
The custom GenAI perf configuration
for this model
triton_server_flags : dict
The configuration for the triton server instance launched
for this model
@@ -78,6 +82,7 @@
self._parameters = parameters
self._model_config_parameters = model_config_parameters
self._perf_analyzer_flags = perf_analyzer_flags
self._genai_perf_flags = genai_perf_flags
self._triton_server_flags = triton_server_flags
self._triton_server_environment = triton_server_environment
self._triton_docker_args = triton_docker_args
@@ -162,6 +167,16 @@ def perf_analyzer_flags(self):

return self._perf_analyzer_flags

def genai_perf_flags(self):
"""
Returns
-------
dict:
the genai_perf_flags
"""

return self._genai_perf_flags

def triton_server_flags(self):
"""
Returns
@@ -304,4 +319,7 @@ def __repr__(self):
if self._perf_analyzer_flags:
model_object["perf_analyzer_flags"] = self._perf_analyzer_flags

if self._genai_perf_flags:
model_object["genai_perf_flags"] = self._genai_perf_flags

return str(model_object)
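
A short usage sketch of the new accessor; the leading positional model-name argument is assumed here, since it is not visible in this hunk:

```python
# Hypothetical: build a profile spec carrying custom genai-perf flags.
spec = ConfigModelProfileSpec(
    "my_llm_model",  # assumed positional model-name argument
    genai_perf_flags={"streaming": "true"},
)

print(spec.genai_perf_flags())  # -> {'streaming': 'true'}
print(spec)  # __repr__ now includes genai_perf_flags when set
```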
11 changes: 10 additions & 1 deletion model_analyzer/config/run/run_config.py
@@ -17,6 +17,7 @@
from typing import List

from model_analyzer.config.run.model_run_config import ModelRunConfig
from model_analyzer.perf_analyzer.genai_perf_config import GenaiPerfConfig


class RunConfig:
@@ -25,16 +26,21 @@ class RunConfig:
at the same time in Perf Analyzer
"""

def __init__(self, triton_env):
def __init__(self, triton_env, genai_perf_flags=None):
"""
Parameters
----------
triton_env : dict
A dictionary of environment variables to set
when launching tritonserver
genai_perf_flags: dict
The set of flags used when calling genai_perf for LLM models
"""

self._triton_env = triton_env
self._genai_perf_config = GenaiPerfConfig()
self._genai_perf_config.update_config(genai_perf_flags)
self._model_run_configs: List[ModelRunConfig] = []

def add_model_run_config(self, model_run_config):
@@ -103,6 +109,9 @@ def triton_environment(self):

return self._triton_env

def genai_perf_config(self):
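"""Returns the GenaiPerfConfig for this RunConfig"""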
return self._genai_perf_config

def models_name(self):
"""Returns a single comma-joined name of the original model names"""
return ",".join([mrc.model_name() for mrc in self.model_run_configs()])
4 changes: 4 additions & 0 deletions model_analyzer/constants.py
@@ -70,3 +70,7 @@

# Model analyzer package name
PACKAGE_NAME = "triton-model-analyzer"

# GENAI-PERF
GENAI_PERF_CSV = "profile_export_genai_perf.csv"
GENAI_PERF_COLLATERAL = ["llm_inputs.json", "profile_export.json"]
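
A sketch of how these constants might be consumed when gathering LLM records; the artifact-directory layout is an assumption, not confirmed by this diff:

```python
import csv
import os

from model_analyzer.constants import GENAI_PERF_CSV


def read_genai_perf_rows(artifact_dir):
    """Return metric rows from genai-perf's CSV export, or [] if absent."""
    csv_path = os.path.join(artifact_dir, GENAI_PERF_CSV)
    if not os.path.exists(csv_path):
        return []
    with open(csv_path, newline="") as f:
        return list(csv.DictReader(f))
```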