Capture LLM metrics from PA #774

Merged: 13 commits, Oct 17, 2023
103 changes: 94 additions & 9 deletions model_analyzer/perf_analyzer/perf_analyzer.py
@@ -16,6 +16,7 @@

import csv
import glob
import json
import logging
import os
import re
@@ -25,6 +26,7 @@
from typing import Dict, List

import psutil
from numpy import mean

from model_analyzer.constants import (
INTERVAL_SLEEP_TIME,
@@ -118,6 +120,14 @@ def get_gpu_metrics():
        ]
        return gpu_metrics

    @staticmethod
    def get_llm_metrics():
        llm_metrics = [
            llm_metric[PerfAnalyzer.RECORD_CLASS]
            for llm_metric in PerfAnalyzer.llm_metric_table
        ]
        return llm_metrics

def __init__(self, path, config, max_retries, timeout, max_cpu_util):
"""
Parameters
@@ -143,6 +153,7 @@ def __init__(self, path, config, max_retries, timeout, max_cpu_util):
self._output = ""
self._perf_records = {}
self._gpu_records = []
self._llm_records = {}
self._max_cpu_util = max_cpu_util

def run(self, metrics, env=None):
@@ -216,6 +227,19 @@ def get_gpu_records(self):

        return self._gpu_records

    def get_llm_records(self):
        """
        Returns
        -------
        The LLM records from the last perf_analyzer run
        """

        if self._llm_records:
            return self._llm_records
        raise TritonModelAnalyzerException(
            "Attempted to get perf_analyzer results without calling run first."
        )

def output(self):
"""
Returns
@@ -457,21 +481,82 @@ def _parse_outputs(self, metrics):
logger.debug(
f"Reading PA results from {perf_config['latency-report-file']}"
)
with open(perf_config["latency-report-file"], mode="r") as f:
csv_reader = csv.DictReader(f, delimiter=",")

for row in csv_reader:
self._perf_records[
perf_config["model-name"]
] = self._extract_perf_records_from_row(metrics, row)
self._gpu_records = self._extract_gpu_records_from_row(metrics, row)
self._extract_gpu_records(perf_config, metrics)
self._extract_llm_records(perf_config, metrics)

for perf_config in [
mrc.perf_config() for mrc in self._config.model_run_configs()
]:
# Remove the latency file and all associated composing model latency files
# Remove the latency/profile export files and all associated composing model latency files
for f in glob.glob(f"*{perf_config['latency-report-file']}"):
os.remove(f)
for f in glob.glob(f"*{perf_config['profile-export-file']}"):
os.remove(f)

    def _extract_gpu_records(self, perf_config, metrics):
        if perf_config["profile-export-file"]:
            return

        with open(perf_config["latency-report-file"], mode="r") as f:
            csv_reader = csv.DictReader(f, delimiter=",")

            for row in csv_reader:
                self._perf_records[
                    perf_config["model-name"]
                ] = self._extract_perf_records_from_row(metrics, row)
                self._gpu_records = self._extract_gpu_records_from_row(metrics, row)

    def _extract_llm_records(self, perf_config, metrics):
        if not perf_config["profile-export-file"]:
            return

        self._llm_records[perf_config["model-name"]] = []

        with open(perf_config["profile-export-file"], mode="r") as f:
            llm_output = json.load(f)

            avg_first_token_latency = self._calculate_avg_first_token_latency(
                llm_output
            )
            record = PerfAnalyzer.llm_metric_table[0][PerfAnalyzer.RECORD_CLASS](
                value=avg_first_token_latency
            )  # type: ignore

            self._llm_records[perf_config["model-name"]].append(record)

            avg_token_to_token_latency = self._calculate_avg_token_to_token_latency(
                llm_output
            )
            record = PerfAnalyzer.llm_metric_table[1][PerfAnalyzer.RECORD_CLASS](
                value=avg_token_to_token_latency
            )  # type: ignore
            self._llm_records[perf_config["model-name"]].append(record)

    def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float:
        total_first_token_latencies = []
        for request in llm_output["experiments"][0]["requests"]:
            total_first_token_latencies.append(
                request["response_timestamps"][0] - request["timestamp"]
            )

        avg_first_token_latency = mean(total_first_token_latencies)

        return avg_first_token_latency

    def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float:
        token_to_token_latencies = []
        for request in llm_output["experiments"][0]["requests"]:
            response_to_response_latencies = []
            prev_response = request["response_timestamps"][0]
            for response in request["response_timestamps"][1:]:
                response_to_response_latencies.append(response - prev_response)
                prev_response = response

            token_to_token_latencies.append(mean(response_to_response_latencies))

        avg_token_to_token_latency = mean(token_to_token_latencies)

        return avg_token_to_token_latency
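For context, the two helpers above only rely on each request in the profile export carrying a send timestamp and a list of response timestamps. A minimal worked sketch of the arithmetic (not part of the diff; the timestamps are invented and the units are simply whatever perf_analyzer writes, e.g. nanoseconds):

from numpy import mean

# Hypothetical profile export with two requests (values invented for illustration)
llm_output = {
    "experiments": [
        {
            "requests": [
                {"timestamp": 100, "response_timestamps": [140, 160, 190]},
                {"timestamp": 200, "response_timestamps": [230, 250]},
            ]
        }
    ]
}

# Average first-token latency: mean of (first response - request timestamp)
#   mean([140 - 100, 230 - 200]) = 35.0
# Average token-to-token latency: per-request mean gap between consecutive
# responses, then averaged across requests
#   request 1: mean([20, 30]) = 25.0; request 2: mean([20]) = 20.0 -> 22.5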

def _extract_perf_records_from_row(
self, requested_metrics: List[Record], row_metrics: Dict[str, str]
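Both get_llm_metrics() and _extract_llm_records() above index PerfAnalyzer.llm_metric_table by PerfAnalyzer.RECORD_CLASS and rely on row 0 holding the first-token record class and row 1 the token-to-token record class. The table itself is defined outside this hunk; purely as a labeled assumption, its shape is along these lines:

from model_analyzer.record.types.avg_first_token_latency import AvgFirstTokenLatency
from model_analyzer.record.types.avg_token_to_token_latency import AvgTokenToTokenLatency

# Assumed shape only -- the real table and the RECORD_CLASS column index are
# defined elsewhere in perf_analyzer.py and may carry additional columns.
RECORD_CLASS = 0  # hypothetical column index for this sketch
llm_metric_table = [
    [AvgFirstTokenLatency],    # row 0 -> first-token latency record
    [AvgTokenToTokenLatency],  # row 1 -> token-to-token latency record
]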
9 changes: 8 additions & 1 deletion model_analyzer/perf_analyzer/perf_config.py
@@ -73,6 +73,7 @@ class PerfAnalyzerConfig:
"metrics-interval",
"bls-composing-models",
"request-parameter",
"request-period",
]

input_to_options = [
@@ -82,6 +83,7 @@
"url",
"protocol",
"latency-report-file",
"profile-export-file",
"http-header",
]

@@ -112,6 +114,7 @@ def __init__(self):
"-u": None,
"-i": None,
"-f": None,
"--profile-export-file": None,
"-H": None,
}
self._verbose = {"-v": None, "-v -v": None, "--verbose-csv": None}
@@ -123,6 +126,7 @@
"url": "-u",
"protocol": "-i",
"latency-report-file": "-f",
"profile-export-file": "--profile-export-file",
"http-header": "-H",
}

@@ -193,6 +197,9 @@ def update_config_from_profile_config(self, model_name, profile_config):
"verbose-csv": "--verbose-csv",
}

if profile_config.is_llm_model():
params.update({"profile-export-file": model_name + "-results.json"})

if profile_config.triton_launch_mode == "c_api":
params.update(
{
@@ -307,7 +314,7 @@ def remove_url_from_cli_string(cls, cli_string):
@classmethod
def remove_mrc_from_cli_string(cls, cli_string):
"""
utility function strips the measruement request count
utility function strips the measurement request count
from a cli string representation

Parameters
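Taken together, the perf_config.py changes wire a profile-export path through to the perf_analyzer command line for LLM models, alongside the usual latency report. A rough sketch of the effect (illustrative only; item assignment and to_cli_string() are assumed from existing PerfAnalyzerConfig behavior, and the full CLI depends on the rest of the config):

pa_config = PerfAnalyzerConfig()
pa_config["model-name"] = "my-model"
pa_config["latency-report-file"] = "my-model-results.csv"   # maps to -f
pa_config["profile-export-file"] = "my-model-results.json"  # maps to --profile-export-file (new)

# pa_config.to_cli_string() would then contain something along the lines of:
#   -m my-model -f my-model-results.csv --profile-export-file my-model-results.json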
20 changes: 15 additions & 5 deletions model_analyzer/record/metrics_manager.py
@@ -69,6 +69,8 @@ class MetricsManager:
"gpu_power_usage",
"cpu_available_ram",
"cpu_used_ram",
"avg_first_token_latency",
"avg_token_to_token_latency",
]

def __init__(self, config, client, server, gpus, result_manager, state_manager):
@@ -116,6 +118,7 @@ def __init__(self, config, client, server, gpus, result_manager, state_manager):
self._gpu_metrics,
self._perf_metrics,
self._cpu_metrics,
self._llm_metrics,
) = self._categorize_metrics(self.metrics, self._config.collect_cpu_metrics)
self._gpus = gpus
self._init_state()
@@ -160,21 +163,23 @@ def _categorize_metrics(metric_tags, collect_cpu_metrics=False):

Returns
-------
(list,list,list)
tuple of three lists (DCGM, PerfAnalyzer, CPU) metrics
(list,list,list,list)
tuple of four lists (DCGM, PerfAnalyzer, CPU, LLM) metrics
"""

gpu_metrics, perf_metrics, cpu_metrics = [], [], []
gpu_metrics, perf_metrics, cpu_metrics, llm_metrics = [], [], [], []
# Separates metrics and objectives into related lists
for metric in MetricsManager.get_metric_types(metric_tags):
if metric in PerfAnalyzer.get_gpu_metrics():
gpu_metrics.append(metric)
elif metric in PerfAnalyzer.get_perf_metrics():
perf_metrics.append(metric)
elif metric in PerfAnalyzer.get_llm_metrics():
llm_metrics.append(metric)
elif collect_cpu_metrics and (metric in CPUMonitor.cpu_metrics):
cpu_metrics.append(metric)

return gpu_metrics, perf_metrics, cpu_metrics
return gpu_metrics, perf_metrics, cpu_metrics, llm_metrics

def profile_server(self):
"""
@@ -556,6 +561,9 @@ def _run_perf_analyzer(
)

metrics_to_gather = self._perf_metrics + self._gpu_metrics
if self._config.is_llm_model():
metrics_to_gather += self._llm_metrics

status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env)

self._write_perf_analyzer_output(perf_output_writer, perf_analyzer)
@@ -564,7 +572,9 @@
self._handle_unsuccessful_perf_analyzer_run(perf_analyzer)
return (None, None)

perf_records = perf_analyzer.get_perf_records()
perf_records = (
perf_analyzer.get_perf_records() + perf_analyzer.get_llm_records()
)
gpu_records = perf_analyzer.get_gpu_records()

aggregated_perf_records = self._aggregate_perf_records(perf_records)
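Because _categorize_metrics now returns a fourth list, callers unpack a four-tuple, and _run_perf_analyzer only folds the LLM bucket into metrics_to_gather when the profile config reports an LLM model. A small illustrative sketch (the first two tags are existing model_analyzer metric tags; get_metric_types resolves tags to record classes):

gpu, perf, cpu, llm = MetricsManager._categorize_metrics(
    ["gpu_used_memory", "perf_throughput", "avg_first_token_latency"],
    collect_cpu_metrics=False,
)
# llm -> [AvgFirstTokenLatency]; it is appended to metrics_to_gather only
# when self._config.is_llm_model() is true.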
2 changes: 1 addition & 1 deletion model_analyzer/record/types/avg_first_token_latency.py
@@ -22,7 +22,7 @@
@total_ordering
class AvgFirstTokenLatency(DecreasingRecord):
"""
A record for perf_analyzer avg first token to token latency metric
A record for perf_analyzer average first token latency metric
"""

tag = "avg_first_token_latency"
2 changes: 1 addition & 1 deletion model_analyzer/record/types/avg_token_to_token_latency.py
@@ -22,7 +22,7 @@
@total_ordering
class AvgTokenToTokenLatency(DecreasingRecord):
"""
A record for perf_analyzer avg token-to-token latency metric
A record for perf_analyzer average token-to-token latency metric
"""

tag = "avg_token_to_token_latency"
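These two record types are what _extract_llm_records instantiates from the computed averages. A minimal usage sketch (value() is assumed from the base Record class, as with other model_analyzer records):

from model_analyzer.record.types.avg_first_token_latency import AvgFirstTokenLatency
from model_analyzer.record.types.avg_token_to_token_latency import AvgTokenToTokenLatency

first_token = AvgFirstTokenLatency(value=35.0)
token_to_token = AvgTokenToTokenLatency(value=22.5)

assert first_token.tag == "avg_first_token_latency"
assert token_to_token.tag == "avg_token_to_token_latency"
assert first_token.value() == 35.0  # value() assumed from the base Record class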
6 changes: 6 additions & 0 deletions tests/common/test_utils.py
@@ -235,6 +235,7 @@ def convert_avg_gpu_metrics_to_data(avg_gpu_metric_values):
def construct_perf_analyzer_config(
model_name="my-model",
output_file_name="my-model-results.csv",
export_file_name="my-model-results.json",
batch_size=DEFAULT_BATCH_SIZES,
concurrency=1,
request_rate=None,
@@ -253,6 +254,8 @@
The name of the model
output_file_name: str
The name of the output file
export_file_name: str
The name of the export file
batch_size: int
The batch size for this PA configuration
concurrency: int
@@ -279,6 +282,9 @@
pa_config._options["-f"] = output_file_name
pa_config._options["-b"] = batch_size

if llm_search_mode:
pa_config._options["--profile-export-file"] = export_file_name

if request_rate:
pa_config._args["request-rate-range"] = request_rate
elif llm_search_mode:
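A hedged sketch of how a test could exercise the new helper parameter (llm_search_mode is taken from the helper's signature referenced above, and the assertion pokes at the private _options dict exactly as the helper itself does):

pa_config = construct_perf_analyzer_config(
    model_name="my-model",
    export_file_name="my-model-results.json",
    llm_search_mode=True,
)
assert pa_config._options["--profile-export-file"] == "my-model-results.json"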