Export GenAi-PA results to CSV file #497

Merged: 8 commits on Mar 7, 2024
Changes from 7 commits
88 changes: 76 additions & 12 deletions src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
@@ -27,6 +27,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import contextlib
import csv
import io
import json
from dataclasses import dataclass
@@ -47,6 +48,26 @@
class Metrics:
    """A base class for all metric classes, holding the common metrics."""

    metric_labels = [
        "time_to_first_token",
        "inter_token_latency",
        "request_latency",
        "output_token_throughput",
        "request_throughput",
        "num_output_token",
    ]

    time_fields = [
        "inter_token_latency",
        "time_to_first_token",
        "request_latency",
    ]

    throughput_fields = [
        "request_throughput",
        "output_token_throughput",
    ]

    def __init__(
        self,
        request_throughputs: list[float] = [],
@@ -152,38 +173,81 @@ def __repr__(self):
        attr_strs = ",".join([f"{k}={v}" for k, v in self.__dict__.items()])
        return f"Statistics({attr_strs})"

    def _is_throughput_field(self, field: str):
        return field in Metrics.throughput_fields

    def _is_time_field(self, field: str):
-        time_fields = [
-            "inter_token_latency",
-            "time_to_first_token",
-            "end_to_end_latency",
-        ]
-        return field in time_fields
+        return field in Metrics.time_fields

    def pretty_print(self):
        table = Table(title="PA LLM Metrics")

        table.add_column("Statistic", justify="right", style="cyan", no_wrap=True)
-        stats = ["avg", "min", "max", "p99", "p95", "p90", "p75", "p50", "p25"]
+        stats = ["avg", "min", "max", "p99", "p90", "p75"]
nv-hwoo marked this conversation as resolved.
        for stat in stats:
            table.add_column(stat, justify="right", style="green")

-        metrics = ["inter_token_latency", "time_to_first_token"]
-        for metric in metrics:
+        for metric in Metrics.metric_labels:
            formatted_metric = metric.replace("_", " ").capitalize()
-            is_time_field = self._is_time_field(metric)
-            if is_time_field:
+            if self._is_time_field(metric):
                formatted_metric += " (ns)"
+            elif self._is_throughput_field(metric):
+                formatted_metric += " (per sec)"

            row_values = [formatted_metric]

            for stat in stats:
                value = self.__dict__.get(f"{stat}_{metric}", -1)
                row_values.append("{:,.0f}".format(value))

-            table.add_row(*row_values)
+            # Without streaming, there is no inter-token latency available.
+            # (The row values were formatted as strings above, so the missing
+            # sentinel must be compared as "-1", not the integer -1.)
+            if metric == "inter_token_latency" and all(
+                value == "-1" for value in row_values[1:]
+            ):
+                continue
+            table.add_row(*row_values)

        console = Console()
        console.print(table)

    def export_to_csv(self, csv_filename: str):
        header = [
            "Statistic",
            "avg",
            "min",
            "max",
            "p99",
            "p95",
            "p90",
            "p75",
            "p50",
            "p25",
        ]

        with open(csv_filename, mode="w", newline="") as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(header)

            for metric in Metrics.metric_labels:
                formatted_metric = metric
                if self._is_time_field(metric):
                    formatted_metric += "(ns)"
                elif self._is_throughput_field(metric):
                    formatted_metric += "(per sec)"

                row_values = [formatted_metric]

                for stat in header[1:]:
                    value = self.__dict__.get(f"{stat}_{metric}", -1)
                    row_values.append(f"{value:.0f}")

                # Without streaming, there is no inter-token latency available.
                # (As in pretty_print, the values are strings here, so compare
                # against "-1" rather than the integer -1.)
                if metric == "inter_token_latency" and all(
                    value == "-1" for value in row_values[1:]
                ):
                    continue
                csv_writer.writerow(row_values)
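
Both pretty_print and export_to_csv assume the Statistics object exposes one attribute per aggregate, named <stat>_<metric> (e.g. avg_time_to_first_token), with -1 as the missing-value sentinel. A minimal sketch of that convention, using a hypothetical FakeStats stand-in rather than the real Statistics class (values are made up for illustration):

# Hypothetical stand-in for Statistics; not part of this diff.
class FakeStats:
    def __init__(self):
        self.avg_time_to_first_token = 1_200_000  # ns
        self.p99_time_to_first_token = 2_500_000  # ns

stats = FakeStats()
for stat in ["avg", "p99", "p50"]:
    # Same lookup pattern as pretty_print()/export_to_csv(); missing
    # aggregates (here p50) fall back to the -1 sentinel.
    value = stats.__dict__.get(f"{stat}_time_to_first_token", -1)
    print(stat, value)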


class ProfileDataParser:
"""Base profile data parser class that reads the profile data JSON file to
4 changes: 4 additions & 0 deletions src/c++/perf_analyzer/genai-pa/genai_pa/main.py
@@ -85,6 +85,10 @@ def report_output(metrics: LLMProfileDataParser, args):
"Neither concurrency_range nor request_rate_range was found in args when reporting metrics"
)
stats = metrics.get_statistics(infer_mode, int(load_level))
export_csv_name = args.profile_export_file.with_name(
args.profile_export_file.stem + "_genai_pa" + args.profile_export_file.suffix
Contributor review comment: Needs to be .csv, not .json.

    )
    stats.export_to_csv(export_csv_name)
    stats.pretty_print()


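One possible way to address the review comment above is to force a .csv suffix with pathlib. The sketch below is illustrative only, not the code in this diff:

from pathlib import Path

# Hypothetical fix for the review comment: keep the "_genai_pa" stem
# but always emit a .csv extension, whatever the profile export suffix is.
profile_export_file = Path("profile_export.json")
export_csv_name = profile_export_file.with_name(
    profile_export_file.stem + "_genai_pa"
).with_suffix(".csv")
print(export_csv_name)  # profile_export_genai_pa.csv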
8 changes: 5 additions & 3 deletions src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
@@ -153,9 +153,11 @@ def _add_profile_args(parser):
"--profile-export-file",
type=Path,
default="profile_export.json",
help="Specifies the path where the profile export will be "
"generated. By default, the profile export will not be "
"generated.",
help="Specifies the path where the perf_analyzer profile export will be "
"generated. By default, the profile export will be to profile_export.json. "
"The GenAi-PA file will be exported to <profile_export_file>_genai_pa.<file_extension>. "
"For example, if the profile export file is profile_export.json, the GenAi-PA file will be "
"exported to profile_export_genai_pa.json.",
Contributor review comment: csv file, right? Not json?

    )
    load_management_group.add_argument(
        "--request-rate",
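For context, here is a stand-alone sketch of how this flag and the derived GenAi-PA file name interact. It approximates, rather than reproduces, the parser wiring above, and the example path is hypothetical:

import argparse
from pathlib import Path

# Stand-alone approximation of the --profile-export-file flag above.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--profile-export-file",
    type=Path,
    default="profile_export.json",
)
args = parser.parse_args(["--profile-export-file", "runs/my_export.json"])

# The derived GenAi-PA name follows <stem>_genai_pa.<file_extension>,
# matching the help text (and the review question about .csv vs .json).
derived = args.profile_export_file.with_name(
    args.profile_export_file.stem + "_genai_pa" + args.profile_export_file.suffix
)
print(derived)  # runs/my_export_genai_pa.json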