From 98f1273b06ef16dd05582e51d1108fccc6a2519e Mon Sep 17 00:00:00 2001
From: David Yastremsky <dyastremsky@nvidia.com>
Date: Thu, 7 Mar 2024 11:49:32 -0800
Subject: [PATCH 1/6] Add export to CSV file functionality

---
 .../genai-pa/genai_pa/llm_metrics.py          | 66 +++++++++++++++----
 .../perf_analyzer/genai-pa/genai_pa/main.py   |  2 +
 2 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
index 29badd843..2ae3c957b 100755
--- a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
+++ b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import contextlib
+import csv
 import io
 from dataclasses import dataclass
 from itertools import pairwise
@@ -51,6 +52,17 @@ class LLMMetrics:
     inter_token_latencies: list[int]
     output_token_throughputs: list[int]
 
+    metric_labels = [
+        "time_to_first_token",
+        "inter_token_latency",
+    ]
+
+    time_fields = [
+        "inter_token_latency",
+        "time_to_first_token",
+        "end_to_end_latency",
+    ]
+
     def get_base_name(self, attr_name: str) -> str:
         # Attempted to extract and store the mapping as a dataclass member as a
         # dictionary but encountered two issues: (1) Python does not allow
@@ -124,23 +136,17 @@ def __repr__(self):
         return f"Statistics({attr_strs})"
 
     def _is_time_field(self, field: str):
-        time_fields = [
-            "inter_token_latency",
-            "time_to_first_token",
-            "end_to_end_latency",
-        ]
-        return field in time_fields
+        return field in LLMMetrics.time_fields
 
     def pretty_print(self):
         table = Table(title="PA LLM Metrics")
 
         table.add_column("Statistic", justify="right", style="cyan", no_wrap=True)
-        stats = ["avg", "min", "max", "p99", "p95", "p90", "p75", "p50", "p25"]
+        stats = ["avg", "min", "max", "p99", "p90", "p75"]
         for stat in stats:
             table.add_column(stat, justify="right", style="green")
 
-        metrics = ["inter_token_latency", "time_to_first_token"]
-        for metric in metrics:
+        for metric in LLMMetrics.metric_labels:
             formatted_metric = metric.replace("_", " ").capitalize()
             is_time_field = self._is_time_field(metric)
             if is_time_field:
@@ -155,6 +161,38 @@ def pretty_print(self):
         console = Console()
         console.print(table)
 
+    def export_to_csv(self, csv_filename: str):
+        header = [
+            "Statistic",
+            "avg",
+            "min",
+            "max",
+            "p99",
+            "p95",
+            "p90",
+            "p75",
+            "p50",
+            "p25",
+        ]
+
+        with open(csv_filename, mode="w", newline="") as csvfile:
+            csv_writer = csv.writer(csvfile)
+            csv_writer.writerow(header)
+
+            for metric in LLMMetrics.metric_labels:
+                formatted_metric = metric
+                is_time_field = self._is_time_field(metric)
+                if is_time_field:
+                    formatted_metric += "(ns)"
+
+                row_values = [formatted_metric]
+
+                for stat in header[1:]:
+                    value = self.__dict__.get(f"{stat}_{metric}", -1)
+                    row_values.append(f"{value:.0f}")
+
+                csv_writer.writerow(row_values)
+
 
 class LLMProfileData:
     """A class that calculates and aggregates all the LLM performance statistics
@@ -202,16 +240,16 @@ def _collect_llm_metrics(
         for request in requests:
             req_timestamp = request["timestamp"]
             res_timestamps = request["response_timestamps"]
-            res_outputs = request["response_outputs"]
+            # res_outputs = request["response_outputs"]
 
             # time to first token
             time_to_first_tokens.append(res_timestamps[0] - req_timestamp)
 
             # output token throughput
-            output_tokens = tokenizer(res_outputs)["input_ids"]
-            total_output_tokens = np.sum(list(map(len, output_tokens)))
-            req_latency = res_timestamps[-1] - req_timestamp
-            output_token_throughputs.append(total_output_tokens / req_latency)
+            # output_tokens = tokenizer(res_outputs)["input_ids"]
+            # total_output_tokens = np.sum(list(map(len, output_tokens)))
+            # req_latency = res_timestamps[-1] - req_timestamp
+            # output_token_throughputs.append(total_output_tokens / req_latency)
 
             # inter token latency
             for t1, t2 in pairwise(res_timestamps):
diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/main.py b/src/c++/perf_analyzer/genai-pa/genai_pa/main.py
index df4ca1bc0..d8182fad1 100755
--- a/src/c++/perf_analyzer/genai-pa/genai_pa/main.py
+++ b/src/c++/perf_analyzer/genai-pa/genai_pa/main.py
@@ -77,6 +77,8 @@ def report_output(metrics: LLMProfileData, args):
             "Neither concurrency_range nor request_rate_range was found in args when reporting metrics"
         )
     stats = metrics.get_statistics(infer_mode, int(load_level))
+    if args.profile_export_file is not None:
+        stats.export_to_csv(args.profile_export_file)
     stats.pretty_print()
 
 

From 322adbd4644bbd8b6b883bb027adab4073997741 Mon Sep 17 00:00:00 2001
From: David Yastremsky <dyastremsky@nvidia.com>
Date: Thu, 7 Mar 2024 12:06:17 -0800
Subject: [PATCH 2/6] Create genai-pa export file

---
 src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py | 8 ++++----
 src/c++/perf_analyzer/genai-pa/genai_pa/main.py        | 6 ++++--
 src/c++/perf_analyzer/genai-pa/genai_pa/parser.py      | 8 +++++---
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
index 2ae3c957b..5be8efce8 100755
--- a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
+++ b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
@@ -246,10 +246,10 @@ def _collect_llm_metrics(
             time_to_first_tokens.append(res_timestamps[0] - req_timestamp)
 
             # output token throughput
-            # output_tokens = tokenizer(res_outputs)["input_ids"]
-            # total_output_tokens = np.sum(list(map(len, output_tokens)))
-            # req_latency = res_timestamps[-1] - req_timestamp
-            # output_token_throughputs.append(total_output_tokens / req_latency)
+            output_tokens = tokenizer(res_outputs)["input_ids"]
+            total_output_tokens = np.sum(list(map(len, output_tokens)))
+            req_latency = res_timestamps[-1] - req_timestamp
+            output_token_throughputs.append(total_output_tokens / req_latency)
 
             # inter token latency
             for t1, t2 in pairwise(res_timestamps):
diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/main.py b/src/c++/perf_analyzer/genai-pa/genai_pa/main.py
index d8182fad1..919a3a2ce 100755
--- a/src/c++/perf_analyzer/genai-pa/genai_pa/main.py
+++ b/src/c++/perf_analyzer/genai-pa/genai_pa/main.py
@@ -77,8 +77,10 @@ def report_output(metrics: LLMProfileData, args):
             "Neither concurrency_range nor request_rate_range was found in args when reporting metrics"
         )
     stats = metrics.get_statistics(infer_mode, int(load_level))
-    if args.profile_export_file is not None:
-        stats.export_to_csv(args.profile_export_file)
+    export_csv_name = args.profile_export_file.with_name(
+        args.profile_export_file.stem + "_genai_pa" + args.profile_export_file.suffix
+    )
+    stats.export_to_csv(export_csv_name)
     stats.pretty_print()
 
 
diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
index c45026f63..5372b7ebc 100644
--- a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
+++ b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
@@ -117,9 +117,11 @@ def _add_profile_args(parser):
         "--profile-export-file",
         type=Path,
         default="profile_export.json",
-        help="Specifies the path where the profile export will be "
-        "generated. By default, the profile export will not be "
-        "generated.",
+        help="Specifies the path where the perf_analyzer profile export will be "
+        "generated. By default, the profile export will be to profile_export.json. "
+        "The GenAi-PA file will be exported to <profile_export_file>_genai_pa. For example,"
+        "if the profile export file is profile_export.json, the GenAi-PA file will be "
+        "exported to profile_export_genai_pa.json.",
     )
     load_management_group.add_argument(
         "--request-rate",

From d18158e3e3f683d3b317e381e49e83e03d1fc13f Mon Sep 17 00:00:00 2001
From: David Yastremsky <dyastremsky@nvidia.com>
Date: Thu, 7 Mar 2024 12:07:05 -0800
Subject: [PATCH 3/6] Remove commented code

---
 src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
index 5be8efce8..350bd8a73 100755
--- a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
+++ b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
@@ -240,7 +240,7 @@ def _collect_llm_metrics(
         for request in requests:
             req_timestamp = request["timestamp"]
             res_timestamps = request["response_timestamps"]
-            # res_outputs = request["response_outputs"]
+            res_outputs = request["response_outputs"]
 
             # time to first token
             time_to_first_tokens.append(res_timestamps[0] - req_timestamp)

From 6f46de703c79cefcbac41b7eb9d3ea177cd4813b Mon Sep 17 00:00:00 2001
From: David Yastremsky <dyastremsky@nvidia.com>
Date: Thu, 7 Mar 2024 12:45:55 -0800
Subject: [PATCH 4/6] Clarify parser message

---
 src/c++/perf_analyzer/genai-pa/genai_pa/parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
index 5372b7ebc..5d5dbad75 100644
--- a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
+++ b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
@@ -119,8 +119,8 @@ def _add_profile_args(parser):
         default="profile_export.json",
         help="Specifies the path where the perf_analyzer profile export will be "
         "generated. By default, the profile export will be to profile_export.json. "
-        "The GenAi-PA file will be exported to <profile_export_file>_genai_pa. For example,"
-        "if the profile export file is profile_export.json, the GenAi-PA file will be "
+        "The GenAi-PA file will be exported to <profile_export_file>_genai_pa.<file_extension>. "
+        "For example, if the profile export file is profile_export.json, the GenAi-PA file will be "
         "exported to profile_export_genai_pa.json.",
     )
     load_management_group.add_argument(

From 057576fabe4997d07abbd22378d41c6c7bac4862 Mon Sep 17 00:00:00 2001
From: David Yastremsky <dyastremsky@nvidia.com>
Date: Thu, 7 Mar 2024 15:26:27 -0800
Subject: [PATCH 5/6] Add other metrics

---
 .../genai-pa/genai_pa/llm_metrics.py          | 66 +++++++++++++------
 1 file changed, 46 insertions(+), 20 deletions(-)

diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
index 3704e3f42..55e4dd9a3 100755
--- a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
+++ b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py
@@ -48,6 +48,26 @@
 class Metrics:
     """A base class for all the metrics class that contains common metrics."""
 
+    metric_labels = [
+        "time_to_first_token",
+        "inter_token_latency",
+        "request_latency",
+        "output_token_throughput",
+        "request_throughput",
+        "num_output_token",
+    ]
+
+    time_fields = [
+        "inter_token_latency",
+        "time_to_first_token",
+        "request_latency",
+    ]
+
+    throughput_fields = [
+        "request_throughput",
+        "output_token_throughput",
+    ]
+
     def __init__(
         self,
         request_throughputs: list[float] = [],
@@ -91,17 +111,6 @@ def __init__(
         self.output_token_throughputs = output_token_throughputs
         self.num_output_tokens = num_output_tokens
 
-        metric_labels = [
-            "time_to_first_token",
-            "inter_token_latency",
-        ]
-
-        time_fields = [
-            "inter_token_latency",
-            "time_to_first_token",
-            "end_to_end_latency",
-        ]
-
         # add base name mapping
         self._base_names["time_to_first_tokens"] = "time_to_first_token"
         self._base_names["inter_token_latencies"] = "inter_token_latency"
@@ -164,8 +173,11 @@ def __repr__(self):
         attr_strs = ",".join([f"{k}={v}" for k, v in self.__dict__.items()])
         return f"Statistics({attr_strs})"
 
+    def _is_throughput_field(self, field: str):
+        return field in Metrics.throughput_fields
+
     def _is_time_field(self, field: str):
-        return field in LLMMetrics.time_fields
+        return field in Metrics.time_fields
 
     def pretty_print(self):
         table = Table(title="PA LLM Metrics")
@@ -175,17 +187,25 @@ def pretty_print(self):
         for stat in stats:
             table.add_column(stat, justify="right", style="green")
 
-        for metric in LLMMetrics.metric_labels:
+        for metric in Metrics.metric_labels:
             formatted_metric = metric.replace("_", " ").capitalize()
-            is_time_field = self._is_time_field(metric)
-            if is_time_field:
+            if self._is_time_field(metric):
                 formatted_metric += " (ns)"
+            elif self._is_throughput_field(metric):
+                formatted_metric += " (per sec)"
+
             row_values = [formatted_metric]
 
             for stat in stats:
                 value = self.__dict__.get(f"{stat}_{metric}", -1)
                 row_values.append("{:,.0f}".format(value))
-            table.add_row(*row_values)
+
+            # No streaming means no inter-token latency: skip its all "-1" row.
+            if metric == "inter_token_latency" and all(
+                value == "-1" for value in row_values[1:]
+            ):
+                continue
+            table.add_row(*row_values)
 
         console = Console()
         console.print(table)
@@ -208,11 +228,12 @@ def export_to_csv(self, csv_filename: str):
             csv_writer = csv.writer(csvfile)
             csv_writer.writerow(header)
 
-            for metric in LLMMetrics.metric_labels:
+            for metric in Metrics.metric_labels:
                 formatted_metric = metric
-                is_time_field = self._is_time_field(metric)
-                if is_time_field:
+                if self._is_time_field(metric):
                     formatted_metric += "(ns)"
+                elif self._is_throughput_field(metric):
+                    formatted_metric += "(per sec)"
 
                 row_values = [formatted_metric]
 
@@ -220,7 +241,12 @@ def export_to_csv(self, csv_filename: str):
                     value = self.__dict__.get(f"{stat}_{metric}", -1)
                     row_values.append(f"{value:.0f}")
 
-                csv_writer.writerow(row_values)
+                # No streaming means no inter-token latency: skip its all "-1" row.
+                if metric == "inter_token_latency" and all(
+                    value == "-1" for value in row_values[1:]
+                ):
+                    continue
+                csv_writer.writerow(row_values)
 
 
 class ProfileDataParser:

From d57c35eca8d65dbbe4acbfa0d22ee2cff1112714 Mon Sep 17 00:00:00 2001
From: David Yastremsky <dyastremsky@nvidia.com>
Date: Thu, 7 Mar 2024 15:39:24 -0800
Subject: [PATCH 6/6] Use csv extension

---
 src/c++/perf_analyzer/genai-pa/genai_pa/main.py   | 2 +-
 src/c++/perf_analyzer/genai-pa/genai_pa/parser.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/main.py b/src/c++/perf_analyzer/genai-pa/genai_pa/main.py
index 3e524fd9b..75894495b 100755
--- a/src/c++/perf_analyzer/genai-pa/genai_pa/main.py
+++ b/src/c++/perf_analyzer/genai-pa/genai_pa/main.py
@@ -86,7 +86,7 @@ def report_output(metrics: LLMProfileDataParser, args):
         )
     stats = metrics.get_statistics(infer_mode, int(load_level))
     export_csv_name = args.profile_export_file.with_name(
-        args.profile_export_file.stem + "_genai_pa" + args.profile_export_file.suffix
+        args.profile_export_file.stem + "_genai_pa.csv"
     )
     stats.export_to_csv(export_csv_name)
     stats.pretty_print()
diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
index 66835d141..9e5890eef 100644
--- a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
+++ b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
@@ -155,9 +155,9 @@ def _add_profile_args(parser):
         default="profile_export.json",
         help="Specifies the path where the perf_analyzer profile export will be "
         "generated. By default, the profile export will be to profile_export.json. "
-        "The GenAi-PA file will be exported to <profile_export_file>_genai_pa.<file_extension>. "
+        "The GenAi-PA file will be exported to <profile_export_file>_genai_pa.csv. "
         "For example, if the profile export file is profile_export.json, the GenAi-PA file will be "
-        "exported to profile_export_genai_pa.json.",
+        "exported to profile_export_genai_pa.csv.",
     )
     load_management_group.add_argument(
         "--request-rate",