From 98f1273b06ef16dd05582e51d1108fccc6a2519e Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Thu, 7 Mar 2024 11:49:32 -0800 Subject: [PATCH 1/6] Add export to CSV file functionality --- .../genai-pa/genai_pa/llm_metrics.py | 66 +++++++++++++++---- .../perf_analyzer/genai-pa/genai_pa/main.py | 2 + 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py index 29badd843..2ae3c957b 100755 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import contextlib +import csv import io from dataclasses import dataclass from itertools import pairwise @@ -51,6 +52,17 @@ class LLMMetrics: inter_token_latencies: list[int] output_token_throughputs: list[int] + metric_labels = [ + "time_to_first_token", + "inter_token_latency", + ] + + time_fields = [ + "inter_token_latency", + "time_to_first_token", + "end_to_end_latency", + ] + def get_base_name(self, attr_name: str) -> str: # Attempted to extract and store the mapping as a dataclass member as a # dictionary but encountered two issues: (1) Python does not allow @@ -124,23 +136,17 @@ def __repr__(self): return f"Statistics({attr_strs})" def _is_time_field(self, field: str): - time_fields = [ - "inter_token_latency", - "time_to_first_token", - "end_to_end_latency", - ] - return field in time_fields + return field in LLMMetrics.time_fields def pretty_print(self): table = Table(title="PA LLM Metrics") table.add_column("Statistic", justify="right", style="cyan", no_wrap=True) - stats = ["avg", "min", "max", "p99", "p95", "p90", "p75", "p50", "p25"] + stats = ["avg", "min", "max", "p99", "p90", "p75"] for stat in stats: table.add_column(stat, justify="right", style="green") - metrics = ["inter_token_latency", "time_to_first_token"] - for metric in metrics: + for metric in LLMMetrics.metric_labels: formatted_metric = metric.replace("_", " ").capitalize() is_time_field = self._is_time_field(metric) if is_time_field: @@ -155,6 +161,38 @@ def pretty_print(self): console = Console() console.print(table) + def export_to_csv(self, csv_filename: str): + header = [ + "Statistic", + "avg", + "min", + "max", + "p99", + "p95", + "p90", + "p75", + "p50", + "p25", + ] + + with open(csv_filename, mode="w", newline="") as csvfile: + csv_writer = csv.writer(csvfile) + csv_writer.writerow(header) + + for metric in LLMMetrics.metric_labels: + formatted_metric = metric + is_time_field = self._is_time_field(metric) + if is_time_field: + formatted_metric += "(ns)" + + row_values = [formatted_metric] + + for stat in header[1:]: + value = self.__dict__.get(f"{stat}_{metric}", -1) + row_values.append(f"{value:.0f}") + + csv_writer.writerow(row_values) + class LLMProfileData: """A class that calculates and aggregates all the LLM performance statistics @@ -202,16 +240,16 @@ def _collect_llm_metrics( for request in requests: req_timestamp = request["timestamp"] res_timestamps = request["response_timestamps"] - res_outputs = request["response_outputs"] + # res_outputs = request["response_outputs"] # time to first token time_to_first_tokens.append(res_timestamps[0] - req_timestamp) # output token throughput - output_tokens = tokenizer(res_outputs)["input_ids"] - total_output_tokens = np.sum(list(map(len, output_tokens))) - req_latency = res_timestamps[-1] - req_timestamp - output_token_throughputs.append(total_output_tokens / req_latency) + # output_tokens = tokenizer(res_outputs)["input_ids"] + # total_output_tokens = np.sum(list(map(len, output_tokens))) + # req_latency = res_timestamps[-1] - req_timestamp + # output_token_throughputs.append(total_output_tokens / req_latency) # inter token latency for t1, t2 in pairwise(res_timestamps): diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/main.py b/src/c++/perf_analyzer/genai-pa/genai_pa/main.py index df4ca1bc0..d8182fad1 100755 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/main.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/main.py @@ -77,6 +77,8 @@ def report_output(metrics: LLMProfileData, args): "Neither concurrency_range nor request_rate_range was found in args when reporting metrics" ) stats = metrics.get_statistics(infer_mode, int(load_level)) + if args.profile_export_file is not None: + stats.export_to_csv(args.profile_export_file) stats.pretty_print() From 322adbd4644bbd8b6b883bb027adab4073997741 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Thu, 7 Mar 2024 12:06:17 -0800 Subject: [PATCH 2/6] Create genai-pa export file --- src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py | 8 ++++---- src/c++/perf_analyzer/genai-pa/genai_pa/main.py | 6 ++++-- src/c++/perf_analyzer/genai-pa/genai_pa/parser.py | 8 +++++--- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py index 2ae3c957b..5be8efce8 100755 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py @@ -246,10 +246,10 @@ def _collect_llm_metrics( time_to_first_tokens.append(res_timestamps[0] - req_timestamp) # output token throughput - # output_tokens = tokenizer(res_outputs)["input_ids"] - # total_output_tokens = np.sum(list(map(len, output_tokens))) - # req_latency = res_timestamps[-1] - req_timestamp - # output_token_throughputs.append(total_output_tokens / req_latency) + output_tokens = tokenizer(res_outputs)["input_ids"] + total_output_tokens = np.sum(list(map(len, output_tokens))) + req_latency = res_timestamps[-1] - req_timestamp + output_token_throughputs.append(total_output_tokens / req_latency) # inter token latency for t1, t2 in pairwise(res_timestamps): diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/main.py b/src/c++/perf_analyzer/genai-pa/genai_pa/main.py index d8182fad1..919a3a2ce 100755 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/main.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/main.py @@ -77,8 +77,10 @@ def report_output(metrics: LLMProfileData, args): "Neither concurrency_range nor request_rate_range was found in args when reporting metrics" ) stats = metrics.get_statistics(infer_mode, int(load_level)) - if args.profile_export_file is not None: - stats.export_to_csv(args.profile_export_file) + export_csv_name = args.profile_export_file.with_name( + args.profile_export_file.stem + "_genai_pa" + args.profile_export_file.suffix + ) + stats.export_to_csv(export_csv_name) stats.pretty_print() diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py index c45026f63..5372b7ebc 100644 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py @@ -117,9 +117,11 @@ def _add_profile_args(parser): "--profile-export-file", type=Path, default="profile_export.json", - help="Specifies the path where the profile export will be " - "generated. By default, the profile export will not be " - "generated.", + help="Specifies the path where the perf_analyzer profile export will be " + "generated. By default, the profile export will be to profile_export.json. " + "The GenAi-PA file will be exported to _genai_pa. For example," + "if the profile export file is profile_export.json, the GenAi-PA file will be " + "exported to profile_export_genai_pa.json.", ) load_management_group.add_argument( "--request-rate", From d18158e3e3f683d3b317e381e49e83e03d1fc13f Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Thu, 7 Mar 2024 12:07:05 -0800 Subject: [PATCH 3/6] Remove commented code --- src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py index 5be8efce8..350bd8a73 100755 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py @@ -240,7 +240,7 @@ def _collect_llm_metrics( for request in requests: req_timestamp = request["timestamp"] res_timestamps = request["response_timestamps"] - # res_outputs = request["response_outputs"] + res_outputs = request["response_outputs"] # time to first token time_to_first_tokens.append(res_timestamps[0] - req_timestamp) From 6f46de703c79cefcbac41b7eb9d3ea177cd4813b Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Thu, 7 Mar 2024 12:45:55 -0800 Subject: [PATCH 4/6] Clarify parser message --- src/c++/perf_analyzer/genai-pa/genai_pa/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py index 5372b7ebc..5d5dbad75 100644 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py @@ -119,8 +119,8 @@ def _add_profile_args(parser): default="profile_export.json", help="Specifies the path where the perf_analyzer profile export will be " "generated. By default, the profile export will be to profile_export.json. " - "The GenAi-PA file will be exported to _genai_pa. For example," - "if the profile export file is profile_export.json, the GenAi-PA file will be " + "The GenAi-PA file will be exported to _genai_pa.. " + "For example, if the profile export file is profile_export.json, the GenAi-PA file will be " "exported to profile_export_genai_pa.json.", ) load_management_group.add_argument( From 057576fabe4997d07abbd22378d41c6c7bac4862 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Thu, 7 Mar 2024 15:26:27 -0800 Subject: [PATCH 5/6] Add other metrics --- .../genai-pa/genai_pa/llm_metrics.py | 66 +++++++++++++------ 1 file changed, 46 insertions(+), 20 deletions(-) diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py index 3704e3f42..55e4dd9a3 100755 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/llm_metrics.py @@ -48,6 +48,26 @@ class Metrics: """A base class for all the metrics class that contains common metrics.""" + metric_labels = [ + "time_to_first_token", + "inter_token_latency", + "request_latency", + "output_token_throughput", + "request_throughput", + "num_output_token", + ] + + time_fields = [ + "inter_token_latency", + "time_to_first_token", + "request_latency", + ] + + throughput_fields = [ + "request_throughput", + "output_token_throughput", + ] + def __init__( self, request_throughputs: list[float] = [], @@ -91,17 +111,6 @@ def __init__( self.output_token_throughputs = output_token_throughputs self.num_output_tokens = num_output_tokens - metric_labels = [ - "time_to_first_token", - "inter_token_latency", - ] - - time_fields = [ - "inter_token_latency", - "time_to_first_token", - "end_to_end_latency", - ] - # add base name mapping self._base_names["time_to_first_tokens"] = "time_to_first_token" self._base_names["inter_token_latencies"] = "inter_token_latency" @@ -164,8 +173,11 @@ def __repr__(self): attr_strs = ",".join([f"{k}={v}" for k, v in self.__dict__.items()]) return f"Statistics({attr_strs})" + def _is_throughput_field(self, field: str): + return field in Metrics.throughput_fields + def _is_time_field(self, field: str): - return field in LLMMetrics.time_fields + return field in Metrics.time_fields def pretty_print(self): table = Table(title="PA LLM Metrics") @@ -175,17 +187,25 @@ def pretty_print(self): for stat in stats: table.add_column(stat, justify="right", style="green") - for metric in LLMMetrics.metric_labels: + for metric in Metrics.metric_labels: formatted_metric = metric.replace("_", " ").capitalize() - is_time_field = self._is_time_field(metric) - if is_time_field: + if self._is_time_field(metric): formatted_metric += " (ns)" + elif self._is_throughput_field(metric): + formatted_metric += " (per sec)" + row_values = [formatted_metric] for stat in stats: value = self.__dict__.get(f"{stat}_{metric}", -1) row_values.append("{:,.0f}".format(value)) - table.add_row(*row_values) + + # Without streaming, there is no inter-token latency available. + if metric == "inter_token_latency": + if all(value == -1 for value in row_values[1:]): + continue + else: + table.add_row(*row_values) console = Console() console.print(table) @@ -208,11 +228,12 @@ def export_to_csv(self, csv_filename: str): csv_writer = csv.writer(csvfile) csv_writer.writerow(header) - for metric in LLMMetrics.metric_labels: + for metric in Metrics.metric_labels: formatted_metric = metric - is_time_field = self._is_time_field(metric) - if is_time_field: + if self._is_time_field(metric): formatted_metric += "(ns)" + elif self._is_throughput_field(metric): + formatted_metric += "(per sec)" row_values = [formatted_metric] @@ -220,7 +241,12 @@ def export_to_csv(self, csv_filename: str): value = self.__dict__.get(f"{stat}_{metric}", -1) row_values.append(f"{value:.0f}") - csv_writer.writerow(row_values) + # Without streaming, there is no inter-token latency available. + if metric == "inter_token_latency": + if all(value == -1 for value in row_values[1:]): + continue + else: + csv_writer.writerow(row_values) class ProfileDataParser: From d57c35eca8d65dbbe4acbfa0d22ee2cff1112714 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Thu, 7 Mar 2024 15:39:24 -0800 Subject: [PATCH 6/6] Use csv extension --- src/c++/perf_analyzer/genai-pa/genai_pa/main.py | 2 +- src/c++/perf_analyzer/genai-pa/genai_pa/parser.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/main.py b/src/c++/perf_analyzer/genai-pa/genai_pa/main.py index 3e524fd9b..75894495b 100755 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/main.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/main.py @@ -86,7 +86,7 @@ def report_output(metrics: LLMProfileDataParser, args): ) stats = metrics.get_statistics(infer_mode, int(load_level)) export_csv_name = args.profile_export_file.with_name( - args.profile_export_file.stem + "_genai_pa" + args.profile_export_file.suffix + args.profile_export_file.stem + "_genai_pa.csv" ) stats.export_to_csv(export_csv_name) stats.pretty_print() diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py index 66835d141..9e5890eef 100644 --- a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py +++ b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py @@ -155,9 +155,9 @@ def _add_profile_args(parser): default="profile_export.json", help="Specifies the path where the perf_analyzer profile export will be " "generated. By default, the profile export will be to profile_export.json. " - "The GenAi-PA file will be exported to _genai_pa.. " + "The GenAi-PA file will be exported to _genai_pa.csv. " "For example, if the profile export file is profile_export.json, the GenAi-PA file will be " - "exported to profile_export_genai_pa.json.", + "exported to profile_export_genai_pa.csv.", ) load_management_group.add_argument( "--request-rate",