From cb0d6323fb965534d265e0597996db6f2db38748 Mon Sep 17 00:00:00 2001
From: braf
Date: Mon, 9 Oct 2023 14:49:05 +0000
Subject: [PATCH] New measurement fields created.

---
 model_analyzer/perf_analyzer/perf_analyzer.py |  7 ++
 .../record/types/avg_first_token_latency.py   | 96 +++++++++++++++++++
 .../record/types/avg_token_latency.py         | 96 +++++++++++++++++++
 tests/test_record_types.py                    |  2 +
 4 files changed, 201 insertions(+)
 create mode 100755 model_analyzer/record/types/avg_first_token_latency.py
 create mode 100755 model_analyzer/record/types/avg_token_latency.py

diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py
index c88f8e655..eefc57abc 100755
--- a/model_analyzer/perf_analyzer/perf_analyzer.py
+++ b/model_analyzer/perf_analyzer/perf_analyzer.py
@@ -36,6 +36,8 @@
 )
 from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
 from model_analyzer.record.record import Record
+from model_analyzer.record.types.avg_first_token_latency import AvgFirstTokenLatency
+from model_analyzer.record.types.avg_token_latency import AvgTokenLatency
 from model_analyzer.record.types.gpu_free_memory import GPUFreeMemory
 from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage
 from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory
@@ -91,6 +93,11 @@ class PerfAnalyzer:
         ["gpu_used_memory", "Max GPU Memory Usage", GPUUsedMemory, "1000000"],
         ["gpu_free_memory", "Total GPU Memory", GPUFreeMemory, "1000000"]
     ]
+
+    llm_metric_table = [
+        ["avg_first_token_latency", "Avg first token latency", AvgFirstTokenLatency, "1000"],
+        ["avg_token_latency", "Avg token latency", AvgTokenLatency, "1000"]
+    ]
     # yapf: enable

     @staticmethod
diff --git a/model_analyzer/record/types/avg_first_token_latency.py b/model_analyzer/record/types/avg_first_token_latency.py
new file mode 100755
index 000000000..15badd92a
--- /dev/null
+++ b/model_analyzer/record/types/avg_first_token_latency.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class AvgFirstTokenLatency(DecreasingRecord):
+    """
+    A record for perf_analyzer avg first token latency metric
+    """
+
+    tag = "avg_first_token_latency"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed avg time for first token latency
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            An optional tag that may be displayed
+            as part of the header indicating that
+            this record has been aggregated using
+            max, min or average etc.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric.
+        """
+
+        return "avg first token latency (ms)"
+
+    def __eq__(self, other):
+        """
+        Allows checking for
+        equality between two records
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Allows checking if
+        this record is less than
+        the other
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Allows adding two records together
+        to produce a brand new record.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Allows subbing two records together
+        to produce a brand new record.
+
+        ** Note this does reverse subtraction because
+        of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/avg_token_latency.py b/model_analyzer/record/types/avg_token_latency.py
new file mode 100755
index 000000000..93937cafd
--- /dev/null
+++ b/model_analyzer/record/types/avg_token_latency.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class AvgTokenLatency(DecreasingRecord):
+    """
+    A record for perf_analyzer avg token-to-token latency metric
+    """
+
+    tag = "avg_token_latency"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed avg time for token-to-token latency
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            An optional tag that may be displayed
+            as part of the header indicating that
+            this record has been aggregated using
+            max, min or average etc.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric.
+        """
+
+        return "avg token-to-token latency (ms)"
+
+    def __eq__(self, other):
+        """
+        Allows checking for
+        equality between two records
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Allows checking if
+        this record is less than
+        the other
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Allows adding two records together
+        to produce a brand new record.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Allows subbing two records together
+        to produce a brand new record.
+
+        ** Note this does reverse subtraction because
+        of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/tests/test_record_types.py b/tests/test_record_types.py
index 4bd6d8b32..3b31e9402 100755
--- a/tests/test_record_types.py
+++ b/tests/test_record_types.py
@@ -59,6 +59,8 @@ def setUp(self):
                 "perf_client_send_recv",
                 "perf_server_compute_input",
                 "gpu_power_usage",
+                "avg_first_token_latency",
+                "avg_token_latency",
             ]
         }
         self.more_is_better_types = {
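
Note (illustrative, not part of the patch): the sketch below shows the comparison and arithmetic semantics the two new DecreasingRecord subclasses inherit, assuming Record.value() returns the stored measurement as it does for the existing record types. Because __lt__ compares with ">" and __sub__ subtracts in reverse, a lower latency ranks as the better record and a latency reduction shows up as a positive delta. The numbers used are made up for illustration.

from model_analyzer.record.types.avg_first_token_latency import AvgFirstTokenLatency

# Two hypothetical measurements in milliseconds.
fast = AvgFirstTokenLatency(12.5)
slow = AvgFirstTokenLatency(20.0)

# Lower latency compares as "greater" because __lt__ is inverted for DecreasingRecord.
assert fast > slow

# __add__ sums the underlying values.
assert (fast + slow).value() == 32.5

# __sub__ is reversed: (fast - slow) computes slow.value() - fast.value(),
# so the improvement appears as a positive 7.5 ms delta.
assert (fast - slow).value() == 7.5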