From 3df2487e9f405f43a045a30cbbe296ef59a0aa86 Mon Sep 17 00:00:00 2001
From: braf <braf@nvidia.com>
Date: Mon, 18 Mar 2024 17:08:46 +0000
Subject: [PATCH 1/2] Adding new LLM metrics

---
 .../record/types/inter_token_latency_avg.py   |  96 ++++++++++++++++
 .../record/types/inter_token_latency_max.py   |  96 ++++++++++++++++
 .../record/types/inter_token_latency_min.py   |  96 ++++++++++++++++
 .../record/types/inter_token_latency_p75.py   |  96 ++++++++++++++++
 .../record/types/inter_token_latency_p90.py   |  96 ++++++++++++++++
 .../record/types/inter_token_latency_p99.py   |  96 ++++++++++++++++
 .../record/types/output_token_throughput.py   | 105 ++++++++++++++++++
 .../record/types/time_to_first_token_avg.py   |  96 ++++++++++++++++
 .../record/types/time_to_first_token_max.py   |  96 ++++++++++++++++
 .../record/types/time_to_first_token_min.py   |  96 ++++++++++++++++
 .../record/types/time_to_first_token_p75.py   |  96 ++++++++++++++++
 .../record/types/time_to_first_token_p90.py   |  96 ++++++++++++++++
 .../record/types/time_to_first_token_p99.py   |  96 ++++++++++++++++
 tests/test_record_types.py                    |  13 +++
 14 files changed, 1270 insertions(+)
 create mode 100755 model_analyzer/record/types/inter_token_latency_avg.py
 create mode 100755 model_analyzer/record/types/inter_token_latency_max.py
 create mode 100755 model_analyzer/record/types/inter_token_latency_min.py
 create mode 100755 model_analyzer/record/types/inter_token_latency_p75.py
 create mode 100755 model_analyzer/record/types/inter_token_latency_p90.py
 create mode 100755 model_analyzer/record/types/inter_token_latency_p99.py
 create mode 100755 model_analyzer/record/types/output_token_throughput.py
 create mode 100755 model_analyzer/record/types/time_to_first_token_avg.py
 create mode 100755 model_analyzer/record/types/time_to_first_token_max.py
 create mode 100755 model_analyzer/record/types/time_to_first_token_min.py
 create mode 100755 model_analyzer/record/types/time_to_first_token_p75.py
 create mode 100755 model_analyzer/record/types/time_to_first_token_p90.py
 create mode 100755 model_analyzer/record/types/time_to_first_token_p99.py

diff --git a/model_analyzer/record/types/inter_token_latency_avg.py b/model_analyzer/record/types/inter_token_latency_avg.py
new file mode 100755
index 000000000..3810d1c7e
--- /dev/null
+++ b/model_analyzer/record/types/inter_token_latency_avg.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class InterTokenLatencyAvg(DecreasingRecord):
+    """
+    Record for the average inter token latency from perf_analyzer
+    """
+
+    tag = "inter_token_latency_avg"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            The latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            Accepted but not used by this record;
+            the returned header is the same fixed
+            string whether or not the record has
+            been aggregated.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric, including its unit.
+        """
+
+        return "Avg Inter Token Latency (ms)"
+
+    def __eq__(self, other):
+        """
+        Two records compare equal
+        when their values are equal
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Inverted on purpose: this record is
+        "less than" the other when its latency
+        value is larger (lower is better)
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Returns a new record of the same class holding the sum
+        of both values; the timestamp is not carried over.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Returns a new record of the same class holding
+        the difference of the two values.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/inter_token_latency_max.py b/model_analyzer/record/types/inter_token_latency_max.py
new file mode 100755
index 000000000..ffb3879fb
--- /dev/null
+++ b/model_analyzer/record/types/inter_token_latency_max.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class InterTokenLatencyMax(DecreasingRecord):
+    """
+    Record for the maximum inter token latency from perf_analyzer
+    """
+
+    tag = "inter_token_latency_max"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            The latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            Accepted but not used by this record;
+            the returned header is the same fixed
+            string whether or not the record has
+            been aggregated.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric, including its unit.
+        """
+
+        return "Max Inter Token Latency (ms)"
+
+    def __eq__(self, other):
+        """
+        Two records compare equal
+        when their values are equal
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Inverted on purpose: this record is
+        "less than" the other when its latency
+        value is larger (lower is better)
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Returns a new record of the same class holding the sum
+        of both values; the timestamp is not carried over.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Returns a new record of the same class holding
+        the difference of the two values.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/inter_token_latency_min.py b/model_analyzer/record/types/inter_token_latency_min.py
new file mode 100755
index 000000000..3f6344bea
--- /dev/null
+++ b/model_analyzer/record/types/inter_token_latency_min.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class InterTokenLatencyMin(DecreasingRecord):
+    """
+    Record for the minimum inter token latency from perf_analyzer
+    """
+
+    tag = "inter_token_latency_min"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            The latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            Accepted but not used by this record;
+            the returned header is the same fixed
+            string whether or not the record has
+            been aggregated.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric, including its unit.
+        """
+
+        return "Min Inter Token Latency (ms)"
+
+    def __eq__(self, other):
+        """
+        Two records compare equal
+        when their values are equal
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Inverted on purpose: this record is
+        "less than" the other when its latency
+        value is larger (lower is better)
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Returns a new record of the same class holding the sum
+        of both values; the timestamp is not carried over.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Returns a new record of the same class holding
+        the difference of the two values.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/inter_token_latency_p75.py b/model_analyzer/record/types/inter_token_latency_p75.py
new file mode 100755
index 000000000..c2ff01664
--- /dev/null
+++ b/model_analyzer/record/types/inter_token_latency_p75.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class InterTokenLatencyP75(DecreasingRecord):
+    """
+    Record for the p75 inter token latency from perf_analyzer
+    """
+
+    tag = "inter_token_latency_p75"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            The latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            Accepted but not used by this record;
+            the returned header is the same fixed
+            string whether or not the record has
+            been aggregated.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric, including its unit.
+        """
+
+        return "p75 Inter Token Latency (ms)"
+
+    def __eq__(self, other):
+        """
+        Two records compare equal
+        when their values are equal
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Inverted on purpose: this record is
+        "less than" the other when its latency
+        value is larger (lower is better)
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Returns a new record of the same class holding the sum
+        of both values; the timestamp is not carried over.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Returns a new record of the same class holding
+        the difference of the two values.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/inter_token_latency_p90.py b/model_analyzer/record/types/inter_token_latency_p90.py
new file mode 100755
index 000000000..4f53c69fa
--- /dev/null
+++ b/model_analyzer/record/types/inter_token_latency_p90.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class InterTokenLatencyP90(DecreasingRecord):
+    """
+    Record for the p90 inter token latency from perf_analyzer
+    """
+
+    tag = "inter_token_latency_p90"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            The latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            Accepted but not used by this record;
+            the returned header is the same fixed
+            string whether or not the record has
+            been aggregated.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric, including its unit.
+        """
+
+        return "p90 Inter Token Latency (ms)"
+
+    def __eq__(self, other):
+        """
+        Two records compare equal
+        when their values are equal
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Inverted on purpose: this record is
+        "less than" the other when its latency
+        value is larger (lower is better)
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Returns a new record of the same class holding the sum
+        of both values; the timestamp is not carried over.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Returns a new record of the same class holding
+        the difference of the two values.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/inter_token_latency_p99.py b/model_analyzer/record/types/inter_token_latency_p99.py
new file mode 100755
index 000000000..f203f78c2
--- /dev/null
+++ b/model_analyzer/record/types/inter_token_latency_p99.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class InterTokenLatencyP99(DecreasingRecord):
+    """
+    Record for the p99 inter token latency from perf_analyzer
+    """
+
+    tag = "inter_token_latency_p99"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            The latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            Accepted but not used by this record;
+            the returned header is the same fixed
+            string whether or not the record has
+            been aggregated.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric, including its unit.
+        """
+
+        return "p99 Inter Token Latency (ms)"
+
+    def __eq__(self, other):
+        """
+        Two records compare equal
+        when their values are equal
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Inverted on purpose: this record is
+        "less than" the other when its latency
+        value is larger (lower is better)
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Returns a new record of the same class holding the sum
+        of both values; the timestamp is not carried over.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Returns a new record of the same class holding
+        the difference of the two values.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/output_token_throughput.py b/model_analyzer/record/types/output_token_throughput.py
new file mode 100755
index 000000000..f7edf7cb8
--- /dev/null
+++ b/model_analyzer/record/types/output_token_throughput.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import IncreasingRecord
+
+
+@total_ordering
+class OutputTokenThroughput(IncreasingRecord):
+    """
+    A record for perf_analyzer
+    metric 'Output Token Throughput'
+    """
+
+    tag = "output_token_throughput"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            The throughput from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @staticmethod
+    def value_function():
+        """
+        Returns the callable used to combine a list of values
+
+        Returns
+        -------
+        The builtin sum, so that values are totalled
+        """
+        return sum
+
+    @staticmethod
+    def header(aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            Accepted but not used by this record;
+            the returned header is the same fixed
+            string whether or not the record has
+            been aggregated.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric, including its unit.
+        """
+
+        return "Output Token Throughput (tokens/sec)"
+
+    def __eq__(self, other):
+        """
+        Two records compare equal
+        when their values are equal
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        This record is "less than" the
+        other when its throughput value
+        is smaller
+        """
+
+        return self.value() < other.value()
+
+    def __add__(self, other):
+        """
+        Returns a new record of the same class holding the sum
+        of both values; the timestamp is not carried over.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Returns a new record of the same class holding this
+        record's value minus the other's (no reversal here).
+        """
+
+        return self.__class__(value=(self.value() - other.value()))
diff --git a/model_analyzer/record/types/time_to_first_token_avg.py b/model_analyzer/record/types/time_to_first_token_avg.py
new file mode 100755
index 000000000..d87642b1d
--- /dev/null
+++ b/model_analyzer/record/types/time_to_first_token_avg.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class TimeToFirstTokenAvg(DecreasingRecord):
+    """
+    Record for the average time to first token from perf_analyzer
+    """
+
+    tag = "time_to_first_token_avg"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            The latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            Accepted but not used by this record;
+            the returned header is the same fixed
+            string whether or not the record has
+            been aggregated.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric, including its unit.
+        """
+
+        return "Avg Time To First Token (ms)"
+
+    def __eq__(self, other):
+        """
+        Two records compare equal
+        when their values are equal
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Inverted on purpose: this record is
+        "less than" the other when its latency
+        value is larger (lower is better)
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Returns a new record of the same class holding the sum
+        of both values; the timestamp is not carried over.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Returns a new record of the same class holding
+        the difference of the two values.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/time_to_first_token_max.py b/model_analyzer/record/types/time_to_first_token_max.py
new file mode 100755
index 000000000..d53741d7e
--- /dev/null
+++ b/model_analyzer/record/types/time_to_first_token_max.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class TimeToFirstTokenMax(DecreasingRecord):
+    """
+    Record for the maximum time to first token from perf_analyzer
+    """
+
+    tag = "time_to_first_token_max"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            The latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            Accepted but not used by this record;
+            the returned header is the same fixed
+            string whether or not the record has
+            been aggregated.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric, including its unit.
+        """
+
+        return "Max Time To First Token (ms)"
+
+    def __eq__(self, other):
+        """
+        Two records compare equal
+        when their values are equal
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Inverted on purpose: this record is
+        "less than" the other when its latency
+        value is larger (lower is better)
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Returns a new record of the same class holding the sum
+        of both values; the timestamp is not carried over.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Returns a new record of the same class holding
+        the difference of the two values.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/time_to_first_token_min.py b/model_analyzer/record/types/time_to_first_token_min.py
new file mode 100755
index 000000000..ee556f6ba
--- /dev/null
+++ b/model_analyzer/record/types/time_to_first_token_min.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class TimeToFirstTokenMin(DecreasingRecord):
+    """
+    Record for the minimum time to first token from perf_analyzer
+    """
+
+    tag = "time_to_first_token_min"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            The latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            Accepted but not used by this record;
+            the returned header is the same fixed
+            string whether or not the record has
+            been aggregated.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric, including its unit.
+        """
+
+        return "Min Time To First Token (ms)"
+
+    def __eq__(self, other):
+        """
+        Two records compare equal
+        when their values are equal
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Inverted on purpose: this record is
+        "less than" the other when its latency
+        value is larger (lower is better)
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Returns a new record of the same class holding the sum
+        of both values; the timestamp is not carried over.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Returns a new record of the same class holding
+        the difference of the two values.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/time_to_first_token_p75.py b/model_analyzer/record/types/time_to_first_token_p75.py
new file mode 100755
index 000000000..f996517e9
--- /dev/null
+++ b/model_analyzer/record/types/time_to_first_token_p75.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class TimeToFirstTokenP75(DecreasingRecord):
+    """
+    A record for perf_analyzer Time to first token metric
+    """
+
+    tag = "time_to_first_token_p75"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the p75 time to first token from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program (defaults to 0)
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            An optional flag for marking the header
+            when this record has been aggregated
+            using max, min or average etc. It is
+            not used by this record's header.
+
+        Returns
+        -------
+        str
+            The display name of the metric,
+            including its unit (ms).
+        """
+
+        return "p75 Time To First Token (ms)"
+
+    def __eq__(self, other):
+        """
+        Allows checking for
+        equality between two records
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Allows checking if
+        this record is less than
+        the other
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Allows adding two records together
+        to produce a brand new record.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Allows subbing two records together
+        to produce a brand new record.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/time_to_first_token_p90.py b/model_analyzer/record/types/time_to_first_token_p90.py
new file mode 100755
index 000000000..6009d06ba
--- /dev/null
+++ b/model_analyzer/record/types/time_to_first_token_p90.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class TimeToFirstTokenP90(DecreasingRecord):
+    """
+    A record for perf_analyzer Time to first token metric
+    """
+
+    tag = "time_to_first_token_p90"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the p90 time to first token from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program (defaults to 0)
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            An optional flag for marking the header
+            when this record has been aggregated
+            using max, min or average etc. It is
+            not used by this record's header.
+
+        Returns
+        -------
+        str
+            The display name of the metric,
+            including its unit (ms).
+        """
+
+        return "p90 Time To First Token (ms)"
+
+    def __eq__(self, other):
+        """
+        Allows checking for
+        equality between two records
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Allows checking if
+        this record is less than
+        the other
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Allows adding two records together
+        to produce a brand new record.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Allows subbing two records together
+        to produce a brand new record.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/time_to_first_token_p99.py b/model_analyzer/record/types/time_to_first_token_p99.py
new file mode 100755
index 000000000..2302c82d7
--- /dev/null
+++ b/model_analyzer/record/types/time_to_first_token_p99.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class TimeToFirstTokenP99(DecreasingRecord):
+    """
+    A record for perf_analyzer Time to first token metric
+    """
+
+    tag = "time_to_first_token_p99"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the p99 time to first token from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program (defaults to 0)
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            An optional flag for marking the header
+            when this record has been aggregated
+            using max, min or average etc. It is
+            not used by this record's header.
+
+        Returns
+        -------
+        str
+            The display name of the metric,
+            including its unit (ms).
+        """
+
+        return "p99 Time To First Token (ms)"
+
+    def __eq__(self, other):
+        """
+        Allows checking for
+        equality between two records
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Allows checking if
+        this record is less than
+        the other
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Allows adding two records together
+        to produce a brand new record.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Allows subbing two records together
+        to produce a brand new record.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/tests/test_record_types.py b/tests/test_record_types.py
index 4bd6d8b32..54c353200 100755
--- a/tests/test_record_types.py
+++ b/tests/test_record_types.py
@@ -49,6 +49,18 @@ def setUp(self):
                 "perf_latency_p90",
                 "perf_latency_p95",
                 "perf_latency_p99",
+                "inter_token_latency_min",
+                "inter_token_latency_max",
+                "inter_token_latency_avg",
+                "inter_token_latency_p75",
+                "inter_token_latency_p90",
+                "inter_token_latency_p99",
+                "time_to_first_token_min",
+                "time_to_first_token_max",
+                "time_to_first_token_avg",
+                "time_to_first_token_p75",
+                "time_to_first_token_p90",
+                "time_to_first_token_p99",
                 "gpu_used_memory",
                 "cpu_used_ram",
                 "perf_server_compute_infer",
@@ -65,6 +77,7 @@ def setUp(self):
             record_types[k]
             for k in [
                 "perf_throughput",
+                "output_token_throughput",
                 "gpu_free_memory",
                 "gpu_utilization",
                 "cpu_available_ram",

From 8a52aed690a06202347bb8b180d8c9c6f6564062 Mon Sep 17 00:00:00 2001
From: braf <braf@nvidia.com>
Date: Tue, 19 Mar 2024 16:27:08 +0000
Subject: [PATCH 2/2] Adding base class for perf, inter_token, and
 time_to_first latency records

---
 .../record/types/inter_token_latency_avg.py   | 40 +---------
 .../record/types/inter_token_latency_base.py  | 74 +++++++++++++++++++
 .../record/types/inter_token_latency_max.py   | 40 +---------
 .../record/types/inter_token_latency_min.py   | 40 +---------
 .../record/types/inter_token_latency_p75.py   | 40 +---------
 .../record/types/inter_token_latency_p90.py   | 40 +---------
 .../record/types/inter_token_latency_p99.py   | 40 +---------
 .../record/types/perf_latency_avg.py          | 40 +---------
 .../record/types/perf_latency_base.py         | 74 +++++++++++++++++++
 .../record/types/perf_latency_p90.py          | 40 +---------
 .../record/types/perf_latency_p95.py          | 40 +---------
 .../record/types/perf_latency_p99.py          | 40 +---------
 .../record/types/time_to_first_token_avg.py   | 40 +---------
 .../record/types/time_to_first_token_base.py  | 74 +++++++++++++++++++
 .../record/types/time_to_first_token_max.py   | 40 +---------
 .../record/types/time_to_first_token_min.py   | 40 +---------
 .../record/types/time_to_first_token_p75.py   | 40 +---------
 .../record/types/time_to_first_token_p90.py   | 40 +---------
 .../record/types/time_to_first_token_p99.py   | 40 +---------
 19 files changed, 254 insertions(+), 608 deletions(-)
 create mode 100755 model_analyzer/record/types/inter_token_latency_base.py
 create mode 100755 model_analyzer/record/types/perf_latency_base.py
 create mode 100755 model_analyzer/record/types/time_to_first_token_base.py

diff --git a/model_analyzer/record/types/inter_token_latency_avg.py b/model_analyzer/record/types/inter_token_latency_avg.py
index 3810d1c7e..fe1dc7dfb 100755
--- a/model_analyzer/record/types/inter_token_latency_avg.py
+++ b/model_analyzer/record/types/inter_token_latency_avg.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase
 
 
 @total_ordering
-class InterTokenLatencyAvg(DecreasingRecord):
+class InterTokenLatencyAvg(InterTokenLatencyBase):
     """
     A record for perf_analyzer Inter token latency metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "Avg Inter Token Latency (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/inter_token_latency_base.py b/model_analyzer/record/types/inter_token_latency_base.py
new file mode 100755
index 000000000..dda70cefa
--- /dev/null
+++ b/model_analyzer/record/types/inter_token_latency_base.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class InterTokenLatencyBase(DecreasingRecord):
+    """
+    A base class for perf_analyzer Inter token latency metric records
+    """
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program (defaults to 0)
+        """
+
+        super().__init__(value, timestamp)
+
+    def __eq__(self, other):
+        """
+        Two records are equal when
+        their values are equal
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        This record is "less than" the other
+        when its value is the HIGHER one, so
+        a lower latency ranks as the greater record
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Adds the values of two records and returns a brand new
+        record of this record's class (timestamp left at its default).
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Subtracts this record's value from the other's and
+        returns a brand new record of this record's class.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/inter_token_latency_max.py b/model_analyzer/record/types/inter_token_latency_max.py
index ffb3879fb..ce2484144 100755
--- a/model_analyzer/record/types/inter_token_latency_max.py
+++ b/model_analyzer/record/types/inter_token_latency_max.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase
 
 
 @total_ordering
-class InterTokenLatencyMax(DecreasingRecord):
+class InterTokenLatencyMax(InterTokenLatencyBase):
     """
     A record for perf_analyzer Inter token latency metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "Max Inter Token Latency (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/inter_token_latency_min.py b/model_analyzer/record/types/inter_token_latency_min.py
index 3f6344bea..21e44883b 100755
--- a/model_analyzer/record/types/inter_token_latency_min.py
+++ b/model_analyzer/record/types/inter_token_latency_min.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase
 
 
 @total_ordering
-class InterTokenLatencyMin(DecreasingRecord):
+class InterTokenLatencyMin(InterTokenLatencyBase):
     """
     A record for perf_analyzer Inter token latency metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "Min Inter Token Latency (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/inter_token_latency_p75.py b/model_analyzer/record/types/inter_token_latency_p75.py
index c2ff01664..1234306fd 100755
--- a/model_analyzer/record/types/inter_token_latency_p75.py
+++ b/model_analyzer/record/types/inter_token_latency_p75.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase
 
 
 @total_ordering
-class InterTokenLatencyP75(DecreasingRecord):
+class InterTokenLatencyP75(InterTokenLatencyBase):
     """
     A record for perf_analyzer Inter token latency metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "p75 Inter Token Latency (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/inter_token_latency_p90.py b/model_analyzer/record/types/inter_token_latency_p90.py
index 4f53c69fa..58ae0ccb4 100755
--- a/model_analyzer/record/types/inter_token_latency_p90.py
+++ b/model_analyzer/record/types/inter_token_latency_p90.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase
 
 
 @total_ordering
-class InterTokenLatencyP99(DecreasingRecord):
+class InterTokenLatencyP90(InterTokenLatencyBase):
     """
     A record for perf_analyzer Inter token latency metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "p90 Inter Token Latency (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/inter_token_latency_p99.py b/model_analyzer/record/types/inter_token_latency_p99.py
index f203f78c2..d9f722772 100755
--- a/model_analyzer/record/types/inter_token_latency_p99.py
+++ b/model_analyzer/record/types/inter_token_latency_p99.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase
 
 
 @total_ordering
-class InterTokenLatencyP99(DecreasingRecord):
+class InterTokenLatencyP99(InterTokenLatencyBase):
     """
     A record for perf_analyzer Inter token latency metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "p99 Inter Token Latency (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/perf_latency_avg.py b/model_analyzer/record/types/perf_latency_avg.py
index 5452c0b79..aafbcbeb2 100755
--- a/model_analyzer/record/types/perf_latency_avg.py
+++ b/model_analyzer/record/types/perf_latency_avg.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.perf_latency_base import PerfLatencyBase
 
 
 @total_ordering
-class PerfLatencyAvg(DecreasingRecord):
+class PerfLatencyAvg(PerfLatencyBase):
     """
     A record for perf_analyzer latency metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "Avg Latency (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/perf_latency_base.py b/model_analyzer/record/types/perf_latency_base.py
new file mode 100755
index 000000000..3c3e76cac
--- /dev/null
+++ b/model_analyzer/record/types/perf_latency_base.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class PerfLatencyBase(DecreasingRecord):
+    """
+    A base class for perf_analyzer latency metric records
+    """
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program (defaults to 0)
+        """
+
+        super().__init__(value, timestamp)
+
+    def __eq__(self, other):
+        """
+        Two records are equal when
+        their values are equal
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        This record is "less than" the other
+        when its value is the HIGHER one, so
+        a lower latency ranks as the greater record
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Adds the values of two records and returns a brand new
+        record of this record's class (timestamp left at its default).
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Subtracts this record's value from the other's and
+        returns a brand new record of this record's class.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/perf_latency_p90.py b/model_analyzer/record/types/perf_latency_p90.py
index c6718fe40..7eafa3b28 100755
--- a/model_analyzer/record/types/perf_latency_p90.py
+++ b/model_analyzer/record/types/perf_latency_p90.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.perf_latency_base import PerfLatencyBase
 
 
 @total_ordering
-class PerfLatencyP90(DecreasingRecord):
+class PerfLatencyP90(PerfLatencyBase):
     """
     A record for perf_analyzer latency metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "p90 Latency (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/perf_latency_p95.py b/model_analyzer/record/types/perf_latency_p95.py
index 84ed9e648..ccb9f8c01 100755
--- a/model_analyzer/record/types/perf_latency_p95.py
+++ b/model_analyzer/record/types/perf_latency_p95.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.perf_latency_base import PerfLatencyBase
 
 
 @total_ordering
-class PerfLatencyP95(DecreasingRecord):
+class PerfLatencyP95(PerfLatencyBase):
     """
     A record for perf_analyzer latency metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "p95 Latency (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/perf_latency_p99.py b/model_analyzer/record/types/perf_latency_p99.py
index af4d06da4..46d352021 100755
--- a/model_analyzer/record/types/perf_latency_p99.py
+++ b/model_analyzer/record/types/perf_latency_p99.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.perf_latency_base import PerfLatencyBase
 
 
 @total_ordering
-class PerfLatencyP99(DecreasingRecord):
+class PerfLatencyP99(PerfLatencyBase):
     """
     A record for perf_analyzer latency metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "p99 Latency (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/time_to_first_token_avg.py b/model_analyzer/record/types/time_to_first_token_avg.py
index d87642b1d..28da5d294 100755
--- a/model_analyzer/record/types/time_to_first_token_avg.py
+++ b/model_analyzer/record/types/time_to_first_token_avg.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase
 
 
 @total_ordering
-class TimeToFirstTokenAvg(DecreasingRecord):
+class TimeToFirstTokenAvg(TimeToFirstTokenBase):
     """
     A record for perf_analyzer Time to first token metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "Avg Time To First Token (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/time_to_first_token_base.py b/model_analyzer/record/types/time_to_first_token_base.py
new file mode 100755
index 000000000..5ef6e9070
--- /dev/null
+++ b/model_analyzer/record/types/time_to_first_token_base.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class TimeToFirstTokenBase(DecreasingRecord):
+    """
+    Shared base record for perf_analyzer time to first token metrics (lower is better)
+    """
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the time to first token extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    def __eq__(self, other):
+        """
+        Two records are equal when their values are equal;
+        only value() is compared, timestamps are not.
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Inverted ordering: this record is "less than" the
+        other when its value is LARGER, because a lower
+        time to first token is the better result.
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Returns a brand new record of this record's class whose value
+        is the sum of both values (only the value is passed on).
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Returns a brand new record of this record's class whose value
+        is other.value() - self.value() (only the value is passed on).
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/time_to_first_token_max.py b/model_analyzer/record/types/time_to_first_token_max.py
index d53741d7e..f9ccc0a52 100755
--- a/model_analyzer/record/types/time_to_first_token_max.py
+++ b/model_analyzer/record/types/time_to_first_token_max.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase
 
 
 @total_ordering
-class TimeToFirstTokenMax(DecreasingRecord):
+class TimeToFirstTokenMax(TimeToFirstTokenBase):
     """
     A record for perf_analyzer Time to first token metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "Max Time To First Token (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/time_to_first_token_min.py b/model_analyzer/record/types/time_to_first_token_min.py
index ee556f6ba..4cc563c86 100755
--- a/model_analyzer/record/types/time_to_first_token_min.py
+++ b/model_analyzer/record/types/time_to_first_token_min.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase
 
 
 @total_ordering
-class TimeToFirstTokenMin(DecreasingRecord):
+class TimeToFirstTokenMin(TimeToFirstTokenBase):
     """
     A record for perf_analyzer Time to first token metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "Min Time To First Token (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/time_to_first_token_p75.py b/model_analyzer/record/types/time_to_first_token_p75.py
index f996517e9..042972368 100755
--- a/model_analyzer/record/types/time_to_first_token_p75.py
+++ b/model_analyzer/record/types/time_to_first_token_p75.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase
 
 
 @total_ordering
-class TimeToFirstTokenP75(DecreasingRecord):
+class TimeToFirstTokenP75(TimeToFirstTokenBase):
     """
     A record for perf_analyzer Time to first token metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "p75 Time To First Token (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/time_to_first_token_p90.py b/model_analyzer/record/types/time_to_first_token_p90.py
index 6009d06ba..853adbdb4 100755
--- a/model_analyzer/record/types/time_to_first_token_p90.py
+++ b/model_analyzer/record/types/time_to_first_token_p90.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase
 
 
 @total_ordering
-class TimeToFirstTokenP90(DecreasingRecord):
+class TimeToFirstTokenP90(TimeToFirstTokenBase):
     """
     A record for perf_analyzer Time to first token metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "p90 Time To First Token (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/time_to_first_token_p99.py b/model_analyzer/record/types/time_to_first_token_p99.py
index 2302c82d7..24f2ff088 100755
--- a/model_analyzer/record/types/time_to_first_token_p99.py
+++ b/model_analyzer/record/types/time_to_first_token_p99.py
@@ -16,11 +16,11 @@
 
 from functools import total_ordering
 
-from model_analyzer.record.record import DecreasingRecord
+from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase
 
 
 @total_ordering
-class TimeToFirstTokenP99(DecreasingRecord):
+class TimeToFirstTokenP99(TimeToFirstTokenBase):
     """
     A record for perf_analyzer Time to first token metric
     """
@@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False):
         """
 
         return "p99 Time To First Token (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-            of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))