Add time-to-second-token metric (#217)

triton-inference-server · Dec 13, 2024 · 05b0949 · 05b0949
1 parent 5f1c7b3
commit 05b0949
Show file tree

Hide file tree

Showing 26 changed files with 471 additions and 46 deletions.
diff --git a/genai-perf/README.md b/genai-perf/README.md
@@ -33,6 +33,7 @@ generative AI models as served through an inference server.
 For large language models (LLMs), GenAI-Perf provides metrics such as
 [output token throughput](#output_token_throughput_metric),
 [time to first token](#time_to_first_token_metric),
+[time to second token](#time_to_second_token_metric),
 [inter token latency](#inter_token_latency_metric), and
 [request throughput](#request_throughput_metric).
 For a full list of metrics please see the [Metrics section](#metrics).
@@ -357,6 +358,7 @@ the inference server.
 | Metric | Description | Aggregations |
 | - | - | - |
 | <span id="time_to_first_token_metric">Time to First Token</span> | Time between when a request is sent and when its first response is received, one value per request in benchmark | Avg, min, max, p99, p90, p75 |
+| <span id="time_to_second_token_metric">Time to Second Token</span> | Time between when the first streaming response is received and when the second streaming response is received, one value per request that received at least two responses in benchmark | Avg, min, max, p99, p90, p75 |
 | <span id="inter_token_latency_metric">Inter Token Latency</span> | Time between intermediate responses for a single request divided by the number of generated tokens of the latter response, one value per response per request in benchmark | Avg, min, max, p99, p90, p75 |
 | Request Latency | Time between when a request is sent and when its final response is received, one value per request in benchmark | Avg, min, max, p99, p90, p75 |
 | Output Sequence Length | Total number of output tokens of a request, one value per request in benchmark | Avg, min, max, p99, p90, p75 |

diff --git a/genai-perf/docs/files.md b/genai-perf/docs/files.md
@@ -46,7 +46,7 @@ genai-perf/
 
 ## File Types
 Within the artifacts and docs directories, several file types are generated,
-including .gzip, .csv, .json, .html, and .jpeg. Below is a detailed
+including .csv, .json, .html, and .jpeg. Below is a detailed
 explanation of each file and its purpose.
 
 ### Artifacts Directory
@@ -55,18 +55,6 @@ explanation of each file and its purpose.
 
 The data subdirectory contains the raw and processed performance data files.
 
-##### GZIP Files
-
-- all_data.gzip: Aggregated performance data from all collected metrics.
-- input_sequence_lengths_vs_output_sequence_lengths.gzip: This contains data on
-the input sequence lengths versus the output sequence lengths for each request.
-- request_latency.gzip: This contains the latency for each request.
-- time_to_first_token.gzip: This contains the time to first token for each request.
-- token_to_token_vs_output_position.gzip: This contains the time from one token
-generation to the next versus the position of the output token for each token.
-- ttft_vs_input_sequence_lengths.gzip: This contains the time to first token
-versus the input sequence length for each request.
-
 ##### JSON Files
 
 - inputs.json: This contains the input prompts provided to the LLM during testing.
@@ -101,23 +89,6 @@ versus the input sequence lengths.
 To use the generated files, navigate to the artifacts/data directory. Then,
 the next steps depend on the file format you wish to work with.
 
-### GZIP Files
-
-The GZIP files contain Parquet files with calculated data, which can be read
-with Pandas in Python. For example, you can create a dataframe with these files:
-
-```
-import pandas
-df = pandas.read_partquet(path_to_file)`
-```
-
-You can then use Pandas to work with the data.
-
-```
-print(df.head())     # See the first few rows of the data.
-print(df.describe()) # Get summary statistics for the data
-```
-
 ### CSV and JSON Files
 Open .csv and .json files with spreadsheet or JSON parsing tools for structured
 data analysis. These can also be read via a text editor, like Vim.

diff --git a/genai-perf/genai_perf/export_data/console_exporter.py b/genai-perf/genai_perf/export_data/console_exporter.py
@@ -121,6 +121,7 @@ def _should_skip(self, metric_name: str) -> bool:
         streaming_metrics = [
             "inter_token_latency",
             "time_to_first_token",
+            "time_to_second_token",
         ]
         if not self._args.streaming and metric_name in streaming_metrics:
             return True

diff --git a/genai-perf/genai_perf/export_data/csv_exporter.py b/genai-perf/genai_perf/export_data/csv_exporter.py
@@ -119,6 +119,7 @@ def _should_skip(self, metric_name: str) -> bool:
         streaming_metrics = [
             "inter_token_latency",
             "time_to_first_token",
+            "time_to_second_token",
         ]
         if not self._args.streaming and metric_name in streaming_metrics:
             return True

diff --git a/genai-perf/genai_perf/goodput_calculator/llm_goodput_calculator.py b/genai-perf/genai_perf/goodput_calculator/llm_goodput_calculator.py
@@ -69,6 +69,7 @@ def _set_valid_metric_names(self) -> None:
 
     def _add_slo_mapping(self) -> None:
         self._slo_names["time_to_first_token"] = "time_to_first_tokens"  # nosec
+        self._slo_names["time_to_second_token"] = "time_to_second_tokens"  # nosec
         self._slo_names["inter_token_latency"] = "inter_token_latencies"  # nosec
         self._slo_names["output_token_throughput_per_request"] = (  # nosec
             "output_token_throughputs_per_request"

diff --git a/genai-perf/genai_perf/metrics/llm_metrics.py b/genai-perf/genai_perf/metrics/llm_metrics.py
@@ -36,6 +36,7 @@ class LLMMetrics(Metrics):
 
     LLM_REQUEST_TIME_METRICS = [
         MetricMetadata("time_to_first_token", "ms"),
+        MetricMetadata("time_to_second_token", "ms"),
         MetricMetadata("inter_token_latency", "ms"),
     ]
 
@@ -64,6 +65,7 @@ def __init__(
         request_throughputs: List[float] = [],
         request_latencies: List[int] = [],
         time_to_first_tokens: List[int] = [],
+        time_to_second_tokens: List[int] = [],
         inter_token_latencies: List[int] = [],
         output_token_throughputs: List[float] = [],
         output_token_throughputs_per_request: List[float] = [],
@@ -74,6 +76,7 @@ def __init__(
     ) -> None:
         super().__init__(request_throughputs, request_latencies, request_goodputs)
         self.time_to_first_tokens = time_to_first_tokens
+        self.time_to_second_tokens = time_to_second_tokens
         self.inter_token_latencies = inter_token_latencies
         self.output_token_throughputs = output_token_throughputs
         self.output_token_throughputs_per_request = output_token_throughputs_per_request
@@ -86,6 +89,7 @@ def __init__(
 
         # add base name mapping
         self._base_names["time_to_first_tokens"] = "time_to_first_token"
+        self._base_names["time_to_second_tokens"] = "time_to_second_token"
         self._base_names["inter_token_latencies"] = "inter_token_latency"  # nosec
         self._base_names["output_token_throughputs"] = (  # nosec
             "output_token_throughput"

diff --git a/genai-perf/genai_perf/metrics/statistics.py b/genai-perf/genai_perf/metrics/statistics.py
@@ -167,6 +167,7 @@ def _is_time_metric(self, field: str) -> bool:
         time_metrics = [
             "inter_token_latency",
             "time_to_first_token",
+            "time_to_second_token",
             "request_latency",
             "image_latency",
         ]

diff --git a/genai-perf/genai_perf/plots/plot_config_parser.py b/genai-perf/genai_perf/plots/plot_config_parser.py
@@ -120,6 +120,9 @@ def _get_metric(self, stats: Statistics, name: str) -> List[Union[int, float]]:
         elif name == "time_to_first_tokens":
             ttfts = stats.metrics.data[name]
             return [scale(x, (1 / 1e6)) for x in ttfts]  # ns to ms
+        elif name == "time_to_second_tokens":
+            ttsts = stats.metrics.data[name]
+            return [scale(x, (1 / 1e6)) for x in ttsts]  # ns to ms
         elif name == "request_latencies":
             req_latencies = stats.metrics.data[name]
             return [scale(x, (1 / 1e6)) for x in req_latencies]  # ns to ms

diff --git a/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py b/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py
@@ -79,6 +79,7 @@ def _parse_requests(self, requests: dict) -> Metrics:
         min_req_timestamp, max_res_timestamp = float("inf"), 0
         request_latencies = []
         time_to_first_tokens = []
+        time_to_second_tokens = []
         inter_token_latencies = []
         output_token_throughputs_per_request = []
         input_sequence_lengths = []
@@ -111,6 +112,11 @@ def _parse_requests(self, requests: dict) -> Metrics:
             ttft = res_timestamps[0] - req_timestamp
             time_to_first_tokens.append(ttft)
 
+            # time to second token (if available)
+            if len(res_timestamps) > 1:
+                ttst = res_timestamps[1] - res_timestamps[0]
+                time_to_second_tokens.append(ttst)
+
             # number of input tokens
             input_seq_len = self._get_input_token_count(req_inputs)
             input_sequence_lengths.append(input_seq_len)
@@ -154,6 +160,7 @@ def _parse_requests(self, requests: dict) -> Metrics:
             request_throughputs,
             request_latencies,
             time_to_first_tokens,
+            time_to_second_tokens,
             inter_token_latencies,
             output_token_throughputs,
             output_token_throughputs_per_request,

diff --git a/genai-perf/genai_perf/record/types/time_to_second_token_avg.py b/genai-perf/genai_perf/record/types/time_to_second_token_avg.py
@@ -0,0 +1,33 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from genai_perf.record.types.time_to_second_token_base import TimeToSecondTokenBase
+
+
+@total_ordering  # this class defines no comparisons itself; __eq__/__lt__ come from TimeToSecondTokenBase
+class TimeToSecondTokenAvg(TimeToSecondTokenBase):
+    """
+    A record for avg Time to second token metric
+    """
+
+    tag = TimeToSecondTokenBase.base_tag + "_avg"  # evaluates to "time_to_second_token_avg"
+
+    def __init__(self, value, timestamp=0):  # pass-through: forwards both arguments unchanged to the base class
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False) -> str:  # aggregation_tag is accepted but not used in this body
+        return "Avg Time To Second Token (ms)"  # fixed column title for this record type
diff --git a/genai-perf/genai_perf/record/types/time_to_second_token_base.py b/genai-perf/genai_perf/record/types/time_to_second_token_base.py
@@ -0,0 +1,56 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from genai_perf.record.record import DecreasingRecord, ReductionFactor
+from genai_perf.types import RecordValue
+
+
+@total_ordering  # fills in the remaining rich comparisons from the __eq__ and __lt__ defined below
+class TimeToSecondTokenBase(DecreasingRecord):
+    """
+    A base class record for the time to second token metric
+    """
+
+    base_tag = "time_to_second_token"  # prefix that each aggregation subclass extends (e.g. base_tag + "_avg")
+    reduction_factor = ReductionFactor.NS_TO_MS  # presumably values are stored in ns and reported in ms -- confirm in Record
+
+    def __init__(self, value: RecordValue, timestamp: int = 0) -> None:
+        super().__init__(value, timestamp)
+
+    def __eq__(self, other: "TimeToSecondTokenBase") -> bool:  # type: ignore
+        return self.value() == other.value()  # equality looks at value() only; timestamps are not compared
+
+    def __lt__(self, other: "TimeToSecondTokenBase") -> bool:
+        return self.value() > other.value()  # deliberately inverted: the record with the LARGER latency compares as "less"
+
+    def __add__(self, other: "TimeToSecondTokenBase") -> "TimeToSecondTokenBase":
+        """
+        Allows adding two records together
+        to produce a brand new record.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))  # no timestamp passed, so the new record uses the default (0)
+
+    def __sub__(self, other: "TimeToSecondTokenBase") -> "TimeToSecondTokenBase":
+        """
+        Allows subbing two records together
+        to produce a brand new record.
+
+        ** Note this does reverse subtraction because
+            of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))  # operands reversed on purpose: a - b yields b.value() - a.value()
diff --git a/genai-perf/genai_perf/record/types/time_to_second_token_max.py b/genai-perf/genai_perf/record/types/time_to_second_token_max.py
@@ -0,0 +1,33 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from genai_perf.record.types.time_to_second_token_base import TimeToSecondTokenBase
+
+
+@total_ordering  # this class defines no comparisons itself; __eq__/__lt__ come from TimeToSecondTokenBase
+class TimeToSecondTokenMax(TimeToSecondTokenBase):
+    """
+    A record for max Time to second token metric
+    """
+
+    tag = TimeToSecondTokenBase.base_tag + "_max"  # evaluates to "time_to_second_token_max"
+
+    def __init__(self, value, timestamp=0):  # pass-through: forwards both arguments unchanged to the base class
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False) -> str:  # aggregation_tag is accepted but not used in this body
+        return "Max Time To Second Token (ms)"  # fixed column title for this record type
diff --git a/genai-perf/genai_perf/record/types/time_to_second_token_min.py b/genai-perf/genai_perf/record/types/time_to_second_token_min.py
@@ -0,0 +1,33 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from genai_perf.record.types.time_to_second_token_base import TimeToSecondTokenBase
+
+
+@total_ordering  # this class defines no comparisons itself; __eq__/__lt__ come from TimeToSecondTokenBase
+class TimeToSecondTokenMin(TimeToSecondTokenBase):
+    """
+    A record for min Time to second token metric
+    """
+
+    tag = TimeToSecondTokenBase.base_tag + "_min"  # evaluates to "time_to_second_token_min"
+
+    def __init__(self, value, timestamp=0):  # pass-through: forwards both arguments unchanged to the base class
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False) -> str:  # aggregation_tag is accepted but not used in this body
+        return "Min Time To Second Token (ms)"  # fixed column title for this record type
diff --git a/genai-perf/genai_perf/record/types/time_to_second_token_p25.py b/genai-perf/genai_perf/record/types/time_to_second_token_p25.py
@@ -0,0 +1,33 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from genai_perf.record.types.time_to_second_token_base import TimeToSecondTokenBase
+
+
+@total_ordering  # this class defines no comparisons itself; __eq__/__lt__ come from TimeToSecondTokenBase
+class TimeToSecondTokenP25(TimeToSecondTokenBase):
+    """
+    A record for p25 Time to second token metric
+    """
+
+    tag = TimeToSecondTokenBase.base_tag + "_p25"  # evaluates to "time_to_second_token_p25"
+
+    def __init__(self, value, timestamp=0):  # pass-through: forwards both arguments unchanged to the base class
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False) -> str:  # aggregation_tag is accepted but not used in this body
+        return "p25 Time To Second Token (ms)"  # fixed column title for this record type
diff --git a/genai-perf/genai_perf/record/types/time_to_second_token_p50.py b/genai-perf/genai_perf/record/types/time_to_second_token_p50.py
@@ -0,0 +1,33 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from genai_perf.record.types.time_to_second_token_base import TimeToSecondTokenBase
+
+
+@total_ordering  # this class defines no comparisons itself; __eq__/__lt__ come from TimeToSecondTokenBase
+class TimeToSecondTokenP50(TimeToSecondTokenBase):
+    """
+    A record for p50 Time to second token metric
+    """
+
+    tag = TimeToSecondTokenBase.base_tag + "_p50"  # evaluates to "time_to_second_token_p50"
+
+    def __init__(self, value, timestamp=0):  # pass-through: forwards both arguments unchanged to the base class
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False) -> str:  # aggregation_tag is accepted but not used in this body
+        return "p50 Time To Second Token (ms)"  # fixed column title for this record type