From 4202055e8b2f6b74569add328a4defd6dfcb81c5 Mon Sep 17 00:00:00 2001 From: Brian Raf <92820864+nv-braf@users.noreply.github.com> Date: Fri, 22 Mar 2024 08:07:19 -0700 Subject: [PATCH] Capture LLM metrics from genai-perf in MA (#844) * Successfully reading from LLM CSV * General cleanup * All unit tests passing * Fixing metric table typos * Fixing typos --- model_analyzer/constants.py | 3 + model_analyzer/perf_analyzer/perf_analyzer.py | 146 +++++++- .../record/types/inter_token_latency_p25.py | 60 +++ .../record/types/inter_token_latency_p50.py | 60 +++ .../record/types/inter_token_latency_p90.py | 2 +- .../record/types/inter_token_latency_p95.py | 60 +++ .../record/types/time_to_first_token_p25.py | 60 +++ .../record/types/time_to_first_token_p50.py | 60 +++ .../record/types/time_to_first_token_p95.py | 60 +++ tests/test_perf_analyzer.py | 352 ++++++++++++------ tests/test_record_types.py | 6 + 11 files changed, 759 insertions(+), 110 deletions(-) create mode 100755 model_analyzer/record/types/inter_token_latency_p25.py create mode 100755 model_analyzer/record/types/inter_token_latency_p50.py create mode 100755 model_analyzer/record/types/inter_token_latency_p95.py create mode 100755 model_analyzer/record/types/time_to_first_token_p25.py create mode 100755 model_analyzer/record/types/time_to_first_token_p50.py create mode 100755 model_analyzer/record/types/time_to_first_token_p95.py diff --git a/model_analyzer/constants.py b/model_analyzer/constants.py index 886360d34..4fd91a480 100755 --- a/model_analyzer/constants.py +++ b/model_analyzer/constants.py @@ -70,3 +70,6 @@ # Model analyzer package name PACKAGE_NAME = "triton-model-analyzer" + +# GENAI-PERF CSV +GENAI_PERF_CSV = "profile_export_genai_perf.csv" diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index c88f8e655..e5387f04e 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -21,12 +21,15 @@ import re import signal import tempfile +from csv import DictReader from subprocess import STDOUT, Popen -from typing import Dict, List +from typing import Dict, List, Optional import psutil +from model_analyzer.config.input.config_defaults import DEFAULT_MODEL_TYPE from model_analyzer.constants import ( + GENAI_PERF_CSV, INTERVAL_SLEEP_TIME, LOGGER_NAME, MEASUREMENT_REQUEST_COUNT_STEP, @@ -40,6 +43,16 @@ from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory from model_analyzer.record.types.gpu_utilization import GPUUtilization +from model_analyzer.record.types.inter_token_latency_avg import InterTokenLatencyAvg +from model_analyzer.record.types.inter_token_latency_max import InterTokenLatencyMax +from model_analyzer.record.types.inter_token_latency_min import InterTokenLatencyMin +from model_analyzer.record.types.inter_token_latency_p25 import InterTokenLatencyP25 +from model_analyzer.record.types.inter_token_latency_p50 import InterTokenLatencyP50 +from model_analyzer.record.types.inter_token_latency_p75 import InterTokenLatencyP75 +from model_analyzer.record.types.inter_token_latency_p90 import InterTokenLatencyP90 +from model_analyzer.record.types.inter_token_latency_p95 import InterTokenLatencyP95 +from model_analyzer.record.types.inter_token_latency_p99 import InterTokenLatencyP99 +from model_analyzer.record.types.output_token_throughput import OutputTokenThroughput from model_analyzer.record.types.perf_client_response_wait import PerfClientResponseWait from model_analyzer.record.types.perf_client_send_recv import PerfClientSendRecv from model_analyzer.record.types.perf_latency_avg import PerfLatencyAvg @@ -53,6 +66,15 @@ ) from model_analyzer.record.types.perf_server_queue import PerfServerQueue from model_analyzer.record.types.perf_throughput import PerfThroughput +from model_analyzer.record.types.time_to_first_token_avg import TimeToFirstTokenAvg +from model_analyzer.record.types.time_to_first_token_max import TimeToFirstTokenMax +from model_analyzer.record.types.time_to_first_token_min import TimeToFirstTokenMin +from model_analyzer.record.types.time_to_first_token_p25 import TimeToFirstTokenP25 +from model_analyzer.record.types.time_to_first_token_p50 import TimeToFirstTokenP50 +from model_analyzer.record.types.time_to_first_token_p75 import TimeToFirstTokenP75 +from model_analyzer.record.types.time_to_first_token_p90 import TimeToFirstTokenP90 +from model_analyzer.record.types.time_to_first_token_p95 import TimeToFirstTokenP95 +from model_analyzer.record.types.time_to_first_token_p99 import TimeToFirstTokenP99 logger = logging.getLogger(LOGGER_NAME) @@ -91,6 +113,28 @@ class PerfAnalyzer: ["gpu_used_memory", "Max GPU Memory Usage", GPUUsedMemory, "1000000"], ["gpu_free_memory", "Total GPU Memory", GPUFreeMemory, "1000000"] ] + + llm_metric_table = [ + ["time_to_first_token_avg", "Time to First Token (ns) avg", TimeToFirstTokenAvg, "1000"], + ["time_to_first_token_min", "Time to First Token (ns) min", TimeToFirstTokenMin, "1000"], + ["time_to_first_token_max", "Time to First Token (ns) max", TimeToFirstTokenMax, "1000"], + ["time_to_first_token_p99", "Time to First Token (ns) p99", TimeToFirstTokenP99, "1000"], + ["time_to_first_token_p95", "Time to First Token (ns) p95", TimeToFirstTokenP95, "1000"], + ["time_to_first_token_p90", "Time to First Token (ns) p90", TimeToFirstTokenP90, "1000"], + ["time_to_first_token_p75", "Time to First Token (ns) p75", TimeToFirstTokenP75, "1000"], + ["time_to_first_token_p50", "Time to First Token (ns) p50", TimeToFirstTokenP50, "1000"], + ["time_to_first_token_p25", "Time to First Token (ns) p25", TimeToFirstTokenP25, "1000"], + ["inter_token_latency_avg", "Inter Token Latency (ns) avg", InterTokenLatencyAvg, "1000"], + ["inter_token_latency_min", "Inter Token Latency (ns) min", InterTokenLatencyMin, "1000"], + ["inter_token_latency_max", "Inter Token Latency (ns) max", InterTokenLatencyMax, "1000"], + ["inter_token_latency_p99", "Inter Token Latency (ns) p99", InterTokenLatencyP99, "1000"], + ["inter_token_latency_p95", "Inter Token Latency (ns) p95", InterTokenLatencyP95, "1000"], + ["inter_token_latency_p90", "Inter Token Latency (ns) p90", InterTokenLatencyP90, "1000"], + ["inter_token_latency_p75", "Inter Token Latency (ns) p75", InterTokenLatencyP75, "1000"], + ["inter_token_latency_p50", "Inter Token Latency (ns) p50", InterTokenLatencyP50, "1000"], + ["inter_token_latency_p25", "Inter Token Latency (ns) p25", InterTokenLatencyP25, "1000"], + ["output_token_throughput", "Output Token Throughput (per sec) avg", OutputTokenThroughput, "1"] + ] # yapf: enable @staticmethod @@ -109,7 +153,23 @@ def get_gpu_metrics(): ] return gpu_metrics - def __init__(self, path, config, max_retries, timeout, max_cpu_util): + @staticmethod + def get_llm_metrics(): + llm_metrics = [ + llm_metric[PerfAnalyzer.RECORD_CLASS] + for llm_metric in PerfAnalyzer.llm_metric_table + ] + return llm_metrics + + def __init__( + self, + path, + config, + max_retries, + timeout, + max_cpu_util, + model_type=DEFAULT_MODEL_TYPE, + ): """ Parameters ---------- @@ -133,8 +193,10 @@ def __init__(self, path, config, max_retries, timeout, max_cpu_util): self._timeout = timeout self._output = "" self._perf_records = {} + self._llm_records = {} self._gpu_records = [] self._max_cpu_util = max_cpu_util + self._model_type = model_type def run(self, metrics, env=None): """ @@ -195,7 +257,20 @@ def get_perf_records(self): if self._perf_records: return self._perf_records raise TritonModelAnalyzerException( - "Attempted to get perf_analyzer results" "without calling run first." + "Attempted to get perf_analyzer results without calling run first." + ) + + def get_llm_records(self): + """ + Returns + ------- + The LLM records from the last perf_analyzer run + """ + + if self._llm_records: + return self._llm_records + raise TritonModelAnalyzerException( + "Attempted to get perf_analyzer results without calling run first." ) def get_gpu_records(self): @@ -438,6 +513,12 @@ def _is_multi_model(self): return len(self._config.model_run_configs()) > 1 def _parse_outputs(self, metrics): + self._parse_generic_outputs(metrics) + + if self._model_type == "LLM": + self._parse_llm_outputs(metrics) + + def _parse_generic_outputs(self, metrics): """ Extract records from the Perf Analyzer run for each model """ @@ -464,6 +545,24 @@ def _parse_outputs(self, metrics): for f in glob.glob(f"*{perf_config['latency-report-file']}"): os.remove(f) + def _parse_llm_outputs(self, metrics): + """ + Extract records from the Perf Analyzer run for each model + """ + + perf_config = self._config.model_run_configs()[0].perf_config() + + logger.debug(f"Reading PA results from {GENAI_PERF_CSV}") + with open(GENAI_PERF_CSV, mode="r") as f: + csv_reader = csv.DictReader(f, delimiter=",") + + # See test_perf_analyzer::test_pa_llm_csv_output() for CSV output example + self._llm_records[perf_config["model-name"]] = self._extract_llm_records( + metrics, csv_reader + ) + + os.remove(f) + def _extract_perf_records_from_row( self, requested_metrics: List[Record], row_metrics: Dict[str, str] ) -> List[Record]: @@ -526,6 +625,47 @@ def _extract_gpu_records_from_row( self._cleanup_gpu_records(gpu_records) return gpu_records + def _extract_llm_records( + self, requested_metrics: List[Record], csv_reader: DictReader + ) -> List[Record]: + llm_records: List[Record] = [] + + for requested_metric in requested_metrics: + new_llm_record = self._get_llm_record_from_csv(requested_metric, csv_reader) + llm_records.append(new_llm_record) + + return llm_records + + def _get_llm_record_from_csv( + self, requested_metric: Record, csv_reader: DictReader + ) -> Record: + for row in csv_reader: + for key, value in row.items(): + metric_string = f"{row['Metric']} {key}" + llm_metric = self._find_corresponding_llm_metric_row(metric_string) + + if ( + llm_metric + and llm_metric[PerfAnalyzer.METRIC_TAG] == requested_metric.tag + ): + adjusted_value = float(value) / float( + llm_metric[PerfAnalyzer.REDUCTION_FACTOR] + ) + + llm_record = llm_metric[PerfAnalyzer.RECORD_CLASS](adjusted_value) # type: ignore + return llm_record + + raise TritonModelAnalyzerException( + f"Did not find {requested_metric.tag} in genai-perf CSV file" + ) + + def _find_corresponding_llm_metric_row(self, metric_string: str) -> Optional[List]: + for row in PerfAnalyzer.llm_metric_table: + if metric_string == row[PerfAnalyzer.CSV_STRING]: + return row + + return None + def _cleanup_gpu_records(self, gpu_records): # Recalculate GPUFreeMemory by removing the value of the associated GPUUsedMemory # Remove any GPUFreeMemory records that don't have a matching GPUUsedMemory diff --git a/model_analyzer/record/types/inter_token_latency_p25.py b/model_analyzer/record/types/inter_token_latency_p25.py new file mode 100755 index 000000000..8a0c80173 --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p25.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP25(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p25" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p25 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p50.py b/model_analyzer/record/types/inter_token_latency_p50.py new file mode 100755 index 000000000..190920874 --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p50.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP50(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p50" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p50 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p90.py b/model_analyzer/record/types/inter_token_latency_p90.py index 58ae0ccb4..60019088a 100755 --- a/model_analyzer/record/types/inter_token_latency_p90.py +++ b/model_analyzer/record/types/inter_token_latency_p90.py @@ -20,7 +20,7 @@ @total_ordering -class InterTokenLatencyP99(InterTokenLatencyBase): +class InterTokenLatencyP90(InterTokenLatencyBase): """ A record for perf_analyzer Inter token latency metric """ diff --git a/model_analyzer/record/types/inter_token_latency_p95.py b/model_analyzer/record/types/inter_token_latency_p95.py new file mode 100755 index 000000000..b77fd9118 --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p95.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP95(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p95" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p95 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p25.py b/model_analyzer/record/types/time_to_first_token_p25.py new file mode 100755 index 000000000..5938ca3eb --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p25.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP25(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p25" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p25 Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p50.py b/model_analyzer/record/types/time_to_first_token_p50.py new file mode 100755 index 000000000..a3440b456 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p50.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP50(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p50" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p50 Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p95.py b/model_analyzer/record/types/time_to_first_token_p95.py new file mode 100755 index 000000000..6e466c4e2 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p95.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP95(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p95" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p95 Time To First Token (ms)" diff --git a/tests/test_perf_analyzer.py b/tests/test_perf_analyzer.py index e95f0d4a1..0d063f81f 100755 --- a/tests/test_perf_analyzer.py +++ b/tests/test_perf_analyzer.py @@ -33,6 +33,16 @@ from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory from model_analyzer.record.types.gpu_utilization import GPUUtilization +from model_analyzer.record.types.inter_token_latency_avg import InterTokenLatencyAvg +from model_analyzer.record.types.inter_token_latency_max import InterTokenLatencyMax +from model_analyzer.record.types.inter_token_latency_min import InterTokenLatencyMin +from model_analyzer.record.types.inter_token_latency_p25 import InterTokenLatencyP25 +from model_analyzer.record.types.inter_token_latency_p50 import InterTokenLatencyP50 +from model_analyzer.record.types.inter_token_latency_p75 import InterTokenLatencyP75 +from model_analyzer.record.types.inter_token_latency_p90 import InterTokenLatencyP90 +from model_analyzer.record.types.inter_token_latency_p95 import InterTokenLatencyP95 +from model_analyzer.record.types.inter_token_latency_p99 import InterTokenLatencyP99 +from model_analyzer.record.types.output_token_throughput import OutputTokenThroughput from model_analyzer.record.types.perf_client_response_wait import PerfClientResponseWait from model_analyzer.record.types.perf_client_send_recv import PerfClientSendRecv from model_analyzer.record.types.perf_latency_avg import PerfLatencyAvg @@ -46,6 +56,15 @@ ) from model_analyzer.record.types.perf_server_queue import PerfServerQueue from model_analyzer.record.types.perf_throughput import PerfThroughput +from model_analyzer.record.types.time_to_first_token_avg import TimeToFirstTokenAvg +from model_analyzer.record.types.time_to_first_token_max import TimeToFirstTokenMax +from model_analyzer.record.types.time_to_first_token_min import TimeToFirstTokenMin +from model_analyzer.record.types.time_to_first_token_p25 import TimeToFirstTokenP25 +from model_analyzer.record.types.time_to_first_token_p50 import TimeToFirstTokenP50 +from model_analyzer.record.types.time_to_first_token_p75 import TimeToFirstTokenP75 +from model_analyzer.record.types.time_to_first_token_p90 import TimeToFirstTokenP90 +from model_analyzer.record.types.time_to_first_token_p95 import TimeToFirstTokenP95 +from model_analyzer.record.types.time_to_first_token_p99 import TimeToFirstTokenP99 from model_analyzer.triton.client.client_factory import TritonClientFactory from model_analyzer.triton.server.server_config import TritonServerConfig from model_analyzer.triton.server.server_factory import TritonServerFactory @@ -248,7 +267,10 @@ def test_perf_analyzer_ssl_args(self): ) self.assertEqual(self.config.to_cli_string(), expected_cli_str) - def test_run(self): + def test_pa_csv_output(self): + """ + Tests the ability to read PA's CSV output + """ server_config = TritonServerConfig() server_config["model-repository"] = MODEL_REPOSITORY_PATH @@ -287,122 +309,40 @@ def test_run(self): self.assertEqual(len(records[TEST_MODEL_NAME]), 1) self.assertEqual(records[TEST_MODEL_NAME][0].value(), 5) - # Test p90 latency parsing - perf_metrics = [PerfLatencyP90] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 4.7) - - # Test p95 latency parsing - perf_metrics = [PerfLatencyP95] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 4.8) - - # Test p99 latency parsing - perf_metrics = [PerfLatencyP99] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 4.9) + # Test latency parsing + self._test_metrics_from_csv(perf_analyzer, pa_csv_mock, [PerfLatencyP90], [4.7]) + self._test_metrics_from_csv(perf_analyzer, pa_csv_mock, [PerfLatencyP95], [4.8]) + self._test_metrics_from_csv(perf_analyzer, pa_csv_mock, [PerfLatencyP99], [4.9]) # Test throughput parsing - perf_metrics = [PerfThroughput] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 46.8) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfThroughput], [46.8] + ) # Test client response wait - perf_metrics = [PerfClientResponseWait] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.314) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfClientResponseWait], [0.314] + ) # Test server queue - perf_metrics = [PerfServerQueue] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.018) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerQueue], [0.018] + ) # Test server compute infer - perf_metrics = [PerfServerComputeInfer] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.065) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerComputeInfer], [0.065] + ) # Test server compute input - perf_metrics = [PerfServerComputeInput] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.034) - - # Test server compute infer - perf_metrics = [PerfServerComputeOutput] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerComputeInput], [0.034] + ) - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.016) + # Test server compute output + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerComputeOutput], [0.016] + ) # Test Avg GPU Utilizations. Perf metric is ignored for get_gpu_records() gpu_metrics = [GPUUtilization, PerfLatencyAvg] @@ -544,6 +484,206 @@ def test_run(self): self.assertTrue(perf_analyzer.run(perf_metrics)) self.server.stop() + def test_pa_llm_csv_output(self): + """ + Tests the ability to read PA's LLM CSV output + """ + server_config = TritonServerConfig() + server_config["model-repository"] = MODEL_REPOSITORY_PATH + + # Create server, client, PerfAnalyzer, and wait for server ready + self.server = TritonServerFactory.create_server_local( + path=TRITON_LOCAL_BIN_PATH, config=server_config, gpus=self.gpus + ) + + perf_analyzer = PerfAnalyzer( + path=PERF_BIN_PATH, + config=self.run_config, + max_retries=10, + timeout=100, + max_cpu_util=50, + model_type="LLM", + ) + self.client = TritonClientFactory.create_grpc_client(server_url=TEST_GRPC_URL) + self.server.start() + self.client.wait_for_server_ready(num_retries=1) + + pa_llm_csv_mock = """Metric,avg,min,max,p99,p95,p90,p75,p50,p25\n""" + pa_llm_csv_mock += """Time to First Token (ns),4238735,3367978,6702240,6371118,5344958,5006259,4841394,4146648,3484484\n""" + pa_llm_csv_mock += """Inter Token Latency (ns),27202264,3849435,138324924,28283424,27737593,27469154,27067290,26979956,26926962\n""" + pa_llm_csv_mock += """Request Latency (ns),3363927003,3367978,14238834483,14091273510,13740917508,13692672723,3752510140,4846258,3612270\n""" + pa_llm_csv_mock += """Num Output Token,126,0,584,562,509,505,135,0,0\n""" + pa_llm_csv_mock += """\n""" + pa_llm_csv_mock += """Metric,Value\n""" + pa_llm_csv_mock += """Output Token Throughput (per sec),36.37\n""" + pa_llm_csv_mock += """Request Throughput (per sec),0.29""" + + # Test all Time to first token values + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenAvg], + [4238.735], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenMin], + [3367.978], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenMax], + [6702.240], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP99], + [6371.118], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP95], + [5344.958], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP90], + [5006.259], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP75], + [4841.394], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP50], + [4146.648], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP25], + [3484.484], + is_llm=True, + ) + + # Test all Inter token latency values + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyAvg], + [27202.264], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyMin], + [3849.435], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyMax], + [138324.924], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP99], + [28283.424], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP95], + [27737.593], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP90], + [27469.154], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP75], + [27067.290], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP50], + [26979.956], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP25], + [26926.962], + is_llm=True, + ) + + # Test output token throughput + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [OutputTokenThroughput], + [36.37], + is_llm=True, + ) + + # Test combination + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP90, InterTokenLatencyP50, OutputTokenThroughput], + [5006.259, 26979.956, 36.37], + is_llm=True, + ) + + def _test_metrics_from_csv( + self, perf_analyzer, read_data, metrics, expected_values, is_llm=False + ): + with patch( + "model_analyzer.perf_analyzer.perf_analyzer.open", + mock_open(read_data=read_data), + ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): + perf_analyzer.run(metrics) + + if is_llm: + records = perf_analyzer.get_llm_records() + else: + records = perf_analyzer.get_perf_records() + + self.assertEqual(len(records[TEST_MODEL_NAME]), len(expected_values)) + for i, expected_value in enumerate(expected_values): + self.assertEqual(records[TEST_MODEL_NAME][i].value(), expected_value) + def test_measurement_interval_increase(self): server_config = TritonServerConfig() server_config["model-repository"] = MODEL_REPOSITORY_PATH diff --git a/tests/test_record_types.py b/tests/test_record_types.py index 54c353200..1279e06df 100755 --- a/tests/test_record_types.py +++ b/tests/test_record_types.py @@ -52,14 +52,20 @@ def setUp(self): "inter_token_latency_min", "inter_token_latency_max", "inter_token_latency_avg", + "inter_token_latency_p25", + "inter_token_latency_p50", "inter_token_latency_p75", "inter_token_latency_p90", + "inter_token_latency_p95", "inter_token_latency_p99", "time_to_first_token_min", "time_to_first_token_max", "time_to_first_token_avg", + "time_to_first_token_p25", + "time_to_first_token_p50", "time_to_first_token_p75", "time_to_first_token_p90", + "time_to_first_token_p95", "time_to_first_token_p99", "gpu_used_memory", "cpu_used_ram",