From 74354f8f0f4bff2b5e8fc8baaab587b52c65eb36 Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 21 Mar 2024 14:22:51 +0000 Subject: [PATCH 1/5] Successfully reading from LLM CSV --- model_analyzer/constants.py | 3 + model_analyzer/perf_analyzer/perf_analyzer.py | 144 +++++++++++++++++- .../record/types/inter_token_latency_p25.py | 60 ++++++++ .../record/types/inter_token_latency_p50.py | 60 ++++++++ .../record/types/inter_token_latency_p90.py | 2 +- .../record/types/inter_token_latency_p95.py | 60 ++++++++ .../record/types/time_to_first_token_p25.py | 60 ++++++++ .../record/types/time_to_first_token_p50.py | 60 ++++++++ .../record/types/time_to_first_token_p95.py | 60 ++++++++ tests/test_perf_analyzer.py | 142 ++++++++++++++++- 10 files changed, 647 insertions(+), 4 deletions(-) create mode 100755 model_analyzer/record/types/inter_token_latency_p25.py create mode 100755 model_analyzer/record/types/inter_token_latency_p50.py create mode 100755 model_analyzer/record/types/inter_token_latency_p95.py create mode 100755 model_analyzer/record/types/time_to_first_token_p25.py create mode 100755 model_analyzer/record/types/time_to_first_token_p50.py create mode 100755 model_analyzer/record/types/time_to_first_token_p95.py diff --git a/model_analyzer/constants.py b/model_analyzer/constants.py index 886360d34..4fd91a480 100755 --- a/model_analyzer/constants.py +++ b/model_analyzer/constants.py @@ -70,3 +70,6 @@ # Model analyzer package name PACKAGE_NAME = "triton-model-analyzer" + +# GENAI-PERF CSV +GENAI_PERF_CSV = "profile_export_genai_perf.csv" diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index c88f8e655..52860c83d 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -21,12 +21,15 @@ import re import signal import tempfile +from csv import DictReader from subprocess import STDOUT, Popen -from typing import Dict, List +from typing import Dict, List, 
Optional import psutil +from model_analyzer.config.input.config_defaults import DEFAULT_MODEL_TYPE from model_analyzer.constants import ( + GENAI_PERF_CSV, INTERVAL_SLEEP_TIME, LOGGER_NAME, MEASUREMENT_REQUEST_COUNT_STEP, @@ -40,6 +43,16 @@ from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory from model_analyzer.record.types.gpu_utilization import GPUUtilization +from model_analyzer.record.types.inter_token_latency_avg import InterTokenLatencyAvg +from model_analyzer.record.types.inter_token_latency_max import InterTokenLatencyMax +from model_analyzer.record.types.inter_token_latency_min import InterTokenLatencyMin +from model_analyzer.record.types.inter_token_latency_p25 import InterTokenLatencyP25 +from model_analyzer.record.types.inter_token_latency_p50 import InterTokenLatencyP50 +from model_analyzer.record.types.inter_token_latency_p75 import InterTokenLatencyP75 +from model_analyzer.record.types.inter_token_latency_p90 import InterTokenLatencyP90 +from model_analyzer.record.types.inter_token_latency_p95 import InterTokenLatencyP95 +from model_analyzer.record.types.inter_token_latency_p99 import InterTokenLatencyP99 +from model_analyzer.record.types.output_token_throughput import OutputTokenThroughput from model_analyzer.record.types.perf_client_response_wait import PerfClientResponseWait from model_analyzer.record.types.perf_client_send_recv import PerfClientSendRecv from model_analyzer.record.types.perf_latency_avg import PerfLatencyAvg @@ -53,6 +66,15 @@ ) from model_analyzer.record.types.perf_server_queue import PerfServerQueue from model_analyzer.record.types.perf_throughput import PerfThroughput +from model_analyzer.record.types.time_to_first_token_avg import TimeToFirstTokenAvg +from model_analyzer.record.types.time_to_first_token_max import TimeToFirstTokenMax +from model_analyzer.record.types.time_to_first_token_min import TimeToFirstTokenMin +from 
model_analyzer.record.types.time_to_first_token_p25 import TimeToFirstTokenP25
+from model_analyzer.record.types.time_to_first_token_p50 import TimeToFirstTokenP50
+from model_analyzer.record.types.time_to_first_token_p75 import TimeToFirstTokenP75
+from model_analyzer.record.types.time_to_first_token_p90 import TimeToFirstTokenP90
+from model_analyzer.record.types.time_to_first_token_p95 import TimeToFirstTokenP95
+from model_analyzer.record.types.time_to_first_token_p99 import TimeToFirstTokenP99
 
 logger = logging.getLogger(LOGGER_NAME)
 
@@ -91,6 +113,28 @@ class PerfAnalyzer:
         ["gpu_used_memory", "Max GPU Memory Usage", GPUUsedMemory, "1000000"],
         ["gpu_free_memory", "Total GPU Memory", GPUFreeMemory, "1000000"]
     ]
+
+    llm_metric_table = [
+        ["time_to_first_token_avg", "Time to First Token (ns) avg", TimeToFirstTokenAvg, "1000"],
+        ["time_to_first_token_min", "Time to First Token (ns) min", TimeToFirstTokenMin, "1000"],
+        ["time_to_first_token_max", "Time to First Token (ns) max", TimeToFirstTokenMax, "1000"],
+        ["time_to_first_token_p99", "Time to First Token (ns) p99", TimeToFirstTokenP99, "1000"],
+        ["time_to_first_token_p95", "Time to First Token (ns) p95", TimeToFirstTokenP95, "1000"],
+        ["time_to_first_token_p90", "Time to First Token (ns) p90", TimeToFirstTokenP90, "1000"],
+        ["time_to_first_token_p75", "Time to First Token (ns) p75", TimeToFirstTokenP75, "1000"],
+        ["time_to_first_token_p50", "Time to First Token (ns) p50", TimeToFirstTokenP50, "1000"],
+        ["time_to_first_token_p25", "Time to First Token (ns) p25", TimeToFirstTokenP25, "1000"],
+        ["inter_token_latency_avg", "Inter Token Latency (ns) avg", InterTokenLatencyAvg, "1000"],
+        ["inter_token_latency_min", "Inter Token Latency (ns) min", InterTokenLatencyMin, "1000"],
+        ["inter_token_latency_max", "Inter Token Latency (ns) max", InterTokenLatencyMax, "1000"],
+        ["inter_token_latency_p99", "Inter Token Latency (ns) p99", InterTokenLatencyP99, "1000"],
+        ["inter_token_latency_p95", "Inter Token Latency 
(ns) p95", InterTokenLatencyP95, "1000"], + ["inter_token_latency_p90", "Inter Token Latency (ns) p90", InterTokenLatencyP90, "1000"], + ["inter_token_latency_p75", "Inter Token Latency (ns) p75", InterTokenLatencyP75, "1000"], + ["inter_token_latency_p50", "Inter Token Latency (ns) p50", InterTokenLatencyP50, "1000"], + ["inter_token_latency_p25", "Inter Token Latency (ns) p25", InterTokenLatencyP25, "1000"], + ["output_token_throughput", "Output Token Throughput (per sec) avg", OutputTokenThroughput, "1"] + ] # yapf: enable @staticmethod @@ -109,7 +153,23 @@ def get_gpu_metrics(): ] return gpu_metrics - def __init__(self, path, config, max_retries, timeout, max_cpu_util): + @staticmethod + def get_llm_metrics(): + llm_metrics = [ + llm_metric[PerfAnalyzer.RECORD_CLASS] + for llm_metric in PerfAnalyzer.llm_metric_table + ] + return llm_metrics + + def __init__( + self, + path, + config, + max_retries, + timeout, + max_cpu_util, + model_type=DEFAULT_MODEL_TYPE, + ): """ Parameters ---------- @@ -133,8 +193,10 @@ def __init__(self, path, config, max_retries, timeout, max_cpu_util): self._timeout = timeout self._output = "" self._perf_records = {} + self._llm_records = {} self._gpu_records = [] self._max_cpu_util = max_cpu_util + self._model_type = model_type def run(self, metrics, env=None): """ @@ -198,6 +260,19 @@ def get_perf_records(self): "Attempted to get perf_analyzer results" "without calling run first." ) + def get_llm_records(self): + """ + Returns + ------- + The LLM records from the last perf_analyzer run + """ + + if self._llm_records: + return self._llm_records + raise TritonModelAnalyzerException( + "Attempted to get perf_analyzer results" "without calling run first." 
+    )
+
     def get_gpu_records(self):
         """
         Returns
@@ -438,6 +513,12 @@ def _is_multi_model(self):
         return len(self._config.model_run_configs()) > 1
 
     def _parse_outputs(self, metrics):
+        self._parse_generic_outputs(metrics)
+
+        if self._model_type == "LLM":
+            self._parse_llm_outputs(metrics)
+
+    def _parse_generic_outputs(self, metrics):
         """
         Extract records from the Perf Analyzer run for each model
         """
@@ -464,6 +545,24 @@ def _parse_outputs(self, metrics):
             for f in glob.glob(f"*{perf_config['latency-report-file']}"):
                 os.remove(f)
 
+    def _parse_llm_outputs(self, metrics):
+        """
+        Extract records from the Perf Analyzer run for each model
+        """
+
+        perf_config = self._config.model_run_configs()[0].perf_config()
+
+        logger.debug(f"Reading PA results from {GENAI_PERF_CSV}")
+        with open(GENAI_PERF_CSV, mode="r") as f:
+            csv_reader = csv.DictReader(f, delimiter=",")
+
+            # See test_perf_analyzer::test_pa_llm_csv_output() for CSV output example
+            self._llm_records[perf_config["model-name"]] = self._extract_llm_records(
+                metrics, csv_reader
+            )
+
+        os.remove(GENAI_PERF_CSV)
+
     def _extract_perf_records_from_row(
         self, requested_metrics: List[Record], row_metrics: Dict[str, str]
     ) -> List[Record]:
@@ -526,6 +625,47 @@ def _extract_gpu_records_from_row(
         self._cleanup_gpu_records(gpu_records)
         return gpu_records
 
+    def _extract_llm_records(
+        self, requested_metrics: List[Record], csv_reader: DictReader
+    ) -> List[Record]:
+        llm_records: List[Record] = []
+
+        for requested_metric in requested_metrics:
+            new_llm_record = self._get_llm_record_from_csv(requested_metric, csv_reader)
+            llm_records.append(new_llm_record)
+
+        return llm_records
+
+    def _get_llm_record_from_csv(
+        self, requested_metric: Record, csv_reader: DictReader
+    ) -> Record:
+        for row in csv_reader:
+            for key, value in row.items():
+                metric_string = f"{row['Metric']} {key}"
+                llm_metric = self._find_corresponding_llm_metric_row(metric_string)
+
+                if (
+                    llm_metric
+                    and llm_metric[PerfAnalyzer.METRIC_TAG] == requested_metric.tag
+
): + adjusted_value = float(value) / float( + llm_metric[PerfAnalyzer.REDUCTION_FACTOR] + ) + + llm_record = llm_metric[PerfAnalyzer.RECORD_CLASS](adjusted_value) # type: ignore + return llm_record + + raise TritonModelAnalyzerException( + f"Did not find {requested_metric.tag} in genai-perf CSV file" + ) + + def _find_corresponding_llm_metric_row(self, metric_string) -> Optional[List]: + for row in PerfAnalyzer.llm_metric_table: + if metric_string == row[PerfAnalyzer.CSV_STRING]: + return row + + return None + def _cleanup_gpu_records(self, gpu_records): # Recalculate GPUFreeMemory by removing the value of the associated GPUUsedMemory # Remove any GPUFreeMemory records that don't have a matching GPUUsedMemory diff --git a/model_analyzer/record/types/inter_token_latency_p25.py b/model_analyzer/record/types/inter_token_latency_p25.py new file mode 100755 index 000000000..6d761a683 --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p25.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from functools import total_ordering
+
+from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase
+
+
+@total_ordering
+class InterTokenLatencyP25(InterTokenLatencyBase):
+    """
+    A record for perf_analyzer Inter token latency metric
+    """
+
+    tag = "inter_token_latency_p25"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            An optional tag that may be displayed
+            as part of the header indicating that
+            this record has been aggregated using
+            max, min or average etc.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric.
+        """
+
+        return "p25 Inter Token Latency (ms)"
diff --git a/model_analyzer/record/types/inter_token_latency_p50.py b/model_analyzer/record/types/inter_token_latency_p50.py
new file mode 100755
index 000000000..4a9beafff
--- /dev/null
+++ b/model_analyzer/record/types/inter_token_latency_p50.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase
+
+
+@total_ordering
+class InterTokenLatencyP50(InterTokenLatencyBase):
+    """
+    A record for perf_analyzer Inter token latency metric
+    """
+
+    tag = "inter_token_latency_p50"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            An optional tag that may be displayed
+            as part of the header indicating that
+            this record has been aggregated using
+            max, min or average etc.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric.
+        """
+
+        return "p50 Inter Token Latency (ms)"
diff --git a/model_analyzer/record/types/inter_token_latency_p90.py b/model_analyzer/record/types/inter_token_latency_p90.py
index 58ae0ccb4..60019088a 100755
--- a/model_analyzer/record/types/inter_token_latency_p90.py
+++ b/model_analyzer/record/types/inter_token_latency_p90.py
@@ -20,7 +20,7 @@
 
 
 @total_ordering
-class InterTokenLatencyP99(InterTokenLatencyBase):
+class InterTokenLatencyP90(InterTokenLatencyBase):
     """
     A record for perf_analyzer Inter token latency metric
     """
diff --git a/model_analyzer/record/types/inter_token_latency_p95.py b/model_analyzer/record/types/inter_token_latency_p95.py
new file mode 100755
index 000000000..596f3c55f
--- /dev/null
+++ b/model_analyzer/record/types/inter_token_latency_p95.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase
+
+
+@total_ordering
+class InterTokenLatencyP95(InterTokenLatencyBase):
+    """
+    A record for perf_analyzer Inter token latency metric
+    """
+
+    tag = "inter_token_latency_p95"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            An optional tag that may be displayed
+            as part of the header indicating that
+            this record has been aggregated using
+            max, min or average etc.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric.
+        """
+
+        return "p95 Inter Token Latency (ms)"
diff --git a/model_analyzer/record/types/time_to_first_token_p25.py b/model_analyzer/record/types/time_to_first_token_p25.py
new file mode 100755
index 000000000..bfcc1b63c
--- /dev/null
+++ b/model_analyzer/record/types/time_to_first_token_p25.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase
+
+
+@total_ordering
+class TimeToFirstTokenP25(TimeToFirstTokenBase):
+    """
+    A record for perf_analyzer Time to first token metric
+    """
+
+    tag = "time_to_first_token_p25"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            An optional tag that may be displayed
+            as part of the header indicating that
+            this record has been aggregated using
+            max, min or average etc.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric.
+        """
+
+        return "p25 Time To First Token (ms)"
diff --git a/model_analyzer/record/types/time_to_first_token_p50.py b/model_analyzer/record/types/time_to_first_token_p50.py
new file mode 100755
index 000000000..461b8e6d3
--- /dev/null
+++ b/model_analyzer/record/types/time_to_first_token_p50.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase
+
+
+@total_ordering
+class TimeToFirstTokenP50(TimeToFirstTokenBase):
+    """
+    A record for perf_analyzer Time to first token metric
+    """
+
+    tag = "time_to_first_token_p50"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed time from start of program
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            An optional tag that may be displayed
+            as part of the header indicating that
+            this record has been aggregated using
+            max, min or average etc.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric.
+        """
+
+        return "p50 Time To First Token (ms)"
diff --git a/model_analyzer/record/types/time_to_first_token_p95.py b/model_analyzer/record/types/time_to_first_token_p95.py
new file mode 100755
index 000000000..d78c8af10
--- /dev/null
+++ b/model_analyzer/record/types/time_to_first_token_p95.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP95(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p95" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. 
+ """ + + return "p99 Time To First Token (ms)" diff --git a/tests/test_perf_analyzer.py b/tests/test_perf_analyzer.py index e95f0d4a1..df6a1db34 100755 --- a/tests/test_perf_analyzer.py +++ b/tests/test_perf_analyzer.py @@ -33,6 +33,16 @@ from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory from model_analyzer.record.types.gpu_utilization import GPUUtilization +from model_analyzer.record.types.inter_token_latency_avg import InterTokenLatencyAvg +from model_analyzer.record.types.inter_token_latency_max import InterTokenLatencyMax +from model_analyzer.record.types.inter_token_latency_min import InterTokenLatencyMin +from model_analyzer.record.types.inter_token_latency_p25 import InterTokenLatencyP25 +from model_analyzer.record.types.inter_token_latency_p50 import InterTokenLatencyP50 +from model_analyzer.record.types.inter_token_latency_p75 import InterTokenLatencyP75 +from model_analyzer.record.types.inter_token_latency_p90 import InterTokenLatencyP90 +from model_analyzer.record.types.inter_token_latency_p95 import InterTokenLatencyP95 +from model_analyzer.record.types.inter_token_latency_p99 import InterTokenLatencyP99 +from model_analyzer.record.types.output_token_throughput import OutputTokenThroughput from model_analyzer.record.types.perf_client_response_wait import PerfClientResponseWait from model_analyzer.record.types.perf_client_send_recv import PerfClientSendRecv from model_analyzer.record.types.perf_latency_avg import PerfLatencyAvg @@ -46,6 +56,15 @@ ) from model_analyzer.record.types.perf_server_queue import PerfServerQueue from model_analyzer.record.types.perf_throughput import PerfThroughput +from model_analyzer.record.types.time_to_first_token_avg import TimeToFirstTokenAvg +from model_analyzer.record.types.time_to_first_token_max import TimeToFirstTokenMax +from model_analyzer.record.types.time_to_first_token_min import TimeToFirstTokenMin +from 
model_analyzer.record.types.time_to_first_token_p25 import TimeToFirstTokenP25 +from model_analyzer.record.types.time_to_first_token_p50 import TimeToFirstTokenP50 +from model_analyzer.record.types.time_to_first_token_p75 import TimeToFirstTokenP75 +from model_analyzer.record.types.time_to_first_token_p90 import TimeToFirstTokenP90 +from model_analyzer.record.types.time_to_first_token_p95 import TimeToFirstTokenP95 +from model_analyzer.record.types.time_to_first_token_p99 import TimeToFirstTokenP99 from model_analyzer.triton.client.client_factory import TritonClientFactory from model_analyzer.triton.server.server_config import TritonServerConfig from model_analyzer.triton.server.server_factory import TritonServerFactory @@ -248,7 +267,10 @@ def test_perf_analyzer_ssl_args(self): ) self.assertEqual(self.config.to_cli_string(), expected_cli_str) - def test_run(self): + def test_pa_csv_output(self): + """ + Tests the ability to read PA's CSV output + """ server_config = TritonServerConfig() server_config["model-repository"] = MODEL_REPOSITORY_PATH @@ -544,6 +566,124 @@ def test_run(self): self.assertTrue(perf_analyzer.run(perf_metrics)) self.server.stop() + def test_pa_llm_csv_output(self): + """ + Tests the ability to read PA's LLM CSV output + """ + server_config = TritonServerConfig() + server_config["model-repository"] = MODEL_REPOSITORY_PATH + + # Create server, client, PerfAnalyzer, and wait for server ready + self.server = TritonServerFactory.create_server_local( + path=TRITON_LOCAL_BIN_PATH, config=server_config, gpus=self.gpus + ) + + perf_analyzer = PerfAnalyzer( + path=PERF_BIN_PATH, + config=self.run_config, + max_retries=10, + timeout=100, + max_cpu_util=50, + model_type="LLM", + ) + self.client = TritonClientFactory.create_grpc_client(server_url=TEST_GRPC_URL) + self.server.start() + self.client.wait_for_server_ready(num_retries=1) + + pa_llm_csv_mock = """Metric,avg,min,max,p99,p95,p90,p75,p50,p25\n""" + pa_llm_csv_mock += """Time to First Token 
(ns),4238735,3367978,6702240,6371118,5344958,5006259,4841394,4146648,3484484\n""" + pa_llm_csv_mock += """Inter Token Latency (ns),27202264,3849435,138324924,28283424,27737593,27469154,27067290,26979956,26926962\n""" + pa_llm_csv_mock += """Request Latency (ns),3363927003,3367978,14238834483,14091273510,13740917508,13692672723,3752510140,4846258,3612270\n""" + pa_llm_csv_mock += """Num Output Token,126,0,584,562,509,505,135,0,0\n""" + pa_llm_csv_mock += """\n""" + pa_llm_csv_mock += """Metric,Value\n""" + pa_llm_csv_mock += """Output Token Throughput (per sec),36.37\n""" + pa_llm_csv_mock += """Request Throughput (per sec),0.29""" + + # Test all Time to first token values + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenAvg], [4238.735] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenMin], [3367.978] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenMax], [6702.240] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP99], [6371.118] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP95], [5344.958] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP90], [5006.259] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP75], [4841.394] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP50], [4146.648] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP25], [3484.484] + ) + + # Test all Inter token latency values + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyAvg], [27202.264] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyMin], [3849.435] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyMax], [138324.924] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP99], [28283.424] + ) + 
self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP95], [27737.593] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP90], [27469.154] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP75], [27067.290] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP50], [26979.956] + ) + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP25], [26926.962] + ) + + # Test output token throughput + self._test_csv_output( + perf_analyzer, pa_llm_csv_mock, [OutputTokenThroughput], [36.37] + ) + + # Test combination + self._test_csv_output( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP90, InterTokenLatencyP50, OutputTokenThroughput], + [5006.259, 26979.956, 36.37], + ) + + def _test_csv_output(self, perf_analyzer, read_data, metrics, expected_values): + with patch( + "model_analyzer.perf_analyzer.perf_analyzer.open", + mock_open(read_data=read_data), + ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): + perf_analyzer.run(metrics) + + records = perf_analyzer.get_llm_records() + + self.assertEqual(len(records[TEST_MODEL_NAME]), len(expected_values)) + for i, expected_value in enumerate(expected_values): + self.assertEqual(records[TEST_MODEL_NAME][i].value(), expected_value) + def test_measurement_interval_increase(self): server_config = TritonServerConfig() server_config["model-repository"] = MODEL_REPOSITORY_PATH From 9d8f0cf951f8fe6111b532526a5315e760b10467 Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 21 Mar 2024 14:38:20 +0000 Subject: [PATCH 2/5] General cleanup --- model_analyzer/perf_analyzer/perf_analyzer.py | 2 +- tests/test_perf_analyzer.py | 172 +++++------------- 2 files changed, 47 insertions(+), 127 deletions(-) diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index 52860c83d..e10a1cd58 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py 
+++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -659,7 +659,7 @@ def _get_llm_record_from_csv( f"Did not find {requested_metric.tag} in genai-perf CSV file" ) - def _find_corresponding_llm_metric_row(self, metric_string) -> Optional[List]: + def _find_corresponding_llm_metric_row(self, metric_string: str) -> Optional[List]: for row in PerfAnalyzer.llm_metric_table: if metric_string == row[PerfAnalyzer.CSV_STRING]: return row diff --git a/tests/test_perf_analyzer.py b/tests/test_perf_analyzer.py index df6a1db34..abd9bde58 100755 --- a/tests/test_perf_analyzer.py +++ b/tests/test_perf_analyzer.py @@ -309,122 +309,40 @@ def test_pa_csv_output(self): self.assertEqual(len(records[TEST_MODEL_NAME]), 1) self.assertEqual(records[TEST_MODEL_NAME][0].value(), 5) - # Test p90 latency parsing - perf_metrics = [PerfLatencyP90] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 4.7) - - # Test p95 latency parsing - perf_metrics = [PerfLatencyP95] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 4.8) - - # Test p99 latency parsing - perf_metrics = [PerfLatencyP99] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - 
self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 4.9) + # Test latency parsing + self._test_metrics_from_csv(perf_analyzer, pa_csv_mock, [PerfLatencyP90], [4.7]) + self._test_metrics_from_csv(perf_analyzer, pa_csv_mock, [PerfLatencyP95], [4.8]) + self._test_metrics_from_csv(perf_analyzer, pa_csv_mock, [PerfLatencyP99], [4.9]) # Test throughput parsing - perf_metrics = [PerfThroughput] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 46.8) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfThroughput], [46.8] + ) # Test client response wait - perf_metrics = [PerfClientResponseWait] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.314) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfClientResponseWait], [0.314] + ) # Test server queue - perf_metrics = [PerfServerQueue] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.018) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerQueue], [0.018] + ) # Test server compute infer - 
perf_metrics = [PerfServerComputeInfer] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.065) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerComputeInfer], [0.065] + ) # Test server compute input - perf_metrics = [PerfServerComputeInput] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.034) - - # Test server compute infer - perf_metrics = [PerfServerComputeOutput] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerComputeInput], [0.034] + ) - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.016) + # Test server compute output + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerComputeOutput], [0.016] + ) # Test Avg GPU Utilizations. 
Perf metric is ignored for get_gpu_records() gpu_metrics = [GPUUtilization, PerfLatencyAvg] @@ -601,77 +519,79 @@ def test_pa_llm_csv_output(self): pa_llm_csv_mock += """Request Throughput (per sec),0.29""" # Test all Time to first token values - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenAvg], [4238.735] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenMin], [3367.978] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenMax], [6702.240] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP99], [6371.118] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP95], [5344.958] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP90], [5006.259] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP75], [4841.394] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP50], [4146.648] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP25], [3484.484] ) # Test all Inter token latency values - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyAvg], [27202.264] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyMin], [3849.435] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyMax], [138324.924] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP99], [28283.424] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, 
[InterTokenLatencyP95], [27737.593] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP90], [27469.154] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP75], [27067.290] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP50], [26979.956] ) - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP25], [26926.962] ) # Test output token throughput - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [OutputTokenThroughput], [36.37] ) # Test combination - self._test_csv_output( + self._test_metrics_from_csv( perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP90, InterTokenLatencyP50, OutputTokenThroughput], [5006.259, 26979.956, 36.37], ) - def _test_csv_output(self, perf_analyzer, read_data, metrics, expected_values): + def _test_metrics_from_csv( + self, perf_analyzer, read_data, metrics, expected_values + ): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", mock_open(read_data=read_data), From 967a8dd2d7412bb305dcdb74480092f3e1a91a78 Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 21 Mar 2024 14:53:46 +0000 Subject: [PATCH 3/5] All unit tests passing --- model_analyzer/perf_analyzer/perf_analyzer.py | 4 +- tests/test_perf_analyzer.py | 122 +++++++++++++++--- tests/test_record_types.py | 6 + 3 files changed, 109 insertions(+), 23 deletions(-) diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index e10a1cd58..586ba2465 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -257,7 +257,7 @@ def get_perf_records(self): if self._perf_records: return self._perf_records raise TritonModelAnalyzerException( - "Attempted to get perf_analyzer results" "without calling run first." 
+ "Attempted to get perf_analyzer results without calling run first." ) def get_llm_records(self): @@ -270,7 +270,7 @@ def get_llm_records(self): if self._llm_records: return self._llm_records raise TritonModelAnalyzerException( - "Attempted to get perf_analyzer results" "without calling run first." + "Attempted to get perf_analyzer results without calling run first." ) def get_gpu_records(self): diff --git a/tests/test_perf_analyzer.py b/tests/test_perf_analyzer.py index abd9bde58..0d063f81f 100755 --- a/tests/test_perf_analyzer.py +++ b/tests/test_perf_analyzer.py @@ -520,65 +520,141 @@ def test_pa_llm_csv_output(self): # Test all Time to first token values self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenAvg], [4238.735] + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenAvg], + [4238.735], + is_llm=True, ) self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenMin], [3367.978] + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenMin], + [3367.978], + is_llm=True, ) self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenMax], [6702.240] + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenMax], + [6702.240], + is_llm=True, ) self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP99], [6371.118] + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP99], + [6371.118], + is_llm=True, ) self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP95], [5344.958] + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP95], + [5344.958], + is_llm=True, ) self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP90], [5006.259] + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP90], + [5006.259], + is_llm=True, ) self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP75], [4841.394] + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP75], + [4841.394], + is_llm=True, ) 
self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP50], [4146.648] + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP50], + [4146.648], + is_llm=True, ) self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [TimeToFirstTokenP25], [3484.484] + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP25], + [3484.484], + is_llm=True, ) # Test all Inter token latency values self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyAvg], [27202.264] + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyAvg], + [27202.264], + is_llm=True, ) self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyMin], [3849.435] + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyMin], + [3849.435], + is_llm=True, ) self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyMax], [138324.924] + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyMax], + [138324.924], + is_llm=True, ) self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP99], [28283.424] + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP99], + [28283.424], + is_llm=True, ) self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP95], [27737.593] + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP95], + [27737.593], + is_llm=True, ) self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP90], [27469.154] + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP90], + [27469.154], + is_llm=True, ) self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP75], [27067.290] + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP75], + [27067.290], + is_llm=True, ) self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP50], [26979.956] + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP50], + [26979.956], + is_llm=True, ) self._test_metrics_from_csv( - 
perf_analyzer, pa_llm_csv_mock, [InterTokenLatencyP25], [26926.962] + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP25], + [26926.962], + is_llm=True, ) # Test output token throughput self._test_metrics_from_csv( - perf_analyzer, pa_llm_csv_mock, [OutputTokenThroughput], [36.37] + perf_analyzer, + pa_llm_csv_mock, + [OutputTokenThroughput], + [36.37], + is_llm=True, ) # Test combination @@ -587,10 +663,11 @@ def test_pa_llm_csv_output(self): pa_llm_csv_mock, [TimeToFirstTokenP90, InterTokenLatencyP50, OutputTokenThroughput], [5006.259, 26979.956, 36.37], + is_llm=True, ) def _test_metrics_from_csv( - self, perf_analyzer, read_data, metrics, expected_values + self, perf_analyzer, read_data, metrics, expected_values, is_llm=False ): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", @@ -598,7 +675,10 @@ def _test_metrics_from_csv( ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(metrics) - records = perf_analyzer.get_llm_records() + if is_llm: + records = perf_analyzer.get_llm_records() + else: + records = perf_analyzer.get_perf_records() self.assertEqual(len(records[TEST_MODEL_NAME]), len(expected_values)) for i, expected_value in enumerate(expected_values): diff --git a/tests/test_record_types.py b/tests/test_record_types.py index 54c353200..1279e06df 100755 --- a/tests/test_record_types.py +++ b/tests/test_record_types.py @@ -52,14 +52,20 @@ def setUp(self): "inter_token_latency_min", "inter_token_latency_max", "inter_token_latency_avg", + "inter_token_latency_p25", + "inter_token_latency_p50", "inter_token_latency_p75", "inter_token_latency_p90", + "inter_token_latency_p95", "inter_token_latency_p99", "time_to_first_token_min", "time_to_first_token_max", "time_to_first_token_avg", + "time_to_first_token_p25", + "time_to_first_token_p50", "time_to_first_token_p75", "time_to_first_token_p90", + "time_to_first_token_p95", "time_to_first_token_p99", "gpu_used_memory", "cpu_used_ram", From 
27ed51eace9b4431108331c7d3084fd4d5b8570b Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 21 Mar 2024 17:00:02 +0000 Subject: [PATCH 4/5] Fixing metric table typos --- model_analyzer/perf_analyzer/perf_analyzer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index 586ba2465..e5387f04e 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -119,11 +119,11 @@ class PerfAnalyzer: ["time_to_first_token_min", "Time to First Token (ns) min", TimeToFirstTokenMin, "1000"], ["time_to_first_token_max", "Time to First Token (ns) max", TimeToFirstTokenMax, "1000"], ["time_to_first_token_p99", "Time to First Token (ns) p99", TimeToFirstTokenP99, "1000"], - ["time_to_first_token_p95", "Time to First Token (ns) p95", TimeToFirstTokenP99, "1000"], + ["time_to_first_token_p95", "Time to First Token (ns) p95", TimeToFirstTokenP95, "1000"], ["time_to_first_token_p90", "Time to First Token (ns) p90", TimeToFirstTokenP90, "1000"], ["time_to_first_token_p75", "Time to First Token (ns) p75", TimeToFirstTokenP75, "1000"], - ["time_to_first_token_p50", "Time to First Token (ns) p50", TimeToFirstTokenP99, "1000"], - ["time_to_first_token_p25", "Time to First Token (ns) p25", TimeToFirstTokenP99, "1000"], + ["time_to_first_token_p50", "Time to First Token (ns) p50", TimeToFirstTokenP50, "1000"], + ["time_to_first_token_p25", "Time to First Token (ns) p25", TimeToFirstTokenP25, "1000"], ["inter_token_latency_avg", "Inter Token Latency (ns) avg", InterTokenLatencyAvg, "1000"], ["inter_token_latency_min", "Inter Token Latency (ns) min", InterTokenLatencyMin, "1000"], ["inter_token_latency_max", "Inter Token Latency (ns) max", InterTokenLatencyMax, "1000"], From 6bcc4480ef59877bead502ed30e6fbfb549ca95d Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 21 Mar 2024 17:27:47 +0000 Subject: [PATCH 5/5] Fixing typos --- 
model_analyzer/record/types/inter_token_latency_p25.py | 2 +- model_analyzer/record/types/inter_token_latency_p50.py | 2 +- model_analyzer/record/types/inter_token_latency_p95.py | 2 +- model_analyzer/record/types/time_to_first_token_p25.py | 2 +- model_analyzer/record/types/time_to_first_token_p50.py | 2 +- model_analyzer/record/types/time_to_first_token_p95.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/model_analyzer/record/types/inter_token_latency_p25.py b/model_analyzer/record/types/inter_token_latency_p25.py index 6d761a683..8a0c80173 100755 --- a/model_analyzer/record/types/inter_token_latency_p25.py +++ b/model_analyzer/record/types/inter_token_latency_p25.py @@ -57,4 +57,4 @@ def header(cls, aggregation_tag=False): metric. """ - return "p90 Inter Token Latency (ms)" + return "p25 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p50.py b/model_analyzer/record/types/inter_token_latency_p50.py index 4a9beafff..190920874 100755 --- a/model_analyzer/record/types/inter_token_latency_p50.py +++ b/model_analyzer/record/types/inter_token_latency_p50.py @@ -57,4 +57,4 @@ def header(cls, aggregation_tag=False): metric. """ - return "p90 Inter Token Latency (ms)" + return "p50 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p95.py b/model_analyzer/record/types/inter_token_latency_p95.py index 596f3c55f..b77fd9118 100755 --- a/model_analyzer/record/types/inter_token_latency_p95.py +++ b/model_analyzer/record/types/inter_token_latency_p95.py @@ -57,4 +57,4 @@ def header(cls, aggregation_tag=False): metric. 
""" - return "p90 Inter Token Latency (ms)" + return "p95 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p25.py b/model_analyzer/record/types/time_to_first_token_p25.py index bfcc1b63c..5938ca3eb 100755 --- a/model_analyzer/record/types/time_to_first_token_p25.py +++ b/model_analyzer/record/types/time_to_first_token_p25.py @@ -57,4 +57,4 @@ def header(cls, aggregation_tag=False): metric. """ - return "p99 Time To First Token (ms)" + return "p25 Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p50.py b/model_analyzer/record/types/time_to_first_token_p50.py index 461b8e6d3..a3440b456 100755 --- a/model_analyzer/record/types/time_to_first_token_p50.py +++ b/model_analyzer/record/types/time_to_first_token_p50.py @@ -57,4 +57,4 @@ def header(cls, aggregation_tag=False): metric. """ - return "p99 Time To First Token (ms)" + return "p50 Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p95.py b/model_analyzer/record/types/time_to_first_token_p95.py index d78c8af10..6e466c4e2 100755 --- a/model_analyzer/record/types/time_to_first_token_p95.py +++ b/model_analyzer/record/types/time_to_first_token_p95.py @@ -57,4 +57,4 @@ def header(cls, aggregation_tag=False): metric. """ - return "p99 Time To First Token (ms)" + return "p95 Time To First Token (ms)"