From 9da847ee6caf169fe289f987387538a911d9af1d Mon Sep 17 00:00:00 2001 From: braf Date: Tue, 10 Oct 2023 00:10:11 +0000 Subject: [PATCH 01/13] Initial code for aggregation of new LLM metrics --- .../config/input/config_command_profile.py | 4 + model_analyzer/perf_analyzer/perf_analyzer.py | 90 +++++++++++++++++-- model_analyzer/perf_analyzer/perf_config.py | 8 +- tests/common/test_utils.py | 12 ++- tests/test_perf_analyzer.py | 74 ++++++++++++--- 5 files changed, 166 insertions(+), 22 deletions(-) diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py index a215a2251..9c90bc75b 100755 --- a/model_analyzer/config/input/config_command_profile.py +++ b/model_analyzer/config/input/config_command_profile.py @@ -66,6 +66,10 @@ def __init__(self): super().__init__() self._fill_config() + # FIXME: placeholder until branch is merged + def is_llm_model(self): + return False + def _resolve_protobuf_field(self, field: FieldDescriptor) -> ConfigSweep: """ Recursively resolve protobuf fields. diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index 51ad64151..7370d1955 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -16,6 +16,7 @@ import csv import glob +import json import logging import os import re @@ -118,6 +119,14 @@ def get_gpu_metrics(): ] return gpu_metrics + @staticmethod + def get_llm_metrics(): + llm_metrics = [ + llm_metric[PerfAnalyzer.RECORD_CLASS] + for llm_metric in PerfAnalyzer.llm_metric_table + ] + return llm_metrics + def __init__(self, path, config, max_retries, timeout, max_cpu_util): """ Parameters @@ -143,6 +152,7 @@ def __init__(self, path, config, max_retries, timeout, max_cpu_util): self._output = "" self._perf_records = {} self._gpu_records = [] + self._llm_records = {} self._max_cpu_util = max_cpu_util def run(self, metrics, env=None): @@ -216,6 +226,19 @@ def get_gpu_records(self): return self._gpu_records + def get_llm_records(self): + """ + Returns + ------- + The LLM records from the last perf_analyzer run + """ + + if self._llm_records: + return self._llm_records + raise TritonModelAnalyzerException( + "Attempted to get perf_analyzer results" "without calling run first." 
+ ) + def output(self): """ Returns @@ -457,14 +480,8 @@ def _parse_outputs(self, metrics): logger.debug( f"Reading PA results from {perf_config['latency-report-file']}" ) - with open(perf_config["latency-report-file"], mode="r") as f: - csv_reader = csv.DictReader(f, delimiter=",") - - for row in csv_reader: - self._perf_records[ - perf_config["model-name"] - ] = self._extract_perf_records_from_row(metrics, row) - self._gpu_records = self._extract_gpu_records_from_row(metrics, row) + self._extract_gpu_records(perf_config, metrics) + self._extract_llm_records(perf_config, metrics) for perf_config in [ mrc.perf_config() for mrc in self._config.model_run_configs() @@ -473,6 +490,63 @@ def _parse_outputs(self, metrics): for f in glob.glob(f"*{perf_config['latency-report-file']}"): os.remove(f) + def _extract_gpu_records(self, perf_config, metrics): + with open(perf_config["latency-report-file"], mode="r") as f: + csv_reader = csv.DictReader(f, delimiter=",") + + for row in csv_reader: + self._perf_records[ + perf_config["model-name"] + ] = self._extract_perf_records_from_row(metrics, row) + self._gpu_records = self._extract_gpu_records_from_row(metrics, row) + + def _extract_llm_records(self, perf_config, metrics): + with open(perf_config["profile-export-file"], mode="r") as f: + llm_output = json.load(f) + + avg_first_token_to_token_latency = ( + self._calculate_avg_first_token_to_token_latency(llm_output) + ) + record = llm_metric[PerfAnalyzer.RECORD_CLASS]( + value=avg_first_token_to_token_latency + ) # type: ignore + + self._llm_records[perf_config["model-name"]].append(record) + + # avg_avg_token_to_token_latency = ( + # self._calculate_avg_avg_token_to_token_latency(llm_output) + # ) + + def _calculate_avg_first_token_to_token_latency(self, llm_output: str) -> float: + total_first_token_latency = 0 + for request in llm_output["experiments"][0]["requests"]: + total_first_token_latency += ( + request["response_timestamps"][0] - request["timestamp"] + ) + + avg_first_token_to_token_latency = total_first_token_latency / len( + llm_output["experiments"][0]["requests"] + ) + + return avg_first_token_to_token_latency + + def _calculate_avg_avg_token_to_token_latency(self, llm_output: str) -> float: + total_token_latency = 0 + for request in llm_output["experiments"][0]["requests"]: + total_response_latency = 0 + for response_timestamp in request["response_timestamps"]: + total_response_latency += response_timestamp - request["timestamp"] + + total_token_latency += total_response_latency / len( + request["response_timestamps"] + ) + + avg_avg_token_to_token_latency = total_token_latency / len( + llm_output["experiments"][0]["requests"] + ) + + return avg_avg_token_to_token_latency + def _extract_perf_records_from_row( self, requested_metrics: List[Record], row_metrics: Dict[str, str] ) -> List[Record]: diff --git a/model_analyzer/perf_analyzer/perf_config.py b/model_analyzer/perf_analyzer/perf_config.py index 7cab2dd3c..d81f10044 100755 --- a/model_analyzer/perf_analyzer/perf_config.py +++ b/model_analyzer/perf_analyzer/perf_config.py @@ -82,6 +82,7 @@ class PerfAnalyzerConfig: "url", "protocol", "latency-report-file", + "profile-export-file", "http-header", ] @@ -112,6 +113,7 @@ def __init__(self): "-u": None, "-i": None, "-f": None, + "--profile-export-file": None, "-H": None, } self._verbose = {"-v": None, "-v -v": None, "--verbose-csv": None} @@ -123,6 +125,7 @@ def __init__(self): "url": "-u", "protocol": "-i", "latency-report-file": "-f", + "profile-export-file": "--profile-export-file", 
"http-header": "-H", } @@ -193,6 +196,9 @@ def update_config_from_profile_config(self, model_name, profile_config): "verbose-csv": "--verbose-csv", } + if profile_config.is_llm_mode(): + params.update({"profile-export-file": model_name + "-llm-results.csv"}) + if profile_config.triton_launch_mode == "c_api": params.update( { @@ -307,7 +313,7 @@ def remove_url_from_cli_string(cls, cli_string): @classmethod def remove_mrc_from_cli_string(cls, cli_string): """ - utility function strips the measruement request count + utility function strips the measurement request count from a cli string representation Parameters diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index caa9763ce..33db04f30 100755 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -235,6 +235,7 @@ def convert_avg_gpu_metrics_to_data(avg_gpu_metric_values): def construct_perf_analyzer_config( model_name="my-model", output_file_name="my-model-results.csv", + output_llm_file_name="my-model-llm-results.csv", batch_size=DEFAULT_BATCH_SIZES, concurrency=1, request_rate=None, @@ -242,7 +243,7 @@ def construct_perf_analyzer_config( launch_mode=DEFAULT_TRITON_LAUNCH_MODE, client_protocol=DEFAULT_CLIENT_PROTOCOL, perf_analyzer_flags=None, - llm_search_mode=False, + is_llm_model=False, ): """ Constructs a Perf Analyzer Config @@ -253,6 +254,8 @@ def construct_perf_analyzer_config( The name of the model output_file_name: str The name of the output file + output_llm_file_name: str + The name of the LLM output file batch_size: int The batch size for this PA configuration concurrency: int @@ -265,8 +268,8 @@ def construct_perf_analyzer_config( The client protocol for this PA configuration perf_analyzer_flags: dict A dict of any additional PA flags to be set - llm_search_mode: bool - Indicates we should use LLM search parameters + is_llm_model: bool + Set if the model is an LLM Returns ------- @@ -279,6 +282,9 @@ def construct_perf_analyzer_config( pa_config._options["-f"] = output_file_name pa_config._options["-b"] = batch_size + if is_llm_model: + pa_config._options["--profile-export-file"] = output_llm_file_name + if request_rate: pa_config._args["request-rate-range"] = request_rate elif llm_search_mode: diff --git a/tests/test_perf_analyzer.py b/tests/test_perf_analyzer.py index e95f0d4a1..a80d55239 100755 --- a/tests/test_perf_analyzer.py +++ b/tests/test_perf_analyzer.py @@ -49,6 +49,7 @@ from model_analyzer.triton.client.client_factory import TritonClientFactory from model_analyzer.triton.server.server_config import TritonServerConfig from model_analyzer.triton.server.server_factory import TritonServerFactory +from tests.common.test_utils import construct_perf_analyzer_config from .common import test_result_collector as trc from .mocks.mock_client import MockTritonClientMethods @@ -67,7 +68,56 @@ TEST_GRPC_URL = "test_hostname:test_port" -class TestPerfAnalyzerMethods(trc.TestResultCollector): +def mock_open_method(*args, **kwargs): + pa_csv_mock = """Concurrency,Inferences/Second,Client Send,Network+Server Send/Recv,Server Queue,Server Compute Input,Server Compute Infer,Server Compute Output,""" + pa_csv_mock += """Client Recv,p50 latency,p90 latency,p95 latency,p99 latency,Avg latency,request/response,response wait,""" + pa_csv_mock += """Avg GPU Utilization,Avg GPU Power Usage,Max GPU Memory Usage,Total GPU Memory\n""" + pa_csv_mock += """1,46.8,2,187,18,34,65,16,1,4600,4700,4800,4900,5000,3,314,""" + pa_csv_mock += 
"""GPU-aaf4fea0:0.809;GPU-aaf4fea1:0.901;GPU-aaf4fea2:0.745;,GPU-aaf4fea0:91.2;GPU-aaf4fea1:100;,GPU-aaf4fea0:1000000000;GPU-aaf4fea1:2000000000,GPU-aaf4fea0:1500000000;GPU-aaf4fea2:3000000000""" + + # yapf: disable + pa_json_mock = """ + { + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 4 + }, + "requests": [ + { + "timestamp": 1, + "sequence_id": 1, + "response_timestamps": [2,3,4] + }, + { + "timestamp": 4, + "sequence_id": 2, + "response_timestamps": [5,6] + }, + { + "timestamp": 6, + "sequence_id": 3, + "response_timestamps": [7,8,9] + } + ], + "window_boundaries": [1,5,6] + } + ], + "version": "1.2.3" + } + """ + # yapf: enable + + if args[0] == "my-model-results.csv": + return mock_open(read_data=pa_csv_mock)(*args, **kwargs) + elif args[0] == "my-model-llm-results.csv": + return mock_open(read_data=pa_json_mock)(*args, **kwargs) + else: + return mock_open(read_data=None)(*args, **kwargs) + + +class TestPerfAnalyzer(trc.TestResultCollector): def setUp(self): # Mocks self.server_local_mock = MockServerLocalMethods() @@ -80,7 +130,7 @@ def setUp(self): self.client_mock.start() # PerfAnalyzer config for all tests - self.config = PerfAnalyzerConfig() + self.config = construct_perf_analyzer_config() self.config["model-name"] = TEST_MODEL_NAME self.config["measurement-interval"] = 1000 self.config["measurement-request-count"] = 50 @@ -90,6 +140,16 @@ def setUp(self): ModelRunConfig("fake_name", MagicMock(), self.config) ) + self.llm_config = construct_perf_analyzer_config(is_llm_model=True) + self.llm_config["model-name"] = TEST_MODEL_NAME + self.llm_config["measurement-interval"] = 1000 + self.llm_config["measurement-request-count"] = 50 + + self.llm_run_config = RunConfig({}) + self.llm_run_config.add_model_run_config( + ModelRunConfig("fake_name", MagicMock(), self.llm_config) + ) + self.gpus = [GPUDevice("TEST_DEVICE_NAME", 0, "TEST_PCI_BUS_ID", "TEST_UUID")] # Triton Server @@ -259,7 +319,7 @@ def test_run(self): perf_analyzer = PerfAnalyzer( path=PERF_BIN_PATH, - config=self.run_config, + config=self.llm_run_config, max_retries=10, timeout=100, max_cpu_util=50, @@ -268,18 +328,12 @@ def test_run(self): self.server.start() self.client.wait_for_server_ready(num_retries=1) - pa_csv_mock = """Concurrency,Inferences/Second,Client Send,Network+Server Send/Recv,Server Queue,Server Compute Input,Server Compute Infer,Server Compute Output,""" - pa_csv_mock += """Client Recv,p50 latency,p90 latency,p95 latency,p99 latency,Avg latency,request/response,response wait,""" - pa_csv_mock += """Avg GPU Utilization,Avg GPU Power Usage,Max GPU Memory Usage,Total GPU Memory\n""" - pa_csv_mock += """1,46.8,2,187,18,34,65,16,1,4600,4700,4800,4900,5000,3,314,""" - pa_csv_mock += """GPU-aaf4fea0:0.809;GPU-aaf4fea1:0.901;GPU-aaf4fea2:0.745;,GPU-aaf4fea0:91.2;GPU-aaf4fea1:100;,GPU-aaf4fea0:1000000000;GPU-aaf4fea1:2000000000,GPU-aaf4fea0:1500000000;GPU-aaf4fea2:3000000000""" - # Test avg latency parsing. GPU metric is ignored for get_perf_records() perf_metrics = [PerfLatencyAvg, GPUUtilization] with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) From a4e338cf0be52c884393501903b34bbcb2bb4c2d Mon Sep 17 00:00:00 2001 From: braf Date: Mon, 9 Oct 2023 14:49:05 +0000 Subject: [PATCH 02/13] New measurement fields created. 
--- .../record/types/avg_token_latency.py | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100755 model_analyzer/record/types/avg_token_latency.py diff --git a/model_analyzer/record/types/avg_token_latency.py b/model_analyzer/record/types/avg_token_latency.py new file mode 100755 index 000000000..93937cafd --- /dev/null +++ b/model_analyzer/record/types/avg_token_latency.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.record import DecreasingRecord + + +@total_ordering +class AvgTokenLatency(DecreasingRecord): + """ + A record for perf_analyzer avg token-to-token latency metric + """ + + tag = "avg_token_latency" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed avg time for token-to-token latency + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "avg token-to-token latency (ms)" + + def __eq__(self, other): + """ + Allows checking for + equality between two records + """ + + return self.value() == other.value() + + def __lt__(self, other): + """ + Allows checking if + this record is less than + the other + """ + + return self.value() > other.value() + + def __add__(self, other): + """ + Allows adding two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() + other.value())) + + def __sub__(self, other): + """ + Allows subbing two records together + to produce a brand new record. 
+ + ** Note this does reverse subtraction because + of the inverted nature of latency (lower is better) + """ + + return self.__class__(value=(other.value() - self.value())) From ba3be0c0eb9a783a84e806f09537bcb24968ee70 Mon Sep 17 00:00:00 2001 From: braf Date: Tue, 10 Oct 2023 16:20:47 +0000 Subject: [PATCH 03/13] Fixing PA unit tests --- model_analyzer/perf_analyzer/perf_analyzer.py | 17 ++++- model_analyzer/perf_analyzer/perf_config.py | 2 +- tests/test_perf_analyzer.py | 73 +++++++++++++------ 3 files changed, 63 insertions(+), 29 deletions(-) diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index 7370d1955..4f4311ace 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -501,21 +501,30 @@ def _extract_gpu_records(self, perf_config, metrics): self._gpu_records = self._extract_gpu_records_from_row(metrics, row) def _extract_llm_records(self, perf_config, metrics): + if not perf_config["profile-export-file"]: + return + + self._llm_records[perf_config["model-name"]] = [] + with open(perf_config["profile-export-file"], mode="r") as f: llm_output = json.load(f) avg_first_token_to_token_latency = ( self._calculate_avg_first_token_to_token_latency(llm_output) ) - record = llm_metric[PerfAnalyzer.RECORD_CLASS]( + record = PerfAnalyzer.llm_metric_table[0][PerfAnalyzer.RECORD_CLASS]( value=avg_first_token_to_token_latency ) # type: ignore self._llm_records[perf_config["model-name"]].append(record) - # avg_avg_token_to_token_latency = ( - # self._calculate_avg_avg_token_to_token_latency(llm_output) - # ) + avg_avg_token_to_token_latency = ( + self._calculate_avg_avg_token_to_token_latency(llm_output) + ) + record = PerfAnalyzer.llm_metric_table[0][PerfAnalyzer.RECORD_CLASS]( + value=avg_first_token_to_token_latency + ) # type: ignore + self._llm_records[perf_config["model-name"]].append(record) def _calculate_avg_first_token_to_token_latency(self, llm_output: str) -> float: total_first_token_latency = 0 diff --git a/model_analyzer/perf_analyzer/perf_config.py b/model_analyzer/perf_analyzer/perf_config.py index d81f10044..43427e1a9 100755 --- a/model_analyzer/perf_analyzer/perf_config.py +++ b/model_analyzer/perf_analyzer/perf_config.py @@ -196,7 +196,7 @@ def update_config_from_profile_config(self, model_name, profile_config): "verbose-csv": "--verbose-csv", } - if profile_config.is_llm_mode(): + if profile_config.is_llm_model(): params.update({"profile-export-file": model_name + "-llm-results.csv"}) if profile_config.triton_launch_mode == "c_api": diff --git a/tests/test_perf_analyzer.py b/tests/test_perf_analyzer.py index a80d55239..18fb52503 100755 --- a/tests/test_perf_analyzer.py +++ b/tests/test_perf_analyzer.py @@ -192,16 +192,17 @@ def test_perf_analyzer_config(self): def test_perf_analyzer_boolean_args(self): """Test that only positive boolean args get added""" - expected_cli_str = "-m test_model --measurement-interval=1000 --binary-search --measurement-request-count=50" + expected_cli_str = "-m test_model -b 1 -u localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 --concurrency-range=1 --binary-search --measurement-mode=count_windows --measurement-request-count=50 --collect-metrics --metrics-url=http://localhost:8002/metrics --metrics-interval=1000.0" self.config["async"] = "False" self.config["binary-search"] = "True" + foo = self.config.to_cli_string() self.assertEqual(self.config.to_cli_string(), expected_cli_str) def 
test_perf_analyzer_additive_args(self): shape = ["name1:1,2,3", "name2:4,5,6"] - expected_cli_str = "-m test_model --measurement-interval=1000 --shape=name1:1,2,3 --shape=name2:4,5,6 --measurement-request-count=50" + expected_cli_str = "-m test_model -b 1 -u localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 --concurrency-range=1 --shape=name1:1,2,3 --shape=name2:4,5,6 --measurement-mode=count_windows --measurement-request-count=50 --collect-metrics --metrics-url=http://localhost:8002/metrics --metrics-interval=1000.0" self.config["shape"] = shape[:] @@ -209,7 +210,7 @@ def test_perf_analyzer_additive_args(self): self.assertEqual(self.config.to_cli_string(), expected_cli_str) shape = "name1:1,2,3" - expected_cli_str = "-m test_model --measurement-interval=1000 --shape=name1:1,2,3 --measurement-request-count=50" + expected_cli_str = "-m test_model -b 1 -u localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 --concurrency-range=1 --shape=name1:1,2,3 --measurement-mode=count_windows --measurement-request-count=50 --collect-metrics --metrics-url=http://localhost:8002/metrics --metrics-interval=1000.0" self.config["shape"] = shape self.assertEqual(self.config.to_cli_string(), expected_cli_str) @@ -237,10 +238,13 @@ def test_perf_analyzer_ssl_args(self): ssl_https_private_key_file = "h" expected_cli_str = ( - f"-m test_model --measurement-interval=1000 --measurement-request-count=50 --ssl-grpc-use-ssl " + f"-m test_model -b 1 -u localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 " + f"--concurrency-range=1 --measurement-mode=count_windows --measurement-request-count=50 --ssl-grpc-use-ssl " f"--ssl-grpc-root-certifications-file=a --ssl-grpc-private-key-file=b --ssl-grpc-certificate-chain-file=c " - f"--ssl-https-verify-peer=1 --ssl-https-verify-host=2 --ssl-https-ca-certificates-file=d --ssl-https-client-certificate-type=e " - f"--ssl-https-client-certificate-file=f --ssl-https-private-key-type=g --ssl-https-private-key-file=h" + f"--ssl-https-verify-peer=1 --ssl-https-verify-host=2 --ssl-https-ca-certificates-file=d " + f"--ssl-https-client-certificate-type=e --ssl-https-client-certificate-file=f --ssl-https-private-key-type=g " + f"--ssl-https-private-key-file=h --collect-metrics --metrics-url=http://localhost:8002/metrics " + f"--metrics-interval=1000.0" ) self.config["ssl-grpc-use-ssl"] = ssl_grpc_use_ssl @@ -301,11 +305,15 @@ def test_perf_analyzer_ssl_args(self): self.config["ssl-grpc-use-ssl"] = ssl_grpc_use_ssl self.assertEqual(self.config["ssl-grpc-use-ssl"], ssl_grpc_use_ssl) expected_cli_str = ( - f"-m test_model --measurement-interval=1000 --measurement-request-count=50 " + f"-m test_model -b 1 -u localhost:8001 -i grpc -f my-model-results.csv --measurement-interval=1000 " + f"--concurrency-range=1 --measurement-mode=count_windows --measurement-request-count=50 " f"--ssl-grpc-root-certifications-file=a --ssl-grpc-private-key-file=b --ssl-grpc-certificate-chain-file=c " - f"--ssl-https-verify-peer=1 --ssl-https-verify-host=2 --ssl-https-ca-certificates-file=d --ssl-https-client-certificate-type=e " - f"--ssl-https-client-certificate-file=f --ssl-https-private-key-type=g --ssl-https-private-key-file=h" + f"--ssl-https-verify-peer=1 --ssl-https-verify-host=2 --ssl-https-ca-certificates-file=d " + f"--ssl-https-client-certificate-type=e --ssl-https-client-certificate-file=f " + f"--ssl-https-private-key-type=g --ssl-https-private-key-file=h --collect-metrics " + f"--metrics-url=http://localhost:8002/metrics 
--metrics-interval=1000.0" ) + self.assertEqual(self.config.to_cli_string(), expected_cli_str) def test_run(self): @@ -319,7 +327,7 @@ def test_run(self): perf_analyzer = PerfAnalyzer( path=PERF_BIN_PATH, - config=self.llm_run_config, + config=self.run_config, max_retries=10, timeout=100, max_cpu_util=50, @@ -346,7 +354,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -359,7 +367,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -372,7 +380,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -385,7 +393,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -398,7 +406,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -411,7 +419,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -424,7 +432,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -437,7 +445,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -450,7 +458,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -463,7 +471,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(gpu_metrics) @@ -481,7 +489,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(gpu_metrics) @@ -497,7 +505,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(gpu_metrics) @@ -516,7 +524,7 @@ 
def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(gpu_metrics) @@ -541,7 +549,7 @@ def test_run(self): with patch( "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), + side_effect=mock_open_method, ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): perf_analyzer.run(perf_metrics) @@ -705,10 +713,27 @@ def test_get_cmd_single_model(self): "perf_analyzer", "-m", "test_model", + "-b", + "1", + "-u", + "localhost:8001", + "-i", + "grpc", + "-f", + "my-model-results.csv", "--measurement-interval", "1000", + "--concurrency-range", + "1", + "--measurement-mode", + "count_windows", "--measurement-request-count", "50", + "--collect-metrics", + "--metrics-url", + "http://localhost:8002/metrics", + "--metrics-interval", + "1000.0", ] self.assertEqual(pa._get_cmd(), expected_cmd) From 527e09e926db22f6125f4f1b63c169052104ff9a Mon Sep 17 00:00:00 2001 From: braf Date: Tue, 10 Oct 2023 16:50:25 +0000 Subject: [PATCH 04/13] Adding hooks in metrics to capture new LLM fields --- model_analyzer/record/metrics_manager.py | 20 ++++++++++++++----- .../record/types/avg_first_token_latency.py | 2 +- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/model_analyzer/record/metrics_manager.py b/model_analyzer/record/metrics_manager.py index 176b632df..c8077dfd1 100755 --- a/model_analyzer/record/metrics_manager.py +++ b/model_analyzer/record/metrics_manager.py @@ -69,6 +69,8 @@ class MetricsManager: "gpu_power_usage", "cpu_available_ram", "cpu_used_ram", + "avg_first_latency", + "avg_token_latency", ] def __init__(self, config, client, server, gpus, result_manager, state_manager): @@ -116,6 +118,7 @@ def __init__(self, config, client, server, gpus, result_manager, state_manager): self._gpu_metrics, self._perf_metrics, self._cpu_metrics, + self._llm_metrics, ) = self._categorize_metrics(self.metrics, self._config.collect_cpu_metrics) self._gpus = gpus self._init_state() @@ -160,21 +163,23 @@ def _categorize_metrics(metric_tags, collect_cpu_metrics=False): Returns ------- - (list,list,list) - tuple of three lists (DCGM, PerfAnalyzer, CPU) metrics + (list,list,list,list) + tuple of four lists (DCGM, PerfAnalyzer, CPU, LLM) metrics """ - gpu_metrics, perf_metrics, cpu_metrics = [], [], [] + gpu_metrics, perf_metrics, cpu_metrics, llm_metrics = [], [], [], [] # Separates metrics and objectives into related lists for metric in MetricsManager.get_metric_types(metric_tags): if metric in PerfAnalyzer.get_gpu_metrics(): gpu_metrics.append(metric) elif metric in PerfAnalyzer.get_perf_metrics(): perf_metrics.append(metric) + elif metric in PerfAnalyzer.get_llm_metrics(): + llm_metrics.append(metric) elif collect_cpu_metrics and (metric in CPUMonitor.cpu_metrics): cpu_metrics.append(metric) - return gpu_metrics, perf_metrics, cpu_metrics + return gpu_metrics, perf_metrics, cpu_metrics, llm_metrics def profile_server(self): """ @@ -556,6 +561,9 @@ def _run_perf_analyzer( ) metrics_to_gather = self._perf_metrics + self._gpu_metrics + if self._config.is_llm_model(): + metrics_to_gather += self._llm_metrics + status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env) self._write_perf_analyzer_output(perf_output_writer, perf_analyzer) @@ -564,7 +572,9 @@ def _run_perf_analyzer( self._handle_unsuccessful_perf_analyzer_run(perf_analyzer) return (None, None) - perf_records = 
perf_analyzer.get_perf_records() + perf_records = ( + perf_analyzer.get_perf_records() + perf_analyzer.get_llm_records() + ) gpu_records = perf_analyzer.get_gpu_records() aggregated_perf_records = self._aggregate_perf_records(perf_records) diff --git a/model_analyzer/record/types/avg_first_token_latency.py b/model_analyzer/record/types/avg_first_token_latency.py index 15badd92a..e012254b1 100755 --- a/model_analyzer/record/types/avg_first_token_latency.py +++ b/model_analyzer/record/types/avg_first_token_latency.py @@ -25,7 +25,7 @@ class AvgFirstTokenLatency(DecreasingRecord): A record for perf_analyzer avg first token to token latency metric """ - tag = "avg_first_token_latency" + tag = "avg_first_latency" def __init__(self, value, timestamp=0): """ From 259b794ae8b259000836a78fd0252d5589b9c40b Mon Sep 17 00:00:00 2001 From: braf Date: Wed, 11 Oct 2023 13:53:19 +0000 Subject: [PATCH 05/13] Fixing codeQL errors --- model_analyzer/perf_analyzer/perf_analyzer.py | 12 ++++++------ tests/test_perf_analyzer.py | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index 4f4311ace..94a2acea1 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -518,11 +518,11 @@ def _extract_llm_records(self, perf_config, metrics): self._llm_records[perf_config["model-name"]].append(record) - avg_avg_token_to_token_latency = ( - self._calculate_avg_avg_token_to_token_latency(llm_output) + avg_token_to_token_latency = self._calculate_avg_token_to_token_latency( + llm_output ) record = PerfAnalyzer.llm_metric_table[0][PerfAnalyzer.RECORD_CLASS]( - value=avg_first_token_to_token_latency + value=avg_token_to_token_latency ) # type: ignore self._llm_records[perf_config["model-name"]].append(record) @@ -539,7 +539,7 @@ def _calculate_avg_first_token_to_token_latency(self, llm_output: str) -> float: return avg_first_token_to_token_latency - def _calculate_avg_avg_token_to_token_latency(self, llm_output: str) -> float: + def _calculate_avg_token_to_token_latency(self, llm_output: str) -> float: total_token_latency = 0 for request in llm_output["experiments"][0]["requests"]: total_response_latency = 0 @@ -550,11 +550,11 @@ def _calculate_avg_avg_token_to_token_latency(self, llm_output: str) -> float: request["response_timestamps"] ) - avg_avg_token_to_token_latency = total_token_latency / len( + avg_token_to_token_latency = total_token_latency / len( llm_output["experiments"][0]["requests"] ) - return avg_avg_token_to_token_latency + return avg_token_to_token_latency def _extract_perf_records_from_row( self, requested_metrics: List[Record], row_metrics: Dict[str, str] diff --git a/tests/test_perf_analyzer.py b/tests/test_perf_analyzer.py index 18fb52503..d7df3aa7d 100755 --- a/tests/test_perf_analyzer.py +++ b/tests/test_perf_analyzer.py @@ -197,7 +197,6 @@ def test_perf_analyzer_boolean_args(self): self.config["async"] = "False" self.config["binary-search"] = "True" - foo = self.config.to_cli_string() self.assertEqual(self.config.to_cli_string(), expected_cli_str) def test_perf_analyzer_additive_args(self): From 5aeb538d7676c9afed5dec05539457ff25104230 Mon Sep 17 00:00:00 2001 From: braf Date: Wed, 11 Oct 2023 13:57:35 +0000 Subject: [PATCH 06/13] Fixing type checking errors --- model_analyzer/perf_analyzer/perf_analyzer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py 
b/model_analyzer/perf_analyzer/perf_analyzer.py index 94a2acea1..11464a00a 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -526,7 +526,7 @@ def _extract_llm_records(self, perf_config, metrics): ) # type: ignore self._llm_records[perf_config["model-name"]].append(record) - def _calculate_avg_first_token_to_token_latency(self, llm_output: str) -> float: + def _calculate_avg_first_token_to_token_latency(self, llm_output: Dict) -> float: total_first_token_latency = 0 for request in llm_output["experiments"][0]["requests"]: total_first_token_latency += ( @@ -539,8 +539,8 @@ def _calculate_avg_first_token_to_token_latency(self, llm_output: str) -> float: return avg_first_token_to_token_latency - def _calculate_avg_token_to_token_latency(self, llm_output: str) -> float: - total_token_latency = 0 + def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float: + total_token_latency = 0.0 for request in llm_output["experiments"][0]["requests"]: total_response_latency = 0 for response_timestamp in request["response_timestamps"]: From cedc7a1e23bccbb4974903d27934f66bdd100364 Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 12 Oct 2023 16:17:46 +0000 Subject: [PATCH 07/13] Changes needed post-merge from other branches --- .../config/input/config_command_profile.py | 4 - model_analyzer/record/metrics_manager.py | 4 +- .../record/types/avg_first_token_latency.py | 4 +- .../record/types/avg_token_latency.py | 96 ------------------- .../types/avg_token_to_token_latency.py | 2 +- tests/common/test_utils.py | 4 +- tests/test_perf_analyzer_config_generator.py | 4 +- 7 files changed, 9 insertions(+), 109 deletions(-) delete mode 100755 model_analyzer/record/types/avg_token_latency.py diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py index 9c90bc75b..a215a2251 100755 --- a/model_analyzer/config/input/config_command_profile.py +++ b/model_analyzer/config/input/config_command_profile.py @@ -66,10 +66,6 @@ def __init__(self): super().__init__() self._fill_config() - # FIXME: placeholder until branch is merged - def is_llm_model(self): - return False - def _resolve_protobuf_field(self, field: FieldDescriptor) -> ConfigSweep: """ Recursively resolve protobuf fields. 
diff --git a/model_analyzer/record/metrics_manager.py b/model_analyzer/record/metrics_manager.py index c8077dfd1..fe77f6eb8 100755 --- a/model_analyzer/record/metrics_manager.py +++ b/model_analyzer/record/metrics_manager.py @@ -69,8 +69,8 @@ class MetricsManager: "gpu_power_usage", "cpu_available_ram", "cpu_used_ram", - "avg_first_latency", - "avg_token_latency", + "avg_first_token_latency", + "avg_token_to_token_latency", ] def __init__(self, config, client, server, gpus, result_manager, state_manager): diff --git a/model_analyzer/record/types/avg_first_token_latency.py b/model_analyzer/record/types/avg_first_token_latency.py index e012254b1..72d539633 100755 --- a/model_analyzer/record/types/avg_first_token_latency.py +++ b/model_analyzer/record/types/avg_first_token_latency.py @@ -22,10 +22,10 @@ @total_ordering class AvgFirstTokenLatency(DecreasingRecord): """ - A record for perf_analyzer avg first token to token latency metric + A record for perf_analyzer average first token latency metric """ - tag = "avg_first_latency" + tag = "avg_first_token_latency" def __init__(self, value, timestamp=0): """ diff --git a/model_analyzer/record/types/avg_token_latency.py b/model_analyzer/record/types/avg_token_latency.py deleted file mode 100755 index 93937cafd..000000000 --- a/model_analyzer/record/types/avg_token_latency.py +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from functools import total_ordering - -from model_analyzer.record.record import DecreasingRecord - - -@total_ordering -class AvgTokenLatency(DecreasingRecord): - """ - A record for perf_analyzer avg token-to-token latency metric - """ - - tag = "avg_token_latency" - - def __init__(self, value, timestamp=0): - """ - Parameters - ---------- - value : float - the latency extracted from the perf analyzer output - timestamp : float - Elapsed avg time for token-to-token latency - """ - - super().__init__(value, timestamp) - - @classmethod - def header(cls, aggregation_tag=False): - """ - Parameters - ---------- - aggregation_tag: bool - An optional tag that may be displayed - as part of the header indicating that - this record has been aggregated using - max, min or average etc. - - Returns - ------- - str - The full name of the - metric. - """ - - return "avg token-to-token latency (ms)" - - def __eq__(self, other): - """ - Allows checking for - equality between two records - """ - - return self.value() == other.value() - - def __lt__(self, other): - """ - Allows checking if - this record is less than - the other - """ - - return self.value() > other.value() - - def __add__(self, other): - """ - Allows adding two records together - to produce a brand new record. - """ - - return self.__class__(value=(self.value() + other.value())) - - def __sub__(self, other): - """ - Allows subbing two records together - to produce a brand new record. 
- - ** Note this does reverse subtraction because - of the inverted nature of latency (lower is better) - """ - - return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/avg_token_to_token_latency.py b/model_analyzer/record/types/avg_token_to_token_latency.py index 2941da39b..66c93b6fc 100755 --- a/model_analyzer/record/types/avg_token_to_token_latency.py +++ b/model_analyzer/record/types/avg_token_to_token_latency.py @@ -22,7 +22,7 @@ @total_ordering class AvgTokenToTokenLatency(DecreasingRecord): """ - A record for perf_analyzer avg token-to-token latency metric + A record for perf_analyzer average token-to-token latency metric """ tag = "avg_token_to_token_latency" diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index 33db04f30..9e48d0e9d 100755 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -287,12 +287,12 @@ def construct_perf_analyzer_config( if request_rate: pa_config._args["request-rate-range"] = request_rate - elif llm_search_mode: + elif is_llm_model: pa_config._args["periodic-concurrency-range"] = concurrency else: pa_config._args["concurrency-range"] = concurrency - if llm_search_mode: + if is_llm_model: pa_config._args["request-parameter"] = ( "max_token:" + str(max_token_count) + ":int" ) diff --git a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py index 69e42ef8d..4b99aa87c 100755 --- a/tests/test_perf_analyzer_config_generator.py +++ b/tests/test_perf_analyzer_config_generator.py @@ -578,7 +578,7 @@ def test_llm_search_max_token_count(self): max_token_counts = utils.generate_doubled_list(1, 256) expected_configs = [ - construct_perf_analyzer_config(max_token_count=mtc, llm_search_mode=True) + construct_perf_analyzer_config(max_token_count=mtc, is_llm_model=True) for mtc in max_token_counts ] @@ -612,7 +612,7 @@ def test_llm_search_text_input_length(self): text_input_lengths = utils.generate_doubled_list(1, 1024) expected_configs = [ - construct_perf_analyzer_config(llm_search_mode=True) + construct_perf_analyzer_config(is_llm_model=True) for pl in text_input_lengths ] From deba132a83bef4f0e91f25b0c20a32b3471d88fa Mon Sep 17 00:00:00 2001 From: braf Date: Thu, 12 Oct 2023 16:28:01 +0000 Subject: [PATCH 08/13] Revert naming mistake (due to merge). 
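
Editorial note (illustration only, not part of the patch): llm_search_mode is the test-helper flag restored here, distinct from the profile config's is_llm_model() used in perf_config.py. Per the construct_perf_analyzer_config() code in tests/common/test_utils.py, enabling it toggles the LLM-specific PA arguments; the private _options/_args access below mirrors what the helper itself does, and argument defaults are whatever that helper defines.

    from tests.common.test_utils import construct_perf_analyzer_config

    pa_config = construct_perf_analyzer_config(llm_search_mode=True)

    # With llm_search_mode enabled the helper:
    #   * sets --profile-export-file (the JSON that _extract_llm_records() parses),
    #   * uses periodic-concurrency-range instead of concurrency-range,
    #   * adds a request-parameter of the form "max_token:<count>:int".
    assert pa_config._options["--profile-export-file"] is not None
    assert "periodic-concurrency-range" in pa_config._args
    assert "request-parameter" in pa_config._args
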
--- tests/common/test_utils.py | 12 ++++++------ tests/test_perf_analyzer.py | 2 +- tests/test_perf_analyzer_config_generator.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index 9e48d0e9d..e5a336400 100755 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -243,7 +243,7 @@ def construct_perf_analyzer_config( launch_mode=DEFAULT_TRITON_LAUNCH_MODE, client_protocol=DEFAULT_CLIENT_PROTOCOL, perf_analyzer_flags=None, - is_llm_model=False, + llm_search_mode=False, ): """ Constructs a Perf Analyzer Config @@ -268,8 +268,8 @@ def construct_perf_analyzer_config( The client protocol for this PA configuration perf_analyzer_flags: dict A dict of any additional PA flags to be set - is_llm_model: bool - Set if the model is an LLM + llm_search_mode: bool + Indicates we should use LLM search parameters Returns ------- @@ -282,17 +282,17 @@ def construct_perf_analyzer_config( pa_config._options["-f"] = output_file_name pa_config._options["-b"] = batch_size - if is_llm_model: + if llm_search_mode: pa_config._options["--profile-export-file"] = output_llm_file_name if request_rate: pa_config._args["request-rate-range"] = request_rate - elif is_llm_model: + elif llm_search_mode: pa_config._args["periodic-concurrency-range"] = concurrency else: pa_config._args["concurrency-range"] = concurrency - if is_llm_model: + if llm_search_mode: pa_config._args["request-parameter"] = ( "max_token:" + str(max_token_count) + ":int" ) diff --git a/tests/test_perf_analyzer.py b/tests/test_perf_analyzer.py index d7df3aa7d..a984279bd 100755 --- a/tests/test_perf_analyzer.py +++ b/tests/test_perf_analyzer.py @@ -140,7 +140,7 @@ def setUp(self): ModelRunConfig("fake_name", MagicMock(), self.config) ) - self.llm_config = construct_perf_analyzer_config(is_llm_model=True) + self.llm_config = construct_perf_analyzer_config(llm_search_mode=True) self.llm_config["model-name"] = TEST_MODEL_NAME self.llm_config["measurement-interval"] = 1000 self.llm_config["measurement-request-count"] = 50 diff --git a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py index 4b99aa87c..69e42ef8d 100755 --- a/tests/test_perf_analyzer_config_generator.py +++ b/tests/test_perf_analyzer_config_generator.py @@ -578,7 +578,7 @@ def test_llm_search_max_token_count(self): max_token_counts = utils.generate_doubled_list(1, 256) expected_configs = [ - construct_perf_analyzer_config(max_token_count=mtc, is_llm_model=True) + construct_perf_analyzer_config(max_token_count=mtc, llm_search_mode=True) for mtc in max_token_counts ] @@ -612,7 +612,7 @@ def test_llm_search_text_input_length(self): text_input_lengths = utils.generate_doubled_list(1, 1024) expected_configs = [ - construct_perf_analyzer_config(is_llm_model=True) + construct_perf_analyzer_config(llm_search_mode=True) for pl in text_input_lengths ] From 53730da8a747d3f23569de90b025fa896b6ec534 Mon Sep 17 00:00:00 2001 From: braf Date: Fri, 13 Oct 2023 19:23:00 +0000 Subject: [PATCH 09/13] Changes uncovered during live testing --- model_analyzer/perf_analyzer/perf_analyzer.py | 9 +++++++-- model_analyzer/perf_analyzer/perf_config.py | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index 11464a00a..5d68eb916 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -486,11 +486,16 @@ def 
_parse_outputs(self, metrics): for perf_config in [ mrc.perf_config() for mrc in self._config.model_run_configs() ]: - # Remove the latency file and all associated composing model latency files + # Remove the latency/profile export files and all associated composing model latency files for f in glob.glob(f"*{perf_config['latency-report-file']}"): os.remove(f) + for f in glob.glob(f"*{perf_config['profile-export-file']}"): + os.remove(f) def _extract_gpu_records(self, perf_config, metrics): + if perf_config["profile-export-file"]: + return + with open(perf_config["latency-report-file"], mode="r") as f: csv_reader = csv.DictReader(f, delimiter=",") @@ -521,7 +526,7 @@ def _extract_llm_records(self, perf_config, metrics): avg_token_to_token_latency = self._calculate_avg_token_to_token_latency( llm_output ) - record = PerfAnalyzer.llm_metric_table[0][PerfAnalyzer.RECORD_CLASS]( + record = PerfAnalyzer.llm_metric_table[1][PerfAnalyzer.RECORD_CLASS]( value=avg_token_to_token_latency ) # type: ignore self._llm_records[perf_config["model-name"]].append(record) diff --git a/model_analyzer/perf_analyzer/perf_config.py b/model_analyzer/perf_analyzer/perf_config.py index 43427e1a9..a72cdc3b1 100755 --- a/model_analyzer/perf_analyzer/perf_config.py +++ b/model_analyzer/perf_analyzer/perf_config.py @@ -73,6 +73,7 @@ class PerfAnalyzerConfig: "metrics-interval", "bls-composing-models", "request-parameter", + "request-period", ] input_to_options = [ @@ -197,7 +198,7 @@ def update_config_from_profile_config(self, model_name, profile_config): } if profile_config.is_llm_model(): - params.update({"profile-export-file": model_name + "-llm-results.csv"}) + params.update({"profile-export-file": model_name + "-results.json"}) if profile_config.triton_launch_mode == "c_api": params.update( From e1091169fec9a829643bd1621291f5c8a2019cd9 Mon Sep 17 00:00:00 2001 From: braf Date: Mon, 16 Oct 2023 18:54:52 +0000 Subject: [PATCH 10/13] Fixes based on hwoo review --- model_analyzer/perf_analyzer/perf_analyzer.py | 28 ++++++++++--------- tests/common/test_utils.py | 2 +- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index 5d68eb916..0075aa6b9 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -236,7 +236,7 @@ def get_llm_records(self): if self._llm_records: return self._llm_records raise TritonModelAnalyzerException( - "Attempted to get perf_analyzer results" "without calling run first." + "Attempted to get perf_analyzer results without calling run first." 
) def output(self): @@ -514,11 +514,11 @@ def _extract_llm_records(self, perf_config, metrics): with open(perf_config["profile-export-file"], mode="r") as f: llm_output = json.load(f) - avg_first_token_to_token_latency = ( - self._calculate_avg_first_token_to_token_latency(llm_output) + avg_first_token_latency = self._calculate_avg_first_token_latency( + llm_output ) record = PerfAnalyzer.llm_metric_table[0][PerfAnalyzer.RECORD_CLASS]( - value=avg_first_token_to_token_latency + value=avg_first_token_latency ) # type: ignore self._llm_records[perf_config["model-name"]].append(record) @@ -531,31 +531,33 @@ def _extract_llm_records(self, perf_config, metrics): ) # type: ignore self._llm_records[perf_config["model-name"]].append(record) - def _calculate_avg_first_token_to_token_latency(self, llm_output: Dict) -> float: + def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float: total_first_token_latency = 0 for request in llm_output["experiments"][0]["requests"]: total_first_token_latency += ( request["response_timestamps"][0] - request["timestamp"] ) - avg_first_token_to_token_latency = total_first_token_latency / len( + avg_first_token_latency = total_first_token_latency / len( llm_output["experiments"][0]["requests"] ) - return avg_first_token_to_token_latency + return avg_first_token_latency def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float: - total_token_latency = 0.0 + total_token_to_token_latency = 0.0 for request in llm_output["experiments"][0]["requests"]: - total_response_latency = 0 - for response_timestamp in request["response_timestamps"]: - total_response_latency += response_timestamp - request["timestamp"] + total_response_to_response_latency = 0 + prev_response = request["response_timestamps"][0] + for response in request["response_timestamps"][1:]: + total_response_to_reponse_latency = response - prev_response + prev_response = response - total_token_latency += total_response_latency / len( + total_token_to_token_latency += total_response_to_response_latency / len( request["response_timestamps"] ) - avg_token_to_token_latency = total_token_latency / len( + avg_token_to_token_latency = total_token_to_token_latency / len( llm_output["experiments"][0]["requests"] ) diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index e5a336400..dc65f5665 100755 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -235,7 +235,7 @@ def convert_avg_gpu_metrics_to_data(avg_gpu_metric_values): def construct_perf_analyzer_config( model_name="my-model", output_file_name="my-model-results.csv", - output_llm_file_name="my-model-llm-results.csv", + output_llm_file_name="my-model-results.json", batch_size=DEFAULT_BATCH_SIZES, concurrency=1, request_rate=None, From 53f1c0f9f71d4993e803c5a56e2ffbc135e4ca64 Mon Sep 17 00:00:00 2001 From: braf Date: Mon, 16 Oct 2023 18:56:37 +0000 Subject: [PATCH 11/13] Fixing typo --- model_analyzer/perf_analyzer/perf_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index 0075aa6b9..83e95f763 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -550,7 +550,7 @@ def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float: total_response_to_response_latency = 0 prev_response = request["response_timestamps"][0] for response in request["response_timestamps"][1:]: - total_response_to_reponse_latency = response 
- prev_response + total_response_to_response_latency = response - prev_response prev_response = response total_token_to_token_latency += total_response_to_response_latency / len( From 7d7271f8d2c6a6cb174ccf78e35a04e9b0f8faad Mon Sep 17 00:00:00 2001 From: braf Date: Mon, 16 Oct 2023 19:35:28 +0000 Subject: [PATCH 12/13] Change to use lists and mean() --- model_analyzer/perf_analyzer/perf_analyzer.py | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index 83e95f763..e508934d8 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -22,6 +22,7 @@ import re import signal import tempfile +from statistics import mean from subprocess import STDOUT, Popen from typing import Dict, List @@ -532,34 +533,28 @@ def _extract_llm_records(self, perf_config, metrics): self._llm_records[perf_config["model-name"]].append(record) def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float: - total_first_token_latency = 0 + total_first_token_latencies = [] for request in llm_output["experiments"][0]["requests"]: - total_first_token_latency += ( + total_first_token_latencies.append( request["response_timestamps"][0] - request["timestamp"] ) - avg_first_token_latency = total_first_token_latency / len( - llm_output["experiments"][0]["requests"] - ) + avg_first_token_latency = mean(total_first_token_latencies) return avg_first_token_latency def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float: - total_token_to_token_latency = 0.0 + token_to_token_latencies = [] for request in llm_output["experiments"][0]["requests"]: - total_response_to_response_latency = 0 + response_to_response_latencies = [] prev_response = request["response_timestamps"][0] for response in request["response_timestamps"][1:]: - total_response_to_response_latency = response - prev_response + response_to_response_latencies.append(response - prev_response) prev_response = response - total_token_to_token_latency += total_response_to_response_latency / len( - request["response_timestamps"] - ) + token_to_token_latencies.append(mean(response_to_response_latencies)) - avg_token_to_token_latency = total_token_to_token_latency / len( - llm_output["experiments"][0]["requests"] - ) + avg_token_to_token_latency = mean(token_to_token_latencies) return avg_token_to_token_latency From e39c3d3f3e7d3b58624e94bb9ca2e3194063d966 Mon Sep 17 00:00:00 2001 From: braf Date: Tue, 17 Oct 2023 17:48:17 +0000 Subject: [PATCH 13/13] Changes based on hwoo review --- model_analyzer/perf_analyzer/perf_analyzer.py | 2 +- tests/common/test_utils.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index e508934d8..49f15f5a2 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -22,11 +22,11 @@ import re import signal import tempfile -from statistics import mean from subprocess import STDOUT, Popen from typing import Dict, List import psutil +from numpy import mean from model_analyzer.constants import ( INTERVAL_SLEEP_TIME, diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index dc65f5665..0e3d0c8e7 100755 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -235,7 +235,7 @@ def convert_avg_gpu_metrics_to_data(avg_gpu_metric_values): def 
construct_perf_analyzer_config( model_name="my-model", output_file_name="my-model-results.csv", - output_llm_file_name="my-model-results.json", + export_file_name="my-model-results.json", batch_size=DEFAULT_BATCH_SIZES, concurrency=1, request_rate=None, @@ -254,8 +254,8 @@ def construct_perf_analyzer_config( The name of the model output_file_name: str The name of the output file - output_llm_file_name: str - The name of the LLM output file + export_file_name: str + The name of the export file batch_size: int The batch size for this PA configuration concurrency: int @@ -283,7 +283,7 @@ def construct_perf_analyzer_config( pa_config._options["-b"] = batch_size if llm_search_mode: - pa_config._options["--profile-export-file"] = output_llm_file_name + pa_config._options["--profile-export-file"] = export_file_name if request_rate: pa_config._args["request-rate-range"] = request_rate
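
Editorial note: to make the final arithmetic concrete, the standalone sketch below (not part of the patch) applies the per-request math from _calculate_avg_first_token_latency() and _calculate_avg_token_to_token_latency(), as of PATCH 12/13, to the mocked profile-export JSON in tests/test_perf_analyzer.py. statistics.mean stands in for the numpy mean the final code imports; the result is the same for this data.

    from statistics import mean

    # The three requests from the pa_json_mock fixture in tests/test_perf_analyzer.py.
    requests = [
        {"timestamp": 1, "response_timestamps": [2, 3, 4]},
        {"timestamp": 4, "response_timestamps": [5, 6]},
        {"timestamp": 6, "response_timestamps": [7, 8, 9]},
    ]

    # Average first-token latency: time from each request's send timestamp to its
    # first response, averaged across requests -> mean([1, 1, 1]) == 1.
    avg_first_token = mean(
        r["response_timestamps"][0] - r["timestamp"] for r in requests
    )

    # Average token-to-token latency: gaps between consecutive responses, averaged
    # per request and then across requests -> mean([1, 1, 1]) == 1. (Each mocked
    # request has at least two responses, so the inner mean is well defined.)
    def gaps(timestamps):
        return [b - a for a, b in zip(timestamps, timestamps[1:])]

    avg_token_to_token = mean(mean(gaps(r["response_timestamps"])) for r in requests)

    assert avg_first_token == 1 and avg_token_to_token == 1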