diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py
index 9c90bc75b..a215a2251 100755
--- a/model_analyzer/config/input/config_command_profile.py
+++ b/model_analyzer/config/input/config_command_profile.py
@@ -66,10 +66,6 @@ def __init__(self):
         super().__init__()
         self._fill_config()
 
-    # FIXME: placeholder until branch is merged
-    def is_llm_model(self):
-        return False
-
     def _resolve_protobuf_field(self, field: FieldDescriptor) -> ConfigSweep:
         """
         Recursively resolve protobuf fields.
diff --git a/model_analyzer/record/metrics_manager.py b/model_analyzer/record/metrics_manager.py
index c8077dfd1..fe77f6eb8 100755
--- a/model_analyzer/record/metrics_manager.py
+++ b/model_analyzer/record/metrics_manager.py
@@ -69,8 +69,8 @@ class MetricsManager:
         "gpu_power_usage",
         "cpu_available_ram",
         "cpu_used_ram",
-        "avg_first_latency",
-        "avg_token_latency",
+        "avg_first_token_latency",
+        "avg_token_to_token_latency",
     ]
 
     def __init__(self, config, client, server, gpus, result_manager, state_manager):
diff --git a/model_analyzer/record/types/avg_first_token_latency.py b/model_analyzer/record/types/avg_first_token_latency.py
index e012254b1..72d539633 100755
--- a/model_analyzer/record/types/avg_first_token_latency.py
+++ b/model_analyzer/record/types/avg_first_token_latency.py
@@ -22,10 +22,10 @@
 @total_ordering
 class AvgFirstTokenLatency(DecreasingRecord):
     """
-    A record for perf_analyzer avg first token to token latency metric
+    A record for perf_analyzer average first token latency metric
     """
 
-    tag = "avg_first_latency"
+    tag = "avg_first_token_latency"
 
     def __init__(self, value, timestamp=0):
         """
diff --git a/model_analyzer/record/types/avg_token_latency.py b/model_analyzer/record/types/avg_token_latency.py
deleted file mode 100755
index 93937cafd..000000000
--- a/model_analyzer/record/types/avg_token_latency.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from functools import total_ordering
-
-from model_analyzer.record.record import DecreasingRecord
-
-
-@total_ordering
-class AvgTokenLatency(DecreasingRecord):
-    """
-    A record for perf_analyzer avg token-to-token latency metric
-    """
-
-    tag = "avg_token_latency"
-
-    def __init__(self, value, timestamp=0):
-        """
-        Parameters
-        ----------
-        value : float
-            the latency extracted from the perf analyzer output
-        timestamp : float
-            Elapsed avg time for token-to-token latency
-        """
-
-        super().__init__(value, timestamp)
-
-    @classmethod
-    def header(cls, aggregation_tag=False):
-        """
-        Parameters
-        ----------
-        aggregation_tag: bool
-            An optional tag that may be displayed
-            as part of the header indicating that
-            this record has been aggregated using
-            max, min or average etc.
-
-        Returns
-        -------
-        str
-            The full name of the
-            metric.
-        """
-
-        return "avg token-to-token latency (ms)"
-
-    def __eq__(self, other):
-        """
-        Allows checking for
-        equality between two records
-        """
-
-        return self.value() == other.value()
-
-    def __lt__(self, other):
-        """
-        Allows checking if
-        this record is less than
-        the other
-        """
-
-        return self.value() > other.value()
-
-    def __add__(self, other):
-        """
-        Allows adding two records together
-        to produce a brand new record.
-        """
-
-        return self.__class__(value=(self.value() + other.value()))
-
-    def __sub__(self, other):
-        """
-        Allows subbing two records together
-        to produce a brand new record.
-
-        ** Note this does reverse subtraction because
-        of the inverted nature of latency (lower is better)
-        """
-
-        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/avg_token_to_token_latency.py b/model_analyzer/record/types/avg_token_to_token_latency.py
index 2941da39b..66c93b6fc 100755
--- a/model_analyzer/record/types/avg_token_to_token_latency.py
+++ b/model_analyzer/record/types/avg_token_to_token_latency.py
@@ -22,7 +22,7 @@
 @total_ordering
 class AvgTokenToTokenLatency(DecreasingRecord):
     """
-    A record for perf_analyzer avg token-to-token latency metric
+    A record for perf_analyzer average token-to-token latency metric
     """
 
     tag = "avg_token_to_token_latency"
diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py
index 33db04f30..9e48d0e9d 100755
--- a/tests/common/test_utils.py
+++ b/tests/common/test_utils.py
@@ -287,12 +287,12 @@ def construct_perf_analyzer_config(
 
     if request_rate:
         pa_config._args["request-rate-range"] = request_rate
-    elif llm_search_mode:
+    elif is_llm_model:
         pa_config._args["periodic-concurrency-range"] = concurrency
     else:
         pa_config._args["concurrency-range"] = concurrency
 
-    if llm_search_mode:
+    if is_llm_model:
         pa_config._args["request-parameter"] = (
             "max_token:" + str(max_token_count) + ":int"
         )
diff --git a/tests/test_perf_analyzer_config_generator.py b/tests/test_perf_analyzer_config_generator.py
index 69e42ef8d..4b99aa87c 100755
--- a/tests/test_perf_analyzer_config_generator.py
+++ b/tests/test_perf_analyzer_config_generator.py
@@ -578,7 +578,7 @@ def test_llm_search_max_token_count(self):
         max_token_counts = utils.generate_doubled_list(1, 256)
 
         expected_configs = [
-            construct_perf_analyzer_config(max_token_count=mtc, llm_search_mode=True)
+            construct_perf_analyzer_config(max_token_count=mtc, is_llm_model=True)
             for mtc in max_token_counts
         ]
@@ -612,7 +612,7 @@ def test_llm_search_text_input_length(self):
         text_input_lengths = utils.generate_doubled_list(1, 1024)
 
         expected_configs = [
-            construct_perf_analyzer_config(llm_search_mode=True)
+            construct_perf_analyzer_config(is_llm_model=True)
             for pl in text_input_lengths
         ]