From cb0d6323fb965534d265e0597996db6f2db38748 Mon Sep 17 00:00:00 2001
From: braf
Date: Mon, 9 Oct 2023 14:49:05 +0000
Subject: [PATCH] New measurement fields created.

---
 model_analyzer/perf_analyzer/perf_analyzer.py |  7 ++
 .../record/types/avg_first_token_latency.py   | 96 +++++++++++++++++++
 .../record/types/avg_token_latency.py         | 96 +++++++++++++++++++
 tests/test_record_types.py                    |  2 +
 4 files changed, 201 insertions(+)
 create mode 100755 model_analyzer/record/types/avg_first_token_latency.py
 create mode 100755 model_analyzer/record/types/avg_token_latency.py

diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py
index c88f8e655..eefc57abc 100755
--- a/model_analyzer/perf_analyzer/perf_analyzer.py
+++ b/model_analyzer/perf_analyzer/perf_analyzer.py
@@ -36,6 +36,8 @@
 )
 from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
 from model_analyzer.record.record import Record
+from model_analyzer.record.types.avg_first_token_latency import AvgFirstTokenLatency
+from model_analyzer.record.types.avg_token_latency import AvgTokenLatency
 from model_analyzer.record.types.gpu_free_memory import GPUFreeMemory
 from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage
 from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory
@@ -91,6 +93,11 @@ class PerfAnalyzer:
         ["gpu_used_memory", "Max GPU Memory Usage", GPUUsedMemory, "1000000"],
         ["gpu_free_memory", "Total GPU Memory", GPUFreeMemory, "1000000"]
     ]
+
+    llm_metric_table = [
+        ["avg_first_token_latency", "Avg first token latency", AvgFirstTokenLatency, "1000"],
+        ["avg_token_latency", "Avg token latency", AvgTokenLatency, "1000"]
+    ]
     # yapf: enable

     @staticmethod
diff --git a/model_analyzer/record/types/avg_first_token_latency.py b/model_analyzer/record/types/avg_first_token_latency.py
new file mode 100755
index 000000000..15badd92a
--- /dev/null
+++ b/model_analyzer/record/types/avg_first_token_latency.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class AvgFirstTokenLatency(DecreasingRecord):
+    """
+    A record for perf_analyzer avg first token latency metric
+    """
+
+    tag = "avg_first_token_latency"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed avg time for first token latency
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            An optional tag that may be displayed
+            as part of the header indicating that
+            this record has been aggregated using
+            max, min or average etc.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric.
+        """
+
+        return "avg first token latency (ms)"
+
+    def __eq__(self, other):
+        """
+        Allows checking for
+        equality between two records
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Allows checking if
+        this record is less than
+        the other
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Allows adding two records together
+        to produce a brand new record.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Allows subbing two records together
+        to produce a brand new record.
+
+        ** Note this does reverse subtraction because
+        of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/model_analyzer/record/types/avg_token_latency.py b/model_analyzer/record/types/avg_token_latency.py
new file mode 100755
index 000000000..93937cafd
--- /dev/null
+++ b/model_analyzer/record/types/avg_token_latency.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import total_ordering
+
+from model_analyzer.record.record import DecreasingRecord
+
+
+@total_ordering
+class AvgTokenLatency(DecreasingRecord):
+    """
+    A record for perf_analyzer avg token-to-token latency metric
+    """
+
+    tag = "avg_token_latency"
+
+    def __init__(self, value, timestamp=0):
+        """
+        Parameters
+        ----------
+        value : float
+            the latency extracted from the perf analyzer output
+        timestamp : float
+            Elapsed avg time for token-to-token latency
+        """
+
+        super().__init__(value, timestamp)
+
+    @classmethod
+    def header(cls, aggregation_tag=False):
+        """
+        Parameters
+        ----------
+        aggregation_tag: bool
+            An optional tag that may be displayed
+            as part of the header indicating that
+            this record has been aggregated using
+            max, min or average etc.
+
+        Returns
+        -------
+        str
+            The full name of the
+            metric.
+        """
+
+        return "avg token-to-token latency (ms)"
+
+    def __eq__(self, other):
+        """
+        Allows checking for
+        equality between two records
+        """
+
+        return self.value() == other.value()
+
+    def __lt__(self, other):
+        """
+        Allows checking if
+        this record is less than
+        the other
+        """
+
+        return self.value() > other.value()
+
+    def __add__(self, other):
+        """
+        Allows adding two records together
+        to produce a brand new record.
+        """
+
+        return self.__class__(value=(self.value() + other.value()))
+
+    def __sub__(self, other):
+        """
+        Allows subbing two records together
+        to produce a brand new record.
+
+        ** Note this does reverse subtraction because
+        of the inverted nature of latency (lower is better)
+        """
+
+        return self.__class__(value=(other.value() - self.value()))
diff --git a/tests/test_record_types.py b/tests/test_record_types.py
index 4bd6d8b32..3b31e9402 100755
--- a/tests/test_record_types.py
+++ b/tests/test_record_types.py
@@ -59,6 +59,8 @@ def setUp(self):
                 "perf_client_send_recv",
                 "perf_server_compute_input",
                 "gpu_power_usage",
+                "avg_first_token_latency",
+                "avg_token_latency",
             ]
         }
         self.more_is_better_types = {
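
Note (illustrative, not part of the patch): the sketch below shows the comparison and arithmetic semantics the two new DecreasingRecord subclasses inherit, assuming Record.value() returns the stored measurement as it does for the existing record types. Because __lt__ compares with ">" and __sub__ subtracts in reverse, a lower latency ranks as the better record and a latency reduction shows up as a positive delta. The numbers used are made up for illustration.

from model_analyzer.record.types.avg_first_token_latency import AvgFirstTokenLatency

# Two hypothetical measurements in milliseconds.
fast = AvgFirstTokenLatency(12.5)
slow = AvgFirstTokenLatency(20.0)

# Lower latency compares as "greater" because __lt__ is inverted for DecreasingRecord.
assert fast > slow

# __add__ sums the underlying values.
assert (fast + slow).value() == 32.5

# __sub__ is reversed: (fast - slow) computes slow.value() - fast.value(),
# so the improvement appears as a positive 7.5 ms delta.
assert (fast - slow).value() == 7.5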