Skip to content

Commit

Permalink
New Records for LLM metrics (#839)
Browse files Browse the repository at this point in the history
* Adding new LLM metrics

* Adding base class for perf, inter_token, and time_to_first latency records
  • Loading branch information
nv-braf authored Mar 19, 2024
1 parent 16aadb9 commit 2156c34
Show file tree
Hide file tree
Showing 21 changed files with 1,068 additions and 152 deletions.
60 changes: 60 additions & 0 deletions model_analyzer/record/types/inter_token_latency_avg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python3

# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import total_ordering

from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase


@total_ordering
class InterTokenLatencyAvg(InterTokenLatencyBase):
    """
    Record holding the average inter token latency
    metric reported by perf_analyzer.
    """

    # Identifier string for this record type
    tag = "inter_token_latency_avg"

    def __init__(self, value, timestamp=0):
        """
        Parameters
        ----------
        value : float
            The latency taken from the perf analyzer output
        timestamp : float
            Time elapsed since the start of the program
        """

        # All storage and comparison logic lives in the base class
        super().__init__(value, timestamp=timestamp)

    @classmethod
    def header(cls, aggregation_tag=False):
        """
        Parameters
        ----------
        aggregation_tag: bool
            Optional flag indicating the record was
            aggregated (max, min, average etc.).
            It is accepted but not used by this
            implementation.

        Returns
        -------
        str
            The full, human readable name
            of the metric.
        """

        metric_name = "Avg Inter Token Latency (ms)"
        return metric_name
74 changes: 74 additions & 0 deletions model_analyzer/record/types/inter_token_latency_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env python3

# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import total_ordering

from model_analyzer.record.record import DecreasingRecord


@total_ordering
class InterTokenLatencyBase(DecreasingRecord):
    """
    Common base for the perf_analyzer inter token latency
    records. It supplies the comparison and arithmetic
    operators; lower latency is treated as better.
    """

    def __init__(self, value, timestamp=0):
        """
        Parameters
        ----------
        value : float
            The latency taken from the perf analyzer output
        timestamp : float
            Time elapsed since the start of the program
        """

        super().__init__(value, timestamp)

    def __eq__(self, other):
        """
        Two records are equal when
        their values are equal.
        """

        mine = self.value()
        theirs = other.value()
        return mine == theirs

    def __lt__(self, other):
        """
        A record is "less than" another when its
        latency value is the larger of the two
        (the comparison is intentionally inverted).
        """

        mine = self.value()
        theirs = other.value()
        return mine > theirs

    def __add__(self, other):
        """
        Sums the values of two records and returns
        the result as a new record of this
        record's class.
        """

        total = self.value() + other.value()
        return self.__class__(value=total)

    def __sub__(self, other):
        """
        Subtracts two records and returns the result
        as a new record of this record's class.
        ** Note the operands are reversed (other - self)
        because of the inverted nature of latency
        (lower is better)
        """

        theirs = other.value()
        mine = self.value()
        return self.__class__(value=(theirs - mine))
60 changes: 60 additions & 0 deletions model_analyzer/record/types/inter_token_latency_max.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python3

# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import total_ordering

from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase


@total_ordering
class InterTokenLatencyMax(InterTokenLatencyBase):
    """
    Record holding the maximum inter token latency
    metric reported by perf_analyzer.
    """

    # Identifier string for this record type
    tag = "inter_token_latency_max"

    def __init__(self, value, timestamp=0):
        """
        Parameters
        ----------
        value : float
            The latency taken from the perf analyzer output
        timestamp : float
            Time elapsed since the start of the program
        """

        # All storage and comparison logic lives in the base class
        super().__init__(value, timestamp=timestamp)

    @classmethod
    def header(cls, aggregation_tag=False):
        """
        Parameters
        ----------
        aggregation_tag: bool
            Optional flag indicating the record was
            aggregated (max, min, average etc.).
            It is accepted but not used by this
            implementation.

        Returns
        -------
        str
            The full, human readable name
            of the metric.
        """

        metric_name = "Max Inter Token Latency (ms)"
        return metric_name
60 changes: 60 additions & 0 deletions model_analyzer/record/types/inter_token_latency_min.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python3

# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import total_ordering

from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase


@total_ordering
class InterTokenLatencyMin(InterTokenLatencyBase):
    """
    Record holding the minimum inter token latency
    metric reported by perf_analyzer.
    """

    # Identifier string for this record type
    tag = "inter_token_latency_min"

    def __init__(self, value, timestamp=0):
        """
        Parameters
        ----------
        value : float
            The latency taken from the perf analyzer output
        timestamp : float
            Time elapsed since the start of the program
        """

        # All storage and comparison logic lives in the base class
        super().__init__(value, timestamp=timestamp)

    @classmethod
    def header(cls, aggregation_tag=False):
        """
        Parameters
        ----------
        aggregation_tag: bool
            Optional flag indicating the record was
            aggregated (max, min, average etc.).
            It is accepted but not used by this
            implementation.

        Returns
        -------
        str
            The full, human readable name
            of the metric.
        """

        metric_name = "Min Inter Token Latency (ms)"
        return metric_name
60 changes: 60 additions & 0 deletions model_analyzer/record/types/inter_token_latency_p75.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python3

# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import total_ordering

from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase


@total_ordering
class InterTokenLatencyP75(InterTokenLatencyBase):
    """
    Record holding the p75 inter token latency
    metric reported by perf_analyzer.
    """

    # Identifier string for this record type
    tag = "inter_token_latency_p75"

    def __init__(self, value, timestamp=0):
        """
        Parameters
        ----------
        value : float
            The latency taken from the perf analyzer output
        timestamp : float
            Time elapsed since the start of the program
        """

        # All storage and comparison logic lives in the base class
        super().__init__(value, timestamp=timestamp)

    @classmethod
    def header(cls, aggregation_tag=False):
        """
        Parameters
        ----------
        aggregation_tag: bool
            Optional flag indicating the record was
            aggregated (max, min, average etc.).
            It is accepted but not used by this
            implementation.

        Returns
        -------
        str
            The full, human readable name
            of the metric.
        """

        metric_name = "p75 Inter Token Latency (ms)"
        return metric_name
60 changes: 60 additions & 0 deletions model_analyzer/record/types/inter_token_latency_p90.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python3

# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import total_ordering

from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase


@total_ordering
class InterTokenLatencyP90(InterTokenLatencyBase):
    """
    A record for perf_analyzer p90 Inter token latency metric
    """

    # Identifier string for this record type
    tag = "inter_token_latency_p90"

    def __init__(self, value, timestamp=0):
        """
        Parameters
        ----------
        value : float
            the latency extracted from the perf analyzer output
        timestamp : float
            Elapsed time from start of program
        """

        super().__init__(value, timestamp)

    @classmethod
    def header(cls, aggregation_tag=False):
        """
        Parameters
        ----------
        aggregation_tag: bool
            An optional tag that may be displayed
            as part of the header indicating that
            this record has been aggregated using
            max, min or average etc.
            It is accepted but not used by this
            implementation.

        Returns
        -------
        str
            The full name of the
            metric.
        """

        return "p90 Inter Token Latency (ms)"


# Backward-compatible alias. This class was originally named
# InterTokenLatencyP99 even though its tag and header describe the p90
# metric. The alias keeps existing imports of the old name from this
# module working; new code should use InterTokenLatencyP90.
InterTokenLatencyP99 = InterTokenLatencyP90
Loading

0 comments on commit 2156c34

Please sign in to comment.