diff --git a/docs/config.md b/docs/config.md index 2fccc32f5..1eacfbdbd 100644 --- a/docs/config.md +++ b/docs/config.md @@ -236,6 +236,9 @@ cpu_only_composing_models: # Skips the generation of detailed reports and tables [ skip_detailed_reports: | default: false] +# Type of model being profiled: generic or LLM +[ model_type: | default: generic] + # Number of top configs to show in summary plots [ num_configs_per_model: | default: 3] @@ -364,14 +367,17 @@ Before proceeding, it will be helpful to see the documentation on [Model Analyze ### `` -A constraint, specifies the bounds that determine a successful run. There are -three constraints allowed: - -| Option Name | Units | Constraint | Description | | :----------------- | :-------: | :--------: | :--------------------------------------------------- | -| `perf_throughput` | inf / sec | min | Specify minimum desired throughput. | -| `perf_latency_p99` | ms | max | Specify maximum tolerable latency or latency budget. | -| `gpu_used_memory` | MB | max | Specify maximum GPU memory used by model. | +A constraint specifies the bounds that determine a successful run. The table below shows examples +of the types of constraints allowed: + +| Option Name | Units | Constraint | Description | +| :------------------------ | :-------: | :--------: | :----------------------------------------------------- | +| `perf_throughput` | inf / sec | min | Specify minimum desired throughput. | +| `perf_latency_p99` | ms | max | Specify maximum tolerable latency or latency budget. | +| `output_token_throughput` | tok / sec | min | Specify minimum desired output token throughput. | +| `inter_token_latency_p99` | ms | max | Specify maximum tolerable inter-token latency. | +| `time_to_first_token_p99` | ms | max | Specify maximum tolerable time to first token latency. | +| `gpu_used_memory` | MB | max | Specify maximum GPU memory used by model. |
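For reference, here is a minimal sketch of a profile config that exercises the new LLM options documented above. The model name, tokenizer, and threshold values are hypothetical; the option spellings follow the schema added in this change (`model_type`, a global `genai_perf_flags` section whose keys come from `GenaiPerfConfig.allowed_keys()`, and the new LLM constraint keys):

```yaml
# Hypothetical example, not part of this change
model_repository: /path/to/model/repository
model_type: LLM

profile_models:
  - my_llm_model

# Passed through to genai-perf; the schema stores values as strings,
# so the streaming flag is enabled with the string "true"
genai_perf_flags:
  streaming: "true"
  tokenizer: hf-internal-testing/llama-tokenizer

# LLM-aware constraints (see the table above)
constraints:
  time_to_first_token_p99:
    max: 200
  inter_token_latency_p99:
    max: 50
  output_token_throughput:
    min: 100
```

With `model_type: LLM`, the default inference output fields switch to `DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS` and perf_analyzer is launched through `genai-perf`, as wired up in the code changes below.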
diff --git a/model_analyzer/config/generate/brute_run_config_generator.py b/model_analyzer/config/generate/brute_run_config_generator.py index d226811aa..61d1accd4 100755 --- a/model_analyzer/config/generate/brute_run_config_generator.py +++ b/model_analyzer/config/generate/brute_run_config_generator.py @@ -129,7 +129,7 @@ def _generate_subset( self._send_results_to_generator(index) def _make_run_config(self) -> RunConfig: - run_config = RunConfig(self._triton_env) + run_config = RunConfig(self._triton_env, self._models[0].genai_perf_flags()) for index in range(len(self._models)): run_config.add_model_run_config(self._curr_model_run_configs[index]) return run_config diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py index ed6cc9d5b..fc13cdb08 100755 --- a/model_analyzer/config/input/config_command_profile.py +++ b/model_analyzer/config/input/config_command_profile.py @@ -31,6 +31,7 @@ ) from model_analyzer.constants import LOGGER_NAME from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException +from model_analyzer.perf_analyzer.genai_perf_config import GenaiPerfConfig from model_analyzer.perf_analyzer.perf_config import PerfAnalyzerConfig from model_analyzer.record.record import RecordType from model_analyzer.triton.server.server_config import TritonServerConfig @@ -50,7 +51,9 @@ DEFAULT_GPU_OUTPUT_FIELDS, DEFAULT_GPUS, DEFAULT_INFERENCE_OUTPUT_FIELDS, + DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS, DEFAULT_MAX_RETRIES, + DEFAULT_MODEL_TYPE, DEFAULT_MODEL_WEIGHTING, DEFAULT_MONITORING_INTERVAL, DEFAULT_NUM_CONFIGS_PER_MODEL, @@ -297,6 +300,15 @@ def _fill_config(self): description="Skips the generation of detailed summary reports and tables.", ) ) + self._add_config( + ConfigField( + "model_type", + flags=["--model-type"], + field_type=ConfigPrimitive(str), + default_value=DEFAULT_MODEL_TYPE, + description="Type of model being profiled: generic or LLM", + ) + ) self._add_repository_configs() self._add_client_configs() @@ -364,6 +376,10 @@ def _add_profile_models_configs(self): } ) + genai_perf_flags_scheme = ConfigObject( + schema={k: ConfigPrimitive(str) for k in GenaiPerfConfig.allowed_keys()} + ) + triton_server_environment_scheme = ConfigObject( schema={"*": ConfigPrimitive(str)} ) @@ -444,6 +460,13 @@ def _add_profile_models_configs(self): description="Allows custom configuration of the perf analyzer instances used by model analyzer.", ) ) + self._add_config( + ConfigField( + "genai_perf_flags", + field_type=genai_perf_flags_scheme, + description="Allows custom configuration of the GenAI Perf instances used by model analyzer.", + ) + ) self._add_config( ConfigField( "triton_server_flags", @@ -484,6 +507,11 @@ def _add_profile_models_configs(self): "min": ConfigPrimitive(int), } ), + "output_token_throughput": ConfigObject( + schema={ + "min": ConfigPrimitive(int), + } + ), "perf_latency_avg": ConfigObject( schema={ "max": ConfigPrimitive(int), @@ -514,6 +542,96 @@ def _add_profile_models_configs(self): "max": ConfigPrimitive(int), } ), + "inter_token_latency_p99": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "inter_token_latency_p95": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "inter_token_latency_p90": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "inter_token_latency_p75": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "inter_token_latency_p50": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + 
"inter_token_latency_p25": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "inter_token_latency_min": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "inter_token_latency_max": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "inter_token_latency_avg": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_p99": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_p95": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_p90": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_p75": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_p50": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_p25": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_min": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_max": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), + "time_to_first_token_avg": ConfigObject( + schema={ + "max": ConfigPrimitive(int), + } + ), } ) self._add_config( @@ -560,6 +678,7 @@ def _add_profile_models_configs(self): "weighting": ConfigPrimitive(type_=int), "model_config_parameters": model_config_fields, "perf_analyzer_flags": perf_analyzer_flags_scheme, + "genai_perf_flags": genai_perf_flags_scheme, "triton_server_flags": triton_server_flags_scheme, "triton_server_environment": triton_server_environment_scheme, "triton_docker_args": triton_docker_args_scheme, @@ -1344,6 +1463,12 @@ def _autofill_values(self): if not self._fields["gpu_output_fields"].is_set_by_user(): self.gpu_output_fields = DEFAULT_REQUEST_RATE_GPU_OUTPUT_FIELDS + # Switch default output fields if user specifies model type of LLM + # and the user didn't specify a custom output field + if self.model_type == "LLM": + if not self._fields["inference_output_fields"].is_set_by_user(): + self.inference_output_fields = DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS + new_profile_models = {} for i, model in enumerate(self.profile_models): new_model = {"cpu_only": (model.cpu_only() or cpu_only)} @@ -1447,6 +1572,12 @@ def _autofill_values(self): else: new_model["perf_analyzer_flags"] = model.perf_analyzer_flags() + # GenAI Perf flags + if not model.genai_perf_flags(): + new_model["genai_perf_flags"] = self.genai_perf_flags + else: + new_model["genai_perf_flags"] = model.genai_perf_flags() + # triton server flags if not model.triton_server_flags(): new_model["triton_server_flags"] = self.triton_server_flags diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py index 368aed008..8685537f7 100755 --- a/model_analyzer/config/input/config_defaults.py +++ b/model_analyzer/config/input/config_defaults.py @@ -68,6 +68,7 @@ DEFAULT_PERF_OUTPUT_FLAG = False DEFAULT_PERF_MAX_AUTO_ADJUSTS = 10 DEFAULT_MEASUREMENT_MODE = "count_windows" +DEFAULT_MODEL_TYPE = "generic" DEFAULT_ONLINE_PLOTS = { "throughput_v_latency": { @@ -118,6 +119,20 @@ "perf_throughput", "perf_latency_p99", ] +DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS = [ + "model_name", + "batch_size", + "concurrency", + "model_config_path", + "instance_group", + "max_batch_size", + "satisfies_constraints", + "perf_throughput", + "perf_latency_p99", + "inter_token_latency_p99", + "time_to_first_token_p99", + "output_token_throughput", +] DEFAULT_REQUEST_RATE_INFERENCE_OUTPUT_FIELDS = [ 
"model_name", "batch_size", diff --git a/model_analyzer/config/input/objects/config_model_profile_spec.py b/model_analyzer/config/input/objects/config_model_profile_spec.py index d45e68d41..250cc4980 100755 --- a/model_analyzer/config/input/objects/config_model_profile_spec.py +++ b/model_analyzer/config/input/objects/config_model_profile_spec.py @@ -33,6 +33,7 @@ def __init__( parameters=None, model_config_parameters=None, perf_analyzer_flags=None, + genai_perf_flags=None, triton_server_flags=None, triton_server_environment=None, triton_docker_args=None, @@ -58,6 +59,9 @@ def __init__( perf_analyzer_flags : dict The custom perf analyzer configuration for this model + genai_perf_flags : dict + The custom GenAI perf configuration + for this model triton_server_flags : dict The configuration for the triton server instance launched for this model @@ -78,6 +82,7 @@ def __init__( self._parameters = parameters self._model_config_parameters = model_config_parameters self._perf_analyzer_flags = perf_analyzer_flags + self._genai_perf_flags = genai_perf_flags self._triton_server_flags = triton_server_flags self._triton_server_environment = triton_server_environment self._triton_docker_args = triton_docker_args @@ -162,6 +167,16 @@ def perf_analyzer_flags(self): return self._perf_analyzer_flags + def genai_perf_flags(self): + """ + Returns + ------- + dict: + the genai_perf_flags + """ + + return self._genai_perf_flags + def triton_server_flags(self): """ Returns @@ -304,4 +319,7 @@ def __repr__(self): if self._perf_analyzer_flags: model_object["perf_analyzer_flags"] = self._perf_analyzer_flags + if self._genai_perf_flags: + model_object["genai_perf_flags"] = self._genai_perf_flags + return str(model_object) diff --git a/model_analyzer/config/run/run_config.py b/model_analyzer/config/run/run_config.py index 29efcaf08..9b53d8266 100755 --- a/model_analyzer/config/run/run_config.py +++ b/model_analyzer/config/run/run_config.py @@ -17,6 +17,7 @@ from typing import List from model_analyzer.config.run.model_run_config import ModelRunConfig +from model_analyzer.perf_analyzer.genai_perf_config import GenaiPerfConfig class RunConfig: @@ -25,16 +26,21 @@ class RunConfig: at the same time in Perf Analyzer """ - def __init__(self, triton_env): + def __init__(self, triton_env, genai_perf_flags=None): """ Parameters ---------- triton_env : dict A dictionary of environment variables to set when launching tritonserver + + genai_perf_flags: dict + The set of flags used when calling genai_perf for LLM models """ self._triton_env = triton_env + self._genai_perf_config = GenaiPerfConfig() + self._genai_perf_config.update_config(genai_perf_flags) self._model_run_configs: List[ModelRunConfig] = [] def add_model_run_config(self, model_run_config): @@ -103,6 +109,9 @@ def triton_environment(self): return self._triton_env + def genai_perf_config(self): + return self._genai_perf_config + def models_name(self): """Returns a single comma-joined name of the original model names""" return ",".join([mrc.model_name() for mrc in self.model_run_configs()]) diff --git a/model_analyzer/constants.py b/model_analyzer/constants.py index 886360d34..6d7682515 100755 --- a/model_analyzer/constants.py +++ b/model_analyzer/constants.py @@ -70,3 +70,7 @@ # Model analyzer package name PACKAGE_NAME = "triton-model-analyzer" + +# GENAI-PERF +GENAI_PERF_CSV = "profile_export_genai_perf.csv" +GENAI_PERF_COLLATERAL = ["llm_inputs.json", "profile_export.json"] diff --git a/model_analyzer/perf_analyzer/genai_perf_config.py 
b/model_analyzer/perf_analyzer/genai_perf_config.py new file mode 100755 index 000000000..9e5a77201 --- /dev/null +++ b/model_analyzer/perf_analyzer/genai_perf_config.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException + + +class GenaiPerfConfig: + """ + A config class to set arguments to the genai_perf. + An argument set to None will use the genai_perf's default. + """ + + genai_perf_args = [ + "backend", + "endpoint", + "service-kind", + "url", + "expected-output-tokens", + "input-dataset", + "input-tokens-mean", + "input-tokens-stddev", + "input-type", + "num-of-output-prompts", + "random-seed", + "streaming", + "tokenizer", + ] + + boolean_args = ["streaming"] + + def __init__(self): + """ + Construct a GenaiPerfConfig + """ + + self._args = {k: None for k in self.genai_perf_args} + + @classmethod + def allowed_keys(cls): + """ + Returns + ------- + list of str + The keys that are allowed to be + passed into genai_perf + """ + + return cls.genai_perf_args + + def update_config(self, params=None): + """ + Allows setting values from a params dict + + Parameters + ---------- + params: dict + keys are allowed args to genai_perf + """ + + if params and type(params) is dict: + for key in params: + self[key] = params[key] + + @classmethod + def from_dict(cls, genai_perf_config_dict): + genai_perf_config = GenaiPerfConfig() + for key in [ + "_args", + ]: + if key in genai_perf_config_dict: + setattr(genai_perf_config, key, genai_perf_config_dict[key]) + return genai_perf_config + + def representation(self): + """ + Returns + ------- + str + a string representation of the Genai Perf config + that removes values which can vary between + runs, but should be ignored when determining + if a previous (checkpointed) run can be used + """ + cli_string = self.to_cli_string() + + return cli_string + + def to_cli_string(self) -> str: + """ + Utility function to convert a config into a + string of arguments to genai_perf with CLI. + + Returns + ------- + str + cli command string consisting of all arguments + to genai_perf set in the config, without + the executable name.
+ """ + + # single dashed options, then verbose flags, then main args + args = [] + args.extend(self._parse_options()) + + return " ".join(args) + + def _parse_options(self): + """ + Parse the genai perf args + """ + temp_args = [] + for key, value in self._args.items(): + if key in self.boolean_args: + temp_args = self._parse_boolean_args(key, value, temp_args) + elif value: + temp_args.append(f"--{key}={value}") + return temp_args + + def _parse_boolean_args(self, key, value, temp_args): + """ + Parse genai perf args that should not add a value to the cli string + """ + assert type(value) in [ + str, + type(None), + ], f"Data type for arg {key} must be a (boolean) string instead of {type(value)}" + if value != None and value.lower() == "true": + temp_args.append(f"--{key}") + return temp_args + + def __getitem__(self, key): + """ + Gets an arguments value in config + + Parameters + ---------- + key : str + The name of the argument to the genai perf config + + Returns + ------- + object + The value that the argument is set to in this config + + Raises + ------ + KeyError + If argument not found in the config + """ + + if key in self._args: + return self._args[key] + else: + raise TritonModelAnalyzerException( + f"Key {key} does not exist in genai_perf_flags." + ) + + def __setitem__(self, key, value): + """ + Sets an arguments value in config + after checking if defined/supported. + + Parameters + ---------- + key : str + The name of the argument in genai_perf + value : (any) + The value to which the argument is being set + + Raises + ------ + TritonModelAnalyzerException + If key is unsupported or undefined in the + config class + """ + + if key in self._args: + self._args[key] = value + else: + raise TritonModelAnalyzerException( + f"The argument '{key}' to the genai_perf " + "is not supported by model analyzer." + ) + + def __contains__(self, key): + """ + Returns + ------- + True if key is in perf_config i.e. 
the key is a + genai perf config argument + """ + + return key in GenaiPerfConfig.allowed_keys() diff --git a/model_analyzer/perf_analyzer/perf_analyzer.py b/model_analyzer/perf_analyzer/perf_analyzer.py index c88f8e655..b301ee97e 100755 --- a/model_analyzer/perf_analyzer/perf_analyzer.py +++ b/model_analyzer/perf_analyzer/perf_analyzer.py @@ -21,12 +21,16 @@ import re import signal import tempfile +from csv import DictReader from subprocess import STDOUT, Popen -from typing import Dict, List +from typing import Dict, List, Optional import psutil +from model_analyzer.config.input.config_defaults import DEFAULT_MODEL_TYPE from model_analyzer.constants import ( + GENAI_PERF_COLLATERAL, + GENAI_PERF_CSV, INTERVAL_SLEEP_TIME, LOGGER_NAME, MEASUREMENT_REQUEST_COUNT_STEP, @@ -40,6 +44,16 @@ from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory from model_analyzer.record.types.gpu_utilization import GPUUtilization +from model_analyzer.record.types.inter_token_latency_avg import InterTokenLatencyAvg +from model_analyzer.record.types.inter_token_latency_max import InterTokenLatencyMax +from model_analyzer.record.types.inter_token_latency_min import InterTokenLatencyMin +from model_analyzer.record.types.inter_token_latency_p25 import InterTokenLatencyP25 +from model_analyzer.record.types.inter_token_latency_p50 import InterTokenLatencyP50 +from model_analyzer.record.types.inter_token_latency_p75 import InterTokenLatencyP75 +from model_analyzer.record.types.inter_token_latency_p90 import InterTokenLatencyP90 +from model_analyzer.record.types.inter_token_latency_p95 import InterTokenLatencyP95 +from model_analyzer.record.types.inter_token_latency_p99 import InterTokenLatencyP99 +from model_analyzer.record.types.output_token_throughput import OutputTokenThroughput from model_analyzer.record.types.perf_client_response_wait import PerfClientResponseWait from model_analyzer.record.types.perf_client_send_recv import PerfClientSendRecv from model_analyzer.record.types.perf_latency_avg import PerfLatencyAvg @@ -53,6 +67,15 @@ ) from model_analyzer.record.types.perf_server_queue import PerfServerQueue from model_analyzer.record.types.perf_throughput import PerfThroughput +from model_analyzer.record.types.time_to_first_token_avg import TimeToFirstTokenAvg +from model_analyzer.record.types.time_to_first_token_max import TimeToFirstTokenMax +from model_analyzer.record.types.time_to_first_token_min import TimeToFirstTokenMin +from model_analyzer.record.types.time_to_first_token_p25 import TimeToFirstTokenP25 +from model_analyzer.record.types.time_to_first_token_p50 import TimeToFirstTokenP50 +from model_analyzer.record.types.time_to_first_token_p75 import TimeToFirstTokenP75 +from model_analyzer.record.types.time_to_first_token_p90 import TimeToFirstTokenP90 +from model_analyzer.record.types.time_to_first_token_p95 import TimeToFirstTokenP95 +from model_analyzer.record.types.time_to_first_token_p99 import TimeToFirstTokenP99 logger = logging.getLogger(LOGGER_NAME) @@ -91,6 +114,28 @@ class PerfAnalyzer: ["gpu_used_memory", "Max GPU Memory Usage", GPUUsedMemory, "1000000"], ["gpu_free_memory", "Total GPU Memory", GPUFreeMemory, "1000000"] ] + + llm_metric_table = [ + ["time_to_first_token_avg", "Time To First Token (ns) avg", TimeToFirstTokenAvg, "1000"], + ["time_to_first_token_min", "Time To First Token (ns) min", TimeToFirstTokenMin, "1000"], + ["time_to_first_token_max", "Time To First Token (ns) max", TimeToFirstTokenMax, 
"1000"], + ["time_to_first_token_p99", "Time To First Token (ns) p99", TimeToFirstTokenP99, "1000"], + ["time_to_first_token_p95", "Time To First Token (ns) p95", TimeToFirstTokenP95, "1000"], + ["time_to_first_token_p90", "Time To First Token (ns) p90", TimeToFirstTokenP90, "1000"], + ["time_to_first_token_p75", "Time To First Token (ns) p75", TimeToFirstTokenP75, "1000"], + ["time_to_first_token_p50", "Time To First Token (ns) p50", TimeToFirstTokenP50, "1000"], + ["time_to_first_token_p25", "Time To First Token (ns) p25", TimeToFirstTokenP25, "1000"], + ["inter_token_latency_avg", "Inter Token Latency (ns) avg", InterTokenLatencyAvg, "1000"], + ["inter_token_latency_min", "Inter Token Latency (ns) min", InterTokenLatencyMin, "1000"], + ["inter_token_latency_max", "Inter Token Latency (ns) max", InterTokenLatencyMax, "1000"], + ["inter_token_latency_p99", "Inter Token Latency (ns) p99", InterTokenLatencyP99, "1000"], + ["inter_token_latency_p95", "Inter Token Latency (ns) p95", InterTokenLatencyP95, "1000"], + ["inter_token_latency_p90", "Inter Token Latency (ns) p90", InterTokenLatencyP90, "1000"], + ["inter_token_latency_p75", "Inter Token Latency (ns) p75", InterTokenLatencyP75, "1000"], + ["inter_token_latency_p50", "Inter Token Latency (ns) p50", InterTokenLatencyP50, "1000"], + ["inter_token_latency_p25", "Inter Token Latency (ns) p25", InterTokenLatencyP25, "1000"], + ["output_token_throughput", "Output Token Throughput (per sec) avg", OutputTokenThroughput, "1"] + ] # yapf: enable @staticmethod @@ -109,7 +154,23 @@ def get_gpu_metrics(): ] return gpu_metrics - def __init__(self, path, config, max_retries, timeout, max_cpu_util): + @staticmethod + def get_llm_metrics(): + llm_metrics = [ + llm_metric[PerfAnalyzer.RECORD_CLASS] + for llm_metric in PerfAnalyzer.llm_metric_table + ] + return llm_metrics + + def __init__( + self, + path, + config, + max_retries, + timeout, + max_cpu_util, + model_type=DEFAULT_MODEL_TYPE, + ): """ Parameters ---------- @@ -133,8 +194,10 @@ def __init__(self, path, config, max_retries, timeout, max_cpu_util): self._timeout = timeout self._output = "" self._perf_records = {} + self._llm_records = {} self._gpu_records = [] self._max_cpu_util = max_cpu_util + self._model_type = model_type def run(self, metrics, env=None): """ @@ -195,7 +258,20 @@ def get_perf_records(self): if self._perf_records: return self._perf_records raise TritonModelAnalyzerException( - "Attempted to get perf_analyzer results" "without calling run first." + "Attempted to get perf_analyzer results without calling run first." + ) + + def get_llm_records(self): + """ + Returns + ------- + The LLM records from the last perf_analyzer run + """ + + if self._llm_records: + return self._llm_records + raise TritonModelAnalyzerException( + "Attempted to get perf_analyzer results without calling run first." 
) def get_gpu_records(self): @@ -248,14 +324,32 @@ def _get_cmd(self): return cmd def _get_single_model_cmd(self, index): - cmd = [self.bin_path] - if self._is_multi_model(): - cmd += ["--enable-mpi"] - cmd += self._get_pa_cli_command(index).replace("=", " ").split() + if self._model_type == "LLM": + cmd = ["genai-perf", "-m", self._config.models_name()] + cmd += self._get_genai_perf_cli_command(index).replace("=", " ").split() + cmd += ["--"] + cmd += ( + self._get_pa_cli_command(index, exclude_model_name=True) + .replace("=", " ") + .split() + ) + else: + cmd = [self.bin_path] + if self._is_multi_model(): + cmd += ["--enable-mpi"] + cmd += self._get_pa_cli_command(index).replace("=", " ").split() + return cmd - def _get_pa_cli_command(self, index): - return self._config.model_run_configs()[index].perf_config().to_cli_string() + def _get_pa_cli_command(self, index, exclude_model_name=False): + return ( + self._config.model_run_configs()[index] + .perf_config() + .to_cli_string(exclude_model_name) + ) + + def _get_genai_perf_cli_command(self, index): + return self._config.genai_perf_config().to_cli_string() def _create_env(self, env): perf_analyzer_env = os.environ.copy() @@ -438,6 +532,12 @@ def _is_multi_model(self): return len(self._config.model_run_configs()) > 1 def _parse_outputs(self, metrics): + self._parse_generic_outputs(metrics) + + if self._model_type == "LLM": + self._parse_llm_outputs(metrics) + + def _parse_generic_outputs(self, metrics): """ Extract records from the Perf Analyzer run for each model """ @@ -464,6 +564,26 @@ def _parse_outputs(self, metrics): for f in glob.glob(f"*{perf_config['latency-report-file']}"): os.remove(f) + def _parse_llm_outputs(self, metrics): + """ + Extract records from the Perf Analyzer run for each model + """ + + perf_config = self._config.model_run_configs()[0].perf_config() + + logger.debug(f"Reading GENAI-PERF results from {GENAI_PERF_CSV}") + with open(GENAI_PERF_CSV, mode="r") as f: + csv_reader = list(csv.DictReader(f, delimiter=",")) + + # See test_perf_analyzer::test_pa_llm_csv_output() for CSV output example + self._llm_records[perf_config["model-name"]] = self._extract_llm_records( + metrics, csv_reader + ) + + os.remove(GENAI_PERF_CSV) + for filename in GENAI_PERF_COLLATERAL: + os.remove(filename) + def _extract_perf_records_from_row( self, requested_metrics: List[Record], row_metrics: Dict[str, str] ) -> List[Record]: @@ -526,6 +646,46 @@ def _extract_gpu_records_from_row( self._cleanup_gpu_records(gpu_records) return gpu_records + def _extract_llm_records( + self, requested_metrics: List[Record], csv_reader: DictReader + ) -> List[Record]: + llm_records: List[Record] = [] + + for requested_metric in requested_metrics: + new_llm_record = self._get_llm_record_from_csv(requested_metric, csv_reader) + if new_llm_record: + llm_records.append(new_llm_record) + + return llm_records + + def _get_llm_record_from_csv( + self, requested_metric: Record, csv_reader: DictReader + ) -> Optional[Record]: + for row in csv_reader: + for key, value in row.items(): + metric_string = f"{row['Metric']} {key}" + llm_metric = self._find_corresponding_llm_metric_row(metric_string) + + if ( + llm_metric + and llm_metric[PerfAnalyzer.METRIC_TAG] == requested_metric.tag + ): + adjusted_value = float(value) / float( + llm_metric[PerfAnalyzer.REDUCTION_FACTOR] + ) + + llm_record = llm_metric[PerfAnalyzer.RECORD_CLASS](adjusted_value) # type: ignore + return llm_record + + return None + + def _find_corresponding_llm_metric_row(self, metric_string: str) -> 
Optional[List]: + for row in PerfAnalyzer.llm_metric_table: + if metric_string == row[PerfAnalyzer.CSV_STRING]: + return row + + return None + def _cleanup_gpu_records(self, gpu_records): # Recalculate GPUFreeMemory by removing the value of the associated GPUUsedMemory # Remove any GPUFreeMemory records that don't have a matching GPUUsedMemory diff --git a/model_analyzer/perf_analyzer/perf_config.py b/model_analyzer/perf_analyzer/perf_config.py index e9160a44a..521cc1629 100755 --- a/model_analyzer/perf_analyzer/perf_config.py +++ b/model_analyzer/perf_analyzer/perf_config.py @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import List + from model_analyzer.config.input.config_defaults import DEFAULT_MEASUREMENT_MODE from model_analyzer.constants import SECONDS_TO_MILLISECONDS_MULTIPLIER from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException @@ -325,7 +327,7 @@ def remove_mrc_from_cli_string(cls, cli_string): return " ".join(perf_str_tokens) - def to_cli_string(self): + def to_cli_string(self, exclude_model_name: bool = False) -> str: """ Utility function to convert a config into a string of arguments to the perf_analyzer with CLI. @@ -340,19 +342,22 @@ def to_cli_string(self): # single dashed options, then verbose flags, then main args args = [] - args.extend(self._parse_short_options()) + args.extend(self._parse_short_options(exclude_model_name)) args.extend(self._parse_verbose_options()) args.extend(self._parse_long_options()) return " ".join(args) - def _parse_short_options(self): + def _parse_short_options(self, exclude_model_name: bool = False) -> List: """ Parse the perf analyzer single dash options """ temp_args = [] for key, value in self._options.items(): if value: + if exclude_model_name and key == "-m": + continue + if key in self._additive_args: for additive_value in value: temp_args.append(f"{key} {additive_value}") diff --git a/model_analyzer/record/metrics_manager.py b/model_analyzer/record/metrics_manager.py index 581cae88b..849731935 100755 --- a/model_analyzer/record/metrics_manager.py +++ b/model_analyzer/record/metrics_manager.py @@ -69,6 +69,25 @@ class MetricsManager: "gpu_power_usage", "cpu_available_ram", "cpu_used_ram", + "time_to_first_token_avg", + "time_to_first_token_min", + "time_to_first_token_max", + "time_to_first_token_p99", + "time_to_first_token_p95", + "time_to_first_token_p90", + "time_to_first_token_p75", + "time_to_first_token_p50", + "time_to_first_token_p25", + "inter_token_latency_avg", + "inter_token_latency_min", + "inter_token_latency_max", + "inter_token_latency_p99", + "inter_token_latency_p95", + "inter_token_latency_p90", + "inter_token_latency_p75", + "inter_token_latency_p50", + "inter_token_latency_p25", + "output_token_throughput", ] def __init__(self, config, client, server, gpus, result_manager, state_manager): @@ -115,6 +134,7 @@ def __init__(self, config, client, server, gpus, result_manager, state_manager): ( self._gpu_metrics, self._perf_metrics, + self._llm_metrics, self._cpu_metrics, ) = self._categorize_metrics(self.metrics, self._config.collect_cpu_metrics) self._gpus = gpus @@ -160,21 +180,23 @@ def _categorize_metrics(metric_tags, collect_cpu_metrics=False): Returns ------- - (list,list,list) - tuple of three lists (DCGM, PerfAnalyzer, CPU) metrics + (list,list,list,list) + tuple of four lists (DCGM, PerfAnalyzer, LLM, CPU) metrics """ - gpu_metrics, perf_metrics, cpu_metrics = [], [], [] + gpu_metrics, 
perf_metrics, llm_metrics, cpu_metrics = [], [], [], [] # Separates metrics and objectives into related lists for metric in MetricsManager.get_metric_types(metric_tags): if metric in PerfAnalyzer.get_gpu_metrics(): gpu_metrics.append(metric) elif metric in PerfAnalyzer.get_perf_metrics(): perf_metrics.append(metric) + elif metric in PerfAnalyzer.get_llm_metrics(): + llm_metrics.append(metric) elif collect_cpu_metrics and (metric in CPUMonitor.cpu_metrics): cpu_metrics.append(metric) - return gpu_metrics, perf_metrics, cpu_metrics + return gpu_metrics, perf_metrics, llm_metrics, cpu_metrics def profile_server(self): """ @@ -589,9 +611,10 @@ def _run_perf_analyzer( max_retries=self._config.perf_analyzer_max_auto_adjusts, timeout=self._config.perf_analyzer_timeout, max_cpu_util=self._config.perf_analyzer_cpu_util, + model_type=self._config.model_type, ) - metrics_to_gather = self._perf_metrics + self._gpu_metrics + metrics_to_gather = self._perf_metrics + self._llm_metrics + self._gpu_metrics status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env) self._write_perf_analyzer_output(perf_output_writer, perf_analyzer) @@ -601,6 +624,12 @@ def _run_perf_analyzer( return (None, None) perf_records = perf_analyzer.get_perf_records() + + if self._config.model_type == "LLM": + perf_records[run_config.models_name()].extend( + perf_analyzer.get_llm_records()[run_config.models_name()] + ) + gpu_records = perf_analyzer.get_gpu_records() aggregated_perf_records = self._aggregate_perf_records(perf_records) @@ -824,6 +853,17 @@ def is_perf_analyzer_metric(tag): metric = MetricsManager.get_metric_types([tag])[0] return metric in PerfAnalyzer.get_perf_metrics() + @staticmethod + def is_llm_metric(tag): + """ + Returns + ------ + True if the given tag is a supported LLM metric + False otherwise + """ + metric = MetricsManager.get_metric_types([tag])[0] + return metric in PerfAnalyzer.get_llm_metrics() + @staticmethod def is_cpu_metric(tag): """ diff --git a/model_analyzer/record/types/inter_token_latency_avg.py b/model_analyzer/record/types/inter_token_latency_avg.py new file mode 100755 index 000000000..fe1dc7dfb --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_avg.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyAvg(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_avg" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "Avg Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_base.py b/model_analyzer/record/types/inter_token_latency_base.py new file mode 100755 index 000000000..dda70cefa --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_base.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.record import DecreasingRecord + + +@total_ordering +class InterTokenLatencyBase(DecreasingRecord): + """ + A record for perf_analyzer Inter token latency metric + """ + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + def __eq__(self, other): + """ + Allows checking for + equality between two records + """ + + return self.value() == other.value() + + def __lt__(self, other): + """ + Allows checking if + this record is less than + the other + """ + + return self.value() > other.value() + + def __add__(self, other): + """ + Allows adding two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() + other.value())) + + def __sub__(self, other): + """ + Allows subbing two records together + to produce a brand new record. + + ** Note this does reverse subtraction because + of the inverted nature of latency (lower is better) + """ + + return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/inter_token_latency_max.py b/model_analyzer/record/types/inter_token_latency_max.py new file mode 100755 index 000000000..ce2484144 --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_max.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyMax(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_max" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "Max Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_min.py b/model_analyzer/record/types/inter_token_latency_min.py new file mode 100755 index 000000000..21e44883b --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_min.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyMin(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_min" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. 
+ """ + + return "Min Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p25.py b/model_analyzer/record/types/inter_token_latency_p25.py new file mode 100755 index 000000000..8a0c80173 --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p25.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP25(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p25" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p25 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p50.py b/model_analyzer/record/types/inter_token_latency_p50.py new file mode 100755 index 000000000..190920874 --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p50.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP50(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p50" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p50 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p75.py b/model_analyzer/record/types/inter_token_latency_p75.py new file mode 100755 index 000000000..1234306fd --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p75.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP75(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p75" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p75 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p90.py b/model_analyzer/record/types/inter_token_latency_p90.py new file mode 100755 index 000000000..60019088a --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p90.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP90(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p90" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p90 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p95.py b/model_analyzer/record/types/inter_token_latency_p95.py new file mode 100755 index 000000000..b77fd9118 --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p95.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP95(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p95" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p95 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/inter_token_latency_p99.py b/model_analyzer/record/types/inter_token_latency_p99.py new file mode 100755 index 000000000..d9f722772 --- /dev/null +++ b/model_analyzer/record/types/inter_token_latency_p99.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.inter_token_latency_base import InterTokenLatencyBase + + +@total_ordering +class InterTokenLatencyP99(InterTokenLatencyBase): + """ + A record for perf_analyzer Inter token latency metric + """ + + tag = "inter_token_latency_p99" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p99 Inter Token Latency (ms)" diff --git a/model_analyzer/record/types/output_token_throughput.py b/model_analyzer/record/types/output_token_throughput.py new file mode 100755 index 000000000..f7edf7cb8 --- /dev/null +++ b/model_analyzer/record/types/output_token_throughput.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.record import IncreasingRecord + + +@total_ordering +class OutputTokenThroughput(IncreasingRecord): + """ + A record for perf_analyzer + metric 'Output Token Throughput' + """ + + tag = "output_token_throughput" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + The throughput from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @staticmethod + def value_function(): + """ + Returns the total value from a list + + Returns + ------- + Total value of the list + """ + return sum + + @staticmethod + def header(aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. 
+ """ + + return "Output Token Throughput (infer/sec)" + + def __eq__(self, other): + """ + Allows checking for + equality between two records + """ + + return self.value() == other.value() + + def __lt__(self, other): + """ + Allows checking if + this record is less than + the other + """ + + return self.value() < other.value() + + def __add__(self, other): + """ + Allows adding two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() + other.value())) + + def __sub__(self, other): + """ + Allows subtracting two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() - other.value())) diff --git a/model_analyzer/record/types/perf_latency_avg.py b/model_analyzer/record/types/perf_latency_avg.py index 5452c0b79..aafbcbeb2 100755 --- a/model_analyzer/record/types/perf_latency_avg.py +++ b/model_analyzer/record/types/perf_latency_avg.py @@ -16,11 +16,11 @@ from functools import total_ordering -from model_analyzer.record.record import DecreasingRecord +from model_analyzer.record.types.perf_latency_base import PerfLatencyBase @total_ordering -class PerfLatencyAvg(DecreasingRecord): +class PerfLatencyAvg(PerfLatencyBase): """ A record for perf_analyzer latency metric """ @@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False): """ return "Avg Latency (ms)" - - def __eq__(self, other): - """ - Allows checking for - equality between two records - """ - - return self.value() == other.value() - - def __lt__(self, other): - """ - Allows checking if - this record is less than - the other - """ - - return self.value() > other.value() - - def __add__(self, other): - """ - Allows adding two records together - to produce a brand new record. - """ - - return self.__class__(value=(self.value() + other.value())) - - def __sub__(self, other): - """ - Allows subbing two records together - to produce a brand new record. - - ** Note this does reverse subtraction because - of the inverted nature of latency (lower is better) - """ - - return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/perf_latency_base.py b/model_analyzer/record/types/perf_latency_base.py new file mode 100755 index 000000000..3c3e76cac --- /dev/null +++ b/model_analyzer/record/types/perf_latency_base.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import total_ordering + +from model_analyzer.record.record import DecreasingRecord + + +@total_ordering +class PerfLatencyBase(DecreasingRecord): + """ + A base class for perf_analyzer latency metric + """ + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + def __eq__(self, other): + """ + Allows checking for + equality between two records + """ + + return self.value() == other.value() + + def __lt__(self, other): + """ + Allows checking if + this record is less than + the other + """ + + return self.value() > other.value() + + def __add__(self, other): + """ + Allows adding two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() + other.value())) + + def __sub__(self, other): + """ + Allows subbing two records together + to produce a brand new record. + + ** Note this does reverse subtraction because + of the inverted nature of latency (lower is better) + """ + + return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/perf_latency_p90.py b/model_analyzer/record/types/perf_latency_p90.py index c6718fe40..7eafa3b28 100755 --- a/model_analyzer/record/types/perf_latency_p90.py +++ b/model_analyzer/record/types/perf_latency_p90.py @@ -16,11 +16,11 @@ from functools import total_ordering -from model_analyzer.record.record import DecreasingRecord +from model_analyzer.record.types.perf_latency_base import PerfLatencyBase @total_ordering -class PerfLatencyP90(DecreasingRecord): +class PerfLatencyP90(PerfLatencyBase): """ A record for perf_analyzer latency metric """ @@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False): """ return "p90 Latency (ms)" - - def __eq__(self, other): - """ - Allows checking for - equality between two records - """ - - return self.value() == other.value() - - def __lt__(self, other): - """ - Allows checking if - this record is less than - the other - """ - - return self.value() > other.value() - - def __add__(self, other): - """ - Allows adding two records together - to produce a brand new record. - """ - - return self.__class__(value=(self.value() + other.value())) - - def __sub__(self, other): - """ - Allows subbing two records together - to produce a brand new record. 
- - ** Note this does reverse subtraction because - of the inverted nature of latency (lower is better) - """ - - return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/perf_latency_p95.py b/model_analyzer/record/types/perf_latency_p95.py index 84ed9e648..ccb9f8c01 100755 --- a/model_analyzer/record/types/perf_latency_p95.py +++ b/model_analyzer/record/types/perf_latency_p95.py @@ -16,11 +16,11 @@ from functools import total_ordering -from model_analyzer.record.record import DecreasingRecord +from model_analyzer.record.types.perf_latency_base import PerfLatencyBase @total_ordering -class PerfLatencyP95(DecreasingRecord): +class PerfLatencyP95(PerfLatencyBase): """ A record for perf_analyzer latency metric """ @@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False): """ return "p95 Latency (ms)" - - def __eq__(self, other): - """ - Allows checking for - equality between two records - """ - - return self.value() == other.value() - - def __lt__(self, other): - """ - Allows checking if - this record is less than - the other - """ - - return self.value() > other.value() - - def __add__(self, other): - """ - Allows adding two records together - to produce a brand new record. - """ - - return self.__class__(value=(self.value() + other.value())) - - def __sub__(self, other): - """ - Allows subbing two records together - to produce a brand new record. - - ** Note this does reverse subtraction because - of the inverted nature of latency (lower is better) - """ - - return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/perf_latency_p99.py b/model_analyzer/record/types/perf_latency_p99.py index af4d06da4..46d352021 100755 --- a/model_analyzer/record/types/perf_latency_p99.py +++ b/model_analyzer/record/types/perf_latency_p99.py @@ -16,11 +16,11 @@ from functools import total_ordering -from model_analyzer.record.record import DecreasingRecord +from model_analyzer.record.types.perf_latency_base import PerfLatencyBase @total_ordering -class PerfLatencyP99(DecreasingRecord): +class PerfLatencyP99(PerfLatencyBase): """ A record for perf_analyzer latency metric """ @@ -58,39 +58,3 @@ def header(cls, aggregation_tag=False): """ return "p99 Latency (ms)" - - def __eq__(self, other): - """ - Allows checking for - equality between two records - """ - - return self.value() == other.value() - - def __lt__(self, other): - """ - Allows checking if - this record is less than - the other - """ - - return self.value() > other.value() - - def __add__(self, other): - """ - Allows adding two records together - to produce a brand new record. - """ - - return self.__class__(value=(self.value() + other.value())) - - def __sub__(self, other): - """ - Allows subbing two records together - to produce a brand new record. - - ** Note this does reverse subtraction because - of the inverted nature of latency (lower is better) - """ - - return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/time_to_first_token_avg.py b/model_analyzer/record/types/time_to_first_token_avg.py new file mode 100755 index 000000000..28da5d294 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_avg.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenAvg(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_avg" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "Avg Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_base.py b/model_analyzer/record/types/time_to_first_token_base.py new file mode 100755 index 000000000..5ef6e9070 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_base.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.record import DecreasingRecord + + +@total_ordering +class TimeToFirstTokenBase(DecreasingRecord): + """ + A base class record for perf_analyzer time to first token metric + """ + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + def __eq__(self, other): + """ + Allows checking for + equality between two records + """ + + return self.value() == other.value() + + def __lt__(self, other): + """ + Allows checking if + this record is less than + the other + """ + + return self.value() > other.value() + + def __add__(self, other): + """ + Allows adding two records together + to produce a brand new record. + """ + + return self.__class__(value=(self.value() + other.value())) + + def __sub__(self, other): + """ + Allows subbing two records together + to produce a brand new record. 
+ + ** Note this does reverse subtraction because + of the inverted nature of latency (lower is better) + """ + + return self.__class__(value=(other.value() - self.value())) diff --git a/model_analyzer/record/types/time_to_first_token_max.py b/model_analyzer/record/types/time_to_first_token_max.py new file mode 100755 index 000000000..f9ccc0a52 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_max.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenMax(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_max" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "Max Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_min.py b/model_analyzer/record/types/time_to_first_token_min.py new file mode 100755 index 000000000..4cc563c86 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_min.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenMin(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_min" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "Min Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p25.py b/model_analyzer/record/types/time_to_first_token_p25.py new file mode 100755 index 000000000..5938ca3eb --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p25.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP25(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p25" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p25 Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p50.py b/model_analyzer/record/types/time_to_first_token_p50.py new file mode 100755 index 000000000..a3440b456 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p50.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP50(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p50" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p50 Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p75.py b/model_analyzer/record/types/time_to_first_token_p75.py new file mode 100755 index 000000000..042972368 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p75.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP75(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p75" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p75 Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p90.py b/model_analyzer/record/types/time_to_first_token_p90.py new file mode 100755 index 000000000..853adbdb4 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p90.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP90(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p90" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. + """ + + return "p90 Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p95.py b/model_analyzer/record/types/time_to_first_token_p95.py new file mode 100755 index 000000000..6e466c4e2 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p95.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP95(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p95" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. 
+ """ + + return "p95 Time To First Token (ms)" diff --git a/model_analyzer/record/types/time_to_first_token_p99.py b/model_analyzer/record/types/time_to_first_token_p99.py new file mode 100755 index 000000000..24f2ff088 --- /dev/null +++ b/model_analyzer/record/types/time_to_first_token_p99.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import total_ordering + +from model_analyzer.record.types.time_to_first_token_base import TimeToFirstTokenBase + + +@total_ordering +class TimeToFirstTokenP99(TimeToFirstTokenBase): + """ + A record for perf_analyzer Time to first token metric + """ + + tag = "time_to_first_token_p99" + + def __init__(self, value, timestamp=0): + """ + Parameters + ---------- + value : float + the latency extracted from the perf analyzer output + timestamp : float + Elapsed time from start of program + """ + + super().__init__(value, timestamp) + + @classmethod + def header(cls, aggregation_tag=False): + """ + Parameters + ---------- + aggregation_tag: bool + An optional tag that may be displayed + as part of the header indicating that + this record has been aggregated using + max, min or average etc. + + Returns + ------- + str + The full name of the + metric. 
+ """ + + return "p99 Time To First Token (ms)" diff --git a/tests/test_cli.py b/tests/test_cli.py index 98ec60237..33a0dd4e0 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -65,6 +65,7 @@ def get_test_options(): OptionStruct("bool", "profile","--skip-summary-reports"), OptionStruct("bool", "profile","--skip-detailed-reports"), OptionStruct("bool", "profile","--always-report-gpu-metrics"), + #Int/Float options # Options format: # (int/float, MA step, long_option, short_option, test_value, expected_default_value) @@ -125,6 +126,7 @@ def get_test_options(): OptionStruct("string", "report", "--config-file", "-f", "baz", None, None), OptionStruct("string", "profile", "--triton-docker-shm-size", None, "1G", None, extra_commands=["--triton-launch-mode", "docker"]), OptionStruct("string", "profile","--run-config-search-mode", None, ["quick", "brute"], "brute", "SHOULD_FAIL"), + OptionStruct("string", "profile", "--model-type", None, ["generic", "LLM"], "generic", None), #List Options: # Options format: @@ -163,6 +165,7 @@ def get_test_options(): OptionStruct("noop", "yaml_profile", "weighting"), OptionStruct("noop", "yaml_profile", "triton_server_flags"), OptionStruct("noop", "yaml_profile", "perf_analyzer_flags"), + OptionStruct("noop", "yaml_profile", "genai_perf_flags"), OptionStruct("noop", "yaml_profile", "triton_docker_labels"), OptionStruct("noop", "yaml_profile", "triton_server_environment"), OptionStruct("noop", "yaml_profile", "triton_docker_args"), diff --git a/tests/test_config.py b/tests/test_config.py index f056eb76a..190075dea 100755 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -24,6 +24,9 @@ from model_analyzer.cli.cli import CLI from model_analyzer.config.input.config_command_profile import ConfigCommandProfile from model_analyzer.config.input.config_command_report import ConfigCommandReport +from model_analyzer.config.input.config_defaults import ( + DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS, +) from model_analyzer.config.input.config_enum import ConfigEnum from model_analyzer.config.input.config_list_generic import ConfigListGeneric from model_analyzer.config.input.config_list_numeric import ConfigListNumeric @@ -2356,6 +2359,28 @@ def test_mixing_request_rate_and_concurrency(self): with self.assertRaises(TritonModelAnalyzerException): self._evaluate_config(args, yaml_content, subcommand="profile") + def test_model_type_llm(self): + """ + Test that model type of LLM chooses the correct inference outputs + """ + args = [ + "model-analyzer", + "profile", + "--model-repository", + "cli-repository", + "--profile-models", + "modelA", + "--model-type", + "LLM", + ] + yaml_content = "" + + config = self._evaluate_config(args, yaml_content) + + self.assertEqual( + config.inference_output_fields, DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS + ) + def _test_request_rate_config_conflicts( self, base_args: List[Any], yaml_content: str ) -> None: diff --git a/tests/test_perf_analyzer.py b/tests/test_perf_analyzer.py index e95f0d4a1..0b57701b8 100755 --- a/tests/test_perf_analyzer.py +++ b/tests/test_perf_analyzer.py @@ -33,6 +33,16 @@ from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory from model_analyzer.record.types.gpu_utilization import GPUUtilization +from model_analyzer.record.types.inter_token_latency_avg import InterTokenLatencyAvg +from model_analyzer.record.types.inter_token_latency_max import InterTokenLatencyMax +from model_analyzer.record.types.inter_token_latency_min import 
InterTokenLatencyMin +from model_analyzer.record.types.inter_token_latency_p25 import InterTokenLatencyP25 +from model_analyzer.record.types.inter_token_latency_p50 import InterTokenLatencyP50 +from model_analyzer.record.types.inter_token_latency_p75 import InterTokenLatencyP75 +from model_analyzer.record.types.inter_token_latency_p90 import InterTokenLatencyP90 +from model_analyzer.record.types.inter_token_latency_p95 import InterTokenLatencyP95 +from model_analyzer.record.types.inter_token_latency_p99 import InterTokenLatencyP99 +from model_analyzer.record.types.output_token_throughput import OutputTokenThroughput from model_analyzer.record.types.perf_client_response_wait import PerfClientResponseWait from model_analyzer.record.types.perf_client_send_recv import PerfClientSendRecv from model_analyzer.record.types.perf_latency_avg import PerfLatencyAvg @@ -46,6 +56,15 @@ ) from model_analyzer.record.types.perf_server_queue import PerfServerQueue from model_analyzer.record.types.perf_throughput import PerfThroughput +from model_analyzer.record.types.time_to_first_token_avg import TimeToFirstTokenAvg +from model_analyzer.record.types.time_to_first_token_max import TimeToFirstTokenMax +from model_analyzer.record.types.time_to_first_token_min import TimeToFirstTokenMin +from model_analyzer.record.types.time_to_first_token_p25 import TimeToFirstTokenP25 +from model_analyzer.record.types.time_to_first_token_p50 import TimeToFirstTokenP50 +from model_analyzer.record.types.time_to_first_token_p75 import TimeToFirstTokenP75 +from model_analyzer.record.types.time_to_first_token_p90 import TimeToFirstTokenP90 +from model_analyzer.record.types.time_to_first_token_p95 import TimeToFirstTokenP95 +from model_analyzer.record.types.time_to_first_token_p99 import TimeToFirstTokenP99 from model_analyzer.triton.client.client_factory import TritonClientFactory from model_analyzer.triton.server.server_config import TritonServerConfig from model_analyzer.triton.server.server_factory import TritonServerFactory @@ -248,7 +267,10 @@ def test_perf_analyzer_ssl_args(self): ) self.assertEqual(self.config.to_cli_string(), expected_cli_str) - def test_run(self): + def test_pa_csv_output(self): + """ + Tests the ability to read PA's CSV output + """ server_config = TritonServerConfig() server_config["model-repository"] = MODEL_REPOSITORY_PATH @@ -287,122 +309,40 @@ def test_run(self): self.assertEqual(len(records[TEST_MODEL_NAME]), 1) self.assertEqual(records[TEST_MODEL_NAME][0].value(), 5) - # Test p90 latency parsing - perf_metrics = [PerfLatencyP90] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 4.7) - - # Test p95 latency parsing - perf_metrics = [PerfLatencyP95] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 4.8) - - # Test p99 latency parsing - perf_metrics = [PerfLatencyP99] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), 
patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 4.9) + # Test latency parsing + self._test_metrics_from_csv(perf_analyzer, pa_csv_mock, [PerfLatencyP90], [4.7]) + self._test_metrics_from_csv(perf_analyzer, pa_csv_mock, [PerfLatencyP95], [4.8]) + self._test_metrics_from_csv(perf_analyzer, pa_csv_mock, [PerfLatencyP99], [4.9]) # Test throughput parsing - perf_metrics = [PerfThroughput] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 46.8) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfThroughput], [46.8] + ) # Test client response wait - perf_metrics = [PerfClientResponseWait] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.314) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfClientResponseWait], [0.314] + ) # Test server queue - perf_metrics = [PerfServerQueue] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.018) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerQueue], [0.018] + ) # Test server compute infer - perf_metrics = [PerfServerComputeInfer] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.065) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerComputeInfer], [0.065] + ) # Test server compute input - perf_metrics = [PerfServerComputeInput] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) - - records = perf_analyzer.get_perf_records() - self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.034) - - # Test server compute infer - perf_metrics = [PerfServerComputeOutput] - - with patch( - "model_analyzer.perf_analyzer.perf_analyzer.open", - mock_open(read_data=pa_csv_mock), - ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): - perf_analyzer.run(perf_metrics) + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerComputeInput], [0.034] + ) - records = perf_analyzer.get_perf_records() - 
self.assertEqual(len(records[TEST_MODEL_NAME]), 1) - self.assertEqual(records[TEST_MODEL_NAME][0].value(), 0.016) + # Test server compute output + self._test_metrics_from_csv( + perf_analyzer, pa_csv_mock, [PerfServerComputeOutput], [0.016] + ) # Test Avg GPU Utilizations. Perf metric is ignored for get_gpu_records() gpu_metrics = [GPUUtilization, PerfLatencyAvg] @@ -544,6 +484,206 @@ def test_run(self): self.assertTrue(perf_analyzer.run(perf_metrics)) self.server.stop() + def test_pa_llm_csv_output(self): + """ + Tests the ability to read PA's LLM CSV output + """ + server_config = TritonServerConfig() + server_config["model-repository"] = MODEL_REPOSITORY_PATH + + # Create server, client, PerfAnalyzer, and wait for server ready + self.server = TritonServerFactory.create_server_local( + path=TRITON_LOCAL_BIN_PATH, config=server_config, gpus=self.gpus + ) + + perf_analyzer = PerfAnalyzer( + path=PERF_BIN_PATH, + config=self.run_config, + max_retries=10, + timeout=100, + max_cpu_util=50, + model_type="LLM", + ) + self.client = TritonClientFactory.create_grpc_client(server_url=TEST_GRPC_URL) + self.server.start() + self.client.wait_for_server_ready(num_retries=1) + + pa_llm_csv_mock = """Metric,avg,min,max,p99,p95,p90,p75,p50,p25\n""" + pa_llm_csv_mock += """Time To First Token (ns),4238735,3367978,6702240,6371118,5344958,5006259,4841394,4146648,3484484\n""" + pa_llm_csv_mock += """Inter Token Latency (ns),27202264,3849435,138324924,28283424,27737593,27469154,27067290,26979956,26926962\n""" + pa_llm_csv_mock += """Request Latency (ns),3363927003,3367978,14238834483,14091273510,13740917508,13692672723,3752510140,4846258,3612270\n""" + pa_llm_csv_mock += """Num Output Token,126,0,584,562,509,505,135,0,0\n""" + pa_llm_csv_mock += """\n""" + pa_llm_csv_mock += """Metric,Value\n""" + pa_llm_csv_mock += """Output Token Throughput (per sec),36.37\n""" + pa_llm_csv_mock += """Request Throughput (per sec),0.29""" + + # Test all Time to first token values + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenAvg], + [4238.735], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenMin], + [3367.978], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenMax], + [6702.240], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP99], + [6371.118], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP95], + [5344.958], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP90], + [5006.259], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP75], + [4841.394], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP50], + [4146.648], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP25], + [3484.484], + is_llm=True, + ) + + # Test all Inter token latency values + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyAvg], + [27202.264], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyMin], + [3849.435], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyMax], + [138324.924], + is_llm=True, + ) + 
self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP99], + [28283.424], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP95], + [27737.593], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP90], + [27469.154], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP75], + [27067.290], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP50], + [26979.956], + is_llm=True, + ) + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [InterTokenLatencyP25], + [26926.962], + is_llm=True, + ) + + # Test output token throughput + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [OutputTokenThroughput], + [36.37], + is_llm=True, + ) + + # Test combination + self._test_metrics_from_csv( + perf_analyzer, + pa_llm_csv_mock, + [TimeToFirstTokenP90, InterTokenLatencyP50, OutputTokenThroughput], + [5006.259, 26979.956, 36.37], + is_llm=True, + ) + + def _test_metrics_from_csv( + self, perf_analyzer, read_data, metrics, expected_values, is_llm=False + ): + with patch( + "model_analyzer.perf_analyzer.perf_analyzer.open", + mock_open(read_data=read_data), + ), patch("model_analyzer.perf_analyzer.perf_analyzer.os.remove"): + perf_analyzer.run(metrics) + + if is_llm: + records = perf_analyzer.get_llm_records() + else: + records = perf_analyzer.get_perf_records() + + self.assertEqual(len(records[TEST_MODEL_NAME]), len(expected_values)) + for i, expected_value in enumerate(expected_values): + self.assertEqual(records[TEST_MODEL_NAME][i].value(), expected_value) + def test_measurement_interval_increase(self): server_config = TritonServerConfig() server_config["model-repository"] = MODEL_REPOSITORY_PATH diff --git a/tests/test_record_types.py b/tests/test_record_types.py index 4bd6d8b32..1279e06df 100755 --- a/tests/test_record_types.py +++ b/tests/test_record_types.py @@ -49,6 +49,24 @@ def setUp(self): "perf_latency_p90", "perf_latency_p95", "perf_latency_p99", + "inter_token_latency_min", + "inter_token_latency_max", + "inter_token_latency_avg", + "inter_token_latency_p25", + "inter_token_latency_p50", + "inter_token_latency_p75", + "inter_token_latency_p90", + "inter_token_latency_p95", + "inter_token_latency_p99", + "time_to_first_token_min", + "time_to_first_token_max", + "time_to_first_token_avg", + "time_to_first_token_p25", + "time_to_first_token_p50", + "time_to_first_token_p75", + "time_to_first_token_p90", + "time_to_first_token_p95", + "time_to_first_token_p99", "gpu_used_memory", "cpu_used_ram", "perf_server_compute_infer", @@ -65,6 +83,7 @@ def setUp(self): record_types[k] for k in [ "perf_throughput", + "output_token_throughput", "gpu_free_memory", "gpu_utilization", "cpu_available_ram",
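The new time-to-first-token and inter-token-latency record classes added above all inherit the inverted comparison and reverse-subtraction behaviour of `DecreasingRecord` (via `TimeToFirstTokenBase` / `PerfLatencyBase`), which is what lets lower latencies rank as "better" when results are sorted or aggregated. Below is a minimal sketch of that behaviour, assuming the `model_analyzer` package built from this change set is importable; the values are illustrative and not taken from the PR's tests.

```python
# Sketch only: demonstrates the DecreasingRecord semantics of the new
# LLM latency records introduced in this change set.
from model_analyzer.record.types.time_to_first_token_p99 import TimeToFirstTokenP99

fast = TimeToFirstTokenP99(value=3500.0)  # lower time to first token (better)
slow = TimeToFirstTokenP99(value=6400.0)  # higher time to first token (worse)

# For DecreasingRecord subclasses, __lt__ compares with ">", so the record
# with the larger latency value sorts as "less than" the faster one.
assert slow < fast

# Subtraction is reversed for the same reason: fast - slow returns a new
# record holding other.value() - self.value(), i.e. a positive improvement.
improvement = fast - slow
assert improvement.value() == 2900.0

# The header() classmethod supplies the column title used in reports.
print(TimeToFirstTokenP99.header())  # "p99 Time To First Token (ms)"
```

The same ordering rules apply to every `inter_token_latency_*` and `time_to_first_token_*` tag registered in `tests/test_record_types.py`, while `output_token_throughput` joins the increasing-record group alongside `perf_throughput`.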