Profiling model using genai-perf (#849)
* Initial changes to run genai-perf in MA

* Gating call to get LLM records

* Fixing capitalization issue

* Removing debug

* Adding TODO

---------

Co-authored-by: root <[email protected]>
nv-braf and root committed Apr 8, 2024
1 parent 6063f17 commit db55ca4
Showing 4 changed files with 96 additions and 32 deletions.
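In short, the commit routes profiling through genai-perf whenever the model type is LLM: Model Analyzer builds a genai-perf command, forwards the remaining perf_analyzer flags after a "--" separator, and later reads genai-perf's CSV output back into its own record classes. A minimal sketch of the resulting invocation, where the model name and trailing flags are illustrative assumptions rather than values from this diff:

# Sketch of the command the new LLM branch assembles; "gpt2" and the
# trailing perf_analyzer flags are hypothetical example values.
model_name = "gpt2"
pa_flags = "--concurrency-range 1 --measurement-interval=5000"

cmd = ["genai-perf", "-m", model_name, "--streaming", "--"]
cmd += pa_flags.replace("=", " ").split()  # same normalization the diff applies

print(" ".join(cmd))
# genai-perf -m gpt2 --streaming -- --concurrency-range 1 --measurement-interval 5000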
65 changes: 42 additions & 23 deletions model_analyzer/perf_analyzer/perf_analyzer.py
@@ -115,15 +115,15 @@ class PerfAnalyzer:
]

llm_metric_table = [
["time_to_first_token_avg", "Time to First Token (ns) avg", TimeToFirstTokenAvg, "1000"],
["time_to_first_token_min", "Time to First Token (ns) min", TimeToFirstTokenMin, "1000"],
["time_to_first_token_max", "Time to First Token (ns) max", TimeToFirstTokenMax, "1000"],
["time_to_first_token_p99", "Time to First Token (ns) p99", TimeToFirstTokenP99, "1000"],
["time_to_first_token_p95", "Time to First Token (ns) p95", TimeToFirstTokenP95, "1000"],
["time_to_first_token_p90", "Time to First Token (ns) p90", TimeToFirstTokenP90, "1000"],
["time_to_first_token_p75", "Time to First Token (ns) p75", TimeToFirstTokenP75, "1000"],
["time_to_first_token_p50", "Time to First Token (ns) p50", TimeToFirstTokenP50, "1000"],
["time_to_first_token_p25", "Time to First Token (ns) p25", TimeToFirstTokenP25, "1000"],
["time_to_first_token_avg", "Time To First Token (ns) avg", TimeToFirstTokenAvg, "1000"],
["time_to_first_token_min", "Time To First Token (ns) min", TimeToFirstTokenMin, "1000"],
["time_to_first_token_max", "Time To First Token (ns) max", TimeToFirstTokenMax, "1000"],
["time_to_first_token_p99", "Time To First Token (ns) p99", TimeToFirstTokenP99, "1000"],
["time_to_first_token_p95", "Time To First Token (ns) p95", TimeToFirstTokenP95, "1000"],
["time_to_first_token_p90", "Time To First Token (ns) p90", TimeToFirstTokenP90, "1000"],
["time_to_first_token_p75", "Time To First Token (ns) p75", TimeToFirstTokenP75, "1000"],
["time_to_first_token_p50", "Time To First Token (ns) p50", TimeToFirstTokenP50, "1000"],
["time_to_first_token_p25", "Time To First Token (ns) p25", TimeToFirstTokenP25, "1000"],
["inter_token_latency_avg", "Inter Token Latency (ns) avg", InterTokenLatencyAvg, "1000"],
["inter_token_latency_min", "Inter Token Latency (ns) min", InterTokenLatencyMin, "1000"],
["inter_token_latency_max", "Inter Token Latency (ns) max", InterTokenLatencyMax, "1000"],
@@ -323,14 +323,34 @@ def _get_cmd(self):
return cmd

def _get_single_model_cmd(self, index):
- cmd = [self.bin_path]
- if self._is_multi_model():
-     cmd += ["--enable-mpi"]
- cmd += self._get_pa_cli_command(index).replace("=", " ").split()
+ # TODO: TMA-1771 - hook up the user defined CLI options
+ if self._model_type == "LLM":
+     cmd = [
+         "genai-perf",
+         "-m",
+         self._config.models_name(),
+         "--streaming",
+         "--",
+     ]
+     cmd += (
+         self._get_pa_cli_command(index, exclude_model_name=True)
+         .replace("=", " ")
+         .split()
+     )
+ else:
+     cmd = [self.bin_path]
+     if self._is_multi_model():
+         cmd += ["--enable-mpi"]
+     cmd += self._get_pa_cli_command(index).replace("=", " ").split()

return cmd

- def _get_pa_cli_command(self, index):
-     return self._config.model_run_configs()[index].perf_config().to_cli_string()
+ def _get_pa_cli_command(self, index, exclude_model_name=False):
+     return (
+         self._config.model_run_configs()[index]
+         .perf_config()
+         .to_cli_string(exclude_model_name)
+     )

def _create_env(self, env):
perf_analyzer_env = os.environ.copy()
@@ -552,16 +572,16 @@ def _parse_llm_outputs(self, metrics):

perf_config = self._config.model_run_configs()[0].perf_config()

logger.debug(f"Reading PA results from {GENAI_PERF_CSV}")
logger.debug(f"Reading GENAI-PERF results from {GENAI_PERF_CSV}")
with open(GENAI_PERF_CSV, mode="r") as f:
- csv_reader = csv.DictReader(f, delimiter=",")
+ csv_reader = list(csv.DictReader(f, delimiter=","))

# See test_perf_analyzer::test_pa_llm_csv_output() for CSV output example
self._llm_records[perf_config["model-name"]] = self._extract_llm_records(
metrics, csv_reader
)

- os.remove(f)
+ os.remove(GENAI_PERF_CSV)

def _extract_perf_records_from_row(
self, requested_metrics: List[Record], row_metrics: Dict[str, str]
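A small bug fix rides along in the cleanup above: the old code passed the open file object to os.remove(), which raises TypeError because os.remove() expects a path; the fixed line passes the GENAI_PERF_CSV path instead. A quick demonstration, using an illustrative file name since the constant's value is not shown in the diff:

import os

# File name is illustrative; the real path comes from GENAI_PERF_CSV.
with open("genai_perf.csv", "w") as f:
    f.write("Metric,avg\n")

# os.remove(f)  # TypeError: the file object is not a path
os.remove("genai_perf.csv")  # pass the path, as the fixed line does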
@@ -632,13 +652,14 @@ def _extract_llm_records(

for requested_metric in requested_metrics:
new_llm_record = self._get_llm_record_from_csv(requested_metric, csv_reader)
- llm_records.append(new_llm_record)
+ if new_llm_record:
+     llm_records.append(new_llm_record)

return llm_records

def _get_llm_record_from_csv(
self, requested_metric: Record, csv_reader: DictReader
- ) -> Record:
+ ) -> Optional[Record]:
for row in csv_reader:
for key, value in row.items():
metric_string = f"{row['Metric']} {key}"
@@ -655,9 +676,7 @@ def _get_llm_record_from_csv(
llm_record = llm_metric[PerfAnalyzer.RECORD_CLASS](adjusted_value) # type: ignore
return llm_record

- raise TritonModelAnalyzerException(
-     f"Did not find {requested_metric.tag} in genai-perf CSV file"
- )
+ return None

def _find_corresponding_llm_metric_row(self, metric_string: str) -> Optional[List]:
for row in PerfAnalyzer.llm_metric_table:
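With these changes the extraction path also tolerates metrics that are absent from the CSV: _get_llm_record_from_csv() returns None instead of raising TritonModelAnalyzerException, and _extract_llm_records() skips those entries. A condensed sketch of the matching loop; dividing by the scale factor is an assumption about the elided adjusted_value computation:

from typing import Optional

# Condensed sketch of _get_llm_record_from_csv(); rows come from
# list(csv.DictReader(...)) and table rows are [tag, label, cls, scale].
def get_llm_value(tag: str, rows: list, table: list) -> Optional[float]:
    for row in rows:
        for key, value in row.items():
            if key == "Metric":
                continue
            metric_string = f"{row['Metric']} {key}"  # e.g. "Time To First Token (ns) p50"
            for t, label, _cls, scale in table:
                if t == tag and label == metric_string:
                    return float(value) / float(scale)  # assumed scaling
    return None  # callers now skip missing metrics instead of raising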
11 changes: 8 additions & 3 deletions model_analyzer/perf_analyzer/perf_config.py
@@ -14,6 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+ from typing import List

from model_analyzer.config.input.config_defaults import DEFAULT_MEASUREMENT_MODE
from model_analyzer.constants import SECONDS_TO_MILLISECONDS_MULTIPLIER
from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
@@ -325,7 +327,7 @@ def remove_mrc_from_cli_string(cls, cli_string):

return " ".join(perf_str_tokens)

- def to_cli_string(self):
+ def to_cli_string(self, exclude_model_name: bool = False) -> str:
"""
Utility function to convert a config into a
string of arguments to the perf_analyzer with CLI.
@@ -340,19 +342,22 @@ def to_cli_string(self):

# single dashed options, then verbose flags, then main args
args = []
- args.extend(self._parse_short_options())
+ args.extend(self._parse_short_options(exclude_model_name))
args.extend(self._parse_verbose_options())
args.extend(self._parse_long_options())

return " ".join(args)

- def _parse_short_options(self):
+ def _parse_short_options(self, exclude_model_name: bool = False) -> List:
"""
Parse the perf analyzer single dash options
"""
temp_args = []
for key, value in self._options.items():
if value:
+ if exclude_model_name and key == "-m":
+     continue

if key in self._additive_args:
for additive_value in value:
temp_args.append(f"{key} {additive_value}")
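The exclude_model_name plumbing exists because the genai-perf path passes -m itself, so the flags forwarded after the "--" separator must not repeat the model name. A toy version of the filter, with illustrative option names and values:

# Toy version of the new filter in _parse_short_options(); the option
# dict contents are illustrative.
def parse_short_options(options, exclude_model_name=False):
    args = []
    for key, value in options.items():
        if value:
            if exclude_model_name and key == "-m":
                continue  # genai-perf already received the model name
            args.append(f"{key} {value}")
    return args

print(parse_short_options({"-m": "gpt2", "-b": 1}))
# ['-m gpt2', '-b 1']
print(parse_short_options({"-m": "gpt2", "-b": 1}, exclude_model_name=True))
# ['-b 1']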
50 changes: 45 additions & 5 deletions model_analyzer/record/metrics_manager.py
@@ -69,6 +69,25 @@ class MetricsManager:
"gpu_power_usage",
"cpu_available_ram",
"cpu_used_ram",
"time_to_first_token_avg",
"time_to_first_token_min",
"time_to_first_token_max",
"time_to_first_token_p99",
"time_to_first_token_p95",
"time_to_first_token_p90",
"time_to_first_token_p75",
"time_to_first_token_p50",
"time_to_first_token_p25",
"inter_token_latency_avg",
"inter_token_latency_min",
"inter_token_latency_max",
"inter_token_latency_p99",
"inter_token_latency_p95",
"inter_token_latency_p90",
"inter_token_latency_p75",
"inter_token_latency_p50",
"inter_token_latency_p25",
"output_token_throughput",
]

def __init__(self, config, client, server, gpus, result_manager, state_manager):
@@ -115,6 +134,7 @@ def __init__(self, config, client, server, gpus, result_manager, state_manager):
(
self._gpu_metrics,
self._perf_metrics,
+ self._llm_metrics,
self._cpu_metrics,
) = self._categorize_metrics(self.metrics, self._config.collect_cpu_metrics)
self._gpus = gpus
@@ -160,21 +180,23 @@ def _categorize_metrics(metric_tags, collect_cpu_metrics=False):
Returns
-------
- (list,list,list)
-     tuple of three lists (DCGM, PerfAnalyzer, CPU) metrics
+ (list,list,list,list)
+     tuple of four lists (DCGM, PerfAnalyzer, LLM, CPU) metrics
"""

- gpu_metrics, perf_metrics, cpu_metrics = [], [], []
+ gpu_metrics, perf_metrics, llm_metrics, cpu_metrics = [], [], [], []
# Separates metrics and objectives into related lists
for metric in MetricsManager.get_metric_types(metric_tags):
if metric in PerfAnalyzer.get_gpu_metrics():
gpu_metrics.append(metric)
elif metric in PerfAnalyzer.get_perf_metrics():
perf_metrics.append(metric)
+ elif metric in PerfAnalyzer.get_llm_metrics():
+     llm_metrics.append(metric)
elif collect_cpu_metrics and (metric in CPUMonitor.cpu_metrics):
cpu_metrics.append(metric)

- return gpu_metrics, perf_metrics, cpu_metrics
+ return gpu_metrics, perf_metrics, llm_metrics, cpu_metrics

def profile_server(self):
"""
@@ -589,9 +611,10 @@ def _run_perf_analyzer(
max_retries=self._config.perf_analyzer_max_auto_adjusts,
timeout=self._config.perf_analyzer_timeout,
max_cpu_util=self._config.perf_analyzer_cpu_util,
+ model_type=self._config.model_type,
)

- metrics_to_gather = self._perf_metrics + self._gpu_metrics
+ metrics_to_gather = self._perf_metrics + self._llm_metrics + self._gpu_metrics
status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env)

self._write_perf_analyzer_output(perf_output_writer, perf_analyzer)
@@ -601,6 +624,12 @@
return (None, None)

perf_records = perf_analyzer.get_perf_records()

+ if self._config.model_type == "LLM":
+     perf_records[run_config.models_name()].extend(
+         perf_analyzer.get_llm_records()[run_config.models_name()]
+     )

gpu_records = perf_analyzer.get_gpu_records()

aggregated_perf_records = self._aggregate_perf_records(perf_records)
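For LLM runs, the genai-perf records are merged into the regular perf records (keyed by model name) before aggregation, so downstream reporting needs no special casing. A sketch with placeholder values; strings stand in for Record objects and "gpt2" is an illustrative model name:

perf_records = {"gpt2": ["perf_throughput"]}
llm_records = {"gpt2": ["time_to_first_token_avg", "inter_token_latency_avg"]}

model_type = "LLM"  # would come from self._config.model_type
if model_type == "LLM":
    perf_records["gpt2"].extend(llm_records["gpt2"])

print(perf_records)
# {'gpt2': ['perf_throughput', 'time_to_first_token_avg', 'inter_token_latency_avg']}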
@@ -824,6 +853,17 @@ def is_perf_analyzer_metric(tag):
metric = MetricsManager.get_metric_types([tag])[0]
return metric in PerfAnalyzer.get_perf_metrics()

+ @staticmethod
+ def is_llm_metric(tag):
+     """
+     Returns
+     -------
+     True if the given tag is a supported LLM metric
+     False otherwise
+     """
+     metric = MetricsManager.get_metric_types([tag])[0]
+     return metric in PerfAnalyzer.get_llm_metrics()

@staticmethod
def is_cpu_metric(tag):
"""
2 changes: 1 addition & 1 deletion tests/test_perf_analyzer.py
@@ -509,7 +509,7 @@ def test_pa_llm_csv_output(self):
self.client.wait_for_server_ready(num_retries=1)

pa_llm_csv_mock = """Metric,avg,min,max,p99,p95,p90,p75,p50,p25\n"""
pa_llm_csv_mock += """Time to First Token (ns),4238735,3367978,6702240,6371118,5344958,5006259,4841394,4146648,3484484\n"""
pa_llm_csv_mock += """Time To First Token (ns),4238735,3367978,6702240,6371118,5344958,5006259,4841394,4146648,3484484\n"""
pa_llm_csv_mock += """Inter Token Latency (ns),27202264,3849435,138324924,28283424,27737593,27469154,27067290,26979956,26926962\n"""
pa_llm_csv_mock += """Request Latency (ns),3363927003,3367978,14238834483,14091273510,13740917508,13692672723,3752510140,4846258,3612270\n"""
pa_llm_csv_mock += """Num Output Token,126,0,584,562,509,505,135,0,0\n"""
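The mock shows the CSV shape genai-perf writes: one row per metric, one column per statistic, which is exactly what the f"{row['Metric']} {key}" matching in perf_analyzer.py expects. A self-contained check against the first mock row:

import csv
import io

mock = (
    "Metric,avg,min,max,p99,p95,p90,p75,p50,p25\n"
    "Time To First Token (ns),4238735,3367978,6702240,6371118,5344958,"
    "5006259,4841394,4146648,3484484\n"
)

row = list(csv.DictReader(io.StringIO(mock)))[0]
print(f"{row['Metric']} p50", "->", row["p50"])
# Time To First Token (ns) p50 -> 4146648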
