
Llm testing live run #778

Merged 12 commits on Oct 19, 2023
5 changes: 4 additions & 1 deletion model_analyzer/analyzer.py
@@ -137,7 +137,10 @@ def profile(
if not self._config.skip_summary_reports:
self._create_summary_tables(verbose)
self._create_summary_reports(mode)
self._create_detailed_reports(mode)

# FIXME: need to figure out detailed reporting for LLMs
if not self._config.is_llm_model():
self._create_detailed_reports(mode)

self._check_for_perf_analyzer_errors()

@@ -88,7 +88,7 @@ def __init__(

self._reset_max_batch_size()

if not self._early_exit_enable:
if not self._early_exit_enable and not self._config.is_llm_model():
raise TritonModelAnalyzerException(
"Early exit disable is not supported in automatic model config generator"
)
@@ -116,7 +116,11 @@ def _create_brute_run_config_generator(self) -> BruteRunConfigGenerator:

def _can_binary_search_top_results(self) -> bool:
for model in self._models:
if model.parameters()["concurrency"] or model.parameters()["request_rate"]:
if (
model.parameters()["concurrency"]
or model.parameters()["request_rate"]
or self._config.is_llm_model()
):
return False

return True
@@ -80,7 +80,7 @@ def __init__(
self._curr_results: List = [[] for n in range(self._num_models)]
self._curr_generators: Dict[int, ConfigGeneratorInterface] = {}

self._skip_default_config = skip_default_config
self._skip_default_config = skip_default_config or config.is_llm_model()

def set_last_results(
self, measurements: List[Optional[RunConfigMeasurement]]
55 changes: 48 additions & 7 deletions model_analyzer/config/generate/perf_analyzer_config_generator.py
@@ -24,6 +24,7 @@
DEFAULT_INPUT_JSON_PATH,
DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD,
DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE,
DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY,
@@ -115,13 +116,14 @@ def __init__(
self._batch_sizes = sorted(model_parameters["batch_sizes"])
self._text_input_lengths = self._create_text_input_length_list()
self._max_token_counts = self._create_max_token_count_list()
self._request_periods = self._create_request_period_list()

self._perf_config_parameter_values = self._create_parameter_perf_config_values()
self._parameter_count = len(
utils.generate_parameter_combinations(self._perf_config_parameter_values)
)

self._input_json_filename = DEFAULT_INPUT_JSON_PATH + "/input-data.json"
self._input_json_base_filename = DEFAULT_INPUT_JSON_PATH + "/input-data-"

self._generate_perf_configs()

@@ -321,6 +323,20 @@ def _create_max_token_count_list(self) -> List[int]:
self._cli_config.run_config_search_max_max_token_count,
)

def _create_request_period_list(self) -> List[int]:
if not self._cli_config.is_llm_model():
return []

if self._model_parameters["request_period"]:
return sorted(self._model_parameters["request_period"])
elif self._cli_config.run_config_search_disable:
return [DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD]
else:
return utils.generate_doubled_list(
self._cli_config.run_config_search_min_request_period,
self._cli_config.run_config_search_max_request_period,
)
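
For readers unfamiliar with the search-range helpers, here is a minimal sketch of the doubling behavior assumed of `utils.generate_doubled_list` (this is not the repository's implementation), showing the request-period values that would result from the CLI min/max settings:

```python
# Sketch only: assumes the helper doubles from the minimum up to the maximum.
def generate_doubled_list(min_value: int, max_value: int) -> list:
    values = []
    current = min_value
    while current <= max_value:
        values.append(current)
        current *= 2
    return values

# e.g. --run-config-search-min-request-period 1 and
# --run-config-search-max-request-period 8 would sweep [1, 2, 4, 8].
print(generate_doubled_list(1, 8))
```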

def _generate_perf_configs(self) -> None:
parameter_combinations = utils.generate_parameter_combinations(
self._perf_config_parameter_values
@@ -377,8 +393,23 @@ def _extract_text_input_length(
def _update_perf_config_based_on_parameter_combination(
self, perf_config: PerfAnalyzerConfig, parameter_combination: Dict
) -> None:
if "request-parameter" in parameter_combination:
request_parameter = parameter_combination["request-parameter"]
max_tokens = self._extract_max_tokens_from_request_parameter(
request_parameter
)
parameter_combination["request-period"] = (
max_tokens
if max_tokens < parameter_combination["request-period"]
else parameter_combination["request-period"]
)

perf_config.update_config(parameter_combination)

def _extract_max_tokens_from_request_parameter(self, request_parameter: str) -> int:
_, max_tokens, _ = request_parameter.split(":")
return int(max_tokens)
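
To illustrate the two methods above, a small self-contained sketch of how a `request-parameter` string of the form `name:value:type` is parsed and used to cap the request period (the concrete values are made up for the example):

```python
def extract_max_tokens(request_parameter: str) -> int:
    # "max_tokens:256:int" -> 256
    _, max_tokens, _ = request_parameter.split(":")
    return int(max_tokens)

combination = {"request-parameter": "max_tokens:256:int", "request-period": 512}
max_tokens = extract_max_tokens(combination["request-parameter"])

# The request period is never allowed to exceed the number of generated tokens.
combination["request-period"] = min(max_tokens, combination["request-period"])
assert combination["request-period"] == 256
```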

def _update_perf_config_based_on_perf_analyzer_flags(
self, perf_config: PerfAnalyzerConfig
) -> None:
@@ -389,6 +420,7 @@ def _update_perf_config_based_on_inference_load(
) -> None:
if self._cli_config.is_llm_model():
perf_config.update_config({"periodic-concurrency-range": inference_load})
perf_config.update_config({"streaming": "True"})
elif self._cli_config.is_request_rate_specified(self._model_parameters):
perf_config.update_config({"request-rate-range": inference_load})
else:
@@ -400,21 +432,29 @@ def _update_perf_config_for_llm_model(
if not self._cli_config.is_llm_model():
return

input_json_filename = (
self._input_json_base_filename + f"{text_input_length}.json"
)
modified_input_dict = self._modify_text_in_input_dict(text_input_length)
self._write_modified_input_dict_to_file(modified_input_dict)
self._write_modified_input_dict_to_file(
modified_input_dict, input_json_filename
)

perf_config.update_config({"input-data": self._input_json_filename})
perf_config.update_config({"input-data": input_json_filename})

def _modify_text_in_input_dict(self, text_input_length: int) -> Dict:
modified_text = " ".join(repeat("Hello", text_input_length))

modified_input_dict = {k: v for k, v in self._llm_input_dict.items()}
modified_input_dict["data"][0]["text-input"] = modified_text
# FIXME: this needs to be updated once tritonserver/PA are updated TMA-1414
modified_input_dict["data"][0]["PROMPT"] = [modified_text]

return modified_input_dict

def _write_modified_input_dict_to_file(self, modified_input_dict: Dict) -> None:
with open(self._input_json_filename, "w") as f:
def _write_modified_input_dict_to_file(
self, modified_input_dict: Dict, input_json_filename: str
) -> None:
with open(input_json_filename, "w") as f:
json.dump(modified_input_dict, f)
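
As a rough illustration of the per-length input files written above, a file such as `input-data-3.json` would carry the prompt repeated to the requested text input length; the `PROMPT`/`text-input` field names mirror the FIXME'd schema in this diff and may change once tritonserver/PA are updated:

```python
import json
from itertools import repeat

text_input_length = 3
modified_text = " ".join(repeat("Hello", text_input_length))

# Hypothetical contents of input-data-3.json, following the fields used above.
example_input = {"data": [{"text-input": modified_text, "PROMPT": [modified_text]}]}

with open(f"input-data-{text_input_length}.json", "w") as f:
    json.dump(example_input, f)
```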

def _create_parameter_perf_config_values(self) -> dict:
@@ -424,8 +464,9 @@ def _create_parameter_perf_config_values(self) -> dict:

if self._cli_config.is_llm_model():
perf_config_values["request-parameter"] = [
"max_token:" + str(mtc) + ":int" for mtc in self._max_token_counts
f"max_tokens:{str(mtc)}:int" for mtc in self._max_token_counts
]
perf_config_values["request-period"] = self._request_periods
perf_config_values["text-input-length"] = self._text_input_lengths

return perf_config_values
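
A hedged sketch of the dictionary this method returns in LLM search mode; the keys follow the diff above, while the key for batch sizes and the concrete lists are illustrative assumptions that depend on the configured search ranges:

```python
# Illustrative shape of the swept perf_analyzer parameter values for an LLM model.
perf_config_values = {
    "batch-size": [1],
    "request-parameter": ["max_tokens:1:int", "max_tokens:2:int"],
    "request-period": [1, 2],
    "text-input-length": [1, 2],
}
# generate_parameter_combinations would then take the cross product of these
# lists, producing one perf_analyzer configuration per combination.
```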
14 changes: 11 additions & 3 deletions model_analyzer/perf_analyzer/perf_analyzer.py
@@ -99,7 +99,7 @@ class PerfAnalyzer:
]

llm_metric_table = [
["avg_first_latency", None, AvgFirstTokenLatency, "1000"],
["avg_first_token_latency", None, AvgFirstTokenLatency, "1000"],
["avg_token_to_token_latency", None, AvgTokenToTokenLatency, "1000"]
]
# yapf: enable
@@ -285,6 +285,14 @@ def _get_single_model_cmd(self, index):
if self._is_multi_model():
cmd += ["--enable-mpi"]
cmd += self._get_pa_cli_command(index).replace("=", " ").split()

# OPTME: There should be a more elegant way of determining how to add EOS
# We have to do it here because we use a dictionary to create the PA command
# and it already contains `--request-parameter`
if "--periodic-concurrency-range" in cmd:
cmd.append("--request-parameter")
cmd.append("ignore_eos:true:bool")

return cmd
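
For context, a small sketch of the resulting command assembly for an LLM (periodic-concurrency) run; the model name and flag values here are invented for illustration and are not taken from the PR:

```python
# Illustrative only: "my_llm_model" and the flag values are made up.
cmd = [
    "perf_analyzer", "-m", "my_llm_model",
    "--periodic-concurrency-range", "16:32:4",
    "--request-parameter", "max_tokens:256:int",
]

# Mirrors the EOS handling above: ignore_eos is appended whenever a
# periodic-concurrency sweep is present in the assembled command.
if "--periodic-concurrency-range" in cmd:
    cmd += ["--request-parameter", "ignore_eos:true:bool"]

print(" ".join(cmd))
```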

def _get_pa_cli_command(self, index):
@@ -539,7 +547,7 @@ def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float:
request["response_timestamps"][0] - request["timestamp"]
)

avg_first_token_latency = mean(total_first_token_latencies)
avg_first_token_latency = float(mean(total_first_token_latencies))

return avg_first_token_latency

@@ -554,7 +562,7 @@ def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float:

token_to_token_latencies.append(mean(response_to_response_latencies))

avg_token_to_token_latency = mean(token_to_token_latencies)
avg_token_to_token_latency = float(mean(token_to_token_latencies))

return avg_token_to_token_latency
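
To make the two LLM latency metrics concrete, a toy example of the same arithmetic on hand-written timestamps (nanoseconds); only the per-request fields referenced in the diff are modeled, not the full profile-export structure:

```python
from statistics import mean

# One request with three streamed responses, timestamps in nanoseconds.
requests = [
    {"timestamp": 0, "response_timestamps": [5_000_000, 9_000_000, 12_000_000]},
]

first_token_latencies = [
    r["response_timestamps"][0] - r["timestamp"] for r in requests
]
avg_first_token_latency = float(mean(first_token_latencies))  # 5000000.0

token_to_token_latencies = []
for r in requests:
    ts = r["response_timestamps"]
    token_to_token_latencies.append(mean(b - a for a, b in zip(ts, ts[1:])))
avg_token_to_token_latency = float(mean(token_to_token_latencies))  # 3500000.0
```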

5 changes: 4 additions & 1 deletion model_analyzer/record/metrics_manager.py
@@ -572,8 +572,11 @@ def _run_perf_analyzer(
self._handle_unsuccessful_perf_analyzer_run(perf_analyzer)
return (None, None)

# FIXME: PA does not return a latency report file if an export report file is specified
perf_records = (
perf_analyzer.get_perf_records() + perf_analyzer.get_llm_records()
perf_analyzer.get_llm_records()
if self._config.is_llm_model()
else perf_analyzer.get_perf_records()
)
gpu_records = perf_analyzer.get_gpu_records()

2 changes: 1 addition & 1 deletion model_analyzer/record/record.py
@@ -101,7 +101,7 @@ def __init__(self, value, timestamp):
Parameters
----------
value : float or int
The value of the GPU metrtic
The value of the GPU metric
timestamp : int
The timestamp for the record in nanoseconds
"""
22 changes: 18 additions & 4 deletions tests/common/test_utils.py
@@ -29,6 +29,8 @@
DEFAULT_OUTPUT_MODEL_REPOSITORY,
DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD,
DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY,
DEFAULT_TRITON_GRPC_ENDPOINT,
DEFAULT_TRITON_HTTP_ENDPOINT,
@@ -244,6 +246,8 @@ def construct_perf_analyzer_config(
periodic_concurrency=DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY,
request_rate=None,
max_token_count=DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
text_input_length=DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
request_period=DEFAULT_RUN_CONFIG_MIN_REQUEST_PERIOD,
launch_mode=DEFAULT_TRITON_LAUNCH_MODE,
client_protocol=DEFAULT_CLIENT_PROTOCOL,
perf_analyzer_flags=None,
@@ -266,6 +270,12 @@ def construct_perf_analyzer_config(
The concurrency value for this PA configuration
periodic_concurrency: list
The periodic concurrency value for this PA configuration
max_token_count: int
The max token count for this PA configuration
text_input_length: int
The text input length for this PA configuration
request_period: int
The request period for this PA configuration
request_rate: int
The request rate value for this PA configuration
launch_mode: str
@@ -299,10 +309,14 @@ def construct_perf_analyzer_config(
pa_config._args["concurrency-range"] = concurrency

if llm_search_mode:
pa_config._args["request-parameter"] = (
"max_token:" + str(max_token_count) + ":int"
)
pa_config._args["input-data"] = DEFAULT_INPUT_JSON_PATH + "/input-data.json"
pa_config._args["request-parameter"] = f"max_tokens:{str(max_token_count)}:int"

pa_config._args["request-period"] = request_period
pa_config._args[
"input-data"
] = f"{DEFAULT_INPUT_JSON_PATH}/input-data-{str(text_input_length)}.json"

pa_config._args["streaming"] = "True"

pa_config._args["measurement-mode"] = DEFAULT_MEASUREMENT_MODE

14 changes: 12 additions & 2 deletions tests/test_perf_analyzer_config_generator.py
@@ -596,6 +596,8 @@ def test_llm_search_max_token_count(self):
"32",
"--run-config-search-max-text-input-length",
"1",
"--run-config-search-max-request-period",
"1",
]
self._run_and_test_perf_analyzer_config_generator(
yaml_str, expected_configs, pa_cli_args
@@ -622,11 +624,13 @@ def test_llm_search_text_input_length(self):
periodic_concurrencies = ["16:32:4", "16:32:8", "16:32:16"]

expected_configs = []
for _ in text_input_lengths:
for til in text_input_lengths:
for pc in periodic_concurrencies:
expected_configs.append(
construct_perf_analyzer_config(
llm_search_mode=True, periodic_concurrency=pc
llm_search_mode=True,
periodic_concurrency=pc,
text_input_length=til,
)
)

@@ -636,6 +640,8 @@
"32",
"--run-config-search-max-max-token-count",
"1",
"--run-config-search-max-request-period",
"1",
]
self._run_and_test_perf_analyzer_config_generator(
yaml_str, expected_configs, pa_cli_args
@@ -673,6 +679,8 @@ def test_periodic_concurrency_parameter(self):
"1",
"--run-config-search-max-text-input-length",
"1",
"--run-config-search-max-request-period",
"1",
]
self._run_and_test_perf_analyzer_config_generator(
yaml_str, expected_configs, pa_cli_args
@@ -722,6 +730,8 @@ def test_periodic_concurrency_search(self):
"64",
"--run-config-search-min-periodic-concurrency-step",
"8",
"--run-config-search-max-request-period",
"1",
]
self._run_and_test_perf_analyzer_config_generator(
yaml_str, expected_configs, pa_cli_args