Llm testing live run #778

Merged: 12 commits, Oct 19, 2023
5 changes: 4 additions & 1 deletion model_analyzer/analyzer.py
@@ -137,7 +137,10 @@ def profile(
if not self._config.skip_summary_reports:
self._create_summary_tables(verbose)
self._create_summary_reports(mode)
self._create_detailed_reports(mode)

# FIXME: need to figure out detailed reporting for LLMs
if not self._config.is_llm_model():
self._create_detailed_reports(mode)

self._check_for_perf_analyzer_errors()

@@ -88,7 +88,7 @@ def __init__(

self._reset_max_batch_size()

if not self._early_exit_enable:
if not self._early_exit_enable and not self._config.is_llm_model():
raise TritonModelAnalyzerException(
"Early exit disable is not supported in automatic model config generator"
)
@@ -116,7 +116,11 @@ def _create_brute_run_config_generator(self) -> BruteRunConfigGenerator:

def _can_binary_search_top_results(self) -> bool:
for model in self._models:
if model.parameters()["concurrency"] or model.parameters()["request_rate"]:
if (
model.parameters()["concurrency"]
or model.parameters()["request_rate"]
or self._config.is_llm_model()
):
return False

return True
@@ -80,7 +80,7 @@ def __init__(
self._curr_results: List = [[] for n in range(self._num_models)]
self._curr_generators: Dict[int, ConfigGeneratorInterface] = {}

self._skip_default_config = skip_default_config
self._skip_default_config = skip_default_config or config.is_llm_model()

def set_last_results(
self, measurements: List[Optional[RunConfigMeasurement]]
38 changes: 31 additions & 7 deletions model_analyzer/config/generate/perf_analyzer_config_generator.py
@@ -121,7 +121,7 @@ def __init__(
utils.generate_parameter_combinations(self._perf_config_parameter_values)
)

self._input_json_filename = DEFAULT_INPUT_JSON_PATH + "/input-data.json"
self._input_json_base_filename = DEFAULT_INPUT_JSON_PATH + "/input-data-"

self._generate_perf_configs()

@@ -377,8 +377,24 @@ def _extract_text_input_length(
def _update_perf_config_based_on_parameter_combination(
self, perf_config: PerfAnalyzerConfig, parameter_combination: Dict
) -> None:
if "request-parameter" in parameter_combination:
request_parameter = parameter_combination["request-parameter"]
max_token = self._extract_max_token_from_request_parameter(
request_parameter
)
parameter_combination["request-period"] = (
max_token if max_token < 10 else 10
)

perf_config.update_config(parameter_combination)

def _extract_max_token_from_request_parameter(self, request_parameter: str) -> int:
max_token_start = request_parameter.find(":")
max_token_stop = request_parameter.find(":", max_token_start + 1)
max_token = int(request_parameter[max_token_start + 1 : max_token_stop])

return max_token
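
The helper above assumes perf_analyzer's name:value:type syntax for --request-parameter, so a string such as "max_tokens:256:int" yields 256, and the derived request-period is then capped at 10. A minimal standalone sketch of that behavior (the function names here are illustrative, not part of the diff):

def extract_max_token(request_parameter: str) -> int:
    # "max_tokens:256:int" -> 256; the value sits between the first and second ':'
    start = request_parameter.find(":")
    stop = request_parameter.find(":", start + 1)
    return int(request_parameter[start + 1 : stop])

def derive_request_period(max_token: int) -> int:
    # mirrors the cap above: never larger than 10, never larger than max_token
    return max_token if max_token < 10 else 10

assert extract_max_token("max_tokens:256:int") == 256
assert derive_request_period(4) == 4
assert derive_request_period(256) == 10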

def _update_perf_config_based_on_perf_analyzer_flags(
self, perf_config: PerfAnalyzerConfig
) -> None:
@@ -389,6 +405,7 @@ def _update_perf_config_based_on_inference_load(
) -> None:
if self._cli_config.is_llm_model():
perf_config.update_config({"periodic-concurrency-range": inference_load})
perf_config.update_config({"streaming": "True"})
elif self._cli_config.is_request_rate_specified(self._model_parameters):
perf_config.update_config({"request-rate-range": inference_load})
else:
@@ -400,21 +417,28 @@ def _update_perf_config_for_llm_model(
if not self._cli_config.is_llm_model():
return

input_json_filename = (
self._input_json_base_filename + f"{text_input_length}.json"
)
modified_input_dict = self._modify_text_in_input_dict(text_input_length)
self._write_modified_input_dict_to_file(modified_input_dict)
self._write_modified_input_dict_to_file(
modified_input_dict, input_json_filename
)

perf_config.update_config({"input-data": self._input_json_filename})
perf_config.update_config({"input-data": input_json_filename})

def _modify_text_in_input_dict(self, text_input_length: int) -> Dict:
modified_text = " ".join(repeat("Hello", text_input_length))

modified_input_dict = {k: v for k, v in self._llm_input_dict.items()}
modified_input_dict["data"][0]["text-input"] = modified_text
modified_input_dict["data"][0]["PROMPT"] = [modified_text]

return modified_input_dict

def _write_modified_input_dict_to_file(self, modified_input_dict: Dict) -> None:
with open(self._input_json_filename, "w") as f:
def _write_modified_input_dict_to_file(
self, modified_input_dict: Dict, input_json_filename: str
) -> None:
with open(input_json_filename, "w") as f:
json.dump(modified_input_dict, f)
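
Each text-input length now gets its own input file, named input-data-<length>.json under DEFAULT_INPUT_JSON_PATH. A standalone sketch of what one of those files ends up containing; only the "text-input" and "PROMPT" fields come from the code above, while the surrounding "data" skeleton is assumed:

import json
from itertools import repeat

def build_input_file(text_input_length: int, base: str = "./input-data-") -> str:
    modified_text = " ".join(repeat("Hello", text_input_length))
    # assumed skeleton; the real dict is loaded from the configured LLM input JSON
    input_dict = {"data": [{"text-input": "", "PROMPT": [""]}]}
    input_dict["data"][0]["text-input"] = modified_text
    input_dict["data"][0]["PROMPT"] = [modified_text]
    filename = f"{base}{text_input_length}.json"
    with open(filename, "w") as f:
        json.dump(input_dict, f)
    return filename

# build_input_file(3) writes ./input-data-3.json containing
# {"data": [{"text-input": "Hello Hello Hello", "PROMPT": ["Hello Hello Hello"]}]}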

def _create_parameter_perf_config_values(self) -> dict:
@@ -424,7 +448,7 @@ def _create_parameter_perf_config_values(self) -> dict:

if self._cli_config.is_llm_model():
perf_config_values["request-parameter"] = [
"max_token:" + str(mtc) + ":int" for mtc in self._max_token_counts
"max_tokens:" + str(mtc) + ":int" for mtc in self._max_token_counts
]
perf_config_values["text-input-length"] = self._text_input_lengths

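The request-parameter and text-input-length value lists above are expanded into individual PA configurations elsewhere in this generator (utils.generate_parameter_combinations, not shown in this diff). A rough cross-product sketch of the expected expansion, not that implementation:

from itertools import product

perf_config_values = {
    "request-parameter": ["max_tokens:1:int", "max_tokens:256:int"],
    "text-input-length": [1, 128],
}

combinations = [
    dict(zip(perf_config_values.keys(), values))
    for values in product(*perf_config_values.values())
]
# 4 combinations, e.g. {"request-parameter": "max_tokens:1:int", "text-input-length": 1}
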
14 changes: 11 additions & 3 deletions model_analyzer/perf_analyzer/perf_analyzer.py
@@ -99,7 +99,7 @@ class PerfAnalyzer:
]

llm_metric_table = [
["avg_first_latency", None, AvgFirstTokenLatency, "1000"],
["avg_first_token_latency", None, AvgFirstTokenLatency, "1000"],
["avg_token_to_token_latency", None, AvgTokenToTokenLatency, "1000"]
]
# yapf: enable
@@ -285,6 +285,14 @@ def _get_single_model_cmd(self, index):
if self._is_multi_model():
cmd += ["--enable-mpi"]
cmd += self._get_pa_cli_command(index).replace("=", " ").split()

# OPTME: There should be a more elegant way of determining how to add EOS
# We have to do it here because we use a dictionary to create the PA command
# and it already contains `--request-parameter`
if "--periodic-concurrency-range" in cmd:
cmd.append("--request-parameter")
cmd.append("ignore_eos:true:bool")

return cmd
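
Because the config dictionary used to build the PA command already holds a --request-parameter entry for max_tokens, the ignore_eos parameter is appended to the command list directly whenever --periodic-concurrency-range marks the run as an LLM run. A standalone sketch of that append and the kind of command it produces (the model name and values are illustrative):

def add_ignore_eos(cmd: list) -> list:
    # only LLM (periodic concurrency) runs get the extra request parameter
    if "--periodic-concurrency-range" in cmd:
        cmd += ["--request-parameter", "ignore_eos:true:bool"]
    return cmd

cmd = [
    "perf_analyzer", "-m", "my_llm",
    "--periodic-concurrency-range", "16:32:4",
    "--request-parameter", "max_tokens:256:int",
    "--streaming",
]
cmd = add_ignore_eos(cmd)
# cmd now ends with: --request-parameter ignore_eos:true:bool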

def _get_pa_cli_command(self, index):
@@ -539,7 +547,7 @@ def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float:
request["response_timestamps"][0] - request["timestamp"]
)

avg_first_token_latency = mean(total_first_token_latencies)
avg_first_token_latency = float(mean(total_first_token_latencies))

return avg_first_token_latency

@@ -554,7 +562,7 @@ def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float:

token_to_token_latencies.append(mean(response_to_response_latencies))

avg_token_to_token_latency = mean(token_to_token_latencies)
avg_token_to_token_latency = float(mean(token_to_token_latencies))

return avg_token_to_token_latency

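A worked example of the two LLM metrics computed above, using only the per-request fields visible in these hunks ("timestamp" and "response_timestamps"); how requests are pulled out of the PA export data is not shown here, so a plain list of request dicts stands in:

from statistics import mean

requests = [
    {"timestamp": 0, "response_timestamps": [5, 9, 15]},
    {"timestamp": 2, "response_timestamps": [9, 12, 18]},
]

# first-token latency: first response minus request start, averaged over requests
first_token_latencies = [r["response_timestamps"][0] - r["timestamp"] for r in requests]
avg_first_token_latency = float(mean(first_token_latencies))  # (5 + 7) / 2 = 6.0

# token-to-token latency: mean gap between consecutive responses per request,
# then averaged across requests
token_to_token = [
    mean(b - a for a, b in zip(r["response_timestamps"], r["response_timestamps"][1:]))
    for r in requests
]
avg_token_to_token_latency = float(mean(token_to_token))  # (5 + 4.5) / 2 = 4.75
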
5 changes: 4 additions & 1 deletion model_analyzer/record/metrics_manager.py
@@ -572,8 +572,11 @@ def _run_perf_analyzer(
self._handle_unsuccessful_perf_analyzer_run(perf_analyzer)
return (None, None)

# FIXME: PA does not return a latency report file if an export report file is specified
perf_records = (
perf_analyzer.get_perf_records() + perf_analyzer.get_llm_records()
perf_analyzer.get_llm_records()
if self._config.is_llm_model()
else perf_analyzer.get_perf_records()
)
gpu_records = perf_analyzer.get_gpu_records()

2 changes: 1 addition & 1 deletion model_analyzer/record/record.py
@@ -101,7 +101,7 @@ def __init__(self, value, timestamp):
Parameters
----------
value : float or int
The value of the GPU metrtic
The value of the GPU metric
timestamp : int
The timestamp for the record in nanoseconds
"""
16 changes: 14 additions & 2 deletions tests/common/test_utils.py
@@ -29,6 +29,7 @@
DEFAULT_OUTPUT_MODEL_REPOSITORY,
DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY,
DEFAULT_TRITON_GRPC_ENDPOINT,
DEFAULT_TRITON_HTTP_ENDPOINT,
@@ -244,6 +245,7 @@ def construct_perf_analyzer_config(
periodic_concurrency=DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY,
request_rate=None,
max_token_count=DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
text_input_length=DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
launch_mode=DEFAULT_TRITON_LAUNCH_MODE,
client_protocol=DEFAULT_CLIENT_PROTOCOL,
perf_analyzer_flags=None,
@@ -266,6 +268,10 @@ def construct_perf_analyzer_config(
The concurrency value for this PA configuration
periodic_concurrency: list
The periodic concurrency value for this PA configuration
max_token_count: int
The max token count for this PA configuration
text_input_length: int
The text input length for this PA configuration
request_rate: int
The request rate value for this PA configuration
launch_mode: str
@@ -300,9 +306,15 @@

if llm_search_mode:
pa_config._args["request-parameter"] = (
"max_token:" + str(max_token_count) + ":int"
"max_tokens:" + str(max_token_count) + ":int"
)
pa_config._args["input-data"] = DEFAULT_INPUT_JSON_PATH + "/input-data.json"
pa_config._args["request-period"] = (
max_token_count if max_token_count < 10 else 10
)
pa_config._args["input-data"] = (
DEFAULT_INPUT_JSON_PATH + "/input-data-" + str(text_input_length) + ".json"
)
pa_config._args["streaming"] = "True"

pa_config._args["measurement-mode"] = DEFAULT_MEASUREMENT_MODE

6 changes: 4 additions & 2 deletions tests/test_perf_analyzer_config_generator.py
@@ -622,11 +622,13 @@ def test_llm_search_text_input_length(self):
periodic_concurrencies = ["16:32:4", "16:32:8", "16:32:16"]

expected_configs = []
for _ in text_input_lengths:
for til in text_input_lengths:
for pc in periodic_concurrencies:
expected_configs.append(
construct_perf_analyzer_config(
llm_search_mode=True, periodic_concurrency=pc
llm_search_mode=True,
periodic_concurrency=pc,
text_input_length=til,
)
)
