Commit ee7565d: Changes to fix live run

nv-braf committed Oct 18, 2023
1 parent 2ec0743 commit ee7565d

Showing 10 changed files with 70 additions and 20 deletions.
5 changes: 4 additions & 1 deletion model_analyzer/analyzer.py
@@ -137,7 +137,10 @@ def profile(
         if not self._config.skip_summary_reports:
             self._create_summary_tables(verbose)
             self._create_summary_reports(mode)
-            self._create_detailed_reports(mode)
+
+            # FIXME: need to figure out detailed reporting for LLMs
+            if not self._config.is_llm_model():
+                self._create_detailed_reports(mode)

         self._check_for_perf_analyzer_errors()
@@ -88,7 +88,7 @@ def __init__(

         self._reset_max_batch_size()

-        if not self._early_exit_enable:
+        if not self._early_exit_enable and not self._config.is_llm_model():
             raise TritonModelAnalyzerException(
                 "Early exit disable is not supported in automatic model config generator"
             )
@@ -116,7 +116,11 @@ def _create_brute_run_config_generator(self) -> BruteRunConfigGenerator:

     def _can_binary_search_top_results(self) -> bool:
         for model in self._models:
-            if model.parameters()["concurrency"] or model.parameters()["request_rate"]:
+            if (
+                model.parameters()["concurrency"]
+                or model.parameters()["request_rate"]
+                or self._config.is_llm_model()
+            ):
                 return False

         return True
@@ -80,7 +80,7 @@ def __init__(
         self._curr_results: List = [[] for n in range(self._num_models)]
         self._curr_generators: Dict[int, ConfigGeneratorInterface] = {}

-        self._skip_default_config = skip_default_config
+        self._skip_default_config = skip_default_config or config.is_llm_model()

     def set_last_results(
         self, measurements: List[Optional[RunConfigMeasurement]]
31 changes: 24 additions & 7 deletions model_analyzer/config/generate/perf_analyzer_config_generator.py
@@ -121,7 +121,7 @@ def __init__(
             utils.generate_parameter_combinations(self._perf_config_parameter_values)
         )

-        self._input_json_filename = DEFAULT_INPUT_JSON_PATH + "/input-data.json"
+        self._input_json_base_filename = DEFAULT_INPUT_JSON_PATH + "/input-data-"

         self._generate_perf_configs()
@@ -377,6 +377,15 @@ def _extract_text_input_length(
     def _update_perf_config_based_on_parameter_combination(
         self, perf_config: PerfAnalyzerConfig, parameter_combination: Dict
     ) -> None:
+        if "request-parameter" in parameter_combination:
+            request_parameter = parameter_combination["request-parameter"]
+            max_token_start = request_parameter.find(":")
+            max_token_stop = request_parameter.find(":", max_token_start + 1)
+            max_token = int(request_parameter[max_token_start + 1 : max_token_stop])
+            parameter_combination["request-period"] = (
+                max_token if max_token < 10 else 10
+            )
+
         perf_config.update_config(parameter_combination)

     def _update_perf_config_based_on_perf_analyzer_flags(
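
Note: the request-parameter string follows Perf Analyzer's name:value:type format, so the slicing above extracts the value field and caps the derived request period at 10. A minimal standalone sketch of that derivation, using a hypothetical parameter string:

# Sketch only; the "max_tokens:256:int" sample string is an assumption.
parameter_combination = {"request-parameter": "max_tokens:256:int"}

request_parameter = parameter_combination["request-parameter"]
_, value, _ = request_parameter.split(":")  # ("max_tokens", "256", "int")
max_token = int(value)

# The request period is the max token count, capped at 10.
parameter_combination["request-period"] = max_token if max_token < 10 else 10
print(parameter_combination["request-period"])  # prints 10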
@@ -389,6 +398,7 @@ def _update_perf_config_based_on_inference_load(
     ) -> None:
         if self._cli_config.is_llm_model():
             perf_config.update_config({"periodic-concurrency-range": inference_load})
+            perf_config.update_config({"streaming": "True"})
         elif self._cli_config.is_request_rate_specified(self._model_parameters):
             perf_config.update_config({"request-rate-range": inference_load})
         else:
@@ -400,21 +410,28 @@ def _update_perf_config_for_llm_model(
         if not self._cli_config.is_llm_model():
             return

+        input_json_filename = (
+            self._input_json_base_filename + f"{text_input_length}.json"
+        )
         modified_input_dict = self._modify_text_in_input_dict(text_input_length)
-        self._write_modified_input_dict_to_file(modified_input_dict)
+        self._write_modified_input_dict_to_file(
+            modified_input_dict, input_json_filename
+        )

-        perf_config.update_config({"input-data": self._input_json_filename})
+        perf_config.update_config({"input-data": input_json_filename})

     def _modify_text_in_input_dict(self, text_input_length: int) -> Dict:
         modified_text = " ".join(repeat("Hello", text_input_length))

         modified_input_dict = {k: v for k, v in self._llm_input_dict.items()}
-        modified_input_dict["data"][0]["text-input"] = modified_text
+        modified_input_dict["data"][0]["PROMPT"] = [modified_text]

         return modified_input_dict

-    def _write_modified_input_dict_to_file(self, modified_input_dict: Dict) -> None:
-        with open(self._input_json_filename, "w") as f:
+    def _write_modified_input_dict_to_file(
+        self, modified_input_dict: Dict, input_json_filename: str
+    ) -> None:
+        with open(input_json_filename, "w") as f:
             json.dump(modified_input_dict, f)

     def _create_parameter_perf_config_values(self) -> dict:
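
Note: a sketch of the per-length input rewrite above. The base dict shape {"data": [{"PROMPT": [...]}]} is an assumption; only the PROMPT rewrite and the input-data-<length>.json naming come from this diff.

import json
from itertools import repeat

llm_input_dict = {"data": [{"PROMPT": ["placeholder"]}]}  # hypothetical base dict
text_input_length = 3

modified_text = " ".join(repeat("Hello", text_input_length))
modified_input_dict = {k: v for k, v in llm_input_dict.items()}
modified_input_dict["data"][0]["PROMPT"] = [modified_text]

with open(f"input-data-{text_input_length}.json", "w") as f:
    json.dump(modified_input_dict, f)
# input-data-3.json now holds {"data": [{"PROMPT": ["Hello Hello Hello"]}]}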
@@ -424,7 +441,7 @@

         if self._cli_config.is_llm_model():
             perf_config_values["request-parameter"] = [
-                "max_token:" + str(mtc) + ":int" for mtc in self._max_token_counts
+                "max_tokens:" + str(mtc) + ":int" for mtc in self._max_token_counts
             ]
             perf_config_values["text-input-length"] = self._text_input_lengths
14 changes: 11 additions & 3 deletions model_analyzer/perf_analyzer/perf_analyzer.py
@@ -99,7 +99,7 @@ class PerfAnalyzer:
     ]

     llm_metric_table = [
-        ["avg_first_latency", None, AvgFirstTokenLatency, "1000"],
+        ["avg_first_token_latency", None, AvgFirstTokenLatency, "1000"],
         ["avg_token_to_token_latency", None, AvgTokenToTokenLatency, "1000"]
     ]
     # yapf: enable
@@ -285,6 +285,14 @@ def _get_single_model_cmd(self, index):
         if self._is_multi_model():
             cmd += ["--enable-mpi"]
         cmd += self._get_pa_cli_command(index).replace("=", " ").split()
+
+        # OPTME: There should be a more elegant way of determining how to add EOS
+        # We have to do it here because we use a dictionary to create the PA command
+        # and it already contains `--request-parameter`
+        if "--periodic-concurrency-range" in cmd:
+            cmd.append("--request-parameter")
+            cmd.append("ignore_eos:true:bool")
+
         return cmd

     def _get_pa_cli_command(self, index):
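
Note: the ignore_eos request parameter appears intended to make generation run to the full max token count instead of stopping early at an end-of-sequence token (that rationale is our reading, not stated in the commit). A small sketch on a hypothetical pre-built command list:

# Hypothetical command; only the periodic-concurrency check and the
# ignore_eos request parameter come from this diff.
cmd = ["perf_analyzer", "-m", "my_llm",
       "--periodic-concurrency-range", "16:32:4",
       "--request-parameter", "max_tokens:256:int"]

if "--periodic-concurrency-range" in cmd:
    cmd.append("--request-parameter")
    cmd.append("ignore_eos:true:bool")
# cmd now carries two --request-parameter flags, one per parameter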
@@ -539,7 +547,7 @@ def _calculate_avg_first_token_latency(self, llm_output: Dict) -> float:
                 request["response_timestamps"][0] - request["timestamp"]
             )

-        avg_first_token_latency = mean(total_first_token_latencies)
+        avg_first_token_latency = float(mean(total_first_token_latencies))

         return avg_first_token_latency

@@ -554,7 +562,7 @@ def _calculate_avg_token_to_token_latency(self, llm_output: Dict) -> float:

             token_to_token_latencies.append(mean(response_to_response_latencies))

-        avg_token_to_token_latency = mean(token_to_token_latencies)
+        avg_token_to_token_latency = float(mean(token_to_token_latencies))

         return avg_token_to_token_latency
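
Note: both metrics are computed from per-request timestamps in the profile export, and the new float() casts normalize the return type. A worked sketch on hypothetical timestamps:

from statistics import mean

# Hypothetical export record; field names follow the code above.
request = {
    "timestamp": 100,                        # request send time (ns)
    "response_timestamps": [140, 150, 170],  # one per streamed response (ns)
}

# First-token latency: first response minus request send time -> 40
first_token_latency = request["response_timestamps"][0] - request["timestamp"]

# Token-to-token latency: mean gap between consecutive responses -> 15.0
ts = request["response_timestamps"]
avg_token_to_token = float(mean(b - a for a, b in zip(ts, ts[1:])))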

5 changes: 4 additions & 1 deletion model_analyzer/record/metrics_manager.py
@@ -572,8 +572,11 @@ def _run_perf_analyzer(
             self._handle_unsuccessful_perf_analyzer_run(perf_analyzer)
             return (None, None)

+        # FIXME: PA does not return a latency report file if an export report file is specified
         perf_records = (
-            perf_analyzer.get_perf_records() + perf_analyzer.get_llm_records()
+            perf_analyzer.get_llm_records()
+            if self._config.is_llm_model()
+            else perf_analyzer.get_perf_records()
         )
         gpu_records = perf_analyzer.get_gpu_records()

2 changes: 1 addition & 1 deletion model_analyzer/record/record.py
@@ -101,7 +101,7 @@ def __init__(self, value, timestamp):
         Parameters
         ----------
         value : float or int
-            The value of the GPU metrtic
+            The value of the GPU metric
         timestamp : int
             The timestamp for the record in nanoseconds
         """
16 changes: 14 additions & 2 deletions tests/common/test_utils.py
@@ -29,6 +29,7 @@
     DEFAULT_OUTPUT_MODEL_REPOSITORY,
     DEFAULT_RUN_CONFIG_MIN_CONCURRENCY,
     DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
+    DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
     DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY,
     DEFAULT_TRITON_GRPC_ENDPOINT,
     DEFAULT_TRITON_HTTP_ENDPOINT,
@@ -244,6 +245,7 @@ def construct_perf_analyzer_config(
     periodic_concurrency=DEFAULT_RUN_CONFIG_PERIODIC_CONCURRENCY,
     request_rate=None,
     max_token_count=DEFAULT_RUN_CONFIG_MIN_MAX_TOKEN_COUNT,
+    text_input_length=DEFAULT_RUN_CONFIG_MIN_TEXT_INPUT_LENGTH,
     launch_mode=DEFAULT_TRITON_LAUNCH_MODE,
     client_protocol=DEFAULT_CLIENT_PROTOCOL,
     perf_analyzer_flags=None,
@@ -266,6 +268,10 @@
         The concurrency value for this PA configuration
     periodic_concurrency: list
         The periodic concurrency value for this PA configuration
+    max_token_count: int
+        The max token count for this PA configuration
+    text_input_length: int
+        The text input length for this PA configuration
     request_rate: int
         The request rate value for this PA configuration
     launch_mode: str
@@ -300,9 +306,15 @@

     if llm_search_mode:
         pa_config._args["request-parameter"] = (
-            "max_token:" + str(max_token_count) + ":int"
+            "max_tokens:" + str(max_token_count) + ":int"
         )
-        pa_config._args["input-data"] = DEFAULT_INPUT_JSON_PATH + "/input-data.json"
+        pa_config._args["request-period"] = (
+            max_token_count if max_token_count < 10 else 10
+        )
+        pa_config._args["input-data"] = (
+            DEFAULT_INPUT_JSON_PATH + "/input-data-" + str(text_input_length) + ".json"
+        )
+        pa_config._args["streaming"] = "True"

     pa_config._args["measurement-mode"] = DEFAULT_MEASUREMENT_MODE

7 changes: 5 additions & 2 deletions tests/test_perf_analyzer_config_generator.py
@@ -622,11 +622,13 @@ def test_llm_search_text_input_length(self):
         periodic_concurrencies = ["16:32:4", "16:32:8", "16:32:16"]

         expected_configs = []
-        for _ in text_input_lengths:
+        for til in text_input_lengths:
             for pc in periodic_concurrencies:
                 expected_configs.append(
                     construct_perf_analyzer_config(
-                        llm_search_mode=True, periodic_concurrency=pc
+                        llm_search_mode=True,
+                        periodic_concurrency=pc,
+                        text_input_length=til,
                     )
                 )

@@ -1005,6 +1007,7 @@ def _run_and_test_perf_analyzer_config_generator(
             self.assertEqual(
                 expected_configs[i]._options, perf_analyzer_configs[i]._options
             )
+            self.maxDiff = None
             self.assertEqual(expected_configs[i]._args, perf_analyzer_configs[i]._args)
             self.assertEqual(
                 expected_configs[i]._additive_args,
