From a6db17a650fd10c280f6cb6026262586c4b91a86 Mon Sep 17 00:00:00 2001
From: Hyunjae Woo
Date: Thu, 25 Jul 2024 11:01:01 -0700
Subject: [PATCH] support input payload generation for tensorrtllm engine
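
Adds a TENSORRTLLM_ENGINE output format so genai-perf can generate input
payloads for a TensorRT-LLM engine directly. Each dataset row is tokenized
and written as one Perf Analyzer entry. As a rough sketch of the resulting
shape (token ids below are the ones the unit tests expect for
"test input one"; the bracketed placeholder stands in for
cls.DEFAULT_TENSORRTLLM_MAX_TOKENS, whose value is defined elsewhere in
llm_inputs.py):

    {
        "data": [
            {
                "input_ids": {"content": [1243, 1881, 697], "shape": [3]},
                "input_lengths": [3],
                "request_output_len": [<DEFAULT_TENSORRTLLM_MAX_TOKENS>]
            }
        ]
    }

When add_stream is set, each entry also gets "streaming": [true]; a
non-default output_tokens_mean replaces "request_output_len" with a sampled
value, and output_tokens_deterministic additionally pins "min_length" to
that same value.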
---
 .../genai_perf/llm_inputs/llm_inputs.py       |  96 +++++++++++++++++
 .../genai-perf/genai_perf/parser.py           |   5 +-
 .../genai-perf/genai_perf/wrapper.py          |   7 ++
 .../genai-perf/tests/test_cli.py              |   2 +-
 .../genai-perf/tests/test_llm_inputs.py       | 102 ++++++++++++++++++
 5 files changed, 209 insertions(+), 3 deletions(-)

diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
index e9d35bb37..2fb0d9dfc 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
@@ -217,6 +217,7 @@ def create_llm_inputs(
             json_in_pa_format = cls._convert_generic_json_to_output_format(
                 output_format,
+                tokenizer,
                 generic_dataset_json,
                 add_model_name,
                 add_stream,
@@ -689,6 +690,7 @@ def _encode_images_in_input_dataset(cls, input_file_dataset: Dict) -> Dict:
     def _convert_generic_json_to_output_format(
         cls,
         output_format: OutputFormat,
+        tokenizer: Tokenizer,
         generic_dataset: Dict,
         add_model_name: bool,
         add_stream: bool,
@@ -764,6 +766,16 @@
                 model_name,
                 model_selection_strategy,
             )
+        elif output_format == OutputFormat.TENSORRTLLM_ENGINE:
+            output_json = cls._convert_generic_json_to_trtllm_engine_format(
+                generic_dataset,
+                tokenizer,
+                add_stream,
+                extra_inputs,
+                output_tokens_mean,
+                output_tokens_stddev,
+                output_tokens_deterministic,
+            )
         else:
             raise GenAIPerfException(
                 f"Output format {output_format} is not currently supported"
@@ -1011,6 +1023,28 @@

         return pa_json

+    @classmethod
+    def _convert_generic_json_to_trtllm_engine_format(
+        cls,
+        dataset_json: Dict,
+        tokenizer: Tokenizer,
+        add_stream: bool,
+        extra_inputs: Dict,
+        output_tokens_mean: int,
+        output_tokens_stddev: int,
+        output_tokens_deterministic: bool,
+    ) -> Dict:
+        pa_json = cls._populate_trtllm_engine_output_json(
+            dataset_json,
+            tokenizer,
+            add_stream,
+            extra_inputs,
+            output_tokens_mean,
+            output_tokens_stddev,
+            output_tokens_deterministic,
+        )
+        return pa_json
+
     @classmethod
     def _write_json_to_file(cls, json_in_pa_format: Dict, output_dir: Path) -> None:
         filename = output_dir / DEFAULT_INPUT_DATA_JSON
@@ -1262,6 +1296,43 @@

         return pa_json

+    @classmethod
+    def _populate_trtllm_engine_output_json(
+        cls,
+        dataset_json: Dict,
+        tokenizer: Tokenizer,
+        add_stream: bool,
+        extra_inputs: Dict,
+        output_tokens_mean: int,
+        output_tokens_stddev: int,
+        output_tokens_deterministic: bool,
+    ) -> Dict:
+        pa_json = cls._create_empty_trtllm_pa_json()
+
+        for index, entry in enumerate(dataset_json["rows"]):
+            token_ids = tokenizer.encode(entry["text_input"])
+            pa_json["data"].append(
+                {
+                    "input_ids": {
+                        "content": token_ids,
+                        "shape": [len(token_ids)],
+                    },
+                    "input_lengths": [len(token_ids)],
+                    "request_output_len": [cls.DEFAULT_TENSORRTLLM_MAX_TOKENS],
+                }
+            )
+
+            pa_json = cls._add_optional_tags_to_trtllm_engine_json(
+                pa_json,
+                index,
+                add_stream,
+                extra_inputs,
+                output_tokens_mean,
+                output_tokens_stddev,
+                output_tokens_deterministic,
+            )
+
+        return pa_json
+
     @classmethod
     def _create_empty_openai_pa_json(cls) -> Dict:
         empty_pa_json = deepcopy(cls.EMPTY_JSON_IN_OPENAI_PA_FORMAT)
@@ -1478,6 +1549,31 @@ def _add_optional_tags_to_trtllm_json(

         return pa_json

+    @classmethod
+    def _add_optional_tags_to_trtllm_engine_json(
+        cls,
+        pa_json: Dict,
+        index: int,
+        add_stream: bool,
+        extra_inputs: Dict,
+        output_tokens_mean: int,
+        output_tokens_stddev: int,
+        output_tokens_deterministic: bool,
+    ) -> Dict:
+        row = pa_json["data"][index]
+        if add_stream:
+            row["streaming"] = [True]
+        if output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN:
+            num_tokens = int(random.gauss(output_tokens_mean, output_tokens_stddev))
+            row["request_output_len"] = [num_tokens]
+            if output_tokens_deterministic:
+                row["min_length"] = [num_tokens]
+
+        for key, value in extra_inputs.items():
+            row[key] = [value]
+
+        return pa_json
+
     @classmethod
     def _add_required_tags_to_trtllm_json(
         cls,
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
index c415da9e4..f8c5fce62 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/parser.py
@@ -191,10 +191,11 @@ def _check_conditional_args(
             "The --output-tokens-mean option is required when using --output-tokens-mean-deterministic."
         )

-    if args.service_kind != "triton":
+    if args.service_kind not in ["triton", "tensorrtllm_engine"]:
         if args.output_tokens_mean_deterministic:
             parser.error(
-                "The --output-tokens-mean-deterministic option is only supported with the Triton service-kind."
+                "The --output-tokens-mean-deterministic option is only supported "
+                "with the Triton and TensorRT-LLM Engine service-kinds."
             )

     _check_conditional_args_embeddings_rankings(parser, args)
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
index 76ef3e321..fe9abdbb4 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
@@ -122,6 +122,13 @@ def build_cmd(args: Namespace, extra_args: Optional[List[str]] = None) -> List[str]:
                 cmd += [f"-{arg}"]
             else:
                 cmd += [f"--{arg}"]
+
+        # (TPA-237) GAP needs to call PA with the triton_c_api service kind.
+        # For now it remaps the tensorrtllm_engine service kind to triton to
+        # verify that the flow runs end to end.
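+        # For example, a genai-perf run with `--service-kind
+        # tensorrtllm_engine` currently reaches PA as `--service-kind triton`
+        # via the remapping below.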
+        elif arg == "service_kind" and value == "tensorrtllm_engine":
+            cmd += ["--service-kind", "triton"]
+            args.service_kind = "triton"
         else:
             if len(arg) == 1:
                 cmd += [f"-{arg}", f"{value}"]
diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_cli.py b/src/c++/perf_analyzer/genai-perf/tests/test_cli.py
index d35f4cf11..e8c5e2c41 100644
--- a/src/c++/perf_analyzer/genai-perf/tests/test_cli.py
+++ b/src/c++/perf_analyzer/genai-perf/tests/test_cli.py
@@ -546,7 +546,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys):
                 "100",
                 "--output-tokens-mean-deterministic",
             ],
-            "The --output-tokens-mean-deterministic option is only supported with the Triton service-kind",
+            "The --output-tokens-mean-deterministic option is only supported with the Triton and TensorRT-LLM Engine service-kinds",
         ),
         (
             [
diff --git a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py
index 028e72849..e2acd7aec 100644
--- a/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py
+++ b/src/c++/perf_analyzer/genai-perf/tests/test_llm_inputs.py
@@ -554,6 +554,107 @@ def test_llm_inputs_with_defaults(self, default_configured_url):
     #     else:
     #         assert False, f"Unsupported output format: {output_format}"

+    @pytest.mark.parametrize(
+        "generic_json, add_stream, output_tokens_mean, output_tokens_deterministic, expected_json",
+        [
+            (
+                # generic_json
+                {
+                    "rows": [
+                        {"text_input": "test input one"},
+                        {"text_input": "test input two"},
+                    ]
+                },
+                False,
+                -1,
+                False,
+                # expected_json
+                {
+                    "data": [
+                        {
+                            "input_ids": {
+                                "content": [1243, 1881, 697],
+                                "shape": [3],
+                            },
+                            "input_lengths": [3],
+                            "request_output_len": [
+                                LlmInputs.DEFAULT_TENSORRTLLM_MAX_TOKENS
+                            ],
+                        },
+                        {
+                            "input_ids": {
+                                "content": [1243, 1881, 1023],
+                                "shape": [3],
+                            },
+                            "input_lengths": [3],
+                            "request_output_len": [
+                                LlmInputs.DEFAULT_TENSORRTLLM_MAX_TOKENS
+                            ],
+                        },
+                    ],
+                },
+            ),
+            (
+                # generic_json
+                {
+                    "rows": [
+                        {"text_input": "test input one"},
+                        {"text_input": "test input two"},
+                    ]
+                },
+                True,
+                999,
+                True,
+                # expected_json
+                {
+                    "data": [
+                        {
+                            "input_ids": {
+                                "content": [1243, 1881, 697],
+                                "shape": [3],
+                            },
+                            "input_lengths": [3],
+                            "request_output_len": [999],
+                            "min_length": [999],
+                            "streaming": [True],
+                        },
+                        {
+                            "input_ids": {
+                                "content": [1243, 1881, 1023],
+                                "shape": [3],
+                            },
+                            "input_lengths": [3],
+                            "request_output_len": [999],
+                            "min_length": [999],
+                            "streaming": [True],
+                        },
+                    ],
+                },
+            ),
+        ],
+    )
+    def test_generic_json_to_trtllm_engine_format(
+        self,
+        generic_json,
+        add_stream,
+        output_tokens_mean,
+        output_tokens_deterministic,
+        expected_json,
+    ) -> None:
+        trtllm_json = LlmInputs._convert_generic_json_to_output_format(
+            output_format=OutputFormat.TENSORRTLLM_ENGINE,
+            tokenizer=get_tokenizer(DEFAULT_TOKENIZER),
+            generic_dataset=generic_json,
+            add_model_name=False,
+            add_stream=add_stream,
+            extra_inputs={},
+            output_tokens_mean=output_tokens_mean,
+            output_tokens_stddev=0,
+            output_tokens_deterministic=output_tokens_deterministic,
+        )
+
+        assert trtllm_json == expected_json
+
     def test_add_image_inputs_openai_vision(self) -> None:
         generic_json = {
             "rows": [
@@ -606,6 +707,7 @@ def test_add_image_inputs_openai_vision(self) -> None:
             OutputFormat.OPENAI_VISION,
             OutputFormat.VLLM,
             OutputFormat.TENSORRTLLM,
+            OutputFormat.TENSORRTLLM_ENGINE,
         ],
     )
     def test_get_input_dataset_from_synthetic(