diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
index 3137d2fe4..1431e9a65 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
@@ -37,6 +37,7 @@ class OutputFormat(Enum):
     OPENAI_CHAT_COMPLETIONS = auto()
     OPENAI_COMPLETIONS = auto()
     TENSORRTLLM = auto()
+    TENSORRTLLM_BACKEND = auto()
     VLLM = auto()

     def to_lowercase(self):
@@ -160,6 +161,7 @@ def create_llm_inputs(
         elif input_type == PromptSource.SYNTHETIC:
             random.seed(random_seed)
             synthetic_dataset = cls._get_input_dataset_from_synthetic(
+                output_format,
                 tokenizer,
                 prompt_tokens_mean,
                 prompt_tokens_stddev,
@@ -230,21 +232,28 @@ def _get_input_dataset_from_url(
     @classmethod
     def _get_input_dataset_from_synthetic(
         cls,
+        output_format: OutputFormat,
         tokenizer: Tokenizer,
         prompt_tokens_mean: int,
         prompt_tokens_stddev: int,
         num_of_output_prompts: int,
     ) -> Dict[str, Any]:
         dataset_json: Dict[str, Any] = {}
-        dataset_json["features"] = [{"name": "text_input"}]
+        if output_format != OutputFormat.TENSORRTLLM_BACKEND:
+            dataset_json["features"] = [{"name": "text_input"}]
+        else:
+            dataset_json["features"] = [{"name": "input_ids"}, {"name": "input_lengths"}]
         dataset_json["rows"] = []
         for _ in range(num_of_output_prompts):
-            synthetic_prompt = cls._create_synthetic_prompt(
+            synthetic_prompt, prompt_tokens = cls._create_synthetic_prompt(
                 tokenizer,
                 prompt_tokens_mean,
                 prompt_tokens_stddev,
             )
-            dataset_json["rows"].append({"row": {"text_input": synthetic_prompt}})
+            if output_format != OutputFormat.TENSORRTLLM_BACKEND:
+                dataset_json["rows"].append({"row": {"text_input": synthetic_prompt}})
+            else:
+                dataset_json["rows"].append(
+                    {
+                        "row": {
+                            "input_ids": {
+                                "content": prompt_tokens,
+                                "shape": [len(prompt_tokens)],
+                            },
+                            "input_lengths": [len(prompt_tokens)],
+                        }
+                    }
+                )

         return dataset_json
@@ -400,6 +409,17 @@ def _convert_generic_json_to_output_format(
                 output_tokens_deterministic,
                 model_name,
             )
+        elif output_format == OutputFormat.TENSORRTLLM_BACKEND:
+            output_json = cls._convert_generic_json_to_trtllm_backend_format(
+                generic_dataset,
+                add_model_name,
+                add_stream,
+                extra_inputs,
+                output_tokens_mean,
+                output_tokens_stddev,
+                output_tokens_deterministic,
+                model_name,
+            )
         else:
             raise GenAIPerfException(
                 f"Output format {output_format} is not currently supported"
@@ -541,6 +561,40 @@ def _convert_generic_json_to_trtllm_format(

         return pa_json

+    @classmethod
+    def _convert_generic_json_to_trtllm_backend_format(
+        cls,
+        dataset_json: Dict,
+        add_model_name: bool,
+        add_stream: bool,
+        extra_inputs: Dict,
+        output_tokens_mean: int,
+        output_tokens_stddev: int,
+        output_tokens_deterministic: bool,
+        model_name: str = "",
+    ) -> Dict:
+        (
+            system_role_headers,
+            user_role_headers,
+            text_input_headers,
+        ) = cls._determine_json_feature_roles(dataset_json)
+
+        pa_json = cls._populate_trtllm_backend_output_json(
+            dataset_json,
+            system_role_headers,
+            user_role_headers,
+            text_input_headers,
+            add_model_name,
+            add_stream,
+            extra_inputs,
+            output_tokens_mean,
+            output_tokens_stddev,
+            output_tokens_deterministic,
+            model_name,
+        )
+
+        return pa_json
+
     @classmethod
     def _write_json_to_file(cls, json_in_pa_format: Dict, output_dir: Path) -> None:
         filename = output_dir / DEFAULT_INPUT_DATA_JSON
@@ -765,6 +819,46 @@ def _populate_trtllm_output_json(

         return pa_json

+    @classmethod
+    def _populate_trtllm_backend_output_json(
+        cls,
+        dataset_json: Dict,
+        system_role_headers: List[str],
+        user_role_headers: List[str],
+        text_input_headers: List[str],
+        add_model_name: bool,
+        add_stream: bool,
+        extra_inputs: Dict,
+        output_tokens_mean: int,
+        output_tokens_stddev: int,
+        output_tokens_deterministic: bool,
+        model_name: str = "",
+    ) -> Dict:
+        pa_json = cls._create_empty_trtllm_pa_json()
+        default_max_tokens = (
+            "max_tokens" not in extra_inputs
+            or output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN
+        )
+
+        for index, entry in enumerate(dataset_json["rows"]):
+            pa_json["data"].append(
+                {
+                    "input_ids": entry["input_ids"],
+                    "input_lengths": entry["input_lengths"],
+                }
+            )
+            pa_json = cls._add_required_tags_to_trtllm_backend_json(
+                pa_json, index, default_max_tokens
+            )
+            pa_json = cls._add_optional_tags_to_trtllm_backend_json(
+                pa_json,
+                index,
+                add_model_name,
+                add_stream,
+                extra_inputs,
+                output_tokens_mean,
+                output_tokens_stddev,
+                output_tokens_deterministic,
+                model_name,
+            )
+
+        return pa_json
+
     @classmethod
     def _create_empty_openai_pa_json(cls) -> Dict:
         empty_pa_json = deepcopy(cls.EMPTY_JSON_IN_OPENAI_PA_FORMAT)
@@ -994,6 +1088,49 @@ def _add_required_tags_to_trtllm_json(

         return pa_json

+    @classmethod
+    def _add_optional_tags_to_trtllm_backend_json(
+        cls,
+        pa_json: Dict,
+        index: int,
+        add_model_name: bool,
+        add_stream: bool,
+        extra_inputs: Dict,
+        output_tokens_mean: int,
+        output_tokens_stddev: int,
+        output_tokens_deterministic: bool,
+        model_name: str = "",
+    ) -> Dict:
+        row = pa_json["data"][index]
+        if add_model_name:
+            row["model"] = model_name
+        if add_stream:
+            row["streaming"] = [True]
+        if output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN:
+            number_of_tokens = int(
+                random.gauss(output_tokens_mean, output_tokens_stddev)
+            )
+            if output_tokens_deterministic:
+                row["min_length"] = [number_of_tokens]
+            row["request_output_len"] = [number_of_tokens]
+        for key, value in extra_inputs.items():
+            row[key] = [value]
+
+        return pa_json
+
+    @classmethod
+    def _add_required_tags_to_trtllm_backend_json(
+        cls,
+        pa_json: Dict,
+        index: int,
+        default_max_tokens: bool,
+    ) -> Dict:
+        row = pa_json["data"][index]
+        if default_max_tokens:
+            row["request_output_len"] = [cls.DEFAULT_TENSORRTLLM_MAX_TOKENS]
+
+        return pa_json
+
     @classmethod
     def _check_for_dataset_name_if_input_type_is_url(
         cls, input_type: PromptSource, dataset_name: str
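Note: a minimal sketch of the dataset row that `_get_input_dataset_from_synthetic` now emits for `OutputFormat.TENSORRTLLM_BACKEND`; the token ids below are made-up placeholders, real values come from `tokenizer.encode`:

```python
# Hypothetical token ids standing in for tokenizer.encode(synthetic_prompt).
prompt_tokens = [101, 2023, 2003, 1037, 3231, 102]

# Shape of one "rows" entry on the tensorrt_llm backend path: raw token ids
# plus an explicit length, instead of a "text_input" string.
row = {
    "row": {
        "input_ids": {
            "content": prompt_tokens,
            "shape": [len(prompt_tokens)],
        },
        "input_lengths": [len(prompt_tokens)],
    }
}
```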
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/synthetic_prompt_generator.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/synthetic_prompt_generator.py
index 68b77fdc4..eff3add44 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/synthetic_prompt_generator.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/synthetic_prompt_generator.py
@@ -53,7 +53,7 @@ def create_synthetic_prompt(
             num_prompt_tokens, farewell_lines, tokenizer
         )

-        return prompt
+        return prompt, tokenizer.encode(prompt)

     @classmethod
     def _create_farewell_lines(cls) -> List[str]:
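Note: `create_synthetic_prompt` now returns a `(prompt, token_ids)` tuple, so every caller must unpack both values. A minimal usage sketch, assuming the classmethod takes the tokenizer and the two token-count parameters in the same order as the `_create_synthetic_prompt` call site above, and using illustrative mean/stddev values:

```python
# The token ids feed input_ids / input_lengths; the text form remains
# available for the text_input path.
prompt, prompt_tokens = SyntheticPromptGenerator.create_synthetic_prompt(
    tokenizer, prompt_tokens_mean=200, prompt_tokens_stddev=20
)
assert isinstance(prompt, str)
assert len(prompt_tokens) > 0
```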
in data["experiments"][0]["requests"][0]['request_inputs']: + self._response_format = ResponseFormat.TENSORRTLLM_BACKEND else: raise ValueError(f"Unknown service kind: {self._service_kind}") @@ -533,8 +536,8 @@ def _parse_requests(self, requests: dict) -> LLMMetrics: time_to_first_tokens.append(res_timestamps[0] - req_timestamp) # number of input tokens - input_tokens = self._tokenize_request_inputs(req_inputs) - num_input_tokens.append(len(input_tokens)) + len_input_tokens = self._tokenize_request_inputs(req_inputs) + num_input_tokens.append(len_input_tokens) # output token throughput per request output_tokens = self._tokenize_response_outputs(res_outputs) @@ -614,13 +617,19 @@ def _preprocess_response( def _tokenize_request_inputs(self, req_inputs: dict) -> List[int]: """Deserialize the request input and return tokenized inputs.""" - if self._service_kind == "triton": - return self._tokenize_triton_request_input(req_inputs) + if self._service_kind == "triton" and self._response_format == ResponseFormat.TENSORRTLLM_BACKEND: + return self._tokenize_trtllm_request_input(req_inputs) + elif self._service_kind == "triton": + return len(self._tokenize_triton_request_input(req_inputs)) elif self._service_kind == "openai": - return self._tokenize_openai_request_input(req_inputs) + return len(self._tokenize_openai_request_input(req_inputs)) else: raise ValueError(f"Unknown service kind: '{self._service_kind}'.") + def _tokenize_trtllm_request_input(self, req_inputs: dict) -> List[int]: + """Retrieve the token lengths of the input.""" + return req_inputs['input_lengths'] + def _tokenize_triton_request_input(self, req_inputs: dict) -> List[int]: """Tokenize the Triton request input texts.""" encodings = self._tokenizer(req_inputs["text_input"]) @@ -642,16 +651,24 @@ def _tokenize_openai_request_input(self, req_inputs: dict) -> List[int]: def _tokenize_response_outputs(self, res_outputs: dict) -> List[List[int]]: """Deserialize the response output and return tokenized outputs.""" - if self._service_kind == "triton": + if self._service_kind == "triton" and self._response_format == ResponseFormat.TENSORRTLLM_BACKEND: + return self._tokenize_trtllm_response_output(res_outputs) + elif self._service_kind == "triton": return self._tokenize_triton_response_output(res_outputs) elif self._service_kind == "openai": return self._tokenize_openai_response_output(res_outputs) else: raise ValueError(f"Unknown service kind: '{self._service_kind}'.") - def _tokenize_triton_response_output(self, res_outputs: dict) -> List[List[int]]: + def _tokenize_trtllm_response_output(self, res_outputs: dict) -> List[List[int]]: """Tokenize the Triton response output texts.""" output_texts = [] + for output in res_outputs: + output_texts.append([output["output_ids"]]) + return output_texts + + def _tokenize_triton_response_output(self, res_outputs: dict) -> List[List[int]]: + """Tokenize the Triton response output texts.""" for output in res_outputs: output_texts.append(output["text_output"]) return self._run_tokenizer(output_texts) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py index fa0049118..48bd50939 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py @@ -46,6 +46,8 @@ def add_protocol_args(args: Namespace) -> List[str]: cmd += ["-u", f"{DEFAULT_GRPC_URL}"] if args.output_format == OutputFormat.TENSORRTLLM: cmd += ["--shape", "max_tokens:1", "--shape", "text_input:1"] + 
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
index fa0049118..48bd50939 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
@@ -46,6 +46,8 @@ def add_protocol_args(args: Namespace) -> List[str]:
                 cmd += ["-u", f"{DEFAULT_GRPC_URL}"]
             if args.output_format == OutputFormat.TENSORRTLLM:
                 cmd += ["--shape", "max_tokens:1", "--shape", "text_input:1"]
+            elif args.output_format == OutputFormat.TENSORRTLLM_BACKEND:
+                cmd += ["--shape", "input_lengths:1", "--shape", "request_output_len:1"]
         elif args.service_kind == "openai":
             cmd += ["-i", "http"]
         return cmd
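Note: for quick reference, a sketch of the extra `--shape` flags each Triton output format contributes to the perf_analyzer command line (flag names copied verbatim from the branches above; the dict is illustrative, not part of the change):

```python
shape_flags = {
    OutputFormat.TENSORRTLLM: [
        "--shape", "max_tokens:1", "--shape", "text_input:1",
    ],
    OutputFormat.TENSORRTLLM_BACKEND: [
        "--shape", "input_lengths:1", "--shape", "request_output_len:1",
    ],
}
```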