From f014db2ca4054adcb2c7474f37fc4bd0156a320c Mon Sep 17 00:00:00 2001 From: Izzy Putterman Date: Mon, 13 May 2024 11:03:09 -0700 Subject: [PATCH 1/5] Tokens to TRTLLM backend --- .../genai_perf/llm_inputs/llm_inputs.py | 37 ++++++++++--------- .../llm_inputs/synthetic_prompt_generator.py | 2 +- .../genai-perf/genai_perf/main.py | 13 +++---- .../genai-perf/genai_perf/wrapper.py | 3 +- 4 files changed, 29 insertions(+), 26 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py index 3137d2fe4..17abae92c 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py @@ -236,15 +236,17 @@ def _get_input_dataset_from_synthetic( num_of_output_prompts: int, ) -> Dict[str, Any]: dataset_json: Dict[str, Any] = {} - dataset_json["features"] = [{"name": "text_input"}] + # dataset_json["features"] = [{"name": "text_input"}] + dataset_json["features"] = [{"name": "input_ids"}, {"name": "input_lengths"}] dataset_json["rows"] = [] for _ in range(num_of_output_prompts): - synthetic_prompt = cls._create_synthetic_prompt( + synthetic_prompt, prompt_tokens = cls._create_synthetic_prompt( tokenizer, prompt_tokens_mean, prompt_tokens_stddev, ) - dataset_json["rows"].append({"row": {"text_input": synthetic_prompt}}) + # dataset_json["rows"].append({"row": {"text_input": synthetic_prompt}}) + dataset_json["rows"].append({"row": {"input_ids": prompt_tokens, "input_lengths": len(prompt_tokens)}}) return dataset_json @@ -733,20 +735,20 @@ def _populate_trtllm_output_json( ) for index, entry in enumerate(dataset_json["rows"]): - pa_json["data"].append({"text_input": [""]}) + pa_json["data"].append({"input_ids": entry['input_ids'], "input_lengths": entry['input_lengths']}) - for header, content in entry.items(): - new_text_input = cls._create_new_text_input( - header, - system_role_headers, - user_role_headers, - text_input_headers, - content, - ) + # for header, content in entry.items(): + # new_text_input = cls._create_new_text_input( + # header, + # system_role_headers, + # user_role_headers, + # text_input_headers, + # content, + # ) - pa_json = cls._add_new_text_input_to_json( - pa_json, index, new_text_input - ) + # pa_json = cls._add_new_text_input_to_json( + # pa_json, index, new_text_input + # ) pa_json = cls._add_required_tags_to_trtllm_json( pa_json, index, default_max_tokens @@ -975,7 +977,8 @@ def _add_optional_tags_to_trtllm_json( ) if output_tokens_deterministic: row["min_length"] = [number_of_tokens] - row["max_tokens"] = [number_of_tokens] + row["input_lengths"] = [2000] + row["request_output_len"] = [number_of_tokens] for key, value in extra_inputs.items(): row[key] = [value] @@ -990,7 +993,7 @@ def _add_required_tags_to_trtllm_json( ) -> Dict: row = pa_json["data"][index] if default_max_tokens: - row["max_tokens"] = [cls.DEFAULT_TENSORRTLLM_MAX_TOKENS] + row["request_output_len"] = [cls.DEFAULT_TENSORRTLLM_MAX_TOKENS] return pa_json diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/synthetic_prompt_generator.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/synthetic_prompt_generator.py index 68b77fdc4..eff3add44 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/synthetic_prompt_generator.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/synthetic_prompt_generator.py @@ -53,7 +53,7 @@ def create_synthetic_prompt( num_prompt_tokens, 
farewell_lines, tokenizer ) - return prompt + return prompt, tokenizer.encode(prompt) @classmethod def _create_farewell_lines(cls) -> List[str]: diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/main.py b/src/c++/perf_analyzer/genai-perf/genai_perf/main.py index 04dcc799e..7642102d7 100755 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/main.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/main.py @@ -145,13 +145,12 @@ def run(): def main(): # Interactive use will catch exceptions and log formatted errors rather than # tracebacks. - try: - run() - except Exception as e: - traceback.print_exc() - logger = logging.getLogger(__name__) - logger.error(e) - return 1 + run() + # except Exception as e: + # traceback.print_exc() + # logger = logging.getLogger(__name__) + # logger.error(e) + # return 1 return 0 diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py index fa0049118..42ce69029 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py @@ -45,7 +45,8 @@ def add_protocol_args(args: Namespace) -> List[str]: if args.u is None: # url cmd += ["-u", f"{DEFAULT_GRPC_URL}"] if args.output_format == OutputFormat.TENSORRTLLM: - cmd += ["--shape", "max_tokens:1", "--shape", "text_input:1"] + cmd += ["--shape", "input_ids:2000", "--shape", "input_lengths:1", "--shape", "request_output_len:1"] + # cmd += ["--shape", "max_tokens:1", "--shape", "text_input:1"] elif args.service_kind == "openai": cmd += ["-i", "http"] return cmd From 3deaa337423690874e51f492e9c664dad1007720 Mon Sep 17 00:00:00 2001 From: Izzy Putterman Date: Mon, 13 May 2024 13:51:00 -0700 Subject: [PATCH 2/5] Working with trtllm backend --- .../genai-perf/genai_perf/llm_inputs/llm_inputs.py | 4 ++-- .../genai-perf/genai_perf/llm_metrics.py | 14 ++++++++------ .../perf_analyzer/genai-perf/genai_perf/wrapper.py | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py index 17abae92c..db77f05d8 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py @@ -246,7 +246,7 @@ def _get_input_dataset_from_synthetic( prompt_tokens_stddev, ) # dataset_json["rows"].append({"row": {"text_input": synthetic_prompt}}) - dataset_json["rows"].append({"row": {"input_ids": prompt_tokens, "input_lengths": len(prompt_tokens)}}) + dataset_json["rows"].append({"row": {"input_ids": {"content": prompt_tokens, "shape": [len(prompt_tokens)]}, "input_lengths": [len(prompt_tokens)]}}) return dataset_json @@ -970,7 +970,7 @@ def _add_optional_tags_to_trtllm_json( if add_model_name: row["model"] = model_name if add_stream: - row["stream"] = [True] + row["streaming"] = [True] if output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN: number_of_tokens = int( random.gauss(output_tokens_mean, output_tokens_stddev) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py index 24fcb49f9..c84007aa4 100755 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py @@ -533,8 +533,9 @@ def _parse_requests(self, requests: dict) -> LLMMetrics: time_to_first_tokens.append(res_timestamps[0] - req_timestamp) # number of input tokens - 
input_tokens = self._tokenize_request_inputs(req_inputs) - num_input_tokens.append(len(input_tokens)) + # input_tokens = self._tokenize_request_inputs(req_inputs) + len_input_tokens = self._tokenize_request_inputs(req_inputs) + num_input_tokens.append(len_input_tokens) # output token throughput per request output_tokens = self._tokenize_response_outputs(res_outputs) @@ -623,8 +624,9 @@ def _tokenize_request_inputs(self, req_inputs: dict) -> List[int]: def _tokenize_triton_request_input(self, req_inputs: dict) -> List[int]: """Tokenize the Triton request input texts.""" - encodings = self._tokenizer(req_inputs["text_input"]) - return encodings.data["input_ids"] + return req_inputs['input_lengths'] + # encodings = self._tokenizer(req_inputs["text_input"]) + # return encodings.data["input_ids"] def _tokenize_openai_request_input(self, req_inputs: dict) -> List[int]: """Tokenize the OpenAI request input texts.""" @@ -653,8 +655,8 @@ def _tokenize_triton_response_output(self, res_outputs: dict) -> List[List[int]] """Tokenize the Triton response output texts.""" output_texts = [] for output in res_outputs: - output_texts.append(output["text_output"]) - return self._run_tokenizer(output_texts) + output_texts.append([output["output_ids"]]) + return output_texts def _tokenize_openai_response_output(self, res_outputs: dict) -> List[List[int]]: """Tokenize the OpenAI response output texts.""" diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py index 42ce69029..79c93ba11 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py @@ -45,7 +45,7 @@ def add_protocol_args(args: Namespace) -> List[str]: if args.u is None: # url cmd += ["-u", f"{DEFAULT_GRPC_URL}"] if args.output_format == OutputFormat.TENSORRTLLM: - cmd += ["--shape", "input_ids:2000", "--shape", "input_lengths:1", "--shape", "request_output_len:1"] + cmd += ["--shape", "input_lengths:1", "--shape", "request_output_len:1"] # cmd += ["--shape", "max_tokens:1", "--shape", "text_input:1"] elif args.service_kind == "openai": cmd += ["-i", "http"] From bd6d58661b308d28e3eadfc4a2f9e283b6890d0f Mon Sep 17 00:00:00 2001 From: Izzy Putterman Date: Mon, 13 May 2024 16:45:02 -0700 Subject: [PATCH 3/5] Create new trtllm_backend --- .../genai_perf/llm_inputs/llm_inputs.py | 170 ++++++++++++++++-- .../genai-perf/genai_perf/llm_metrics.py | 34 +++- .../genai-perf/genai_perf/wrapper.py | 3 +- 3 files changed, 180 insertions(+), 27 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py index db77f05d8..c3878b9c3 100644 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py @@ -37,6 +37,7 @@ class OutputFormat(Enum): OPENAI_CHAT_COMPLETIONS = auto() OPENAI_COMPLETIONS = auto() TENSORRTLLM = auto() + TENSORRTLLM_BACKEND = auto() VLLM = auto() def to_lowercase(self): @@ -160,6 +161,7 @@ def create_llm_inputs( elif input_type == PromptSource.SYNTHETIC: random.seed(random_seed) synthetic_dataset = cls._get_input_dataset_from_synthetic( + output_format, tokenizer, prompt_tokens_mean, prompt_tokens_stddev, @@ -230,14 +232,17 @@ def _get_input_dataset_from_url( @classmethod def _get_input_dataset_from_synthetic( cls, + output_format, tokenizer: Tokenizer, prompt_tokens_mean: int, prompt_tokens_stddev: int, 
num_of_output_prompts: int, ) -> Dict[str, Any]: dataset_json: Dict[str, Any] = {} - # dataset_json["features"] = [{"name": "text_input"}] - dataset_json["features"] = [{"name": "input_ids"}, {"name": "input_lengths"}] + if output_format != OutputFormat.TENSORRTLLM_BACKEND: + dataset_json["features"] = [{"name": "text_input"}] + else: + dataset_json["features"] = [{"name": "input_ids"}, {"name": "input_lengths"}] dataset_json["rows"] = [] for _ in range(num_of_output_prompts): synthetic_prompt, prompt_tokens = cls._create_synthetic_prompt( @@ -245,8 +250,10 @@ def _get_input_dataset_from_synthetic( prompt_tokens_mean, prompt_tokens_stddev, ) - # dataset_json["rows"].append({"row": {"text_input": synthetic_prompt}}) - dataset_json["rows"].append({"row": {"input_ids": {"content": prompt_tokens, "shape": [len(prompt_tokens)]}, "input_lengths": [len(prompt_tokens)]}}) + if output_format != OutputFormat.TENSORRTLLM_BACKEND: + dataset_json["rows"].append({"row": {"text_input": synthetic_prompt}}) + else: + dataset_json["rows"].append({"row": {"input_ids": {"content": prompt_tokens, "shape": [len(prompt_tokens)]}, "input_lengths": [len(prompt_tokens)]}}) return dataset_json @@ -402,6 +409,17 @@ def _convert_generic_json_to_output_format( output_tokens_deterministic, model_name, ) + elif output_format == OutputFormat.TENSORRTLLM_BACKEND: + output_json = cls._convert_generic_json_to_trtllm_backend_format( + generic_dataset, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + ) else: raise GenAIPerfException( f"Output format {output_format} is not currently supported" @@ -543,6 +561,40 @@ def _convert_generic_json_to_trtllm_format( return pa_json + @classmethod + def _convert_generic_json_to_trtllm_backend_format( + cls, + dataset_json: Dict, + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: str = "", + ) -> Dict: + ( + system_role_headers, + user_role_headers, + text_input_headers, + ) = cls._determine_json_feature_roles(dataset_json) + + pa_json = cls._populate_trtllm_output_json( + dataset_json, + system_role_headers, + user_role_headers, + text_input_headers, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + ) + + return pa_json + @classmethod def _write_json_to_file(cls, json_in_pa_format: Dict, output_dir: Path) -> None: filename = output_dir / DEFAULT_INPUT_DATA_JSON @@ -735,20 +787,20 @@ def _populate_trtllm_output_json( ) for index, entry in enumerate(dataset_json["rows"]): - pa_json["data"].append({"input_ids": entry['input_ids'], "input_lengths": entry['input_lengths']}) + pa_json["data"].append({"text_input": [""]}) - # for header, content in entry.items(): - # new_text_input = cls._create_new_text_input( - # header, - # system_role_headers, - # user_role_headers, - # text_input_headers, - # content, - # ) + for header, content in entry.items(): + new_text_input = cls._create_new_text_input( + header, + system_role_headers, + user_role_headers, + text_input_headers, + content, + ) - # pa_json = cls._add_new_text_input_to_json( - # pa_json, index, new_text_input - # ) + pa_json = cls._add_new_text_input_to_json( + pa_json, index, new_text_input + ) pa_json = cls._add_required_tags_to_trtllm_json( pa_json, index, default_max_tokens @@ -767,6 +819,46 @@ def _populate_trtllm_output_json( return 
pa_json + @classmethod + def _populate_trtllm_backend_output_json( + cls, + dataset_json: Dict, + system_role_headers: List[str], + user_role_headers: List[str], + text_input_headers: List[str], + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: str = "", + ) -> Dict: + pa_json = cls._create_empty_trtllm_pa_json() + default_max_tokens = ( + "max_tokens" not in extra_inputs + or output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN + ) + + for index, entry in enumerate(dataset_json["rows"]): + pa_json["data"].append({"input_ids": entry['input_ids'], "input_lengths": entry['input_lengths']}) + pa_json = cls._add_required_tags_to_trtllm_backend_json( + pa_json, index, default_max_tokens + ) + pa_json = cls._add_optional_tags_to_trtllm_backend_json( + pa_json, + index, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + ) + + return pa_json + @classmethod def _create_empty_openai_pa_json(cls) -> Dict: empty_pa_json = deepcopy(cls.EMPTY_JSON_IN_OPENAI_PA_FORMAT) @@ -965,6 +1057,49 @@ def _add_optional_tags_to_trtllm_json( output_tokens_stddev: int, output_tokens_deterministic: bool, model_name: str = "", + ) -> Dict: + row = pa_json["data"][index] + if add_model_name: + row["model"] = model_name + if add_stream: + row["stream"] = [True] + if output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN: + number_of_tokens = int( + random.gauss(output_tokens_mean, output_tokens_stddev) + ) + if output_tokens_deterministic: + row["min_length"] = [number_of_tokens] + row["max_tokens"] = [number_of_tokens] + for key, value in extra_inputs.items(): + row[key] = [value] + + return pa_json + + @classmethod + def _add_required_tags_to_trtllm_json( + cls, + pa_json: Dict, + index: int, + default_max_tokens: bool, + ) -> Dict: + row = pa_json["data"][index] + if default_max_tokens: + row["max_tokens"] = [cls.DEFAULT_TENSORRTLLM_MAX_TOKENS] + + return pa_json + + @classmethod + def _add_optional_tags_to_trtllm_backend_json( + cls, + pa_json: Dict, + index: int, + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: str = "", ) -> Dict: row = pa_json["data"][index] if add_model_name: @@ -977,7 +1112,6 @@ def _add_optional_tags_to_trtllm_json( ) if output_tokens_deterministic: row["min_length"] = [number_of_tokens] - row["input_lengths"] = [2000] row["request_output_len"] = [number_of_tokens] for key, value in extra_inputs.items(): row[key] = [value] @@ -985,7 +1119,7 @@ def _add_optional_tags_to_trtllm_json( return pa_json @classmethod - def _add_required_tags_to_trtllm_json( + def _add_required_tags_to_trtllm_backend_json( cls, pa_json: Dict, index: int, diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py index c84007aa4..f7af08553 100755 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py @@ -45,6 +45,7 @@ class ResponseFormat(Enum): OPENAI_CHAT_COMPLETIONS = auto() OPENAI_COMPLETIONS = auto() TRITON = auto() + TENSORRTLLM_BACKEND = auto() class Metrics: @@ -428,11 +429,15 @@ def _get_profile_metadata(self, data: dict) -> None: self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS elif "text_completion" in response: 
                self._response_format = ResponseFormat.OPENAI_COMPLETIONS
+            elif "input_ids" in response:
+                self._response_format = ResponseFormat.OPENAI_COMPLETIONS
             else:
                 raise RuntimeError("Unknown OpenAI response format.")
 
         elif self._service_kind == "triton":
             self._response_format = ResponseFormat.TRITON
+            if "input_ids" in data["experiments"][0]["requests"][0]['request_inputs']:
+                self._response_format = ResponseFormat.TENSORRTLLM_BACKEND
         else:
             raise ValueError(f"Unknown service kind: {self._service_kind}")
 
@@ -615,18 +620,23 @@ def _preprocess_response(
 
     def _tokenize_request_inputs(self, req_inputs: dict) -> List[int]:
         """Deserialize the request input and return tokenized inputs."""
-        if self._service_kind == "triton":
-            return self._tokenize_triton_request_input(req_inputs)
+        if self._service_kind == "triton" and self._response_format == ResponseFormat.TENSORRTLLM_BACKEND:
+            return self._tokenize_trtllm_request_input(req_inputs)
+        elif self._service_kind == "triton":
+            return len(self._tokenize_triton_request_input(req_inputs))
         elif self._service_kind == "openai":
-            return self._tokenize_openai_request_input(req_inputs)
+            return len(self._tokenize_openai_request_input(req_inputs))
         else:
             raise ValueError(f"Unknown service kind: '{self._service_kind}'.")
 
+    def _tokenize_trtllm_request_input(self, req_inputs: dict) -> List[int]:
+        """Retrieve the token lengths of the input."""
+        return req_inputs['input_lengths']
+
     def _tokenize_triton_request_input(self, req_inputs: dict) -> List[int]:
         """Tokenize the Triton request input texts."""
-        return req_inputs['input_lengths']
-        # encodings = self._tokenizer(req_inputs["text_input"])
-        # return encodings.data["input_ids"]
+        encodings = self._tokenizer(req_inputs["text_input"])
+        return encodings.data["input_ids"]
 
     def _tokenize_openai_request_input(self, req_inputs: dict) -> List[int]:
         """Tokenize the OpenAI request input texts."""
@@ -644,20 +654,29 @@
 
     def _tokenize_response_outputs(self, res_outputs: dict) -> List[List[int]]:
         """Deserialize the response output and return tokenized outputs."""
-        if self._service_kind == "triton":
+        if self._service_kind == "triton" and self._response_format == ResponseFormat.TENSORRTLLM_BACKEND:
+            return self._tokenize_trtllm_response_output(req_inputs)
+        elif self._service_kind == "triton":
             return self._tokenize_triton_response_output(res_outputs)
         elif self._service_kind == "openai":
             return self._tokenize_openai_response_output(res_outputs)
         else:
             raise ValueError(f"Unknown service kind: '{self._service_kind}'.")
 
-    def _tokenize_triton_response_output(self, res_outputs: dict) -> List[List[int]]:
+    def _tokenize_trtllm_response_output(self, res_outputs: dict) -> List[List[int]]:
         """Tokenize the Triton response output texts."""
         output_texts = []
         for output in res_outputs:
             output_texts.append([output["output_ids"]])
         return output_texts
 
+    def _tokenize_triton_response_output(self, res_outputs: dict) -> List[List[int]]:
+        """Tokenize the Triton response output texts."""
+        output_texts = []
+        for output in res_outputs:
+            output_texts.append(output["text_output"])
+        return self._run_tokenizer(output_texts)
+
     def _tokenize_openai_response_output(self, res_outputs: dict) -> List[List[int]]:
         """Tokenize the OpenAI response output texts."""
         output_texts = []
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
index 79c93ba11..48bd50939 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
+++ 
b/src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py @@ -45,8 +45,9 @@ def add_protocol_args(args: Namespace) -> List[str]: if args.u is None: # url cmd += ["-u", f"{DEFAULT_GRPC_URL}"] if args.output_format == OutputFormat.TENSORRTLLM: + cmd += ["--shape", "max_tokens:1", "--shape", "text_input:1"] + elif args.output_format == OutputFormat.TENSORRTLLM_BACKEND: cmd += ["--shape", "input_lengths:1", "--shape", "request_output_len:1"] - # cmd += ["--shape", "max_tokens:1", "--shape", "text_input:1"] elif args.service_kind == "openai": cmd += ["-i", "http"] return cmd From 461accd3bfc5bbb5f4be0fbb335d8626bb5d044f Mon Sep 17 00:00:00 2001 From: Izzy Putterman Date: Mon, 13 May 2024 16:51:23 -0700 Subject: [PATCH 4/5] Some cleanup --- .../genai-perf/genai_perf/llm_metrics.py | 3 --- src/c++/perf_analyzer/genai-perf/genai_perf/main.py | 13 +++++++------ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py index f7af08553..0928c75a9 100755 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py @@ -429,8 +429,6 @@ def _get_profile_metadata(self, data: dict) -> None: self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS elif "text_completion" in response: self._response_format = ResponseFormat.OPENAI_COMPLETIONS - elif "input_ids" in response: - self._response_format = ResponseFormat.OPENAI_COMPLETIONS else: raise RuntimeError("Unknown OpenAI response format.") @@ -538,7 +536,6 @@ def _parse_requests(self, requests: dict) -> LLMMetrics: time_to_first_tokens.append(res_timestamps[0] - req_timestamp) # number of input tokens - # input_tokens = self._tokenize_request_inputs(req_inputs) len_input_tokens = self._tokenize_request_inputs(req_inputs) num_input_tokens.append(len_input_tokens) diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/main.py b/src/c++/perf_analyzer/genai-perf/genai_perf/main.py index 7642102d7..04dcc799e 100755 --- a/src/c++/perf_analyzer/genai-perf/genai_perf/main.py +++ b/src/c++/perf_analyzer/genai-perf/genai_perf/main.py @@ -145,12 +145,13 @@ def run(): def main(): # Interactive use will catch exceptions and log formatted errors rather than # tracebacks. 
-    run()
-    # except Exception as e:
-    #     traceback.print_exc()
-    #     logger = logging.getLogger(__name__)
-    #     logger.error(e)
-    #     return 1
+    try:
+        run()
+    except Exception as e:
+        traceback.print_exc()
+        logger = logging.getLogger(__name__)
+        logger.error(e)
+        return 1
 
     return 0
 

From 93405bf6a26a8bd63bf5a43ec2e7eef0b3f615f4 Mon Sep 17 00:00:00 2001
From: Izzy Putterman
Date: Mon, 13 May 2024 17:22:20 -0700
Subject: [PATCH 5/5] Bug fixes

---
 .../genai-perf/genai_perf/llm_inputs/llm_inputs.py | 4 ++--
 src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
index c3878b9c3..1431e9a65 100644
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
@@ -579,7 +579,7 @@ def _convert_generic_json_to_trtllm_backend_format(
             text_input_headers,
         ) = cls._determine_json_feature_roles(dataset_json)
 
-        pa_json = cls._populate_trtllm_output_json(
+        pa_json = cls._populate_trtllm_backend_output_json(
             dataset_json,
             system_role_headers,
             user_role_headers,
@@ -819,7 +819,7 @@ def _populate_trtllm_output_json(
 
         return pa_json
 
-    @classmethod
+    @classmethod
     def _populate_trtllm_backend_output_json(
         cls,
         dataset_json: Dict,
diff --git a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py
index 0928c75a9..4856a5d82 100755
--- a/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py
+++ b/src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py
@@ -652,7 +652,7 @@ def _tokenize_openai_request_input(self, req_inputs: dict) -> List[int]:
 
     def _tokenize_response_outputs(self, res_outputs: dict) -> List[List[int]]:
         """Deserialize the response output and return tokenized outputs."""
         if self._service_kind == "triton" and self._response_format == ResponseFormat.TENSORRTLLM_BACKEND:
-            return self._tokenize_trtllm_response_output(req_inputs)
+            return self._tokenize_trtllm_response_output(res_outputs)
         elif self._service_kind == "triton":
             return self._tokenize_triton_response_output(res_outputs)
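
A note on the resulting request format: with the full series applied, synthetic
prompts for the TENSORRTLLM_BACKEND output format are exported as pre-tokenized
tensors rather than raw text. Below is a minimal sketch of one entry that
_populate_trtllm_backend_output_json builds before it is serialized to the
perf_analyzer input-data JSON. The token IDs and the request_output_len value
are illustrative placeholders, not output from a real tokenizer run, and the
"streaming" field appears only when streaming is requested:

    # One generated payload entry (illustrative values).
    pa_json = {
        "data": [
            {
                # Pre-tokenized prompt plus its tensor shape.
                "input_ids": {"content": [101, 7592, 2088, 102], "shape": [4]},
                # Scalar tensor holding the prompt length.
                "input_lengths": [4],
                # Output budget; defaults to DEFAULT_TENSORRTLLM_MAX_TOKENS.
                "request_output_len": [256],
                # Emitted only when add_stream is set.
                "streaming": [True],
            }
        ]
    }

Because each "input_ids" entry carries its own per-row shape, only the scalar
tensors need fixed shapes on the perf_analyzer command line, which is why this
format passes --shape input_lengths:1 --shape request_output_len:1 in place of
the old --shape max_tokens:1 --shape text_input:1.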