Create new trtllm_backend
IzzyPutterman committed May 14, 2024
1 parent a72d340 commit ee960ee
Showing 3 changed files with 180 additions and 27 deletions.
170 changes: 152 additions & 18 deletions src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
@@ -37,6 +37,7 @@ class OutputFormat(Enum):
    OPENAI_CHAT_COMPLETIONS = auto()
    OPENAI_COMPLETIONS = auto()
    TENSORRTLLM = auto()
    TENSORRTLLM_BACKEND = auto()
    VLLM = auto()

    def to_lowercase(self):
@@ -160,6 +161,7 @@ def create_llm_inputs(
    elif input_type == PromptSource.SYNTHETIC:
        random.seed(random_seed)
        synthetic_dataset = cls._get_input_dataset_from_synthetic(
            output_format,
            tokenizer,
            prompt_tokens_mean,
            prompt_tokens_stddev,
@@ -230,23 +232,28 @@ def _get_input_dataset_from_url(
@classmethod
def _get_input_dataset_from_synthetic(
    cls,
    output_format: OutputFormat,
    tokenizer: Tokenizer,
    prompt_tokens_mean: int,
    prompt_tokens_stddev: int,
    num_of_output_prompts: int,
) -> Dict[str, Any]:
    dataset_json: Dict[str, Any] = {}
    if output_format != OutputFormat.TENSORRTLLM_BACKEND:
        dataset_json["features"] = [{"name": "text_input"}]
    else:
        dataset_json["features"] = [{"name": "input_ids"}, {"name": "input_lengths"}]
    dataset_json["rows"] = []
    for _ in range(num_of_output_prompts):
        synthetic_prompt, prompt_tokens = cls._create_synthetic_prompt(
            tokenizer,
            prompt_tokens_mean,
            prompt_tokens_stddev,
        )
        if output_format != OutputFormat.TENSORRTLLM_BACKEND:
            dataset_json["rows"].append({"row": {"text_input": synthetic_prompt}})
        else:
            dataset_json["rows"].append(
                {
                    "row": {
                        "input_ids": {
                            "content": prompt_tokens,
                            "shape": [len(prompt_tokens)],
                        },
                        "input_lengths": [len(prompt_tokens)],
                    }
                }
            )

    return dataset_json
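
For reference, the two dataset shapes this method can emit look roughly as follows (a sketch; the prompt text and token IDs are invented illustrative values, not real output):

# Sketch of the two dataset layouts produced above (values are illustrative).
text_dataset = {
    "features": [{"name": "text_input"}],
    "rows": [{"row": {"text_input": "a synthetic prompt"}}],
}
backend_dataset = {
    "features": [{"name": "input_ids"}, {"name": "input_lengths"}],
    "rows": [
        {
            "row": {
                "input_ids": {"content": [101, 7592, 102], "shape": [3]},
                "input_lengths": [3],
            }
        }
    ],
}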

@@ -402,6 +409,17 @@ def _convert_generic_json_to_output_format(
            output_tokens_deterministic,
            model_name,
        )
    elif output_format == OutputFormat.TENSORRTLLM_BACKEND:
        output_json = cls._convert_generic_json_to_trtllm_backend_format(
            generic_dataset,
            add_model_name,
            add_stream,
            extra_inputs,
            output_tokens_mean,
            output_tokens_stddev,
            output_tokens_deterministic,
            model_name,
        )
    else:
        raise GenAIPerfException(
            f"Output format {output_format} is not currently supported"
@@ -543,6 +561,40 @@ def _convert_generic_json_to_trtllm_format(

    return pa_json

@classmethod
def _convert_generic_json_to_trtllm_backend_format(
    cls,
    dataset_json: Dict,
    add_model_name: bool,
    add_stream: bool,
    extra_inputs: Dict,
    output_tokens_mean: int,
    output_tokens_stddev: int,
    output_tokens_deterministic: bool,
    model_name: str = "",
) -> Dict:
    (
        system_role_headers,
        user_role_headers,
        text_input_headers,
    ) = cls._determine_json_feature_roles(dataset_json)

    pa_json = cls._populate_trtllm_backend_output_json(
        dataset_json,
        system_role_headers,
        user_role_headers,
        text_input_headers,
        add_model_name,
        add_stream,
        extra_inputs,
        output_tokens_mean,
        output_tokens_stddev,
        output_tokens_deterministic,
        model_name,
    )

    return pa_json

@classmethod
def _write_json_to_file(cls, json_in_pa_format: Dict, output_dir: Path) -> None:
    filename = output_dir / DEFAULT_INPUT_DATA_JSON
@@ -735,20 +787,20 @@ def _populate_trtllm_output_json(
        )

    for index, entry in enumerate(dataset_json["rows"]):
        pa_json["data"].append({"text_input": [""]})

        for header, content in entry.items():
            new_text_input = cls._create_new_text_input(
                header,
                system_role_headers,
                user_role_headers,
                text_input_headers,
                content,
            )

            pa_json = cls._add_new_text_input_to_json(
                pa_json, index, new_text_input
            )

        pa_json = cls._add_required_tags_to_trtllm_json(
            pa_json, index, default_max_tokens
@@ -767,6 +819,46 @@ def _populate_trtllm_output_json(

    return pa_json

@classmethod
def _populate_trtllm_backend_output_json(
    cls,
    dataset_json: Dict,
    system_role_headers: List[str],
    user_role_headers: List[str],
    text_input_headers: List[str],
    add_model_name: bool,
    add_stream: bool,
    extra_inputs: Dict,
    output_tokens_mean: int,
    output_tokens_stddev: int,
    output_tokens_deterministic: bool,
    model_name: str = "",
) -> Dict:
    pa_json = cls._create_empty_trtllm_pa_json()
    default_max_tokens = (
        "max_tokens" not in extra_inputs
        or output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN
    )

    for index, entry in enumerate(dataset_json["rows"]):
        pa_json["data"].append(
            {"input_ids": entry["input_ids"], "input_lengths": entry["input_lengths"]}
        )
        pa_json = cls._add_required_tags_to_trtllm_backend_json(
            pa_json, index, default_max_tokens
        )
        pa_json = cls._add_optional_tags_to_trtllm_backend_json(
            pa_json,
            index,
            add_model_name,
            add_stream,
            extra_inputs,
            output_tokens_mean,
            output_tokens_stddev,
            output_tokens_deterministic,
            model_name,
        )

    return pa_json
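
The payload handed to perf_analyzer on this path carries token IDs rather than text. A single "data" entry would look roughly like this (a sketch; the token values are invented and the 256 default is an assumption about DEFAULT_TENSORRTLLM_MAX_TOKENS):

# Illustrative shape of one generated entry (values are assumptions).
example_entry = {
    "input_ids": {"content": [101, 7592, 102], "shape": [3]},
    "input_lengths": [3],
    "max_tokens": [256],  # added by _add_required_tags_to_trtllm_backend_json
    "stream": [True],  # added only when streaming is requested
}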

@classmethod
def _create_empty_openai_pa_json(cls) -> Dict:
    empty_pa_json = deepcopy(cls.EMPTY_JSON_IN_OPENAI_PA_FORMAT)
@@ -965,6 +1057,49 @@ def _add_optional_tags_to_trtllm_json(
    output_tokens_stddev: int,
    output_tokens_deterministic: bool,
    model_name: str = "",
) -> Dict:
    row = pa_json["data"][index]
    if add_model_name:
        row["model"] = model_name
    if add_stream:
        row["stream"] = [True]
    if output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN:
        number_of_tokens = int(
            random.gauss(output_tokens_mean, output_tokens_stddev)
        )
        if output_tokens_deterministic:
            row["min_length"] = [number_of_tokens]
        row["max_tokens"] = [number_of_tokens]
    for key, value in extra_inputs.items():
        row[key] = [value]

    return pa_json

@classmethod
def _add_required_tags_to_trtllm_json(
    cls,
    pa_json: Dict,
    index: int,
    default_max_tokens: bool,
) -> Dict:
    row = pa_json["data"][index]
    if default_max_tokens:
        row["max_tokens"] = [cls.DEFAULT_TENSORRTLLM_MAX_TOKENS]

    return pa_json

@classmethod
def _add_optional_tags_to_trtllm_backend_json(
    cls,
    pa_json: Dict,
    index: int,
    add_model_name: bool,
    add_stream: bool,
    extra_inputs: Dict,
    output_tokens_mean: int,
    output_tokens_stddev: int,
    output_tokens_deterministic: bool,
    model_name: str = "",
) -> Dict:
    row = pa_json["data"][index]
    if add_model_name:
@@ -977,15 +1112,14 @@ def _add_optional_tags_to_trtllm_json(
        )
        if output_tokens_deterministic:
            row["min_length"] = [number_of_tokens]
        row["input_lengths"] = [2000]
        row["request_output_len"] = [number_of_tokens]
    for key, value in extra_inputs.items():
        row[key] = [value]

    return pa_json
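
When a non-default output token mean is requested, the backend row gains the sampled length controls set above; the effect is roughly this (a sketch with invented values; note the hardcoded input_lengths overwrite in the code above):

# E.g. output_tokens_mean=128, output_tokens_stddev=0, output_tokens_deterministic=True:
row = {
    "input_ids": {"content": [101, 7592, 102], "shape": [3]},
    "input_lengths": [2000],  # overwritten by the hardcoded value above
    "min_length": [128],  # only when output_tokens_deterministic is set
    "request_output_len": [128],
}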

@classmethod
def _add_required_tags_to_trtllm_backend_json(
    cls,
    pa_json: Dict,
    index: int,
34 changes: 26 additions & 8 deletions src/c++/perf_analyzer/genai-perf/genai_perf/llm_metrics.py
@@ -45,6 +45,7 @@ class ResponseFormat(Enum):
    OPENAI_CHAT_COMPLETIONS = auto()
    OPENAI_COMPLETIONS = auto()
    TRITON = auto()
    TENSORRTLLM_BACKEND = auto()


class Metrics:
@@ -428,11 +429,15 @@ def _get_profile_metadata(self, data: dict) -> None:
            self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS
        elif "text_completion" in response:
            self._response_format = ResponseFormat.OPENAI_COMPLETIONS
        elif "input_ids" in response:
            self._response_format = ResponseFormat.OPENAI_COMPLETIONS
        else:
            raise RuntimeError("Unknown OpenAI response format.")

    elif self._service_kind == "triton":
        self._response_format = ResponseFormat.TRITON
        if "input_ids" in data["experiments"][0]["requests"][0]["request_inputs"]:
            self._response_format = ResponseFormat.TENSORRTLLM_BACKEND
    else:
        raise ValueError(f"Unknown service kind: {self._service_kind}")
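
In effect, a Triton profile export whose first recorded request carries an input_ids tensor is reclassified as TRT-LLM backend traffic; the lookup reduces to roughly this (the data layout is taken from the code above):

# Sketch: the first request of the first experiment decides the format.
first_request = data["experiments"][0]["requests"][0]
is_trtllm_backend = "input_ids" in first_request["request_inputs"]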

@@ -615,18 +620,23 @@ def _preprocess_response(

def _tokenize_request_inputs(self, req_inputs: dict) -> int:
    """Deserialize the request input and return the input token count."""
    if (
        self._service_kind == "triton"
        and self._response_format == ResponseFormat.TENSORRTLLM_BACKEND
    ):
        return self._tokenize_trtllm_request_input(req_inputs)
    elif self._service_kind == "triton":
        return len(self._tokenize_triton_request_input(req_inputs))
    elif self._service_kind == "openai":
        return len(self._tokenize_openai_request_input(req_inputs))
    else:
        raise ValueError(f"Unknown service kind: '{self._service_kind}'.")

def _tokenize_trtllm_request_input(self, req_inputs: dict) -> List[int]:
    """Retrieve the pre-computed token lengths of the input."""
    return req_inputs["input_lengths"]

def _tokenize_triton_request_input(self, req_inputs: dict) -> List[int]:
    """Tokenize the Triton request input texts."""
    encodings = self._tokenizer(req_inputs["text_input"])
    return encodings.data["input_ids"]

def _tokenize_openai_request_input(self, req_inputs: dict) -> List[int]:
    """Tokenize the OpenAI request input texts."""
@@ -644,20 +654,28 @@ def _tokenize_openai_request_input(self, req_inputs: dict) -> List[int]:

def _tokenize_response_outputs(self, res_outputs: dict) -> List[List[int]]:
    """Deserialize the response output and return tokenized outputs."""
    if (
        self._service_kind == "triton"
        and self._response_format == ResponseFormat.TENSORRTLLM_BACKEND
    ):
        return self._tokenize_trtllm_response_output(res_outputs)
    elif self._service_kind == "triton":
        return self._tokenize_triton_response_output(res_outputs)
    elif self._service_kind == "openai":
        return self._tokenize_openai_response_output(res_outputs)
    else:
        raise ValueError(f"Unknown service kind: '{self._service_kind}'.")

def _tokenize_trtllm_response_output(self, res_outputs: dict) -> List[List[int]]:
    """Return the output token IDs recorded in the TRT-LLM backend responses."""
    output_ids = []
    for output in res_outputs:
        output_ids.append([output["output_ids"]])
    return output_ids

def _tokenize_triton_response_output(self, res_outputs: dict) -> List[List[int]]:
    """Tokenize the Triton response output texts."""
    output_texts = []
    for output in res_outputs:
        output_texts.append(output["text_output"])
    return self._run_tokenizer(output_texts)

def _tokenize_openai_response_output(self, res_outputs: dict) -> List[List[int]]:
    """Tokenize the OpenAI response output texts."""
    output_texts = []
3 changes: 2 additions & 1 deletion src/c++/perf_analyzer/genai-perf/genai_perf/wrapper.py
@@ -45,8 +45,9 @@ def add_protocol_args(args: Namespace) -> List[str]:
        if args.u is None:  # url
            cmd += ["-u", f"{DEFAULT_GRPC_URL}"]
        if args.output_format == OutputFormat.TENSORRTLLM:
            cmd += ["--shape", "max_tokens:1", "--shape", "text_input:1"]
        elif args.output_format == OutputFormat.TENSORRTLLM_BACKEND:
            cmd += ["--shape", "input_lengths:1", "--shape", "request_output_len:1"]
    elif args.service_kind == "openai":
        cmd += ["-i", "http"]
    return cmd
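
Putting it together: for the TENSORRTLLM_BACKEND output format, the wrapper would build a perf_analyzer invocation along these lines (a sketch; the model name and URL are placeholders, and the "-i grpc" portion is assumed from the surrounding wrapper code rather than shown in this diff):

# Illustrative flag list for the TENSORRTLLM_BACKEND path (values assumed).
cmd = [
    "perf_analyzer", "-m", "my_model",  # hypothetical model name
    "-i", "grpc",  # assumed Triton service-kind default
    "-u", "localhost:8001",  # stand-in for DEFAULT_GRPC_URL
    "--shape", "input_lengths:1",
    "--shape", "request_output_len:1",
]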
