Commit 4770f2b: Merge branch 'main' into dyas-system-prompts

dyastremsky authored Dec 10, 2024
2 parents 457647a + d8998a0

Showing 11 changed files with 258 additions and 78 deletions.
19 changes: 15 additions & 4 deletions genai-perf/docs/tutorial.md
@@ -55,14 +55,17 @@ Run GenAI-Perf inside the Triton Inference Server SDK container:
```bash
genai-perf profile \
-m gpt2 \
+--tokenizer gpt2 \
--service-kind triton \
--backend tensorrtllm \
--synthetic-input-tokens-mean 200 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 100 \
--output-tokens-stddev 0 \
--output-tokens-mean-deterministic \
---streaming
+--streaming \
+--request-count 50 \
+--warmup-request-count 10
```

Example output:
@@ -94,14 +97,17 @@ Run GenAI-Perf inside the Triton Inference Server SDK container:
```bash
genai-perf profile \
-m gpt2 \
+--tokenizer gpt2 \
--service-kind triton \
--backend vllm \
--synthetic-input-tokens-mean 200 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 100 \
--output-tokens-stddev 0 \
--output-tokens-mean-deterministic \
---streaming
+--streaming \
+--request-count 50 \
+--warmup-request-count 10
```

Example output:
@@ -136,14 +142,16 @@ Run GenAI-Perf inside the Triton Inference Server SDK container:
```bash
genai-perf profile \
-m HuggingFaceH4/zephyr-7b-beta \
+--tokenizer HuggingFaceH4/zephyr-7b-beta \
--service-kind openai \
--endpoint-type chat \
--synthetic-input-tokens-mean 200 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 100 \
--output-tokens-stddev 0 \
--streaming \
---tokenizer HuggingFaceH4/zephyr-7b-beta
+--request-count 50 \
+--warmup-request-count 10
```

Example output:
@@ -178,12 +186,15 @@ Run GenAI-Perf inside the Triton Inference Server SDK container:
```bash
genai-perf profile \
-m gpt2 \
+--tokenizer gpt2 \
--service-kind openai \
--endpoint-type completions \
--synthetic-input-tokens-mean 200 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 100 \
---output-tokens-stddev 0
+--output-tokens-stddev 0 \
+--request-count 50 \
+--warmup-request-count 10
```

Example output:
2 changes: 2 additions & 0 deletions genai-perf/genai_perf/inputs/converters/__init__.py
@@ -32,6 +32,7 @@
from .rankings_converter import RankingsConverter
from .tensorrtllm_converter import TensorRTLLMConverter
from .tensorrtllm_engine_converter import TensorRTLLMEngineConverter
+from .triton_generate_converter import TritonGenerateConverter
from .vllm_converter import VLLMConverter

__all__ = [
@@ -44,4 +45,5 @@
"TensorRTLLMConverter",
"TensorRTLLMEngineConverter",
"VLLMConverter",
"TritonGenerateConverter",
]
@@ -48,6 +48,7 @@ def create(output_format: OutputFormat):
        OutputFormat.TENSORRTLLM: TensorRTLLMConverter,
        OutputFormat.TENSORRTLLM_ENGINE: TensorRTLLMEngineConverter,
        OutputFormat.VLLM: VLLMConverter,
+        OutputFormat.TRITON_GENERATE: TritonGenerateConverter,
    }
    if output_format not in converters:
        raise GenAIPerfException(f"Output format {output_format} is not supported")
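For illustration, a minimal sketch (not part of the diff) of the lookup this new mapping enables; it assumes the `converters` dict and imports defined above, and `create`'s return statement falls outside the shown hunk:

```python
# Sketch only: the membership test create() performs. With the new entry,
# OutputFormat.TRITON_GENERATE resolves instead of raising GenAIPerfException.
requested = OutputFormat.TRITON_GENERATE
if requested in converters:
    converter_cls = converters[requested]  # TritonGenerateConverter
```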
genai-perf/genai_perf/inputs/converters/triton_generate_converter.py (new file)
@@ -0,0 +1,68 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from typing import Any, Dict

from genai_perf.inputs.converters.base_converter import BaseConverter
from genai_perf.inputs.input_constants import DEFAULT_OUTPUT_TOKENS_MEAN
from genai_perf.inputs.inputs_config import InputsConfig
from genai_perf.inputs.retrievers.generic_dataset import GenericDataset
from genai_perf.utils import sample_bounded_normal


class TritonGenerateConverter(BaseConverter):
    def convert(
        self,
        generic_dataset: GenericDataset,
        config: InputsConfig,
    ) -> Dict[Any, Any]:
        request_body: Dict[str, Any] = {"data": []}

        for file_data in generic_dataset.files_data.values():
            for _, row in enumerate(file_data.rows):
                prompt = row.texts

                payload = {
                    "text_input": prompt,
                }
                self._add_request_params(payload, config)
                request_body["data"].append({"payload": [payload]})

        return request_body

    def _add_request_params(self, payload: Dict, config: InputsConfig) -> None:
        if config.add_stream:
            payload["stream"] = True
        if config.output_tokens_mean != DEFAULT_OUTPUT_TOKENS_MEAN:
            payload["max_tokens"] = int(
                sample_bounded_normal(
                    mean=config.output_tokens_mean,
                    stddev=config.output_tokens_stddev,
                    lower=1,  # output token must be >= 1
                )
            )
        for key, value in config.extra_inputs.items():
            payload[key] = value
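As a hedged sketch of what this converter emits, assuming a one-row dataset, `add_stream=True`, `output_tokens_mean=100`, and `output_tokens_stddev=0` (so the sampled `max_tokens` is exactly 100); the prompt text is invented for the example:

```python
# Shape of the request body produced by convert() under the assumptions above.
example_request_body = {
    "data": [
        {
            "payload": [
                {
                    "text_input": ["What is Triton Inference Server?"],  # row.texts (shape depends on the dataset retriever)
                    "stream": True,     # set because config.add_stream is True
                    "max_tokens": 100,  # int(sample_bounded_normal(...)), floored at 1
                }
            ]
        }
    ]
}
```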
1 change: 1 addition & 0 deletions genai-perf/genai_perf/inputs/input_constants.py
@@ -55,6 +55,7 @@ class OutputFormat(Enum):
    OPENAI_VISION = auto()
    RANKINGS = auto()
    TENSORRTLLM_ENGINE = auto()
+    TRITON_GENERATE = auto()

    def to_lowercase(self):
        return self.name.lower()
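A quick illustrative check (assumed usage, not from the diff) that the new member behaves like its siblings with the helper defined beside it:

```python
# to_lowercase() returns self.name.lower(), so:
assert OutputFormat.TRITON_GENERATE.to_lowercase() == "triton_generate"
```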
114 changes: 67 additions & 47 deletions genai-perf/genai_perf/parser.py
@@ -27,6 +27,7 @@
import argparse
import os
import sys
+from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path
from typing import Optional, Tuple
@@ -63,14 +64,39 @@ def to_lowercase(self):

logger = logging.getLogger(__name__)


+@dataclass
+class EndpointConfig:
+    endpoint: Optional[str]
+    service_kind: str
+    output_format: ic.OutputFormat
+
+
_endpoint_type_map = {
-    "chat": "v1/chat/completions",
-    "completions": "v1/completions",
-    "embeddings": "v1/embeddings",
-    "image_retrieval": "v1/infer",
-    "nvclip": "v1/embeddings",
-    "rankings": "v1/ranking",
-    "vision": "v1/chat/completions",
+    "chat": EndpointConfig(
+        "v1/chat/completions", "openai", ic.OutputFormat.OPENAI_CHAT_COMPLETIONS
+    ),
+    "completions": EndpointConfig(
+        "v1/completions", "openai", ic.OutputFormat.OPENAI_COMPLETIONS
+    ),
+    "embeddings": EndpointConfig(
+        "v1/embeddings", "openai", ic.OutputFormat.OPENAI_EMBEDDINGS
+    ),
+    "image_retrieval": EndpointConfig(
+        "v1/infer", "openai", ic.OutputFormat.IMAGE_RETRIEVAL
+    ),
+    "nvclip": EndpointConfig("v1/embeddings", "openai", ic.OutputFormat.NVCLIP),
+    "rankings": EndpointConfig("v1/ranking", "openai", ic.OutputFormat.RANKINGS),
+    "vision": EndpointConfig(
+        "v1/chat/completions", "openai", ic.OutputFormat.OPENAI_VISION
+    ),
+    "generate": EndpointConfig(
+        "v2/models/{MODEL_NAME}/generate", "triton", ic.OutputFormat.TRITON_GENERATE
+    ),
+    "kserve": EndpointConfig(None, "triton", ic.OutputFormat.TENSORRTLLM),
+    "tensorrtllm_engine": EndpointConfig(
+        None, "tensorrtllm_engine", ic.OutputFormat.TENSORRTLLM_ENGINE
+    ),
}
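For illustration, a short sketch of how the one templated entry in this map resolves; "gpt2" is an assumed model name, and the `.format()` call mirrors the one in `_check_conditional_args` below:

```python
# Sketch only: resolving the templated "generate" endpoint.
generate_config = _endpoint_type_map["generate"]
assert generate_config.service_kind == "triton"
assert generate_config.endpoint.format(MODEL_NAME="gpt2") == "v2/models/gpt2/generate"
```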


@@ -141,43 +167,47 @@ def _check_conditional_args(
            parser.error(
                "The --endpoint-type option is required when using the 'openai' service-kind."
            )
-        else:
-            if args.endpoint_type == "chat":
-                args.output_format = ic.OutputFormat.OPENAI_CHAT_COMPLETIONS
-            elif args.endpoint_type == "completions":
-                args.output_format = ic.OutputFormat.OPENAI_COMPLETIONS
-            elif args.endpoint_type == "embeddings":
-                args.output_format = ic.OutputFormat.OPENAI_EMBEDDINGS
-            elif args.endpoint_type == "rankings":
-                args.output_format = ic.OutputFormat.RANKINGS
-            elif args.endpoint_type == "image_retrieval":
-                args.output_format = ic.OutputFormat.IMAGE_RETRIEVAL
-
-            # (TMA-1986) deduce vision format from chat completions + image CLI
-            # because there's no openai vision endpoint.
-            elif args.endpoint_type == "vision":
-                args.output_format = ic.OutputFormat.OPENAI_VISION
-            elif args.endpoint_type == "nvclip":
-                args.output_format = ic.OutputFormat.NVCLIP
-
-            if args.endpoint is not None:
-                args.endpoint = args.endpoint.lstrip(" /")
-            else:
-                args.endpoint = _endpoint_type_map[args.endpoint_type]
-    elif args.endpoint_type is not None:
-        parser.error(
-            "The --endpoint-type option should only be used when using the 'openai' service-kind."
-        )
+
+    if args.service_kind == "triton" and args.endpoint_type is None:
+        args.endpoint_type = "kserve"
+
+    if args.service_kind == "tensorrtllm_engine" and args.endpoint_type is None:
+        args.endpoint_type = "tensorrtllm_engine"
+
+    if args.endpoint_type and args.endpoint_type not in _endpoint_type_map:
+        parser.error(f"Invalid endpoint type {args.endpoint_type}")
+
+    endpoint_config = _endpoint_type_map[args.endpoint_type]
+    args.output_format = endpoint_config.output_format
+
+    if endpoint_config.service_kind != args.service_kind:
+        parser.error(
+            f"Invalid endpoint-type '{args.endpoint_type}' for service-kind '{args.service_kind}'."
+        )
+
+    if args.endpoint is not None:
+        args.endpoint = args.endpoint.lstrip(" /")
+    else:
+        if args.model:
+            model_name = args.model[0]
+        else:
+            model_name = ""
+        if endpoint_config.endpoint:
+            args.endpoint = endpoint_config.endpoint.format(MODEL_NAME=model_name)

-    if args.service_kind == "triton":
+    if args.service_kind == "triton" and args.endpoint_type == "kserve":
        args = _convert_str_to_enum_entry(args, "backend", ic.OutputFormat)
        args.output_format = args.backend
    else:
        if args.backend is not ic.DEFAULT_BACKEND:
            parser.error(
-                "The --backend option should only be used when using the 'triton' service-kind."
+                "The --backend option should only be used when using the 'triton' service-kind and 'kserve' endpoint-type."
            )

+    if args.service_kind == "triton" and args.endpoint_type == "generate":
+        # TODO: infer service_kind from endpoint_type and deprecate service_kind argument
+        args.service_kind = "openai"
+
    if args.service_kind == "tensorrtllm_engine":
        args.output_format = ic.OutputFormat.TENSORRTLLM_ENGINE
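To make the new control flow concrete, a hedged trace (not from the diff) of the defaulting and validation above for a Triton run that omits `--endpoint-type`:

```python
# Sketch only: a triton run with no endpoint-type defaults to "kserve",
# whose EndpointConfig passes the service-kind mismatch check; a triton run
# with endpoint-type "generate" also validates, then service_kind is
# rewritten to "openai" so requests use the OpenAI-style HTTP path.
service_kind, endpoint_type = "triton", None
if service_kind == "triton" and endpoint_type is None:
    endpoint_type = "kserve"
config = _endpoint_type_map[endpoint_type]
assert config.service_kind == service_kind  # no parser.error raised
```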

@@ -351,7 +381,6 @@ def parse_goodput(values):


def _infer_prompt_source(args: argparse.Namespace) -> argparse.Namespace:
-
    args.synthetic_input_files = None

    if args.input_file:
@@ -708,18 +737,9 @@ def _add_endpoint_args(parser):
    endpoint_group.add_argument(
        "--endpoint-type",
        type=str,
-        choices=[
-            "chat",
-            "completions",
-            "embeddings",
-            "nvclip",
-            "image_retrieval",
-            "rankings",
-            "vision",
-        ],
-        required=False,
-        help=f"The endpoint-type to send requests to on the "
-        'server. This is only used with the "openai" service-kind.',
+        choices=list(_endpoint_type_map.keys()),
+        required=False,
+        help=f"The endpoint-type to send requests to on the " "server.",
    )

    endpoint_group.add_argument(