Commit 4770f2b: Merge branch 'main' into dyas-system-prompts

dyastremsky authored Dec 10, 2024
2 parents 457647a + d8998a0

Showing 11 changed files with 258 additions and 78 deletions.
19 changes: 15 additions & 4 deletions genai-perf/docs/tutorial.md
@@ -55,14 +55,17 @@ Run GenAI-Perf inside the Triton Inference Server SDK container:
```bash
genai-perf profile \
-m gpt2 \
+--tokenizer gpt2 \
--service-kind triton \
--backend tensorrtllm \
--synthetic-input-tokens-mean 200 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 100 \
--output-tokens-stddev 0 \
--output-tokens-mean-deterministic \
---streaming
+--streaming \
+--request-count 50 \
+--warmup-request-count 10
```

Example output:
@@ -94,14 +97,17 @@ Run GenAI-Perf inside the Triton Inference Server SDK container:
```bash
genai-perf profile \
-m gpt2 \
+--tokenizer gpt2 \
--service-kind triton \
--backend vllm \
--synthetic-input-tokens-mean 200 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 100 \
--output-tokens-stddev 0 \
--output-tokens-mean-deterministic \
---streaming
+--streaming \
+--request-count 50 \
+--warmup-request-count 10
```

Example output:
@@ -136,14 +142,16 @@ Run GenAI-Perf inside the Triton Inference Server SDK container:
```bash
genai-perf profile \
-m HuggingFaceH4/zephyr-7b-beta \
+--tokenizer HuggingFaceH4/zephyr-7b-beta \
--service-kind openai \
--endpoint-type chat \
--synthetic-input-tokens-mean 200 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 100 \
--output-tokens-stddev 0 \
--streaming \
---tokenizer HuggingFaceH4/zephyr-7b-beta
+--request-count 50 \
+--warmup-request-count 10
```

Example output:
@@ -178,12 +186,15 @@ Run GenAI-Perf inside the Triton Inference Server SDK container:
```bash
genai-perf profile \
-m gpt2 \
+--tokenizer gpt2 \
--service-kind openai \
--endpoint-type completions \
--synthetic-input-tokens-mean 200 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean 100 \
---output-tokens-stddev 0
+--output-tokens-stddev 0 \
+--request-count 50 \
+--warmup-request-count 10
```

Example output:
2 changes: 2 additions & 0 deletions genai-perf/genai_perf/inputs/converters/__init__.py
@@ -32,6 +32,7 @@
from .rankings_converter import RankingsConverter
from .tensorrtllm_converter import TensorRTLLMConverter
from .tensorrtllm_engine_converter import TensorRTLLMEngineConverter
+from .triton_generate_converter import TritonGenerateConverter
from .vllm_converter import VLLMConverter

__all__ = [
@@ -44,4 +45,5 @@
"TensorRTLLMConverter",
"TensorRTLLMEngineConverter",
"VLLMConverter",
"TritonGenerateConverter",
]
@@ -48,6 +48,7 @@ def create(output_format: OutputFormat):
        OutputFormat.TENSORRTLLM: TensorRTLLMConverter,
        OutputFormat.TENSORRTLLM_ENGINE: TensorRTLLMEngineConverter,
        OutputFormat.VLLM: VLLMConverter,
+        OutputFormat.TRITON_GENERATE: TritonGenerateConverter,
    }
    if output_format not in converters:
        raise GenAIPerfException(f"Output format {output_format} is not supported")
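For illustration, a minimal sketch (not part of the diff) of the lookup this new mapping enables; it assumes the `converters` dict and imports defined above, and `create`'s return statement falls outside the shown hunk:

```python
# Sketch only: the membership test create() performs. With the new entry,
# OutputFormat.TRITON_GENERATE resolves instead of raising GenAIPerfException.
requested = OutputFormat.TRITON_GENERATE
if requested in converters:
    converter_cls = converters[requested]  # TritonGenerateConverter
```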
genai-perf/genai_perf/inputs/converters/triton_generate_converter.py (new file)
@@ -0,0 +1,68 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from typing import Any, Dict

from genai_perf.inputs.converters.base_converter import BaseConverter
from genai_perf.inputs.input_constants import DEFAULT_OUTPUT_TOKENS_MEAN
from genai_perf.inputs.inputs_config import InputsConfig
from genai_perf.inputs.retrievers.generic_dataset import GenericDataset
from genai_perf.utils import sample_bounded_normal


class TritonGenerateConverter(BaseConverter):
    def convert(
        self,
        generic_dataset: GenericDataset,
        config: InputsConfig,
    ) -> Dict[Any, Any]:
        request_body: Dict[str, Any] = {"data": []}

        for file_data in generic_dataset.files_data.values():
            for _, row in enumerate(file_data.rows):
                prompt = row.texts

                payload = {
                    "text_input": prompt,
                }
                self._add_request_params(payload, config)
                request_body["data"].append({"payload": [payload]})

        return request_body

    def _add_request_params(self, payload: Dict, config: InputsConfig) -> None:
        if config.add_stream:
            payload["stream"] = True
        if config.output_tokens_mean != DEFAULT_OUTPUT_TOKENS_MEAN:
            payload["max_tokens"] = int(
                sample_bounded_normal(
                    mean=config.output_tokens_mean,
                    stddev=config.output_tokens_stddev,
                    lower=1,  # output token must be >= 1
                )
            )
        for key, value in config.extra_inputs.items():
            payload[key] = value
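As a hedged sketch of what this converter emits, assuming a one-row dataset, `add_stream=True`, `output_tokens_mean=100`, and `output_tokens_stddev=0` (so the sampled `max_tokens` is exactly 100); the prompt text is invented for the example:

```python
# Shape of the request body produced by convert() under the assumptions above.
example_request_body = {
    "data": [
        {
            "payload": [
                {
                    "text_input": ["What is Triton Inference Server?"],  # row.texts (shape depends on the dataset retriever)
                    "stream": True,     # set because config.add_stream is True
                    "max_tokens": 100,  # int(sample_bounded_normal(...)), floored at 1
                }
            ]
        }
    ]
}
```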
1 change: 1 addition & 0 deletions genai-perf/genai_perf/inputs/input_constants.py
@@ -55,6 +55,7 @@ class OutputFormat(Enum):
    OPENAI_VISION = auto()
    RANKINGS = auto()
    TENSORRTLLM_ENGINE = auto()
+    TRITON_GENERATE = auto()

    def to_lowercase(self):
        return self.name.lower()
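A quick illustrative check (assumed usage, not from the diff) that the new member behaves like its siblings with the helper defined beside it:

```python
# to_lowercase() returns self.name.lower(), so:
assert OutputFormat.TRITON_GENERATE.to_lowercase() == "triton_generate"
```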
114 changes: 67 additions & 47 deletions genai-perf/genai_perf/parser.py
@@ -27,6 +27,7 @@
import argparse
import os
import sys
+from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path
from typing import Optional, Tuple
@@ -63,14 +64,39 @@ def to_lowercase(self):

logger = logging.getLogger(__name__)


+@dataclass
+class EndpointConfig:
+    endpoint: Optional[str]
+    service_kind: str
+    output_format: ic.OutputFormat
+
+
_endpoint_type_map = {
-    "chat": "v1/chat/completions",
-    "completions": "v1/completions",
-    "embeddings": "v1/embeddings",
-    "image_retrieval": "v1/infer",
-    "nvclip": "v1/embeddings",
-    "rankings": "v1/ranking",
-    "vision": "v1/chat/completions",
+    "chat": EndpointConfig(
+        "v1/chat/completions", "openai", ic.OutputFormat.OPENAI_CHAT_COMPLETIONS
+    ),
+    "completions": EndpointConfig(
+        "v1/completions", "openai", ic.OutputFormat.OPENAI_COMPLETIONS
+    ),
+    "embeddings": EndpointConfig(
+        "v1/embeddings", "openai", ic.OutputFormat.OPENAI_EMBEDDINGS
+    ),
+    "image_retrieval": EndpointConfig(
+        "v1/infer", "openai", ic.OutputFormat.IMAGE_RETRIEVAL
+    ),
+    "nvclip": EndpointConfig("v1/embeddings", "openai", ic.OutputFormat.NVCLIP),
+    "rankings": EndpointConfig("v1/ranking", "openai", ic.OutputFormat.RANKINGS),
+    "vision": EndpointConfig(
+        "v1/chat/completions", "openai", ic.OutputFormat.OPENAI_VISION
+    ),
+    "generate": EndpointConfig(
+        "v2/models/{MODEL_NAME}/generate", "triton", ic.OutputFormat.TRITON_GENERATE
+    ),
+    "kserve": EndpointConfig(None, "triton", ic.OutputFormat.TENSORRTLLM),
+    "tensorrtllm_engine": EndpointConfig(
+        None, "tensorrtllm_engine", ic.OutputFormat.TENSORRTLLM_ENGINE
+    ),
}
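For illustration, a short sketch of how the one templated entry in this map resolves; "gpt2" is an assumed model name, and the `.format()` call mirrors the one in `_check_conditional_args` below:

```python
# Sketch only: resolving the templated "generate" endpoint.
generate_config = _endpoint_type_map["generate"]
assert generate_config.service_kind == "triton"
assert generate_config.endpoint.format(MODEL_NAME="gpt2") == "v2/models/gpt2/generate"
```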


@@ -141,43 +167,47 @@ def _check_conditional_args(
            parser.error(
                "The --endpoint-type option is required when using the 'openai' service-kind."
            )
-        else:
-            if args.endpoint_type == "chat":
-                args.output_format = ic.OutputFormat.OPENAI_CHAT_COMPLETIONS
-            elif args.endpoint_type == "completions":
-                args.output_format = ic.OutputFormat.OPENAI_COMPLETIONS
-            elif args.endpoint_type == "embeddings":
-                args.output_format = ic.OutputFormat.OPENAI_EMBEDDINGS
-            elif args.endpoint_type == "rankings":
-                args.output_format = ic.OutputFormat.RANKINGS
-            elif args.endpoint_type == "image_retrieval":
-                args.output_format = ic.OutputFormat.IMAGE_RETRIEVAL
-
-            # (TMA-1986) deduce vision format from chat completions + image CLI
-            # because there's no openai vision endpoint.
-            elif args.endpoint_type == "vision":
-                args.output_format = ic.OutputFormat.OPENAI_VISION
-            elif args.endpoint_type == "nvclip":
-                args.output_format = ic.OutputFormat.NVCLIP
-
-            if args.endpoint is not None:
-                args.endpoint = args.endpoint.lstrip(" /")
-            else:
-                args.endpoint = _endpoint_type_map[args.endpoint_type]
-    elif args.endpoint_type is not None:
-        parser.error(
-            "The --endpoint-type option should only be used when using the 'openai' service-kind."
-        )
+
+    if args.service_kind == "triton" and args.endpoint_type is None:
+        args.endpoint_type = "kserve"
+
+    if args.service_kind == "tensorrtllm_engine" and args.endpoint_type is None:
+        args.endpoint_type = "tensorrtllm_engine"
+
+    if args.endpoint_type and args.endpoint_type not in _endpoint_type_map:
+        parser.error(f"Invalid endpoint type {args.endpoint_type}")
+
+    endpoint_config = _endpoint_type_map[args.endpoint_type]
+    args.output_format = endpoint_config.output_format
+
+    if endpoint_config.service_kind != args.service_kind:
+        parser.error(
+            f"Invalid endpoint-type '{args.endpoint_type}' for service-kind '{args.service_kind}'."
+        )
+
+    if args.endpoint is not None:
+        args.endpoint = args.endpoint.lstrip(" /")
+    else:
+        if args.model:
+            model_name = args.model[0]
+        else:
+            model_name = ""
+        if endpoint_config.endpoint:
+            args.endpoint = endpoint_config.endpoint.format(MODEL_NAME=model_name)

-    if args.service_kind == "triton":
+    if args.service_kind == "triton" and args.endpoint_type == "kserve":
        args = _convert_str_to_enum_entry(args, "backend", ic.OutputFormat)
        args.output_format = args.backend
    else:
        if args.backend is not ic.DEFAULT_BACKEND:
            parser.error(
-                "The --backend option should only be used when using the 'triton' service-kind."
+                "The --backend option should only be used when using the 'triton' service-kind and 'kserve' endpoint-type."
            )

+    if args.service_kind == "triton" and args.endpoint_type == "generate":
+        # TODO: infer service_kind from endpoint_type and deprecate service_kind argument
+        args.service_kind = "openai"
+
    if args.service_kind == "tensorrtllm_engine":
        args.output_format = ic.OutputFormat.TENSORRTLLM_ENGINE
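To make the new control flow concrete, a hedged trace (not from the diff) of the defaulting and validation above for a Triton run that omits `--endpoint-type`:

```python
# Sketch only: a triton run with no endpoint-type defaults to "kserve",
# whose EndpointConfig passes the service-kind mismatch check; a triton run
# with endpoint-type "generate" also validates, then service_kind is
# rewritten to "openai" so requests use the OpenAI-style HTTP path.
service_kind, endpoint_type = "triton", None
if service_kind == "triton" and endpoint_type is None:
    endpoint_type = "kserve"
config = _endpoint_type_map[endpoint_type]
assert config.service_kind == service_kind  # no parser.error raised
```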

@@ -351,7 +381,6 @@ def parse_goodput(values):


def _infer_prompt_source(args: argparse.Namespace) -> argparse.Namespace:
-
    args.synthetic_input_files = None

    if args.input_file:
@@ -708,18 +737,9 @@ def _add_endpoint_args(parser):
    endpoint_group.add_argument(
        "--endpoint-type",
        type=str,
-        choices=[
-            "chat",
-            "completions",
-            "embeddings",
-            "nvclip",
-            "image_retrieval",
-            "rankings",
-            "vision",
-        ],
-        required=False,
-        help=f"The endpoint-type to send requests to on the "
-        'server. This is only used with the "openai" service-kind.',
+        choices=list(_endpoint_type_map.keys()),
+        required=False,
+        help=f"The endpoint-type to send requests to on the " "server.",
    )

    endpoint_group.add_argument(