triton-inference-server · debermudez · Feb 28, 2024 · Feb 27, 2024 · Feb 27, 2024 · Feb 27, 2024
diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/constants.py b/src/c++/perf_analyzer/genai-pa/genai_pa/constants.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without

diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py b/src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -27,28 +26,52 @@
 
 import argparse
 import logging
+from pathlib import Path
 
 from genai_pa.constants import LOGGER_NAME
 
 logger = logging.getLogger(LOGGER_NAME)
 
+
+def prune_args(args: argparse.Namespace) -> argparse.Namespace:
+    """
+    Return a copy of the parsed arguments without None or False values.
+
+    Other falsy values such as 0 or "" are kept; only None and False are dropped.
+    """
+    kept = {k: v for k, v in vars(args).items() if v is not None and v is not False}
+    return argparse.Namespace(**kept)
+
+
+def update_load_manager_args(args: argparse.Namespace) -> argparse.Namespace:
+    """
+    Rename GenAI-PA load manager options to the PA "<name>_range" form, in place.
+    """
+    for name in ("concurrency", "request_rate"):
+        value = getattr(args, name)
+        delattr(args, name)
+        if value is not None:
+            setattr(args, f"{name}_range", str(value))
+    return args
+
+
 ### Handlers ###
 
 
 # NOTE: Placeholder
 def handler(args):
     from genai_pa.wrapper import Profiler
 
-    Profiler.run(
-        model=args.model,
-    )
+    Profiler.run(model=args.model, args=args)  # pass the whole namespace through
 
 
 ### Parsers ###
 
 
 def add_model_args(parser):
-    parser.add_argument(
+    model_group = parser.add_argument_group("Model")
+
+    model_group.add_argument(
         "-m",
         "--model",
         type=str,
@@ -58,64 +81,112 @@
 
 
 def add_profile_args(parser):
-    parser.add_argument(
+    profile_group = parser.add_argument_group("Profiling")
+    # Load modes (--concurrency / --request-rate) may not be combined.
+    load_management_group = profile_group.add_mutually_exclusive_group()
+    profile_group.add_argument(
         "-b",
         "--batch-size",
         type=int,
         default=1,
         required=False,
-        help="The batch size / concurrency to benchmark. (Default: 1)",
+        help="The batch size to benchmark. The default value is 1.",
     )
-    parser.add_argument(
-        "--input-length",
+    load_management_group.add_argument(
+        "--concurrency",
         type=int,
-        default=128,
         required=False,
-        help="The input length (tokens) to use for benchmarking LLMs. (Default: 128)",
+        help="Sets the concurrency value to benchmark.",
     )
-    parser.add_argument(
-        "--output-length",
+    profile_group.add_argument(
+        "--max-threads",
         type=int,
-        default=128,
+        default=16,
+        required=False,
+        help="Sets the maximum number of threads that will be "
+        "created for providing desired concurrency or request rate. "
+        "The default value is 16.",
+    )
+    # TODO: necessary?
+    # parser.add_argument(
+    #     "--output-length",
+    #     type=int,
+    #     default=128,
+    #     required=False,
+    #     help="The output length (tokens) to use for benchmarking LLMs. (Default: 128)",
+    # )
+    profile_group.add_argument(
+        "--profile-export-file",
+        type=Path,
+        default="profile_export.json",
+        help="Specifies the path where the profile export will be "
+        "generated. By default, the profile export is written to "
+        '"profile_export.json".',
+    )
+    load_management_group.add_argument(
+        "--request-rate",
+        type=float,
+        required=False,
+        help="Sets the request rate for the load generated by PA. ",
+    )
+    profile_group.add_argument(
+        "--service-kind",
+        type=str,
+        choices=["triton", "openai"],
+        default="triton",
+        required=False,
+        help="Describes the kind of service perf_analyzer will "
+        'generate load for. The options are "triton" and '
+        '"openai". The default value is "triton".',
+    )
+    profile_group.add_argument(
+        "--streaming",
+        action="store_true",
+        required=False,
+        help="Enables the use of the streaming API.",
+    )
+    profile_group.add_argument(
+        "--version",
+        action="store_true",
         required=False,
-        help="The output length (tokens) to use for benchmarking LLMs. (Default: 128)",
+        help="Prints the version and exits.",
     )
 
 
 def add_endpoint_args(parser):
-    parser.add_argument(
+    endpoint_group = parser.add_argument_group("Endpoint")
+    # "-u" is added as a short alias for "--url".
+    endpoint_group.add_argument(
+        "-u",
         "--url",
         type=str,
         default="localhost:8001",
         required=False,
         help="URL of the endpoint to target for benchmarking.",
     )
-    parser.add_argument(
-        "--provider",
-        type=str,
-        choices=["triton", "openai"],
-        required=False,
-        help="Provider format/schema to use for benchmarking.",
-    )
 
 
 def add_dataset_args(parser):
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        default="OpenOrca",
-        choices=["OpenOrca", "cnn_dailymail"],
-        required=False,
-        help="HuggingFace dataset to use for the benchmark.",
-    )
-    parser.add_argument(
-        "--tokenizer",
-        type=str,
-        default="auto",
-        choices=["auto"],
-        required=False,
-        help="The HuggingFace tokenizer to use to interpret token metrics from final text results",
-    )
+    """Register the "Dataset" argument group, which has no options yet."""
+
+    dataset_group = parser.add_argument_group("Dataset")
+    # TODO: Do we want to remove dataset and tokenizer?
+    # dataset_group.add_argument(
+    #     "--dataset",
+    #     type=str,
+    #     default="OpenOrca",
+    #     choices=["OpenOrca", "cnn_dailymail"],
+    #     required=False,
+    #     help="HuggingFace dataset to use for the benchmark.",
+    # )
+    # dataset_group.add_argument(
+    #     "--tokenizer",
+    #     type=str,
+    #     default="auto",
+    #     choices=["auto"],
+    #     required=False,
+    #     help="The HuggingFace tokenizer to use to interpret token metrics from final text results",
+    # )
 
 
 ### Entrypoint ###
@@ -125,22 +196,19 @@
 def parse_args(argv=None):
     parser = argparse.ArgumentParser(
         prog="genai-pa",
-        description="CLI to profile LLMs and Generative AI models with PA",
+        description="CLI to profile LLMs and Generative AI models with Perf Analyzer",
     )
     parser.set_defaults(func=handler)
 
     # Conceptually group args for easier visualization
-    model_group = parser.add_argument_group("Model")
-    add_model_args(model_group)
+    add_model_args(parser)
+    add_profile_args(parser)
+    add_endpoint_args(parser)
+    add_dataset_args(parser)
 
-    profile_group = parser.add_argument_group("Profiling")
-    add_profile_args(profile_group)
+    args = parser.parse_args(argv)
 
-    endpoint_group = parser.add_argument_group("Endpoint")
-    add_endpoint_args(endpoint_group)
+    args = update_load_manager_args(args)  # before pruning: needs the None entries
+    args = prune_args(args)  # drop None/False so only set options remain
 
-    dataset_group = parser.add_argument_group("Dataset")
-    add_dataset_args(dataset_group)
-
-    args = parser.parse_args(argv)
     return args
diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/utils.py b/src/c++/perf_analyzer/genai-pa/genai_pa/utils.py
@@ -0,0 +1,32 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from pathlib import Path
+
+
+def remove_file(file: Path) -> None:
+    if file.is_file():  # missing paths and directories are left untouched
+        file.unlink()
diff --git a/src/c++/perf_analyzer/genai-pa/genai_pa/wrapper.py b/src/c++/perf_analyzer/genai-pa/genai_pa/wrapper.py
@@ -24,33 +24,36 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import json
 import logging
 import subprocess
 
+import genai_pa.utils as utils
 from genai_pa.constants import LOGGER_NAME
 
 logger = logging.getLogger(LOGGER_NAME)
 
 
 class Profiler:
     @staticmethod
-    def run(model):
-        # TODO: Replace with other plumbing
-        input_file = "/tmp/input_data.json"
-        with open(input_file, "w") as f:
-            data = {"data": [{"text_input": ["hi"]}]}
-            json.dump(data, f)
+    def run(model, args=None):
+        """Run perf_analyzer for ``model``, forwarding the parsed CLI ``args``."""
+        short_flags = {"url": "-u", "batch_size": "-b"}
+        # "version" exists only when requested: prune_args drops False values.
+        if hasattr(args, "version"):
+            cmd = ["perf_analyzer", "--version"]
+        else:
+            utils.remove_file(args.profile_export_file)
 
-        cmd = [
-            "perf_analyzer",
-            "-i",
-            "grpc",
-            "--streaming",
-            "-m",
-            model,
-            "--input-data",
-            input_file,
-        ]
+            # Argument list run without a shell, so values are passed verbatim.
+            cmd = ["perf_analyzer", "-m", model, "--async"]
+            for arg, value in vars(args).items():
+                if arg in ("model", "func"):
+                    continue
+                # Namespace keys use "_"; perf_analyzer options use "-".
+                flag = short_flags.get(arg, f"--{arg.replace('_', '-')}")
+                cmd.append(flag)
+                # Switches stored as True take no operand.
+                if value is not True:
+                    cmd.append(str(value))
         logger.info(f"Running Perf Analyzer : '{cmd}'")
         subprocess.run(cmd)