Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional Cli parsing #474

Merged
merged 6 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 117 additions & 29 deletions src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,7 @@
def handler(args):
    """Entry point invoked after CLI parsing: run Perf Analyzer.

    Args:
        args: the parsed ``argparse.Namespace``; ``args.model`` names the
            model to profile and the remaining attributes are forwarded
            to the perf_analyzer command line by the wrapper.
    """
    # Imported here rather than at module top — presumably to keep arg
    # parsing/--help fast and avoid an import cycle; TODO confirm.
    from genai_pa.wrapper import Profiler

    Profiler.run(model=args.model, args=args)


### Parsers ###
Expand All @@ -58,20 +56,41 @@ def add_model_args(parser):


def add_profile_args(parser):
    """Register profiling-related CLI options on *parser*.

    These options mirror a subset of perf_analyzer's own flags; the
    wrapper forwards each parsed attribute to the perf_analyzer command
    line, so names/defaults here must stay aligned with perf_analyzer.
    """
    parser.add_argument(
        "--async",
        action="store_true",
        required=False,
        help="Enables asynchronous mode in perf_analyzer. "
        "By default, perf_analyzer will use synchronous API to "
        "request inference. However, if the model is sequential, "
        "then default mode is asynchronous. Specify --sync to "
        "operate sequential models in synchronous mode. In synchronous "
        "mode, perf_analyzer will start threads equal to the concurrency "
        "level. Use asynchronous mode to limit the number of threads, yet "
        "maintain the concurrency.",
    )
    parser.add_argument(
        "-b",
        "--batch-size",
        type=int,
        default=1,
        required=False,
        help="The batch size to benchmark. The default value is 1.",
    )
    # No default on purpose: parse_args() treats the absence of both
    # --concurrency and --request-rate specially (they are mutually
    # exclusive, with concurrency falling back to 1 if neither is given).
    parser.add_argument(
        "--concurrency",
        type=int,
        required=False,
        help="Sets the concurrency value to benchmark.",
    )
    parser.add_argument(
        "--max-threads",
        type=int,
        default=16,
        required=False,
        help="Sets the maximum number of threads that will be "
        "created for providing desired concurrency or request rate. "
        "However, when running in synchronous mode, this value will be ignored. "
        "The default value is 16.",
    )
    parser.add_argument(
        "--output-length",
        type=int,
        default=128,
        required=False,
        help="The output length (tokens) to use for benchmarking LLMs. (Default: 128)",
    )
    parser.add_argument(
        "--profile-export-file",
        type=str,
        required=False,
        help="Specifies the path that the profile export will be "
        "generated at. By default, the profile export will not be "
        "generated.",
    )
    parser.add_argument(
        "--request-rate",
        type=float,
        required=False,
        help="Sets the request rate for the load generated by analyzer.",
    )
    parser.add_argument(
        "--service-kind",
        type=str,
        choices=["triton", "openai"],
        default="triton",
        required=False,
        # Help text fixed: it previously began with a sentence
        # copy-pasted from --request-rate.
        help="Describes the kind of service perf_analyzer will "
        'generate load for. The options are "triton" and '
        '"openai". The default value is "triton".',
    )
    parser.add_argument(
        "--streaming",
        action="store_true",
        required=False,
        help="Enables the use of streaming API. This flag is "
        "only valid with gRPC protocol. By default, it is set false.",
    )
    parser.add_argument(
        "--sync",
        action="store_true",
        required=False,
        # Help text fixed: it was copy-pasted from --async and wrongly
        # described asynchronous mode.
        help="Enables synchronous mode in perf_analyzer. "
        "Can be used to operate sequential models in synchronous mode. "
        "In synchronous mode, perf_analyzer will start threads equal "
        "to the concurrency level.",
    )
    parser.add_argument(
        "--version",
        action="store_true",
        required=False,
        help="Enables the printing of the current version of perf_analyzer. "
        "By default, it is set false.",
    )


def add_endpoint_args(parser):
    """Register endpoint-related CLI options on *parser*."""
    url_help = "URL of the endpoint to target for benchmarking."
    # NOTE(review): "--u" (double dash, single letter) mirrors
    # perf_analyzer's -u option; confirm the spelling is intended.
    parser.add_argument(
        "--u",
        required=False,
        type=str,
        default="localhost:8001",
        help=url_help,
    )


def add_dataset_args(parser):
    # Intentionally a no-op for now: the dataset/tokenizer options below
    # are kept (commented out) pending a decision on whether GenAI-PA
    # should own dataset selection at all.
    pass
    # TODO: Do we want to remove dataset and tokenizer?
    # parser.add_argument(
    #     "--dataset",
    #     type=str,
    #     default="OpenOrca",
    #     choices=["OpenOrca", "cnn_dailymail"],
    #     required=False,
    #     help="HuggingFace dataset to use for the benchmark.",
    # )
    # parser.add_argument(
    #     "--tokenizer",
    #     type=str,
    #     default="auto",
    #     choices=["auto"],
    #     required=False,
    #     help="The HuggingFace tokenizer to use to interpret token metrics from final text results",
    # )


### Entrypoint ###


# Optional argv used for testing - will default to sys.argv if None.
def parse_args(argv=None):
parser = argparse.ArgumentParser(
prog="genai-pa",
description="CLI to profile LLMs and Generative AI models with PA",
description="CLI to profile LLMs and Generative AI models with Perf Analyzer",
)
parser.set_defaults(func=handler)

Expand All @@ -143,4 +209,26 @@ def parse_args(argv=None):
add_dataset_args(dataset_group)

args = parser.parse_args(argv)

# Concurrency and request rate are mutually exclusive
# TODO: Review if there is a cleaner way to do this with argparse
debermudez marked this conversation as resolved.
Show resolved Hide resolved
if args.concurrency is not None and args.request_rate is not None:
parser.error(
"Arguments --concurrency and --request_rate are mutually exclusive."
rmccorm4 marked this conversation as resolved.
Show resolved Hide resolved
)

if args.concurrency is None and args.request_rate is None:
args.concurrency = 1
print(
"Neither --concurrency nor --request_rate provided. Setting concurrency to 1."
)

# Update GenAI-PA non-range attributes to range format for PA
for attr_key in ["concurrency", "request_rate"]:
attr_val = getattr(args, attr_key)
if attr_val is not None:
setattr(args, f"{attr_key}_range", f"{attr_val}:{attr_val}:{attr_val}")
matthewkotila marked this conversation as resolved.
Show resolved Hide resolved
delattr(args, attr_key)

args = argparse.Namespace(**{k: v for k, v in vars(args).items() if v is not None})
matthewkotila marked this conversation as resolved.
Show resolved Hide resolved
return args
31 changes: 14 additions & 17 deletions src/c++/perf_analyzer/genai-pa/genai_pa/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,22 +35,19 @@

class Profiler:
    """Translates parsed GenAI-PA args into a perf_analyzer command line
    and executes it."""

    @staticmethod
    def run(model, args):
        """Build and run the perf_analyzer command for *model*.

        Every attribute on *args* (except those in ``skip_args``) is
        forwarded as a ``--<name> <value>`` option; attributes that are
        ``True`` become bare flags and ``False`` ones are omitted.
        """
        # GenAI-PA-internal attributes that must not be forwarded to
        # perf_analyzer.
        skip_args = ["model", "func"]

        cmd = f"perf_analyzer -m {model} "
        for arg, value in vars(args).items():
            # Check skip_args FIRST so an internal attribute is never
            # forwarded even if it happens to hold a boolean.
            if arg in skip_args:
                continue
            # argparse dests use underscores (e.g. concurrency_range) but
            # perf_analyzer long options are dash-separated.
            arg = arg.replace("_", "-")
            if value is True:
                cmd += f"--{arg} "  # store_true flag: no value
            elif value is False:
                continue  # flag left at its default; omit entirely
            else:
                cmd += f"--{arg} {value} "

        logger.info(f"Running Perf Analyzer : '{cmd}'")
        # NOTE(review): shell=True interpolates user-supplied values into
        # a shell string; consider building an argv list and using
        # shell=False instead.
        subprocess.run(cmd, shell=True)
Loading