Commit

verify option combos (#508)
* WIP update args and set args based on service kind

* Update args and set args based on service kind

* Fix some testing
debermudez committed Mar 12, 2024
1 parent 004b96e commit f433e99
Showing 4 changed files with 40 additions and 24 deletions.
37 changes: 22 additions & 15 deletions src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
@@ -140,13 +140,14 @@ def _add_profile_args(parser):
     )
 
     profile_group.add_argument(
-        "--max-threads",
-        type=int,
-        default=16,
+        "-p",
+        "--measurement-interval",
         required=False,
-        help="Sets the maximum number of threads that will be "
-        "created for providing desired concurrency or request rate. "
-        "The default value is 16.",
+        help="Indicates the time interval used "
+        "for each measurement in milliseconds. The perf analyzer will "
+        "sample a time interval specified by -p and take measurement over "
+        "the requests completed within that time interval. The default "
+        "value is 5000 msec.",
     )
 
     profile_group.add_argument(
@@ -159,12 +160,14 @@ def _add_profile_args(parser):
         "For example, if the profile export file is profile_export.json, the GenAi-PA file will be "
         "exported to profile_export_genai_pa.csv.",
     )
+
     load_management_group.add_argument(
         "--request-rate",
         type=float,
         required=False,
         help="Sets the request rate for the load generated by PA. ",
     )
+
     profile_group.add_argument(
         "--service-kind",
         type=str,
@@ -175,12 +178,25 @@ def _add_profile_args(parser):
         'generate load for. The options are "triton" and '
         '"openai". The default value is "triton".',
     )
+
+    profile_group.add_argument(
+        "-s",
+        "--stability-percentage",
+        required=False,
+        help="Indicates the allowed variation in "
+        "latency measurements when determining if a result is stable. The "
+        "measurement is considered as stable if the ratio of max / min "
+        "from the recent 3 measurements is within (stability percentage) "
+        "in terms of both infer per second and latency.",
+    )
+
     profile_group.add_argument(
         "--streaming",
         action="store_true",
         required=False,
         help=f"Enables the use of the streaming API.",
     )
+
     profile_group.add_argument(
         "--version",
         action="store_true",
@@ -196,15 +212,6 @@ def _add_endpoint_args(parser):
         "--endpoint", type=str, required=False, help="Specify an endpoint."
     )
 
-    endpoint_group.add_argument(
-        "-i",
-        type=str.lower,
-        choices=["http", "grpc"],
-        default="http",
-        required=False,
-        help=f"Sets the protocol used to communicate with inference service",
-    )
-
     endpoint_group.add_argument(
         "-u",
         "--url",
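Net effect of the parser changes: the --max-threads knob and the standalone -i protocol flag are gone, replaced by perf_analyzer's measurement controls (-p/--measurement-interval, -s/--stability-percentage) plus --streaming, with the protocol now inferred from --service-kind (see wrapper.py below). A minimal standalone sketch of how the new flags parse, assuming a bare argparse parser mirroring _add_profile_args (the choices list is assumed from the help text; this is not the real genai-pa entry point):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--measurement-interval", required=False)
    parser.add_argument("-s", "--stability-percentage", required=False)
    parser.add_argument("--service-kind", type=str, choices=["triton", "openai"], default="triton")
    parser.add_argument("--streaming", action="store_true", required=False)

    # e.g. a 10-second measurement window with a 15% stability threshold
    args = parser.parse_args(["-p", "10000", "-s", "15", "--service-kind", "openai"])
    print(args.measurement_interval, args.stability_percentage, args.service_kind)
    # 10000 15 openai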
14 changes: 13 additions & 1 deletion src/c++/perf_analyzer/genai-pa/genai_pa/wrapper.py
@@ -34,6 +34,17 @@
 
 
 class Profiler:
+    @staticmethod
+    def add_protocol_args(args):
+        cmd = ""
+        if args.service_kind == "triton":
+            cmd += f"-i grpc "
+            if args.output_format == "trtllm":
+                cmd += f"--shape max_tokens:1 --shape text_input:1 "
+        elif args.service_kind == "openai":
+            cmd += f"-i http "
+        return cmd
+
     @staticmethod
     def build_cmd(model, args, extra_args):
         skip_args = [
@@ -70,10 +81,11 @@ def build_cmd(model, args, extra_args):
                 arg = utils.convert_option_name(arg)
                 cmd += f"--{arg} {value} "
 
+        cmd += Profiler.add_protocol_args(args)
+
         if extra_args is not None:
             for arg in extra_args:
                 cmd += f"{arg} "
-        cmd += f" -p 10000 -s 999"
         return cmd
 
     @staticmethod
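With this change build_cmd no longer appends the hard-coded " -p 10000 -s 999" suffix; the interval and stability settings flow through from the new CLI flags, and the protocol arguments are derived from the service kind. A rough spot-check of the new helper, assuming argparse-style namespaces carrying the two attributes it reads (hypothetical values):

    from argparse import Namespace

    # triton is profiled over gRPC; trtllm output additionally pins input shapes
    triton_args = Namespace(service_kind="triton", output_format="trtllm")
    assert Profiler.add_protocol_args(triton_args) == (
        "-i grpc --shape max_tokens:1 --shape text_input:1 "
    )

    # openai is profiled over HTTP
    openai_args = Namespace(service_kind="openai", output_format=None)
    assert Profiler.add_protocol_args(openai_args) == "-i http "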
3 changes: 0 additions & 3 deletions src/c++/perf_analyzer/genai-pa/tests/test_cli.py
@@ -59,15 +59,12 @@ def test_help_arguments_output_and_exit(self, arg, expected_output, capsys):
         [
             (["-b", "2"], {"batch_size": 2}),
             (["--batch-size", "2"], {"batch_size": 2}),
-            (["--max-threads", "4"], {"max_threads": 4}),
             (
                 ["--profile-export-file", "text.txt"],
                 {"profile_export_file": Path("text.txt")},
             ),
             (["--service-kind", "triton"], {"service_kind": "triton"}),
             (["--service-kind", "openai"], {"service_kind": "openai"}),
-            # TODO: Remove streaming from implementation. It is invalid with HTTP.
-            # (["--streaming"], {"streaming": True}),
             (["--version"], {"version": True}),
             (["-u", "test_url"], {"u": "test_url"}),
             (["--url", "test_url"], {"u": "test_url"}),
10 changes: 5 additions & 5 deletions src/c++/perf_analyzer/genai-pa/tests/test_llm_metrics.py
@@ -199,7 +199,7 @@ def test_triton_llm_profile_data(self, prepare_triton_profile_data) -> None:
         )
 
         # experiment 1 statistics
-        stat = pd.get_statistics(infer_mode="concurrency", load_level=10)
+        stat = pd.get_statistics(infer_mode="concurrency", load_level="10")
 
         assert stat.avg_time_to_first_token == 2
         assert stat.avg_inter_token_latency == 2.25
@@ -227,7 +227,7 @@ def test_triton_llm_profile_data(self, prepare_triton_profile_data) -> None:
         assert stat.std_num_output_token == np.std([3, 5])
 
         # experiment 2 statistics
-        stat = pd.get_statistics(infer_mode="request_rate", load_level=2.0)
+        stat = pd.get_statistics(infer_mode="request_rate", load_level="2.0")
 
         assert stat.avg_time_to_first_token == 2.5
         assert stat.avg_inter_token_latency == 3
@@ -256,7 +256,7 @@ def test_triton_llm_profile_data(self, prepare_triton_profile_data) -> None:
 
         # check non-existing profile data
         with pytest.raises(KeyError):
-            pd.get_statistics(infer_mode="concurrency", load_level=30)
+            pd.get_statistics(infer_mode="concurrency", load_level="30")
 
     def test_openai_llm_profile_data(self, prepare_openai_profile_data) -> None:
         """Collect LLM metrics from profile export data and check values.
@@ -280,7 +280,7 @@ def test_openai_llm_profile_data(self, prepare_openai_profile_data) -> None:
         )
 
         # experiment 1 statistics
-        stat = pd.get_statistics(infer_mode="concurrency", load_level=10)
+        stat = pd.get_statistics(infer_mode="concurrency", load_level="10")
 
         assert stat.avg_time_to_first_token == 2
         assert stat.avg_inter_token_latency == 2.4
@@ -309,7 +309,7 @@ def test_openai_llm_profile_data(self, prepare_openai_profile_data) -> None:
 
         # check non-existing profile data
         with pytest.raises(KeyError):
-            pd.get_statistics(infer_mode="concurrency", load_level=40)
+            pd.get_statistics(infer_mode="concurrency", load_level="40")
 
     def test_llm_metrics_get_base_name(self) -> None:
         """Test get_base_name method in LLMMetrics class."""
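The test updates mirror a change in how get_statistics keys its experiments: load_level is now matched as a string, so numeric arguments like 10 or 2.0 no longer hit. A toy illustration of the lookup behavior, assuming a plain dict keyed on (infer_mode, load_level) pairs (the real class stores richer objects):

    experiments = {
        ("concurrency", "10"): "experiment 1 stats",
        ("request_rate", "2.0"): "experiment 2 stats",
    }

    print(("concurrency", "10") in experiments)  # True: string keys match
    print(("concurrency", 10) in experiments)    # False: 10 != "10", hence the KeyError checks above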
