Commit

verify option combos (#508)
* WIP update args and set args based on service kind

* Update args and set args based on service kind

* Fix some testing
debermudez committed Mar 12, 2024
1 parent 004b96e commit f433e99
Showing 4 changed files with 40 additions and 24 deletions.
37 changes: 22 additions & 15 deletions src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
@@ -140,13 +140,14 @@ def _add_profile_args(parser):
     )
 
     profile_group.add_argument(
-        "--max-threads",
-        type=int,
-        default=16,
+        "-p",
+        "--measurement-interval",
         required=False,
-        help="Sets the maximum number of threads that will be "
-        "created for providing desired concurrency or request rate. "
-        "The default value is 16.",
+        help="Indicates the time interval used "
+        "for each measurement in milliseconds. The perf analyzer will "
+        "sample a time interval specified by -p and take measurement over "
+        "the requests completed within that time interval. The default "
+        "value is 5000 msec.",
     )
 
     profile_group.add_argument(
@@ -159,12 +160,14 @@ def _add_profile_args(parser):
         "For example, if the profile export file is profile_export.json, the GenAi-PA file will be "
         "exported to profile_export_genai_pa.csv.",
     )
+
     load_management_group.add_argument(
         "--request-rate",
         type=float,
         required=False,
         help="Sets the request rate for the load generated by PA. ",
     )
+
     profile_group.add_argument(
         "--service-kind",
         type=str,
@@ -175,12 +178,25 @@ def _add_profile_args(parser):
         'generate load for. The options are "triton" and '
         '"openai". The default value is "triton".',
     )
+
+    profile_group.add_argument(
+        "-s",
+        "--stability-percentage",
+        required=False,
+        help="Indicates the allowed variation in "
+        "latency measurements when determining if a result is stable. The "
+        "measurement is considered as stable if the ratio of max / min "
+        "from the recent 3 measurements is within (stability percentage) "
+        "in terms of both infer per second and latency.",
+    )
+
     profile_group.add_argument(
         "--streaming",
         action="store_true",
         required=False,
         help=f"Enables the use of the streaming API.",
     )
+
     profile_group.add_argument(
         "--version",
         action="store_true",
@@ -196,15 +212,6 @@ def _add_endpoint_args(parser):
         "--endpoint", type=str, required=False, help="Specify an endpoint."
     )
 
-    endpoint_group.add_argument(
-        "-i",
-        type=str.lower,
-        choices=["http", "grpc"],
-        default="http",
-        required=False,
-        help=f"Sets the protocol used to communicate with inference service",
-    )
-
     endpoint_group.add_argument(
         "-u",
         "--url",
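Net effect of the parser changes: the --max-threads knob and the standalone -i protocol flag are gone, replaced by perf_analyzer's measurement controls (-p/--measurement-interval, -s/--stability-percentage) plus --streaming, with the protocol now inferred from --service-kind (see wrapper.py below). A minimal standalone sketch of how the new flags parse, assuming a bare argparse parser mirroring _add_profile_args (the choices list is assumed from the help text; this is not the real genai-pa entry point):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--measurement-interval", required=False)
    parser.add_argument("-s", "--stability-percentage", required=False)
    parser.add_argument("--service-kind", type=str, choices=["triton", "openai"], default="triton")
    parser.add_argument("--streaming", action="store_true", required=False)

    # e.g. a 10-second measurement window with a 15% stability threshold
    args = parser.parse_args(["-p", "10000", "-s", "15", "--service-kind", "openai"])
    print(args.measurement_interval, args.stability_percentage, args.service_kind)
    # 10000 15 openai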
14 changes: 13 additions & 1 deletion src/c++/perf_analyzer/genai-pa/genai_pa/wrapper.py
@@ -34,6 +34,17 @@
 
 
 class Profiler:
+    @staticmethod
+    def add_protocol_args(args):
+        cmd = ""
+        if args.service_kind == "triton":
+            cmd += f"-i grpc "
+            if args.output_format == "trtllm":
+                cmd += f"--shape max_tokens:1 --shape text_input:1 "
+        elif args.service_kind == "openai":
+            cmd += f"-i http "
+        return cmd
+
     @staticmethod
     def build_cmd(model, args, extra_args):
         skip_args = [
@@ -70,10 +81,11 @@ def build_cmd(model, args, extra_args):
                 arg = utils.convert_option_name(arg)
                 cmd += f"--{arg} {value} "
 
+        cmd += Profiler.add_protocol_args(args)
+
         if extra_args is not None:
             for arg in extra_args:
                 cmd += f"{arg} "
-        cmd += f" -p 10000 -s 999"
         return cmd
 
     @staticmethod
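With this change build_cmd no longer appends the hard-coded " -p 10000 -s 999" suffix; the interval and stability settings flow through from the new CLI flags, and the protocol arguments are derived from the service kind. A rough spot-check of the new helper, assuming argparse-style namespaces carrying the two attributes it reads (hypothetical values):

    from argparse import Namespace

    # triton is profiled over gRPC; trtllm output additionally pins input shapes
    triton_args = Namespace(service_kind="triton", output_format="trtllm")
    assert Profiler.add_protocol_args(triton_args) == (
        "-i grpc --shape max_tokens:1 --shape text_input:1 "
    )

    # openai is profiled over HTTP
    openai_args = Namespace(service_kind="openai", output_format=None)
    assert Profiler.add_protocol_args(openai_args) == "-i http "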
3 changes: 0 additions & 3 deletions src/c++/perf_analyzer/genai-pa/tests/test_cli.py
@@ -59,15 +59,12 @@ def test_help_arguments_output_and_exit(self, arg, expected_output, capsys):
         [
             (["-b", "2"], {"batch_size": 2}),
             (["--batch-size", "2"], {"batch_size": 2}),
-            (["--max-threads", "4"], {"max_threads": 4}),
             (
                 ["--profile-export-file", "text.txt"],
                 {"profile_export_file": Path("text.txt")},
             ),
             (["--service-kind", "triton"], {"service_kind": "triton"}),
             (["--service-kind", "openai"], {"service_kind": "openai"}),
-            # TODO: Remove streaming from implementation. It is invalid with HTTP.
-            # (["--streaming"], {"streaming": True}),
             (["--version"], {"version": True}),
             (["-u", "test_url"], {"u": "test_url"}),
             (["--url", "test_url"], {"u": "test_url"}),
10 changes: 5 additions & 5 deletions src/c++/perf_analyzer/genai-pa/tests/test_llm_metrics.py
@@ -199,7 +199,7 @@ def test_triton_llm_profile_data(self, prepare_triton_profile_data) -> None:
         )
 
         # experiment 1 statistics
-        stat = pd.get_statistics(infer_mode="concurrency", load_level=10)
+        stat = pd.get_statistics(infer_mode="concurrency", load_level="10")
 
         assert stat.avg_time_to_first_token == 2
         assert stat.avg_inter_token_latency == 2.25
@@ -227,7 +227,7 @@ def test_triton_llm_profile_data(self, prepare_triton_profile_data) -> None:
         assert stat.std_num_output_token == np.std([3, 5])
 
         # experiment 2 statistics
-        stat = pd.get_statistics(infer_mode="request_rate", load_level=2.0)
+        stat = pd.get_statistics(infer_mode="request_rate", load_level="2.0")
 
         assert stat.avg_time_to_first_token == 2.5
         assert stat.avg_inter_token_latency == 3
@@ -256,7 +256,7 @@ def test_triton_llm_profile_data(self, prepare_triton_profile_data) -> None:
 
         # check non-existing profile data
         with pytest.raises(KeyError):
-            pd.get_statistics(infer_mode="concurrency", load_level=30)
+            pd.get_statistics(infer_mode="concurrency", load_level="30")
 
     def test_openai_llm_profile_data(self, prepare_openai_profile_data) -> None:
         """Collect LLM metrics from profile export data and check values.
@@ -280,7 +280,7 @@ def test_openai_llm_profile_data(self, prepare_openai_profile_data) -> None:
         )
 
         # experiment 1 statistics
-        stat = pd.get_statistics(infer_mode="concurrency", load_level=10)
+        stat = pd.get_statistics(infer_mode="concurrency", load_level="10")
 
         assert stat.avg_time_to_first_token == 2
         assert stat.avg_inter_token_latency == 2.4
@@ -309,7 +309,7 @@ def test_openai_llm_profile_data(self, prepare_openai_profile_data) -> None:
 
         # check non-existing profile data
         with pytest.raises(KeyError):
-            pd.get_statistics(infer_mode="concurrency", load_level=40)
+            pd.get_statistics(infer_mode="concurrency", load_level="40")
 
     def test_llm_metrics_get_base_name(self) -> None:
         """Test get_base_name method in LLMMetrics class."""
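The test updates mirror a change in how get_statistics keys its experiments: load_level is now matched as a string, so numeric arguments like 10 or 2.0 no longer hit. A toy illustration of the lookup behavior, assuming a plain dict keyed on (infer_mode, load_level) pairs (the real class stores richer objects):

    experiments = {
        ("concurrency", "10"): "experiment 1 stats",
        ("request_rate", "2.0"): "experiment 2 stats",
    }

    print(("concurrency", "10") in experiments)  # True: string keys match
    print(("concurrency", 10) in experiments)    # False: 10 != "10", hence the KeyError checks above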
