Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional Cli parsing #474

Merged
merged 6 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 117 additions & 29 deletions src/c++/perf_analyzer/genai-pa/genai_pa/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,7 @@
def handler(args):
    """Entry point invoked after CLI parsing: run Perf Analyzer.

    Args:
        args: the parsed ``argparse.Namespace``; ``args.model`` names the
            model to profile and the remaining attributes are forwarded
            to the perf_analyzer command line by the wrapper.
    """
    # Imported here rather than at module top — presumably to keep arg
    # parsing/--help fast and avoid an import cycle; TODO confirm.
    from genai_pa.wrapper import Profiler

    Profiler.run(model=args.model, args=args)


### Parsers ###
Expand All @@ -58,20 +56,41 @@ def add_model_args(parser):


def add_profile_args(parser):
    """Register profiling-related CLI options on *parser*.

    These options mirror a subset of perf_analyzer's own flags; the
    wrapper forwards each parsed attribute to the perf_analyzer command
    line, so names/defaults here must stay aligned with perf_analyzer.
    """
    parser.add_argument(
        "--async",
        action="store_true",
        required=False,
        help="Enables asynchronous mode in perf_analyzer. "
        "By default, perf_analyzer will use synchronous API to "
        "request inference. However, if the model is sequential, "
        "then default mode is asynchronous. Specify --sync to "
        "operate sequential models in synchronous mode. In synchronous "
        "mode, perf_analyzer will start threads equal to the concurrency "
        "level. Use asynchronous mode to limit the number of threads, yet "
        "maintain the concurrency.",
    )
    parser.add_argument(
        "-b",
        "--batch-size",
        type=int,
        default=1,
        required=False,
        help="The batch size to benchmark. The default value is 1.",
    )
    # No default on purpose: parse_args() treats the absence of both
    # --concurrency and --request-rate specially (they are mutually
    # exclusive, with concurrency falling back to 1 if neither is given).
    parser.add_argument(
        "--concurrency",
        type=int,
        required=False,
        help="Sets the concurrency value to benchmark.",
    )
    parser.add_argument(
        "--max-threads",
        type=int,
        default=16,
        required=False,
        help="Sets the maximum number of threads that will be "
        "created for providing desired concurrency or request rate. "
        "However, when running in synchronous mode, this value will be ignored. "
        "The default value is 16.",
    )
    parser.add_argument(
        "--output-length",
        type=int,
        default=128,
        required=False,
        help="The output length (tokens) to use for benchmarking LLMs. (Default: 128)",
    )
    parser.add_argument(
        "--profile-export-file",
        type=str,
        required=False,
        help="Specifies the path that the profile export will be "
        "generated at. By default, the profile export will not be "
        "generated.",
    )
    parser.add_argument(
        "--request-rate",
        type=float,
        required=False,
        help="Sets the request rate for the load generated by analyzer.",
    )
    parser.add_argument(
        "--service-kind",
        type=str,
        choices=["triton", "openai"],
        default="triton",
        required=False,
        # Help text fixed: it previously began with a sentence
        # copy-pasted from --request-rate.
        help="Describes the kind of service perf_analyzer will "
        'generate load for. The options are "triton" and '
        '"openai". The default value is "triton".',
    )
    parser.add_argument(
        "--streaming",
        action="store_true",
        required=False,
        help="Enables the use of streaming API. This flag is "
        "only valid with gRPC protocol. By default, it is set false.",
    )
    parser.add_argument(
        "--sync",
        action="store_true",
        required=False,
        # Help text fixed: it was copy-pasted from --async and wrongly
        # described asynchronous mode.
        help="Enables synchronous mode in perf_analyzer. "
        "Can be used to operate sequential models in synchronous mode. "
        "In synchronous mode, perf_analyzer will start threads equal "
        "to the concurrency level.",
    )
    parser.add_argument(
        "--version",
        action="store_true",
        required=False,
        help="Enables the printing of the current version of perf_analyzer. "
        "By default, it is set false.",
    )


def add_endpoint_args(parser):
    """Register endpoint-related CLI options on *parser*."""
    url_help = "URL of the endpoint to target for benchmarking."
    # NOTE(review): "--u" (double dash, single letter) mirrors
    # perf_analyzer's -u option; confirm the spelling is intended.
    parser.add_argument(
        "--u",
        required=False,
        type=str,
        default="localhost:8001",
        help=url_help,
    )


def add_dataset_args(parser):
    # Intentionally a no-op for now: the dataset/tokenizer options below
    # are kept (commented out) pending a decision on whether GenAI-PA
    # should own dataset selection at all.
    pass
    # TODO: Do we want to remove dataset and tokenizer?
    # parser.add_argument(
    #     "--dataset",
    #     type=str,
    #     default="OpenOrca",
    #     choices=["OpenOrca", "cnn_dailymail"],
    #     required=False,
    #     help="HuggingFace dataset to use for the benchmark.",
    # )
    # parser.add_argument(
    #     "--tokenizer",
    #     type=str,
    #     default="auto",
    #     choices=["auto"],
    #     required=False,
    #     help="The HuggingFace tokenizer to use to interpret token metrics from final text results",
    # )


### Entrypoint ###


# Optional argv used for testing - will default to sys.argv if None.
def parse_args(argv=None):
parser = argparse.ArgumentParser(
prog="genai-pa",
description="CLI to profile LLMs and Generative AI models with PA",
description="CLI to profile LLMs and Generative AI models with Perf Analyzer",
)
parser.set_defaults(func=handler)

Expand All @@ -143,4 +209,26 @@ def parse_args(argv=None):
add_dataset_args(dataset_group)

args = parser.parse_args(argv)

# Concurrency and request rate are mutually exclusive
# TODO: Review if there is a cleaner way to do this with argparse
debermudez marked this conversation as resolved.
Show resolved Hide resolved
if args.concurrency is not None and args.request_rate is not None:
parser.error(
"Arguments --concurrency and --request_rate are mutually exclusive."
rmccorm4 marked this conversation as resolved.
Show resolved Hide resolved
)

if args.concurrency is None and args.request_rate is None:
args.concurrency = 1
print(
"Neither --concurrency nor --request_rate provided. Setting concurrency to 1."
)

# Update GenAI-PA non-range attributes to range format for PA
for attr_key in ["concurrency", "request_rate"]:
attr_val = getattr(args, attr_key)
if attr_val is not None:
setattr(args, f"{attr_key}_range", f"{attr_val}:{attr_val}:{attr_val}")
matthewkotila marked this conversation as resolved.
Show resolved Hide resolved
delattr(args, attr_key)

args = argparse.Namespace(**{k: v for k, v in vars(args).items() if v is not None})
matthewkotila marked this conversation as resolved.
Show resolved Hide resolved
return args
31 changes: 14 additions & 17 deletions src/c++/perf_analyzer/genai-pa/genai_pa/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,22 +35,19 @@

class Profiler:
    """Translates parsed GenAI-PA args into a perf_analyzer command line
    and executes it."""

    @staticmethod
    def run(model, args):
        """Build and run the perf_analyzer command for *model*.

        Every attribute on *args* (except those in ``skip_args``) is
        forwarded as a ``--<name> <value>`` option; attributes that are
        ``True`` become bare flags and ``False`` ones are omitted.
        """
        # GenAI-PA-internal attributes that must not be forwarded to
        # perf_analyzer.
        skip_args = ["model", "func"]

        cmd = f"perf_analyzer -m {model} "
        for arg, value in vars(args).items():
            # Check skip_args FIRST so an internal attribute is never
            # forwarded even if it happens to hold a boolean.
            if arg in skip_args:
                continue
            # argparse dests use underscores (e.g. concurrency_range) but
            # perf_analyzer long options are dash-separated.
            arg = arg.replace("_", "-")
            if value is True:
                cmd += f"--{arg} "  # store_true flag: no value
            elif value is False:
                continue  # flag left at its default; omit entirely
            else:
                cmd += f"--{arg} {value} "

        logger.info(f"Running Perf Analyzer : '{cmd}'")
        # NOTE(review): shell=True interpolates user-supplied values into
        # a shell string; consider building an argv list and using
        # shell=False instead.
        subprocess.run(cmd, shell=True)
Loading