diff --git a/src/c++/perf_analyzer/docs/examples/profile.py b/src/c++/perf_analyzer/docs/examples/profile.py
index 39ad94f76..7b6d298bd 100644
--- a/src/c++/perf_analyzer/docs/examples/profile.py
+++ b/src/c++/perf_analyzer/docs/examples/profile.py
@@ -27,35 +27,78 @@
 import argparse
 import json
 import subprocess
+from dataclasses import dataclass
 from itertools import pairwise
 from pathlib import Path
+from typing import Optional

 import numpy as np

-TEMP_INPUT_FILE = "temp_input_data.json"
+INPUT_FILENAME = "generated_input_data.json"
+TITLE = "\n[ BENCHMARK SUMMARY ]\n"
+PROMPT_SIZE = " Prompt size: {}"
+FIRST_TOKEN_LATENCY = "Average first-token latency: {:.4f} sec"
+T2T_LATENCY = "Average total token-to-token latency: {:.4f} sec"


-def load_profile_data():
-    with open("profile_export.json") as f:
+@dataclass
+class ProfileResults:
+    prompt_size: int
+    avg_first_token_latency: int
+    avg_total_t2t_latency: int
+    avg_periodic_t2t_latencies: Optional[list[int]] = None
+
+
+def load_json_data(filename):
+    with open(filename) as f:
         return json.load(f)


-def print_benchmark_summary(results):
-    output = "\n[ Benchmark Summary ]"
-    for prompt_size, avg_first_token_latency, avg_token_to_token_latency in results:
-        output += (
-            f"\n Prompt size: {prompt_size}, "
-            f"Average first-token latency: {avg_first_token_latency:.4f} sec"
-        )
-        output += (
-            f", Average token-to-token latency: {avg_token_to_token_latency:.4f} sec"
-            if avg_token_to_token_latency
-            else ""
-        )
-    print(output)
+def save_json_data(data, filename):
+    with open(filename, "w") as f:
+        json.dump(data, f)
+
+
+def get_postfix(args, prompt_size):
+    """Generate postfix for profile export filename and plot.
+
+    e.g.
+    - trtllm-prompt100-maxtokens256
+    - trtllm-prompt100-periodic1_100_1-period32-maxtokens1024
+    """
+    postfix = f"{args.model}-prompt{prompt_size}-"
+    if args.periodic_concurrency_range:
+        start, end, step = args.periodic_concurrency_range
+        postfix += f"periodic{start}_{end}_{step}-period{args.request_period}-"
+    postfix += f"maxtokens{args.max_tokens}"
+    return postfix
+
+
+def get_export_filename(args, prompt_size):
+    postfix = get_postfix(args, prompt_size)
+    filename = f"profile_export-{postfix}.json"
+    return filename
+
+
+def get_plot_filename(args, prompt_size):
+    postfix = get_postfix(args, prompt_size)
+    filename = f"inflight_batching_benchmark-{postfix}.png"
+    return filename


-def plot_results(latencies):
+def print_benchmark_summary(profile_results):
+    output = [TITLE]
+    for pr in profile_results:
+        line = [PROMPT_SIZE.format(pr.prompt_size)]
+        line += [FIRST_TOKEN_LATENCY.format(pr.avg_first_token_latency)]
+        if pr.avg_total_t2t_latency:
+            line += [T2T_LATENCY.format(pr.avg_total_t2t_latency)]
+        output += [", ".join(line) + "\n"]
+    print("".join(output))
+
+
+def plot_results(latencies, filename="inflight_batching_benchmark.png"):
     """Plot in-flight batching LLM bencharmark results."""
     import matplotlib.pyplot as plt  # Lazy import

@@ -70,8 +113,7 @@ def plot_results(latencies):
     ax.set_title("In-Flight Batching Benchmark Summary", fontsize=14)
     ax.set_ylim(bottom=0.0)

-    fig.savefig("inflight_batching_benchmark.png", dpi=300)
-    print("Saved benchmark result @ 'inflight_batching_benchmark.png'.")
+    fig.savefig(filename, dpi=300)


 def add_latencies_to_bins(bins, pos, responses, request_period):
@@ -101,7 +143,7 @@ def update_start_position(request_id, start_pos, initial_requests, step):
     return start_pos


-def collect_periodic_latencies(args):
+def collect_periodic_latencies(args, filename):
     """Split the entire benchmark results into segments with size of
     request period and collect latencies for each segment.
     """
@@ -114,7 +156,7 @@ def collect_periodic_latencies(args):
     bins = [[] for _ in range(num_bins)]
     bin_start_position = 0

-    data = load_profile_data()
+    data = load_json_data(filename)
     requests = data["experiments"][0]["requests"]

     for i, r in enumerate(requests):
@@ -133,11 +175,9 @@
     return bins


-def calculate_avg_periodic_latencies(args):
-    """Calculate average token-to-token latency for each
-    request period.
-    """
-    bins = collect_periodic_latencies(args)
+def calculate_avg_periodic_latencies(args, filename):
+    """Calculate average token-to-token latency for each request period."""
+    bins = collect_periodic_latencies(args, filename)

     latencies = []
     for bin in bins:
@@ -161,8 +201,9 @@ def collect_latencies(requests):
     return first_token_latencies, token_to_token_latencies


-def calculate_avg_latencies():
-    requests = load_profile_data()
+def calculate_avg_latencies(filename):
+    """Calculate avg first-token and avg total token-to-token latencies."""
+    requests = load_json_data(filename)
     first_token_latencies, token_to_token_latencies = collect_latencies(requests)

     # Compute mean and convert from nanosec to sec
@@ -174,15 +215,45 @@ def calculate_avg_latencies():
     return avg_first_token_latency, avg_token_to_token_latency


-def profile(args, input_data_file):
-    # Clean up
-    export_file = Path("profile_export.json")
-    export_file.unlink(missing_ok=True)
+def summarize_profile_results(args, prompts):
+    results = []
+    for prompt in prompts:
+        prompt_size = len(prompt.split())
+        export_file = get_export_filename(args, prompt_size)
+        avg_first_token_latency, avg_total_t2t_latency = calculate_avg_latencies(
+            filename=export_file
+        )
+
+        profile_result = ProfileResults(
+            prompt_size=prompt_size,
+            avg_first_token_latency=avg_first_token_latency,
+            avg_total_t2t_latency=avg_total_t2t_latency,
+        )
+
+        if args.periodic_concurrency_range:
+            periodic_latencies = calculate_avg_periodic_latencies(args, export_file)
+            profile_result.avg_periodic_t2t_latencies = periodic_latencies
+            plot_results(
+                latencies=periodic_latencies,
+                filename=get_plot_filename(args, prompt_size),
+            )
+
+        results.append(profile_result)
+
+    print_benchmark_summary(results)
+
+    if args.periodic_concurrency_range:
+        print(
+            "Saved in-flight batching benchmark plots "
+            "@ 'inflight_batching_benchmark-*.png'."
+        )
+

+def profile(args, export_file):
     command = (
         f"perf_analyzer -m {args.model} -i grpc --async --streaming "
-        f"--input-data={input_data_file} "
-        "--profile-export-file=profile_export.json "
+        f"--input-data={INPUT_FILENAME} "
+        f"--profile-export-file={export_file} "
     )
     if args.periodic_concurrency_range:
         start, end, step = args.periodic_concurrency_range
@@ -196,23 +267,87 @@ def profile(args, input_data_file):
         "--measurement-request-count=10 "
         "--stability-percentage=999"
     )
+
+    print("Running Perf Analyzer...")
     subprocess.run(args=[command], shell=True)


-def generate_input_data(args, prompt_size, filename):
-    request_parameters = f"""
-    {{
-        "max_tokens": {args.max_tokens},
-        "ignore_eos": {"true" if args.ignore_eos else "false"}
-    }}
+def prepare_export_file(args, prompt):
+    prompt_size = len(prompt.split())
+    filename = get_export_filename(args, prompt_size)
+
+    # If exists, clean up
+    export_file = Path(filename)
+    export_file.unlink(missing_ok=True)
+    return export_file
+
+
+def prepare_input_data(input_data, prompt):
+    """Insert the prompt to send into input JSON data."""
+    input_data["data"][0]["PROMPT"] = [prompt]
+    save_json_data(input_data, INPUT_FILENAME)
+
+
+def generate_prompts(args, input_data):
+    """Generate dummy prompts if not specified by input JSON file."""
+    prompt = input_data["data"][0]["PROMPT"][0]
+
+    if not prompt:  # Generate dummy prompt
+        assert args.prompt_size_range, "Must specify --prompt-size-range."
+        start, end, step = args.prompt_size_range
+        return [" ".join(["hi"] * size) for size in range(start, end + 1, step)]
+    return [prompt]
+
+
+def construct_input_data(args):
+    """Construct input data that contains input tensors and parameters.
+
+    Parse the input JSON file (if exists) to construct the input data.
+    When user sets parameters through command line, overwrite the
+    parameters set by input JSON file.
""" - input_data = {"data": [{"STREAM": [True]}]} - input_data["data"][0]["SAMPLING_PARAMETERS"] = [request_parameters] + prompt = "" + stream = True + sampling_params = {} - prompt = ["hi"] * prompt_size # Generate dummy prompt - input_data["data"][0]["PROMPT"] = [" ".join(prompt)] - with open(filename, "w") as f: - json.dump(input_data, f) + if args.input_data: + data = load_json_data(filename=args.input_data)["data"][0] + stream = data["STREAM"][0] if "STREAM" in data else stream + prompt = data["PROMPT"][0] if "PROMPT" in data else prompt + if "SAMPLING_PARAMETERS" in data: + sampling_params = json.loads(data["SAMPLING_PARAMETERS"][0]) + + # If specified, overwrite max_tokens + if args.max_tokens: + sampling_params["max_tokens"] = args.max_tokens + else: + args.max_tokens = sampling_params["max_tokens"] + + # If specified, overwrite ignore_eos + if "ignore_eos" not in sampling_params: + sampling_params["ignore_eos"] = args.ignore_eos + elif args.ignore_eos: + sampling_params["ignore_eos"] = True + + input_data = {"data": [{}]} + input_data["data"][0]["PROMPT"] = [prompt] + input_data["data"][0]["STREAM"] = [stream] + input_data["data"][0]["SAMPLING_PARAMETERS"] = [json.dumps(sampling_params)] + return input_data + + +def main(args): + input_data = construct_input_data(args) + prompts = generate_prompts(args, input_data) + + for prompt in prompts: + prepare_input_data(input_data, prompt) + export_file = prepare_export_file(args, prompt) + + # Run Perf Analyzer + profile(args, export_file) + + summarize_profile_results(args, prompts) if __name__ == "__main__": @@ -229,7 +364,6 @@ def generate_input_data(args, prompt_size, filename): type=int, nargs=3, metavar=("START", "END", "STEP"), - default=[10, 10, 1], help="The range of prompt sizes '<[START, END], STEP>' where END is inclusive.", ) parser.add_argument( @@ -248,7 +382,6 @@ def generate_input_data(args, prompt_size, filename): parser.add_argument( "--max-tokens", type=int, - default=256, help="The maximum number of tokens to generate.", ) parser.add_argument( @@ -262,34 +395,4 @@ def generate_input_data(args, prompt_size, filename): help="The input data file to be used for inference request.", ) args = parser.parse_args() - - results = [] - - if args.input_data: - print(f"Using input data file '{args.input_data}' for inference request.\n") - with open(args.input_data) as f: - input_data = json.load(f) - prompt_size = len(input_data["data"][0]["PROMPT"][0].split()) - args.prompt_size_range = [prompt_size, prompt_size, 1] - - start, end, step = args.prompt_size_range - for prompt_size in range(start, end + 1, step): - if not args.input_data: - generate_input_data(args, prompt_size, TEMP_INPUT_FILE) - - profile(args, args.input_data if args.input_data else TEMP_INPUT_FILE) - - if not args.periodic_concurrency_range: - ( - avg_first_token_latency, - avg_token_to_token_latency, - ) = calculate_avg_latencies() - results.append( - (prompt_size, avg_first_token_latency, avg_token_to_token_latency) - ) - - if args.periodic_concurrency_range: - avg_latencies = calculate_avg_periodic_latencies(args) - plot_results(avg_latencies) - else: - print_benchmark_summary(results) + main(args) diff --git a/src/c++/perf_analyzer/docs/llm.md b/src/c++/perf_analyzer/docs/llm.md index 509bdd127..107b82ccb 100644 --- a/src/c++/perf_analyzer/docs/llm.md +++ b/src/c++/perf_analyzer/docs/llm.md @@ -72,17 +72,19 @@ of size 100, 300, and 500 and receive single token from the model for each promp python profile.py -m vllm --prompt-size-range 100 500 200 
diff --git a/src/c++/perf_analyzer/docs/llm.md b/src/c++/perf_analyzer/docs/llm.md
index 509bdd127..107b82ccb 100644
--- a/src/c++/perf_analyzer/docs/llm.md
+++ b/src/c++/perf_analyzer/docs/llm.md
@@ -72,17 +72,19 @@ of size 100, 300, and 500 and receive single token from the model for each promp
 python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1

 # Sample output
-# [ Benchmark Summary ]
-# Prompt size: 100, Average first-token latency: 0.0459 sec
-# Prompt size: 300, Average first-token latency: 0.0415 sec
-# Prompt size: 500, Average first-token latency: 0.0451 sec
+# [ BENCHMARK SUMMARY ]
+# Prompt size: 100, Average first-token latency: 0.0441 sec
+# Prompt size: 300, Average first-token latency: 0.0427 sec
+# Prompt size: 500, Average first-token latency: 0.0555 sec
 ```

 > **Note**
 >
-> In order to provide a specific prompt (instead of the dummy prompt generated by default),
-> the user can provide input data JSON file using `--input-data` option.
-> This will however *ignore* any parameters specified through the command line.
+> Users can also run a custom prompt by providing an input data JSON file with
+> the `--input-data` option, which may also specify input tensors and parameters
+> for the model. However, when a parameter is set both in the input data JSON
+> file and through a command line option (e.g. `max_tokens`), the command line
+> value overwrites the one in the input data JSON file.
 > ```bash
 > $ echo '
 > {
@@ -122,10 +124,10 @@ prompts.
 python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos

 # Sample output
-# [ Benchmark Summary ]
-# Prompt size: 100, Average first-token latency: 0.0388 sec, Average token-to-token latency: 0.0066 sec
-# Prompt size: 300, Average first-token latency: 0.0431 sec, Average token-to-token latency: 0.0071 sec
-# Prompt size: 500, Average first-token latency: 0.0400 sec, Average token-to-token latency: 0.0070 sec
+# [ BENCHMARK SUMMARY ]
+# Prompt size: 100, Average first-token latency: 0.0388 sec, Average total token-to-token latency: 0.0066 sec
+# Prompt size: 300, Average first-token latency: 0.0431 sec, Average total token-to-token latency: 0.0071 sec
+# Prompt size: 500, Average first-token latency: 0.0400 sec, Average total token-to-token latency: 0.0070 sec
 ```

 ## Benchmark 3: Profiling In-Flight Batching
@@ -160,10 +162,13 @@ Run the following command inside the client container.
 pip install matplotlib

 # Run Perf Analyzer
-python profile.py -m vllm --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos
+python profile.py -m vllm --prompt-size-range 10 10 1 --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos

 # Sample output
-# Saved benchmark result @ 'inflight_batching_benchmark.png'.
+# [ BENCHMARK SUMMARY ]
+# Prompt size: 10, Average first-token latency: 0.0799 sec, Average total token-to-token latency: 0.0324 sec
+#
+# Saved in-flight batching benchmark plots @ 'inflight_batching_benchmark-*.png'.
 ```

 The resulting plot will look like