diff --git a/src/c++/perf_analyzer/docs/examples/inflight_batching_benchmark.png b/src/c++/perf_analyzer/docs/examples/inflight_batching_benchmark.png
new file mode 100644
index 000000000..83e227410
Binary files /dev/null and b/src/c++/perf_analyzer/docs/examples/inflight_batching_benchmark.png differ
diff --git a/src/c++/perf_analyzer/docs/examples/profile.py b/src/c++/perf_analyzer/docs/examples/profile.py
index 868cd8d20..39ad94f76 100644
--- a/src/c++/perf_analyzer/docs/examples/profile.py
+++ b/src/c++/perf_analyzer/docs/examples/profile.py
@@ -27,8 +27,10 @@
 import argparse
 import json
 import subprocess
+from itertools import pairwise
 from pathlib import Path
-from statistics import mean
+
+import numpy as np
 
 TEMP_INPUT_FILE = "temp_input_data.json"
 
@@ -38,6 +40,111 @@ def load_profile_data():
         return json.load(f)
 
 
+def print_benchmark_summary(results):
+    output = "\n[ Benchmark Summary ]"
+    for prompt_size, avg_first_token_latency, avg_token_to_token_latency in results:
+        output += (
+            f"\n Prompt size: {prompt_size}, "
+            f"Average first-token latency: {avg_first_token_latency:.4f} sec"
+        )
+        output += (
+            f", Average token-to-token latency: {avg_token_to_token_latency:.4f} sec"
+            if avg_token_to_token_latency
+            else ""
+        )
+    print(output)
+
+
+def plot_results(latencies):
+    """Plot in-flight batching LLM benchmark results."""
+    import matplotlib.pyplot as plt  # Lazy import
+
+    periods = np.arange(1, len(latencies) + 1)
+    fig, ax = plt.subplots()
+    ax.plot(periods, latencies)
+
+    # Set pyplot parameters
+    ax.grid(linestyle="--")
+    ax.set_xlabel("i-th Request Period", fontsize=12)
+    ax.set_ylabel("Avg Token-to-Token Latency (sec)", fontsize=12)
+    ax.set_title("In-Flight Batching Benchmark Summary", fontsize=14)
+    ax.set_ylim(bottom=0.0)
+
+    fig.savefig("inflight_batching_benchmark.png", dpi=300)
+    print("Saved benchmark result @ 'inflight_batching_benchmark.png'.")
+
+
+def add_latencies_to_bins(bins, pos, responses, request_period):
+    """Add token-to-token latencies into the corresponding bins.
+
+    Given the responses of a single request, calculate the token-to-token
+    latencies and add them into the current bin. Move on to the next bin
+    after every request period.
+    """
+    for response_id, (prev_res, res) in enumerate(pairwise(responses)):
+        bins[pos].append(res - prev_res)
+        if (response_id + 1) % request_period == 0:
+            pos += 1
+
+
+def update_start_position(request_id, start_pos, initial_requests, step):
+    """Shift the start position of the bin.
+
+    Once we have iterated through all of the initial requests, shift the
+    start position. After that, shift the start position again for every
+    `step` subsequent requests.
+    """
+    if (request_id + 1) >= initial_requests:
+        num_requests_after_start = request_id + 1 - initial_requests
+        if num_requests_after_start % step == 0:
+            start_pos += 1
+    return start_pos
+
+
+def collect_periodic_latencies(args):
+    """Split the benchmark results into segments of request-period size
+    and collect the token-to-token latencies for each segment.
+    """
+    start, end, step = args.periodic_concurrency_range
+
+    num_bins = args.max_tokens // args.request_period + (end - start) // step
+    if args.max_tokens % args.request_period != 0:
+        num_bins += 1  # extra bin
+
+    bins = [[] for _ in range(num_bins)]
+    bin_start_position = 0
+
+    data = load_profile_data()
+    requests = data["experiments"][0]["requests"]
+
+    for i, r in enumerate(requests):
+        add_latencies_to_bins(
+            bins=bins,
+            pos=bin_start_position,
+            responses=r["response_timestamps"],
+            request_period=args.request_period,
+        )
+        bin_start_position = update_start_position(
+            request_id=i,
+            start_pos=bin_start_position,
+            initial_requests=start,
+            step=step,
+        )
+    return bins
+
+
+def calculate_avg_periodic_latencies(args):
+    """Calculate average token-to-token latency for each
+    request period.
+    """
+    bins = collect_periodic_latencies(args)
+
+    latencies = []
+    for bin in bins:
+        latencies.append(np.mean(bin) / 1_000_000_000)
+    return latencies
+
+
 def collect_latencies(requests):
     # Example json demonstrating format:
     # see client/src/c++/perf_analyzer/docs/examples/decoupled_output_file.json
@@ -59,9 +166,9 @@ def calculate_avg_latencies():
     first_token_latencies, token_to_token_latencies = collect_latencies(requests)
 
     # Compute mean and convert from nanosec to sec
-    avg_first_token_latency = mean(first_token_latencies) / 1_000_000_000
+    avg_first_token_latency = np.mean(first_token_latencies) / 1_000_000_000
     if token_to_token_latencies:
-        avg_token_to_token_latency = mean(token_to_token_latencies) / 1_000_000_000
+        avg_token_to_token_latency = np.mean(token_to_token_latencies) / 1_000_000_000
     else:
         avg_token_to_token_latency = None
     return avg_first_token_latency, avg_token_to_token_latency
@@ -115,7 +222,6 @@
         "--model",
         type=str,
         default="vllm",
-        choices=["vllm"],
         help="The name of the model to profile.",
     )
     parser.add_argument(
@@ -172,20 +278,18 @@ def generate_input_data(args, prompt_size, filename):
         generate_input_data(args, prompt_size, TEMP_INPUT_FILE)
 
         profile(args, args.input_data if args.input_data else TEMP_INPUT_FILE)
-        avg_first_token_latency, avg_token_to_token_latency = calculate_avg_latencies()
-        results.append(
-            (prompt_size, avg_first_token_latency, avg_token_to_token_latency)
-        )
-    print("\n[ Benchmark Summary ]")
-    for prompt_size, avg_first_token_latency, avg_token_to_token_latency in results:
-        line = (
-            f" Prompt size: {prompt_size}, "
-            f"Average first-token latency: {avg_first_token_latency:.4f} sec"
-        )
-        line += (
-            f", Average token-token latency: {avg_token_to_token_latency:.4f} sec"
-            if avg_token_to_token_latency
-            else ""
-        )
-        print(line)
+        if not args.periodic_concurrency_range:
+            (
+                avg_first_token_latency,
+                avg_token_to_token_latency,
+            ) = calculate_avg_latencies()
+            results.append(
+                (prompt_size, avg_first_token_latency, avg_token_to_token_latency)
+            )
+
+    if args.periodic_concurrency_range:
+        avg_latencies = calculate_avg_periodic_latencies(args)
+        plot_results(avg_latencies)
+    else:
+        print_benchmark_summary(results)
diff --git a/src/c++/perf_analyzer/docs/llm.md b/src/c++/perf_analyzer/docs/llm.md
index 32c81bf68..509bdd127 100644
--- a/src/c++/perf_analyzer/docs/llm.md
+++ b/src/c++/perf_analyzer/docs/llm.md
@@ -115,47 +115,90 @@ input sizes and request the model to compute a fixed amount of tokens.
 
 #### Example
 
 Inside the client container, run the following command to generate dummy prompts
-of size 100, 300, and 500 and receive total 256 tokens from the model for each prompts.
+of size 100, 300, and 500 and receive a total of 256 tokens from the model for
+each prompt.
 
 ```bash
 python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos
 
 # Sample output
 # [ Benchmark Summary ]
-# Prompt size: 100, Average first-token latency: 0.0388 sec, Average token-token latency: 0.0066 sec
-# Prompt size: 300, Average first-token latency: 0.0431 sec, Average token-token latency: 0.0071 sec
-# Prompt size: 500, Average first-token latency: 0.0400 sec, Average token-token latency: 0.0070 sec
+# Prompt size: 100, Average first-token latency: 0.0388 sec, Average token-to-token latency: 0.0066 sec
+# Prompt size: 300, Average first-token latency: 0.0431 sec, Average token-to-token latency: 0.0071 sec
+# Prompt size: 500, Average first-token latency: 0.0400 sec, Average token-to-token latency: 0.0070 sec
 ```
 
-## Benchmark 3: Profiling Continuous Batch Size
+## Benchmark 3: Profiling In-Flight Batching
 
 > **Note**
 >
-> This benchmark relies on the feature that will be available from `23.10` release
-> which is on its way soon. You can either wait until the `23.10` container
-> is ready or build Perf Analyzer from the latest `main` branch (see [build from source instructions](install.md#build-from-source)).
+> This benchmark relies on a feature that will be available in the `23.10`
+> release, which is on its way soon. You can either wait until the `23.10`
+> container is ready or build Perf Analyzer from the latest `main` branch
+> (see [build from source instructions](install.md#build-from-source)).
 
-In this benchmarking scenario, we want to measure the effect of continuous
-batch size on token-to-token latency. We systematically issue requests to the
-server of fixed input sizes and request the model to compute a fixed amount of
-tokens in order to increase the continuous batching size over time.
+In this benchmarking scenario, we want to measure the effect of the in-flight
+batch size on token-to-token (T2T) latency. We systematically issue requests of
+fixed input sizes to the server and request the model to compute a fixed number
+of tokens in order to increase the in-flight batch size over time.
 
 #### Example
 
-In this benchmark, we are interested in how continuous batch size affects token-to-token latency
-by increasing the number of concurrent requests to the model.
-Perf Analyzer will run in [periodic concurrency mode](https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/perf_analyzer/docs/inference_load_modes.md#periodic-concurrency-mode)
-that periodically launches a new concurrent request to the model using `--periodic-concurrency-range START END STEP` option.
-In this example, Perf Analyzer starts with a single request and launches the new ones until the total number reaches 30.
-You can also specify the timing of the new requests: For example, setting the `--request-period` to 50 will make
-Perf Analyzer to wait for all the requests to receive 50 responses before it launches the new requests.
+In this benchmark, we will run Perf Analyzer in
+[periodic concurrency mode](inference_load_modes.md#periodic-concurrency-mode),
+which periodically launches a new concurrent request to the model using the
+`--periodic-concurrency-range START END STEP` option.
+In this example, Perf Analyzer starts with a single request and launches new
+ones until the total number reaches 100.
+You can also specify the timing of the new requests:
+Setting `--request-period` to 32 (as shown below) will make Perf Analyzer
+wait for all the requests to receive 32 responses before launching new requests.
+Run the following command inside the client container.
 
 ```bash
-python profile.py -m vllm --prompt-size-range 100 500 200 --periodic-concurrency-range 1 30 1 --request-period 50 --max-tokens 256 --ignore-eos
+# Install matplotlib to generate the benchmark plot
+pip install matplotlib
+
+# Run Perf Analyzer
+python profile.py -m vllm --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos
 
 # Sample output
-# [ Benchmark Summary ]
-# Prompt size: 100, Average first-token latency: 0.0381 sec, Average token-token latency: 0.0106 sec
-# Prompt size: 300, Average first-token latency: 0.0347 sec, Average token-token latency: 0.0109 sec
-# Prompt size: 500, Average first-token latency: 0.0336 sec, Average token-token latency: 0.0101 sec
+# Saved benchmark result @ 'inflight_batching_benchmark.png'.
 ```
+
+The resulting plot will look like the following:
+
+![In-flight batching benchmark results](examples/inflight_batching_benchmark.png)
+
+The plot demonstrates how the average T2T latency changes across the entire
+benchmark process as we increase the number of requests.
+To observe the change, we first align the responses of every request and then
+split them into multiple segments of responses.
+For instance, assume we ran the following benchmark command:
+
+```bash
+python profile.py -m vllm --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos
+```
+
+We start from a single request and add a new request after every 32 responses
+(defined by `--request-period`) until we reach 4 concurrent requests.
+Each request generates a total of 1024 responses (defined by `--max-tokens`).
+We align these 1024 responses and split them by request period,
+giving us 1024/32 = 32 segments per request, as shown below:
+
+```
+           32 responses (=request period)
+           ┌────┐
+request 1  ──────┊──────┊──────┊──────┊─ ··· ─┊──────┊
+request 2        ┊──────┊──────┊──────┊─ ··· ─┊──────┊──────┊
+request 3        ┊      ┊──────┊──────┊─ ··· ─┊──────┊──────┊──────┊
+request 4        ┊      ┊      ┊──────┊─ ··· ─┊──────┊──────┊──────┊──────
+
+segment #     1      2      3      4    ···     32     33     34     35
+```
+
+Then, for each segment, we compute the mean of the T2T latencies of its
+responses. This allows us to visualize how the T2T latency changes as the
+number of requests increases, filling up the in-flight batch slots, and as
+the requests terminate.
+See [profile.py](examples/profile.py) for more details.
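+
+For illustration only, below is a minimal, self-contained sketch of the same
+per-segment averaging for a single request. It is not part of `profile.py`;
+the synthetic timestamps stand in for one request's `response_timestamps` from
+the exported profile data, and the constants assume the `--request-period 32`
+and `--max-tokens 1024` settings used above.
+
+```python
+from itertools import pairwise
+
+import numpy as np
+
+REQUEST_PERIOD = 32
+MAX_TOKENS = 1024
+
+# Synthetic response timestamps in nanoseconds, one per generated token.
+rng = np.random.default_rng(seed=0)
+gaps_ns = rng.integers(5_000_000, 15_000_000, size=MAX_TOKENS)
+timestamps = np.cumsum(gaps_ns)
+
+# Token-to-token (T2T) latency is the gap between consecutive responses.
+t2t_latencies = [curr - prev for prev, curr in pairwise(timestamps)]
+
+# Split the latencies into request-period-sized segments and average each
+# segment, converting nanoseconds to seconds.
+segments = [
+    t2t_latencies[i : i + REQUEST_PERIOD]
+    for i in range(0, len(t2t_latencies), REQUEST_PERIOD)
+]
+avg_per_segment = [np.mean(segment) / 1_000_000_000 for segment in segments]
+
+for i, latency in enumerate(avg_per_segment, start=1):
+    print(f"Segment {i}: average T2T latency = {latency:.4f} sec")
+```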