Improve metric and generate plot for in-flight batch size benchmark #422

Merged 17 commits on Oct 26, 2023
144 changes: 124 additions & 20 deletions src/c++/perf_analyzer/docs/examples/profile.py
@@ -27,8 +27,10 @@
import argparse
import json
import subprocess
from itertools import pairwise
from pathlib import Path
from statistics import mean

import numpy as np

TEMP_INPUT_FILE = "temp_input_data.json"

@@ -38,6 +40,111 @@ def load_profile_data():
return json.load(f)


def print_benchmark_summary(results):
output = "\n[ Benchmark Summary ]"
for prompt_size, avg_first_token_latency, avg_token_to_token_latency in results:
output += (
f"\n Prompt size: {prompt_size}, "
f"Average first-token latency: {avg_first_token_latency:.4f} sec"
)
output += (
f", Average token-to-token latency: {avg_token_to_token_latency:.4f} sec"
if avg_token_to_token_latency
else ""
)
print(output)


def plot_results(latencies):
"""Plot in-flight batching LLM bencharmark results."""
import matplotlib.pyplot as plt # Lazy import

periods = np.arange(1, len(latencies) + 1)
fig, ax = plt.subplots()
ax.plot(periods, latencies)

# Set pyplot parameters
ax.grid(linestyle="--")
ax.set_xlabel("i-th Request Period", fontsize=12)
ax.set_ylabel("Avg Token-to-Token Latency (sec)", fontsize=12)
ax.set_title("In-Flight Batching Benchmark Summary", fontsize=14)
ax.set_ylim(bottom=0.0)

fig.savefig("inflight_batching_benchmark.png", dpi=300)
print("Saved benchmark result @ 'inflight_batching_benchmark.png'.")


def add_latencies_to_bins(bins, pos, responses, request_period):
"""Add token-to-token latencies into the corresponding bin.

Given the responses of a single request, calculate token-to-token
latency and add it into bin. Update the bin position to the next
for every request period.
"""
for response_id, (prev_res, res) in enumerate(pairwise(responses)):
bins[pos].append(res - prev_res)
if (response_id + 1) % request_period == 0:
pos += 1


def update_start_position(request_id, start_pos, initial_requests, step):
"""Shift the start position of the bin.

Once we iterate through the entire <start> requests, we shift
the start position. Then, we shift the start position for every
<step> requests.
"""
if (request_id + 1) >= initial_requests:
num_requests_after_start = request_id + 1 - initial_requests
if num_requests_after_start % step == 0:
start_pos += 1
return start_pos


def collect_periodic_latencies(args):
"""Split the entire benchmark results into segments with size
of request period and collect latencies for each segment.
"""
start, end, step = args.periodic_concurrency_range

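# One bin per full request period in a single request, plus one extra bin
# for each additional request launched after the initial <start> requests.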
num_bins = args.max_tokens // args.request_period + (end - start) // step
if args.max_tokens % args.request_period != 0:
num_bins += 1 # extra bin

bins = [[] for _ in range(num_bins)]
bin_start_position = 0

data = load_profile_data()
requests = data["experiments"][0]["requests"]

for i, r in enumerate(requests):
add_latencies_to_bins(
bins=bins,
pos=bin_start_position,
responses=r["response_timestamps"],
request_period=args.request_period,
)
bin_start_position = update_start_position(
request_id=i,
start_pos=bin_start_position,
initial_requests=start,
step=step,
)
return bins


def calculate_avg_periodic_latencies(args):
"""Calculate average token-to-token latency for each
request period.
"""
bins = collect_periodic_latencies(args)

latencies = []
for bin in bins:
latencies.append(np.mean(bin) / 1_000_000_000)
return latencies


def collect_latencies(requests):
# Example json demonstrating format:
# see client/src/c++/perf_analyzer/docs/examples/decoupled_output_file.json
@@ -59,9 +166,9 @@ def calculate_avg_latencies():
first_token_latencies, token_to_token_latencies = collect_latencies(requests)

# Compute mean and convert from nanosec to sec
avg_first_token_latency = mean(first_token_latencies) / 1_000_000_000
avg_first_token_latency = np.mean(first_token_latencies) / 1_000_000_000
if token_to_token_latencies:
avg_token_to_token_latency = mean(token_to_token_latencies) / 1_000_000_000
avg_token_to_token_latency = np.mean(token_to_token_latencies) / 1_000_000_000
else:
avg_token_to_token_latency = None
return avg_first_token_latency, avg_token_to_token_latency
@@ -115,7 +222,6 @@ def generate_input_data(args, prompt_size, filename):
"--model",
type=str,
default="vllm",
choices=["vllm"],
help="The name of the model to profile.",
)
parser.add_argument(
@@ -172,20 +278,18 @@ def generate_input_data(args, prompt_size, filename):
generate_input_data(args, prompt_size, TEMP_INPUT_FILE)

profile(args, args.input_data if args.input_data else TEMP_INPUT_FILE)
avg_first_token_latency, avg_token_to_token_latency = calculate_avg_latencies()
results.append(
(prompt_size, avg_first_token_latency, avg_token_to_token_latency)
)

print("\n[ Benchmark Summary ]")
for prompt_size, avg_first_token_latency, avg_token_to_token_latency in results:
line = (
f" Prompt size: {prompt_size}, "
f"Average first-token latency: {avg_first_token_latency:.4f} sec"
)
line += (
f", Average token-token latency: {avg_token_to_token_latency:.4f} sec"
if avg_token_to_token_latency
else ""
)
print(line)
if not args.periodic_concurrency_range:
(
avg_first_token_latency,
avg_token_to_token_latency,
) = calculate_avg_latencies()
results.append(
(prompt_size, avg_first_token_latency, avg_token_to_token_latency)
)

if args.periodic_concurrency_range:
avg_latencies = calculate_avg_periodic_latencies(args)
plot_results(avg_latencies)
else:
print_benchmark_summary(results)
91 changes: 67 additions & 24 deletions src/c++/perf_analyzer/docs/llm.md
@@ -115,47 +115,90 @@ input sizes and request the model to compute a fixed amount of tokens.
#### Example

Inside the client container, run the following command to generate dummy prompts
of size 100, 300, and 500 and receive a total of 256 tokens from the model for
each prompt.

```bash
python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos

# Sample output
# [ Benchmark Summary ]
# Prompt size: 100, Average first-token latency: 0.0388 sec, Average token-token latency: 0.0066 sec
# Prompt size: 300, Average first-token latency: 0.0431 sec, Average token-token latency: 0.0071 sec
# Prompt size: 500, Average first-token latency: 0.0400 sec, Average token-token latency: 0.0070 sec
# Prompt size: 100, Average first-token latency: 0.0388 sec, Average token-to-token latency: 0.0066 sec
# Prompt size: 300, Average first-token latency: 0.0431 sec, Average token-to-token latency: 0.0071 sec
# Prompt size: 500, Average first-token latency: 0.0400 sec, Average token-to-token latency: 0.0070 sec
```
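
Both metrics are computed from the per-request response timestamps that Perf
Analyzer records. The sketch below shows one way to derive them, following the
same approach as `profile.py`; the export file name and the request-start field
name (`timestamp`) are assumptions for illustration, while
`experiments[0].requests` and `response_timestamps` (in nanoseconds) match the
fields `profile.py` reads.

```python
import json

import numpy as np


def summarize_latencies(profile_export="profile_export.json"):
    """Rough latency summary from a Perf Analyzer profile export (sketch)."""
    with open(profile_export) as f:
        requests = json.load(f)["experiments"][0]["requests"]

    first_token, token_to_token = [], []
    for r in requests:
        responses = r["response_timestamps"]  # nanoseconds
        # Time from sending the request until the first response arrives.
        # NOTE: "timestamp" (request send time) is an assumed field name.
        first_token.append(responses[0] - r["timestamp"])
        # Gaps between consecutive responses of the same request.
        token_to_token += [b - a for a, b in zip(responses, responses[1:])]

    # Convert nanoseconds to seconds before averaging.
    return np.mean(first_token) / 1e9, np.mean(token_to_token) / 1e9
```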

## Benchmark 3: Profiling Continuous Batch Size
## Benchmark 3: Profiling In-Flight Batching

> **Note**
>
> This benchmark relies on the feature that will be available from `23.10`
> release which is on its way soon. You can either wait until the `23.10`
> container is ready or build Perf Analyzer from the latest `main` branch
> (see [build from source instructions](install.md#build-from-source)).

In this benchmarking scenario, we want to measure the effect of continuous
batch size on token-to-token latency. We systematically issue requests to the
server of fixed input sizes and request the model to compute a fixed amount of
tokens in order to increase the continuous batching size over time.
In this benchmarking scenario, we want to measure the effect of in-flight
batch size on token-to-token (T2T) latency. We systematically issue requests of
fixed input size to the server and ask the model to generate a fixed number of
tokens, so that the in-flight batch size increases over time.

#### Example

In this benchmark, we are interested in how continuous batch size affects token-to-token latency
by increasing the number of concurrent requests to the model.
Perf Analyzer will run in [periodic concurrency mode](https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/perf_analyzer/docs/inference_load_modes.md#periodic-concurrency-mode)
that periodically launches a new concurrent request to the model using `--periodic-concurrency-range START END STEP` option.
In this example, Perf Analyzer starts with a single request and launches the new ones until the total number reaches 30.
You can also specify the timing of the new requests: For example, setting the `--request-period` to 50 will make
Perf Analyzer to wait for all the requests to receive 50 responses before it launches the new requests.
In this benchmark, we will run Perf Analyzer in
[periodic concurrency mode](inference_load_modes.md#periodic-concurrency-mode),
which periodically launches new concurrent requests to the model using the
`--periodic-concurrency-range START END STEP` option.
In this example, Perf Analyzer starts with a single request and keeps launching
new ones until the total number reaches 100.
You can also control the timing of the new requests:
setting `--request-period` to 32 (as shown below) makes Perf Analyzer wait
until all outstanding requests have received 32 responses before launching new ones.
Run the following command inside the client container.

```bash
python profile.py -m vllm --prompt-size-range 100 500 200 --periodic-concurrency-range 1 30 1 --request-period 50 --max-tokens 256 --ignore-eos
# Install matplotlib to generate the benchmark plot
pip install matplotlib

# Run Perf Analyzer
python profile.py -m vllm --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos

# Sample output
# [ Benchmark Summary ]
# Prompt size: 100, Average first-token latency: 0.0381 sec, Average token-token latency: 0.0106 sec
# Prompt size: 300, Average first-token latency: 0.0347 sec, Average token-token latency: 0.0109 sec
# Prompt size: 500, Average first-token latency: 0.0336 sec, Average token-token latency: 0.0101 sec
# Saved benchmark result @ 'inflight_batching_benchmark.png'.
```

The resulting plot will look like the following:

<img src="examples/inflight_batching_benchmark.png" width="600">

The plot demonstrates how the average T2T latency changes over the course of
the benchmark as we increase the number of concurrent requests.
To observe the change, we first align the responses of all requests and then
split them into multiple segments of responses.
For instance, assume we ran the following benchmark command:

```bash
python profile.py -m vllm --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos
```

We start with a single request and add requests one at a time, up to a total of
4, after every 32 responses (defined by `--request-period`).
Each request generates a total of 1024 responses (defined by `--max-tokens`).
We align these responses and split them by request period, giving us
1024/32 = 32 segments per request. Because each new request starts one request
period after the previous one, the staggered starts add three more segments,
for 35 segments in total, as shown below:

```
32 responses (=request period)
┌────┐
request 1 ──────┊──────┊──────┊──────┊─ ··· ─┊──────┊
request 2 ┊──────┊──────┊──────┊─ ··· ─┊──────┊──────┊
request 3 ┊ ┊──────┊──────┊─ ··· ─┊──────┊──────┊──────┊
request 4 ┊ ┊ ┊──────┊─ ··· ─┊──────┊──────┊──────┊──────

segment # 1 2 3 4 ··· 32 33 34 35
```

Then, for each segment, we compute the mean of the T2T latencies of its
responses. This lets us visualize how T2T latency changes as new requests
arrive and fill the in-flight batch slots, and again as requests terminate.
See [profile.py](examples/profile.py) for more details.
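
As a simplified illustration of this binning, the sketch below mirrors the
logic in `profile.py` under two assumptions: every request returns the same
number of responses, and `requests` is a list of per-request
`response_timestamps` lists in nanoseconds (Python 3.10+ is required for
`itertools.pairwise`).

```python
from itertools import pairwise

import numpy as np


def periodic_t2t_latencies(requests, request_period, start, step):
    """Average token-to-token latency (sec) for each request-period segment."""
    num_responses = len(requests[0])
    # Segments per request (ceiling division), plus one extra segment for each
    # request launched after the initial <start> concurrent requests.
    num_bins = -(-num_responses // request_period) + (len(requests) - start) // step
    bins = [[] for _ in range(num_bins)]

    start_pos = 0  # segment where the current request's first latency lands
    for request_id, responses in enumerate(requests):
        pos = start_pos
        for response_id, (prev, cur) in enumerate(pairwise(responses)):
            bins[pos].append(cur - prev)  # token-to-token latency (ns)
            if (response_id + 1) % request_period == 0:
                pos += 1  # next segment after every <request_period> responses
        # Each later request starts one request period further to the right.
        if request_id + 1 >= start and (request_id + 1 - start) % step == 0:
            start_pos += 1

    return [np.mean(b) / 1_000_000_000 for b in bins]
```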
