Add more detailed metrics to the LLM benchmarks #431

Merged: 15 commits, Nov 7, 2023
243 changes: 189 additions & 54 deletions src/c++/perf_analyzer/docs/examples/profile.py
@@ -25,29 +25,77 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import argparse
import csv
import json
import subprocess
from dataclasses import dataclass
from dataclasses import asdict, dataclass, fields
from itertools import pairwise
from pathlib import Path
from typing import Optional

import numpy as np

INPUT_FILENAME = "generated_input_data.json"

TITLE = "\n[ BENCHMARK SUMMARY ]\n"
PROMPT_SIZE = " Prompt size: {}"
FIRST_TOKEN_LATENCY = "Average first-token latency: {:.4f} sec"
T2T_LATENCY = "Average total token-to-token latency: {:.4f} sec"
METRIC_FIELDS = {
"max_first_token_latency": ("Max first token latency", "ms"),
"min_first_token_latency": ("Min first token latency", "ms"),
"avg_first_token_latency": ("Avg first token latency", "ms"),
"p50_first_token_latency": ("p50 first token latency", "ms"),
"p90_first_token_latency": ("p90 first token latency", "ms"),
"p95_first_token_latency": ("p95 first token latency", "ms"),
"p99_first_token_latency": ("p99 first token latency", "ms"),
"max_gen_latency": ("Max generation latency", "ms"),
"min_gen_latency": ("Min generation latency", "ms"),
"avg_gen_latency": ("Avg generation latency", "ms"),
"p50_gen_latency": ("p50 generation latency", "ms"),
"p90_gen_latency": ("p90 generation latency", "ms"),
"p95_gen_latency": ("p95 generation latency", "ms"),
"p99_gen_latency": ("p99 generation latency", "ms"),
"avg_token_latency": ("Avg token latency", "ms/token"),
"avg_total_t2t_latency": ("Avg total token-to-token latency", "ms"),
"max_e2e_latency": ("Max end-to-end latency", "ms"),
"min_e2e_latency": ("Min end-to-end latency", "ms"),
"avg_e2e_latency": ("Avg end-to-end latency", "ms"),
"max_token_throughput": ("Max token throughput", "tokens/s"),
"min_token_throughput": ("Min token throughput", "tokens/s"),
"avg_token_throughput": ("Avg token throughput", "tokens/s"),
"p50_token_throughput": ("p50 token throughput", "tokens/s"),
"p90_token_throughput": ("p90 token throughput", "tokens/s"),
"p95_token_throughput": ("p95 token throughput", "tokens/s"),
"p99_token_throughput": ("p99 token throughput", "tokens/s"),
}


@dataclass
class ProfileResults:
prompt_size: int
avg_first_token_latency: int
avg_total_t2t_latency: int
avg_periodic_t2t_latencies: Optional[list[int]] = None
max_first_token_latency: Optional[float] = None
min_first_token_latency: Optional[float] = None
avg_first_token_latency: Optional[float] = None
p50_first_token_latency: Optional[float] = None
p90_first_token_latency: Optional[float] = None
p95_first_token_latency: Optional[float] = None
p99_first_token_latency: Optional[float] = None
max_gen_latency: Optional[float] = None
min_gen_latency: Optional[float] = None
avg_gen_latency: Optional[float] = None
p50_gen_latency: Optional[float] = None
p90_gen_latency: Optional[float] = None
p95_gen_latency: Optional[float] = None
p99_gen_latency: Optional[float] = None
avg_token_latency: Optional[float] = None
avg_total_t2t_latency: Optional[float] = None
avg_periodic_t2t_latencies: Optional[list[float]] = None
max_e2e_latency: Optional[float] = None
min_e2e_latency: Optional[float] = None
avg_e2e_latency: Optional[float] = None
max_token_throughput: Optional[float] = None
min_token_throughput: Optional[float] = None
avg_token_throughput: Optional[float] = None
p50_token_throughput: Optional[float] = None
p90_token_throughput: Optional[float] = None
p95_token_throughput: Optional[float] = None
p99_token_throughput: Optional[float] = None


def load_json_data(filename):
@@ -67,7 +115,8 @@ def get_postfix(args, prompt_size):
- trtllm-prompt100-maxtokens256
- trtllm-prompt100-periodic1_100_1-period32-maxtokens1024
"""
postfix = f"{args.model}-prompt{prompt_size}-"
stream_type = "online" if args.stream else "offline"
postfix = f"{args.model}-{stream_type}-prompt{prompt_size}-"
if args.periodic_concurrency_range:
start, end, step = args.periodic_concurrency_range
postfix += f"periodic{start}_{end}_{step}-period{args.request_period}-"
@@ -87,15 +136,26 @@ def get_plot_filename(args, prompt_size):
return filename


def save_benchmark_results(args, profile_results):
for pr in profile_results:
postfix = get_postfix(args, pr.prompt_size)
results_csv = f"results-{postfix}.csv"
with open(results_csv, "w") as f:
fieldnames = [f.name for f in fields(pr)]
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerow(asdict(pr))
print(f"Saved benchmark results @ '{results_csv}'")


def print_benchmark_summary(profile_results):
output = [TITLE]
print("[ BENCHMARK SUMMARY ]")
for pr in profile_results:
line = [PROMPT_SIZE.format(pr.prompt_size)]
line += [FIRST_TOKEN_LATENCY.format(pr.avg_first_token_latency)]
if pr.avg_total_t2t_latency:
line += [T2T_LATENCY.format(pr.avg_total_t2t_latency)]
output += [", ".join(line) + "\n"]
print("".join(output))
print(f"Prompt size: {pr.prompt_size}")
for metric, (name, unit) in METRIC_FIELDS.items():
if getattr(pr, metric):
print(f" * {name}: {getattr(pr, metric):.4f} {unit}")
print("")


def plot_results(latencies, filename="inflight_batching_benchmark.png"):
@@ -109,7 +169,7 @@ def plot_results(latencies, filename="inflight_batching_benchmark.png"):
# Set pyplot parameters
ax.grid(linestyle="--")
ax.set_xlabel("i-th Request Period", fontsize=12)
ax.set_ylabel("Avg Token-to-Token Latency (sec)", fontsize=12)
ax.set_ylabel("Avg Token-to-Token Latency (ms)", fontsize=12)
ax.set_title("In-Flight Batching Benchmark Summary", fontsize=14)
ax.set_ylim(bottom=0.0)

@@ -175,72 +235,135 @@ def collect_periodic_latencies(args, filename):
return bins


def calculate_avg_periodic_latencies(args, filename):
def calculate_avg_periodic_latencies(args, profile_result, filename):
"""Calculate average token-to-token latency for each request period."""
bins = collect_periodic_latencies(args, filename)

latencies = []
for bin in bins:
latencies.append(np.mean(bin) / 1_000_000_000)
return latencies
latencies.append(np.mean(bin) / 1_000_000)

profile_result.avg_periodic_t2t_latencies = latencies


def collect_latencies(requests):
# Example json demonstrating format:
# see client/src/c++/perf_analyzer/docs/examples/decoupled_output_file.json
first_token_latencies = []
generation_latencies = []
token_to_token_latencies = []
requests = requests["experiments"][0]["requests"]
for r in requests:
init_request, responses = r["timestamp"], r["response_timestamps"]
first_token_latencies.append((responses[0] - init_request) / 1_000_000)
generation_latencies.append((responses[-1] - responses[0]) / 1_000_000)
token_to_token_latencies = []
for prev_res, res in pairwise(responses):
token_to_token_latencies.append((res - prev_res) / 1_000_000)
return first_token_latencies, generation_latencies, token_to_token_latencies


def calculate_online_metrics(args, profile_result, filename):
"""Calculate online metrics for more fine-grained performance information."""
if not args.stream:
return # skip if offline

requests = load_json_data(filename)
latencies = collect_latencies(requests)
first_token_latencies, generation_latencies, token_to_token_latencies = latencies

profile_result.avg_first_token_latency = np.mean(first_token_latencies)
profile_result.avg_total_t2t_latency = np.mean(token_to_token_latencies)

profile_result.max_first_token_latency = max(first_token_latencies)
profile_result.min_first_token_latency = min(first_token_latencies)
profile_result.avg_first_token_latency = np.mean(first_token_latencies)
profile_result.p50_first_token_latency = np.percentile(
first_token_latencies, 50, method="lower"
)
profile_result.p90_first_token_latency = np.percentile(
first_token_latencies, 90, method="lower"
)
profile_result.p95_first_token_latency = np.percentile(
first_token_latencies, 95, method="lower"
)
profile_result.p99_first_token_latency = np.percentile(
first_token_latencies, 99, method="lower"
)

profile_result.max_gen_latency = max(generation_latencies)
profile_result.min_gen_latency = min(generation_latencies)
profile_result.avg_gen_latency = np.mean(generation_latencies)
profile_result.p50_gen_latency = np.percentile(
generation_latencies, 50, method="lower"
)
profile_result.p90_gen_latency = np.percentile(
generation_latencies, 90, method="lower"
)
profile_result.p95_gen_latency = np.percentile(
generation_latencies, 95, method="lower"
)
profile_result.p99_gen_latency = np.percentile(
generation_latencies, 99, method="lower"
)

token_latencies = [t / args.max_tokens for t in generation_latencies]
profile_result.avg_token_latency = np.mean(token_latencies)
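# Illustrative sketch: np.percentile(..., method="lower") (NumPy >= 1.22) picks
# an actual observed sample at or below the requested percentile rather than
# interpolating between samples, so every pXX latency reported above is a value
# that really occurred in the run. With made-up latencies:
#
#     >>> float(np.percentile([10.0, 12.0, 30.0, 31.0], 90, method="lower"))
#     30.0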


def collect_offline_metrics(requests, sequence_len):
end_to_end_latencies = []
throughputs = []
requests = requests["experiments"][0]["requests"]

for request in requests:
first_response, *remaining_responses, _ = request["response_timestamps"]
first_token_latencies.append(first_response - request["timestamp"])
prev_response = first_response
for response in remaining_responses:
token_to_token_latencies.append(response - prev_response)
prev_response = response
return first_token_latencies, token_to_token_latencies
total_time = request["response_timestamps"][-1] - request["timestamp"]
time_s = total_time / 1_000_000_000 # sec
time_ms = total_time / 1_000_000 # msec
end_to_end_latencies.append(time_ms)
throughputs.append(sequence_len / time_s)
return throughputs, end_to_end_latencies


def calculate_avg_latencies(filename):
"""Calculate avg first-token and avg total token-to-token latencies."""
def calculate_offline_metrics(args, profile_result, filename):
"""Calculate offline metrics that show end-to-end performance."""
requests = load_json_data(filename)
first_token_latencies, token_to_token_latencies = collect_latencies(requests)
throughputs, end_to_end_latencies = collect_offline_metrics(
requests=requests, sequence_len=profile_result.prompt_size + args.max_tokens
)

# Compute mean and convert from nanosec to sec
avg_first_token_latency = np.mean(first_token_latencies) / 1_000_000_000
if token_to_token_latencies:
avg_token_to_token_latency = np.mean(token_to_token_latencies) / 1_000_000_000
else:
avg_token_to_token_latency = None
return avg_first_token_latency, avg_token_to_token_latency
profile_result.max_e2e_latency = max(end_to_end_latencies)
profile_result.min_e2e_latency = min(end_to_end_latencies)
profile_result.avg_e2e_latency = np.mean(end_to_end_latencies)
profile_result.max_token_throughput = max(throughputs)
profile_result.min_token_throughput = min(throughputs)
profile_result.avg_token_throughput = np.mean(throughputs)
profile_result.p50_token_throughput = np.percentile(throughputs, 50, method="lower")
profile_result.p90_token_throughput = np.percentile(throughputs, 90, method="lower")
profile_result.p95_token_throughput = np.percentile(throughputs, 95, method="lower")
profile_result.p99_token_throughput = np.percentile(throughputs, 99, method="lower")
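# Illustrative sketch with made-up numbers: for a 100-token prompt and
# max_tokens=256, sequence_len is 356; a request whose final response arrives
# 2.5e9 ns after its timestamp has an end-to-end latency of 2500.0 ms and a
# token throughput of 356 / 2.5 = 142.4 tokens/s, matching the math in
# collect_offline_metrics above:
#
#     >>> total_time = 2_500_000_000  # ns
#     >>> round((100 + 256) / (total_time / 1_000_000_000), 1)
#     142.4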


def summarize_profile_results(args, prompts):
results = []
for prompt in prompts:
prompt_size = len(prompt.split())
export_file = get_export_filename(args, prompt_size)
avg_first_token_latency, avg_total_t2t_latency = calculate_avg_latencies(
filename=export_file
)

profile_result = ProfileResults(
prompt_size=prompt_size,
avg_first_token_latency=avg_first_token_latency,
avg_total_t2t_latency=avg_total_t2t_latency,
)
profile_result = ProfileResults(prompt_size=prompt_size)
calculate_offline_metrics(args, profile_result, export_file)
calculate_online_metrics(args, profile_result, export_file)

if args.periodic_concurrency_range:
periodic_latencies = calculate_avg_periodic_latencies(args, export_file)
profile_result.avg_periodic_t2t_latencies = periodic_latencies
calculate_avg_periodic_latencies(args, profile_result, export_file)
plot_results(
latencies=periodic_latencies,
latencies=profile_result.avg_periodic_t2t_latencies,
filename=get_plot_filename(args, prompt_size),
)

results.append(profile_result)

print_benchmark_summary(results)
save_benchmark_results(args, results)

if args.periodic_concurrency_range:
print(
@@ -307,7 +430,7 @@ def construct_input_data(args):
parameters set by input JSON file.
"""
prompt = ""
stream = True
stream = False
sampling_params = {}

if args.input_data:
@@ -317,13 +440,20 @@
if "SAMPLING_PARAMETERS" in data:
sampling_params = json.loads(data["SAMPLING_PARAMETERS"][0])

# If specified, overwrite max_tokens
# If the command-line option is specified, overwrite
if args.stream:
stream = args.stream
else:
args.stream = stream

if args.max_tokens:
sampling_params["max_tokens"] = args.max_tokens
else:
elif "max_tokens" in sampling_params:
args.max_tokens = sampling_params["max_tokens"]
else:
args.max_tokens = 256 # default
sampling_params["max_tokens"] = args.max_tokens

# If specified, overwrite ignore_eos
if "ignore_eos" not in sampling_params:
sampling_params["ignore_eos"] = args.ignore_eos
elif args.ignore_eos:
@@ -394,5 +524,10 @@ def main(args):
type=str,
help="The input data file to be used for inference request.",
)
parser.add_argument(
"--stream",
action="store_true",
help="Whether to stream the model outputs.",
)
args = parser.parse_args()
main(args)