Improve metric and generate plot for in-flight batch size benchmark #422

Merged 17 commits on Oct 26, 2023
144 changes: 124 additions & 20 deletions src/c++/perf_analyzer/docs/examples/profile.py
@@ -27,8 +27,10 @@
import argparse
import json
import subprocess
from itertools import pairwise
from pathlib import Path
from statistics import mean

import numpy as np

TEMP_INPUT_FILE = "temp_input_data.json"

@@ -38,6 +40,111 @@ def load_profile_data():
return json.load(f)


def print_benchmark_summary(results):
output = "\n[ Benchmark Summary ]"
for prompt_size, avg_first_token_latency, avg_token_to_token_latency in results:
output += (
f"\n Prompt size: {prompt_size}, "
f"Average first-token latency: {avg_first_token_latency:.4f} sec"
)
output += (
f", Average token-to-token latency: {avg_token_to_token_latency:.4f} sec"
if avg_token_to_token_latency
else ""
)
print(output)


def plot_results(latencies):
"""Plot in-flight batching LLM bencharmark results."""
import matplotlib.pyplot as plt # Lazy import

periods = np.arange(1, len(latencies) + 1)
fig, ax = plt.subplots()
ax.plot(periods, latencies)

# Set pyplot parameters
ax.grid(linestyle="--")
ax.set_xlabel("i-th Request Period", fontsize=12)
ax.set_ylabel("Avg Token-to-Token Latency (sec)", fontsize=12)
ax.set_title("In-Flight Batching Benchmark Summary", fontsize=14)
ax.set_ylim(bottom=0.0)

fig.savefig("inflight_batching_benchmark.png", dpi=300)
print("Saved benchmark result @ 'inflight_batching_benchmark.png'.")


def add_latencies_to_bins(bins, pos, responses, request_period):
"""Add token-to-token latencies into the corresponding bin.

Given the responses of a single request, calculate token-to-token
latency and add it into bin. Update the bin position to the next
for every request period.
"""
for response_id, (prev_res, res) in enumerate(pairwise(responses)):
bins[pos].append(res - prev_res)
if (response_id + 1) % request_period == 0:
pos += 1


def update_start_position(request_id, start_pos, initial_requests, step):
"""Shift the start position of the bin.

Once we iterate through the entire <start> requests, we shift
the start position. Then, we shift the start position for every
<step> requests.
"""
if (request_id + 1) >= initial_requests:
num_requests_after_start = request_id + 1 - initial_requests
if num_requests_after_start % step == 0:
start_pos += 1
return start_pos


def collect_periodic_latencies(args):
"""Split the entire benchmark results into segments with size
of request period and collect latencies for each segment.
"""
start, end, step = args.periodic_concurrency_range

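# One bin per full request period in a single request, plus one extra bin
# for each additional request launched after the initial <start> requests.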
num_bins = args.max_tokens // args.request_period + (end - start) // step
if args.max_tokens % args.request_period != 0:
num_bins += 1 # extra bin

bins = [[] for _ in range(num_bins)]
bin_start_position = 0

data = load_profile_data()
requests = data["experiments"][0]["requests"]

for i, r in enumerate(requests):
add_latencies_to_bins(
bins=bins,
pos=bin_start_position,
responses=r["response_timestamps"],
request_period=args.request_period,
)
bin_start_position = update_start_position(
request_id=i,
start_pos=bin_start_position,
initial_requests=start,
step=step,
)
return bins


def calculate_avg_periodic_latencies(args):
"""Calculate average token-to-token latency for each
request period.
"""
bins = collect_periodic_latencies(args)

latencies = []
for bin in bins:
latencies.append(np.mean(bin) / 1_000_000_000)
return latencies


def collect_latencies(requests):
# Example json demonstrating format:
# see client/src/c++/perf_analyzer/docs/examples/decoupled_output_file.json
@@ -59,9 +166,9 @@ def calculate_avg_latencies():
first_token_latencies, token_to_token_latencies = collect_latencies(requests)

# Compute mean and convert from nanosec to sec
avg_first_token_latency = mean(first_token_latencies) / 1_000_000_000
avg_first_token_latency = np.mean(first_token_latencies) / 1_000_000_000
if token_to_token_latencies:
avg_token_to_token_latency = mean(token_to_token_latencies) / 1_000_000_000
avg_token_to_token_latency = np.mean(token_to_token_latencies) / 1_000_000_000
else:
avg_token_to_token_latency = None
return avg_first_token_latency, avg_token_to_token_latency
@@ -115,7 +222,6 @@ def generate_input_data(args, prompt_size, filename):
"--model",
type=str,
default="vllm",
choices=["vllm"],
help="The name of the model to profile.",
)
parser.add_argument(
@@ -172,20 +278,18 @@ def generate_input_data(args, prompt_size, filename):
generate_input_data(args, prompt_size, TEMP_INPUT_FILE)

profile(args, args.input_data if args.input_data else TEMP_INPUT_FILE)
avg_first_token_latency, avg_token_to_token_latency = calculate_avg_latencies()
results.append(
(prompt_size, avg_first_token_latency, avg_token_to_token_latency)
)

print("\n[ Benchmark Summary ]")
for prompt_size, avg_first_token_latency, avg_token_to_token_latency in results:
line = (
f" Prompt size: {prompt_size}, "
f"Average first-token latency: {avg_first_token_latency:.4f} sec"
)
line += (
f", Average token-token latency: {avg_token_to_token_latency:.4f} sec"
if avg_token_to_token_latency
else ""
)
print(line)
if not args.periodic_concurrency_range:
(
avg_first_token_latency,
avg_token_to_token_latency,
) = calculate_avg_latencies()
results.append(
(prompt_size, avg_first_token_latency, avg_token_to_token_latency)
)

if args.periodic_concurrency_range:
avg_latencies = calculate_avg_periodic_latencies(args)
plot_results(avg_latencies)
else:
print_benchmark_summary(results)
91 changes: 67 additions & 24 deletions src/c++/perf_analyzer/docs/llm.md
@@ -115,47 +115,90 @@ input sizes and request the model to compute a fixed amount of tokens.
#### Example

Inside the client container, run the following command to generate dummy prompts
of size 100, 300, and 500 and receive a total of 256 tokens from the model for
each prompt.

```bash
python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos

# Sample output
# [ Benchmark Summary ]
# Prompt size: 100, Average first-token latency: 0.0388 sec, Average token-token latency: 0.0066 sec
# Prompt size: 300, Average first-token latency: 0.0431 sec, Average token-token latency: 0.0071 sec
# Prompt size: 500, Average first-token latency: 0.0400 sec, Average token-token latency: 0.0070 sec
# Prompt size: 100, Average first-token latency: 0.0388 sec, Average token-to-token latency: 0.0066 sec
# Prompt size: 300, Average first-token latency: 0.0431 sec, Average token-to-token latency: 0.0071 sec
# Prompt size: 500, Average first-token latency: 0.0400 sec, Average token-to-token latency: 0.0070 sec
```
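
Both metrics are computed from the per-request response timestamps that Perf
Analyzer records. The sketch below shows one way to derive them, following the
same approach as `profile.py`; the export file name and the request-start field
name (`timestamp`) are assumptions for illustration, while
`experiments[0].requests` and `response_timestamps` (in nanoseconds) match the
fields `profile.py` reads.

```python
import json

import numpy as np


def summarize_latencies(profile_export="profile_export.json"):
    """Rough latency summary from a Perf Analyzer profile export (sketch)."""
    with open(profile_export) as f:
        requests = json.load(f)["experiments"][0]["requests"]

    first_token, token_to_token = [], []
    for r in requests:
        responses = r["response_timestamps"]  # nanoseconds
        # Time from sending the request until the first response arrives.
        # NOTE: "timestamp" (request send time) is an assumed field name.
        first_token.append(responses[0] - r["timestamp"])
        # Gaps between consecutive responses of the same request.
        token_to_token += [b - a for a, b in zip(responses, responses[1:])]

    # Convert nanoseconds to seconds before averaging.
    return np.mean(first_token) / 1e9, np.mean(token_to_token) / 1e9
```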

## Benchmark 3: Profiling Continuous Batch Size
## Benchmark 3: Profiling In-Flight Batching

> **Note**
>
> This benchmark relies on the feature that will be available from `23.10`
> release which is on its way soon. You can either wait until the `23.10`
> container is ready or build Perf Analyzer from the latest `main` branch
> (see [build from source instructions](install.md#build-from-source)).

In this benchmarking scenario, we want to measure the effect of continuous
batch size on token-to-token latency. We systematically issue requests to the
server of fixed input sizes and request the model to compute a fixed amount of
tokens in order to increase the continuous batching size over time.
In this benchmarking scenario, we want to measure the effect of in-flight
batch size on token-to-token (T2T) latency. We systematically issue requests of
fixed input size to the server and ask the model to generate a fixed number of
tokens, so that the in-flight batch size increases over time.

#### Example

In this benchmark, we are interested in how continuous batch size affects token-to-token latency
by increasing the number of concurrent requests to the model.
Perf Analyzer will run in [periodic concurrency mode](https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/perf_analyzer/docs/inference_load_modes.md#periodic-concurrency-mode)
that periodically launches a new concurrent request to the model using `--periodic-concurrency-range START END STEP` option.
In this example, Perf Analyzer starts with a single request and launches the new ones until the total number reaches 30.
You can also specify the timing of the new requests: For example, setting the `--request-period` to 50 will make
Perf Analyzer to wait for all the requests to receive 50 responses before it launches the new requests.
In this benchmark, we will run Perf Analyzer in
[periodic concurrency mode](inference_load_modes.md#periodic-concurrency-mode),
which periodically launches new concurrent requests to the model using the
`--periodic-concurrency-range START END STEP` option.
In this example, Perf Analyzer starts with a single request and keeps launching
new ones until the total number reaches 100.
You can also control the timing of the new requests:
setting `--request-period` to 32 (as shown below) makes Perf Analyzer wait
until all outstanding requests have received 32 responses before launching new ones.
Run the following command inside the client container.

```bash
python profile.py -m vllm --prompt-size-range 100 500 200 --periodic-concurrency-range 1 30 1 --request-period 50 --max-tokens 256 --ignore-eos
# Install matplotlib to generate the benchmark plot
pip install matplotlib

# Run Perf Analyzer
python profile.py -m vllm --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos

# Sample output
# [ Benchmark Summary ]
# Prompt size: 100, Average first-token latency: 0.0381 sec, Average token-token latency: 0.0106 sec
# Prompt size: 300, Average first-token latency: 0.0347 sec, Average token-token latency: 0.0109 sec
# Prompt size: 500, Average first-token latency: 0.0336 sec, Average token-token latency: 0.0101 sec
# Saved benchmark result @ 'inflight_batching_benchmark.png'.
```

The resulting plot will look like the following:

<img src="examples/inflight_batching_benchmark.png" width="600">

The plot demonstrates how the average T2T latency changes over the course of
the benchmark as we increase the number of concurrent requests.
To observe the change, we first align the responses of all requests and then
split them into multiple segments of responses.
For instance, assume we ran the following benchmark command:

```bash
python profile.py -m vllm --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos
```

We start with a single request and add requests one at a time, up to a total of
4, after every 32 responses (defined by `--request-period`).
Each request generates a total of 1024 responses (defined by `--max-tokens`).
We align these responses and split them by request period, giving us
1024/32 = 32 segments per request. Because each new request starts one request
period after the previous one, the staggered starts add three more segments,
for 35 segments in total, as shown below:

```
32 responses (=request period)
┌────┐
request 1 ──────┊──────┊──────┊──────┊─ ··· ─┊──────┊
request 2 ┊──────┊──────┊──────┊─ ··· ─┊──────┊──────┊
request 3 ┊ ┊──────┊──────┊─ ··· ─┊──────┊──────┊──────┊
request 4 ┊ ┊ ┊──────┊─ ··· ─┊──────┊──────┊──────┊──────

segment # 1 2 3 4 ··· 32 33 34 35
```

Then, for each segment, we compute the mean of the T2T latencies of its
responses. This lets us visualize how T2T latency changes as new requests
arrive and fill the in-flight batch slots, and again as requests terminate.
See [profile.py](examples/profile.py) for more details.
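
As a simplified illustration of this binning, the sketch below mirrors the
logic in `profile.py` under two assumptions: every request returns the same
number of responses, and `requests` is a list of per-request
`response_timestamps` lists in nanoseconds (Python 3.10+ is required for
`itertools.pairwise`).

```python
from itertools import pairwise

import numpy as np


def periodic_t2t_latencies(requests, request_period, start, step):
    """Average token-to-token latency (sec) for each request-period segment."""
    num_responses = len(requests[0])
    # Segments per request (ceiling division), plus one extra segment for each
    # request launched after the initial <start> concurrent requests.
    num_bins = -(-num_responses // request_period) + (len(requests) - start) // step
    bins = [[] for _ in range(num_bins)]

    start_pos = 0  # segment where the current request's first latency lands
    for request_id, responses in enumerate(requests):
        pos = start_pos
        for response_id, (prev, cur) in enumerate(pairwise(responses)):
            bins[pos].append(cur - prev)  # token-to-token latency (ns)
            if (response_id + 1) % request_period == 0:
                pos += 1  # next segment after every <request_period> responses
        # Each later request starts one request period further to the right.
        if request_id + 1 >= start and (request_id + 1 - start) % step == 0:
            start_pos += 1

    return [np.mean(b) / 1_000_000_000 for b in bins]
```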
