From 8ad47dd29662d45b51ac40dbd2bac7382af1d186 Mon Sep 17 00:00:00 2001 From: Hyunjae Woo Date: Wed, 29 Nov 2023 15:34:48 -0800 Subject: [PATCH 1/9] Support TRTLLM model and use vLLM backend --- .../perf_analyzer/docs/examples/profile.py | 99 ++++++++++++++----- src/c++/perf_analyzer/docs/llm.md | 34 ++++--- 2 files changed, 98 insertions(+), 35 deletions(-) diff --git a/src/c++/perf_analyzer/docs/examples/profile.py b/src/c++/perf_analyzer/docs/examples/profile.py index 534dcec95..ff7dee27f 100644 --- a/src/c++/perf_analyzer/docs/examples/profile.py +++ b/src/c++/perf_analyzer/docs/examples/profile.py @@ -449,13 +449,13 @@ def prepare_export_file(args, prompt): def prepare_input_data(input_data, prompt): """Insert the prompt to send into input JSON data.""" - input_data["data"][0]["PROMPT"] = [prompt] + input_data["data"][0]["text_input"] = [prompt] save_json_data(input_data, INPUT_FILENAME) def generate_prompts(args, input_data): """Generate dummy prompts if not specified by input JSON file.""" - prompt = input_data["data"][0]["PROMPT"][0] + prompt = input_data["data"][0]["text_input"][0] if not prompt: # Generate dummy prompt assert args.prompt_size_range, "Must specify --prompt-size-range." @@ -464,28 +464,42 @@ def generate_prompts(args, input_data): return [prompt] -def construct_input_data(args): - """Construct input data that contains input tensors and parameters. +def construct_vllm_input_data(args): + """Construct input data that contains input tensors and parameters for vLLM. Parse the input JSON file (if exists) to construct the input data. When user sets parameters through command line, overwrite the parameters set by input JSON file. """ - prompt = "" - stream = True - sampling_params = {} + # Default sampling parameters + sampling_params = { + "max_tokens": 256, + "ignore_eos": False, + } if args.input_data: - data = load_json_data(filename=args.input_data)["data"][0] - stream = data["STREAM"][0] if "STREAM" in data else stream - prompt = data["PROMPT"][0] if "PROMPT" in data else prompt - if "SAMPLING_PARAMETERS" in data: - sampling_params = json.loads(data["SAMPLING_PARAMETERS"][0]) + input_data = load_json_data(filename=args.input_data) + if "sampling_parameters" in input_data["data"][0]: + loaded_params = input_data["data"][0]["sampling_parameters"][0] + loaded_params = json.loads(loaded_params or "null") + sampling_params = loaded_params if loaded_params else sampling_params + else: + # Default input JSON + input_data = { + "data": [ + { + "text_input": [""], + "stream": [True], + "sampling_parameters": [""], + } + ] + } + # If command line option is specified, overwrite if args.offline: - stream = False - elif not stream: + input_data["data"][0]["stream"] = [False] + elif not input_data["data"][0]["stream"]: args.offline = True if args.max_tokens: @@ -496,20 +510,61 @@ def construct_input_data(args): args.max_tokens = 256 # default sampling_params["max_tokens"] = args.max_tokens - if "ignore_eos" not in sampling_params: + if args.ignore_eos: + sampling_params["ignore_eos"] = args.ignore_eos + elif "ignore_eos" in sampling_params: + args.ignore_eos = sampling_params["ignore_eos"] + else: + args.ignore_eos = False # default sampling_params["ignore_eos"] = args.ignore_eos - elif args.ignore_eos: - sampling_params["ignore_eos"] = True - input_data = {"data": [{}]} - input_data["data"][0]["PROMPT"] = [prompt] - input_data["data"][0]["STREAM"] = [stream] - input_data["data"][0]["SAMPLING_PARAMETERS"] = [json.dumps(sampling_params)] + 
input_data["data"][0]["sampling_parameters"] = [json.dumps(sampling_params)] + return input_data + + +def construct_trtllm_input_data(args): + """Construct input data that contains input tensors and parameters for TRT-LLM. + + Parse the input JSON file (if exists) to construct the input data. + When user sets parameters through command line, overwrite the + parameters set by input JSON file. + """ + # Default input JSON + if args.input_data: + input_data = load_json_data(filename=args.input_data) + else: + input_data = { + "data": [ + { + "text_input": [""], + "stream": [True], + "max_tokens": [256], + "bad_words": [""], + "stop_words": [""], + } + ] + } + + # If command line option is specified, overwrite + if args.offline: + input_data["data"][0]["stream"] = [False] + elif not input_data["data"][0]["stream"]: + args.offline = True + + if args.max_tokens: + input_data["data"][0]["max_tokens"] = [args.max_tokens] + else: + args.max_tokens = input_data["data"][0]["max_tokens"] + return input_data def main(args): - input_data = construct_input_data(args) + if args.model == "ensemble": + input_data = construct_trtllm_input_data(args) + elif args.model in "vllm_model": + input_data = construct_vllm_input_data(args) + prompts = generate_prompts(args, input_data) for prompt in prompts: diff --git a/src/c++/perf_analyzer/docs/llm.md b/src/c++/perf_analyzer/docs/llm.md index 1de686c1b..82d365a44 100644 --- a/src/c++/perf_analyzer/docs/llm.md +++ b/src/c++/perf_analyzer/docs/llm.md @@ -33,20 +33,28 @@ The following guide shows the reader how to use Triton to measure and characterize the performance behaviors of Large Language Models (LLMs) using Triton with [vLLM](https://github.com/vllm-project/vllm). -### Setup: Download and configure Triton Server environment +### Setup: Download and configure Triton vLLM Backend -From [Step 1 of the Triton vLLM tutorial](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#step-1-build-a-triton-container-image-with-vllm). +Download the pre-built Triton Server Container with vLLM backend from +[NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver) +registry. ```bash -git clone https://github.com/triton-inference-server/tutorials -cd tutorials/Quick_Deploy/vLLM -docker build -t tritonserver_vllm . -# wait for command to finish, might take several minutes +docker pull nvcr.io/nvidia/tritonserver:23.10-vllm-python-py3 ``` -Upon successful build, run the following command to start the Triton Server container: +Run the Triton Server container with +[vLLM backend](https://github.com/triton-inference-server/vllm_backend) and +launch the server. ```bash -docker run --gpus all -it --rm -p 8001:8001 --shm-size=1G --ulimit memlock=-1 --ulimit stack=67108864 -v ${PWD}:/work -w /work tritonserver_vllm tritonserver --model-store ./model_repository +git clone -b r23.10 https://github.com/triton-inference-server/vllm_backend.git +cd vllm_backend + +docker run --gpus all --rm -it --net host \ + --shm-size=2G --ulimit memlock=-1 --ulimit stack=67108864 \ + -v $(pwd)/samples/model_repository:/model_repository \ + nvcr.io/nvidia/tritonserver:23.10-vllm-python-py3 \ + tritonserver --model-repository /model_repository ``` Next run the following command to start the Triton SDK container: @@ -69,7 +77,7 @@ Inside the client container, run the following command to generate dummy prompts of size 100, 300, and 500 and receive single token from the model for each prompt. 
```bash -python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1 +python profile.py -m vllm_model --prompt-size-range 100 500 200 --max-tokens 1 # [ BENCHMARK SUMMARY ] # Prompt size: 100 @@ -105,7 +113,7 @@ python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1 > } > ' > input_data.json > -> $ python profile.py -m vllm --input-data input_data.json +> $ python profile.py -m vllm_model --input-data input_data.json > ``` @@ -122,7 +130,7 @@ of size 100, 300, and 500 and receive total 256 tokens from the model for each prompts. ```bash -python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos +python profile.py -m vllm_model --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos # [ BENCHMARK SUMMARY ] # Prompt size: 100 @@ -157,7 +165,7 @@ Run the following command inside the client container. pip install matplotlib # Run Perf Analyzer -python profile.py -m vllm --prompt-size-range 10 10 1 --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos +python profile.py -m vllm_model --prompt-size-range 10 10 1 --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos # [ BENCHMARK SUMMARY ] # Prompt size: 10 @@ -179,7 +187,7 @@ split them into multiple segments of responses. For instance, assume we ran the following benchmark command: ```bash -python profile.py -m vllm --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos +python profile.py -m vllm_model --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos ``` We start from a single request and increment up to 4 requests one by one for From 2b1e17bae9691b01840f1e359e3e510dd45c4012 Mon Sep 17 00:00:00 2001 From: Hyunjae Woo Date: Wed, 29 Nov 2023 15:45:20 -0800 Subject: [PATCH 2/9] Align spaces --- src/c++/perf_analyzer/docs/llm.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/c++/perf_analyzer/docs/llm.md b/src/c++/perf_analyzer/docs/llm.md index 82d365a44..edd3ff08d 100644 --- a/src/c++/perf_analyzer/docs/llm.md +++ b/src/c++/perf_analyzer/docs/llm.md @@ -51,9 +51,9 @@ git clone -b r23.10 https://github.com/triton-inference-server/vllm_backend.git cd vllm_backend docker run --gpus all --rm -it --net host \ - --shm-size=2G --ulimit memlock=-1 --ulimit stack=67108864 \ - -v $(pwd)/samples/model_repository:/model_repository \ - nvcr.io/nvidia/tritonserver:23.10-vllm-python-py3 \ + --shm-size=2G --ulimit memlock=-1 --ulimit stack=67108864 \ + -v $(pwd)/samples/model_repository:/model_repository \ + nvcr.io/nvidia/tritonserver:23.10-vllm-python-py3 \ tritonserver --model-repository /model_repository ``` From 514e9e12a97754c41b9145f35d190ea9ab00c4a6 Mon Sep 17 00:00:00 2001 From: Hyunjae Woo Date: Wed, 29 Nov 2023 15:46:09 -0800 Subject: [PATCH 3/9] Move comment --- src/c++/perf_analyzer/docs/examples/profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c++/perf_analyzer/docs/examples/profile.py b/src/c++/perf_analyzer/docs/examples/profile.py index ff7dee27f..d882b61e1 100644 --- a/src/c++/perf_analyzer/docs/examples/profile.py +++ b/src/c++/perf_analyzer/docs/examples/profile.py @@ -529,10 +529,10 @@ def construct_trtllm_input_data(args): When user sets parameters through command line, overwrite the parameters set by input JSON file. 
""" - # Default input JSON if args.input_data: input_data = load_json_data(filename=args.input_data) else: + # Default input JSON input_data = { "data": [ { From 0067a9de34dc83fcc22b55d33eb5506f9a762c48 Mon Sep 17 00:00:00 2001 From: Hyunjae Woo Date: Wed, 29 Nov 2023 16:58:46 -0800 Subject: [PATCH 4/9] Specify shape of input tensors --- src/c++/perf_analyzer/docs/examples/profile.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/c++/perf_analyzer/docs/examples/profile.py b/src/c++/perf_analyzer/docs/examples/profile.py index d882b61e1..673b26cc6 100644 --- a/src/c++/perf_analyzer/docs/examples/profile.py +++ b/src/c++/perf_analyzer/docs/examples/profile.py @@ -420,6 +420,13 @@ def profile(args, export_file): f"--input-data={INPUT_FILENAME} " f"--profile-export-file={export_file} " ) + if args.model == "ensemble": # TRT-LLM + command += ( + "--shape=text_input:1 " + "--shape=max_tokens:1 " + "--shape=bad_words:1 " + "--shape=stop_words:1 " + ) if args.periodic_concurrency_range: start, end, step = args.periodic_concurrency_range command += ( @@ -554,7 +561,7 @@ def construct_trtllm_input_data(args): if args.max_tokens: input_data["data"][0]["max_tokens"] = [args.max_tokens] else: - args.max_tokens = input_data["data"][0]["max_tokens"] + args.max_tokens = input_data["data"][0]["max_tokens"][0] return input_data From 6869902309829a78acb5b2f02a335f1900410f01 Mon Sep 17 00:00:00 2001 From: Hyunjae Woo Date: Wed, 29 Nov 2023 17:02:19 -0800 Subject: [PATCH 5/9] Fix pre-commit hooks --- src/c++/perf_analyzer/docs/examples/profile.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/c++/perf_analyzer/docs/examples/profile.py b/src/c++/perf_analyzer/docs/examples/profile.py index 673b26cc6..ef6b613d8 100644 --- a/src/c++/perf_analyzer/docs/examples/profile.py +++ b/src/c++/perf_analyzer/docs/examples/profile.py @@ -421,12 +421,12 @@ def profile(args, export_file): f"--profile-export-file={export_file} " ) if args.model == "ensemble": # TRT-LLM - command += ( - "--shape=text_input:1 " - "--shape=max_tokens:1 " - "--shape=bad_words:1 " - "--shape=stop_words:1 " - ) + command += ( + "--shape=text_input:1 " + "--shape=max_tokens:1 " + "--shape=bad_words:1 " + "--shape=stop_words:1 " + ) if args.periodic_concurrency_range: start, end, step = args.periodic_concurrency_range command += ( @@ -501,7 +501,6 @@ def construct_vllm_input_data(args): } ] } - # If command line option is specified, overwrite if args.offline: From 07b1d74daaac1dfddcd233edec496a98a8df3023 Mon Sep 17 00:00:00 2001 From: Hyunjae Woo Date: Thu, 30 Nov 2023 21:25:37 -0800 Subject: [PATCH 6/9] Fix metric error when there is only single response --- src/c++/perf_analyzer/docs/examples/profile.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/c++/perf_analyzer/docs/examples/profile.py b/src/c++/perf_analyzer/docs/examples/profile.py index ef6b613d8..8f25f943d 100644 --- a/src/c++/perf_analyzer/docs/examples/profile.py +++ b/src/c++/perf_analyzer/docs/examples/profile.py @@ -265,11 +265,12 @@ def collect_online_metrics(export_data, output_tokens): for r in requests: init_request, responses = r["timestamp"], r["response_timestamps"] first_token_latency = (responses[0] - init_request) / 1_000_000 - generation_latency_ms = (responses[-1] - responses[0]) / 1_000_000 # msec - generation_latency_s = (responses[-1] - responses[0]) / 1_000_000_000 # sec first_token_latencies.append(first_token_latency) - 
generation_latencies.append(generation_latency_ms) - generation_throughputs.append(output_tokens / generation_latency_s) + if args.max_tokens > 1: + generation_latency_ms = (responses[-1] - responses[0]) / 1_000_000 # msec + generation_latency_s = (responses[-1] - responses[0]) / 1_000_000_000 # sec + generation_latencies.append(generation_latency_ms) + generation_throughputs.append(output_tokens / generation_latency_s) for prev_res, res in pairwise(responses): token_to_token_latencies.append((res - prev_res) / 1_000_000) return ( @@ -290,8 +291,6 @@ def calculate_online_metrics(args, profile_result, export_data): generation_throughputs, ) = latencies - profile_result.avg_total_t2t_latency = np.mean(token_to_token_latencies) - profile_result.max_first_token_latency = max(first_token_latencies) profile_result.min_first_token_latency = min(first_token_latencies) profile_result.avg_first_token_latency = np.mean(first_token_latencies) @@ -309,6 +308,8 @@ def calculate_online_metrics(args, profile_result, export_data): ) if args.max_tokens > 1: + profile_result.avg_total_t2t_latency = np.mean(token_to_token_latencies) + profile_result.max_gen_latency = max(generation_latencies) profile_result.min_gen_latency = min(generation_latencies) profile_result.avg_gen_latency = np.mean(generation_latencies) From a5b5ae493d9e937b7f0c9d9be4ff8e53b6b89b5f Mon Sep 17 00:00:00 2001 From: Hyunjae Woo Date: Fri, 1 Dec 2023 09:17:02 -0800 Subject: [PATCH 7/9] Specify backend type for distinguishing input data --- .../perf_analyzer/docs/examples/profile.py | 19 +++++++++++++------ src/c++/perf_analyzer/docs/llm.md | 6 +++--- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/c++/perf_analyzer/docs/examples/profile.py b/src/c++/perf_analyzer/docs/examples/profile.py index 8f25f943d..975354936 100644 --- a/src/c++/perf_analyzer/docs/examples/profile.py +++ b/src/c++/perf_analyzer/docs/examples/profile.py @@ -133,11 +133,11 @@ def get_postfix(args, prompt_size): """Generate postfix for profile export filename and plot. e.g. 
- - trtllm-prompt100-maxtokens256 - - trtllm-prompt100-periodic1_100_1-period32-maxtokens1024 + - trtllm-ensemble-prompt100-maxtokens256 + - trtllm-ensemble-prompt100-periodic1_100_1-period32-maxtokens1024 """ stream_type = "offline" if args.offline else "online" - postfix = f"{args.model}-{stream_type}-prompt{prompt_size}-" + postfix = f"{args.backend}-{args.model}-{stream_type}-prompt{prompt_size}-" if args.periodic_concurrency_range: start, end, step = args.periodic_concurrency_range postfix += f"periodic{start}_{end}_{step}-period{args.request_period}-" @@ -421,7 +421,7 @@ def profile(args, export_file): f"--input-data={INPUT_FILENAME} " f"--profile-export-file={export_file} " ) - if args.model == "ensemble": # TRT-LLM + if args.backend == "trtllm": command += ( "--shape=text_input:1 " "--shape=max_tokens:1 " @@ -567,9 +567,9 @@ def construct_trtllm_input_data(args): def main(args): - if args.model == "ensemble": + if args.backend == "trtllm": input_data = construct_trtllm_input_data(args) - elif args.model in "vllm_model": + elif args.backend in "vllm": input_data = construct_vllm_input_data(args) prompts = generate_prompts(args, input_data) @@ -593,6 +593,13 @@ def main(args): default="vllm", help="The name of the model to profile.", ) + parser.add_argument( + "-b", + "--backend", + type=str, + default="vllm", + help="The name of the backend.", + ) parser.add_argument( "--prompt-size-range", type=int, diff --git a/src/c++/perf_analyzer/docs/llm.md b/src/c++/perf_analyzer/docs/llm.md index edd3ff08d..331e7db55 100644 --- a/src/c++/perf_analyzer/docs/llm.md +++ b/src/c++/perf_analyzer/docs/llm.md @@ -99,13 +99,13 @@ python profile.py -m vllm_model --prompt-size-range 100 500 200 --max-tokens 1 > { > "data": [ > { -> "PROMPT": [ +> "text_input": [ > "Hello, my name is" // user-provided prompt > ], -> "STREAM": [ +> "stream": [ > true > ], -> "SAMPLING_PARAMETERS": [ +> "sampling_parameters": [ > "{ \"max_tokens\": 1 }" > ] > } From 720579393dcfab93ce6a6ea85b52435893a17902 Mon Sep 17 00:00:00 2001 From: Hyunjae Woo Date: Fri, 1 Dec 2023 09:30:57 -0800 Subject: [PATCH 8/9] Raise error when unknown backend specified. --- src/c++/perf_analyzer/docs/examples/profile.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/c++/perf_analyzer/docs/examples/profile.py b/src/c++/perf_analyzer/docs/examples/profile.py index 975354936..9793adb3e 100644 --- a/src/c++/perf_analyzer/docs/examples/profile.py +++ b/src/c++/perf_analyzer/docs/examples/profile.py @@ -571,6 +571,11 @@ def main(args): input_data = construct_trtllm_input_data(args) elif args.backend in "vllm": input_data = construct_vllm_input_data(args) + else: + raise ValueError( + "Unknown backend specified. Supported backend types are: 'trtllm' " + "and 'vllm'." 
+ ) prompts = generate_prompts(args, input_data) From dc5114a6abae69fcf6bb1de34fd76b2511a1ba50 Mon Sep 17 00:00:00 2001 From: Hyunjae Woo Date: Fri, 1 Dec 2023 10:04:55 -0800 Subject: [PATCH 9/9] Change to direct string comparison --- src/c++/perf_analyzer/docs/examples/profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c++/perf_analyzer/docs/examples/profile.py b/src/c++/perf_analyzer/docs/examples/profile.py index 9793adb3e..7b79c9848 100644 --- a/src/c++/perf_analyzer/docs/examples/profile.py +++ b/src/c++/perf_analyzer/docs/examples/profile.py @@ -569,7 +569,7 @@ def construct_trtllm_input_data(args): def main(args): if args.backend == "trtllm": input_data = construct_trtllm_input_data(args) - elif args.backend in "vllm": + elif args.backend == "vllm": input_data = construct_vllm_input_data(args) else: raise ValueError(