Support TRTLLM model and use vLLM backend
nv-hwoo committed Nov 29, 2023
1 parent f68f859 commit 8ad47dd
Showing 2 changed files with 98 additions and 35 deletions.
99 changes: 77 additions & 22 deletions src/c++/perf_analyzer/docs/examples/profile.py
@@ -449,13 +449,13 @@ def prepare_export_file(args, prompt):

def prepare_input_data(input_data, prompt):
"""Insert the prompt to send into input JSON data."""
input_data["data"][0]["PROMPT"] = [prompt]
input_data["data"][0]["text_input"] = [prompt]
save_json_data(input_data, INPUT_FILENAME)


def generate_prompts(args, input_data):
"""Generate dummy prompts if not specified by input JSON file."""
prompt = input_data["data"][0]["PROMPT"][0]
prompt = input_data["data"][0]["text_input"][0]

if not prompt: # Generate dummy prompt
assert args.prompt_size_range, "Must specify --prompt-size-range."
@@ -464,28 +464,42 @@ def generate_prompts(args, input_data):
return [prompt]


def construct_input_data(args):
"""Construct input data that contains input tensors and parameters.
def construct_vllm_input_data(args):
"""Construct input data that contains input tensors and parameters for vLLM.
Parse the input JSON file (if exists) to construct the input data.
When user sets parameters through command line, overwrite the
parameters set by input JSON file.
"""
prompt = ""
stream = True
sampling_params = {}
# Default sampling parameters
sampling_params = {
"max_tokens": 256,
"ignore_eos": False,
}

if args.input_data:
data = load_json_data(filename=args.input_data)["data"][0]
stream = data["STREAM"][0] if "STREAM" in data else stream
prompt = data["PROMPT"][0] if "PROMPT" in data else prompt
if "SAMPLING_PARAMETERS" in data:
sampling_params = json.loads(data["SAMPLING_PARAMETERS"][0])
input_data = load_json_data(filename=args.input_data)
if "sampling_parameters" in input_data["data"][0]:
loaded_params = input_data["data"][0]["sampling_parameters"][0]
loaded_params = json.loads(loaded_params or "null")
sampling_params = loaded_params if loaded_params else sampling_params
else:
# Default input JSON
input_data = {
"data": [
{
"text_input": [""],
"stream": [True],
"sampling_parameters": [""],
}
]
}


# If command line option is specified, overwrite
if args.offline:
stream = False
elif not stream:
input_data["data"][0]["stream"] = [False]
elif not input_data["data"][0]["stream"]:
args.offline = True

if args.max_tokens:
@@ -496,20 +510,61 @@ def construct_input_data(args):
args.max_tokens = 256 # default
sampling_params["max_tokens"] = args.max_tokens

if "ignore_eos" not in sampling_params:
if args.ignore_eos:
sampling_params["ignore_eos"] = args.ignore_eos
elif "ignore_eos" in sampling_params:
args.ignore_eos = sampling_params["ignore_eos"]
else:
args.ignore_eos = False # default
sampling_params["ignore_eos"] = args.ignore_eos
elif args.ignore_eos:
sampling_params["ignore_eos"] = True

input_data = {"data": [{}]}
input_data["data"][0]["PROMPT"] = [prompt]
input_data["data"][0]["STREAM"] = [stream]
input_data["data"][0]["SAMPLING_PARAMETERS"] = [json.dumps(sampling_params)]
input_data["data"][0]["sampling_parameters"] = [json.dumps(sampling_params)]
return input_data


def construct_trtllm_input_data(args):
"""Construct input data that contains input tensors and parameters for TRT-LLM.
Parse the input JSON file (if exists) to construct the input data.
When user sets parameters through command line, overwrite the
parameters set by input JSON file.
"""
# Default input JSON
if args.input_data:
input_data = load_json_data(filename=args.input_data)
else:
input_data = {
"data": [
{
"text_input": [""],
"stream": [True],
"max_tokens": [256],
"bad_words": [""],
"stop_words": [""],
}
]
}

# If command line option is specified, overwrite
if args.offline:
input_data["data"][0]["stream"] = [False]
elif not input_data["data"][0]["stream"]:
args.offline = True

if args.max_tokens:
input_data["data"][0]["max_tokens"] = [args.max_tokens]
else:
args.max_tokens = input_data["data"][0]["max_tokens"]

return input_data


def main(args):
input_data = construct_input_data(args)
if args.model == "ensemble":
input_data = construct_trtllm_input_data(args)
elif args.model in "vllm_model":
input_data = construct_vllm_input_data(args)

prompts = generate_prompts(args, input_data)

for prompt in prompts:
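
For reference, here is a minimal sketch (an illustration, not part of the commit) of the default request payloads the two new helper functions build when neither an `--input-data` file nor command line overrides are supplied. The defaults are taken from the diff above; `generate_prompts` later fills `text_input` with a generated dummy prompt before the data is saved for Perf Analyzer.

```python
import json

# Default vLLM payload built by construct_vllm_input_data(): sampling options
# are passed as a single JSON string in "sampling_parameters".
vllm_input_data = {
    "data": [
        {
            "text_input": [""],  # replaced by a generated dummy prompt
            "stream": [True],
            "sampling_parameters": [
                json.dumps({"max_tokens": 256, "ignore_eos": False})
            ],
        }
    ]
}

# Default TRT-LLM payload built by construct_trtllm_input_data(): max_tokens,
# bad_words, and stop_words are separate input tensors.
trtllm_input_data = {
    "data": [
        {
            "text_input": [""],
            "stream": [True],
            "max_tokens": [256],
            "bad_words": [""],
            "stop_words": [""],
        }
    ]
}

print(json.dumps(vllm_input_data, indent=2))
print(json.dumps(trtllm_input_data, indent=2))
```

`main` chooses between the two layouts by model name: `ensemble` (the TRT-LLM ensemble) gets the TRT-LLM layout, while `vllm_model` gets the vLLM layout.
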
34 changes: 21 additions & 13 deletions src/c++/perf_analyzer/docs/llm.md
@@ -33,20 +33,28 @@ The following guide shows the reader how to use Triton
to measure and characterize the performance behaviors of Large Language Models
(LLMs) using Triton with [vLLM](https://github.com/vllm-project/vllm).

### Setup: Download and configure Triton Server environment
### Setup: Download and configure Triton vLLM Backend

From [Step 1 of the Triton vLLM tutorial](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#step-1-build-a-triton-container-image-with-vllm).
Download the pre-built Triton Server Container with vLLM backend from
[NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver)
registry.

```bash
git clone https://github.com/triton-inference-server/tutorials
cd tutorials/Quick_Deploy/vLLM
docker build -t tritonserver_vllm .
# wait for command to finish, might take several minutes
docker pull nvcr.io/nvidia/tritonserver:23.10-vllm-python-py3
```

Upon successful build, run the following command to start the Triton Server container:
Run the Triton Server container with
[vLLM backend](https://github.com/triton-inference-server/vllm_backend) and
launch the server.
```bash
docker run --gpus all -it --rm -p 8001:8001 --shm-size=1G --ulimit memlock=-1 --ulimit stack=67108864 -v ${PWD}:/work -w /work tritonserver_vllm tritonserver --model-store ./model_repository
git clone -b r23.10 https://github.com/triton-inference-server/vllm_backend.git
cd vllm_backend

docker run --gpus all --rm -it --net host \
--shm-size=2G --ulimit memlock=-1 --ulimit stack=67108864 \
-v $(pwd)/samples/model_repository:/model_repository \
nvcr.io/nvidia/tritonserver:23.10-vllm-python-py3 \
tritonserver --model-repository /model_repository
```
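
Before profiling, you can optionally verify from the host that the server came up and that the sample model is ready. Below is a minimal sketch using Python's standard library; it assumes Triton's default HTTP port 8000 (reachable here because the container uses `--net host`) and the `vllm_model` name from `samples/model_repository`.

```python
import json
import urllib.request

BASE = "http://localhost:8000"

# Triton returns HTTP 200 from this endpoint once the server is ready.
with urllib.request.urlopen(f"{BASE}/v2/health/ready") as resp:
    print("Server ready:", resp.status == 200)

# Fetch metadata for the sample vLLM model to confirm it loaded.
with urllib.request.urlopen(f"{BASE}/v2/models/vllm_model") as resp:
    print(json.dumps(json.loads(resp.read()), indent=2))
```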

Next run the following command to start the Triton SDK container:
@@ -69,7 +77,7 @@ Inside the client container, run the following command to generate dummy prompts
of size 100, 300, and 500 and receive a single token from the model for each prompt.

```bash
python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1
python profile.py -m vllm_model --prompt-size-range 100 500 200 --max-tokens 1

# [ BENCHMARK SUMMARY ]
# Prompt size: 100
@@ -105,7 +113,7 @@ python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1
> }
> ' > input_data.json
>
> $ python profile.py -m vllm --input-data input_data.json
> $ python profile.py -m vllm_model --input-data input_data.json
> ```
@@ -122,7 +130,7 @@ of size 100, 300, and 500 and receive a total of 256 tokens from the model for each
prompt.
```bash
python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos
python profile.py -m vllm_model --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos
# [ BENCHMARK SUMMARY ]
# Prompt size: 100
@@ -157,7 +165,7 @@ Run the following command inside the client container.
pip install matplotlib

# Run Perf Analyzer
python profile.py -m vllm --prompt-size-range 10 10 1 --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos
python profile.py -m vllm_model --prompt-size-range 10 10 1 --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos

# [ BENCHMARK SUMMARY ]
# Prompt size: 10
@@ -179,7 +187,7 @@ split them into multiple segments of responses.
For instance, assume we ran the following benchmark command:

```bash
python profile.py -m vllm --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos
python profile.py -m vllm_model --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos
```

We start from a single request and increment up to 4 requests one by one for
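
The folded text above goes on to explain the resulting request schedule. To make it concrete, here is a rough sketch (an illustration, not part of the commit) of when requests are launched under `--periodic-concurrency-range 1 4 1 --request-period 32`, assuming one new request is added after every 32 responses until 4 requests are in flight:

```python
def launch_schedule(start: int, end: int, step: int, request_period: int):
    """Model the periodic concurrency mode: begin with `start` concurrent
    requests and add `step` more after every `request_period` responses
    until `end` requests are in flight."""
    schedule = [(0, start)]  # (responses observed so far, requests in flight)
    in_flight = start
    responses = 0
    while in_flight < end:
        responses += request_period
        in_flight = min(in_flight + step, end)
        schedule.append((responses, in_flight))
    return schedule


# --periodic-concurrency-range 1 4 1 --request-period 32
for responses, in_flight in launch_schedule(1, 4, 1, 32):
    print(f"after {responses:3d} responses -> {in_flight} concurrent request(s)")
```

Because each request asks for up to 1024 tokens with `--ignore-eos`, every request produces enough responses to cover the 32-response request period.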

1 comment on commit 8ad47dd

@matthewkotila (Contributor)


Does this commit essentially include the work of #412?
