diff --git a/src/c++/perf_analyzer/docs/examples/profile.py b/src/c++/perf_analyzer/docs/examples/profile.py
index 534dcec95..ff7dee27f 100644
--- a/src/c++/perf_analyzer/docs/examples/profile.py
+++ b/src/c++/perf_analyzer/docs/examples/profile.py
@@ -449,13 +449,13 @@ def prepare_export_file(args, prompt):
 
 def prepare_input_data(input_data, prompt):
     """Insert the prompt to send into input JSON data."""
-    input_data["data"][0]["PROMPT"] = [prompt]
+    input_data["data"][0]["text_input"] = [prompt]
     save_json_data(input_data, INPUT_FILENAME)
 
 
 def generate_prompts(args, input_data):
     """Generate dummy prompts if not specified by input JSON file."""
-    prompt = input_data["data"][0]["PROMPT"][0]
+    prompt = input_data["data"][0]["text_input"][0]
     if not prompt:
         # Generate dummy prompt
         assert args.prompt_size_range, "Must specify --prompt-size-range."
@@ -464,28 +464,42 @@ def generate_prompts(args, input_data):
     return [prompt]
 
 
-def construct_input_data(args):
-    """Construct input data that contains input tensors and parameters.
+def construct_vllm_input_data(args):
+    """Construct input data that contains input tensors and parameters for vLLM.
 
     Parse the input JSON file (if exists) to construct the input data.
     When user sets parameters through command line, overwrite the
     parameters set by input JSON file.
     """
-    prompt = ""
-    stream = True
-    sampling_params = {}
+    # Default sampling parameters
+    sampling_params = {
+        "max_tokens": 256,
+        "ignore_eos": False,
+    }
 
     if args.input_data:
-        data = load_json_data(filename=args.input_data)["data"][0]
-        stream = data["STREAM"][0] if "STREAM" in data else stream
-        prompt = data["PROMPT"][0] if "PROMPT" in data else prompt
-        if "SAMPLING_PARAMETERS" in data:
-            sampling_params = json.loads(data["SAMPLING_PARAMETERS"][0])
+        input_data = load_json_data(filename=args.input_data)
+        if "sampling_parameters" in input_data["data"][0]:
+            loaded_params = input_data["data"][0]["sampling_parameters"][0]
+            loaded_params = json.loads(loaded_params or "null")
+            sampling_params = loaded_params if loaded_params else sampling_params
+    else:
+        # Default input JSON
+        input_data = {
+            "data": [
+                {
+                    "text_input": [""],
+                    "stream": [True],
+                    "sampling_parameters": [""],
+                }
+            ]
+        }
+
+    # If command line option is specified, overwrite
     if args.offline:
-        stream = False
-    elif not stream:
+        input_data["data"][0]["stream"] = [False]
+    elif not input_data["data"][0]["stream"]:
         args.offline = True
 
     if args.max_tokens:
@@ -496,20 +510,61 @@ def construct_input_data(args):
         args.max_tokens = 256  # default
     sampling_params["max_tokens"] = args.max_tokens
 
-    if "ignore_eos" not in sampling_params:
+    if args.ignore_eos:
+        sampling_params["ignore_eos"] = args.ignore_eos
+    elif "ignore_eos" in sampling_params:
+        args.ignore_eos = sampling_params["ignore_eos"]
+    else:
+        args.ignore_eos = False  # default
         sampling_params["ignore_eos"] = args.ignore_eos
-    elif args.ignore_eos:
-        sampling_params["ignore_eos"] = True
 
-    input_data = {"data": [{}]}
-    input_data["data"][0]["PROMPT"] = [prompt]
-    input_data["data"][0]["STREAM"] = [stream]
-    input_data["data"][0]["SAMPLING_PARAMETERS"] = [json.dumps(sampling_params)]
+    input_data["data"][0]["sampling_parameters"] = [json.dumps(sampling_params)]
+    return input_data
+
+
+def construct_trtllm_input_data(args):
+    """Construct input data that contains input tensors and parameters for TRT-LLM.
+
+    Parse the input JSON file (if exists) to construct the input data.
+    When user sets parameters through command line, overwrite the
+    parameters set by input JSON file.
+    """
+    # Default input JSON
+    if args.input_data:
+        input_data = load_json_data(filename=args.input_data)
+    else:
+        input_data = {
+            "data": [
+                {
+                    "text_input": [""],
+                    "stream": [True],
+                    "max_tokens": [256],
+                    "bad_words": [""],
+                    "stop_words": [""],
+                }
+            ]
+        }
+
+    # If command line option is specified, overwrite
+    if args.offline:
+        input_data["data"][0]["stream"] = [False]
+    elif not input_data["data"][0]["stream"]:
+        args.offline = True
+
+    if args.max_tokens:
+        input_data["data"][0]["max_tokens"] = [args.max_tokens]
+    else:
+        args.max_tokens = input_data["data"][0]["max_tokens"]
+
     return input_data
 
 
 def main(args):
-    input_data = construct_input_data(args)
+    if args.model == "ensemble":
+        input_data = construct_trtllm_input_data(args)
+    elif args.model == "vllm_model":
+        input_data = construct_vllm_input_data(args)
+
     prompts = generate_prompts(args, input_data)
 
     for prompt in prompts:
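For illustration, a minimal sketch of how the new TRT-LLM path above might be exercised, assuming a TensorRT-LLM model is already being served under the model name `ensemble` (the name checked in `main()`). The field names follow the defaults in `construct_trtllm_input_data()`; the prompt text and values are made-up examples.

```bash
# Sketch only: hand-written input JSON for the TRT-LLM ("ensemble") path.
# Keys mirror the defaults in construct_trtllm_input_data(); values are examples.
echo '
{
    "data": [
        {
            "text_input": ["Hello, my name is"],
            "stream": [true],
            "max_tokens": [256],
            "bad_words": [""],
            "stop_words": [""]
        }
    ]
}
' > input_data.json

# Assumes a TRT-LLM model repository is already running and exposed as "ensemble".
python profile.py -m ensemble --input-data input_data.json
```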
diff --git a/src/c++/perf_analyzer/docs/llm.md b/src/c++/perf_analyzer/docs/llm.md
index 1de686c1b..82d365a44 100644
--- a/src/c++/perf_analyzer/docs/llm.md
+++ b/src/c++/perf_analyzer/docs/llm.md
@@ -33,20 +33,28 @@ The following guide shows the reader how to use Triton to measure and
 characterize the performance behaviors of Large Language Models (LLMs) using
 Triton with [vLLM](https://github.com/vllm-project/vllm).
 
-### Setup: Download and configure Triton Server environment
+### Setup: Download and configure Triton vLLM Backend
 
-From [Step 1 of the Triton vLLM tutorial](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#step-1-build-a-triton-container-image-with-vllm).
+Download the pre-built Triton Server Container with vLLM backend from the
+[NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver)
+registry.
 
 ```bash
-git clone https://github.com/triton-inference-server/tutorials
-cd tutorials/Quick_Deploy/vLLM
-docker build -t tritonserver_vllm .
-# wait for command to finish, might take several minutes
+docker pull nvcr.io/nvidia/tritonserver:23.10-vllm-python-py3
 ```
 
-Upon successful build, run the following command to start the Triton Server container:
+Run the Triton Server container with the
+[vLLM backend](https://github.com/triton-inference-server/vllm_backend) and
+launch the server.
 
 ```bash
-docker run --gpus all -it --rm -p 8001:8001 --shm-size=1G --ulimit memlock=-1 --ulimit stack=67108864 -v ${PWD}:/work -w /work tritonserver_vllm tritonserver --model-store ./model_repository
+git clone -b r23.10 https://github.com/triton-inference-server/vllm_backend.git
+cd vllm_backend
+
+docker run --gpus all --rm -it --net host \
+    --shm-size=2G --ulimit memlock=-1 --ulimit stack=67108864 \
+    -v $(pwd)/samples/model_repository:/model_repository \
+    nvcr.io/nvidia/tritonserver:23.10-vllm-python-py3 \
+    tritonserver --model-repository /model_repository
 ```
 
 Next run the following command to start the Triton SDK container:
@@ -69,7 +77,7 @@ Inside the client container, run the following command to generate dummy
 prompts of size 100, 300, and 500 and receive single token from the model for
 each prompt.
 
 ```bash
-python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1
+python profile.py -m vllm_model --prompt-size-range 100 500 200 --max-tokens 1
 
 # [ BENCHMARK SUMMARY ]
 # Prompt size: 100
@@ -105,7 +113,7 @@ python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1
 > }
 > ' > input_data.json
 >
-> $ python profile.py -m vllm --input-data input_data.json
+> $ python profile.py -m vllm_model --input-data input_data.json
 > ```
@@ -122,7 +130,7 @@ of size 100, 300, and 500 and receive total 256 tokens from the model for each
 prompts.
 
 ```bash
-python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos
+python profile.py -m vllm_model --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos
 
 # [ BENCHMARK SUMMARY ]
 # Prompt size: 100
@@ -157,7 +165,7 @@ Run the following command inside the client container.
 pip install matplotlib
 
 # Run Perf Analyzer
-python profile.py -m vllm --prompt-size-range 10 10 1 --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos
+python profile.py -m vllm_model --prompt-size-range 10 10 1 --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos
 
 # [ BENCHMARK SUMMARY ]
 # Prompt size: 10
@@ -179,7 +187,7 @@ split them into multiple segments of responses. For instance, assume we ran
 the following benchmark command:
 
 ```bash
-python profile.py -m vllm --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos
+python profile.py -m vllm_model --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos
 ```
 
 We start from a single request and increment up to 4 requests one by one for
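For illustration, a minimal sketch of a custom `--input-data` file using the renamed vLLM input fields (`text_input`, `stream`, `sampling_parameters`) that `construct_vllm_input_data()` now reads. The prompt text and sampling values are made-up examples; note that `sampling_parameters` holds a JSON-encoded string.

```bash
# Sketch only: custom input JSON using the renamed vLLM fields.
# The sampling_parameters entry is a JSON string, as parsed by json.loads()
# in construct_vllm_input_data(); prompt and values are examples.
echo '
{
    "data": [
        {
            "text_input": ["Hello, my name is"],
            "stream": [true],
            "sampling_parameters": [
                "{\"max_tokens\": 256, \"ignore_eos\": true}"
            ]
        }
    ]
}
' > input_data.json

python profile.py -m vllm_model --input-data input_data.json
```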