From c9af25a1d50f939845fcfe63e44c037308880f80 Mon Sep 17 00:00:00 2001 From: Elias Bermudez Date: Wed, 21 Aug 2024 17:16:58 -0700 Subject: [PATCH 1/4] Add doc generation script with doc templates --- .../genai-perf-templates/README_template | 567 ++++++++++++++++++ .../genai-perf-templates/compare_template | 250 ++++++++ .../genai-perf-templates/embeddings_template | 106 ++++ templates/genai-perf-templates/files_template | 130 ++++ templates/genai-perf-templates/lora_template | 54 ++ .../genai-perf-templates/multi_modal_template | 123 ++++ .../genai-perf-templates/rankings_template | 101 ++++ .../genai-perf-templates/tutorial_template | 302 ++++++++++ templates/generate_docs.py | 55 ++ templates/template_vars.yaml | 43 ++ 10 files changed, 1731 insertions(+) create mode 100644 templates/genai-perf-templates/README_template create mode 100644 templates/genai-perf-templates/compare_template create mode 100644 templates/genai-perf-templates/embeddings_template create mode 100644 templates/genai-perf-templates/files_template create mode 100644 templates/genai-perf-templates/lora_template create mode 100644 templates/genai-perf-templates/multi_modal_template create mode 100644 templates/genai-perf-templates/rankings_template create mode 100644 templates/genai-perf-templates/tutorial_template create mode 100755 templates/generate_docs.py create mode 100644 templates/template_vars.yaml diff --git a/templates/genai-perf-templates/README_template b/templates/genai-perf-templates/README_template new file mode 100644 index 00000000..6bb310a4 --- /dev/null +++ b/templates/genai-perf-templates/README_template @@ -0,0 +1,567 @@ + + +# GenAI-Perf + +GenAI-Perf is a command line tool for measuring the throughput and latency of +generative AI models as served through an inference server. +For large language models (LLMs), GenAI-Perf provides metrics such as +[output token throughput](#output_token_throughput_metric), +[time to first token](#time_to_first_token_metric), +[inter token latency](#inter_token_latency_metric), and +[request throughput](#request_throughput_metric). +For a full list of metrics please see the [Metrics section](#metrics). + +Users specify a model name, an inference server URL, the type of inputs to use +(synthetic or from dataset), and the type of load to generate (number of +concurrent requests, request rate). + +GenAI-Perf generates the specified load, measures the performance of the +inference server and reports the metrics in a simple table as console output. +The tool also logs all results in a csv and json file that can be used to derive +additional metrics and visualizations. The inference server must already be +running when GenAI-Perf is run. + +You can use GenAI-Perf to run performance benchmarks on +- [Large Language Models](docs/tutorial.md) +- [Vision Language Models](docs/multi_modal.md) +- [Embedding Models](docs/embeddings.md) +- [Ranking Models](docs/rankings.md) +- [Multiple LoRA Adapters](docs/lora.md) + +> [!Note] +> GenAI-Perf is currently in early release and under rapid development. While we +> will try to remain consistent, command line options and functionality are +> subject to change as the tool matures. + +
+ + + +## Installation + +The easiest way to install GenAI-Perf is through +[Triton Server SDK container](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver). +Install the latest release using the following command: + +```bash +export RELEASE="{{ release }}" + +docker run -it --net=host --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk + +# Check out genai_perf command inside the container: +genai-perf --help +``` + +
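+Results written inside the container are lost when it exits, so if you want to
+keep the generated artifacts, you can mount a host directory into the container
+and point GenAI-Perf at it (a minimal sketch; the mount path and model settings
+are placeholders to adapt to your setup):
+
+```bash
+export RELEASE="{{ release }}"
+
+docker run -it --net=host --gpus=all \
+  -v $PWD/genai-perf-results:/workspace/results \
+  nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
+
+# Inside the container, write artifacts to the mounted directory:
+genai-perf profile -m gpt2 --service-kind triton --backend tensorrtllm \
+  --artifact-dir /workspace/results
+```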
+ +Alternatively, to install from source: + +Since GenAI-Perf depends on Perf Analyzer, +you'll need to install the Perf Analyzer binary: + +### Install Perf Analyzer (Ubuntu, Python 3.8+) + +**NOTE**: you must already have CUDA 12 installed +(checkout the [CUDA installation guide](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)). + +```bash +pip install tritonclient + +apt update && apt install -y --no-install-recommends libb64-0d libcurl4 +``` + +You can also build Perf Analyzer [from source](../docs/install.md#build-from-source) as well. + +### Install GenAI-Perf from source + +```bash +git clone https://github.com/triton-inference-server/perf_analyzer.git && cd perf_analyzer + +pip install -e genai-perf +``` + +
+ +
+ + + +## Quick Start + +In this quick start, we will use GenAI-Perf to run performance benchmarking on +the GPT-2 model running on Triton Inference Server with a TensorRT-LLM engine. + +### Serve GPT-2 TensorRT-LLM model using Triton CLI + +You can follow the [quickstart guide](https://github.com/triton-inference-server/triton_cli?tab=readme-ov-file#serving-a-trt-llm-model) +on Triton CLI github repo to run GPT-2 model locally. +The full instructions are copied below for convenience: + +```bash +# This container comes with all of the dependencies for building TRT-LLM engines +# and serving the engine with Triton Inference Server. +docker run -ti \ + --gpus all \ + --network=host \ + --shm-size=1g --ulimit memlock=-1 \ + -v /tmp:/tmp \ + -v ${HOME}/models:/root/models \ + -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ + nvcr.io/nvidia/tritonserver:{{ release }}-trtllm-python-py3 + +# Install the Triton CLI +pip install git+https://github.com/triton-inference-server/triton_cli.git@{{ triton_cli_version }} + +# Build TRT LLM engine and generate a Triton model repository pointing at it +triton remove -m all +triton import -m gpt2 --backend tensorrtllm + +# Start Triton pointing at the default model repository +triton start +``` + +### Running GenAI-Perf + +Now we can run GenAI-Perf from Triton Inference Server SDK container: + +```bash +export RELEASE="{{ release }}" + +docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk + +# Run GenAI-Perf in the container: +genai-perf profile \ + -m gpt2 \ + --service-kind triton \ + --backend tensorrtllm \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --streaming \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --output-tokens-mean-deterministic \ + --tokenizer hf-internal-testing/llama-tokenizer \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8001 +``` + +Example output: + +``` + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ +│ Time to first token (ms) │ 11.70 │ 9.88 │ 17.21 │ 14.35 │ 12.01 │ 11.87 │ +│ Inter token latency (ms) │ 1.46 │ 1.08 │ 1.89 │ 1.87 │ 1.62 │ 1.52 │ +│ Request latency (ms) │ 161.24 │ 153.45 │ 200.74 │ 200.66 │ 179.43 │ 162.23 │ +│ Output sequence length │ 103.39 │ 95.00 │ 134.00 │ 120.08 │ 107.30 │ 105.00 │ +│ Input sequence length │ 200.01 │ 200.00 │ 201.00 │ 200.13 │ 200.00 │ 200.00 │ +└──────────────────────────┴────────┴────────┴────────┴────────┴────────┴────────┘ +Output token throughput (per sec): 635.61 +Request throughput (per sec): 6.15 +``` + +See [Tutorial](docs/tutorial.md) for additional examples. + +
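+The metrics shown in the console table are also logged to CSV and JSON files,
+so you can post-process them after the run. A minimal sketch of locating those
+files (this assumes the default `artifacts` directory and the
+`my_profile_export.json` name used in the command above; the exact subdirectory
+layout can differ between releases):
+
+```bash
+# List everything the benchmark produced
+find artifacts -type f | sort
+
+# Print the exported per-metric statistics that accompany the console table
+find artifacts -name "my_profile_export_genai_perf.csv" -exec cat {} \;
+```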
+ + + +## Visualization + +GenAI-Perf can also generate various plots that visualize the performance of the +current profile run. This is disabled by default but users can easily enable it +by passing the `--generate-plots` option when running the benchmark: + +```bash +genai-perf profile \ + -m gpt2 \ + --service-kind triton \ + --backend tensorrtllm \ + --streaming \ + --concurrency 1 \ + --generate-plots +``` + +This will generate a [set of default plots](docs/compare.md#example-plots) such as: +- Time to first token (TTFT) analysis +- Request latency analysis +- TTFT vs Input sequence lengths +- Inter token latencies vs Token positions +- Input sequence lengths vs Output sequence lengths + + +### Using `compare` Subcommand to Visualize Multiple Runs + +The `compare` subcommand in GenAI-Perf facilitates users in comparing multiple +profile runs and visualizing the differences through plots. + +#### Usage +Assuming the user possesses two profile export JSON files, +namely `profile1.json` and `profile2.json`, +they can execute the `compare` subcommand using the `--files` option: + +```bash +genai-perf compare --files profile1.json profile2.json +``` + +Executing the above command will perform the following actions under the +`compare` directory: +1. Generate a YAML configuration file (e.g. `config.yaml`) containing the +metadata for each plot generated during the comparison process. +2. Automatically generate the [default set of plots](docs/compare.md#example-plots) +(e.g. TTFT vs. Input Sequence Lengths) that compare the two profile runs. + +``` +compare +├── config.yaml +├── distribution_of_input_sequence_lengths_to_output_sequence_lengths.jpeg +├── request_latency.jpeg +├── time_to_first_token.jpeg +├── time_to_first_token_vs_input_sequence_lengths.jpeg +├── token-to-token_latency_vs_output_token_position.jpeg +└── ... +``` + +#### Customization +Users have the flexibility to iteratively modify the generated YAML configuration +file to suit their specific requirements. +They can make alterations to the plots according to their preferences and execute +the command with the `--config` option followed by the path to the modified +configuration file: + +```bash +genai-perf compare --config compare/config.yaml +``` + +This command will regenerate the plots based on the updated configuration settings, +enabling users to refine the visual representation of the comparison results as +per their needs. + +See [Compare documentation](docs/compare.md) for more details. + +
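+As an end-to-end sketch, the profile export files passed to `compare` are just
+the `--profile-export-file` outputs of earlier runs. For example, two runs that
+differ only in concurrency could be compared like this (illustrative values;
+reuse the server setup from the Quick Start or your own deployment):
+
+```bash
+# Run 1: concurrency 1
+genai-perf profile -m gpt2 --service-kind triton --backend tensorrtllm \
+  --streaming --concurrency 1 --profile-export-file profile1.json
+
+# Run 2: concurrency 4
+genai-perf profile -m gpt2 --service-kind triton --backend tensorrtllm \
+  --streaming --concurrency 4 --profile-export-file profile2.json
+
+# Compare the two runs and generate the default plots
+genai-perf compare --files profile1.json profile2.json
+```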
+ + + +## Model Inputs + +GenAI-Perf supports model input prompts from either synthetically generated +inputs, or from the HuggingFace +[OpenOrca](https://huggingface.co/datasets/Open-Orca/OpenOrca) or +[CNN_DailyMail](https://huggingface.co/datasets/cnn_dailymail) datasets. This is +specified using the `--input-dataset` CLI option. + +When the dataset is synthetic, you can specify the following options: +* `--num-prompts `: The number of unique prompts to generate as stimulus, >= 1. +* `--synthetic-input-tokens-mean `: The mean of number of tokens in the + generated prompts when using synthetic data, >= 1. +* `--synthetic-input-tokens-stddev `: The standard deviation of number of + tokens in the generated prompts when using synthetic data, >= 0. +* `--random-seed `: The seed used to generate random values, >= 0. + +When the dataset is coming from HuggingFace, you can specify the following +options: +* `--input-dataset {openorca,cnn_dailymail}`: HuggingFace dataset to use for + benchmarking. +* `--num-prompts `: The number of unique prompts to generate as stimulus, >= 1. + +When the dataset is coming from a file, you can specify the following +options: +* `--input-file `: The input file containing the prompts to + use for benchmarking as JSON objects. + +For any dataset, you can specify the following options: +* `--output-tokens-mean `: The mean number of tokens in each output. Ensure + the `--tokenizer` value is set correctly, >= 1. +* `--output-tokens-stddev `: The standard deviation of the number of tokens + in each output. This is only used when output-tokens-mean is provided, >= 1. +* `--output-tokens-mean-deterministic`: When using `--output-tokens-mean`, this + flag can be set to improve precision by setting the minimum number of tokens + equal to the requested number of tokens. This is currently supported with the + Triton service-kind. Note that there is still some variability in the + requested number of output tokens, but GenAi-Perf attempts its best effort + with your model to get the right number of output tokens. + +You can optionally set additional model inputs with the following option: +* `--extra-inputs :`: An additional input for use with the + model with a singular value, such as `stream:true` or `max_tokens:5`. This + flag can be repeated to supply multiple extra inputs. + +For [Large Language Models](docs/tutorial.md), there is no batch size (i.e. +batch size is always `1`). Each request includes the inputs for one individual +inference. Other modes such as the [embeddings](docs/embeddings.md) and +[rankings](docs/rankings.md) endpoints support client-side batching, where +`--batch-size N` means that each request sent will include the inputs for `N` +separate inferences, allowing them to be processed together. + +
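+For example, to benchmark with your own prompts rather than synthetic or
+HuggingFace data, you can write a small JSONL file using the `text_input`
+format accepted by `--input-file` and attach extra request fields with
+`--extra-inputs` (a sketch; adjust the model and service options to match your
+deployment):
+
+```bash
+echo '{"text_input": "What is deep learning?"}
+{"text_input": "Summarize the plot of Hamlet in two sentences."}' > inputs.jsonl
+
+genai-perf profile \
+  -m gpt2 \
+  --service-kind triton \
+  --backend tensorrtllm \
+  --input-file inputs.jsonl \
+  --extra-inputs max_tokens:128
+```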
+ + + +## Metrics + +GenAI-Perf collects a diverse set of metrics that captures the performance of +the inference server. + +| Metric | Description | Aggregations | +| - | - | - | +| Time to First Token | Time between when a request is sent and when its first response is received, one value per request in benchmark | Avg, min, max, p99, p90, p75 | +| Inter Token Latency | Time between intermediate responses for a single request divided by the number of generated tokens of the latter response, one value per response per request in benchmark | Avg, min, max, p99, p90, p75 | +| Request Latency | Time between when a request is sent and when its final response is received, one value per request in benchmark | Avg, min, max, p99, p90, p75 | +| Output Sequence Length | Total number of output tokens of a request, one value per request in benchmark | Avg, min, max, p99, p90, p75 | +| Input Sequence Length | Total number of input tokens of a request, one value per request in benchmark | Avg, min, max, p99, p90, p75 | +| Output Token Throughput | Total number of output tokens from benchmark divided by benchmark duration | None–one value per benchmark | +| Request Throughput | Number of final responses from benchmark divided by benchmark duration | None–one value per benchmark | + +
+ + + +## Command Line Options + +##### `-h` +##### `--help` + +Show the help message and exit. + +### Endpoint Options: + +##### `-m ` +##### `--model ` + +The names of the models to benchmark. +A single model is recommended, unless you are +[profiling multiple LoRA adapters](docs/lora.md). (default: `None`) + +##### `--model-selection-strategy {round_robin, random}` + +When multiple models are specified, this is how a specific model +is assigned to a prompt. Round robin means that each model receives +a request in order. Random means that assignment is uniformly random +(default: `round_robin`) + +##### `--backend {tensorrtllm,vllm}` + +When using the "triton" service-kind, this is the backend of the model. For the +TRT-LLM backend, you currently must set `exclude_input_in_output` to true in the +model config to not echo the input tokens in the output. (default: tensorrtllm) + +##### `--endpoint ` + +Set a custom endpoint that differs from the OpenAI defaults. (default: `None`) + +##### `--endpoint-type {chat,completions,embeddings,rankings}` + +The endpoint-type to send requests to on the server. This is only used with the +`openai` service-kind. (default: `None`) + +##### `--service-kind {triton,openai}` + +The kind of service perf_analyzer will generate load for. In order to use +`openai`, you must specify an api via `--endpoint-type`. (default: `triton`) + +##### `--streaming` + +An option to enable the use of the streaming API. (default: `False`) + +##### `-u ` +##### `--url ` + +URL of the endpoint to target for benchmarking. (default: `None`) + +### Input Options + +##### `-b ` +##### `--batch-size ` + +The batch size of the requests GenAI-Perf should send. +This is currently only supported with the +[embeddings endpoint type](docs/embeddings.md). +(default: `1`) and +[rankings endpoint type](docs/rankings.md). + +##### `--extra-inputs ` + +Provide additional inputs to include with every request. You can repeat this +flag for multiple inputs. Inputs should be in an input_name:value format. +Alternatively, a string representing a json formatted dict can be provided. +(default: `None`) + +##### `--input-dataset {openorca,cnn_dailymail}` + +The HuggingFace dataset to use for prompts. +(default: `openorca`) + +##### `--input-file ` + +The input file containing the prompts to use for profiling. +Each line should be a JSON object with a 'text_input' field in JSONL format. +Example: {\"text_input\": \"Your prompt here\"}" + +##### `--num-prompts ` + +The number of unique prompts to generate as stimulus. (default: `100`) + +##### `--output-tokens-mean ` + +The mean number of tokens in each output. Ensure the `--tokenizer` value is set +correctly. (default: `-1`) + +##### `--output-tokens-mean-deterministic` + +When using `--output-tokens-mean`, this flag can be set to improve precision by +setting the minimum number of tokens equal to the requested number of tokens. +This is currently supported with the Triton service-kind. Note that there is +still some variability in the requested number of output tokens, but GenAi-Perf +attempts its best effort with your model to get the right number of output +tokens. (default: `False`) + +##### `--output-tokens-stddev ` + +The standard deviation of the number of tokens in each output. This is only used +when `--output-tokens-mean` is provided. (default: `0`) + +##### `--random-seed ` + +The seed used to generate random values. 
(default: `0`) + +##### `--synthetic-input-tokens-mean ` + +The mean of number of tokens in the generated prompts when using synthetic +data. (default: `550`) + +##### `--synthetic-input-tokens-stddev ` + +The standard deviation of number of tokens in the generated prompts when +using synthetic data. (default: `0`) + +### Profiling Options + +##### `--concurrency ` + +The concurrency value to benchmark. (default: `None`) + +##### `--measurement-interval ` +##### `-p ` + +The time interval used for each measurement in milliseconds. Perf Analyzer +will sample a time interval specified and take measurement over the requests +completed within that time interval. (default: `10000`) + +##### `--request-rate ` + +Sets the request rate for the load generated by PA. (default: `None`) + +##### `-s ` +##### `--stability-percentage ` + +The allowed variation in latency measurements when determining if a result is +stable. The measurement is considered as stable if the ratio of max / min from +the recent 3 measurements is within (stability percentage) in terms of both +infer per second and latency. (default: `999`) + +### Output Options + +##### `--artifact-dir` + +The directory to store all the (output) artifacts generated by GenAI-Perf and +Perf Analyzer. (default: `artifacts`) + +##### `--generate-plots` + +An option to enable the generation of plots. (default: False) + +##### `--profile-export-file ` + +The path where the perf_analyzer profile export will be generated. By default, +the profile export will be to `profile_export.json`. The genai-perf files will be +exported to `_genai_perf.json` and +`_genai_perf.csv`. For example, if the profile +export file is `profile_export.json`, the genai-perf file will be exported to +`profile_export_genai_perf.csv`. (default: `profile_export.json`) + +### Other Options + +##### `--tokenizer ` + +The HuggingFace tokenizer to use to interpret token metrics from prompts and +responses. (default: `hf-internal-testing/llama-tokenizer`) + +##### `-v` +##### `--verbose` + +An option to enable verbose mode. (default: `False`) + +##### `--version` + +An option to print the version and exit. + +
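+As a quick reference, the options above can be combined freely. For example, a
+request-rate run that keeps its artifacts in a dedicated directory might look
+like this (illustrative values only):
+
+```bash
+genai-perf profile \
+  -m gpt2 \
+  --service-kind triton \
+  --backend tensorrtllm \
+  --streaming \
+  --request-rate 10 \
+  --measurement-interval 8000 \
+  --artifact-dir artifacts/request-rate-10 \
+  --tokenizer hf-internal-testing/llama-tokenizer \
+  -v
+```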
+ + + +## Known Issues + +* GenAI-Perf can be slow to finish if a high request-rate is provided +* Token counts may not be exact + diff --git a/templates/genai-perf-templates/compare_template b/templates/genai-perf-templates/compare_template new file mode 100644 index 00000000..26c7ceb9 --- /dev/null +++ b/templates/genai-perf-templates/compare_template @@ -0,0 +1,250 @@ + + +# GenAI-Perf Compare Subcommand + +There are two approaches for the users to use the `compare` subcommand to create +plots across multiple runs. First is to directly pass the profile export files +with `--files` option + +## Running initially with `--files` option + +If the user does not have a YAML configuration file, +they can run the `compare` subcommand with the `--files` option to generate a +set of default plots as well as a pre-filled YAML config file for the plots. + +```bash +genai-perf compare --files profile1.json profile2.json profile3.json +``` + +This will generate the default plots and compare across the three runs. +GenAI-Perf will also generate an initial YAML configuration file `config.yaml` +that is pre-filled with plot configurations as following: + +```yaml +plot1: + title: Time to First Token + x_metric: '' + y_metric: time_to_first_tokens + x_label: Time to First Token (ms) + y_label: '' + width: 1200 + height: 700 + type: box + paths: + - profile1.json + - profile2.json + - profile3.json + output: compare +plot2: + title: Request Latency + x_metric: '' + y_metric: request_latencies + x_label: Request Latency (ms) + y_label: '' + width: 1200 + height: 700 + type: box + paths: + - profile1.json + - profile2.json + - profile3.json + output: compare +plot3: + title: Distribution of Input Sequence Lengths to Output Sequence Lengths + x_metric: input_sequence_lengths + y_metric: output_sequence_lengths + x_label: Input Sequence Length + y_label: Output Sequence Length + width: 1200 + height: 450 + type: heatmap + paths: + - profile1.json + - profile2.json + - profile3.json + output: compare +plot4: + title: Time to First Token vs Input Sequence Lengths + x_metric: input_sequence_lengths + y_metric: time_to_first_tokens + x_label: Input Sequence Length + y_label: Time to First Token (ms) + width: 1200 + height: 700 + type: scatter + paths: + - profile1.json + - profile2.json + - profile3.json + output: compare +plot5: + title: Token-to-Token Latency vs Output Token Position + x_metric: token_positions + y_metric: inter_token_latencies + x_label: Output Token Position + y_label: Token-to-Token Latency (ms) + width: 1200 + height: 700 + type: scatter + paths: + - profile1.json + - profile2.json + - profile3.json + output: compare +``` + +Once the user has the YAML configuration file, +they can repeat the process of editing the config file and running with +`--config` option to re-generate the plots iteratively. + +```bash +# edit +vi config.yaml + +# re-generate the plots +genai-perf compare --config config.yaml +``` + +## Running directly with `--config` option + +If the user would like to create a custom plot (other than the default ones provided), +they can build their own YAML configuration file that contains the information +about the plots they would like to generate. 
+For instance, if the user would like to see how the inter token latencies change +by the number of output tokens, which is not part of the default plots, +they could add the following YAML block to the file: + +```yaml +plot1: + title: Inter Token Latency vs Output Tokens + x_metric: num_output_tokens + y_metric: inter_token_latencies + x_label: Num Output Tokens + y_label: Avg ITL (ms) + width: 1200 + height: 450 + type: scatter + paths: + - + - + output: compare +``` + +After adding the lines, the user can run the following command to generate the +plots specified in the configuration file (in this case, `config.yaml`): + +```bash +genai-perf compare --config config.yaml +``` + +The user can check the generated plots under the output directory: +``` +compare/ +├── inter_token_latency_vs_output_tokens.jpeg +└── ... +``` + +## YAML Schema + +Here are more details about the YAML configuration file and its stricture. +The general YAML schema for the plot configuration looks as following: + +```yaml +plot1: + title: [str] + x_metric: [str] + y_metric: [str] + x_label: [str] + y_label: [str] + width: [int] + height: [int] + type: [scatter,box,heatmap] + paths: + - [str] + - ... + output: [str] + +plot2: + title: [str] + x_metric: [str] + y_metric: [str] + x_label: [str] + y_label: [str] + width: [int] + height: [int] + type: [scatter,box,heatmap] + paths: + - [str] + - ... + output: [str] + +# add more plots +``` + +The user can add as many plots they would like to generate by adding the plot +blocks in the configuration file (they have a key pattern of `plot<#>`, +but that is not required and the user can set it to any arbitrary string). +For each plot block, the user can specify the following configurations: +- `title`: The title of the plot. +- `x_metric`: The name of the metric to be used on the x-axis. +- `y_metric`: The name of the metric to be used on the y-axis. +- `x_label`: The x-axis label (or description) +- `y_label`: The y-axis label (or description) +- `width`: The width of the entire plot +- `height`: The height of the entire plot +- `type`: The type of the plot. It must be one of the three: `scatter`, `box`, +or `heatmap`. +- `paths`: List of paths to the profile export files to compare. +- `output`: The path to the output directory to store all the plots and YAML +configuration file. + +> [!Note] +> User *MUST* provide at least one valid path to the profile export file. + + + +## Example Plots + +Here are the list of sample plots that gets created by default from running the +`compare` subcommand: + +### Distribution of Input Sequence Lengths to Output Sequence Lengths + + +### Request Latency Analysis + + +### Time to First Token Analysis + + +### Time to First Token vs. Input Sequence Lengths + + +### Token-to-Token Latency vs. Output Token Position + diff --git a/templates/genai-perf-templates/embeddings_template b/templates/genai-perf-templates/embeddings_template new file mode 100644 index 00000000..f0109532 --- /dev/null +++ b/templates/genai-perf-templates/embeddings_template @@ -0,0 +1,106 @@ + + +# Profile Embeddings Models with GenAI-Perf + +GenAI-Perf allows you to profile embedding models running on an +[OpenAI Embeddings API](https://platform.openai.com/docs/api-reference/embeddings)-compatible server. 
+ +## Create a Sample Embeddings Input File + +To create a sample embeddings input file, use the following command: + +```bash +echo '{"text": "What was the first car ever driven?"} +{"text": "Who served as the 5th President of the United States of America?"} +{"text": "Is the Sydney Opera House located in Australia?"} +{"text": "In what state did they film Shrek 2?"}' > embeddings.jsonl +``` + +This will generate a file named embeddings.jsonl with the following content: +```jsonl +{"text": "What was the first car ever driven?"} +{"text": "Who served as the 5th President of the United States of America?"} +{"text": "Is the Sydney Opera House located in Australia?"} +{"text": "In what state did they film Shrek 2?"} +``` + +## Start an OpenAI Embeddings-Compatible Server +To start an OpenAI embeddings-compatible server, run the following command: +```bash +docker run -it --net=host --rm --gpus=all vllm/vllm-openai:latest --model intfloat/e5-mistral-7b-instruct --dtype float16 --max-model-len 1024 +``` + +## Run GenAI-Perf +To profile embeddings models using GenAI-Perf, use the following command: + +```bash +genai-perf profile \ + -m intfloat/e5-mistral-7b-instruct \ + --service-kind openai \ + --endpoint-type embeddings \ + --batch-size 2 \ + --input-file embeddings.jsonl +``` + +* `-m intfloat/e5-mistral-7b-instruct` is to specify what model you want to run + (`intfloat/e5-mistral-7b-instruct`) +* `--service-kind openai` is to specify that the server type is OpenAI-API + compatible +* `--endpoint-type embeddings` is to specify that the sent requests should be + formatted to follow the [embeddings + API](https://platform.openai.com/docs/api-reference/embeddings/create) +* `--batch-size 2` is to specify that each request will contain the inputs for 2 + individual inferences, making a batch size of 2 +* `--input-file embeddings.jsonl` is to specify the input data to be used for + inferencing + +This will use default values for optional arguments. You can also pass in +additional arguments with the `--extra-inputs` [flag](../README.md#input-options). +For example, you could use this command: + +```bash +genai-perf profile \ + -m intfloat/e5-mistral-7b-instruct \ + --service-kind openai \ + --endpoint-type embeddings \ + --extra-inputs user:sample_user +``` + +Example output: + +``` + Embeddings Metrics +┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━┩ +│ Request latency (ms) │ 42.21 │ 28.18 │ 318.61 │ 56.50 │ 49.21 │ 43.07 │ +└──────────────────────┴───────┴───────┴────────┴───────┴───────┴───────┘ +Request throughput (per sec): 23.63 +``` + diff --git a/templates/genai-perf-templates/files_template b/templates/genai-perf-templates/files_template new file mode 100644 index 00000000..5859a49e --- /dev/null +++ b/templates/genai-perf-templates/files_template @@ -0,0 +1,130 @@ + + +# Generated File Structures + +## Overview + +This document serves as a guide to understanding the structure and contents of +the files generated by GenAi-Perf. + +## Directory Structure + +After running GenAi-Perf, your file tree should contain the following: + +``` +genai-perf/ +├── artifacts/ +│ ├── data/ +│ └── images/ +``` + +## File Types +Within the artifacts and docs directories, several file types are generated, +including .gzip, .csv, .json, .html, and .jpeg. Below is a detailed +explanation of each file and its purpose. 
+ +### Artifacts Directory + +#### Data Subdirectory + +The data subdirectory contains the raw and processed performance data files. + +##### GZIP Files + +- all_data.gzip: Aggregated performance data from all collected metrics. +- input_sequence_lengths_vs_output_sequence_lengths.gzip: This contains data on +the input sequence lengths versus the output sequence lengths for each request. +- request_latency.gzip: This contains the latency for each request. +- time_to_first_token.gzip: This contains the time to first token for each request. +- token_to_token_vs_output_position.gzip: This contains the time from one token +generation to the next versus the position of the output token for each token. +- ttft_vs_input_sequence_lengths.gzip: This contains the time to first token +versus the input sequence length for each request. + +##### JSON Files + +- llm_inputs.json: This contains the input prompts provided to the LLM during testing. +- profile_export.json: This is provided by Perf Analyzer and contains the timestamps +for each event in the lifecycle of each request. This is low-level data used to calculate +metrics by GenAi-Perf. + +##### CSV File + +- profile_export_genai_perf.csv: A CSV of the output tables printed +in the GenAi-Perf output. These may have more detail than the printed tables. + +#### Images Subdirectory + +The images subdirectory contains visual representations of the performance +data. All images are in both HTML and JPEG formats. + +##### HTML and JPEG Files +- input_sequence_lengths_vs_output_sequence_lengths: A heat map showing the +relationship between input and generated tokens. +- request_latency: A box plot showing request latency. +- time_to_first_token: A box plot showing time to first token. +- token_to_token_vs_output_position: A scatterplot showing token-to-token +time versus output token position. +- ttft_vs_input_sequence_lengths: A scatterplot showing token-to-token time +versus the input sequence lengths. + +## Usage Instructions + +To use the generated files, navigate to the artifacts/data directory. Then, +the next steps depend on the file format you wish to work with. + +### GZIP Files + +The GZIP files contain Parquet files with calculated data, which can be read +with Pandas in Python. For example, you can create a dataframe with these files: + +``` +import pandas +df = pandas.read_partquet(path_to_file)` +``` + +You can then use Pandas to work with the data. + +``` +print(df.head()) # See the first few rows of the data. +print(df.describe()) # Get summary statistics for the data +``` + +### CSV and JSON Files +Open .csv and .json files with spreadsheet or JSON parsing tools for structured +data analysis. These can also be read via a text editor, like Vim. + +### HTML Files + +View .html visualizations in a web browser for interactive data exploration. + +### JPEG Files + +Use an image software to open .jpeg images for static visual representations. + diff --git a/templates/genai-perf-templates/lora_template b/templates/genai-perf-templates/lora_template new file mode 100644 index 00000000..056480da --- /dev/null +++ b/templates/genai-perf-templates/lora_template @@ -0,0 +1,54 @@ + + +# Profile Multiple LoRA Adapters +GenAI-Perf allows you to profile multiple LoRA adapters on top of a base model. 
+ +## Select LoRA Adapters +To do this, list multiple adapters after the model name option `-m`: + +```bash +genai-perf -m lora_adapter1 lora_adapter2 lora_adapter3 +``` + +## Choose a Strategy for Selecting Models +When profiling with multiple models, you can specify how the models should be +assigned to prompts using the `--model-selection-strategy` option: + +```bash +genai-perf profile \ + -m lora_adapter1 lora_adapter2 lora_adapter3 \ + --model-selection-strategy round_robin +``` + +This setup will cycle through the lora_adapter1, lora_adapter2, and +lora_adapter3 models in a round-robin manner for each prompt. + +For more details on additional options and configurations, refer to the +[Command Line Options section](../README.md#command-line-options) in the README. + diff --git a/templates/genai-perf-templates/multi_modal_template b/templates/genai-perf-templates/multi_modal_template new file mode 100644 index 00000000..6c2bf45a --- /dev/null +++ b/templates/genai-perf-templates/multi_modal_template @@ -0,0 +1,123 @@ + + +# Profile Vision-Language Models with GenAI-Perf + +GenAI-Perf allows you to profile Vision-Language Models (VLM) running on +[OpenAI Chat Completions API](https://platform.openai.com/docs/guides/chat-completions)-compatible server +by sending [multi-modal content](https://platform.openai.com/docs/guides/vision) to the server. +Currently, you can send multi-modal contents with GenAI-Perf using the following two approaches: +1. The synthetic data generation approach, where GenAI-Perf generates the multi-modal data for you. +2. The Bring Your Own Data (BYOD) approach, where you provide GenAI-Perf with the data to send. + +Before we dive into the two approaches, +you can start OpenAI API compatible server with a VLM model using following command: + +```bash +docker run --runtime nvidia --gpus all \ + -p 8000:8000 --ipc=host \ + vllm/vllm-openai:latest \ + --model llava-hf/llava-v1.6-mistral-7b-hf --dtype float16 +``` + + +## Approach 1: Synthetic Multi-Modal Data Generation + +GenAI-Perf can generate synthetic multi-modal data such as texts or images using +the parameters provide by the user through CLI. + +```bash +genai-perf profile \ + -m llava-hf/llava-v1.6-mistral-7b-hf \ + --service-kind openai \ + --endpoint-type vision \ + --image-width-mean 512 \ + --image-width-stddev 30 \ + --image-height-mean 512 \ + --image-height-stddev 30 \ + --image-format png \ + --synthetic-input-tokens-mean 100 \ + --synthetic-input-tokens-stddev 0 \ + --streaming +``` + +> [!Note] +> Under the hood, GenAI-Perf generates synthetic images using a few source images +> under the `llm_inputs/source_images` directory. +> If you would like to add/remove/edit the source images, +> you can do so by directly editing the source images under the directory. +> GenAI-Perf will pickup the images under the directory automatically when +> generating the synthetic images. + + +## Approach 2: Bring Your Own Data (BYOD) + +Instead of letting GenAI-Perf create the synthetic data, +you can also provide GenAI-Perf with your own data using +[`--input-file`](../README.md#--input-file-path) CLI option. +The file needs to be in JSONL format and should contain both the prompt and +the filepath to the image to send. 
+ +For instance, an example of input file would look something as following: +```bash +// input.jsonl +{"text_input": "What is in this image?", "image": "path/to/image1.png"} +{"text_input": "What is the color of the dog?", "image": "path/to/image2.jpeg"} +{"text_input": "Describe the scene in the picture.", "image": "path/to/image3.png"} +... +``` + +After you create the file, you can run GenAI-Perf using the following command: + +```bash +genai-perf profile \ + -m llava-hf/llava-v1.6-mistral-7b-hf \ + --service-kind openai \ + --endpoint-type vision \ + --input-file input.jsonl \ + --streaming +``` + +Running GenAI-Perf using either approach will give you an example output that +looks like below: + +```bash + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━┩ +│ Time to first token (ms) │ 321.05 │ 291.30 │ 537.07 │ 497.88 │ 318.46 │ 317.35 │ +│ Inter token latency (ms) │ 12.28 │ 11.44 │ 12.88 │ 12.87 │ 12.81 │ 12.53 │ +│ Request latency (ms) │ 1,866.23 │ 1,044.70 │ 2,832.22 │ 2,779.63 │ 2,534.64 │ 2,054.03 │ +│ Output sequence length │ 126.68 │ 59.00 │ 204.00 │ 200.58 │ 177.80 │ 147.50 │ +│ Input sequence length │ 100.00 │ 100.00 │ 100.00 │ 100.00 │ 100.00 │ 100.00 │ +└──────────────────────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┘ +Output token throughput (per sec): 67.40 +Request throughput (per sec): 0.53 +``` + diff --git a/templates/genai-perf-templates/rankings_template b/templates/genai-perf-templates/rankings_template new file mode 100644 index 00000000..cad19011 --- /dev/null +++ b/templates/genai-perf-templates/rankings_template @@ -0,0 +1,101 @@ + + +# Profile Ranking Models with GenAI-Perf + + +GenAI-Perf allows you to profile ranking models compatible with Hugging Face's +[Text Embeddings Inference's re-ranker API](https://huggingface.co/docs/text-embeddings-inference/en/quick_tour#re-rankers). + +## Create a Sample Rankings Input Directory + +To create a sample rankings input directory, follow these steps: + +Create a directory called rankings_jsonl: +```bash +mkdir rankings_jsonl +``` + +Inside this directory, create a JSONL file named queries.jsonl with queries data: + +```bash +echo '{"text": "What was the first car ever driven?"} +{"text": "Who served as the 5th President of the United States of America?"} +{"text": "Is the Sydney Opera House located in Australia?"} +{"text": "In what state did they film Shrek 2?"}' > rankings_jsonl/queries.jsonl +``` + +Create another JSONL file named passages.jsonl with passages data: + +```bash +echo '{"text": "Eric Anderson (born January 18, 1968) is an American sociologist and sexologist."} +{"text": "Kevin Loader is a British film and television producer."} +{"text": "Francisco Antonio Zea Juan Francisco Antonio Hilari was a Colombian journalist, botanist, diplomat, politician, and statesman who served as the 1st Vice President of Colombia."} +{"text": "Daddys Home 2 Principal photography on the film began in Massachusetts in March 2017 and it was released in the United States by Paramount Pictures on November 10, 2017. 
Although the film received unfavorable reviews, it has grossed over $180 million worldwide on a $69 million budget."}' > rankings_jsonl/passages.jsonl +``` + +## Start a Hugging Face Re-Ranker-Compatible Server +To start a Hugging Face re-ranker-compatible server, run the following commands: + +```bash +model=BAAI/bge-reranker-large +revision=refs/pr/4 +volume=$PWD/data + +docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.3 --model-id $model --revision $revision +``` + +## Run GenAI-Perf +To profile ranking models using GenAI-Perf, use the following command: + +```bash +genai-perf profile \ + -m BAAI/bge-reranker-large \ + --service-kind openai \ + --endpoint-type rankings \ + --endpoint rerank \ + --input-file rankings_jsonl/ \ + -u localhost:8080 \ + --extra-inputs rankings:tei \ + --batch-size 2 +``` + +This command specifies the use of Hugging Face's ranking API with `--endpoint rerank` and `--extra-inputs rankings:tei`. + +Example output: + +``` + Rankings Metrics +┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━┳━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━╇━━━━━━┩ +│ Request latency (ms) │ 5.48 │ 2.50 │ 23.91 │ 10.27 │ 8.34 │ 6.07 │ +└──────────────────────┴──────┴──────┴───────┴───────┴──────┴──────┘ +Request throughput (per sec): 180.11 +``` + diff --git a/templates/genai-perf-templates/tutorial_template b/templates/genai-perf-templates/tutorial_template new file mode 100644 index 00000000..27cd43ec --- /dev/null +++ b/templates/genai-perf-templates/tutorial_template @@ -0,0 +1,302 @@ + + +# Tutorials + +- [Profile GPT2 running on Triton + TensorRT-LLM](#tensorrt-llm) +- [Profile GPT2 running on Triton + vLLM](#triton-vllm) +- [Profile GPT2 running on OpenAI Chat Completions API-Compatible Server](#openai-chat) +- [Profile GPT2 running on OpenAI Completions API-Compatible Server](#openai-completions) + +--- + +## Profile GPT2 running on Triton + TensorRT-LLM + +### Run GPT2 on Triton Inference Server using TensorRT-LLM + +
+<details>
+<summary>See instructions</summary>
+
+Run Triton Inference Server with TensorRT-LLM backend container:
+
+```bash
+export RELEASE="{{ release }}"
+
+docker run -it --net=host --gpus=all --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-trtllm-python-py3
+
+# Install Triton CLI (~5 min):
+pip install "git+https://github.com/triton-inference-server/triton_cli@{{ triton_cli_version }}"
+
+# Download model:
+triton import -m gpt2 --backend tensorrtllm
+
+# Run server:
+triton start
+```
+
+</details>
+ +### Run GenAI-Perf + +Run GenAI-Perf from Triton Inference Server SDK container: + +```bash +export RELEASE="{{ release }}" + +docker run -it --net=host --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk + +# Run GenAI-Perf in the container: +genai-perf profile \ + -m gpt2 \ + --service-kind triton \ + --backend tensorrtllm \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --streaming \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --output-tokens-mean-deterministic \ + --tokenizer hf-internal-testing/llama-tokenizer \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8001 +``` + +Example output: + +``` + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ +│ Time to first token (ns) │ 13,266,974 │ 11,818,732 │ 18,351,779 │ 16,513,479 │ 13,741,986 │ 13,544,376 │ +│ Inter token latency (ns) │ 2,069,766 │ 42,023 │ 15,307,799 │ 3,256,375 │ 3,020,580 │ 2,090,930 │ +│ Request latency (ns) │ 223,532,625 │ 219,123,330 │ 241,004,192 │ 238,198,306 │ 229,676,183 │ 224,715,918 │ +│ Output sequence length │ 104 │ 100 │ 129 │ 128 │ 109 │ 105 │ +│ Input sequence length │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ +└──────────────────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ +Output token throughput (per sec): 460.42 +Request throughput (per sec): 4.44 +``` + +## Profile GPT2 running on Triton + vLLM + +### Run GPT2 on Triton Inference Server using vLLM + +
+<details>
+<summary>See instructions</summary>
+
+Run Triton Inference Server with vLLM backend container:
+
+```bash
+export RELEASE="{{ release }}"
+
+
+docker run -it --net=host --gpus=1 --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-vllm-python-py3
+
+# Install Triton CLI (~5 min):
+pip install "git+https://github.com/triton-inference-server/triton_cli@0.0.8"
+
+# Download model:
+triton import -m gpt2 --backend vllm
+
+# Run server:
+triton start
+```
+
+</details>
+ +### Run GenAI-Perf + +Run GenAI-Perf from Triton Inference Server SDK container: + +```bash +export RELEASE="{{ release }}" + +docker run -it --net=host --gpus=1 nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk + +# Run GenAI-Perf in the container: +genai-perf profile \ + -m gpt2 \ + --service-kind triton \ + --backend vllm \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --streaming \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --output-tokens-mean-deterministic \ + --tokenizer hf-internal-testing/llama-tokenizer \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8001 +``` + +Example output: + +``` + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ +│ Time to first token (ns) │ 15,786,560 │ 11,437,189 │ 49,550,549 │ 40,129,652 │ 21,248,091 │ 17,824,695 │ +│ Inter token latency (ns) │ 3,543,380 │ 591,898 │ 10,013,690 │ 6,152,260 │ 5,039,278 │ 4,060,982 │ +│ Request latency (ns) │ 388,415,721 │ 312,552,612 │ 528,229,817 │ 518,189,390 │ 484,281,365 │ 459,417,637 │ +│ Output sequence length │ 113 │ 105 │ 123 │ 122 │ 119 │ 115 │ +│ Input sequence length │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ +└──────────────────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ +Output token throughput (per sec): 290.24 +Request throughput (per sec): 2.57 +``` + +## Profile GPT2 running on OpenAI Chat API-Compatible Server + +### Run GPT2 on [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat)-compatible server + +
+<details>
+<summary>See instructions</summary>
+
+Run the vLLM inference server:
+
+```bash
+docker run -it --net=host --gpus=all vllm/vllm-openai:latest --model gpt2 --dtype float16 --max-model-len 1024
+```
+
+</details>
+ +### Run GenAI-Perf + +Run GenAI-Perf from Triton Inference Server SDK container: + +```bash +export RELEASE="{{ release }}" + +docker run -it --net=host --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk + +# Run GenAI-Perf in the container: +genai-perf profile \ + -m gpt2 \ + --service-kind openai \ + --endpoint v1/chat/completions \ + --endpoint-type chat \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --streaming \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --tokenizer hf-internal-testing/llama-tokenizer \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8000 +``` + +Example output: + +``` + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ +│ Time to first token (ns) │ 13,546,815 │ 9,821,658 │ 48,317,756 │ 34,361,913 │ 16,541,625 │ 14,612,026 │ +│ Inter token latency (ns) │ 2,560,813 │ 457,703 │ 6,507,334 │ 3,754,617 │ 3,059,158 │ 2,953,540 │ +│ Request latency (ns) │ 283,597,027 │ 240,098,890 │ 361,730,568 │ 349,164,037 │ 323,279,761 │ 306,507,562 │ +│ Output sequence length │ 114 │ 103 │ 142 │ 136 │ 122 │ 119 │ +│ Input sequence length │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ +└──────────────────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ +Output token throughput (per sec): 401.62 +Request throughput (per sec): 3.52 +``` + +## Profile GPT2 running on OpenAI Completions API-Compatible Server + +### Running GPT2 on [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions)-compatible server + +
+<details>
+<summary>See instructions</summary>
+
+Run the vLLM inference server:
+
+```bash
+docker run -it --net=host --gpus=all vllm/vllm-openai:latest --model gpt2 --dtype float16 --max-model-len 1024
+```
+
+</details>
+ +### Run GenAI-Perf + +Run GenAI-Perf from Triton Inference Server SDK container: + +```bash +export RELEASE="{{ release }}" + +docker run -it --net=host --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk + + +# Run GenAI-Perf in the container: +genai-perf profile \ + -m gpt2 \ + --service-kind openai \ + --endpoint v1/completions \ + --endpoint-type completions \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --tokenizer hf-internal-testing/llama-tokenizer \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8000 +``` + +Example output: + +``` + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ +│ Request latency (ns) │ 296,990,497 │ 43,312,449 │ 332,788,242 │ 327,475,292 │ 317,392,767 │ 310,343,333 │ +│ Output sequence length │ 109 │ 11 │ 158 │ 142 │ 118 │ 113 │ +│ Input sequence length │ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ +└────────────────────────┴─────────────┴────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ +Output token throughput (per sec): 366.78 +Request throughput (per sec): 3.37 +``` + diff --git a/templates/generate_docs.py b/templates/generate_docs.py new file mode 100755 index 00000000..333d1bc1 --- /dev/null +++ b/templates/generate_docs.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +# mypy: ignore-errors +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import os +from pathlib import Path + +import yaml +from jinja2 import Environment, FileSystemLoader + +# read the yaml file +with open("template_vars.yaml") as file: + data = yaml.load(file, Loader=yaml.FullLoader) + +# create the jinja2 environment +env = Environment(loader=FileSystemLoader(".")) +for file in data.keys(): + template = env.get_template(data[file]["template"]) + + # render the template with the data and print the output + output = template.render(data[file]) + + # grab the path to the output directory + output_dir = os.path.join( + Path(data[file]["output_dir"]), Path(data[file]["filename"]) + ) + + # write the output to a file + with open(output_dir, "w") as file: + file.write(output) diff --git a/templates/template_vars.yaml b/templates/template_vars.yaml new file mode 100644 index 00000000..19ad987e --- /dev/null +++ b/templates/template_vars.yaml @@ -0,0 +1,43 @@ +README: + filename: README.md + template: genai-perf-templates/README_template + release: 24.08 + triton_cli_version: 0.0.8 + output_dir: ../genai-perf/ + +compare: + filename: compare.md + template: genai-perf-templates/compare_template + output_dir: ../genai-perf/docs/ + +embeddings: + filename: embeddings.md + template: genai-perf-templates/embeddings_template + output_dir: ../genai-perf/docs/ + +files: + filename: files.md + template: genai-perf-templates/files_template + output_dir: ../genai-perf/docs/ + +lora: + filename: lora.md + template: genai-perf-templates/lora_template + output_dir: ../genai-perf/docs/ + +multi_modal: + filename: multi_modal.md + template: genai-perf-templates/multi_modal_template + output_dir: ../genai-perf/docs/ + +rankings: + filename: rankings.md + template: genai-perf-templates/rankings_template + output_dir: ../genai-perf/docs/ + +tutorial: + filename: tutorial.md + template: genai-perf-templates/tutorial_template + release: 24.08 + triton_cli_version: 0.0.8 + output_dir: ../genai-perf/docs/ From a38f23519c08c1e7f98946562cc8ad5e320d8842 Mon Sep 17 00:00:00 2001 From: Elias Bermudez Date: Wed, 21 Aug 2024 17:32:23 -0700 Subject: [PATCH 2/4] Set autoescape to True to avoid XSS attacks --- templates/generate_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/generate_docs.py b/templates/generate_docs.py index 333d1bc1..0841e2f9 100755 --- a/templates/generate_docs.py +++ b/templates/generate_docs.py @@ -38,7 +38,7 @@ data = yaml.load(file, Loader=yaml.FullLoader) # create the jinja2 environment -env = Environment(loader=FileSystemLoader(".")) +env = Environment(loader=FileSystemLoader("."), autoescape=True) for file in data.keys(): template = env.get_template(data[file]["template"]) From 7f856ddc86692f97d0372eb0ab91083b940cf4f7 Mon Sep 17 00:00:00 2001 From: Elias Bermudez Date: Thu, 22 Aug 2024 12:58:36 -0700 Subject: [PATCH 3/4] Further centralize template vars to 1 entry with the ability to specialize --- templates/generate_docs.py | 10 +++++++++- templates/template_vars.yaml | 8 ++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/templates/generate_docs.py b/templates/generate_docs.py index 0841e2f9..38e68822 100755 --- a/templates/generate_docs.py +++ b/templates/generate_docs.py @@ -40,10 +40,17 @@ # create the jinja2 environment env = Environment(loader=FileSystemLoader("."), autoescape=True) for file in data.keys(): + if "template" not in data[file]: + continue + template = env.get_template(data[file]["template"]) + file_vars = data["General"].copy() + + if file in data: + file_vars.update(data[file]) # render 
the template with the data and print the output - output = template.render(data[file]) + output = template.render(file_vars) # grab the path to the output directory output_dir = os.path.join( @@ -53,3 +60,4 @@ # write the output to a file with open(output_dir, "w") as file: file.write(output) + file_vars.clear() diff --git a/templates/template_vars.yaml b/templates/template_vars.yaml index 19ad987e..f6abd04a 100644 --- a/templates/template_vars.yaml +++ b/templates/template_vars.yaml @@ -1,8 +1,10 @@ +General: + release: 24.08 + triton_cli_version: 0.0.8 + README: filename: README.md template: genai-perf-templates/README_template - release: 24.08 - triton_cli_version: 0.0.8 output_dir: ../genai-perf/ compare: @@ -38,6 +40,4 @@ rankings: tutorial: filename: tutorial.md template: genai-perf-templates/tutorial_template - release: 24.08 - triton_cli_version: 0.0.8 output_dir: ../genai-perf/docs/ From 1f9c6b06a67c87ef2f6f5ed051b8521ac815fdee Mon Sep 17 00:00:00 2001 From: Elias Bermudez Date: Thu, 22 Aug 2024 13:00:25 -0700 Subject: [PATCH 4/4] Add ability to run script from the root dir --- templates/generate_docs.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/templates/generate_docs.py b/templates/generate_docs.py index 38e68822..d98f0f98 100755 --- a/templates/generate_docs.py +++ b/templates/generate_docs.py @@ -33,6 +33,13 @@ import yaml from jinja2 import Environment, FileSystemLoader +# Global constants +PA_ABSLT_PATH = os.environ.get("PA_ABSLT_PATH", os.getcwd()) +PA_TEMPLATES_ABSLT_PATH = os.path.join(PA_ABSLT_PATH, "templates") + +# Change working directory to perf_analyzer/templates. +os.chdir(PA_TEMPLATES_ABSLT_PATH) + # read the yaml file with open("template_vars.yaml") as file: data = yaml.load(file, Loader=yaml.FullLoader) @@ -61,3 +68,6 @@ with open(output_dir, "w") as file: file.write(output) file_vars.clear() + +# Change working directory to perf_analyzer. +os.chdir(PA_ABSLT_PATH)