diff --git a/client_backend/openai/openai_client.cc b/client_backend/openai/openai_client.cc index cd517f6a..9b167fae 100644 --- a/client_backend/openai/openai_client.cc +++ b/client_backend/openai/openai_client.cc @@ -63,6 +63,7 @@ namespace openai { void ChatCompletionRequest::SendResponse(bool is_final, bool is_null) { + final_response_sent_ = is_final; response_callback_(new ChatCompletionResult( http_code_, std::move(response_buffer_), is_final, is_null, request_id_)); } @@ -172,7 +173,11 @@ ChatCompletionClient::AsyncInfer( request->timer_.CaptureTimestamp( triton::client::RequestTimers::Kind::REQUEST_END); UpdateInferStat(request->timer_); - if (!request->is_stream_) { + + // Send final response on request completion + // if it has not already been sent. + // (e.g. in the case of seeing [DONE] in streaming case) + if (!request->IsFinalResponseSent()) { request->SendResponse(true /* is_final */, false /* is_null */); } }; diff --git a/client_backend/openai/openai_client.h b/client_backend/openai/openai_client.h index aadcb325..00ccbd5f 100644 --- a/client_backend/openai/openai_client.h +++ b/client_backend/openai/openai_client.h @@ -121,12 +121,14 @@ class ChatCompletionRequest : public HttpRequest { request_id_(request_id) { } + bool IsFinalResponseSent() { return final_response_sent_; }; void SendResponse(bool is_final, bool is_null); bool is_stream_{false}; std::function response_callback_{nullptr}; // The timers for infer request. triton::client::RequestTimers timer_; const std::string request_id_; + bool final_response_sent_{false}; }; class ChatCompletionClient : public HttpClient { diff --git a/docs/cli.md b/docs/cli.md index 399596fd..bd82415c 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -157,6 +157,13 @@ will also be reported in the results. Default is `-1` indicating that the average latency is used to determine stability. +#### `--request-count=` + +Specifies a total number of requests to use for measurement. + +Default is `0`, which means that there is no request count and the measurement +will proceed using windows until stabilization is detected. + #### `-r ` #### `--max-trials=` diff --git a/genai-perf/README.md b/genai-perf/README.md index 24c1efe3..1d03b3dd 100644 --- a/genai-perf/README.md +++ b/genai-perf/README.md @@ -162,7 +162,7 @@ docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE} 2. Run GenAI-Perf: ```bash -genai-perf \ +genai-perf profile \ -m gpt2 \ --service-kind triton \ --backend tensorrtllm \ @@ -209,7 +209,7 @@ current profile run. This is disabled by default but users can easily enable it by passing the `--generate-plots` option when running the benchmark: ```bash -genai-perf \ +genai-perf profile \ -m gpt2 \ --service-kind triton \ --backend tensorrtllm \ @@ -301,8 +301,8 @@ options: When the dataset is coming from a file, you can specify the following options: -* `--input-file `: The input file containing the single prompt to - use for benchmarking. +* `--input-file `: The input file containing the prompts to + use for benchmarking as JSON objects. For any dataset, you can specify the following options: * `--output-tokens-mean `: The mean number of tokens in each output. Ensure @@ -373,7 +373,7 @@ model config to not echo the input tokens in the output. (default: tensorrtllm) Set a custom endpoint that differs from the OpenAI defaults. (default: `None`) -##### `--endpoint-type {chat,completions}` +##### `--endpoint-type {chat,completions,embeddings,rankings}` The endpoint-type to send requests to on the server. 
This is only used with the `openai` service-kind. (default: `None`) @@ -394,6 +394,15 @@ URL of the endpoint to target for benchmarking. (default: `None`) ## Input Options +##### `-b ` +##### `--batch-size ` + +The batch size of the requests GenAI-Perf should send. +This is currently only supported with the +[embeddings endpoint type](docs/embeddings.md). +(default: `1`) and +[rankings endpoint type](docs/rankings.md). + ##### `--extra-inputs ` Provide additional inputs to include with every request. You can repeat this diff --git a/genai-perf/docs/embeddings.md b/genai-perf/docs/embeddings.md new file mode 100644 index 00000000..e508f9ef --- /dev/null +++ b/genai-perf/docs/embeddings.md @@ -0,0 +1,93 @@ + + +# Profile Embeddings Models with GenAI-Perf + +GenAI-Perf allows you to profile embedding models running on an +[OpenAI Embeddings API](https://platform.openai.com/docs/api-reference/embeddings)-compatible server. + +## Create a Sample Embeddings Input File + +To create a sample embeddings input file, use the following command: + +```bash +echo '{"text": "What was the first car ever driven?"} +{"text": "Who served as the 5th President of the United States of America?"} +{"text": "Is the Sydney Opera House located in Australia?"} +{"text": "In what state did they film Shrek 2?"}' > embeddings.jsonl +``` + +This will generate a file named embeddings.jsonl with the following content: +```jsonl +{"text": "What was the first car ever driven?"} +{"text": "Who served as the 5th President of the United States of America?"} +{"text": "Is the Sydney Opera House located in Australia?"} +{"text": "In what state did they film Shrek 2?"} +``` + +## Start an OpenAI Embeddings-Compatible Server +To start an OpenAI embeddings-compatible server, run the following command: +```bash +docker run -it --net=host --rm --gpus=all vllm/vllm-openai:latest --model intfloat/e5-mistral-7b-instruct --dtype float16 --max-model-len 1024 +``` + +## Run GenAI-Perf +To profile embeddings models using GenAI-Perf, use the following command: + +```bash +genai-perf profile \ + -m intfloat/e5-mistral-7b-instruct \ + --service-kind openai \ + --endpoint-type embeddings \ + --batch-size 2 \ + --input-file embeddings.jsonl +``` + +This will use default values for optional arguments. You can also pass in +additional arguments with the `--extra-inputs` [flag](../README.md#input-options). +For example, you could use this command: + +```bash +genai-perf profile \ + -m intfloat/e5-mistral-7b-instruct \ + --service-kind openai \ + --endpoint-type embeddings \ + --extra-inputs user:sample_user +``` + +Example output: + +``` + Embeddings Metrics +┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━┩ +│ Request latency (ms) │ 42.21 │ 28.18 │ 318.61 │ 56.50 │ 49.21 │ 43.07 │ +└──────────────────────┴───────┴───────┴────────┴───────┴───────┴───────┘ +Request throughput (per sec): 23.63 +``` diff --git a/genai-perf/docs/lora.md b/genai-perf/docs/lora.md index 60be30c9..d30867ed 100644 --- a/genai-perf/docs/lora.md +++ b/genai-perf/docs/lora.md @@ -26,22 +26,22 @@ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --> -# Profiling Multiple LoRA Adapters +# Profile Multiple LoRA Adapters GenAI-Perf allows you to profile multiple LoRA adapters on top of a base model. 
-## Selecting LoRA Adapters +## Select LoRA Adapters To do this, list multiple adapters after the model name option `-m`: ```bash genai-perf -m lora_adapter1 lora_adapter2 lora_adapter3 ``` -## Choosing a Strategy for Selecting Models +## Choose a Strategy for Selecting Models When profiling with multiple models, you can specify how the models should be assigned to prompts using the `--model-selection-strategy` option: ```bash -genai-perf \ +genai-perf profile \ -m lora_adapter1 lora_adapter2 lora_adapter3 \ --model-selection-strategy round_robin ``` diff --git a/genai-perf/docs/rankings.md b/genai-perf/docs/rankings.md new file mode 100644 index 00000000..a316ef85 --- /dev/null +++ b/genai-perf/docs/rankings.md @@ -0,0 +1,100 @@ + + +# Profile Ranking Models with GenAI-Perf + + +GenAI-Perf allows you to profile ranking models compatible with Hugging Face's +[Text Embeddings Inference's re-ranker API](https://huggingface.co/docs/text-embeddings-inference/en/quick_tour#re-rankers). + +## Create a Sample Rankings Input Directory + +To create a sample rankings input directory, follow these steps: + +Create a directory called rankings_jsonl: +```bash +mkdir rankings_jsonl +``` + +Inside this directory, create a JSONL file named queries.jsonl with queries data: + +```bash +echo '{"text": "What was the first car ever driven?"} +{"text": "Who served as the 5th President of the United States of America?"} +{"text": "Is the Sydney Opera House located in Australia?"} +{"text": "In what state did they film Shrek 2?"}' > rankings_jsonl/queries.jsonl +``` + +Create another JSONL file named passages.jsonl with passages data: + +```bash +echo '{"text": "Eric Anderson (born January 18, 1968) is an American sociologist and sexologist."} +{"text": "Kevin Loader is a British film and television producer."} +{"text": "Francisco Antonio Zea Juan Francisco Antonio Hilari was a Colombian journalist, botanist, diplomat, politician, and statesman who served as the 1st Vice President of Colombia."} +{"text": "Daddys Home 2 Principal photography on the film began in Massachusetts in March 2017 and it was released in the United States by Paramount Pictures on November 10, 2017. Although the film received unfavorable reviews, it has grossed over $180 million worldwide on a $69 million budget."}' > rankings_jsonl/passages.jsonl +``` + +## Start a Hugging Face Re-Ranker-Compatible Server +To start a Hugging Face re-ranker-compatible server, run the following commands: + +```bash +model=BAAI/bge-reranker-large +revision=refs/pr/4 +volume=$PWD/data + +docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.3 --model-id $model --revision $revision +``` + +## Run GenAI-Perf +To profile ranking models using GenAI-Perf, use the following command: + +```bash +genai-perf profile \ + -m BAAI/bge-reranker-large \ + --service-kind openai \ + --endpoint-type rankings \ + --endpoint rerank \ + --input-file rankings_jsonl/ \ + -u localhost:8080 \ + --extra-inputs rankings:tei \ + --batch-size 2 +``` + +This command specifies the use of Hugging Face's ranking API with `--endpoint rerank` and `--extra-inputs rankings:tei`. 
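With `--extra-inputs rankings:tei`, each request GenAI-Perf generates follows TEI's re-rank schema: one sampled query plus `--batch-size` sampled passages under a `texts` key. As an optional sanity check (not part of the required steps, and assuming the server started above is listening on localhost:8080), you can send one such payload by hand:

```bash
curl localhost:8080/rerank \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"query": "Is the Sydney Opera House located in Australia?", "texts": ["Kevin Loader is a British film and television producer.", "Eric Anderson (born January 18, 1968) is an American sociologist and sexologist."]}'
```

A JSON list of relevance scores in the response indicates the endpoint is ready to be profiled.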
+ +Example output: + +``` + Rankings Metrics +┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━┳━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━╇━━━━━━┩ +│ Request latency (ms) │ 5.48 │ 2.50 │ 23.91 │ 10.27 │ 8.34 │ 6.07 │ +└──────────────────────┴──────┴──────┴───────┴───────┴──────┴──────┘ +Request throughput (per sec): 180.11 +``` diff --git a/genai-perf/docs/tutorial.md b/genai-perf/docs/tutorial.md index bc9dec71..1a37baf3 100644 --- a/genai-perf/docs/tutorial.md +++ b/genai-perf/docs/tutorial.md @@ -30,59 +30,49 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - [Profile GPT2 running on Triton + TensorRT-LLM](#tensorrt-llm) - [Profile GPT2 running on Triton + vLLM](#triton-vllm) -- [Profile GPT2 running on OpenAI API-Compatible Server](#openai) +- [Profile GPT2 running on OpenAI Chat Completions API-Compatible Server](#openai-chat) +- [Profile GPT2 running on OpenAI Completions API-Compatible Server](#openai-completions) --- ## Profile GPT2 running on Triton + TensorRT-LLM -### Running GPT2 on Triton Inference Server using TensorRT-LLM +### Run GPT2 on Triton Inference Server using TensorRT-LLM
See instructions -1. Run Triton Inference Server with TensorRT-LLM backend container: +Run Triton Inference Server with TensorRT-LLM backend container: ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" +export RELEASE="yy.mm" # e.g. export RELEASE="24.06" -docker run -it --net=host --rm --gpus=all --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-trtllm-python-py3 -``` - -2. Install Triton CLI (~5 min): +docker run -it --net=host --gpus=all --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-trtllm-python-py3 -```bash +# Install Triton CLI (~5 min): pip install "git+https://github.com/triton-inference-server/triton_cli@0.0.8" -``` -3. Download model: - -```bash +# Download model: triton import -m gpt2 --backend tensorrtllm -``` -4. Run server: - -```bash +# Run server: triton start ```
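Before starting the client, it can help to confirm the server came up cleanly. This optional check is not part of the tutorial steps; it assumes Triton's default HTTP port 8000 is reachable from the host and uses Triton's standard health endpoint:

```bash
# Should print 200 once the server is ready to accept inference requests
curl -s -o /dev/null -w "%{http_code}\n" localhost:8000/v2/health/ready
```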
-### Running GenAI-Perf +### Run GenAI-Perf -1. Run Triton Inference Server SDK container: +Run GenAI-Perf from Triton Inference Server SDK container: ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" - -docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk -``` +export RELEASE="yy.mm" # e.g. export RELEASE="24.06" -2. Run GenAI-Perf: +docker run -it --net=host --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk +# Run GenAI-Perf in the container: ```bash -genai-perf \ +genai-perf profile \ -m gpt2 \ --service-kind triton \ --backend tensorrtllm \ @@ -120,53 +110,43 @@ Request throughput (per sec): 4.44 ## Profile GPT2 running on Triton + vLLM -### Running GPT2 on Triton Inference Server using vLLM +### Run GPT2 on Triton Inference Server using vLLM
See instructions -1. Run Triton Inference Server with vLLM backend container: +Run Triton Inference Server with vLLM backend container: ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" +export RELEASE="yy.mm" # e.g. export RELEASE="24.06" -docker run -it --net=host --rm --gpus=all --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-vllm-python-py3 -``` -2. Install Triton CLI (~5 min): +docker run -it --net=host --gpus=1 --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-vllm-python-py3 -```bash +# Install Triton CLI (~5 min): pip install "git+https://github.com/triton-inference-server/triton_cli@0.0.8" -``` - -3. Download model: -```bash +# Download model: triton import -m gpt2 --backend vllm -``` - -4. Run server: -```bash +# Run server: triton start ```
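As an optional sanity check before profiling (assuming Triton's default HTTP port 8000 and the `gpt2` model name imported above), you can verify that the model finished loading:

```bash
# Should print 200 once the gpt2 model is loaded and ready
curl -s -o /dev/null -w "%{http_code}\n" localhost:8000/v2/models/gpt2/ready
```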
-### Running GenAI-Perf +### Run GenAI-Perf -1. Run Triton Inference Server SDK container: +Run GenAI-Perf from Triton Inference Server SDK container: ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" +export RELEASE="yy.mm" # e.g. export RELEASE="24.06" -docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk -``` - -2. Run GenAI-Perf: +docker run -it --net=host --gpus=1 nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk +# Run GenAI-Perf in the container: ```bash -genai-perf \ +genai-perf profile \ -m gpt2 \ --service-kind triton \ --backend vllm \ @@ -202,37 +182,33 @@ Output token throughput (per sec): 290.24 Request throughput (per sec): 2.57 ``` -## Profile GPT2 running on OpenAI API-Compatible Server - -### OpenAI Chat Completions API +## Profile GPT2 running on OpenAI Chat API-Compatible Server -#### Running GPT2 on [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat)-compatible server +### Run GPT2 on [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat)-compatible server
See instructions -1. Run the vLLM inference server: +Run the vLLM inference server: ```bash -docker run -it --net=host --rm --gpus=all vllm/vllm-openai:latest --model gpt2 --dtype float16 --max-model-len 1024 +docker run -it --net=host --gpus=all vllm/vllm-openai:latest --model gpt2 --dtype float16 --max-model-len 1024 ```
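Optionally, you can confirm the endpoint responds before benchmarking it. The request below is only a sanity check and assumes vLLM's default port of 8000:

```bash
curl localhost:8000/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{"model": "gpt2", "messages": [{"role": "user", "content": "Hello"}], "max_tokens": 16}'
```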
-#### Running GenAI-Perf
+### Run GenAI-Perf

-1. Run Triton Inference Server SDK container:
+Run GenAI-Perf from Triton Inference Server SDK container:

 ```bash
-export RELEASE="yy.mm" # e.g. export RELEASE="24.03"
+export RELEASE="yy.mm" # e.g. export RELEASE="24.06"

-docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
-```
-
-2. Run GenAI-Perf:
+docker run -it --net=host --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk

+# Run GenAI-Perf in the container:
 ```bash
-genai-perf \
+genai-perf profile \
 -m gpt2 \
 --service-kind openai \
 --endpoint v1/chat/completions \
@@ -268,35 +244,34 @@ Output token throughput (per sec): 401.62
 Request throughput (per sec): 3.52
 ```

-### OpenAI Completions API
+## Profile GPT2 running on OpenAI Completions API-Compatible Server

-#### Running GPT2 on [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions)-compatible server
+### Run GPT2 on [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions)-compatible server
See instructions -1. Run the vLLM inference server: +Run the vLLM inference server: ```bash -docker run -it --net=host --rm --gpus=all vllm/vllm-openai:latest --model gpt2 --dtype float16 --max-model-len 1024 +docker run -it --net=host --gpus=all vllm/vllm-openai:latest --model gpt2 --dtype float16 --max-model-len 1024 ```
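The Completions API takes a raw `prompt` string rather than a `messages` list, which is also what GenAI-Perf exercises with `--endpoint v1/completions`. As an optional sanity check (again assuming vLLM's default port of 8000), a minimal request looks like:

```bash
curl localhost:8000/v1/completions \
    -H 'Content-Type: application/json' \
    -d '{"model": "gpt2", "prompt": "Hello, my name is", "max_tokens": 16}'
```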
-#### Running GenAI-Perf +### Run GenAI-Perf -1. Run Triton Inference Server SDK container: +Run GenAI-Perf from Triton Inference Server SDK container: ```bash -export RELEASE="yy.mm" # e.g. export RELEASE="24.03" +export RELEASE="yy.mm" # e.g. export RELEASE="24.06" -docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk -``` +docker run -it --net=host --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk -2. Run GenAI-Perf: +# Run GenAI-Perf in the container: ```bash -genai-perf \ +genai-perf profile \ -m gpt2 \ --service-kind openai \ --endpoint v1/completions \ diff --git a/genai-perf/genai_perf/__init__.py b/genai-perf/genai_perf/__init__.py index 025456b0..cb5c2699 100644 --- a/genai-perf/genai_perf/__init__.py +++ b/genai-perf/genai_perf/__init__.py @@ -24,4 +24,4 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -__version__ = "0.0.3dev" +__version__ = "0.0.4dev" diff --git a/genai-perf/genai_perf/export_data/console_exporter.py b/genai-perf/genai_perf/export_data/console_exporter.py index bbd02b75..460fe597 100644 --- a/genai-perf/genai_perf/export_data/console_exporter.py +++ b/genai-perf/genai_perf/export_data/console_exporter.py @@ -26,7 +26,6 @@ from genai_perf.export_data.exporter_config import ExporterConfig -from genai_perf.llm_metrics import Metrics from rich.console import Console from rich.table import Table @@ -36,74 +35,73 @@ class ConsoleExporter: A class to export the statistics and arg values to the console. """ + STAT_COLUMN_KEYS = ["avg", "min", "max", "p99", "p90", "p75"] + def __init__(self, config: ExporterConfig): self._stats = config.stats + self._metrics = config.metrics + self._args = config.args + + def _get_title(self): + if self._args.endpoint_type == "embeddings": + return "Embeddings Metrics" + elif self._args.endpoint_type == "rankings": + return "Rankings Metrics" + else: + return "LLM Metrics" def export(self) -> None: - singular_metric_rows = [] - table = Table(title="LLM Metrics") + table = Table(title=self._get_title()) table.add_column("Statistic", justify="right", style="cyan", no_wrap=True) - stats = ["avg", "min", "max", "p99", "p90", "p75"] - for stat in stats: + for stat in self.STAT_COLUMN_KEYS: table.add_column(stat, justify="right", style="green") - for metric in Metrics.metric_labels: - formatted_metric = metric.replace("_", " ").capitalize() + # Request metrics table + self._construct_table(table) - # Throughput fields are printed after the table - is_throughput_field = metric in Metrics.throughput_fields - if is_throughput_field: - value = self._stats.get(f"{metric}", -1).get(stats[0], -1) - formatted_metric += f" (per sec): {value:.2f}" - singular_metric_rows.append(formatted_metric) - continue + console = Console() + console.print(table) - # TODO (TMA-1712): need to decide if we need this metric. Remove - # from statistics display for now. - # TODO (TMA-1678): output_token_throughput_per_request is treated - # separately since the current code treats all throughput metrics to - # be displayed outside of the statistics table. 
- if metric == "output_token_throughput_per_request": - formatted_metric += f" (per sec)" + # System metrics are printed after the table + for metric in self._metrics.system_metrics: + line = metric.name.replace("_", " ").capitalize() + value = self._stats[metric.name]["avg"] + line += f" ({metric.unit}): {value:.2f}" + print(line) + + def _construct_table(self, table: Table) -> None: + for metric in self._metrics.request_metrics: + if self._should_skip(metric.name): continue - is_time_field = metric in Metrics.time_fields - if is_time_field: - formatted_metric += " (ms)" - - row_values = [formatted_metric] - for stat in stats: - value = self._stats.get(f"{metric}", -1) - # Need to check for -1 for the non streaming case - if value == -1: - row_values.append(f"{value:,.2f}") - else: - value = value.get(stat, -1) - row_values.append(f"{value:,.2f}") - - # Without streaming, there is no inter-token latency available, so do not print it. - if metric == "inter_token_latency": - if all(float(value) < 0 for value in row_values[1:]): - continue - # Without streaming, TTFT and request latency are the same, so do not print TTFT. - elif metric == "time_to_first_token": - unique_values = False - for stat in stats: - value_ttft = self._stats.get(f"{metric}", -1).get(stat, -1) - value_req_latency = self._stats.get("request_latency", -1).get( - stat, -1 - ) - if value_ttft != value_req_latency: - unique_values = True - break - if not unique_values: - continue + metric_str = metric.name.replace("_", " ").capitalize() + metric_str += f" ({metric.unit})" if metric.unit != "tokens" else "" + row_values = [metric_str] + for stat in self.STAT_COLUMN_KEYS: + value = self._stats[metric.name][stat] + row_values.append(f"{value:,.2f}") table.add_row(*row_values) - console = Console() - console.print(table) - - for row in singular_metric_rows: - print(row) + # (TMA-1976) Refactor this method as the csv exporter shares identical method. + def _should_skip(self, metric_name: str) -> bool: + if self._args.endpoint_type == "embeddings": + return False # skip nothing + + # TODO (TMA-1712): need to decide if we need this metric. Remove + # from statistics display for now. + # TODO (TMA-1678): output_token_throughput_per_request is treated + # separately since the current code treats all throughput metrics to + # be displayed outside of the statistics table. + if metric_name == "output_token_throughput_per_request": + return True + + # When non-streaming, skip ITL and TTFT + streaming_metrics = [ + "inter_token_latency", + "time_to_first_token", + ] + if not self._args.streaming and metric_name in streaming_metrics: + return True + return False diff --git a/genai-perf/genai_perf/export_data/csv_exporter.py b/genai-perf/genai_perf/export_data/csv_exporter.py index 3677fe35..efbb9b75 100644 --- a/genai-perf/genai_perf/export_data/csv_exporter.py +++ b/genai-perf/genai_perf/export_data/csv_exporter.py @@ -29,7 +29,6 @@ import genai_perf.logging as logging from genai_perf.export_data.exporter_config import ExporterConfig -from genai_perf.llm_metrics import Metrics DEFAULT_OUTPUT_DATA_CSV = "profile_export_genai_perf.csv" @@ -41,97 +40,80 @@ class CsvExporter: A class to export the statistics and arg values in a csv format. 
""" + REQUEST_METRICS_HEADER = [ + "Metric", + "avg", + "min", + "max", + "p99", + "p95", + "p90", + "p75", + "p50", + "p25", + ] + + SYSTEM_METRICS_HEADER = [ + "Metric", + "Value", + ] + def __init__(self, config: ExporterConfig): self._stats = config.stats + self._metrics = config.metrics self._output_dir = config.artifact_dir + self._args = config.args def export(self) -> None: csv_filename = self._output_dir / DEFAULT_OUTPUT_DATA_CSV logger.info(f"Generating {csv_filename}") - multiple_metric_header = [ - "Metric", - "avg", - "min", - "max", - "p99", - "p95", - "p90", - "p75", - "p50", - "p25", - ] - - single_metric_header = [ - "Metric", - "Value", - ] - with open(csv_filename, mode="w", newline="") as csvfile: - singular_metric_rows = [] - csv_writer = csv.writer(csvfile) - csv_writer.writerow(multiple_metric_header) - - for metric in Metrics.metric_labels: - formatted_metric = metric.replace("_", " ").title() - - is_throughput_field = metric in Metrics.throughput_fields - is_time_field = metric in Metrics.time_fields - - if is_time_field: - formatted_metric += " (ms)" - elif is_throughput_field: - formatted_metric += " (per sec)" - # TODO (TMA-1712): need to decide if we need this metric. Do not - # include in the csv for now. - # TODO (TMA-1678): output_token_throughput_per_request is treated - # separately since the current code treats all throughput metrics - # to be displayed outside of the statistics table. - elif metric == "output_token_throughput_per_request": - formatted_metric += " (per sec)" - continue - - row_values = [formatted_metric] - - if is_throughput_field: - value = self._stats.get(f"{metric}", -1).get( - multiple_metric_header[1], -1 - ) - row_values.append(f"{value:.2f}") - singular_metric_rows.append(row_values) - continue - - for stat in multiple_metric_header[1:]: - value = self._stats.get(f"{metric}", -1) - # Need to check for -1 for the non streaming case - if value == -1: - row_values.append(f"{value:,.2f}") - else: - value = value.get(stat, -1) - row_values.append(f"{value:,.2f}") - - # Without streaming, there is no inter-token latency available, so do not print it. - if metric == "inter_token_latency": - if all(value == "-1" for value in row_values[1:]): - continue - # Without streaming, TTFT and request latency are the same, so do not print TTFT. 
- elif metric == "time_to_first_token": - unique_values = False - for stat in multiple_metric_header[1:]: - value_ttft = self._stats.get(f"{metric}", -1).get(stat, -1) - value_req_latency = self._stats.get("request_latency", -1).get( - stat, -1 - ) - if value_ttft != value_req_latency: - unique_values = True - break - if not unique_values: - continue - - csv_writer.writerow(row_values) - + self._write_request_metrics(csv_writer) csv_writer.writerow([]) - csv_writer.writerow(single_metric_header) - for row in singular_metric_rows: - csv_writer.writerow(row) + self._write_system_metrics(csv_writer) + + def _write_request_metrics(self, csv_writer) -> None: + csv_writer.writerow(self.REQUEST_METRICS_HEADER) + for metric in self._metrics.request_metrics: + if self._should_skip(metric.name): + continue + + metric_str = metric.name.replace("_", " ").title() + metric_str += f" ({metric.unit})" if metric.unit != "tokens" else "" + row_values = [metric_str] + for stat in self.REQUEST_METRICS_HEADER[1:]: + value = self._stats[metric.name][stat] + row_values.append(f"{value:,.2f}") + + csv_writer.writerow(row_values) + + def _write_system_metrics(self, csv_writer) -> None: + csv_writer.writerow(self.SYSTEM_METRICS_HEADER) + for metric in self._metrics.system_metrics: + metric_str = metric.name.replace("_", " ").title() + metric_str += f" ({metric.unit})" + value = self._stats[metric.name]["avg"] + csv_writer.writerow([metric_str, f"{value:.2f}"]) + + def _should_skip(self, metric_name: str) -> bool: + if self._args.endpoint_type == "embeddings": + return False # skip nothing + + # TODO (TMA-1712): need to decide if we need this metric. Remove + # from statistics display for now. + # TODO (TMA-1678): output_token_throughput_per_request is treated + # separately since the current code treats all throughput metrics to + # be displayed outside of the statistics table. + if metric_name == "output_token_throughput_per_request": + return True + + # When non-streaming, skip ITL and TTFT + streaming_metrics = [ + "inter_token_latency", + "time_to_first_token", + ] + if not self._args.streaming and metric_name in streaming_metrics: + return True + return False diff --git a/genai-perf/genai_perf/export_data/exporter_config.py b/genai-perf/genai_perf/export_data/exporter_config.py index 3f045196..0d9c7cd0 100644 --- a/genai-perf/genai_perf/export_data/exporter_config.py +++ b/genai-perf/genai_perf/export_data/exporter_config.py @@ -25,9 +25,13 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from genai_perf.metrics import Metrics + + class ExporterConfig: def __init__(self): self._stats = None + self._metrics = None self._args = None self._extra_inputs = None self._artifact_dir = None @@ -40,6 +44,14 @@ def stats(self): def stats(self, stats_value): self._stats = stats_value + @property + def metrics(self): + return self._metrics + + @metrics.setter + def metrics(self, metrics: Metrics): + self._metrics = metrics + @property def args(self): return self._args diff --git a/genai-perf/genai_perf/export_data/json_exporter.py b/genai-perf/genai_perf/export_data/json_exporter.py index c5a0f36c..2ec24fae 100644 --- a/genai-perf/genai_perf/export_data/json_exporter.py +++ b/genai-perf/genai_perf/export_data/json_exporter.py @@ -58,8 +58,9 @@ def export(self) -> None: f.write(json.dumps(self._stats_and_args, indent=2)) def _prepare_args_for_export(self) -> None: - del self._args["func"] - del self._args["output_format"] + self._args.pop("func", None) + self._args.pop("output_format", None) + self._args.pop("input_file", None) self._args["profile_export_file"] = str(self._args["profile_export_file"]) self._args["artifact_dir"] = str(self._args["artifact_dir"]) for k, v in self._args.items(): diff --git a/genai-perf/genai_perf/export_data/output_reporter.py b/genai-perf/genai_perf/export_data/output_reporter.py index 0189ccfa..ec8123b9 100644 --- a/genai-perf/genai_perf/export_data/output_reporter.py +++ b/genai-perf/genai_perf/export_data/output_reporter.py @@ -29,7 +29,7 @@ from genai_perf.export_data.data_exporter_factory import DataExporterFactory from genai_perf.export_data.exporter_config import ExporterConfig -from genai_perf.llm_metrics import Statistics +from genai_perf.metrics import Statistics from genai_perf.parser import get_extra_inputs_as_dict @@ -54,6 +54,7 @@ def report_output(self) -> None: def _create_exporter_config(self) -> ExporterConfig: config = ExporterConfig() config.stats = self.stats.stats_dict + config.metrics = self.stats.metrics config.args = self.args config.artifact_dir = self.args.artifact_dir config.extra_inputs = get_extra_inputs_as_dict(self.args) diff --git a/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/genai-perf/genai_perf/llm_inputs/llm_inputs.py index 3613e564..39abc7ec 100644 --- a/genai-perf/genai_perf/llm_inputs/llm_inputs.py +++ b/genai-perf/genai_perf/llm_inputs/llm_inputs.py @@ -24,6 +24,7 @@ from genai_perf.exceptions import GenAIPerfException from genai_perf.llm_inputs.synthetic_prompt_generator import SyntheticPromptGenerator from genai_perf.tokenizer import DEFAULT_TOKENIZER, Tokenizer, get_tokenizer +from genai_perf.utils import load_json_str from requests import Response @@ -41,6 +42,8 @@ class PromptSource(Enum): class OutputFormat(Enum): OPENAI_CHAT_COMPLETIONS = auto() OPENAI_COMPLETIONS = auto() + OPENAI_EMBEDDINGS = auto() + RANKINGS = auto() TENSORRTLLM = auto() VLLM = auto() @@ -64,6 +67,7 @@ class LlmInputs: DEFAULT_TENSORRTLLM_MAX_TOKENS = 256 + DEFAULT_BATCH_SIZE = 1 DEFAULT_RANDOM_SEED = 0 DEFAULT_PROMPT_TOKENS_MEAN = 550 DEFAULT_PROMPT_TOKENS_STDDEV = 0 @@ -99,6 +103,7 @@ def create_llm_inputs( add_stream: bool = False, tokenizer: Tokenizer = get_tokenizer(DEFAULT_TOKENIZER), extra_inputs: Optional[Dict] = None, + batch_size: int = 1, output_dir: Path = Path(""), ) -> Dict: """ @@ -134,6 +139,8 @@ def create_llm_inputs( The standard deviation of the length of the output to generate. This is only used if output_tokens_mean is provided. 
output_tokens_deterministic: If true, the output tokens will set the minimum and maximum tokens to be equivalent. + batch_size: + The number of inputs per request (currently only used for the embeddings and rankings endpoints) Required Synthetic Prompt Generation Parameters ----------------------------------------------- @@ -156,36 +163,21 @@ def create_llm_inputs( input_type, dataset_name, starting_index, length, tokenizer ) - if input_type == PromptSource.DATASET: - dataset = cls._get_input_dataset_from_url( - dataset_name, starting_index, length - ) - generic_dataset_json = cls._convert_input_url_dataset_to_generic_json( - dataset - ) - elif input_type == PromptSource.SYNTHETIC: - random.seed(random_seed) - synthetic_dataset = cls._get_input_dataset_from_synthetic( - tokenizer, - prompt_tokens_mean, - prompt_tokens_stddev, - num_of_output_prompts, - ) - generic_dataset_json = ( - cls._convert_input_synthetic_or_file_dataset_to_generic_json( - synthetic_dataset - ) - ) - elif input_type == PromptSource.FILE: - input_filename = cast(Path, input_filename) - input_file_dataset = cls._get_input_dataset_from_file(input_filename) - generic_dataset_json = ( - cls._convert_input_synthetic_or_file_dataset_to_generic_json( - input_file_dataset - ) - ) - else: - raise GenAIPerfException("Input source is not recognized.") + random.seed(random_seed) + + generic_dataset_json = cls.get_generic_dataset_json( + input_type, + output_format, + dataset_name, + starting_index, + length, + tokenizer, + prompt_tokens_mean, + prompt_tokens_stddev, + num_of_output_prompts, + batch_size, + input_filename, + ) if extra_inputs is None: extra_inputs = {} @@ -206,6 +198,178 @@ def create_llm_inputs( return json_in_pa_format + @classmethod + def get_generic_dataset_json( + cls, + input_type: PromptSource, + output_format: OutputFormat, + dataset_name: str, + starting_index: int, + length: int, + tokenizer: Tokenizer, + prompt_tokens_mean: int, + prompt_tokens_stddev: int, + num_of_output_prompts: int, + batch_size: int, + input_filename: Optional[Path], + ) -> Dict: + """ + Retrieve and convert the dataset based on the input type. + + Parameters + ---------- + input_type: + Specify how the input is received + output_format: + Specify the output format + dataset_name: + The name of the dataset + starting_index: + Offset from within the list to start gathering inputs + length: + Number of entries to gather + tokenizer: + The tokenizer to use when generating synthetic prompts + prompt_tokens_mean: + The mean length of the prompt to generate + prompt_tokens_stddev: + The standard deviation of the length of the prompt to generate + num_of_output_prompts: + The number of synthetic output prompts to generate + batch_size: + The number of inputs per request (currently only used for the embeddings and rankings endpoints) + input_filename: + The path to the input file containing the prompts in JSONL format. + Returns + ------- + Dict: + The generic dataset JSON + """ + + if output_format == OutputFormat.OPENAI_EMBEDDINGS: + if input_type != PromptSource.FILE: + raise GenAIPerfException( + f"{OutputFormat.OPENAI_EMBEDDINGS.to_lowercase()} only supports a file as input." 
+ ) + input_filename = cast(Path, input_filename) + input_file_dataset = cls._get_input_dataset_from_embeddings_file( + input_filename, + batch_size, + num_of_output_prompts, + ) + generic_dataset_json = ( + cls._convert_input_synthetic_or_file_dataset_to_generic_json( + input_file_dataset + ) + ) + elif output_format == OutputFormat.RANKINGS: + if input_type != PromptSource.FILE: + raise GenAIPerfException( + f"{OutputFormat.RANKINGS.to_lowercase()} only supports a directory as input." + ) + queries_filename = cast(Path, input_filename) / "queries.jsonl" + passages_filename = cast(Path, input_filename) / "passages.jsonl" + input_file_dataset = cls._get_input_dataset_from_rankings_files( + queries_filename, passages_filename, batch_size, num_of_output_prompts + ) + + generic_dataset_json = ( + cls._convert_input_synthetic_or_file_dataset_to_generic_json( + input_file_dataset + ) + ) + else: + if input_type == PromptSource.DATASET: + dataset = cls._get_input_dataset_from_url( + dataset_name, starting_index, length + ) + generic_dataset_json = cls._convert_input_url_dataset_to_generic_json( + dataset + ) + elif input_type == PromptSource.SYNTHETIC: + synthetic_dataset = cls._get_input_dataset_from_synthetic( + tokenizer, + prompt_tokens_mean, + prompt_tokens_stddev, + num_of_output_prompts, + ) + generic_dataset_json = ( + cls._convert_input_synthetic_or_file_dataset_to_generic_json( + synthetic_dataset + ) + ) + elif input_type == PromptSource.FILE: + input_filename = cast(Path, input_filename) + input_file_dataset = cls._get_input_dataset_from_file(input_filename) + generic_dataset_json = ( + cls._convert_input_synthetic_or_file_dataset_to_generic_json( + input_file_dataset + ) + ) + else: + raise GenAIPerfException("Input source is not recognized.") + + return generic_dataset_json + + @classmethod + def _get_input_dataset_from_embeddings_file( + cls, input_filename: Path, batch_size: int, num_prompts: int + ) -> Dict[str, Any]: + with open(input_filename, "r") as file: + file_content = [load_json_str(line) for line in file] + + texts = [item["text"] for item in file_content] + + if batch_size > len(texts): + raise ValueError( + "Batch size cannot be larger than the number of available texts" + ) + + dataset_json: Dict[str, Any] = {} + dataset_json["features"] = [{"name": "input"}] + dataset_json["rows"] = [] + + for _ in range(num_prompts): + sampled_texts = random.sample(texts, batch_size) + dataset_json["rows"].append({"row": {"payload": {"input": sampled_texts}}}) + + return dataset_json + + @classmethod + def _get_input_dataset_from_rankings_files( + cls, + queries_filename: Path, + passages_filename: Path, + batch_size: int, + num_prompts: int, + ) -> Dict[str, Any]: + + with open(queries_filename, "r") as file: + queries_content = [load_json_str(line) for line in file] + queries_texts = [item for item in queries_content] + + with open(passages_filename, "r") as file: + passages_content = [load_json_str(line) for line in file] + passages_texts = [item for item in passages_content] + + if batch_size > len(passages_texts): + raise ValueError( + "Batch size cannot be larger than the number of available passages" + ) + + dataset_json: Dict[str, Any] = {} + dataset_json["features"] = [{"name": "input"}] + dataset_json["rows"] = [] + + for _ in range(num_prompts): + sampled_texts = random.sample(passages_texts, batch_size) + query_sample = random.choice(queries_texts) + entry_dict: Dict = {} + entry_dict["query"] = query_sample + entry_dict["passages"] = sampled_texts + 
dataset_json["rows"].append({"row": {"payload": entry_dict}}) + return dataset_json + @classmethod def _check_for_valid_args( cls, @@ -373,7 +537,7 @@ def _get_prompts_from_input_file(cls, input_filename: Path) -> List[str]: with open(input_filename, mode="r", newline=None) as file: for line in file: if line.strip(): - prompts.append(json.loads(line).get("text_input", "").strip()) + prompts.append(load_json_str(line).get("text_input", "").strip()) return prompts @classmethod @@ -419,6 +583,20 @@ def _convert_generic_json_to_output_format( model_name, model_selection_strategy, ) + elif output_format == OutputFormat.OPENAI_EMBEDDINGS: + output_json = cls._convert_generic_json_to_openai_embeddings_format( + generic_dataset, + extra_inputs, + model_name, + model_selection_strategy, + ) + elif output_format == OutputFormat.RANKINGS: + output_json = cls._convert_generic_json_to_rankings_format( + generic_dataset, + extra_inputs, + model_name, + model_selection_strategy, + ) elif output_format == OutputFormat.VLLM: output_json = cls._convert_generic_json_to_vllm_format( generic_dataset, @@ -520,6 +698,104 @@ def _convert_generic_json_to_openai_completions_format( return pa_json + @classmethod + def _convert_generic_json_to_openai_embeddings_format( + cls, + generic_dataset: Dict, + extra_inputs: Dict, + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, + ) -> Dict[str, Any]: + pa_json: Dict[str, Any] = {"data": []} + + for index, entry in enumerate(generic_dataset["rows"]): + iter_model_name = cls._select_model_name( + model_name, index, model_selection_strategy + ) + payload = entry.get("payload", {}) + input_values = payload.get("input") + + if input_values is None: + raise ValueError("Missing required fields 'input' in dataset entry") + if not isinstance(input_values, list): + raise ValueError( + f"Required field 'input' must be a list (actual: {type(input_values)})" + ) + + payload = { + "input": input_values, + "model": iter_model_name, + } + + for key, value in extra_inputs.items(): + payload[key] = value + + pa_json["data"].append({"payload": [payload]}) + + return pa_json + + @classmethod + def contains_rankings_tei(cls, extra_inputs: Optional[Dict]) -> bool: + """ + Check if user specified that they are using the Hugging Face + Text Embeddings Interface for ranking models + """ + if extra_inputs and extra_inputs.get("rankings") == "tei": + return True + return False + + @classmethod + def _convert_generic_json_to_rankings_format( + cls, + generic_dataset: Dict, + extra_inputs: Dict, + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, + ) -> Dict[str, Any]: + pa_json: Dict[str, Any] = {"data": []} + use_tei_format = cls.contains_rankings_tei(extra_inputs) + + for index, entry in enumerate(generic_dataset["rows"]): + iter_model_name = cls._select_model_name( + model_name, index, model_selection_strategy + ) + payload = entry.get("payload", {}) + query_values = payload.get("query") + + if use_tei_format: + passage_values = payload.get("passages", []) + passage_values = [item.get("text", "") for item in passage_values] + else: + passage_values = payload.get("passages") + + if query_values is None: + raise ValueError("Missing required fields 'query' in dataset entry") + if passage_values is None: + raise ValueError( + f"Missing required fields '{'texts' if use_tei_format else 'passages'}' in dataset entry" + ) + if not isinstance(passage_values, list): + raise 
ValueError( + f"Required field '{'texts' if use_tei_format else 'passages'}' must be a list (actual: {type(passage_values)})" + ) + + if use_tei_format: + payload = {"query": query_values["text"], "texts": passage_values} + else: + payload = { + "query": query_values, + "passages": passage_values, + "model": iter_model_name, + } + + for key, value in extra_inputs.items(): + if not (key == "rankings" and value == "tei"): + payload[key] = value + + pa_json["data"].append({"payload": [payload]}) + + return pa_json + @classmethod def _convert_generic_json_to_vllm_format( cls, diff --git a/genai-perf/genai_perf/llm_metrics.py b/genai-perf/genai_perf/llm_metrics.py deleted file mode 100755 index 05c1ce59..00000000 --- a/genai-perf/genai_perf/llm_metrics.py +++ /dev/null @@ -1,619 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import csv -import json -from collections import defaultdict -from enum import Enum, auto -from itertools import tee -from pathlib import Path -from typing import Dict, List, Tuple, Union - -import numpy as np -import pandas as pd -from genai_perf.tokenizer import Tokenizer -from genai_perf.utils import load_json, remove_sse_prefix -from rich.console import Console -from rich.table import Table - - -class ResponseFormat(Enum): - OPENAI_CHAT_COMPLETIONS = auto() - OPENAI_COMPLETIONS = auto() - TRITON = auto() - - -class Metrics: - """A base class for all the metrics class that contains common metrics.""" - - metric_labels = [ - "time_to_first_token", - "inter_token_latency", - "request_latency", - "output_token_throughput", - "output_token_throughput_per_request", - "request_throughput", - "output_sequence_length", - "input_sequence_length", - ] - - time_fields = [ - "inter_token_latency", - "time_to_first_token", - "request_latency", - ] - - # TODO (TMA-1678): output_token_throughput_per_request is not on this list - # since the current code treats all the throughput metrics to be displayed - # outside of the statistics table. 
- throughput_fields = [ - "request_throughput", - "output_token_throughput", - ] - - def __init__( - self, - request_throughputs: List[float] = [], - request_latencies: List[int] = [], - ) -> None: - self.request_throughputs = request_throughputs - self.request_latencies = request_latencies - self._base_names = { - "request_throughputs": "request_throughput", - "request_latencies": "request_latency", - } - - def __repr__(self): - attr_strs = [] - for k, v in self.__dict__.items(): - if not k.startswith("_"): - attr_strs.append(f"{k}={v}") - return f"Metrics({','.join(attr_strs)})" - - @property - def data(self) -> dict: - """Returns all the metrics.""" - return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} - - def get_base_name(self, metric_name: str) -> str: - """Returns singular name of a given metric.""" - if metric_name in self._base_names: - return self._base_names[metric_name] - else: - raise KeyError(f"No metric named '{metric_name}' exists.") - - -class LLMMetrics(Metrics): - """A simple dataclass that holds core LLM performance metrics.""" - - def __init__( - self, - request_throughputs: List[float] = [], - request_latencies: List[int] = [], - time_to_first_tokens: List[int] = [], - inter_token_latencies: List[int] = [], - output_token_throughputs: List[float] = [], - output_token_throughputs_per_request: List[int] = [], - output_sequence_lengths: List[int] = [], - input_sequence_lengths: List[int] = [], - chunked_inter_token_latencies: List[List[int]] = [[]], - ) -> None: - super().__init__(request_throughputs, request_latencies) - self.time_to_first_tokens = time_to_first_tokens - self.inter_token_latencies = inter_token_latencies - self.output_token_throughputs = output_token_throughputs - self.output_token_throughputs_per_request = output_token_throughputs_per_request - self.output_sequence_lengths = output_sequence_lengths - self.input_sequence_lengths = input_sequence_lengths - - # Keeping chunked ITL (old) as a WAR to preserve visualization. - # Excluded from data. - self._chunked_inter_token_latencies = chunked_inter_token_latencies - - # add base name mapping - self._base_names["time_to_first_tokens"] = "time_to_first_token" - self._base_names["inter_token_latencies"] = "inter_token_latency" - self._base_names["output_token_throughputs"] = "output_token_throughput" - self._base_names["output_token_throughputs_per_request"] = ( - "output_token_throughput_per_request" - ) - self._base_names["output_sequence_lengths"] = "output_sequence_length" - self._base_names["input_sequence_lengths"] = "input_sequence_length" - - -class Statistics: - """A class that aggregates various statistics from given metrics class. - - The Statistics class goes through each metric in the metrics class and - calculates several statistics such as: - - average (arithmetic mean) - - percentiles (p25, p50, p75, p90, p95, p99) - - minimum & maximum - - standard deviation - The class will store each calculated statistics as part of its attribute. 
- - Example: - - >>> metrics = LLMMetrics(request_throughputs=[2, 4]) - >>> stats = Statistics(metrics) - >>> print(stats.avg_request_throughput) # output: 3 - """ - - def __init__(self, metrics: Metrics): - # iterate through Metrics to calculate statistics and set attributes - self._metrics = metrics - self._stats_dict: Dict = defaultdict(dict) - for attr, data in metrics.data.items(): - if self._should_skip(data, attr): - continue - - attr = metrics.get_base_name(attr) - self._add_units(attr) - self._calculate_mean(data, attr) - if not self._is_throughput_field(attr): - self._calculate_percentiles(data, attr) - self._calculate_minmax(data, attr) - self._calculate_std(data, attr) - - def _should_skip(self, data: List[Union[int, float]], attr: str) -> bool: - """Checks if some metrics should be skipped.""" - # No data points - if len(data) == 0: - return True - # Skip ITL when non-streaming (all zero) - elif attr == "inter_token_latencies" and sum(data) == 0: - return True - return False - - def _calculate_mean(self, data: List[Union[int, float]], attr: str) -> None: - avg = np.mean(data) - setattr(self, "avg_" + attr, avg) - self._stats_dict[attr]["avg"] = float(avg) - - def _calculate_percentiles(self, data: List[Union[int, float]], attr: str) -> None: - p25, p50, p75 = np.percentile(data, [25, 50, 75]) - p90, p95, p99 = np.percentile(data, [90, 95, 99]) - setattr(self, "p25_" + attr, p25) - setattr(self, "p50_" + attr, p50) - setattr(self, "p75_" + attr, p75) - setattr(self, "p90_" + attr, p90) - setattr(self, "p95_" + attr, p95) - setattr(self, "p99_" + attr, p99) - self._stats_dict[attr]["p99"] = float(p99) - self._stats_dict[attr]["p95"] = float(p95) - self._stats_dict[attr]["p90"] = float(p90) - self._stats_dict[attr]["p75"] = float(p75) - self._stats_dict[attr]["p50"] = float(p50) - self._stats_dict[attr]["p25"] = float(p25) - - def _calculate_minmax(self, data: List[Union[int, float]], attr: str) -> None: - min, max = np.min(data), np.max(data) - setattr(self, "min_" + attr, min) - setattr(self, "max_" + attr, max) - self._stats_dict[attr]["max"] = float(max) - self._stats_dict[attr]["min"] = float(min) - - def _calculate_std(self, data: List[Union[int, float]], attr: str) -> None: - std = np.std(data) - setattr(self, "std_" + attr, std) - self._stats_dict[attr]["std"] = float(std) - - def scale_data(self, factor: float = 1 / 1e6) -> None: - for k1, v1 in self.stats_dict.items(): - if self._is_time_field(k1): - for k2, v2 in v1.items(): - if k2 != "unit": - self.stats_dict[k1][k2] = self._scale(v2, factor) - - def _scale(self, metric: float, factor: float = 1 / 1e6) -> float: - """ - Scale metrics from nanoseconds by factor. - Default is nanoseconds to milliseconds. 
- """ - return metric * factor - - def _add_units(self, key) -> None: - if self._is_time_field(key): - self._stats_dict[key]["unit"] = "ms" - if key == "request_throughput": - self._stats_dict[key]["unit"] = "requests/sec" - if key.startswith("output_token_throughput"): - self._stats_dict[key]["unit"] = "tokens/sec" - if "sequence_length" in key: - self._stats_dict[key]["unit"] = "tokens" - - def __repr__(self) -> str: - attr_strs = [] - for k, v in self.__dict__.items(): - if not k.startswith("_"): - attr_strs.append(f"{k}={v}") - return f"Statistics({','.join(attr_strs)})" - - @property - def data(self) -> dict: - """Return all the aggregated statistics.""" - return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} - - @property - def metrics(self) -> Metrics: - """Return the underlying metrics used to calculate the statistics.""" - return self._metrics - - @property - def stats_dict(self) -> Dict: - return self._stats_dict - - def _is_throughput_field(self, field: str) -> bool: - return field in Metrics.throughput_fields - - def _is_time_field(self, field: str) -> bool: - return field in Metrics.time_fields - - def export_parquet(self, artifact_dir: Path, filename: str) -> None: - max_length = -1 - col_index = 0 - filler_list = [] - df = pd.DataFrame() - - # Data frames require all columns of the same length - # find the max length column - for key, value in self._metrics.data.items(): - max_length = max(max_length, len(value)) - - # Insert None for shorter columns to match longest column - for key, value in self._metrics.data.items(): - if len(value) < max_length: - diff = max_length - len(value) - filler_list = [None] * diff - df.insert(col_index, key, value + filler_list) - diff = 0 - filler_list = [] - col_index = col_index + 1 - - filepath = artifact_dir / f"{filename}.gzip" - df.to_parquet(filepath, compression="gzip") - - -class ProfileDataParser: - """Base profile data parser class that reads the profile data JSON file to - extract core metrics and calculate various performance statistics. - """ - - def __init__(self, filename: Path) -> None: - data = load_json(filename) - self._get_profile_metadata(data) - self._parse_profile_data(data) - - def _get_profile_metadata(self, data: dict) -> None: - self._service_kind = data["service_kind"] - if self._service_kind == "openai": - if data["endpoint"] == "v1/chat/completions": - self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS - elif data["endpoint"] == "v1/completions": - self._response_format = ResponseFormat.OPENAI_COMPLETIONS - else: - # TPA-66: add PA metadata to handle this case - # When endpoint field is either empty or custom endpoint, fall - # back to parsing the response to extract the response format. 
- request = data["experiments"][0]["requests"][0] - response = request["response_outputs"][0]["response"] - if "chat.completion" in response: - self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS - elif "text_completion" in response: - self._response_format = ResponseFormat.OPENAI_COMPLETIONS - else: - raise RuntimeError("Unknown OpenAI response format.") - - elif self._service_kind == "triton": - self._response_format = ResponseFormat.TRITON - else: - raise ValueError(f"Unknown service kind: {self._service_kind}") - - def _parse_profile_data(self, data: dict) -> None: - """Parse through the entire profile data to collect statistics.""" - self._profile_results = {} - for experiment in data["experiments"]: - infer_mode = experiment["experiment"]["mode"] - load_level = experiment["experiment"]["value"] - requests = experiment["requests"] - - metrics = self._parse_requests(requests) - - # aggregate and calculate statistics - statistics = Statistics(metrics) - self._profile_results[(infer_mode, str(load_level))] = statistics - - def _parse_requests(self, requests: dict) -> LLMMetrics: - """Parse each request in profile data to extract core metrics.""" - raise NotImplementedError - - def get_statistics(self, infer_mode: str, load_level: str) -> Statistics: - """Return profile statistics if it exists.""" - if (infer_mode, load_level) not in self._profile_results: - raise KeyError(f"Profile with {infer_mode}={load_level} does not exist.") - return self._profile_results[(infer_mode, load_level)] - - def get_profile_load_info(self) -> List[Tuple[str, str]]: - """Return available (infer_mode, load_level) tuple keys.""" - return [k for k, _ in self._profile_results.items()] - - -class LLMProfileDataParser(ProfileDataParser): - """A class that calculates and aggregates all the LLM performance statistics - across the Perf Analyzer profile results. - - The LLMProfileDataParser class parses profile export JSON file, collects the - core LLM performance metrics, and calculates summary statistics for each - different Perf Analyzer runs/experiments. - - Example: - - >>> ... # run Perf Analyzer with concurrency level 10 - >>> - >>> from transformers import AutoTokenizer - >>> - >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> pd = LLMProfileDataParser( - >>> filename="profile_export.json", - >>> tokenizer=tokenizer, - >>> ) - >>> stats = pd.get_statistics(infer_mode="concurrency", level=10) - >>> - >>> print(stats) # output: Statistics(avg_time_to_first_token=...) - >>> stats.pretty_print() # Output: time_to_first_token_s: ... - """ - - def __init__( - self, - filename: Path, - tokenizer: Tokenizer, - ) -> None: - self._tokenizer = tokenizer - super().__init__(filename) - - def _parse_requests(self, requests: dict) -> LLMMetrics: - """Parse each requests in profile export data to extract key metrics.""" - min_req_timestamp, max_res_timestamp = float("inf"), 0 - request_latencies = [] - time_to_first_tokens = [] - inter_token_latencies = [] - output_token_throughputs_per_request = [] - input_sequence_lengths = [] - output_sequence_lengths = [] - chunked_inter_token_latencies = [] - - for request in requests: - req_timestamp = request["timestamp"] - req_inputs = request["request_inputs"] - res_timestamps = request["response_timestamps"] - res_outputs = request["response_outputs"] - - self._preprocess_response(res_timestamps, res_outputs) - - # Skip requests with empty response. This happens sometimes when the - # model returns a single response with empty string. 
- if not res_timestamps: - continue - - # track entire benchmark duration - min_req_timestamp = min(min_req_timestamp, req_timestamp) - max_res_timestamp = max(max_res_timestamp, res_timestamps[-1]) - - # request latencies - req_latency_ns = res_timestamps[-1] - req_timestamp - request_latencies.append(req_latency_ns) # nanosec - req_latency_s = req_latency_ns / 1e9 # sec - - # time to first token - ttft = res_timestamps[0] - req_timestamp - time_to_first_tokens.append(ttft) - - # number of input tokens - input_seq_len = self._get_input_token_count(req_inputs) - input_sequence_lengths.append(input_seq_len) - - # output token throughput per request - output_token_counts, total_output_token = self._get_output_token_counts( - res_outputs - ) - output_token_throughputs_per_request.append( - total_output_token / req_latency_s - ) - output_sequence_lengths.append(total_output_token) - - # inter token latencies - if total_output_token > 1: - inter_token_latency = (req_latency_ns - ttft) / (total_output_token - 1) - inter_token_latencies.append(round(inter_token_latency)) - - # The new ITL calculation above loses all token-level ITL information - # and as a result breaks ITL vs token position visualization. Keep - # the old version of inter token latency as a WAR to preserve the - # visualization. - chunked_inter_token_latency = [] - for (t1, _), (t2, n2) in self._pairwise( - zip(res_timestamps, output_token_counts) - ): - # TMA-1676: handle empty first/last responses - # if the latter response has zero token (e.g. empty string), - # then set it default to one for the sake of inter token latency - # calculation and to avoid divide by zero. - num_token = 1 if n2 == 0 else n2 - chunked_inter_token_latency.append(round((t2 - t1) / num_token)) - chunked_inter_token_latencies.append(chunked_inter_token_latency) - - # request & output token throughput - benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9 # nanosec - request_throughputs = [len(requests) / benchmark_duration] - output_token_throughputs = [sum(output_sequence_lengths) / benchmark_duration] - - return LLMMetrics( - request_throughputs, - request_latencies, - time_to_first_tokens, - inter_token_latencies, - output_token_throughputs, - output_token_throughputs_per_request, - output_sequence_lengths, - input_sequence_lengths, - chunked_inter_token_latencies, - ) - - def _pairwise(self, iterable): - """Generate pairs of consecutive elements from the given iterable.""" - a, b = tee(iterable) - next(b, None) - return zip(a, b) - - def _preprocess_response( - self, res_timestamps: List[int], res_outputs: List[Dict[str, str]] - ) -> None: - """Helper function to preprocess responses of a request.""" - if self._service_kind == "openai": - # PA sometimes receives multiple SSE responses at once (as a single - # response). Handle these responses by merging into a single response. 
- for i in range(len(res_outputs)): - response = res_outputs[i]["response"] - responses = response.strip().split("\n\n") - if len(responses) > 1: - merged_response = json.loads(remove_sse_prefix(responses[0])) - if ( - merged_response["choices"][0]["delta"].get("content", None) - is None - ): - merged_response["choices"][0]["delta"]["content"] = "" - for r in responses[1:]: - text = self._extract_openai_text_output(r) - merged_response["choices"][0]["delta"]["content"] += text - - res_outputs[i] = {"response": json.dumps(merged_response)} - - # Remove responses without any content - indices_to_remove = [] - for idx, out in enumerate(res_outputs): - if self._is_openai_empty_response(out["response"]): - indices_to_remove.append(idx) - indices_to_remove.sort(reverse=True) - for index in indices_to_remove: - res_timestamps.pop(index) - res_outputs.pop(index) - - def _get_input_token_count(self, req_inputs: dict) -> int: - """Deserialize the request input and return tokenized inputs.""" - if self._service_kind == "triton": - input_text = req_inputs["text_input"] - elif self._service_kind == "openai": - input_text = self._get_openai_input_text(req_inputs) - else: - raise ValueError(f"Unknown service kind: '{self._service_kind}'.") - - return len(self._tokenizer.encode(input_text)) - - def _get_openai_input_text(self, req_inputs: dict) -> str: - """Tokenize the OpenAI request input texts.""" - payload = json.loads(req_inputs["payload"]) - if self._response_format == ResponseFormat.OPENAI_CHAT_COMPLETIONS: - return payload["messages"][0]["content"] - elif self._response_format == ResponseFormat.OPENAI_COMPLETIONS: - return payload["prompt"] - else: - raise ValueError( - "Failed to parse OpenAI request input in profile export file." - ) - - def _get_output_token_counts( - self, res_outputs: List[Dict] - ) -> Tuple[List[int], int]: - """Return response-level token counts and total token count.""" - if self._service_kind == "triton": - output_texts = self._get_triton_output_tokens(res_outputs) - elif self._service_kind == "openai": - output_texts = self._get_openai_output_tokens(res_outputs) - else: - raise ValueError(f"Unknown service kind: '{self._service_kind}'.") - - full_text_token_count = len(self._tokenizer.encode("".join(output_texts))) - - output_tokens = self._get_response_output_tokens(output_texts) - output_token_counts = list(map(len, output_tokens)) - return output_token_counts, full_text_token_count - - def _get_triton_output_tokens(self, res_outputs: List[Dict]) -> List[str]: - """Return a list of Triton response texts.""" - return [r["text_output"] for r in res_outputs] - - def _get_openai_output_tokens(self, res_outputs: List[Dict]) -> List[str]: - """Return a list of OpenAI response texts.""" - output_texts = [] - for output in res_outputs: - text = self._extract_openai_text_output(output["response"]) - output_texts.append(text) - return output_texts - - def _get_response_output_tokens(self, output_texts: List[str]) -> List[List[int]]: - """Return a list of response output tokens.""" - # Exclamation mark trick forces the llama tokenization to consistently - # start each output with a specific token which allows us to safely skip - # the first token of every tokenized output and get only the ones that - # are returned by the model - encodings = self._tokenizer(["!" 
+ txt for txt in output_texts]) - return [out[1:] for out in encodings.data["input_ids"]] - - def _extract_openai_text_output(self, response: str) -> str: - """Extracts text/content of the OpenAI response object.""" - response = remove_sse_prefix(response) - - if response == "[DONE]": - return "" - - data = json.loads(response) - completions = data["choices"][0] - - text_output = "" - if "object" not in data: - # FIXME: TPA-47 workaround for vLLM not following OpenAI Completions - # API specification when streaming, missing 'object' field: - # https://platform.openai.com/docs/api-reference/completions - text_output = completions.get("text", "") - elif data["object"] == "text_completion": # legacy - text_output = completions.get("text", "") - elif data["object"] == "chat.completion": # non-streaming - text_output = completions["message"].get("content", "") - elif data["object"] == "chat.completion.chunk": # streaming - text_output = completions["delta"].get("content", "") - else: - obj_type = data["object"] - raise ValueError(f"Unknown OpenAI response object type '{obj_type}'.") - return text_output - - def _is_openai_empty_response(self, response: str) -> bool: - """Returns true if the response is an openai response with no content (or empty content)""" - text = self._extract_openai_text_output(response) - if text: - return False - return True diff --git a/genai-perf/genai_perf/main.py b/genai-perf/genai_perf/main.py index 65b765d8..912ee472 100755 --- a/genai-perf/genai_perf/main.py +++ b/genai-perf/genai_perf/main.py @@ -33,26 +33,29 @@ import genai_perf.logging as logging from genai_perf import parser -from genai_perf.constants import DEFAULT_PARQUET_FILE from genai_perf.exceptions import GenAIPerfException from genai_perf.export_data.output_reporter import OutputReporter from genai_perf.llm_inputs.llm_inputs import LlmInputs -from genai_perf.llm_metrics import LLMProfileDataParser from genai_perf.plots.plot_config_parser import PlotConfigParser from genai_perf.plots.plot_manager import PlotManager +from genai_perf.profile_data_parser import LLMProfileDataParser, ProfileDataParser from genai_perf.tokenizer import Tokenizer, get_tokenizer def create_artifacts_dirs(args: Namespace) -> None: - # TMA-1911: support plots CLI option plot_dir = args.artifact_dir / "plots" os.makedirs(args.artifact_dir, exist_ok=True) - os.makedirs(plot_dir, exist_ok=True) + if hasattr(args, "generate_plots") and args.generate_plots: + os.makedirs(plot_dir, exist_ok=True) def generate_inputs(args: Namespace, tokenizer: Tokenizer) -> None: # TODO (TMA-1759): review if add_model_name is always true - input_filename = Path(args.input_file.name) if args.input_file else None + if args.input_file: + filepath, _ = args.input_file + input_filename = Path(filepath) + else: + input_filename = None add_model_name = True try: extra_input_dict = parser.get_extra_inputs_as_dict(args) @@ -79,18 +82,22 @@ def generate_inputs(args: Namespace, tokenizer: Tokenizer) -> None: add_stream=args.streaming, tokenizer=tokenizer, extra_inputs=extra_input_dict, + batch_size=args.batch_size, output_dir=args.artifact_dir, ) -def calculate_metrics(args: Namespace, tokenizer: Tokenizer) -> LLMProfileDataParser: - return LLMProfileDataParser( - filename=args.profile_export_file, - tokenizer=tokenizer, - ) +def calculate_metrics(args: Namespace, tokenizer: Tokenizer) -> ProfileDataParser: + if args.endpoint_type in ["embeddings", "rankings"]: + return ProfileDataParser(args.profile_export_file) + else: + return LLMProfileDataParser( + 
filename=args.profile_export_file, + tokenizer=tokenizer, + ) -def report_output(data_parser: LLMProfileDataParser, args: Namespace) -> None: +def report_output(data_parser: ProfileDataParser, args: Namespace) -> None: if args.concurrency: infer_mode = "concurrency" load_level = f"{args.concurrency}" diff --git a/genai-perf/genai_perf/metrics/__init__.py b/genai-perf/genai_perf/metrics/__init__.py new file mode 100644 index 00000000..01ca53c5 --- /dev/null +++ b/genai-perf/genai_perf/metrics/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from genai_perf.metrics.llm_metrics import LLMMetrics +from genai_perf.metrics.metrics import MetricMetadata, Metrics +from genai_perf.metrics.statistics import Statistics diff --git a/genai-perf/genai_perf/metrics/llm_metrics.py b/genai-perf/genai_perf/metrics/llm_metrics.py new file mode 100755 index 00000000..13dff8a6 --- /dev/null +++ b/genai-perf/genai_perf/metrics/llm_metrics.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from typing import List + +from genai_perf.metrics.metrics import MetricMetadata, Metrics + + +class LLMMetrics(Metrics): + """A simple dataclass that holds core LLM performance metrics.""" + + LLM_REQUEST_METRICS = [ + MetricMetadata("time_to_first_token", "ms"), + MetricMetadata("inter_token_latency", "ms"), + MetricMetadata("output_token_throughput_per_request", "tokens/sec"), + MetricMetadata("output_sequence_length", "tokens"), + MetricMetadata("input_sequence_length", "tokens"), + ] + + LLM_SYSTEM_METRICS = [ + # (TMA-1977) Make the unit consistent with statistics dict (e.g. tokens/sec) + MetricMetadata("output_token_throughput", "per sec"), + ] + + def __init__( + self, + request_throughputs: List[float] = [], + request_latencies: List[int] = [], + time_to_first_tokens: List[int] = [], + inter_token_latencies: List[int] = [], + output_token_throughputs: List[float] = [], + output_token_throughputs_per_request: List[int] = [], + output_sequence_lengths: List[int] = [], + input_sequence_lengths: List[int] = [], + chunked_inter_token_latencies: List[List[int]] = [[]], + ) -> None: + super().__init__(request_throughputs, request_latencies) + self.time_to_first_tokens = time_to_first_tokens + self.inter_token_latencies = inter_token_latencies + self.output_token_throughputs = output_token_throughputs + self.output_token_throughputs_per_request = output_token_throughputs_per_request + self.output_sequence_lengths = output_sequence_lengths + self.input_sequence_lengths = input_sequence_lengths + + # Keeping chunked ITL (old) as a WAR to preserve visualization. + # Excluded from data. + self._chunked_inter_token_latencies = chunked_inter_token_latencies + + # add base name mapping + self._base_names["time_to_first_tokens"] = "time_to_first_token" + self._base_names["inter_token_latencies"] = "inter_token_latency" + self._base_names["output_token_throughputs"] = "output_token_throughput" + self._base_names["output_token_throughputs_per_request"] = ( + "output_token_throughput_per_request" + ) + self._base_names["output_sequence_lengths"] = "output_sequence_length" + self._base_names["input_sequence_lengths"] = "input_sequence_length" + + @property + def request_metrics(self) -> List[MetricMetadata]: + base_metrics = super().request_metrics # base metrics + + # (TMA-1975) The order is hardcoded as below to avoid introducing any + # breaking changes to the users who might be parsing the outputs. However, + # we would eventually want to impose some consistent order such as a + # base metrics first and then task specific metrics. 
Uncomment the below + # line to enable this order: + # return base_metrics + self.LLM_REQUEST_METRICS + return ( + self.LLM_REQUEST_METRICS[:2] + base_metrics + self.LLM_REQUEST_METRICS[2:] + ) + + @property + def system_metrics(self) -> List[MetricMetadata]: + base_metrics = super().system_metrics # base metrics + + # (TMA-1975) The order is hardcoded as below to avoid introducing any + # breaking changes to the users who might be parsing the outputs. However, + # we would eventually want to impose some consistent order such as a + # base metrics first and then task specific metrics. Uncomment the below + # line to enable this order: + # return base_metrics + self.LLM_SYSTEM_METRICS + return self.LLM_SYSTEM_METRICS + base_metrics diff --git a/genai-perf/genai_perf/metrics/metrics.py b/genai-perf/genai_perf/metrics/metrics.py new file mode 100755 index 00000000..7e047094 --- /dev/null +++ b/genai-perf/genai_perf/metrics/metrics.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from dataclasses import dataclass +from typing import List + + +@dataclass +class MetricMetadata: + name: str + unit: str + + +class Metrics: + """A base class that contains common request level metrics.""" + + REQUEST_METRICS = [ + MetricMetadata("request_latency", "ms"), + ] + + SYSTEM_METRICS = [ + # (TMA-1977) Make the unit consistent with statistics dict (e.g. 
tokens/sec) + MetricMetadata("request_throughput", "per sec"), + ] + + def __init__( + self, + request_throughputs: List[float] = [], + request_latencies: List[int] = [], + ) -> None: + self.request_throughputs = request_throughputs + self.request_latencies = request_latencies + self._base_names = { + "request_throughputs": "request_throughput", + "request_latencies": "request_latency", + } + + def __repr__(self): + attr_strs = [] + for k, v in self.__dict__.items(): + if not k.startswith("_"): + attr_strs.append(f"{k}={v}") + return f"Metrics({','.join(attr_strs)})" + + @property + def request_metrics(self) -> List[MetricMetadata]: + return self.REQUEST_METRICS + + @property + def system_metrics(self) -> List[MetricMetadata]: + return self.SYSTEM_METRICS + + @property + def data(self) -> dict: + """Returns all the metrics.""" + return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} + + def get_base_name(self, metric_name: str) -> str: + """Returns singular name of a given metric.""" + if metric_name in self._base_names: + return self._base_names[metric_name] + else: + raise KeyError(f"No metric named '{metric_name}' exists.") diff --git a/genai-perf/genai_perf/metrics/statistics.py b/genai-perf/genai_perf/metrics/statistics.py new file mode 100755 index 00000000..f0d12cef --- /dev/null +++ b/genai-perf/genai_perf/metrics/statistics.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Union + +import numpy as np +import pandas as pd +from genai_perf.metrics.metrics import Metrics + + +class Statistics: + """A class that aggregates various statistics from given metrics class. 
+ + The Statistics class goes through each metric in the metrics class and + calculates several statistics such as: + - average (arithmetic mean) + - percentiles (p25, p50, p75, p90, p95, p99) + - minimum & maximum + - standard deviation + The class will store each calculated statistics as part of its attribute. + + Example: + + >>> metrics = LLMMetrics(request_throughputs=[2, 4]) + >>> stats = Statistics(metrics) + >>> print(stats.avg_request_throughput) # output: 3 + """ + + def __init__(self, metrics: Metrics): + # iterate through Metrics to calculate statistics and set attributes + self._metrics = metrics + self._stats_dict: Dict = defaultdict(dict) + for attr, data in metrics.data.items(): + if self._should_skip(data, attr): + continue + + attr = metrics.get_base_name(attr) + self._add_units(attr) + self._calculate_mean(data, attr) + if not self._is_system_metric(metrics, attr): + self._calculate_percentiles(data, attr) + self._calculate_minmax(data, attr) + self._calculate_std(data, attr) + + def _should_skip(self, data: List[Union[int, float]], attr: str) -> bool: + """Checks if some metrics should be skipped.""" + # No data points + if len(data) == 0: + return True + # Skip ITL when non-streaming (all zero) + elif attr == "inter_token_latencies" and sum(data) == 0: + return True + return False + + def _calculate_mean(self, data: List[Union[int, float]], attr: str) -> None: + avg = np.mean(data) + setattr(self, "avg_" + attr, avg) + self._stats_dict[attr]["avg"] = float(avg) + + def _calculate_percentiles(self, data: List[Union[int, float]], attr: str) -> None: + p25, p50, p75 = np.percentile(data, [25, 50, 75]) + p90, p95, p99 = np.percentile(data, [90, 95, 99]) + setattr(self, "p25_" + attr, p25) + setattr(self, "p50_" + attr, p50) + setattr(self, "p75_" + attr, p75) + setattr(self, "p90_" + attr, p90) + setattr(self, "p95_" + attr, p95) + setattr(self, "p99_" + attr, p99) + self._stats_dict[attr]["p99"] = float(p99) + self._stats_dict[attr]["p95"] = float(p95) + self._stats_dict[attr]["p90"] = float(p90) + self._stats_dict[attr]["p75"] = float(p75) + self._stats_dict[attr]["p50"] = float(p50) + self._stats_dict[attr]["p25"] = float(p25) + + def _calculate_minmax(self, data: List[Union[int, float]], attr: str) -> None: + min, max = np.min(data), np.max(data) + setattr(self, "min_" + attr, min) + setattr(self, "max_" + attr, max) + self._stats_dict[attr]["max"] = float(max) + self._stats_dict[attr]["min"] = float(min) + + def _calculate_std(self, data: List[Union[int, float]], attr: str) -> None: + std = np.std(data) + setattr(self, "std_" + attr, std) + self._stats_dict[attr]["std"] = float(std) + + def scale_data(self, factor: float = 1 / 1e6) -> None: + for k1, v1 in self.stats_dict.items(): + if self._is_time_metric(k1): + for k2, v2 in v1.items(): + if k2 != "unit": + self.stats_dict[k1][k2] = self._scale(v2, factor) + + def _scale(self, metric: float, factor: float = 1 / 1e6) -> float: + """ + Scale metrics from nanoseconds by factor. + Default is nanoseconds to milliseconds. 
+ """ + return metric * factor + + def _add_units(self, key) -> None: + if self._is_time_metric(key): + self._stats_dict[key]["unit"] = "ms" + elif key == "request_throughput": + self._stats_dict[key]["unit"] = "requests/sec" + elif key.startswith("output_token_throughput"): + self._stats_dict[key]["unit"] = "tokens/sec" + elif "sequence_length" in key: + self._stats_dict[key]["unit"] = "tokens" + else: + self._stats_dict[key]["unit"] = "" + + def __repr__(self) -> str: + attr_strs = [] + for k, v in self.__dict__.items(): + if not k.startswith("_"): + attr_strs.append(f"{k}={v}") + return f"Statistics({','.join(attr_strs)})" + + @property + def data(self) -> dict: + """Return all the aggregated statistics.""" + return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} + + @property + def metrics(self) -> Metrics: + """Return the underlying metrics used to calculate the statistics.""" + return self._metrics + + @property + def stats_dict(self) -> Dict: + return self._stats_dict + + def _is_system_metric(self, metrics: Metrics, attr: str) -> bool: + return attr in [m.name for m in metrics.system_metrics] + + def _is_time_metric(self, field: str) -> bool: + # TPA-188: Remove the hardcoded time metrics list + time_metrics = [ + "inter_token_latency", + "time_to_first_token", + "request_latency", + ] + return field in time_metrics + + def export_parquet(self, artifact_dir: Path, filename: str) -> None: + max_length = -1 + col_index = 0 + filler_list = [] + df = pd.DataFrame() + + # Data frames require all columns of the same length + # find the max length column + for key, value in self._metrics.data.items(): + max_length = max(max_length, len(value)) + + # Insert None for shorter columns to match longest column + for key, value in self._metrics.data.items(): + if len(value) < max_length: + diff = max_length - len(value) + filler_list = [None] * diff + df.insert(col_index, key, value + filler_list) + diff = 0 + filler_list = [] + col_index = col_index + 1 + + filepath = artifact_dir / f"{filename}.gzip" + df.to_parquet(filepath, compression="gzip") diff --git a/genai-perf/genai_perf/parser.py b/genai-perf/genai_perf/parser.py index 24f98b42..901cf6ca 100644 --- a/genai-perf/genai_perf/parser.py +++ b/genai-perf/genai_perf/parser.py @@ -28,7 +28,9 @@ import json import os import sys +from enum import Enum, auto from pathlib import Path +from typing import Tuple import genai_perf.logging as logging import genai_perf.utils as utils @@ -50,9 +52,31 @@ from . import __version__ + +class PathType(Enum): + FILE = auto() + DIRECTORY = auto() + + def to_lowercase(self): + return self.name.lower() + + +class Subcommand(Enum): + PROFILE = auto() + COMPARE = auto() + + def to_lowercase(self): + return self.name.lower() + + logger = logging.getLogger(__name__) -_endpoint_type_map = {"chat": "v1/chat/completions", "completions": "v1/completions"} +_endpoint_type_map = { + "chat": "v1/chat/completions", + "completions": "v1/completions", + "embeddings": "v1/embeddings", + "rankings": "v1/ranking", +} def _check_model_args( @@ -61,7 +85,7 @@ def _check_model_args( """ Check if model name is provided. 
""" - if not args.subcommand and not args.model: + if not args.model: parser.error("The -m/--model option is required and cannot be empty.") args = _convert_str_to_enum_entry( args, "model_selection_strategy", ModelSelectionStrategy @@ -86,9 +110,8 @@ def _check_compare_args( """ Check compare subcommand args """ - if args.subcommand == "compare": - if not args.config and not args.files: - parser.error("Either the --config or --files option must be specified.") + if not args.config and not args.files: + parser.error("Either the --config or --files option must be specified.") return args @@ -110,6 +133,10 @@ def _check_conditional_args( args.output_format = OutputFormat.OPENAI_CHAT_COMPLETIONS elif args.endpoint_type == "completions": args.output_format = OutputFormat.OPENAI_COMPLETIONS + elif args.endpoint_type == "embeddings": + args.output_format = OutputFormat.OPENAI_EMBEDDINGS + elif args.endpoint_type == "rankings": + args.output_format = OutputFormat.RANKINGS if args.endpoint is not None: args.endpoint = args.endpoint.lstrip(" /") @@ -141,9 +168,48 @@ def _check_conditional_args( "The --output-tokens-mean-deterministic option is only supported with the Triton service-kind." ) + _check_conditional_args_embeddings_rankings(parser, args) + return args +def _check_conditional_args_embeddings_rankings( + parser: argparse.ArgumentParser, args: argparse.Namespace +): + + if args.output_format in [ + OutputFormat.OPENAI_EMBEDDINGS, + OutputFormat.RANKINGS, + ]: + if args.streaming: + parser.error( + f"The --streaming option is not supported with the {args.endpoint_type} endpoint type." + ) + + if args.generate_plots: + parser.error( + f"The --generate-plots option is not currently supported with the {args.endpoint_type} endpoint type." + ) + else: + if args.batch_size != LlmInputs.DEFAULT_BATCH_SIZE: + parser.error( + "The --batch-size option is currently only supported with the embeddings and rankings endpoint types." + ) + + if args.input_file: + _, path_type = args.input_file + if args.output_format != OutputFormat.RANKINGS: + if path_type == "directory": + parser.error( + "A directory is only currently supported for the rankings endpoint type." + ) + else: + if path_type == PathType.FILE: + parser.error( + "The rankings endpoint-type requires a directory value for the --input-file flag." 
+ ) + + def _check_load_manager_args(args: argparse.Namespace) -> argparse.Namespace: """ Check inference load args @@ -201,7 +267,12 @@ def _infer_prompt_source(args: argparse.Namespace) -> argparse.Namespace: logger.debug(f"Input source is the following dataset: {args.input_dataset}") elif args.input_file: args.prompt_source = PromptSource.FILE - logger.debug(f"Input source is the following file: {args.input_file.name}") + if args.endpoint_type == "rankings": + logger.debug( + f"Input source is the following directory: {args.input_file[0]}" + ) + else: + logger.debug(f"Input source is the following file: {args.input_file[0]}") else: args.prompt_source = PromptSource.SYNTHETIC logger.debug("Input source is synthetic data") @@ -218,12 +289,34 @@ def _convert_str_to_enum_entry(args, option, enum): return args +### Types ### + + +def file_or_directory(path: str) -> Tuple[Path, PathType]: + if os.path.isfile(path): + return (Path(path), PathType.FILE) + elif os.path.isdir(path): + return (Path(path), PathType.DIRECTORY) + else: + raise ValueError(f"'{path}' is not a valid file or directory") + + ### Parsers ### def _add_input_args(parser): input_group = parser.add_argument_group("Input") + input_group.add_argument( + "--batch-size", + "-b", + type=int, + default=LlmInputs.DEFAULT_BATCH_SIZE, + required=False, + help=f"The batch size of the requests GenAI-Perf should send. " + "This is currently only supported with the embeddings and rankings endpoint types.", + ) + input_group.add_argument( "--extra-inputs", action="append", @@ -244,12 +337,14 @@ def _add_input_args(parser): prompt_source_group.add_argument( "--input-file", - type=argparse.FileType("r"), + type=file_or_directory, default=None, required=False, help="The input file containing the prompts to use for profiling. " "Each line should be a JSON object with a 'text_input' field in JSONL format. " - 'Example: {"text_input": "Your prompt here"}', + 'Example: {"text_input": "Your prompt here"}' + "For the rankings endpoint-type, a directory should be passed in instead with " + 'a "queries.jsonl" file and a "passages.jsonl" file with the same format.', ) input_group.add_argument( @@ -404,7 +499,7 @@ def _add_endpoint_args(parser): endpoint_group.add_argument( "--endpoint-type", type=str, - choices=["chat", "completions"], + choices=["chat", "completions", "embeddings", "rankings"], required=False, help=f"The endpoint-type to send requests to on the " 'server. 
This is only used with the "openai" service-kind.', @@ -485,20 +580,13 @@ def _add_other_args(parser): help="An option to enable verbose mode.", ) - other_group.add_argument( - "--version", - action="version", - version="%(prog)s " + __version__, - help=f"An option to print the version and exit.", - ) - def get_extra_inputs_as_dict(args: argparse.Namespace) -> dict: request_inputs = {} if args.extra_inputs: for input_str in args.extra_inputs: if input_str.startswith("{") and input_str.endswith("}"): - request_inputs.update(json.loads(input_str)) + request_inputs.update(utils.load_json_str(input_str)) else: semicolon_count = input_str.count(":") if semicolon_count != 1: @@ -538,10 +626,10 @@ def get_extra_inputs_as_dict(args: argparse.Namespace) -> dict: def _parse_compare_args(subparsers) -> argparse.ArgumentParser: compare = subparsers.add_parser( - "compare", + Subcommand.COMPARE.to_lowercase(), description="Subcommand to generate plots that compare multiple profile runs.", ) - compare_group = compare.add_argument_group("Compare") + compare_group = compare.add_argument_group("Input") mx_group = compare_group.add_mutually_exclusive_group(required=False) mx_group.add_argument( "--config", @@ -563,6 +651,20 @@ def _parse_compare_args(subparsers) -> argparse.ArgumentParser: return compare +def _parse_profile_args(subparsers) -> argparse.ArgumentParser: + profile = subparsers.add_parser( + Subcommand.PROFILE.to_lowercase(), + description="Subcommand to profile LLMs and Generative AI models.", + ) + _add_endpoint_args(profile) + _add_input_args(profile) + _add_profile_args(profile) + _add_output_args(profile) + _add_other_args(profile) + profile.set_defaults(func=profile_handler) + return profile + + ### Handlers ### @@ -571,12 +673,6 @@ def create_compare_dir() -> None: os.mkdir(DEFAULT_COMPARE_DIR) -def profile_handler(args, extra_args): - from genai_perf.wrapper import Profiler - - Profiler.run(args=args, extra_args=extra_args) - - def compare_handler(args: argparse.Namespace): """Handles `compare` subcommand workflow.""" if args.files: @@ -591,45 +687,75 @@ def compare_handler(args: argparse.Namespace): plot_manager.generate_plots() -### Entrypoint ### +def profile_handler(args, extra_args): + from genai_perf.wrapper import Profiler + Profiler.run(args=args, extra_args=extra_args) -def parse_args(): - argv = sys.argv +### Parser Initialization ### + + +def init_parsers(): parser = argparse.ArgumentParser( prog="genai-perf", description="CLI to profile LLMs and Generative AI models with Perf Analyzer", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - parser.set_defaults(func=profile_handler) - - # Conceptually group args for easier visualization - _add_endpoint_args(parser) - _add_input_args(parser) - _add_profile_args(parser) - _add_output_args(parser) - _add_other_args(parser) + parser.add_argument( + "--version", + action="version", + version="%(prog)s " + __version__, + help=f"An option to print the version and exit.", + ) # Add subcommands subparsers = parser.add_subparsers( help="List of subparser commands.", dest="subcommand" ) - compare_parser = _parse_compare_args(subparsers) + _ = _parse_compare_args(subparsers) + _ = _parse_profile_args(subparsers) + subparsers.required = True + + return parser + - # Check for passthrough args +def get_passthrough_args_index(argv: list) -> int: if "--" in argv: passthrough_index = argv.index("--") logger.info(f"Detected passthrough args: {argv[passthrough_index + 1:]}") else: passthrough_index = len(argv) + return passthrough_index + + 
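With this parser refactor, `profile` becomes an explicit, required subcommand and anything after a bare `--` is detected as passthrough and forwarded to Perf Analyzer unchanged. A minimal invocation sketch under that assumption (the model name and the forwarded flag below are illustrative placeholders, not taken from this patch):

```bash
# `profile` is now a required subcommand; arguments after `--` are passed
# straight through to Perf Analyzer rather than being parsed by genai-perf.
genai-perf profile \
  -m gpt2 \
  --service-kind triton \
  --backend vllm \
  -- \
  --request-count=50
```

The split happens before `argparse` runs: only `argv[1:passthrough_index]` is parsed, and the remainder is handed to the profile handler as extra args.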
+def refine_args( + parser: argparse.ArgumentParser, args: argparse.Namespace +) -> argparse.Namespace: + if args.subcommand == Subcommand.PROFILE.to_lowercase(): + args = _infer_prompt_source(args) + args = _check_model_args(parser, args) + args = _check_conditional_args(parser, args) + args = _check_load_manager_args(args) + args = _set_artifact_paths(args) + elif args.subcommand == Subcommand.COMPARE.to_lowercase(): + args = _check_compare_args(parser, args) + else: + raise ValueError(f"Unknown subcommand: {args.subcommand}") + + return args + + +### Entrypoint ### + + +def parse_args(): + argv = sys.argv + + parser = init_parsers() + passthrough_index = get_passthrough_args_index(argv) args = parser.parse_args(argv[1:passthrough_index]) - args = _infer_prompt_source(args) - args = _check_model_args(parser, args) - args = _check_conditional_args(parser, args) - args = _check_compare_args(compare_parser, args) - args = _check_load_manager_args(args) - args = _set_artifact_paths(args) + args = refine_args(parser, args) return args, argv[passthrough_index + 1 :] diff --git a/genai-perf/genai_perf/plots/plot_config_parser.py b/genai-perf/genai_perf/plots/plot_config_parser.py index 1072bc30..00588f6b 100755 --- a/genai-perf/genai_perf/plots/plot_config_parser.py +++ b/genai-perf/genai_perf/plots/plot_config_parser.py @@ -33,8 +33,9 @@ # Skip type checking to avoid mypy error # Issue: https://github.com/python/mypy/issues/10632 import yaml # type: ignore -from genai_perf.llm_metrics import LLMProfileDataParser, Statistics +from genai_perf.metrics import Statistics from genai_perf.plots.plot_config import PlotConfig, PlotType, ProfileRunData +from genai_perf.profile_data_parser import LLMProfileDataParser from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer from genai_perf.utils import load_yaml, scale diff --git a/genai-perf/genai_perf/profile_data_parser/__init__.py b/genai-perf/genai_perf/profile_data_parser/__init__.py new file mode 100644 index 00000000..2e7798c4 --- /dev/null +++ b/genai-perf/genai_perf/profile_data_parser/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from genai_perf.profile_data_parser.llm_profile_data_parser import LLMProfileDataParser +from genai_perf.profile_data_parser.profile_data_parser import ( + ProfileDataParser, + ResponseFormat, +) diff --git a/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py b/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py new file mode 100755 index 00000000..4ec1bec6 --- /dev/null +++ b/genai-perf/genai_perf/profile_data_parser/llm_profile_data_parser.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +from itertools import tee +from pathlib import Path +from typing import Dict, List, Tuple + +from genai_perf.metrics import LLMMetrics, Metrics +from genai_perf.profile_data_parser.profile_data_parser import ( + ProfileDataParser, + ResponseFormat, +) +from genai_perf.tokenizer import Tokenizer +from genai_perf.utils import load_json_str, remove_sse_prefix + + +class LLMProfileDataParser(ProfileDataParser): + """A class that calculates and aggregates all the LLM performance statistics + across the Perf Analyzer profile results. + + The LLMProfileDataParser class parses profile export JSON file, collects the + core LLM performance metrics, and calculates summary statistics for each + different Perf Analyzer runs/experiments. + + Example: + + >>> ... 
# run Perf Analyzer with concurrency level 10 + >>> + >>> from transformers import AutoTokenizer + >>> + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> pd = LLMProfileDataParser( + >>> filename="profile_export.json", + >>> tokenizer=tokenizer, + >>> ) + >>> stats = pd.get_statistics(infer_mode="concurrency", level=10) + >>> + >>> print(stats) # output: Statistics(avg_time_to_first_token=...) + >>> stats.pretty_print() # Output: time_to_first_token_s: ... + """ + + def __init__( + self, + filename: Path, + tokenizer: Tokenizer, + ) -> None: + self._tokenizer = tokenizer + super().__init__(filename) + + def _parse_requests(self, requests: dict) -> Metrics: + """Parse each requests in profile export data to extract key metrics.""" + min_req_timestamp, max_res_timestamp = float("inf"), 0 + request_latencies = [] + time_to_first_tokens = [] + inter_token_latencies = [] + output_token_throughputs_per_request = [] + input_sequence_lengths = [] + output_sequence_lengths = [] + chunked_inter_token_latencies = [] + + for request in requests: + req_timestamp = request["timestamp"] + req_inputs = request["request_inputs"] + res_timestamps = request["response_timestamps"] + res_outputs = request["response_outputs"] + + self._preprocess_response(res_timestamps, res_outputs) + + # Skip requests with empty response. This happens sometimes when the + # model returns a single response with empty string. + if not res_timestamps: + continue + + # track entire benchmark duration + min_req_timestamp = min(min_req_timestamp, req_timestamp) + max_res_timestamp = max(max_res_timestamp, res_timestamps[-1]) + + # request latencies + req_latency_ns = res_timestamps[-1] - req_timestamp + request_latencies.append(req_latency_ns) # nanosec + req_latency_s = req_latency_ns / 1e9 # sec + + # time to first token + ttft = res_timestamps[0] - req_timestamp + time_to_first_tokens.append(ttft) + + # number of input tokens + input_seq_len = self._get_input_token_count(req_inputs) + input_sequence_lengths.append(input_seq_len) + + # output token throughput per request + output_token_counts, total_output_token = self._get_output_token_counts( + res_outputs + ) + output_token_throughputs_per_request.append( + total_output_token / req_latency_s + ) + output_sequence_lengths.append(total_output_token) + + # inter token latencies + if total_output_token > 1: + inter_token_latency = (req_latency_ns - ttft) / (total_output_token - 1) + inter_token_latencies.append(round(inter_token_latency)) + + # The new ITL calculation above loses all token-level ITL information + # and as a result breaks ITL vs token position visualization. Keep + # the old version of inter token latency as a WAR to preserve the + # visualization. + chunked_inter_token_latency = [] + for (t1, _), (t2, n2) in self._pairwise( + zip(res_timestamps, output_token_counts) + ): + # TMA-1676: handle empty first/last responses + # if the latter response has zero token (e.g. empty string), + # then set it default to one for the sake of inter token latency + # calculation and to avoid divide by zero. 
+ num_token = 1 if n2 == 0 else n2 + chunked_inter_token_latency.append(round((t2 - t1) / num_token)) + chunked_inter_token_latencies.append(chunked_inter_token_latency) + + # request & output token throughput + benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9 # nanosec + request_throughputs = [len(requests) / benchmark_duration] + output_token_throughputs = [sum(output_sequence_lengths) / benchmark_duration] + + return LLMMetrics( + request_throughputs, + request_latencies, + time_to_first_tokens, + inter_token_latencies, + output_token_throughputs, + output_token_throughputs_per_request, + output_sequence_lengths, + input_sequence_lengths, + chunked_inter_token_latencies, + ) + + def _pairwise(self, iterable): + """Generate pairs of consecutive elements from the given iterable.""" + a, b = tee(iterable) + next(b, None) + return zip(a, b) + + def _preprocess_response( + self, res_timestamps: List[int], res_outputs: List[Dict[str, str]] + ) -> None: + """Helper function to preprocess responses of a request.""" + if self._service_kind == "openai": + # PA sometimes receives multiple SSE responses at once (as a single + # response). Handle these responses by merging into a single response. + for i in range(len(res_outputs)): + response = res_outputs[i]["response"] + responses = response.strip().split("\n\n") + if len(responses) > 1: + merged_response = load_json_str(remove_sse_prefix(responses[0])) + if ( + merged_response["choices"][0]["delta"].get("content", None) + is None + ): + merged_response["choices"][0]["delta"]["content"] = "" + for r in responses[1:]: + text = self._extract_openai_text_output(r) + merged_response["choices"][0]["delta"]["content"] += text + + res_outputs[i] = {"response": json.dumps(merged_response)} + + # Remove responses without any content + indices_to_remove = [] + for idx, out in enumerate(res_outputs): + if self._is_openai_empty_response(out["response"]): + indices_to_remove.append(idx) + indices_to_remove.sort(reverse=True) + for index in indices_to_remove: + res_timestamps.pop(index) + res_outputs.pop(index) + + def _get_input_token_count(self, req_inputs: dict) -> int: + """Deserialize the request input and return tokenized inputs.""" + if self._service_kind == "triton": + input_text = req_inputs["text_input"] + elif self._service_kind == "openai": + input_text = self._get_openai_input_text(req_inputs) + else: + raise ValueError(f"Unknown service kind: '{self._service_kind}'.") + + return len(self._tokenizer.encode(input_text)) + + def _get_openai_input_text(self, req_inputs: dict) -> str: + """Tokenize the OpenAI request input texts.""" + payload = load_json_str(req_inputs["payload"]) + if self._response_format == ResponseFormat.OPENAI_CHAT_COMPLETIONS: + return payload["messages"][0]["content"] + elif self._response_format == ResponseFormat.OPENAI_COMPLETIONS: + return payload["prompt"] + else: + raise ValueError( + "Failed to parse OpenAI request input in profile export file." 
+ ) + + def _get_output_token_counts( + self, res_outputs: List[Dict] + ) -> Tuple[List[int], int]: + """Return response-level token counts and total token count.""" + if self._service_kind == "triton": + output_texts = self._get_triton_output_tokens(res_outputs) + elif self._service_kind == "openai": + output_texts = self._get_openai_output_tokens(res_outputs) + else: + raise ValueError(f"Unknown service kind: '{self._service_kind}'.") + + full_text_token_count = len(self._tokenizer.encode("".join(output_texts))) + + output_tokens = self._get_response_output_tokens(output_texts) + output_token_counts = list(map(len, output_tokens)) + return output_token_counts, full_text_token_count + + def _get_triton_output_tokens(self, res_outputs: List[Dict]) -> List[str]: + """Return a list of Triton response texts.""" + return [r["text_output"] for r in res_outputs] + + def _get_openai_output_tokens(self, res_outputs: List[Dict]) -> List[str]: + """Return a list of OpenAI response texts.""" + output_texts = [] + for output in res_outputs: + text = self._extract_openai_text_output(output["response"]) + output_texts.append(text) + return output_texts + + def _get_response_output_tokens(self, output_texts: List[str]) -> List[List[int]]: + """Return a list of response output tokens.""" + # Exclamation mark trick forces the llama tokenization to consistently + # start each output with a specific token which allows us to safely skip + # the first token of every tokenized output and get only the ones that + # are returned by the model + encodings = self._tokenizer(["!" + txt for txt in output_texts]) + return [out[1:] for out in encodings.data["input_ids"]] + + def _extract_openai_text_output(self, response: str) -> str: + """Extracts text/content of the OpenAI response object.""" + response = remove_sse_prefix(response) + + if response == "[DONE]": + return "" + + data = load_json_str(response) + completions = data["choices"][0] + + text_output = "" + if "object" not in data: + # FIXME: TPA-47 workaround for vLLM not following OpenAI Completions + # API specification when streaming, missing 'object' field: + # https://platform.openai.com/docs/api-reference/completions + text_output = completions.get("text", "") + elif data["object"] == "text_completion": # legacy + text_output = completions.get("text", "") + elif data["object"] == "chat.completion": # non-streaming + text_output = completions["message"].get("content", "") + elif data["object"] == "chat.completion.chunk": # streaming + text_output = completions["delta"].get("content", "") + else: + obj_type = data["object"] + raise ValueError(f"Unknown OpenAI response object type '{obj_type}'.") + return text_output + + def _is_openai_empty_response(self, response: str) -> bool: + """Returns true if the response is an openai response with no content (or empty content)""" + text = self._extract_openai_text_output(response) + if text: + return False + return True diff --git a/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py b/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py new file mode 100755 index 00000000..d18d8f6f --- /dev/null +++ b/genai-perf/genai_perf/profile_data_parser/profile_data_parser.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from enum import Enum, auto +from pathlib import Path +from typing import List, Tuple + +from genai_perf.metrics import Metrics, Statistics +from genai_perf.utils import load_json + + +class ResponseFormat(Enum): + HUGGINGFACE_RANKINGS = auto() + OPENAI_CHAT_COMPLETIONS = auto() + OPENAI_COMPLETIONS = auto() + OPENAI_EMBEDDINGS = auto() + RANKINGS = auto() + TRITON = auto() + + +class ProfileDataParser: + """Base profile data parser class that reads the profile data JSON file to + extract core metrics and calculate various performance statistics. + """ + + def __init__(self, filename: Path) -> None: + data = load_json(filename) + self._get_profile_metadata(data) + self._parse_profile_data(data) + + def _get_profile_metadata(self, data: dict) -> None: + self._service_kind = data["service_kind"] + if self._service_kind == "openai": + if data["endpoint"] == "rerank": + self._response_format = ResponseFormat.HUGGINGFACE_RANKINGS + elif data["endpoint"] == "v1/chat/completions": + self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS + elif data["endpoint"] == "v1/completions": + self._response_format = ResponseFormat.OPENAI_COMPLETIONS + elif data["endpoint"] == "v1/embeddings": + self._response_format = ResponseFormat.OPENAI_EMBEDDINGS + elif data["endpoint"] == "v1/ranking": + self._response_format = ResponseFormat.RANKINGS + else: + # TPA-66: add PA metadata to handle this case + # When endpoint field is either empty or custom endpoint, fall + # back to parsing the response to extract the response format. 
+ request = data["experiments"][0]["requests"][0] + response = request["response_outputs"][0]["response"] + if "chat.completion" in response: + self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS + elif "text_completion" in response: + self._response_format = ResponseFormat.OPENAI_COMPLETIONS + elif "embedding" in response: + self._response_format = ResponseFormat.OPENAI_EMBEDDINGS + elif "ranking" in response: + self._response_format = ResponseFormat.RANKINGS + else: + raise RuntimeError("Unknown OpenAI response format.") + + elif self._service_kind == "triton": + self._response_format = ResponseFormat.TRITON + else: + raise ValueError(f"Unknown service kind: {self._service_kind}") + + def _parse_profile_data(self, data: dict) -> None: + """Parse through the entire profile data to collect statistics.""" + self._profile_results = {} + for experiment in data["experiments"]: + infer_mode = experiment["experiment"]["mode"] + load_level = experiment["experiment"]["value"] + requests = experiment["requests"] + + metrics = self._parse_requests(requests) + + # aggregate and calculate statistics + statistics = Statistics(metrics) + self._profile_results[(infer_mode, str(load_level))] = statistics + + def _parse_requests(self, requests: dict) -> Metrics: + """Parse each request in profile data to extract core metrics.""" + min_req_timestamp, max_res_timestamp = float("inf"), 0 + request_latencies = [] + + for request in requests: + req_timestamp = request["timestamp"] + res_timestamps = request["response_timestamps"] + + # track entire benchmark duration + min_req_timestamp = min(min_req_timestamp, req_timestamp) + max_res_timestamp = max(max_res_timestamp, res_timestamps[-1]) + + # request latencies + req_latency = res_timestamps[-1] - req_timestamp + request_latencies.append(req_latency) + + # request throughput + benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9 # to seconds + request_throughputs = [len(requests) / benchmark_duration] + + return Metrics( + request_throughputs, + request_latencies, + ) + + def get_statistics(self, infer_mode: str, load_level: str) -> Statistics: + """Return profile statistics if it exists.""" + if (infer_mode, load_level) not in self._profile_results: + raise KeyError(f"Profile with {infer_mode}={load_level} does not exist.") + return self._profile_results[(infer_mode, load_level)] + + def get_profile_load_info(self) -> List[Tuple[str, str]]: + """Return available (infer_mode, load_level) tuple keys.""" + return [k for k, _ in self._profile_results.items()] diff --git a/genai-perf/genai_perf/test_end_to_end.py b/genai-perf/genai_perf/test_end_to_end.py index 3cc2999f..a4430434 100644 --- a/genai-perf/genai_perf/test_end_to_end.py +++ b/genai-perf/genai_perf/test_end_to_end.py @@ -10,7 +10,7 @@ # For all cases but vllm_openai, it assumes that the server will be on port 9999 # # This script will run a sweep of all combinations of values in the testing matrix -# by appending those options on to the genai-pa base command +# by appending those options on to the genai-perf base command # @@ -20,11 +20,11 @@ ] base_commands = { - "nim_chat": "genai-perf -s 999 -p 20000 -m llama-2-7b-chat -u http://localhost:9999 --service-kind openai --endpoint-type chat", - "nim_completions": "genai-perf -s 999 -p 20000 -m llama-2-7b -u http://localhost:9999 --service-kind openai --endpoint-type completions", - "vllm_openai": "genai-perf -s 999 -p 20000 -m mistralai/Mistral-7B-v0.1 --service-kind openai --endpoint-type chat", - "triton_tensorrtllm": 
"genai-perf -s 999 -p 20000 -m llama-2-7b -u 0.0.0.0:9999 --service-kind triton --backend tensorrtllm", - "triton_vllm": "genai-perf -s 999 -p 20000 -m gpt2_vllm --service-kind triton --backend vllm", + "nim_chat": "genai-perf profile -s 999 -p 20000 -m llama-2-7b-chat -u http://localhost:9999 --service-kind openai --endpoint-type chat", + "nim_completions": "genai-perf profile -s 999 -p 20000 -m llama-2-7b -u http://localhost:9999 --service-kind openai --endpoint-type completions", + "vllm_openai": "genai-perf profile -s 999 -p 20000 -m mistralai/Mistral-7B-v0.1 --service-kind openai --endpoint-type chat", + "triton_tensorrtllm": "genai-perf profile -s 999 -p 20000 -m llama-2-7b -u 0.0.0.0:9999 --service-kind triton --backend tensorrtllm", + "triton_vllm": "genai-perf profile -s 999 -p 20000 -m gpt2_vllm --service-kind triton --backend vllm", } testname = "" diff --git a/genai-perf/genai_perf/utils.py b/genai-perf/genai_perf/utils.py index a10befe1..6f66230c 100644 --- a/genai-perf/genai_perf/utils.py +++ b/genai-perf/genai_perf/utils.py @@ -29,10 +29,14 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Type +import genai_perf.logging as logging + # Skip type checking to avoid mypy error # Issue: https://github.com/python/mypy/issues/10632 import yaml # type: ignore +logger = logging.getLogger(__name__) + def remove_sse_prefix(msg: str) -> str: prefix = "data: " @@ -49,7 +53,17 @@ def load_yaml(filepath: Path) -> Dict[str, Any]: def load_json(filepath: Path) -> Dict[str, Any]: with open(str(filepath), encoding="utf-8", errors="ignore") as f: - return json.load(f) + content = f.read() + return load_json_str(content) + + +def load_json_str(json_str: str) -> Dict[str, Any]: + try: + return json.loads(json_str) + except json.JSONDecodeError: + snippet = json_str[:200] + ("..." if len(json_str) > 200 else "") + logger.error("Failed to parse JSON string: '%s'", snippet) + raise def remove_file(file: Path) -> None: diff --git a/genai-perf/genai_perf/wrapper.py b/genai-perf/genai_perf/wrapper.py index e5f70442..dbaacc32 100644 --- a/genai-perf/genai_perf/wrapper.py +++ b/genai-perf/genai_perf/wrapper.py @@ -64,6 +64,7 @@ def build_cmd(args: Namespace, extra_args: Optional[List[str]] = None) -> List[s skip_args = [ "artifact_dir", "backend", + "batch_size", "concurrency", "endpoint_type", "extra_inputs", diff --git a/genai-perf/tests/test_artifacts.py b/genai-perf/tests/test_artifacts.py index 56b1b38d..cdcc4afc 100644 --- a/genai-perf/tests/test_artifacts.py +++ b/genai-perf/tests/test_artifacts.py @@ -38,7 +38,7 @@ def mock_makedirs(mocker): def test_create_artifacts_dirs_custom_path(mock_makedirs): artifacts_dir_path = "/genai_perf_artifacts" - mock_args = Namespace(artifact_dir=Path(artifacts_dir_path)) + mock_args = Namespace(artifact_dir=Path(artifacts_dir_path), generate_plots=True) create_artifacts_dirs(mock_args) mock_makedirs.assert_any_call( Path(artifacts_dir_path), exist_ok=True @@ -47,3 +47,13 @@ def test_create_artifacts_dirs_custom_path(mock_makedirs): Path(artifacts_dir_path) / "plots", exist_ok=True ), f"Expected os.makedirs to create plots directory inside {artifacts_dir_path}/plots path." 
assert mock_makedirs.call_count == 2 + + +def test_create_artifacts_disable_generate_plots(mock_makedirs): + artifacts_dir_path = "/genai_perf_artifacts" + mock_args = Namespace(artifact_dir=Path(artifacts_dir_path)) + create_artifacts_dirs(mock_args) + mock_makedirs.assert_any_call( + Path(artifacts_dir_path), exist_ok=True + ), f"Expected os.makedirs to create artifacts directory inside {artifacts_dir_path} path." + assert mock_makedirs.call_count == 1 diff --git a/genai-perf/tests/test_cli.py b/genai-perf/tests/test_cli.py index 5cf84c36..eb891fd0 100644 --- a/genai-perf/tests/test_cli.py +++ b/genai-perf/tests/test_cli.py @@ -35,6 +35,7 @@ OutputFormat, PromptSource, ) +from genai_perf.parser import PathType class TestCLIArguments: @@ -51,10 +52,7 @@ class TestCLIArguments: [ (["-h"], expected_help_output), (["--help"], expected_help_output), - (["-m", "abc", "--help"], expected_help_output), - (["-m", "abc", "-h"], expected_help_output), (["--version"], expected_version_output), - (["-m", "abc", "--version"], expected_version_output), ], ) def test_help_version_arguments_output_and_exit( @@ -79,6 +77,28 @@ def test_help_version_arguments_output_and_exit( ["--artifact-dir", "test_artifact_dir"], {"artifact_dir": Path("test_artifact_dir")}, ), + ( + [ + "--batch-size", + "5", + "--endpoint-type", + "embeddings", + "--service-kind", + "openai", + ], + {"batch_size": 5}, + ), + ( + [ + "-b", + "5", + "--endpoint-type", + "embeddings", + "--service-kind", + "openai", + ], + {"batch_size": 5}, + ), (["--concurrency", "3"], {"concurrency": 3}), ( ["--endpoint-type", "completions", "--service-kind", "openai"], @@ -88,6 +108,10 @@ def test_help_version_arguments_output_and_exit( ["--endpoint-type", "chat", "--service-kind", "openai"], {"endpoint": "v1/chat/completions"}, ), + ( + ["--endpoint-type", "rankings", "--service-kind", "openai"], + {"endpoint": "v1/ranking"}, + ), ( [ "--endpoint-type", @@ -199,7 +223,7 @@ def test_help_version_arguments_output_and_exit( ) def test_non_file_flags_parsed(self, monkeypatch, arg, expected_attributes, capsys): logging.init_logging() - combined_args = ["genai-perf", "--model", "test_model"] + arg + combined_args = ["genai-perf", "profile", "--model", "test_model"] + arg monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -240,7 +264,7 @@ def test_multiple_model_args( self, monkeypatch, models, expected_model_list, formatted_name, capsys ): logging.init_logging() - combined_args = ["genai-perf"] + models + combined_args = ["genai-perf", "profile"] + models monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -257,9 +281,10 @@ def test_multiple_model_args( assert captured.out == "" def test_file_flags_parsed(self, monkeypatch, mocker): - mocked_open = mocker.patch("builtins.open", mocker.mock_open(read_data="data")) + _ = mocker.patch("os.path.isfile", return_value=True) combined_args = [ "genai-perf", + "profile", "--model", "test_model", "--input-file", @@ -267,9 +292,11 @@ def test_file_flags_parsed(self, monkeypatch, mocker): ] monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() - assert ( - args.input_file == mocked_open.return_value - ), "The file argument should be the mock object" + filepath, pathtype = args.input_file + assert filepath == Path( + "fakefile.txt" + ), "The file argument should be the path to the file" + assert pathtype == PathType.FILE @pytest.mark.parametrize( "arg, expected_path", @@ -282,6 +309,10 @@ def test_file_flags_parsed(self, monkeypatch, 
mocker): ["--service-kind", "openai", "--endpoint-type", "completions"], "artifacts/test_model-openai-completions-concurrency1", ), + ( + ["--service-kind", "openai", "--endpoint-type", "rankings"], + "artifacts/test_model-openai-rankings-concurrency1", + ), ( ["--service-kind", "triton", "--backend", "tensorrtllm"], "artifacts/test_model-triton-tensorrtllm-concurrency1", @@ -307,7 +338,7 @@ def test_default_profile_export_filepath( self, monkeypatch, arg, expected_path, capsys ): logging.init_logging() - combined_args = ["genai-perf", "--model", "test_model"] + arg + combined_args = ["genai-perf", "profile", "--model", "test_model"] + arg monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -347,7 +378,7 @@ def test_model_name_artifact_path( self, monkeypatch, arg, expected_path, expected_output, capsys ): logging.init_logging() - combined_args = ["genai-perf"] + arg + combined_args = ["genai-perf", "profile"] + arg monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -357,7 +388,9 @@ def test_model_name_artifact_path( def test_default_load_level(self, monkeypatch, capsys): logging.init_logging() - monkeypatch.setattr("sys.argv", ["genai-perf", "--model", "test_model"]) + monkeypatch.setattr( + "sys.argv", ["genai-perf", "profile", "--model", "test_model"] + ) args, _ = parser.parse_args() assert args.concurrency == 1 captured = capsys.readouterr() @@ -365,7 +398,8 @@ def test_default_load_level(self, monkeypatch, capsys): def test_load_level_mutually_exclusive(self, monkeypatch, capsys): monkeypatch.setattr( - "sys.argv", ["genai-perf", "--concurrency", "3", "--request-rate", "9.0"] + "sys.argv", + ["genai-perf", "profile", "--concurrency", "3", "--request-rate", "9.0"], ) expected_output = ( "argument --request-rate: not allowed with argument --concurrency" @@ -379,7 +413,7 @@ def test_load_level_mutually_exclusive(self, monkeypatch, capsys): assert expected_output in captured.err def test_model_not_provided(self, monkeypatch, capsys): - monkeypatch.setattr("sys.argv", ["genai-perf"]) + monkeypatch.setattr("sys.argv", ["genai-perf", "profile"]) expected_output = "The -m/--model option is required and cannot be empty." 
with pytest.raises(SystemExit) as excinfo: @@ -390,7 +424,7 @@ def test_model_not_provided(self, monkeypatch, capsys): assert expected_output in captured.err def test_pass_through_args(self, monkeypatch): - args = ["genai-perf", "-m", "test_model"] + args = ["genai-perf", "profile", "-m", "test_model"] other_args = ["--", "With", "great", "power"] monkeypatch.setattr("sys.argv", args + other_args) _, pass_through_args = parser.parse_args() @@ -402,6 +436,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys): "sys.argv", [ "genai-perf", + "profile", "-m", "nonexistent_model", "--wrong-arg", @@ -420,12 +455,20 @@ def test_unrecognized_arg(self, monkeypatch, capsys): "args, expected_output", [ ( - ["genai-perf", "-m", "test_model", "--service-kind", "openai"], + [ + "genai-perf", + "profile", + "-m", + "test_model", + "--service-kind", + "openai", + ], "The --endpoint-type option is required when using the 'openai' service-kind.", ), ( [ "genai-perf", + "profile", "-m", "test_model", "--service-kind", @@ -436,12 +479,20 @@ def test_unrecognized_arg(self, monkeypatch, capsys): "The --endpoint-type option is required when using the 'openai' service-kind.", ), ( - ["genai-perf", "-m", "test_model", "--output-tokens-stddev", "5"], + [ + "genai-perf", + "profile", + "-m", + "test_model", + "--output-tokens-stddev", + "5", + ], "The --output-tokens-mean option is required when using --output-tokens-stddev.", ), ( [ "genai-perf", + "profile", "-m", "test_model", "--output-tokens-mean-deterministic", @@ -451,6 +502,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys): ( [ "genai-perf", + "profile", "-m", "test_model", "--output-tokens-mean-deterministic", @@ -460,6 +512,7 @@ def test_unrecognized_arg(self, monkeypatch, capsys): ( [ "genai-perf", + "profile", "-m", "test_model", "--service-kind", @@ -472,6 +525,73 @@ def test_unrecognized_arg(self, monkeypatch, capsys): ], "The --output-tokens-mean-deterministic option is only supported with the Triton service-kind", ), + ( + [ + "genai-perf", + "profile", + "-m", + "test_model", + "--batch-size", + "10", + ], + "The --batch-size option is currently only supported with the embeddings and rankings endpoint types", + ), + ( + [ + "genai-perf", + "profile", + "-m", + "test_model", + "--service-kind", + "openai", + "--endpoint-type", + "embeddings", + "--streaming", + ], + "The --streaming option is not supported with the embeddings endpoint type", + ), + ( + [ + "genai-perf", + "profile", + "-m", + "test_model", + "--service-kind", + "openai", + "--endpoint-type", + "rankings", + "--streaming", + ], + "The --streaming option is not supported with the rankings endpoint type", + ), + ( + [ + "genai-perf", + "profile", + "-m", + "test_model", + "--service-kind", + "openai", + "--endpoint-type", + "embeddings", + "--generate-plots", + ], + "The --generate-plots option is not currently supported with the embeddings endpoint type", + ), + ( + [ + "genai-perf", + "profile", + "-m", + "test_model", + "--service-kind", + "openai", + "--endpoint-type", + "rankings", + "--generate-plots", + ], + "The --generate-plots option is not currently supported with the rankings endpoint type", + ), ], ) def test_conditional_errors(self, args, expected_output, monkeypatch, capsys): @@ -506,6 +626,10 @@ def test_conditional_errors(self, args, expected_output, monkeypatch, capsys): ], OutputFormat.OPENAI_COMPLETIONS, ), + ( + ["--service-kind", "openai", "--endpoint-type", "rankings"], + OutputFormat.RANKINGS, + ), ( ["--service-kind", "triton", "--backend", 
"tensorrtllm"], OutputFormat.TENSORRTLLM, @@ -514,7 +638,9 @@ def test_conditional_errors(self, args, expected_output, monkeypatch, capsys): ], ) def test_inferred_output_format(self, monkeypatch, args, expected_format): - monkeypatch.setattr("sys.argv", ["genai-perf", "-m", "test_model"] + args) + monkeypatch.setattr( + "sys.argv", ["genai-perf", "profile", "-m", "test_model"] + args + ) parsed_args, _ = parser.parse_args() assert parsed_args.output_format == expected_format @@ -545,7 +671,7 @@ def test_inferred_output_format(self, monkeypatch, args, expected_format): ], ) def test_repeated_extra_arg_warning(self, monkeypatch, args, expected_error): - combined_args = ["genai-perf", "-m", "test_model"] + args + combined_args = ["genai-perf", "profile", "-m", "test_model"] + args monkeypatch.setattr("sys.argv", combined_args) parsed_args, _ = parser.parse_args() @@ -571,7 +697,9 @@ def test_inferred_prompt_source( self, monkeypatch, mocker, args, expected_prompt_source ): _ = mocker.patch("builtins.open", mocker.mock_open(read_data="data")) - combined_args = ["genai-perf", "--model", "test_model"] + args + _ = mocker.patch("os.path.isfile", return_value=True) + _ = mocker.patch("os.path.isdir", return_value=True) + combined_args = ["genai-perf", "profile", "--model", "test_model"] + args monkeypatch.setattr("sys.argv", combined_args) args, _ = parser.parse_args() @@ -579,8 +707,11 @@ def test_inferred_prompt_source( def test_prompt_source_assertions(self, monkeypatch, mocker, capsys): _ = mocker.patch("builtins.open", mocker.mock_open(read_data="data")) + _ = mocker.patch("os.path.isfile", return_value=True) + _ = mocker.patch("os.path.isdir", return_value=True) args = [ "genai-perf", + "profile", "--model", "test_model", "--input-dataset", @@ -655,20 +786,6 @@ def test_compare_not_provided(self, monkeypatch, capsys): captured = capsys.readouterr() assert expected_output in captured.err - @pytest.mark.parametrize( - "args, expected_model", - [ - (["--files", "profile1.json", "profile2.json", "profile3.json"], None), - (["--config", "config.yaml"], None), - ], - ) - def test_compare_model_arg(self, monkeypatch, args, expected_model): - combined_args = ["genai-perf", "compare"] + args - monkeypatch.setattr("sys.argv", combined_args) - args, _ = parser.parse_args() - - assert args.model == expected_model - @pytest.mark.parametrize( "extra_inputs_list, expected_dict", [ diff --git a/genai-perf/tests/test_console_exporter.py b/genai-perf/tests/test_console_exporter.py index 2bf41441..dda62e04 100644 --- a/genai-perf/tests/test_console_exporter.py +++ b/genai-perf/tests/test_console_exporter.py @@ -24,39 +24,80 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from genai_perf import parser from genai_perf.export_data.console_exporter import ConsoleExporter from genai_perf.export_data.exporter_config import ExporterConfig -from genai_perf.llm_metrics import LLMMetrics, Statistics +from genai_perf.metrics import LLMMetrics, Metrics, Statistics class TestConsoleExporter: - def test_pretty_print_output(self, capsys) -> None: + def test_streaming_llm_output(self, monkeypatch, capsys) -> None: + argv = [ + "genai-perf", + "profile", + "-m", + "model_name", + "--service-kind", + "openai", + "--endpoint-type", + "chat", + "--streaming", + ] + monkeypatch.setattr("sys.argv", argv) + args, _ = parser.parse_args() + + metrics = LLMMetrics( + request_throughputs=[123], + request_latencies=[4, 5, 6], + time_to_first_tokens=[7, 8, 9], + inter_token_latencies=[10, 11, 12], + output_token_throughputs=[456], + output_sequence_lengths=[1, 2, 3], + input_sequence_lengths=[5, 6, 7], + ) + stats = Statistics(metrics=metrics) + config = ExporterConfig() - config.stats = stats + config.stats = stats.stats_dict + config.metrics = stats.metrics + config.args = args + exporter = ConsoleExporter(config) exporter.export() expected_content = ( - " LLM Metrics \n" - "┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┓\n" - "┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃\n" - "┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━┩\n" - "│ Time to first token (ms) │ 2.00 │ 2.00 │ 3.00 │ 2.99 │ 2.90 │ 2.75 │\n" - "│ Inter token latency (ms) │ 0.50 │ 0.00 │ 1.00 │ 0.99 │ 0.90 │ 0.75 │\n" - "│ Request latency (ms) │ 3.00 │ 3.00 │ 4.00 │ 3.99 │ 3.90 │ 3.75 │\n" - "│ Output sequence length │ 6.50 │ 6.00 │ 7.00 │ 6.99 │ 6.90 │ 6.75 │\n" - "│ Input sequence length │ 7.50 │ 7.00 │ 8.00 │ 7.99 │ 7.90 │ 7.75 │\n" - "└──────────────────────────┴──────┴──────┴──────┴──────┴──────┴──────┘\n" - "Output token throughput (per sec): 123.00\n" - "Request throughput (per sec): 456.00\n" + " LLM Metrics \n" + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┓\n" + "┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃\n" + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━┩\n" + "│ Time to first token (ms) │ 8.00 │ 7.00 │ 9.00 │ 8.98 │ 8.80 │ 8.50 │\n" + "│ Inter token latency (ms) │ 11.00 │ 10.00 │ 12.00 │ 11.98 │ 11.80 │ 11.50 │\n" + "│ Request latency (ms) │ 5.00 │ 4.00 │ 6.00 │ 5.98 │ 5.80 │ 5.50 │\n" + "│ Output sequence length │ 2.00 │ 1.00 │ 3.00 │ 2.98 │ 2.80 │ 2.50 │\n" + "│ Input sequence length │ 6.00 │ 5.00 │ 7.00 │ 6.98 │ 6.80 │ 6.50 │\n" + "└──────────────────────────┴───────┴───────┴───────┴───────┴───────┴───────┘\n" + "Output token throughput (per sec): 456.00\n" + "Request throughput (per sec): 123.00\n" ) returned_data = capsys.readouterr().out - assert returned_data == expected_content - def test_nonstreaming_llm_output(self, capsys) -> None: + def test_nonstreaming_llm_output(self, monkeypatch, capsys) -> None: + argv = [ + "genai-perf", + "profile", + "-m", + "model_name", + "--service-kind", + "openai", + "--endpoint-type", + "chat", + ] + monkeypatch.setattr("sys.argv", argv) + args, _ = parser.parse_args() + metrics = LLMMetrics( request_throughputs=[123], request_latencies=[4, 5, 6], @@ -70,6 +111,9 @@ def test_nonstreaming_llm_output(self, capsys) -> None: config = ExporterConfig() config.stats = stats.stats_dict + config.metrics = stats.metrics + config.args = args + exporter = ConsoleExporter(config) exporter.export() @@ -90,86 +134,43 @@ def test_nonstreaming_llm_output(self, 
capsys) -> None: returned_data = capsys.readouterr().out assert returned_data == expected_content + def test_embedding_output(self, monkeypatch, capsys) -> None: + argv = [ + "genai-perf", + "profile", + "-m", + "model_name", + "--service-kind", + "openai", + "--endpoint-type", + "embeddings", + ] + monkeypatch.setattr("sys.argv", argv) + args, _ = parser.parse_args() -stats = { - "request_throughput": {"unit": "requests/sec", "avg": 456.0}, - "request_latency": { - "unit": "ms", - "avg": 3.0, - "p99": 3.99, - "p95": 3.95, - "p90": 3.90, - "p75": 3.75, - "p50": 3.50, - "p25": 3.25, - "max": 4.0, - "min": 3.0, - "std": 3.50, - }, - "time_to_first_token": { - "unit": "ms", - "avg": 2.0, - "p99": 2.99, - "p95": 2.95, - "p90": 2.90, - "p75": 2.75, - "p50": 2.50, - "p25": 2.25, - "max": 3.00, - "min": 2.00, - "std": 2.50, - }, - "inter_token_latency": { - "unit": "ms", - "avg": 0.50, - "p99": 0.99, - "p95": 0.95, - "p90": 0.90, - "p75": 0.75, - "p50": 0.50, - "p25": 0.25, - "max": 1.00, - "min": 0.00, - "std": 0.50, - }, - "output_token_throughput": {"unit": "tokens/sec", "avg": 123.0}, - "output_token_throughput_per_request": { - "unit": "tokens/sec", - "avg": 300.00, - "p99": 300.00, - "p95": 300.00, - "p90": 300.00, - "p75": 300.00, - "p50": 300.00, - "p25": 300.00, - "max": 300.00, - "min": 300.00, - "std": 300.00, - }, - "output_sequence_length": { - "unit": "tokens", - "avg": 6.5, - "p99": 6.99, - "p95": 6.95, - "p90": 6.90, - "p75": 6.75, - "p50": 6.5, - "p25": 6.25, - "max": 7.0, - "min": 6.0, - "std": 6.5, - }, - "input_sequence_length": { - "unit": "tokens", - "avg": 7.5, - "p99": 7.99, - "p95": 7.95, - "p90": 7.90, - "p75": 7.75, - "p50": 7.5, - "p25": 7.25, - "max": 8.0, - "min": 7.0, - "std": 7.5, - }, -} + metrics = Metrics( + request_throughputs=[123], + request_latencies=[4, 5, 6], + ) + stats = Statistics(metrics=metrics) + + config = ExporterConfig() + config.stats = stats.stats_dict + config.metrics = stats.metrics + config.args = args + + exporter = ConsoleExporter(config) + exporter.export() + + expected_content = ( + " Embeddings Metrics \n" + "┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┓\n" + "┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃\n" + "┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━┩\n" + "│ Request latency (ms) │ 5.00 │ 4.00 │ 6.00 │ 5.98 │ 5.80 │ 5.50 │\n" + "└──────────────────────┴──────┴──────┴──────┴──────┴──────┴──────┘\n" + "Request throughput (per sec): 123.00\n" + ) + + returned_data = capsys.readouterr().out + assert returned_data == expected_content diff --git a/genai-perf/tests/test_csv_exporter.py b/genai-perf/tests/test_csv_exporter.py index 5372612e..6a60bc2d 100644 --- a/genai-perf/tests/test_csv_exporter.py +++ b/genai-perf/tests/test_csv_exporter.py @@ -24,16 +24,15 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import json from io import StringIO from pathlib import Path from typing import Any, List import pytest +from genai_perf import parser from genai_perf.export_data.csv_exporter import CsvExporter from genai_perf.export_data.exporter_config import ExporterConfig -from genai_perf.llm_metrics import LLMProfileDataParser -from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer +from genai_perf.metrics import LLMMetrics, Metrics, Statistics class TestCsvExporter: @@ -52,10 +51,7 @@ def write(self: Any, content: str) -> int: written_data.append(content) return len(content) - if str(filename) == "triton_profile_export.json": - tmp_file = StringIO(json.dumps(triton_profile_data)) - return tmp_file - elif str(filename) == "profile_export_genai_perf.csv": + if str(filename) == "profile_export_genai_perf.csv": tmp_file = StringIO() tmp_file.write = write.__get__(tmp_file) return tmp_file @@ -66,102 +62,152 @@ def write(self: Any, content: str) -> int: return written_data - def test_csv_output(self, mock_read_write: pytest.MonkeyPatch) -> None: + def test_streaming_llm_csv_output( + self, monkeypatch, mock_read_write: pytest.MonkeyPatch + ) -> None: """ Collect LLM metrics from profile export data and confirm correct values are printed in csv. """ - - tokenizer = get_tokenizer(DEFAULT_TOKENIZER) - pd = LLMProfileDataParser( - filename=Path("triton_profile_export.json"), - tokenizer=tokenizer, + argv = [ + "genai-perf", + "profile", + "-m", + "model_name", + "--service-kind", + "openai", + "--endpoint-type", + "chat", + "--streaming", + ] + monkeypatch.setattr("sys.argv", argv) + args, _ = parser.parse_args() + + metrics = LLMMetrics( + request_throughputs=[123], + request_latencies=[4, 5, 6], + time_to_first_tokens=[7, 8, 9], + inter_token_latencies=[10, 11, 12], + output_token_throughputs=[456], + output_sequence_lengths=[1, 2, 3], + input_sequence_lengths=[5, 6, 7], ) - stat = pd.get_statistics(infer_mode="concurrency", load_level="10") + stats = Statistics(metrics=metrics) + + config = ExporterConfig() + config.stats = stats.stats_dict + config.metrics = stats.metrics + config.artifact_dir = Path(".") + config.args = args + + exporter = CsvExporter(config) + exporter.export() expected_content = [ "Metric,avg,min,max,p99,p95,p90,p75,p50,p25\r\n", - "Time To First Token (ms),2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00\r\n", - "Inter Token Latency (ms),1.50,1.00,2.00,1.99,1.95,1.90,1.75,1.50,1.25\r\n", - "Request Latency (ms),8.00,7.00,9.00,8.98,8.90,8.80,8.50,8.00,7.50\r\n", - "Output Sequence Length,4.50,3.00,6.00,5.97,5.85,5.70,5.25,4.50,3.75\r\n", - "Input Sequence Length,3.50,3.00,4.00,3.99,3.95,3.90,3.75,3.50,3.25\r\n", + "Time To First Token (ms),8.00,7.00,9.00,8.98,8.90,8.80,8.50,8.00,7.50\r\n", + "Inter Token Latency (ms),11.00,10.00,12.00,11.98,11.90,11.80,11.50,11.00,10.50\r\n", + "Request Latency (ms),5.00,4.00,6.00,5.98,5.90,5.80,5.50,5.00,4.50\r\n", + "Output Sequence Length,2.00,1.00,3.00,2.98,2.90,2.80,2.50,2.00,1.50\r\n", + "Input Sequence Length,6.00,5.00,7.00,6.98,6.90,6.80,6.50,6.00,5.50\r\n", "\r\n", "Metric,Value\r\n", - "Output Token Throughput (per sec),900000000.00\r\n", - "Request Throughput (per sec),200000000.00\r\n", + "Output Token Throughput (per sec),456.00\r\n", + "Request Throughput (per sec),123.00\r\n", ] + returned_data = mock_read_write + assert returned_data == expected_content + + def test_nonstreaming_llm_csv_output( + self, monkeypatch, mock_read_write: pytest.MonkeyPatch + ) -> None: + """ + Collect LLM metrics from profile export data and confirm 
correct values are + printed in csv. + """ + argv = [ + "genai-perf", + "profile", + "-m", + "model_name", + "--service-kind", + "openai", + "--endpoint-type", + "chat", + ] + monkeypatch.setattr("sys.argv", argv) + args, _ = parser.parse_args() + + metrics = LLMMetrics( + request_throughputs=[123], + request_latencies=[4, 5, 6], + time_to_first_tokens=[4, 5, 6], # same as request_latency + inter_token_latencies=[], # no ITL + output_token_throughputs=[456], + output_sequence_lengths=[1, 2, 3], + input_sequence_lengths=[5, 6, 7], + ) + stats = Statistics(metrics=metrics) + config = ExporterConfig() - config.stats = stat.stats_dict + config.stats = stats.stats_dict + config.metrics = stats.metrics config.artifact_dir = Path(".") + config.args = args + exporter = CsvExporter(config) exporter.export() + expected_content = [ + "Metric,avg,min,max,p99,p95,p90,p75,p50,p25\r\n", + "Request Latency (ms),5.00,4.00,6.00,5.98,5.90,5.80,5.50,5.00,4.50\r\n", + "Output Sequence Length,2.00,1.00,3.00,2.98,2.90,2.80,2.50,2.00,1.50\r\n", + "Input Sequence Length,6.00,5.00,7.00,6.98,6.90,6.80,6.50,6.00,5.50\r\n", + "\r\n", + "Metric,Value\r\n", + "Output Token Throughput (per sec),456.00\r\n", + "Request Throughput (per sec),123.00\r\n", + ] returned_data = mock_read_write - assert returned_data == expected_content + def test_embedding_csv_output( + self, monkeypatch, mock_read_write: pytest.MonkeyPatch + ) -> None: + argv = [ + "genai-perf", + "profile", + "-m", + "model_name", + "--service-kind", + "openai", + "--endpoint-type", + "embeddings", + ] + monkeypatch.setattr("sys.argv", argv) + args, _ = parser.parse_args() + + metrics = Metrics( + request_throughputs=[123], + request_latencies=[4, 5, 6], + ) + stats = Statistics(metrics=metrics) + + config = ExporterConfig() + config.stats = stats.stats_dict + config.metrics = stats.metrics + config.artifact_dir = Path(".") + config.args = args -triton_profile_data = { - "service_kind": "triton", - "endpoint": "", - "experiments": [ - { - "experiment": { - "mode": "concurrency", - "value": 10, - }, - "requests": [ - { - "timestamp": 1, - "request_inputs": {"text_input": "This is test"}, - "response_timestamps": [3, 5, 8], - "response_outputs": [ - {"text_output": "I"}, - {"text_output": " like"}, - {"text_output": " dogs"}, - ], - }, - { - "timestamp": 2, - "request_inputs": {"text_input": "This is test too"}, - "response_timestamps": [4, 7, 11], - "response_outputs": [ - {"text_output": "I"}, - {"text_output": " don't"}, - {"text_output": " cook food"}, - ], - }, - ], - }, - { - "experiment": { - "mode": "request_rate", - "value": 2.0, - }, - "requests": [ - { - "timestamp": 5, - "request_inputs": {"text_input": "This is test"}, - "response_timestamps": [7, 8, 13, 18], - "response_outputs": [ - {"text_output": "cat"}, - {"text_output": " is"}, - {"text_output": " cool"}, - {"text_output": " too"}, - ], - }, - { - "timestamp": 3, - "request_inputs": {"text_input": "This is test too"}, - "response_timestamps": [6, 8, 11], - "response_outputs": [ - {"text_output": "it's"}, - {"text_output": " very"}, - {"text_output": " simple work"}, - ], - }, - ], - }, - ], -} + exporter = CsvExporter(config) + exporter.export() + + expected_content = [ + "Metric,avg,min,max,p99,p95,p90,p75,p50,p25\r\n", + "Request Latency (ms),5.00,4.00,6.00,5.98,5.90,5.80,5.50,5.00,4.50\r\n", + "\r\n", + "Metric,Value\r\n", + "Request Throughput (per sec),123.00\r\n", + ] + returned_data = mock_read_write + assert returned_data == expected_content diff --git 
a/genai-perf/tests/test_json_exporter.py b/genai-perf/tests/test_json_exporter.py index c59c688e..e4a29267 100644 --- a/genai-perf/tests/test_json_exporter.py +++ b/genai-perf/tests/test_json_exporter.py @@ -35,6 +35,7 @@ class TestJsonExporter: def test_generate_json(self, monkeypatch) -> None: cli_cmd = [ "genai-perf", + "profile", "-m", "gpt2_vllm", "--backend", @@ -234,13 +235,13 @@ def test_generate_json(self, monkeypatch) -> None: "formatted_model_name": "gpt2_vllm", "model_selection_strategy": "round_robin", "backend": "vllm", + "batch_size": 1, "endpoint": null, "endpoint_type": null, "service_kind": "triton", "streaming": true, "u": null, "input_dataset": null, - "input_file": null, "num_prompts": 100, "output_tokens_mean": -1, "output_tokens_mean_deterministic": false, @@ -257,7 +258,7 @@ def test_generate_json(self, monkeypatch) -> None: "artifact_dir": "artifacts/gpt2_vllm-triton-vllm-concurrency1", "tokenizer": "hf-internal-testing/llama-tokenizer", "verbose": false, - "subcommand": null, + "subcommand": "profile", "prompt_source": "synthetic", "extra_inputs": { "max_tokens": 256, diff --git a/genai-perf/tests/test_llm_inputs_embeddings.py b/genai-perf/tests/test_llm_inputs_embeddings.py new file mode 100644 index 00000000..0cefa38a --- /dev/null +++ b/genai-perf/tests/test_llm_inputs_embeddings.py @@ -0,0 +1,172 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from pathlib import Path +from unittest.mock import mock_open, patch + +import pytest +from genai_perf.llm_inputs.llm_inputs import LlmInputs, ModelSelectionStrategy + + +class TestLlmInputsEmbeddings: + @patch("pathlib.Path.exists", return_value=True) + @patch( + "builtins.open", + new_callable=mock_open, + read_data="\n".join( + [ + '{"text": "What production company co-owned by Kevin Loader and Rodger Michell produced My Cousin Rachel?"}', + '{"text": "Who served as the 1st Vice President of Colombia under El Libertador?"}', + '{"text": "Are the Barton Mine and Hermiston-McCauley Mine located in The United States of America?"}', + '{"text": "what state did they film daddy\'s home 2"}', + ] + ), + ) + def test_get_input_dataset_from_embeddings_file(self, mock_file, mock_exists): + input_filename = Path("embeddings.jsonl") + batch_size = 3 + dataset = LlmInputs._get_input_dataset_from_embeddings_file( + input_filename, batch_size, num_prompts=100 + ) + + assert dataset is not None + assert len(dataset["rows"]) == 100 + for row in dataset["rows"]: + assert "row" in row + assert "payload" in row["row"] + payload = row["row"]["payload"] + assert "input" in payload + assert isinstance(payload["input"], list) + assert len(payload["input"]) == batch_size + + # Try error case where batch size is larger than the number of available texts + with pytest.raises( + ValueError, + match="Batch size cannot be larger than the number of available texts", + ): + LlmInputs._get_input_dataset_from_embeddings_file( + input_filename, 5, num_prompts=10 + ) + + def test_convert_generic_json_to_openai_embeddings_format(self): + generic_dataset = { + "rows": [ + {"payload": {"input": ["text 1", "text 2"]}}, + {"payload": {"input": ["text 3", "text 4"]}}, + ] + } + + expected_result = { + "data": [ + { + "payload": [ + { + "input": ["text 1", "text 2"], + "model": "test_model", + } + ] + }, + { + "payload": [ + { + "input": ["text 3", "text 4"], + "model": "test_model", + } + ] + }, + ] + } + + result = LlmInputs._convert_generic_json_to_openai_embeddings_format( + generic_dataset, + extra_inputs={}, + model_name=["test_model"], + model_selection_strategy=ModelSelectionStrategy.ROUND_ROBIN, + ) + + assert result is not None + assert "data" in result + assert len(result["data"]) == len(expected_result["data"]) + + for i, item in enumerate(expected_result["data"]): + assert "payload" in result["data"][i] + assert result["data"][i]["payload"] == item["payload"] + + def test_convert_generic_json_to_openai_embeddings_format_with_extra_inputs(self): + generic_dataset = { + "rows": [ + {"payload": {"input": ["text 1", "text 2"]}}, + {"payload": {"input": ["text 3", "text 4"]}}, + ] + } + + extra_inputs = { + "encoding_format": "base64", + "truncate": "END", + "additional_key": "additional_value", + } + + expected_result = { + "data": [ + { + "payload": [ + { + "input": ["text 1", "text 2"], + "model": "test_model", + "encoding_format": "base64", + "truncate": "END", + "additional_key": "additional_value", + } + ] + }, + { + "payload": [ + { + "input": ["text 3", "text 4"], + "model": "test_model", + "encoding_format": "base64", + "truncate": "END", + "additional_key": "additional_value", + } + ] + }, + ] + } + + result = LlmInputs._convert_generic_json_to_openai_embeddings_format( + generic_dataset, + extra_inputs=extra_inputs, + model_name=["test_model"], + model_selection_strategy=ModelSelectionStrategy.ROUND_ROBIN, + ) + + assert result is not None + assert "data" in result + assert len(result["data"]) == 
len(expected_result["data"]) + + for i, item in enumerate(expected_result["data"]): + assert "payload" in result["data"][i] + assert result["data"][i]["payload"] == item["payload"] diff --git a/genai-perf/tests/test_llm_inputs_rankings.py b/genai-perf/tests/test_llm_inputs_rankings.py new file mode 100644 index 00000000..bfe2be48 --- /dev/null +++ b/genai-perf/tests/test_llm_inputs_rankings.py @@ -0,0 +1,182 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from pathlib import Path +from unittest.mock import mock_open, patch + +import pytest +from genai_perf.llm_inputs.llm_inputs import LlmInputs, ModelSelectionStrategy + + +class TestLlmInputsRankings: + + def open_side_effects(filepath, *args, **kwargs): + queries_content = "\n".join( + [ + '{"text": "What production company co-owned by Kevin Loader and Rodger Michell produced My Cousin Rachel?"}', + '{"text": "Who served as the 1st Vice President of Colombia under El Libertador?"}', + '{"text": "Are the Barton Mine and Hermiston-McCauley Mine located in The United States of America?"}', + ] + ) + passages_content = "\n".join( + [ + '{"text": "Eric Anderson (sociologist) Eric Anderson (born January 18, 1968) is an American sociologist"}', + '{"text": "Kevin Loader is a British film and television producer. 
"}', + '{"text": "Barton Mine, also known as Net Lake Mine, is an abandoned surface and underground mine in Northeastern Ontario"}', + ] + ) + + file_contents = { + "queries.jsonl": queries_content, + "passages.jsonl": passages_content, + } + return mock_open( + read_data=file_contents.get(filepath, file_contents["queries.jsonl"]) + )() + + mock_open_obj = mock_open() + mock_open_obj.side_effect = open_side_effects + + @patch("pathlib.Path.exists", return_value=True) + @patch("builtins.open", mock_open_obj) + def test_get_input_dataset_from_rankings_file(self, mock_file): + queries_filename = Path("queries.jsonl") + passages_filename = Path("passages.jsonl") + batch_size = 2 + dataset = LlmInputs._get_input_dataset_from_rankings_files( + queries_filename, passages_filename, batch_size, num_prompts=100 + ) + + assert dataset is not None + assert len(dataset["rows"]) == 100 + for row in dataset["rows"]: + assert "row" in row + assert "payload" in row["row"] + payload = row["row"]["payload"] + assert "query" in payload + assert "passages" in payload + assert isinstance(payload["passages"], list) + assert len(payload["passages"]) == batch_size + + # Try error case where batch size is larger than the number of available texts + with pytest.raises( + ValueError, + match="Batch size cannot be larger than the number of available passages", + ): + LlmInputs._get_input_dataset_from_rankings_files( + queries_filename, passages_filename, 5, num_prompts=10 + ) + + def test_convert_generic_json_to_openai_rankings_format(self): + generic_dataset = { + "rows": [ + { + "payload": { + "query": {"text": "1"}, + "passages": [{"text": "2"}, {"text": "3"}, {"text": "4"}], + } + } + ] + } + + expected_result = { + "data": [ + { + "payload": [ + { + "query": {"text": "1"}, + "passages": [{"text": "2"}, {"text": "3"}, {"text": "4"}], + "model": "test_model", + } + ] + } + ] + } + + result = LlmInputs._convert_generic_json_to_rankings_format( + generic_dataset, + extra_inputs={}, + model_name=["test_model"], + model_selection_strategy=ModelSelectionStrategy.ROUND_ROBIN, + ) + + assert result is not None + assert "data" in result + assert len(result["data"]) == len(expected_result["data"]) + + for i, item in enumerate(expected_result["data"]): + assert "payload" in result["data"][i] + assert result["data"][i]["payload"] == item["payload"] + + def test_convert_generic_json_to_openai_rankings_format_with_extra_inputs(self): + generic_dataset = { + "rows": [ + { + "payload": { + "query": {"text": "1"}, + "passages": [{"text": "2"}, {"text": "3"}, {"text": "4"}], + } + } + ] + } + + extra_inputs = { + "encoding_format": "base64", + "truncate": "END", + "additional_key": "additional_value", + } + + expected_result = { + "data": [ + { + "payload": [ + { + "query": {"text": "1"}, + "passages": [{"text": "2"}, {"text": "3"}, {"text": "4"}], + "model": "test_model", + "encoding_format": "base64", + "truncate": "END", + "additional_key": "additional_value", + } + ] + } + ] + } + + result = LlmInputs._convert_generic_json_to_rankings_format( + generic_dataset, + extra_inputs=extra_inputs, + model_name=["test_model"], + model_selection_strategy=ModelSelectionStrategy.ROUND_ROBIN, + ) + + assert result is not None + assert "data" in result + assert len(result["data"]) == len(expected_result["data"]) + + for i, item in enumerate(expected_result["data"]): + assert "payload" in result["data"][i] + assert result["data"][i]["payload"] == item["payload"] diff --git a/genai-perf/tests/test_llm_metrics.py 
b/genai-perf/tests/test_llm_metrics.py index d221b759..05de5b12 100644 --- a/genai-perf/tests/test_llm_metrics.py +++ b/genai-perf/tests/test_llm_metrics.py @@ -24,394 +24,57 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import json -from io import StringIO -from pathlib import Path -from typing import Any, List, Union - -import numpy as np import pytest -from genai_perf.llm_metrics import LLMMetrics, LLMProfileDataParser, ResponseFormat -from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer - - -def ns_to_sec(ns: int) -> Union[int, float]: - """Convert from nanosecond to second.""" - return ns / 1e9 - - -class TestLLMProfileDataParser: - @pytest.fixture - def mock_read_write(self, monkeypatch: pytest.MonkeyPatch) -> List[str]: - """ - This function will mock the open function for specific files: - - - For "triton_profile_export.json", it will read and return the - contents of self.triton_profile_data - - For "openai_profile_export.json", it will read and return the - contents of self.openai_profile_data - - For "profile_export.csv", it will capture all data written to - the file, and return it as the return value of this function - - For all other files, it will behave like the normal open function - """ - - written_data = [] - - original_open = open - - def custom_open(filename, *args, **kwargs): - def write(self: Any, content: str) -> int: - written_data.append(content) - return len(content) - - if filename == "triton_profile_export.json": - tmp_file = StringIO(json.dumps(self.triton_profile_data)) - return tmp_file - elif filename == "openai_profile_export.json": - tmp_file = StringIO(json.dumps(self.openai_profile_data)) - return tmp_file - elif filename == "empty_profile_export.json": - tmp_file = StringIO(json.dumps(self.empty_profile_data)) - return tmp_file - elif filename == "profile_export.csv": - tmp_file = StringIO() - tmp_file.write = write.__get__(tmp_file) - return tmp_file - else: - return original_open(filename, *args, **kwargs) - - monkeypatch.setattr("builtins.open", custom_open) - - return written_data - - def test_triton_llm_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None: - """Collect LLM metrics from profile export data and check values. 
- - Metrics - * time to first tokens - - experiment 1: [3 - 1, 4 - 2] = [2, 2] - - experiment 2: [7 - 5, 6 - 3] = [2, 3] - * inter token latencies - - experiment 1: [((8 - 1) - 2)/(3 - 1), ((11 - 2) - 2)/(6 - 1)] - : [2.5, 1.4] - : [2, 1] # rounded - - experiment 2: [((18 - 5) - 2)/(4 - 1), ((11 - 3) - 3)/(6 - 1)] - : [11/3, 1] - : [4, 1] # rounded - * output token throughputs per request - - experiment 1: [3/(8 - 1), 6/(11 - 2)] = [3/7, 6/9] - - experiment 2: [4/(18 - 5), 6/(11 - 3)] = [4/13, 6/8] - * output token throughputs - - experiment 1: [(3 + 6)/(11 - 1)] = [9/10] - - experiment 2: [(4 + 6)/(18 - 3)] = [2/3] - * output sequence lengths - - experiment 1: [3, 6] - - experiment 2: [4, 6] - * input sequence lengths - - experiment 1: [3, 4] - - experiment 2: [3, 4] - """ - tokenizer = get_tokenizer(DEFAULT_TOKENIZER) - pd = LLMProfileDataParser( - filename=Path("triton_profile_export.json"), - tokenizer=tokenizer, - ) - - # experiment 1 metrics & statistics - stat_obj = pd.get_statistics(infer_mode="concurrency", load_level="10") - metrics = stat_obj.metrics - stat = stat_obj.stats_dict - - assert isinstance(metrics, LLMMetrics) - - assert metrics.time_to_first_tokens == [2, 2] - assert metrics.inter_token_latencies == [2, 1] - ottpr = [3 / ns_to_sec(7), 6 / ns_to_sec(9)] - assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) - ott = [9 / ns_to_sec(10)] - assert metrics.output_token_throughputs == pytest.approx(ott) - assert metrics.output_sequence_lengths == [3, 6] - assert metrics.input_sequence_lengths == [3, 4] - - # Disable Pylance warnings for dynamically set attributes due to Statistics - # not having strict attributes listed. - assert stat["time_to_first_token"]["avg"] == 2 # type: ignore - assert stat["inter_token_latency"]["avg"] == 1.5 # type: ignore - assert stat["output_token_throughput_per_request"]["avg"] == pytest.approx( # type: ignore - np.mean(ottpr) - ) - assert stat["output_sequence_length"]["avg"] == 4.5 # type: ignore - assert stat["input_sequence_length"]["avg"] == 3.5 # type: ignore - - assert stat["time_to_first_token"]["p50"] == 2 # type: ignore - assert stat["inter_token_latency"]["p50"] == 1.5 # type: ignore - assert stat["output_token_throughput_per_request"]["p50"] == pytest.approx( # type: ignore - np.percentile(ottpr, 50) - ) - assert stat["output_sequence_length"]["p50"] == 4.5 # type: ignore - assert stat["input_sequence_length"]["p50"] == 3.5 # type: ignore +from genai_perf.metrics import LLMMetrics - assert stat["time_to_first_token"]["min"] == 2 # type: ignore - assert stat["inter_token_latency"]["min"] == 1 # type: ignore - min_ottpr = 3 / ns_to_sec(7) - assert stat["output_token_throughput_per_request"]["min"] == pytest.approx(min_ottpr) # type: ignore - assert stat["output_sequence_length"]["min"] == 3 # type: ignore - assert stat["input_sequence_length"]["min"] == 3 # type: ignore - assert stat["time_to_first_token"]["max"] == 2 # type: ignore - assert stat["inter_token_latency"]["max"] == 2 # type: ignore - max_ottpr = 6 / ns_to_sec(9) - assert stat["output_token_throughput_per_request"]["max"] == pytest.approx(max_ottpr) # type: ignore - assert stat["output_sequence_length"]["max"] == 6 # type: ignore - assert stat["input_sequence_length"]["max"] == 4 # type: ignore +class TestLLMMetrics: - assert stat["time_to_first_token"]["std"] == np.std([2, 2]) # type: ignore - assert stat["inter_token_latency"]["std"] == np.std([2, 1]) # type: ignore - assert stat["output_token_throughput_per_request"]["std"] == pytest.approx( # 
type: ignore - np.std(ottpr) - ) - assert stat["output_sequence_length"]["std"] == np.std([3, 6]) # type: ignore - assert stat["input_sequence_length"]["std"] == np.std([3, 4]) # type: ignore - - oott = 9 / ns_to_sec(10) - assert stat["output_token_throughput"]["avg"] == pytest.approx(oott) # type: ignore - - # experiment 2 statistics - stat_obj = pd.get_statistics(infer_mode="request_rate", load_level="2.0") - metrics = stat_obj.metrics - stat = stat_obj.stats_dict - assert isinstance(metrics, LLMMetrics) - - assert metrics.time_to_first_tokens == [2, 3] - assert metrics.inter_token_latencies == [4, 1] - ottpr = [4 / ns_to_sec(13), 6 / ns_to_sec(8)] - assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) - ott = [2 / ns_to_sec(3)] - assert metrics.output_token_throughputs == pytest.approx(ott) - assert metrics.output_sequence_lengths == [4, 6] - assert metrics.input_sequence_lengths == [3, 4] - - assert stat["time_to_first_token"]["avg"] == pytest.approx(2.5) # type: ignore - assert stat["inter_token_latency"]["avg"] == pytest.approx(2.5) # type: ignore - assert stat["output_token_throughput_per_request"]["avg"] == pytest.approx( # type: ignore - np.mean(ottpr) - ) - assert stat["output_sequence_length"]["avg"] == 5 # type: ignore - assert stat["input_sequence_length"]["avg"] == 3.5 # type: ignore - - assert stat["time_to_first_token"]["p50"] == pytest.approx(2.5) # type: ignore - assert stat["inter_token_latency"]["p50"] == pytest.approx(2.5) # type: ignore - assert stat["output_token_throughput_per_request"]["p50"] == pytest.approx( # type: ignore - np.percentile(ottpr, 50) - ) - assert stat["output_sequence_length"]["p50"] == 5 # type: ignore - assert stat["input_sequence_length"]["p50"] == 3.5 # type: ignore - - assert stat["time_to_first_token"]["min"] == pytest.approx(2) # type: ignore - assert stat["inter_token_latency"]["min"] == pytest.approx(1) # type: ignore - min_ottpr = 4 / ns_to_sec(13) - assert stat["output_token_throughput_per_request"]["min"] == pytest.approx(min_ottpr) # type: ignore - assert stat["output_sequence_length"]["min"] == 4 # type: ignore - assert stat["input_sequence_length"]["min"] == 3 # type: ignore - - assert stat["time_to_first_token"]["max"] == pytest.approx(3) # type: ignore - assert stat["inter_token_latency"]["max"] == pytest.approx(4) # type: ignore - max_ottpr = 6 / ns_to_sec(8) - assert stat["output_token_throughput_per_request"]["max"] == pytest.approx(max_ottpr) # type: ignore - assert stat["output_sequence_length"]["max"] == 6 # type: ignore - assert stat["input_sequence_length"]["max"] == 4 # type: ignore - - assert stat["time_to_first_token"]["std"] == np.std([2, 3]) * (1) # type: ignore - assert stat["inter_token_latency"]["std"] == np.std([4, 1]) * (1) # type: ignore - assert stat["output_token_throughput_per_request"]["std"] == pytest.approx( # type: ignore - np.std(ottpr) - ) - assert stat["output_sequence_length"]["std"] == np.std([4, 6]) # type: ignore - assert stat["input_sequence_length"]["std"] == np.std([3, 4]) # type: ignore - - oott = 2 / ns_to_sec(3) - assert stat["output_token_throughput"]["avg"] == pytest.approx(oott) # type: ignore - - # check non-existing profile data - with pytest.raises(KeyError): - pd.get_statistics(infer_mode="concurrency", load_level="30") - - def test_openai_llm_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None: - """Collect LLM metrics from profile export data and check values. 
- - Metrics - * time to first tokens - - experiment 1: [5 - 1, 7 - 2] = [4, 5] - * inter token latencies - - experiment 1: [((12 - 1) - 4)/(3 - 1), ((15 - 2) - 5)/(6 - 1)] - : [3.5, 1.6] - : [4, 2] # rounded - * output token throughputs per request - - experiment 1: [3/(12 - 1), 6/(15 - 2)] = [3/11, 6/13] - * output token throughputs - - experiment 1: [(3 + 6)/(15 - 1)] = [9/14] - * output sequence lengths - - experiment 1: [3, 6] - * input sequence lengths - - experiment 1: [3, 4] - """ - tokenizer = get_tokenizer(DEFAULT_TOKENIZER) - pd = LLMProfileDataParser( - filename=Path("openai_profile_export.json"), - tokenizer=tokenizer, - ) - - # experiment 1 statistics - stat_obj = pd.get_statistics(infer_mode="concurrency", load_level="10") - metrics = stat_obj.metrics - stat = stat_obj.stats_dict - assert isinstance(metrics, LLMMetrics) - - assert metrics.time_to_first_tokens == [4, 5] - assert metrics.inter_token_latencies == [4, 2] - ottpr = [3 / ns_to_sec(11), 6 / ns_to_sec(13)] - assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) - ott = [9 / ns_to_sec(14)] - assert metrics.output_token_throughputs == pytest.approx(ott) - assert metrics.output_sequence_lengths == [3, 6] - assert metrics.input_sequence_lengths == [3, 4] - - assert stat["time_to_first_token"]["avg"] == pytest.approx(4.5) # type: ignore - assert stat["inter_token_latency"]["avg"] == pytest.approx(3) # type: ignore - assert stat["output_token_throughput_per_request"]["avg"] == pytest.approx( # type: ignore - np.mean(ottpr) - ) - assert stat["output_sequence_length"]["avg"] == 4.5 # type: ignore - assert stat["input_sequence_length"]["avg"] == 3.5 # type: ignore - - assert stat["time_to_first_token"]["p50"] == pytest.approx(4.5) # type: ignore - assert stat["inter_token_latency"]["p50"] == pytest.approx(3) # type: ignore - assert stat["output_token_throughput_per_request"]["p50"] == pytest.approx( # type: ignore - np.percentile(ottpr, 50) - ) - assert stat["output_sequence_length"]["p50"] == 4.5 # type: ignore - assert stat["input_sequence_length"]["p50"] == 3.5 # type: ignore - - assert stat["time_to_first_token"]["min"] == pytest.approx(4) # type: ignore - assert stat["inter_token_latency"]["min"] == pytest.approx(2) # type: ignore - min_ottpr = 3 / ns_to_sec(11) - assert stat["output_token_throughput_per_request"]["min"] == pytest.approx(min_ottpr) # type: ignore - assert stat["output_sequence_length"]["min"] == 3 # type: ignore - assert stat["input_sequence_length"]["min"] == 3 # type: ignore - - assert stat["time_to_first_token"]["max"] == pytest.approx(5) # type: ignore - assert stat["inter_token_latency"]["max"] == pytest.approx(4) # type: ignore - max_ottpr = 6 / ns_to_sec(13) - assert stat["output_token_throughput_per_request"]["max"] == pytest.approx(max_ottpr) # type: ignore - assert stat["output_sequence_length"]["max"] == 6 # type: ignore - assert stat["input_sequence_length"]["max"] == 4 # type: ignore - - assert stat["time_to_first_token"]["std"] == np.std([4, 5]) * (1) # type: ignore - assert stat["inter_token_latency"]["std"] == np.std([4, 2]) * (1) # type: ignore - assert stat["output_token_throughput_per_request"]["std"] == pytest.approx( # type: ignore - np.std(ottpr) - ) - assert stat["output_sequence_length"]["std"] == np.std([3, 6]) # type: ignore - assert stat["input_sequence_length"]["std"] == np.std([3, 4]) # type: ignore - - oott = 9 / ns_to_sec(14) - assert stat["output_token_throughput"]["avg"] == pytest.approx(oott) # type: ignore - - # check non-existing profile data - 
with pytest.raises(KeyError): - pd.get_statistics(infer_mode="concurrency", load_level="40") - - def test_merged_sse_response(self, mock_read_write: pytest.MonkeyPatch) -> None: - """Test merging the multiple sse response.""" - res_timestamps = [0, 1, 2, 3] - res_outputs = [ - { - "response": 'data: {"choices":[{"delta":{"content":"aaa"}}],"object":"chat.completion.chunk"}\n\n' - }, - { - "response": ( - 'data: {"choices":[{"delta":{"content":"abc"}}],"object":"chat.completion.chunk"}\n\n' - 'data: {"choices":[{"delta":{"content":"1234"}}],"object":"chat.completion.chunk"}\n\n' - 'data: {"choices":[{"delta":{"content":"helloworld"}}],"object":"chat.completion.chunk"}\n\n' - ) - }, - {"response": "data: [DONE]\n\n"}, - ] - expected_response = '{"choices": [{"delta": {"content": "abc1234helloworld"}}], "object": "chat.completion.chunk"}' - - tokenizer = get_tokenizer(DEFAULT_TOKENIZER) - pd = LLMProfileDataParser( - filename=Path("openai_profile_export.json"), - tokenizer=tokenizer, - ) - - pd._preprocess_response(res_timestamps, res_outputs) - assert res_outputs[1]["response"] == expected_response - - def test_openai_output_token_counts( - self, mock_read_write: pytest.MonkeyPatch - ) -> None: - output_texts = [ - "Ad", - "idas", - " Orig", - "inals", - " are", - " now", - " available", - " in", - " more", - " than", - ] - res_outputs = [] - for text in output_texts: - response = f'data: {{"choices":[{{"delta":{{"content":"{text}"}}}}],"object":"chat.completion.chunk"}}\n\n' - res_outputs.append({"response": response}) - - tokenizer = get_tokenizer(DEFAULT_TOKENIZER) - pd = LLMProfileDataParser( - filename=Path("openai_profile_export.json"), - tokenizer=tokenizer, - ) - - output_token_counts, total_output_token = pd._get_output_token_counts( - res_outputs - ) - assert output_token_counts == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] # total 10 - assert total_output_token == 9 - assert total_output_token != sum(output_token_counts) - - def test_triton_output_token_counts( - self, mock_read_write: pytest.MonkeyPatch - ) -> None: - output_texts = [ - "Ad", - "idas", - " Orig", - "inals", - " are", - " now", - " available", - " in", - " more", - " than", - ] - res_outputs = [] - for text in output_texts: - res_outputs.append({"text_output": text}) - - tokenizer = get_tokenizer(DEFAULT_TOKENIZER) - pd = LLMProfileDataParser( - filename=Path("triton_profile_export.json"), - tokenizer=tokenizer, + def test_llm_metric_request_metrics(self) -> None: + """Test request_metrics property.""" + m = LLMMetrics( + request_throughputs=[10.12, 11.33], + request_latencies=[3, 44], + time_to_first_tokens=[1, 2, 3], + inter_token_latencies=[4, 5], + output_token_throughputs=[22.13, 9423.02], + output_token_throughputs_per_request=[7, 8, 9], + output_sequence_lengths=[3, 4], + input_sequence_lengths=[12, 34], ) - - output_token_counts, total_output_token = pd._get_output_token_counts( - res_outputs + req_metrics = m.request_metrics + assert len(req_metrics) == 6 + assert req_metrics[0].name == "time_to_first_token" + assert req_metrics[0].unit == "ms" + assert req_metrics[1].name == "inter_token_latency" + assert req_metrics[1].unit == "ms" + assert req_metrics[2].name == "request_latency" + assert req_metrics[2].unit == "ms" + assert req_metrics[3].name == "output_token_throughput_per_request" + assert req_metrics[3].unit == "tokens/sec" + assert req_metrics[4].name == "output_sequence_length" + assert req_metrics[4].unit == "tokens" + assert req_metrics[5].name == "input_sequence_length" + assert req_metrics[5].unit == 
"tokens" + + def test_llm_metric_system_metrics(self) -> None: + """Test system_metrics property.""" + m = LLMMetrics( + request_throughputs=[10.12, 11.33], + request_latencies=[3, 44], + time_to_first_tokens=[1, 2, 3], + inter_token_latencies=[4, 5], + output_token_throughputs=[22.13, 9423.02], + output_token_throughputs_per_request=[7, 8, 9], + output_sequence_lengths=[3, 4], + input_sequence_lengths=[12, 34], ) - assert output_token_counts == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] # total 10 - assert total_output_token == 9 - assert total_output_token != sum(output_token_counts) + sys_metrics = m.system_metrics + assert len(sys_metrics) == 2 + assert sys_metrics[0].name == "output_token_throughput" + assert sys_metrics[0].unit == "per sec" + assert sys_metrics[1].name == "request_throughput" + assert sys_metrics[1].unit == "per sec" def test_llm_metrics_get_base_name(self) -> None: """Test get_base_name method in LLMMetrics class.""" @@ -440,175 +103,3 @@ def test_llm_metrics_get_base_name(self) -> None: ) with pytest.raises(KeyError): metrics.get_base_name("hello1234") - - def test_empty_response(self, mock_read_write: pytest.MonkeyPatch) -> None: - """Check if it handles all empty responses.""" - tokenizer = get_tokenizer(DEFAULT_TOKENIZER) - - # Should not throw error - _ = LLMProfileDataParser( - filename=Path("empty_profile_export.json"), - tokenizer=tokenizer, - ) - - empty_profile_data = { - "service_kind": "openai", - "endpoint": "v1/chat/completions", - "experiments": [ - { - "experiment": { - "mode": "concurrency", - "value": 10, - }, - "requests": [ - { - "timestamp": 1, - "request_inputs": { - "payload": '{"messages":[{"role":"user","content":"This is test"}],"model":"llama-2-7b","stream":true}', - }, - "response_timestamps": [3, 5, 8], - "response_outputs": [ - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":""},"finish_reason":null}]}\n\n' - }, - {"response": "data: [DONE]\n\n"}, - ], - }, - ], - }, - ], - } - - openai_profile_data = { - "service_kind": "openai", - "endpoint": "v1/chat/completions", - "experiments": [ - { - "experiment": { - "mode": "concurrency", - "value": 10, - }, - "requests": [ - { - "timestamp": 1, - "request_inputs": { - "payload": '{"messages":[{"role":"user","content":"This is test"}],"model":"llama-2-7b","stream":true}', - }, - # the first, and the last two responses will be ignored because they have no "content" - "response_timestamps": [3, 5, 8, 12, 13, 14], - "response_outputs": [ - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"I"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":" like"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":" dogs"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: 
{"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{},"finish_reason":null}]}\n\n' - }, - {"response": "data: [DONE]\n\n"}, - ], - }, - { - "timestamp": 2, - "request_inputs": { - "payload": '{"messages":[{"role":"user","content":"This is test too"}],"model":"llama-2-7b","stream":true}', - }, - # the first, and the last two responses will be ignored because they have no "content" - "response_timestamps": [4, 7, 11, 15, 18, 19], - "response_outputs": [ - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"I"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"don\'t"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"cook food"},"finish_reason":null}]}\n\n' - }, - { - "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{},"finish_reason":null}]}\n\n' - }, - {"response": "data: [DONE]\n\n"}, - ], - }, - ], - }, - ], - } - - triton_profile_data = { - "service_kind": "triton", - "endpoint": "", - "experiments": [ - { - "experiment": { - "mode": "concurrency", - "value": 10, - }, - "requests": [ - { - "timestamp": 1, - "request_inputs": {"text_input": "This is test"}, - "response_timestamps": [3, 5, 8], - "response_outputs": [ - {"text_output": "I"}, - {"text_output": " like"}, - {"text_output": " dogs"}, - ], - }, - { - "timestamp": 2, - "request_inputs": {"text_input": "This is test too"}, - "response_timestamps": [4, 7, 11], - "response_outputs": [ - {"text_output": "I"}, - {"text_output": " don't"}, - {"text_output": " cook food"}, - ], - }, - ], - }, - { - "experiment": { - "mode": "request_rate", - "value": 2.0, - }, - "requests": [ - { - "timestamp": 5, - "request_inputs": {"text_input": "This is test"}, - "response_timestamps": [7, 8, 13, 18], - "response_outputs": [ - {"text_output": "cat"}, - {"text_output": " is"}, - {"text_output": " cool"}, - {"text_output": " too"}, - ], - }, - { - "timestamp": 3, - "request_inputs": {"text_input": "This is test too"}, - "response_timestamps": [6, 8, 11], - "response_outputs": [ - {"text_output": "it's"}, - {"text_output": " very"}, - {"text_output": " simple work"}, - ], - }, - ], - }, - ], - } diff --git a/genai-perf/tests/test_llm_profile_data_parser.py b/genai-perf/tests/test_llm_profile_data_parser.py new file mode 100644 index 00000000..75976189 --- /dev/null +++ b/genai-perf/tests/test_llm_profile_data_parser.py @@ -0,0 +1,587 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +from io import StringIO +from pathlib import Path +from typing import Any, List, Union + +import numpy as np +import pytest +from genai_perf.metrics import LLMMetrics +from genai_perf.profile_data_parser import LLMProfileDataParser +from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer + + +def ns_to_sec(ns: int) -> Union[int, float]: + """Convert from nanosecond to second.""" + return ns / 1e9 + + +class TestLLMProfileDataParser: + @pytest.fixture + def mock_read_write(self, monkeypatch: pytest.MonkeyPatch) -> List[str]: + """ + This function will mock the open function for specific files: + + - For "triton_profile_export.json", it will read and return the + contents of self.triton_profile_data + - For "openai_profile_export.json", it will read and return the + contents of self.openai_profile_data + - For "profile_export.csv", it will capture all data written to + the file, and return it as the return value of this function + - For all other files, it will behave like the normal open function + """ + + written_data = [] + + original_open = open + + def custom_open(filename, *args, **kwargs): + def write(self: Any, content: str) -> int: + written_data.append(content) + return len(content) + + if filename == "triton_profile_export.json": + tmp_file = StringIO(json.dumps(self.triton_profile_data)) + return tmp_file + elif filename == "openai_profile_export.json": + tmp_file = StringIO(json.dumps(self.openai_profile_data)) + return tmp_file + elif filename == "empty_profile_export.json": + tmp_file = StringIO(json.dumps(self.empty_profile_data)) + return tmp_file + elif filename == "profile_export.csv": + tmp_file = StringIO() + tmp_file.write = write.__get__(tmp_file) + return tmp_file + else: + return original_open(filename, *args, **kwargs) + + monkeypatch.setattr("builtins.open", custom_open) + + return written_data + + def test_triton_llm_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Collect LLM metrics from profile export data and check values. 
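+        (All fixture timestamps are in nanoseconds; the expected values below
+        are derived from self.triton_profile_data, which contains one
+        concurrency=10 experiment and one request_rate=2.0 experiment.)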
+ + Metrics + * time to first tokens + - experiment 1: [3 - 1, 4 - 2] = [2, 2] + - experiment 2: [7 - 5, 6 - 3] = [2, 3] + * inter token latencies + - experiment 1: [((8 - 1) - 2)/(3 - 1), ((11 - 2) - 2)/(6 - 1)] + : [2.5, 1.4] + : [2, 1] # rounded + - experiment 2: [((18 - 5) - 2)/(4 - 1), ((11 - 3) - 3)/(6 - 1)] + : [11/3, 1] + : [4, 1] # rounded + * output token throughputs per request + - experiment 1: [3/(8 - 1), 6/(11 - 2)] = [3/7, 6/9] + - experiment 2: [4/(18 - 5), 6/(11 - 3)] = [4/13, 6/8] + * output token throughputs + - experiment 1: [(3 + 6)/(11 - 1)] = [9/10] + - experiment 2: [(4 + 6)/(18 - 3)] = [2/3] + * output sequence lengths + - experiment 1: [3, 6] + - experiment 2: [4, 6] + * input sequence lengths + - experiment 1: [3, 4] + - experiment 2: [3, 4] + """ + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("triton_profile_export.json"), + tokenizer=tokenizer, + ) + + # experiment 1 metrics & statistics + stat_obj = pd.get_statistics(infer_mode="concurrency", load_level="10") + metrics = stat_obj.metrics + stat = stat_obj.stats_dict + + assert isinstance(metrics, LLMMetrics) + + assert metrics.time_to_first_tokens == [2, 2] + assert metrics.inter_token_latencies == [2, 1] + ottpr = [3 / ns_to_sec(7), 6 / ns_to_sec(9)] + assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) + ott = [9 / ns_to_sec(10)] + assert metrics.output_token_throughputs == pytest.approx(ott) + assert metrics.output_sequence_lengths == [3, 6] + assert metrics.input_sequence_lengths == [3, 4] + + # Disable Pylance warnings for dynamically set attributes due to Statistics + # not having strict attributes listed. + assert stat["time_to_first_token"]["avg"] == 2 # type: ignore + assert stat["inter_token_latency"]["avg"] == 1.5 # type: ignore + assert stat["output_token_throughput_per_request"]["avg"] == pytest.approx( # type: ignore + np.mean(ottpr) + ) + assert stat["output_sequence_length"]["avg"] == 4.5 # type: ignore + assert stat["input_sequence_length"]["avg"] == 3.5 # type: ignore + + assert stat["time_to_first_token"]["p50"] == 2 # type: ignore + assert stat["inter_token_latency"]["p50"] == 1.5 # type: ignore + assert stat["output_token_throughput_per_request"]["p50"] == pytest.approx( # type: ignore + np.percentile(ottpr, 50) + ) + assert stat["output_sequence_length"]["p50"] == 4.5 # type: ignore + assert stat["input_sequence_length"]["p50"] == 3.5 # type: ignore + + assert stat["time_to_first_token"]["min"] == 2 # type: ignore + assert stat["inter_token_latency"]["min"] == 1 # type: ignore + min_ottpr = 3 / ns_to_sec(7) + assert stat["output_token_throughput_per_request"]["min"] == pytest.approx(min_ottpr) # type: ignore + assert stat["output_sequence_length"]["min"] == 3 # type: ignore + assert stat["input_sequence_length"]["min"] == 3 # type: ignore + + assert stat["time_to_first_token"]["max"] == 2 # type: ignore + assert stat["inter_token_latency"]["max"] == 2 # type: ignore + max_ottpr = 6 / ns_to_sec(9) + assert stat["output_token_throughput_per_request"]["max"] == pytest.approx(max_ottpr) # type: ignore + assert stat["output_sequence_length"]["max"] == 6 # type: ignore + assert stat["input_sequence_length"]["max"] == 4 # type: ignore + + assert stat["time_to_first_token"]["std"] == np.std([2, 2]) # type: ignore + assert stat["inter_token_latency"]["std"] == np.std([2, 1]) # type: ignore + assert stat["output_token_throughput_per_request"]["std"] == pytest.approx( # type: ignore + np.std(ottpr) + ) + assert 
stat["output_sequence_length"]["std"] == np.std([3, 6]) # type: ignore + assert stat["input_sequence_length"]["std"] == np.std([3, 4]) # type: ignore + + oott = 9 / ns_to_sec(10) + assert stat["output_token_throughput"]["avg"] == pytest.approx(oott) # type: ignore + + # experiment 2 statistics + stat_obj = pd.get_statistics(infer_mode="request_rate", load_level="2.0") + metrics = stat_obj.metrics + stat = stat_obj.stats_dict + assert isinstance(metrics, LLMMetrics) + + assert metrics.time_to_first_tokens == [2, 3] + assert metrics.inter_token_latencies == [4, 1] + ottpr = [4 / ns_to_sec(13), 6 / ns_to_sec(8)] + assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) + ott = [2 / ns_to_sec(3)] + assert metrics.output_token_throughputs == pytest.approx(ott) + assert metrics.output_sequence_lengths == [4, 6] + assert metrics.input_sequence_lengths == [3, 4] + + assert stat["time_to_first_token"]["avg"] == pytest.approx(2.5) # type: ignore + assert stat["inter_token_latency"]["avg"] == pytest.approx(2.5) # type: ignore + assert stat["output_token_throughput_per_request"]["avg"] == pytest.approx( # type: ignore + np.mean(ottpr) + ) + assert stat["output_sequence_length"]["avg"] == 5 # type: ignore + assert stat["input_sequence_length"]["avg"] == 3.5 # type: ignore + + assert stat["time_to_first_token"]["p50"] == pytest.approx(2.5) # type: ignore + assert stat["inter_token_latency"]["p50"] == pytest.approx(2.5) # type: ignore + assert stat["output_token_throughput_per_request"]["p50"] == pytest.approx( # type: ignore + np.percentile(ottpr, 50) + ) + assert stat["output_sequence_length"]["p50"] == 5 # type: ignore + assert stat["input_sequence_length"]["p50"] == 3.5 # type: ignore + + assert stat["time_to_first_token"]["min"] == pytest.approx(2) # type: ignore + assert stat["inter_token_latency"]["min"] == pytest.approx(1) # type: ignore + min_ottpr = 4 / ns_to_sec(13) + assert stat["output_token_throughput_per_request"]["min"] == pytest.approx(min_ottpr) # type: ignore + assert stat["output_sequence_length"]["min"] == 4 # type: ignore + assert stat["input_sequence_length"]["min"] == 3 # type: ignore + + assert stat["time_to_first_token"]["max"] == pytest.approx(3) # type: ignore + assert stat["inter_token_latency"]["max"] == pytest.approx(4) # type: ignore + max_ottpr = 6 / ns_to_sec(8) + assert stat["output_token_throughput_per_request"]["max"] == pytest.approx(max_ottpr) # type: ignore + assert stat["output_sequence_length"]["max"] == 6 # type: ignore + assert stat["input_sequence_length"]["max"] == 4 # type: ignore + + assert stat["time_to_first_token"]["std"] == np.std([2, 3]) * (1) # type: ignore + assert stat["inter_token_latency"]["std"] == np.std([4, 1]) * (1) # type: ignore + assert stat["output_token_throughput_per_request"]["std"] == pytest.approx( # type: ignore + np.std(ottpr) + ) + assert stat["output_sequence_length"]["std"] == np.std([4, 6]) # type: ignore + assert stat["input_sequence_length"]["std"] == np.std([3, 4]) # type: ignore + + oott = 2 / ns_to_sec(3) + assert stat["output_token_throughput"]["avg"] == pytest.approx(oott) # type: ignore + + # check non-existing profile data + with pytest.raises(KeyError): + pd.get_statistics(infer_mode="concurrency", load_level="30") + + def test_openai_llm_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Collect LLM metrics from profile export data and check values. 
+ + Metrics + * time to first tokens + - experiment 1: [5 - 1, 7 - 2] = [4, 5] + * inter token latencies + - experiment 1: [((12 - 1) - 4)/(3 - 1), ((15 - 2) - 5)/(6 - 1)] + : [3.5, 1.6] + : [4, 2] # rounded + * output token throughputs per request + - experiment 1: [3/(12 - 1), 6/(15 - 2)] = [3/11, 6/13] + * output token throughputs + - experiment 1: [(3 + 6)/(15 - 1)] = [9/14] + * output sequence lengths + - experiment 1: [3, 6] + * input sequence lengths + - experiment 1: [3, 4] + """ + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("openai_profile_export.json"), + tokenizer=tokenizer, + ) + + # experiment 1 statistics + stat_obj = pd.get_statistics(infer_mode="concurrency", load_level="10") + metrics = stat_obj.metrics + stat = stat_obj.stats_dict + assert isinstance(metrics, LLMMetrics) + + assert metrics.time_to_first_tokens == [4, 5] + assert metrics.inter_token_latencies == [4, 2] + ottpr = [3 / ns_to_sec(11), 6 / ns_to_sec(13)] + assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) + ott = [9 / ns_to_sec(14)] + assert metrics.output_token_throughputs == pytest.approx(ott) + assert metrics.output_sequence_lengths == [3, 6] + assert metrics.input_sequence_lengths == [3, 4] + + assert stat["time_to_first_token"]["avg"] == pytest.approx(4.5) # type: ignore + assert stat["inter_token_latency"]["avg"] == pytest.approx(3) # type: ignore + assert stat["output_token_throughput_per_request"]["avg"] == pytest.approx( # type: ignore + np.mean(ottpr) + ) + assert stat["output_sequence_length"]["avg"] == 4.5 # type: ignore + assert stat["input_sequence_length"]["avg"] == 3.5 # type: ignore + + assert stat["time_to_first_token"]["p50"] == pytest.approx(4.5) # type: ignore + assert stat["inter_token_latency"]["p50"] == pytest.approx(3) # type: ignore + assert stat["output_token_throughput_per_request"]["p50"] == pytest.approx( # type: ignore + np.percentile(ottpr, 50) + ) + assert stat["output_sequence_length"]["p50"] == 4.5 # type: ignore + assert stat["input_sequence_length"]["p50"] == 3.5 # type: ignore + + assert stat["time_to_first_token"]["min"] == pytest.approx(4) # type: ignore + assert stat["inter_token_latency"]["min"] == pytest.approx(2) # type: ignore + min_ottpr = 3 / ns_to_sec(11) + assert stat["output_token_throughput_per_request"]["min"] == pytest.approx(min_ottpr) # type: ignore + assert stat["output_sequence_length"]["min"] == 3 # type: ignore + assert stat["input_sequence_length"]["min"] == 3 # type: ignore + + assert stat["time_to_first_token"]["max"] == pytest.approx(5) # type: ignore + assert stat["inter_token_latency"]["max"] == pytest.approx(4) # type: ignore + max_ottpr = 6 / ns_to_sec(13) + assert stat["output_token_throughput_per_request"]["max"] == pytest.approx(max_ottpr) # type: ignore + assert stat["output_sequence_length"]["max"] == 6 # type: ignore + assert stat["input_sequence_length"]["max"] == 4 # type: ignore + + assert stat["time_to_first_token"]["std"] == np.std([4, 5]) * (1) # type: ignore + assert stat["inter_token_latency"]["std"] == np.std([4, 2]) * (1) # type: ignore + assert stat["output_token_throughput_per_request"]["std"] == pytest.approx( # type: ignore + np.std(ottpr) + ) + assert stat["output_sequence_length"]["std"] == np.std([3, 6]) # type: ignore + assert stat["input_sequence_length"]["std"] == np.std([3, 4]) # type: ignore + + oott = 9 / ns_to_sec(14) + assert stat["output_token_throughput"]["avg"] == pytest.approx(oott) # type: ignore + + # check non-existing profile data + 
with pytest.raises(KeyError): + pd.get_statistics(infer_mode="concurrency", load_level="40") + + def test_merged_sse_response(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Test merging the multiple sse response.""" + res_timestamps = [0, 1, 2, 3] + res_outputs = [ + { + "response": 'data: {"choices":[{"delta":{"content":"aaa"}}],"object":"chat.completion.chunk"}\n\n' + }, + { + "response": ( + 'data: {"choices":[{"delta":{"content":"abc"}}],"object":"chat.completion.chunk"}\n\n' + 'data: {"choices":[{"delta":{"content":"1234"}}],"object":"chat.completion.chunk"}\n\n' + 'data: {"choices":[{"delta":{"content":"helloworld"}}],"object":"chat.completion.chunk"}\n\n' + ) + }, + {"response": "data: [DONE]\n\n"}, + ] + expected_response = '{"choices": [{"delta": {"content": "abc1234helloworld"}}], "object": "chat.completion.chunk"}' + + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("openai_profile_export.json"), + tokenizer=tokenizer, + ) + + pd._preprocess_response(res_timestamps, res_outputs) + assert res_outputs[1]["response"] == expected_response + + def test_openai_output_token_counts( + self, mock_read_write: pytest.MonkeyPatch + ) -> None: + output_texts = [ + "Ad", + "idas", + " Orig", + "inals", + " are", + " now", + " available", + " in", + " more", + " than", + ] + res_outputs = [] + for text in output_texts: + response = f'data: {{"choices":[{{"delta":{{"content":"{text}"}}}}],"object":"chat.completion.chunk"}}\n\n' + res_outputs.append({"response": response}) + + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("openai_profile_export.json"), + tokenizer=tokenizer, + ) + + output_token_counts, total_output_token = pd._get_output_token_counts( + res_outputs + ) + assert output_token_counts == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] # total 10 + assert total_output_token == 9 + assert total_output_token != sum(output_token_counts) + + def test_triton_output_token_counts( + self, mock_read_write: pytest.MonkeyPatch + ) -> None: + output_texts = [ + "Ad", + "idas", + " Orig", + "inals", + " are", + " now", + " available", + " in", + " more", + " than", + ] + res_outputs = [] + for text in output_texts: + res_outputs.append({"text_output": text}) + + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("triton_profile_export.json"), + tokenizer=tokenizer, + ) + + output_token_counts, total_output_token = pd._get_output_token_counts( + res_outputs + ) + assert output_token_counts == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] # total 10 + assert total_output_token == 9 + assert total_output_token != sum(output_token_counts) + + def test_empty_response(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Check if it handles all empty responses.""" + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + + # Should not throw error + _ = LLMProfileDataParser( + filename=Path("empty_profile_export.json"), + tokenizer=tokenizer, + ) + + empty_profile_data = { + "service_kind": "openai", + "endpoint": "v1/chat/completions", + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 10, + }, + "requests": [ + { + "timestamp": 1, + "request_inputs": { + "payload": '{"messages":[{"role":"user","content":"This is test"}],"model":"llama-2-7b","stream":true}', + }, + "response_timestamps": [3, 5, 8], + "response_outputs": [ + { + "response": 'data: 
{"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":""},"finish_reason":null}]}\n\n' + }, + {"response": "data: [DONE]\n\n"}, + ], + }, + ], + }, + ], + } + + openai_profile_data = { + "service_kind": "openai", + "endpoint": "v1/chat/completions", + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 10, + }, + "requests": [ + { + "timestamp": 1, + "request_inputs": { + "payload": '{"messages":[{"role":"user","content":"This is test"}],"model":"llama-2-7b","stream":true}', + }, + # the first, and the last two responses will be ignored because they have no "content" + "response_timestamps": [3, 5, 8, 12, 13, 14], + "response_outputs": [ + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"I"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":" like"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":" dogs"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{},"finish_reason":null}]}\n\n' + }, + {"response": "data: [DONE]\n\n"}, + ], + }, + { + "timestamp": 2, + "request_inputs": { + "payload": '{"messages":[{"role":"user","content":"This is test too"}],"model":"llama-2-7b","stream":true}', + }, + # the first, and the last two responses will be ignored because they have no "content" + "response_timestamps": [4, 7, 11, 15, 18, 19], + "response_outputs": [ + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"I"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"don\'t"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"cook food"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{},"finish_reason":null}]}\n\n' + }, + {"response": "data: [DONE]\n\n"}, + ], + }, + ], + }, + ], + } + + triton_profile_data = { + "service_kind": "triton", + "endpoint": "", + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 10, + }, + "requests": [ + { + "timestamp": 1, + "request_inputs": {"text_input": "This is test"}, + "response_timestamps": [3, 5, 8], + "response_outputs": [ + {"text_output": "I"}, + 
{"text_output": " like"}, + {"text_output": " dogs"}, + ], + }, + { + "timestamp": 2, + "request_inputs": {"text_input": "This is test too"}, + "response_timestamps": [4, 7, 11], + "response_outputs": [ + {"text_output": "I"}, + {"text_output": " don't"}, + {"text_output": " cook food"}, + ], + }, + ], + }, + { + "experiment": { + "mode": "request_rate", + "value": 2.0, + }, + "requests": [ + { + "timestamp": 5, + "request_inputs": {"text_input": "This is test"}, + "response_timestamps": [7, 8, 13, 18], + "response_outputs": [ + {"text_output": "cat"}, + {"text_output": " is"}, + {"text_output": " cool"}, + {"text_output": " too"}, + ], + }, + { + "timestamp": 3, + "request_inputs": {"text_input": "This is test too"}, + "response_timestamps": [6, 8, 11], + "response_outputs": [ + {"text_output": "it's"}, + {"text_output": " very"}, + {"text_output": " simple work"}, + ], + }, + ], + }, + ], + } diff --git a/genai-perf/tests/test_metrics.py b/genai-perf/tests/test_metrics.py new file mode 100644 index 00000000..2af489fc --- /dev/null +++ b/genai-perf/tests/test_metrics.py @@ -0,0 +1,64 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest +from genai_perf.metrics import Metrics + + +class TestMetrics: + + def test_metric_request_metrics(self) -> None: + """Test request_metrics property.""" + m = Metrics( + request_throughputs=[10.12, 11.33], + request_latencies=[3, 44], + ) + req_metrics = m.request_metrics + assert len(req_metrics) == 1 + assert req_metrics[0].name == "request_latency" + assert req_metrics[0].unit == "ms" + + def test_metric_system_metrics(self) -> None: + """Test system_metrics property.""" + m = Metrics( + request_throughputs=[10.12, 11.33], + request_latencies=[3, 44], + ) + sys_metrics = m.system_metrics + assert len(sys_metrics) == 1 + assert sys_metrics[0].name == "request_throughput" + assert sys_metrics[0].unit == "per sec" + + def test_metrics_get_base_name(self) -> None: + """Test get_base_name method in Metrics class.""" + metrics = Metrics( + request_throughputs=[10.12, 11.33], + request_latencies=[3, 44], + ) + assert metrics.get_base_name("request_throughputs") == "request_throughput" + assert metrics.get_base_name("request_latencies") == "request_latency" + with pytest.raises(KeyError): + metrics.get_base_name("hello1234") diff --git a/genai-perf/tests/test_profile_data_parser.py b/genai-perf/tests/test_profile_data_parser.py new file mode 100644 index 00000000..fe303c51 --- /dev/null +++ b/genai-perf/tests/test_profile_data_parser.py @@ -0,0 +1,297 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
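+
+# Tests for the base ProfileDataParser using mocked embeddings, rankings, and
+# HuggingFace rerank profile exports; these endpoints produce only
+# request-level latency and throughput metrics.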
+ +import json +from io import StringIO +from pathlib import Path +from typing import Any, List, Union + +import numpy as np +import pytest +from genai_perf.metrics import Metrics +from genai_perf.profile_data_parser import ProfileDataParser + + +def ns_to_sec(ns: int) -> Union[int, float]: + """Convert from nanosecond to second.""" + return ns / 1e9 + + +class TestProfileDataParser: + @pytest.fixture + def mock_read_write(self, monkeypatch: pytest.MonkeyPatch) -> List[str]: + """ + This function will mock the open function for specific files: + + - For "triton_profile_export.json", it will read and return the + contents of self.triton_profile_data + - For "openai_profile_export.json", it will read and return the + contents of self.openai_profile_data + - For "profile_export.csv", it will capture all data written to + the file, and return it as the return value of this function + - For all other files, it will behave like the normal open function + """ + + written_data = [] + + original_open = open + + def custom_open(filename, *args, **kwargs): + def write(self: Any, content: str) -> int: + written_data.append(content) + return len(content) + + if filename == "embedding_profile_export.json": + tmp_file = StringIO(json.dumps(self.embedding_profile_data)) + return tmp_file + elif filename == "ranking_profile_export.json": + tmp_file = StringIO(json.dumps(self.ranking_profile_data)) + return tmp_file + elif filename == "huggingface_ranking_profile_export.json": + tmp_file = StringIO(json.dumps(self.huggingface_ranking_profile_data)) + return tmp_file + elif filename == "profile_export.csv": + tmp_file = StringIO() + tmp_file.write = write.__get__(tmp_file) + return tmp_file + else: + return original_open(filename, *args, **kwargs) + + monkeypatch.setattr("builtins.open", custom_open) + + return written_data + + # ================================================ + # EMBEDDINGS API + # ================================================ + embedding_profile_data = { + "service_kind": "openai", + "endpoint": "v1/embeddings", + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 10, + }, + "requests": [ + { + "timestamp": 1, + "request_inputs": { + "payload": '{"input":"This is test","model":"NV-Embed-QA","input_type":"passage","encoding_format":"float","truncate":"NONE"}', + }, + "response_timestamps": [3], + "response_outputs": [ + { + "response": '{"object":"list","data":[{"index":0,"embedding":[1, 2, 3],"object":"embedding"}],"model":"NV-Embed-QA","usage":{"prompt_tokens":7,"total_tokens":7}}' + }, + ], + }, + { + "timestamp": 2, + "request_inputs": { + "payload": '{"input":"This is test too","model":"NV-Embed-QA","input_type":"passage","encoding_format":"float","truncate":"NONE"}', + }, + "response_timestamps": [5], + "response_outputs": [ + { + "response": '{"object":"list","data":[{"index":0,"embedding":[1, 2, 3, 4],"object":"embedding"}],"model":"NV-Embed-QA","usage":{"prompt_tokens":8,"total_tokens":8}}' + }, + ], + }, + ], + }, + ], + } + + def test_embedding_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Collect base metrics from profile export data and check values. 
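+        (Request latency is the single response timestamp minus the request
+        timestamp; request throughput divides the request count by the
+        experiment span, converted from nanoseconds to seconds.)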
+ + Metrics + * request latencies + - [3 - 1, 5 - 2] = [2, 3] + * request throughputs + - [2 / (5e-9 - 1e-9)] = [5e8] + """ + pd = ProfileDataParser(filename=Path("embedding_profile_export.json")) + + # experiment 1 statistics + stats = pd.get_statistics(infer_mode="concurrency", load_level="10") + metrics = stats.metrics + stats_dict = stats.stats_dict + assert isinstance(metrics, Metrics) + + assert metrics.request_latencies == [2, 3] + assert metrics.request_throughputs == [pytest.approx(5e8)] + + assert stats_dict["request_latency"]["avg"] == pytest.approx(2.5) # type: ignore + assert stats_dict["request_latency"]["p50"] == pytest.approx(2.5) # type: ignore + assert stats_dict["request_latency"]["min"] == pytest.approx(2) # type: ignore + assert stats_dict["request_latency"]["max"] == pytest.approx(3) # type: ignore + assert stats_dict["request_latency"]["std"] == np.std([2, 3]) # type: ignore + + assert stats_dict["request_throughput"]["avg"] == pytest.approx(5e8) # type: ignore + + # ================================================ + # RANKINGS API + # ================================================ + ranking_profile_data = { + "service_kind": "openai", + "endpoint": "v1/ranking", + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 10, + }, + "requests": [ + { + "timestamp": 1, + "request_inputs": { + "payload": '{"query":{"text":"This is a test."},"passages":[{"text":"test output one"},{"text":"test output two"},{"text":"test output three"}],"model":"nv-rerank-qa-mistral-4b:1","truncate":"END"}', + }, + "response_timestamps": [3], + "response_outputs": [ + { + "response": '{"rankings":[{"index":0,"logit":-5.98828125},{"index":1,"logit":-6.828125},{"index":2,"logit":-7.60546875}]}' + }, + ], + }, + { + "timestamp": 2, + "request_inputs": { + "payload": '{"query":{"text":"This is a test."},"passages":[{"text":"test output one"},{"text":"test output two"},{"text":"test output three"}],"model":"nv-rerank-qa-mistral-4b:1","truncate":"END"}', + }, + "response_timestamps": [5], + "response_outputs": [ + { + "response": '{"rankings":[{"index":2,"logit":-6.15625},{"index":1,"logit":-7.83984375},{"index":0,"logit":-7.84765625}]}' + }, + ], + }, + ], + }, + ], + } + + def test_ranking_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Collect base metrics from profile export data and check values. 
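+        (Same derivation as the embeddings case: one ranking response per
+        request, so only request-level metrics are produced.)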
+ + Metrics + * request latencies + - [3 - 1, 5 - 2] = [2, 3] + * request throughputs + - [2 / (5e-9 - 1e-9)] = [5e8] + """ + pd = ProfileDataParser(filename=Path("ranking_profile_export.json")) + + # experiment 1 statistics + stats = pd.get_statistics(infer_mode="concurrency", load_level="10") + metrics = stats.metrics + stats_dict = stats.stats_dict + assert isinstance(metrics, Metrics) + + assert metrics.request_latencies == [2, 3] + assert metrics.request_throughputs == [pytest.approx(5e8)] + + assert stats_dict["request_latency"]["avg"] == pytest.approx(2.5) # type: ignore + assert stats_dict["request_latency"]["p50"] == pytest.approx(2.5) # type: ignore + assert stats_dict["request_latency"]["min"] == pytest.approx(2) # type: ignore + assert stats_dict["request_latency"]["max"] == pytest.approx(3) # type: ignore + assert stats_dict["request_latency"]["std"] == np.std([2, 3]) # type: ignore + + assert stats_dict["request_throughput"]["avg"] == pytest.approx(5e8) # type: ignore + + # ================================================ + # HUGGINGFACE RANKINGS API + # ================================================ + huggingface_ranking_profile_data = { + "service_kind": "openai", + "endpoint": "rerank", + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 10, + }, + "requests": [ + { + "timestamp": 1, + "request_inputs": { + "payload": '{"query":"What was the first car ever driven?","texts":["Daddys Home 2 Principal photography on the film began in Massachusetts in March 2017 and it was released in the United States by Paramount Pictures on November 10, 2017. Although the film received unfavorable reviews, it has grossed over $180 million worldwide on a $69 million budget.","Kevin Loader is a British film and television producer."]}' + }, + "response_timestamps": [3], + "response_outputs": [ + { + "response": '[{"index":0,"score":0.0032476764},{"index":1,"score":0.00036117696}]' + }, + ], + }, + { + "timestamp": 2, + "request_inputs": { + "payload": '{"query":"In what state did they film Shrek 2?","texts":["Francisco Antonio Zea Juan Francisco Antonio Hilari was a Colombian journalist, botanist, diplomat, politician, and statesman who served as the 1st Vice President of Colombia.","Daddys Home 2 Principal photography on the film began in Massachusetts in March 2017 and it was released in the United States by Paramount Pictures on November 10, 2017. Although the film received unfavorable reviews, it has grossed over $180 million worldwide on a $69 million budget."]}' + }, + "response_timestamps": [5], + "response_outputs": [ + { + "response": '[{"index":0,"score":0.020177318},{"index":1,"score":0.01461567}]' + }, + ], + }, + ], + }, + ], + } + + def test_huggingface_ranking_profile_data( + self, mock_read_write: pytest.MonkeyPatch + ) -> None: + """Collect base metrics from HuggingFace ranking profile export data and check values. 
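+        (The HuggingFace rerank endpoint returns a bare JSON list of scores;
+        the timestamp math is identical to the other ranking case.)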
+ + Metrics + * request latencies + - [3 - 1, 5 - 2] = [2, 3] + * request throughputs + - [2 / (5e-9 - 1e-9)] = [5e8] + """ + pd = ProfileDataParser(filename=Path("huggingface_ranking_profile_export.json")) + + # experiment 1 statistics + stats = pd.get_statistics(infer_mode="concurrency", load_level="10") + metrics = stats.metrics + stats_dict = stats.stats_dict + assert isinstance(metrics, Metrics) + + assert metrics.request_latencies == [2, 3] + assert metrics.request_throughputs == [pytest.approx(5e8)] + + assert stats_dict["request_latency"]["avg"] == pytest.approx(2.5) # type: ignore + assert stats_dict["request_latency"]["p50"] == pytest.approx(2.5) # type: ignore + assert stats_dict["request_latency"]["min"] == pytest.approx(2) # type: ignore + assert stats_dict["request_latency"]["max"] == pytest.approx(3) # type: ignore + assert stats_dict["request_latency"]["std"] == np.std([2, 3]) # type: ignore + + assert stats_dict["request_throughput"]["avg"] == pytest.approx(5e8) # type: ignore diff --git a/genai-perf/tests/test_wrapper.py b/genai-perf/tests/test_wrapper.py index 184a47f1..fd4c34b5 100644 --- a/genai-perf/tests/test_wrapper.py +++ b/genai-perf/tests/test_wrapper.py @@ -43,7 +43,14 @@ class TestWrapper: ], ) def test_url_exactly_once_triton(self, monkeypatch, arg): - args = ["genai-perf", "-m", "test_model", "--service-kind", "triton"] + arg + args = [ + "genai-perf", + "profile", + "-m", + "test_model", + "--service-kind", + "triton", + ] + arg monkeypatch.setattr("sys.argv", args) args, extra_args = parser.parse_args() cmd = Profiler.build_cmd(args, extra_args) @@ -70,7 +77,14 @@ def test_url_exactly_once_triton(self, monkeypatch, arg): ], ) def test_profile_export_filepath(self, monkeypatch, arg, expected_filepath): - args = ["genai-perf", "-m", "test_model", "--service-kind", "triton"] + arg + args = [ + "genai-perf", + "profile", + "-m", + "test_model", + "--service-kind", + "triton", + ] + arg monkeypatch.setattr("sys.argv", args) args, extra_args = parser.parse_args() cmd = Profiler.build_cmd(args, extra_args) @@ -87,7 +101,14 @@ def test_profile_export_filepath(self, monkeypatch, arg, expected_filepath): ], ) def test_service_triton(self, monkeypatch, arg): - args = ["genai-perf", "-m", "test_model", "--service-kind", "triton"] + arg + args = [ + "genai-perf", + "profile", + "-m", + "test_model", + "--service-kind", + "triton", + ] + arg monkeypatch.setattr("sys.argv", args) args, extra_args = parser.parse_args() cmd = Profiler.build_cmd(args, extra_args) @@ -111,6 +132,7 @@ def test_service_triton(self, monkeypatch, arg): def test_service_openai(self, monkeypatch, arg): args = [ "genai-perf", + "profile", "-m", "test_model", "--service-kind", diff --git a/inference_profiler.cc b/inference_profiler.cc index 57a33942..a36f51c1 100644 --- a/inference_profiler.cc +++ b/inference_profiler.cc @@ -723,13 +723,22 @@ InferenceProfiler::ProfileHelper( measurement_perf_status.request_rate = experiment_perf_status.request_rate; RETURN_IF_ERROR(manager_->CheckHealth()); + MeasureConfig measure_config; if (measurement_mode_ == MeasurementMode::TIME_WINDOWS) { - error.push( - Measure(measurement_perf_status, measurement_window_ms_, false)); + measure_config.measurement_window = measurement_window_ms_; + measure_config.is_count_based = false; } else { - error.push( - Measure(measurement_perf_status, measurement_request_count_, true)); + measure_config.measurement_window = measurement_request_count_; + measure_config.is_count_based = true; } + + // When request_count is not 0, the 
experiment will run for exactly X
+  // requests. In that case, we are not measuring based on window stability,
+  // and instead need to clamp the windows to be from the start of the
+  // first request to the end of the last request of the request count
+  //
+  measure_config.clamp_window = (request_count != 0);
+  error.push(Measure(measurement_perf_status, measure_config));
   measurement_perf_statuses.push_back(measurement_perf_status);
 
   if (error.size() > load_parameters_.stability_window) {
@@ -1169,8 +1178,7 @@ InferenceProfiler::GetServerSideStatus(
 
 // Used for measurement
 cb::Error
-InferenceProfiler::Measure(
-    PerfStatus& perf_status, uint64_t measurement_window, bool is_count_based)
+InferenceProfiler::Measure(PerfStatus& perf_status, MeasureConfig config)
 {
   std::map<cb::ModelIdentifier, cb::ModelStatistics> start_status;
   std::map<cb::ModelIdentifier, cb::ModelStatistics> end_status;
@@ -1207,10 +1215,10 @@ InferenceProfiler::Measure(
     }
   }
 
-  if (!is_count_based) {
+  if (!config.is_count_based) {
     // Wait for specified time interval in msec
     std::this_thread::sleep_for(
-        std::chrono::milliseconds((uint64_t)(measurement_window_ms_ * 1.2)));
+        std::chrono::milliseconds((uint64_t)(config.measurement_window * 1.2)));
   } else {
     do {
       // Check the health of the worker threads.
@@ -1218,7 +1226,7 @@
 
       // Wait for 1s until enough samples have been collected.
       std::this_thread::sleep_for(std::chrono::milliseconds((uint64_t)1000));
-    } while (manager_->CountCollectedRequests() < measurement_window);
+    } while (manager_->CountCollectedRequests() < config.measurement_window);
   }
 
   uint64_t window_end_ns =
@@ -1249,7 +1257,7 @@
 
   RETURN_IF_ERROR(Summarize(
       start_status, end_status, start_stat, end_stat, perf_status,
-      window_start_ns, window_end_ns));
+      window_start_ns, window_end_ns, config.clamp_window));
 
   return cb::Error::Success;
 }
@@ -1259,7 +1267,8 @@ InferenceProfiler::Summarize(
     const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
     const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
     const cb::InferStat& start_stat, const cb::InferStat& end_stat,
-    PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns)
+    PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns,
+    bool clamp_window)
 {
   size_t valid_sequence_count = 0;
   size_t delayed_request_count = 0;
@@ -1267,13 +1276,19 @@
   // Get measurement from requests that fall within the time interval
   std::pair<uint64_t, uint64_t> valid_range{window_start_ns, window_end_ns};
-  uint64_t window_duration_ns = valid_range.second - valid_range.first;
   std::vector<uint64_t> latencies;
   std::vector<RequestRecord> valid_requests{};
   ValidLatencyMeasurement(
       valid_range, valid_sequence_count, delayed_request_count, &latencies,
       response_count, valid_requests);
+
+  if (clamp_window) {
+    // Narrow the window to the requests actually measured.
+    auto [start, end] = ClampWindow(valid_requests);
+    window_start_ns = start;
+    window_end_ns = end;
+  }
+
+  uint64_t window_duration_ns = window_end_ns - window_start_ns;
+
   if (should_collect_profile_data_) {
     CollectData(
         summary, window_start_ns, window_end_ns, std::move(valid_requests));
@@ -1366,6 +1381,24 @@ InferenceProfiler::ValidLatencyMeasurement(
   std::sort(valid_latencies->begin(), valid_latencies->end());
 }
 
+std::pair<uint64_t, uint64_t>
+InferenceProfiler::ClampWindow(std::vector<RequestRecord>& requests)
+{
+  auto earliest_start =
+      std::chrono::time_point<std::chrono::system_clock>::max();
+  auto latest_end = std::chrono::time_point<std::chrono::system_clock>::min();
+
+  for (auto x : requests) {
+    earliest_start = std::min(earliest_start, x.start_time_);
+    latest_end = std::max(latest_end, x.response_timestamps_.back());
+  }
+
+  return std::make_pair(
+      earliest_start.time_since_epoch().count(),
+      latest_end.time_since_epoch().count());
+}
+
+
 void
 InferenceProfiler::CollectData(
     PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns,
diff --git a/inference_profiler.h b/inference_profiler.h
index cfd2a3b6..a7365131 100644
--- a/inference_profiler.h
+++ b/inference_profiler.h
@@ -77,6 +77,13 @@ struct LoadStatus {
   uint64_t avg_latency = 0;
 };
 
+/// Configuration for the Measure function
+struct MeasureConfig {
+  uint64_t measurement_window{0};
+  bool is_count_based{false};
+  bool clamp_window{false};
+};
+
 // Holds the total of the timing components of composing models of an
 // ensemble.
 struct EnsembleDurations {
@@ -475,14 +482,9 @@ class InferenceProfiler {
   /// Helper function to perform measurement.
   /// \param status_summary The summary of this measurement.
-  /// \param measurement_window Indicating the number of requests or the
-  /// duration in milliseconds to collect requests.
-  /// \param is_count_based determines whether measurement_window is indicating
-  /// time or count.
+  /// \param config The configuration for measurement.
   /// \return cb::Error object indicating success or failure.
-  cb::Error Measure(
-      PerfStatus& status_summary, uint64_t measurement_window,
-      bool is_count_based);
+  cb::Error Measure(PerfStatus& status_summary, MeasureConfig config);
 
   /// Gets the server side statistics
   /// \param model_status Returns the status of the models provided by
@@ -501,12 +503,15 @@ class InferenceProfiler {
   /// \param summary Returns the summary of the measurement.
   /// \param window_start_ns The window start timestamp in nanoseconds.
   /// \param window_end_ns The window end timestamp in nanoseconds.
+  /// \param clamp_window If true, the actual window range is reduced to span
+  /// from the start of the first request to the final response.
   /// \return cb::Error object indicating success or failure.
   cb::Error Summarize(
       const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
       const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
       const cb::InferStat& start_stat, const cb::InferStat& end_stat,
-      PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns);
+      PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns,
+      bool clamp_window);
 
   /// \param valid_range The start and end timestamp of the measurement window.
   /// \param valid_sequence_count Returns the number of completed sequences
@@ -522,6 +527,13 @@ class InferenceProfiler {
       std::vector<uint64_t>* latencies, size_t& response_count,
       std::vector<RequestRecord>& valid_requests);
 
+  /// Clamp a window around a set of requests, from the earliest start time to
+  /// the latest response.
+  /// \param requests A vector of requests to clamp the window around.
+  /// \return std::pair object containing the <start, end> of the window.
+  std::pair<uint64_t, uint64_t> ClampWindow(
+      std::vector<RequestRecord>& requests);
+
   /// Add the data from the request records to the Raw Data Collector
   /// \param perf_status PerfStatus of the current measurement
   /// \param window_start_ns The window start timestamp in nanoseconds.
diff --git a/test_inference_profiler.cc b/test_inference_profiler.cc
index 40813ce5..2941867f 100644
--- a/test_inference_profiler.cc
+++ b/test_inference_profiler.cc
@@ -107,6 +107,11 @@ class TestInferenceProfiler : public InferenceProfiler {
     return ip.IsDoneProfiling(ls, &is_stable);
   };
 
+  std::pair<uint64_t, uint64_t> ClampWindow(std::vector<RequestRecord>& reqs)
+  {
+    return InferenceProfiler::ClampWindow(reqs);
+  }
+
   cb::Error MergeMetrics(
       const std::vector>& all_metrics,
       Metrics& merged_metrics)
@@ -1060,6 +1065,41 @@ TEST_CASE(
   }
 }
 
+TEST_CASE("clamp window")
+{
+  TestInferenceProfiler tip{};
+  std::vector<RequestRecord> reqs{};
+
+  auto clock_epoch{std::chrono::time_point<std::chrono::system_clock>()};
+
+  auto request1_timestamp{clock_epoch + std::chrono::nanoseconds(5)};
+  auto response1_timestamp{clock_epoch + std::chrono::nanoseconds(20)};
+
+  reqs.emplace_back(
+      request1_timestamp,
+      std::vector<std::chrono::time_point<std::chrono::system_clock>>{
+          response1_timestamp});
+
+  auto request2_timestamp{clock_epoch + std::chrono::nanoseconds(3)};
+  auto response2_timestamp{clock_epoch + std::chrono::nanoseconds(15)};
+  reqs.emplace_back(
+      request2_timestamp,
+      std::vector<std::chrono::time_point<std::chrono::system_clock>>{
+          response2_timestamp});
+
+  auto request3_timestamp{clock_epoch + std::chrono::nanoseconds(7)};
+  auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(17)};
+  reqs.emplace_back(
+      request3_timestamp,
+      std::vector<std::chrono::time_point<std::chrono::system_clock>>{
+          response3_timestamp});
+
+  // Earliest request start is 3 ns; latest response is 20 ns.
+  auto window = tip.ClampWindow(reqs);
+
+  CHECK(window.first == 3);
+  CHECK(window.second == 20);
+}
+
 TEST_CASE("summarize_client_stat: testing the SummarizeClientStat function")
 {
   MockInferenceProfiler mock_inference_profiler{};