
Commit

Update batch-size and endpoint list
dyastremsky committed Jun 28, 2024
1 parent d6074d1 commit 942043e
Showing 2 changed files with 5 additions and 4 deletions.
5 changes: 3 additions & 2 deletions src/c++/perf_analyzer/genai-perf/README.md
@@ -373,7 +373,7 @@ model config to not echo the input tokens in the output. (default: tensorrtllm)
 
 Set a custom endpoint that differs from the OpenAI defaults. (default: `None`)
 
-##### `--endpoint-type {chat,completions,embeddings}`
+##### `--endpoint-type {chat,completions,embeddings,rankings}`
 
 The endpoint-type to send requests to on the server. This is only used with the
 `openai` service-kind. (default: `None`)
@@ -400,7 +400,8 @@ URL of the endpoint to target for benchmarking. (default: `None`)
 The batch size of the requests GenAI-Perf should send.
 This is currently only supported with the
 [embeddings endpoint type](docs/embeddings.md).
-(default: `1`)
+(default: `1`) and
+[rankings endpoint type](docs/rankings.md).
 
 ##### `--extra-inputs <str>`
 
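For orientation, here is a minimal sketch of how the newly documented `rankings` endpoint type and the `--batch-size` flag from the README hunks above might be combined in a run, wrapped in Python's `subprocess` purely for illustration. The model name, URL, and batch size are placeholders, and `-m`/`--service-kind` are assumed from genai-perf's standard options rather than shown in this diff; treat the exact flag set as an assumption for whatever release you run.

```python
# Illustrative sketch only: drives genai-perf with the rankings endpoint type
# and a per-request batch size, per the flags documented in the README above.
# The model name and URL are placeholders, not values from this commit.
import subprocess

cmd = [
    "genai-perf",
    "-m", "my-ranking-model",        # placeholder model name (assumed -m flag)
    "--service-kind", "openai",      # rankings/embeddings use the openai service-kind
    "--endpoint-type", "rankings",   # endpoint type added to the docs in this commit
    "--batch-size", "8",             # number of inputs sent per request
    "--url", "localhost:8000",       # placeholder benchmarking endpoint
]
subprocess.run(cmd, check=True)
```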
@@ -139,7 +139,7 @@ def create_llm_inputs(
 output_tokens_deterministic:
 If true, the output tokens will set the minimum and maximum tokens to be equivalent.
 batch_size:
-The number of inputs per request (currently only used for v1/embeddings)
+The number of inputs per request (currently only used for the embeddings and rankings endpoints)
 
 Required Synthetic Prompt Generation Parameters
 -----------------------------------------------
@@ -236,7 +236,7 @@ def get_generic_dataset_json(
 num_of_output_prompts:
 The number of synthetic output prompts to generate
 batch_size:
-The number of inputs per request (currently only used for v1/embeddings)
+The number of inputs per request (currently only used for the embeddings and rankings endpoints)
 input_filename:
 The path to the input file containing the prompts in JSONL format.
 Returns
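To make the updated `batch_size` docstrings concrete, the following minimal Python sketch shows what "inputs per request" could look like as request payloads. The field names (`input`, `query`, `passages`) follow an OpenAI-style embeddings body and a generic rerank shape; they are assumptions for illustration, not payloads taken from this repository.

```python
# Hypothetical request bodies illustrating batch_size = 3 "inputs per request".
# Field names are assumptions (OpenAI-style embeddings, generic rerank shape),
# not taken from this commit.
batch_size = 3
texts = [f"synthetic prompt {i}" for i in range(batch_size)]

# Embeddings endpoint: a single request carries batch_size inputs.
embeddings_request = {
    "model": "my-embedding-model",  # placeholder
    "input": texts,                 # len(input) == batch_size
}

# Rankings endpoint: one query scored against batch_size candidate passages.
rankings_request = {
    "model": "my-ranking-model",    # placeholder
    "query": "synthetic query",
    "passages": [{"text": t} for t in texts],  # batch_size passages (assumed shape)
}

print(embeddings_request)
print(rankings_request)
```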
