
Commit

Merge branch 'main' into add-pa-batch-size-sweep-option
nv-braf authored Dec 19, 2024
2 parents 2228003 + 6824f7c commit 034120d
Showing 8 changed files with 95 additions and 10 deletions.
19 changes: 15 additions & 4 deletions genai-perf/README.md
@@ -343,12 +343,10 @@ AUTHENTICATION

GenAI-Perf can benchmark secure endpoints such as OpenAI, which require API
key authentication. To do so, you must add your API key directly in the command.
-At the end of your command, append the below flags. Replace the key with your
-API key. The `--` flag allows arguments to pass directly into Perf Analyzer in
-superuser mode. The `-H` flag is used to add HTTP headers.
+Add the following flag to your command.

```bash
--- -H "Authorization: Bearer ${API_KEY}" -H "Accept: text/event-stream"
+-H "Authorization: Bearer ${API_KEY}" -H "Accept: text/event-stream"
```
```

<br/>
@@ -377,6 +375,13 @@ the inference server.

<br/>

### Telemetry Metrics

When using the Triton service kind, telemetry metrics are reported in
the GenAI-Perf profile export files. These include GPU power usage, GPU
utilization, energy consumption, total GPU memory, and more. To also print
these metrics to the console, use the `--verbose` flag.

<!--
======================
COMMAND LINE OPTIONS
@@ -461,6 +466,12 @@ flag for multiple inputs. Inputs should be in an input_name:value format.
Alternatively, a string representing a json formatted dict can be provided.
(default: `None`)

##### `--header <str>`
##### `-H <str>`
Add a custom header to the requests. Headers must be specified as
'Header:Value'. You can repeat this flag for multiple headers.
(default: `None`)

##### `--input-file <path>`

The input file or directory containing the content to use for
11 changes: 10 additions & 1 deletion genai-perf/genai_perf/parser.py
@@ -735,7 +735,7 @@ def _add_input_args(parser):
"--extra-inputs",
action="append",
help="Provide additional inputs to include with every request. "
"You can repeat this flag for multiple inputs. Inputs should be in an input_name:value format. "
"You can repeat this flag for multiple inputs. Inputs should be in an 'input_name:value' format. "
"Alternatively, a string representing a json formatted dict can be provided.",
)

@@ -752,6 +752,15 @@ def _add_input_args(parser):
"Multiple key:value pairs can be provided, separated by spaces. ",
)

input_group.add_argument(
"--header",
"-H",
action="append",
help="Add a custom header to the requests. "
"Headers must be specified as 'Header:Value'. "
"You can repeat this flag for multiple headers.",
)
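The `--header`/`-H` option added above relies on argparse's `append` action, which collects every repeated occurrence of the flag into a single list on the parsed namespace (and leaves it `None` when the flag is absent). A minimal standalone sketch of that behavior, outside the GenAI-Perf codebase:

```python
import argparse

# Sketch of the append behavior behind the new flag: each repeated
# -H/--header occurrence is accumulated into one list, in order.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--header",
    "-H",
    action="append",
    help="Add a custom header to the requests, as 'Header:Value'.",
)

args = parser.parse_args(
    ["-H", "Authorization:Bearer mytoken", "--header", "Accept:text/event-stream"]
)
print(args.header)
# -> ['Authorization:Bearer mytoken', 'Accept:text/event-stream']
```

Note that the destination name is derived from the first long option (`--header`), which is why the parsed value shows up as `args.header` and as `"header": null` in the JSON export when no header is passed.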

input_group.add_argument(
"--input-file",
type=file_or_directory,
3 changes: 3 additions & 0 deletions genai-perf/genai_perf/wrapper.py
@@ -140,6 +140,9 @@ def build_cmd(args: Namespace, extra_args: Optional[List[str]] = None) -> List[s
# against tensorrtllm engine.
elif arg == "service_kind" and value == "tensorrtllm_engine":
cmd += ["--service-kind", "triton_c_api", "--streaming"]
elif arg == "header":
for header in value:
cmd += ["-H", header]
else:
if len(arg) == 1:
cmd += [f"-{arg}", f"{value}"]
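The new `elif` branch in `build_cmd` turns the collected header list into repeated `-H` arguments for Perf Analyzer. A standalone sketch of that expansion (the `expand_headers` name is ours, for illustration only — the real code inlines this loop):

```python
from typing import List, Optional


def expand_headers(headers: Optional[List[str]]) -> List[str]:
    # Each stored "Header:Value" string becomes its own ["-H", value]
    # pair on the Perf Analyzer command line, mirroring the elif branch.
    cmd: List[str] = []
    for header in headers or []:
        cmd += ["-H", header]
    return cmd


print(expand_headers(["Authorization:Bearer mytoken", "Accept:text/event-stream"]))
# -> ['-H', 'Authorization:Bearer mytoken', '-H', 'Accept:text/event-stream']
```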
6 changes: 6 additions & 0 deletions genai-perf/tests/test_cli.py
@@ -186,6 +186,12 @@ def test_help_version_arguments_output_and_exit(
]
},
),
(["-H", "header_name:value"], {"header": ["header_name:value"]}),
(["--header", "header_name:value"], {"header": ["header_name:value"]}),
(
["--header", "header_name:value", "--header", "header_name_2:value_2"],
{"header": ["header_name:value", "header_name_2:value_2"]},
),
(["--measurement-interval", "100"], {"measurement_interval": 100}),
(
["--model-selection-strategy", "random"],
1 change: 1 addition & 0 deletions genai-perf/tests/test_exporters/test_json_exporter.py
@@ -253,6 +253,7 @@ class TestJsonExporter:
"tokenizer_trust_remote_code": false,
"verbose": false,
"goodput": null,
"header": null,
"subcommand": "profile",
"prompt_source": "synthetic",
"extra_inputs": {
43 changes: 43 additions & 0 deletions genai-perf/tests/test_wrapper.py
@@ -186,3 +186,46 @@ def test_stdout_not_verbose(self, mock_telemetry_collector, mock_subprocess_run)
assert (
kwargs["stdout"] is subprocess.DEVNULL
), "When the verbose flag is not passed, stdout should be redirected to /dev/null."

@pytest.mark.parametrize(
"header_values, expected_headers",
[
(["Header1:Value1"], [("-H", "Header1:Value1")]),
(
["Authorization:Bearer mytoken", "Content-Type:application/json"],
[
("-H", "Authorization:Bearer mytoken"),
("-H", "Content-Type:application/json"),
],
),
],
)
def test_headers_passed_correctly(
self, monkeypatch, header_values, expected_headers
):
args = [
"genai-perf",
"profile",
"-m",
"test_model",
]
for header in header_values:
args += ["-H", header]
monkeypatch.setattr("sys.argv", args)

args, extra_args = parser.parse_args()
cmd = Profiler.build_cmd(args, extra_args)

for expected_flag, expected_value in expected_headers:
try:
flag_index = cmd.index(expected_flag)
assert cmd[flag_index + 1] == expected_value, (
f"Header value mismatch for {expected_flag}: "
f"Expected {expected_value}, Found {cmd[flag_index + 1]}"
)
cmd[flag_index] = None # type: ignore
cmd[flag_index + 1] = None # type: ignore
except ValueError:
assert (
False
), f"Missing expected header flag: {expected_flag} or value: {expected_value}"
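The test above blanks out matched entries (`cmd[flag_index] = None`) so that repeated `-H` flags are each checked once. A hedged alternative, as a small helper (hypothetical — not part of the committed test suite), consumes each expected `(flag, value)` pair in order instead of mutating the command list in place:

```python
from typing import List, Sequence, Tuple


def assert_flag_pairs(cmd: Sequence[str], expected: List[Tuple[str, str]]) -> None:
    # Walk a copy of the command list; each matched pair is removed so a
    # repeated flag (e.g. several -H headers) is checked occurrence by
    # occurrence, in the order it was expected.
    remaining = list(cmd)
    for flag, value in expected:
        assert flag in remaining, f"Missing expected flag: {flag}"
        idx = remaining.index(flag)
        assert remaining[idx + 1] == value, (
            f"Header value mismatch for {flag}: "
            f"expected {value}, found {remaining[idx + 1]}"
        )
        del remaining[idx : idx + 2]
```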
3 changes: 2 additions & 1 deletion src/client_backend/openai/openai_client.h
@@ -65,7 +65,8 @@ class ChatCompletionResult : public InferResult {
{
if ((http_code_ >= 400) && (http_code_ <= 599)) {
return Error(
"OpenAI response returns HTTP code " + std::to_string(http_code_));
"OpenAI response returns HTTP code " + std::to_string(http_code_) +
": " + serialized_response_);
}
return Error::Success;
}
19 changes: 15 additions & 4 deletions templates/genai-perf-templates/README_template
@@ -332,12 +332,10 @@ AUTHENTICATION

GenAI-Perf can benchmark secure endpoints such as OpenAI, which require API
key authentication. To do so, you must add your API key directly in the command.
-At the end of your command, append the below flags. Replace the key with your
-API key. The `--` flag allows arguments to pass directly into Perf Analyzer in
-superuser mode. The `-H` flag is used to add HTTP headers.
+Add the following flag to your command.

```bash
--- -H "Authorization: Bearer ${API_KEY}" -H "Accept: text/event-stream"
+-H "Authorization: Bearer ${API_KEY}" -H "Accept: text/event-stream"
```
```

<br/>
@@ -366,6 +364,13 @@ the inference server.

<br/>

### Telemetry Metrics

When using the Triton service kind, telemetry metrics are reported in
the GenAI-Perf profile export files. These include GPU power usage, GPU
utilization, energy consumption, total GPU memory, and more. To also print
these metrics to the console, use the `--verbose` flag.

<!--
======================
COMMAND LINE OPTIONS
@@ -450,6 +455,12 @@ flag for multiple inputs. Inputs should be in an input_name:value format.
Alternatively, a string representing a json formatted dict can be provided.
(default: `None`)

##### `--header <str>`
##### `-H <str>`
Add a custom header to the requests. Headers must be specified as
'Header:Value'. You can repeat this flag for multiple headers.
(default: `None`)

##### `--input-file <path>`

The input file or directory containing the content to use for