From fde8dd50a4890118e2a173a3d7830a3ead914a0c Mon Sep 17 00:00:00 2001 From: tgerdes Date: Wed, 28 Feb 2024 08:54:53 -0600 Subject: [PATCH 01/23] Add openai service-kind and add endpoint to CLI --- .../client_backend/client_backend.h | 3 ++- src/c++/perf_analyzer/command_line_parser.cc | 17 +++++++++++++++++ src/c++/perf_analyzer/command_line_parser.h | 1 + .../perf_analyzer/test_command_line_parser.cc | 1 + 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/c++/perf_analyzer/client_backend/client_backend.h b/src/c++/perf_analyzer/client_backend/client_backend.h index 870ea3dd5..988957e98 100644 --- a/src/c++/perf_analyzer/client_backend/client_backend.h +++ b/src/c++/perf_analyzer/client_backend/client_backend.h @@ -135,7 +135,8 @@ enum BackendKind { TRITON = 0, TENSORFLOW_SERVING = 1, TORCHSERVE = 2, - TRITON_C_API = 3 + TRITON_C_API = 3, + OPENAI = 4 }; enum ProtocolType { HTTP = 0, GRPC = 1, UNKNOWN = 2 }; enum GrpcCompressionAlgorithm { diff --git a/src/c++/perf_analyzer/command_line_parser.cc b/src/c++/perf_analyzer/command_line_parser.cc index 711f1714e..9bcc5d46f 100644 --- a/src/c++/perf_analyzer/command_line_parser.cc +++ b/src/c++/perf_analyzer/command_line_parser.cc @@ -875,6 +875,7 @@ CLParser::ParseCommandLine(int argc, char** argv) {"periodic-concurrency-range", required_argument, 0, 59}, {"request-period", required_argument, 0, 60}, {"request-parameter", required_argument, 0, 61}, + {"endpoint", required_argument, 0, 62}, {0, 0, 0, 0}}; // Parse commandline... @@ -1169,6 +1170,8 @@ CLParser::ParseCommandLine(int argc, char** argv) params_->kind = cb::TORCHSERVE; } else if (arg.compare("triton_c_api") == 0) { params_->kind = cb::TRITON_C_API; + } else if (arg.compare("openai") == 0) { + params_->kind = cb::OPENAI; } else { Usage( "Failed to parse --service-kind. Unsupported type provided: '" + @@ -1608,6 +1611,9 @@ CLParser::ParseCommandLine(int argc, char** argv) params_->request_parameters[name] = param; break; } + case 62: { + params_->endpoint = optarg; + } case 'v': params_->extra_verbose = params_->verbose; params_->verbose = true; @@ -1909,6 +1915,17 @@ CLParser::VerifyOptions() params_->protocol = cb::ProtocolType::UNKNOWN; } + if (params_->kind == cb::BackendKind::OPENAI) { + if (params_->user_data.empty()) { + Usage("Must supply --input-data for OpenAI service kind."); + } + if (params_->endpoint.empty()) { + Usage( + "Must supply --endpoint for OpenAI service kind. 
For example, " + "\"v1/chat/completions\""); + } + } + if (params_->should_collect_metrics && params_->kind != cb::BackendKind::TRITON) { Usage( diff --git a/src/c++/perf_analyzer/command_line_parser.h b/src/c++/perf_analyzer/command_line_parser.h index 9ff4869ff..79d387811 100644 --- a/src/c++/perf_analyzer/command_line_parser.h +++ b/src/c++/perf_analyzer/command_line_parser.h @@ -100,6 +100,7 @@ struct PerfAnalyzerParameters { bool dynamic_concurrency_mode = false; bool url_specified = false; std::string url{"localhost:8000"}; + std::string endpoint{""}; std::string model_name; std::string model_version; uint64_t batch_size = 1; diff --git a/src/c++/perf_analyzer/test_command_line_parser.cc b/src/c++/perf_analyzer/test_command_line_parser.cc index fd0d8af16..6428a0f2f 100644 --- a/src/c++/perf_analyzer/test_command_line_parser.cc +++ b/src/c++/perf_analyzer/test_command_line_parser.cc @@ -263,6 +263,7 @@ TEST_CASE("Testing PerfAnalyzerParameters") CHECK(params->sequence_length == 20); CHECK(params->percentile == -1); CHECK(params->user_data.size() == 0); + CHECK_STRING("endpoint", params->endpoint, ""); CHECK(params->input_shapes.size() == 0); CHECK(params->measurement_window_ms == 5000); CHECK(params->using_concurrency_range == false); From 269f4f99de9965c5876f8dd56d5eefaa38c5646f Mon Sep 17 00:00:00 2001 From: tgerdes Date: Wed, 28 Feb 2024 09:04:07 -0600 Subject: [PATCH 02/23] Add openai to model parser --- src/c++/perf_analyzer/model_parser.cc | 20 ++++++++++++++++++++ src/c++/perf_analyzer/model_parser.h | 4 ++++ src/c++/perf_analyzer/perf_analyzer.cc | 5 +++++ 3 files changed, 29 insertions(+) diff --git a/src/c++/perf_analyzer/model_parser.cc b/src/c++/perf_analyzer/model_parser.cc index ee7ab5303..7dcd59819 100644 --- a/src/c++/perf_analyzer/model_parser.cc +++ b/src/c++/perf_analyzer/model_parser.cc @@ -265,6 +265,26 @@ ModelParser::InitTFServe( return cb::Error::Success; } +cb::Error +ModelParser::InitOpenAI( + const std::string& model_name, const std::string& model_version, + const int32_t batch_size) +{ + // OpenAI does not return model metadata hence we can not obtain any + // parameters. 
+ model_name_ = model_name; + model_version_ = model_version; + max_batch_size_ = batch_size; + + // OpenAI will take a single json input with a fully formed payload + auto it = inputs_->emplace("payload", ModelTensor()).first; + it->second.name_ = "payload"; + it->second.datatype_ = "JSON"; + it->second.shape_.push_back(1); + + return cb::Error::Success; +} + cb::Error ModelParser::InitTorchServe( const std::string& model_name, const std::string& model_version, diff --git a/src/c++/perf_analyzer/model_parser.h b/src/c++/perf_analyzer/model_parser.h index 4646433ab..c1e16bac7 100644 --- a/src/c++/perf_analyzer/model_parser.h +++ b/src/c++/perf_analyzer/model_parser.h @@ -111,6 +111,10 @@ class ModelParser { const std::unordered_map>& input_shapes, std::unique_ptr& backend); + cb::Error InitOpenAI( + const std::string& model_name, const std::string& model_version, + const int32_t batch_size); + cb::Error InitTorchServe( const std::string& model_name, const std::string& model_version, const int32_t batch_size); diff --git a/src/c++/perf_analyzer/perf_analyzer.cc b/src/c++/perf_analyzer/perf_analyzer.cc index 46b665757..a1a5ab635 100644 --- a/src/c++/perf_analyzer/perf_analyzer.cc +++ b/src/c++/perf_analyzer/perf_analyzer.cc @@ -108,6 +108,11 @@ PerfAnalyzer::CreateAnalyzerObjects() model_metadata, model_config, params_->model_version, params_->bls_composing_models, params_->input_shapes, backend_), "failed to create model parser"); + } else if (params_->kind == cb::BackendKind::OPENAI) { + FAIL_IF_ERR( + parser_->InitOpenAI( + params_->model_name, params_->model_version, params_->batch_size), + "failed to create model parser"); } else if (params_->kind == cb::BackendKind::TENSORFLOW_SERVING) { rapidjson::Document model_metadata; FAIL_IF_ERR( From 2342215d642f27458cce89f88366f1dda022ac12 Mon Sep 17 00:00:00 2001 From: oandreeva-nv Date: Wed, 28 Feb 2024 01:34:49 -0800 Subject: [PATCH 03/23] OpenAI client backend + cmake --- CMakeLists.txt | 8 +- src/c++/perf_analyzer/CMakeLists.txt | 9 +- .../client_backend/CMakeLists.txt | 15 +- .../client_backend/client_backend.h | 2 +- .../client_backend/openai/CMakeLists.txt | 56 +++++++ .../openai/openai_client_backend.cc | 154 ++++++++++++++++++ .../openai/openai_client_backend.h | 130 +++++++++++++++ .../openai/openai_http_client.h | 94 +++++++++++ 8 files changed, 464 insertions(+), 4 deletions(-) create mode 100644 src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt create mode 100644 src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc create mode 100644 src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h create mode 100644 src/c++/perf_analyzer/client_backend/openai/openai_http_client.h diff --git a/CMakeLists.txt b/CMakeLists.txt index b1fc6ccf0..97f93ddaf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -45,6 +45,7 @@ option(TRITON_ENABLE_PERF_ANALYZER "Enable Performance Analyzer" OFF) option(TRITON_ENABLE_PERF_ANALYZER_C_API "Enable Performance Analyzer C API" OFF) option(TRITON_ENABLE_PERF_ANALYZER_TFS "Enable TensorFlow Serving support for Performance Analyzer" OFF) option(TRITON_ENABLE_PERF_ANALYZER_TS "Enable TorchServe support for Performance Analyzer" OFF) +option(TRITON_ENABLE_PERF_ANALYZER_OPENAI "Enable OpenAI support for Performance Analyzer" OFF) option(TRITON_ENABLE_EXAMPLES "Include examples in build" OFF) option(TRITON_ENABLE_TESTS "Include tests in build" OFF) option(TRITON_ENABLE_GPU "Enable GPU support in libraries" OFF) @@ -142,6 +143,9 @@ if(TRITON_ENABLE_CC_HTTP OR TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER if(NOT ${TRITON_ENABLE_PERF_ANALYZER} AND ${TRITON_ENABLE_PERF_ANALYZER_TS}) message(FATAL_ERROR "TRITON_ENABLE_PERF_ANALYZER_TS=ON requires TRITON_ENABLE_PERF_ANALYZER=ON") endif() # NOT TRITON_ENABLE_PERF_ANALYZER AND TRITON_ENABLE_PERF_ANALYZER_TS + if(NOT ${TRITON_ENABLE_PERF_ANALYZER} AND ${TRITON_ENABLE_PERF_ANALYZER_OPENAI}) + message(FATAL_ERROR "TRITON_ENABLE_PERF_ANALYZER_OPENAI=ON requires TRITON_ENABLE_PERF_ANALYZER=ON") + endif() # NOT TRITON_ENABLE_PERF_ANALYZER AND TRITON_ENABLE_PERF_ANALYZER_OPENAI ExternalProject_Add(cc-clients PREFIX cc-clients @@ -167,6 +171,7 @@ if(TRITON_ENABLE_CC_HTTP OR TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER -DTRITON_ENABLE_PERF_ANALYZER_C_API:BOOL=${TRITON_ENABLE_PERF_ANALYZER_C_API} -DTRITON_ENABLE_PERF_ANALYZER_TFS:BOOL=${TRITON_ENABLE_PERF_ANALYZER_TFS} -DTRITON_ENABLE_PERF_ANALYZER_TS:BOOL=${TRITON_ENABLE_PERF_ANALYZER_TS} + -DTRITON_ENABLE_PERF_ANALYZER_OPENAI:BOOL=${TRITON_ENABLE_PERF_ANALYZER_OPENAI} -DTRITON_ENABLE_EXAMPLES:BOOL=${TRITON_ENABLE_EXAMPLES} -DTRITON_ENABLE_TESTS:BOOL=${TRITON_ENABLE_TESTS} -DTRITON_ENABLE_GPU:BOOL=${TRITON_ENABLE_GPU} @@ -209,6 +214,7 @@ if(TRITON_ENABLE_PYTHON_HTTP OR TRITON_ENABLE_PYTHON_GRPC) -DTRITON_ENABLE_PERF_ANALYZER_C_API:BOOL=${TRITON_ENABLE_PERF_ANALYZER_C_API} -DTRITON_ENABLE_PERF_ANALYZER_TFS:BOOL=${TRITON_ENABLE_PERF_ANALYZER_TFS} -DTRITON_ENABLE_PERF_ANALYZER_TS:BOOL=${TRITON_ENABLE_PERF_ANALYZER_TS} + -DTRITON_ENABLE_PERF_ANALYZER_OPENAI:BOOL=${TRITON_ENABLE_PERF_ANALYZER_OPENAI} -DTRITON_ENABLE_EXAMPLES:BOOL=${TRITON_ENABLE_EXAMPLES} -DTRITON_ENABLE_TESTS:BOOL=${TRITON_ENABLE_TESTS} -DTRITON_ENABLE_GPU:BOOL=${TRITON_ENABLE_GPU} diff --git a/src/c++/perf_analyzer/CMakeLists.txt b/src/c++/perf_analyzer/CMakeLists.txt index bebdba4d5..fe34ace4f 100644 --- a/src/c++/perf_analyzer/CMakeLists.txt +++ b/src/c++/perf_analyzer/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -170,6 +170,13 @@ if(TRITON_ENABLE_PERF_ANALYZER_TS) ) endif() +if(TRITON_ENABLE_PERF_ANALYZER_OPENAI) + target_compile_definitions( + client-backend-library + PUBLIC TRITON_ENABLE_PERF_ANALYZER_OPENAI=1 + ) +endif() + install( TARGETS perf_analyzer RUNTIME DESTINATION bin diff --git a/src/c++/perf_analyzer/client_backend/CMakeLists.txt b/src/c++/perf_analyzer/client_backend/CMakeLists.txt index 23da6f32e..2c780ee22 100644 --- a/src/c++/perf_analyzer/client_backend/CMakeLists.txt +++ b/src/c++/perf_analyzer/client_backend/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -43,6 +43,10 @@ if(TRITON_ENABLE_PERF_ANALYZER_TS) add_subdirectory(torchserve) endif() +if(TRITON_ENABLE_PERF_ANALYZER_OPENAI) + add_subdirectory(openai) +endif() + set( CLIENT_BACKEND_SRCS client_backend.cc @@ -71,6 +75,12 @@ if(TRITON_ENABLE_PERF_ANALYZER_TS) set(TS_TARGET_INCLUDE_DIRECTORY PRIVATE $) endif() +if(TRITON_ENABLE_PERF_ANALYZER_OPENAI) + set(OPENAI_LIBRARY $) + set(OPENAI_TARGET_LINK_LIBRARY PUBLIC $) + set(OPENAI_TARGET_INCLUDE_DIRECTORY PRIVATE $) +endif() + add_library( client-backend-library ${CLIENT_BACKEND_SRCS} @@ -80,6 +90,7 @@ add_library( ${CAPI_LIBRARY} ${TFS_LIBRARY} ${TS_LIBRARY} + ${OPENAI_LIBRARY} ) target_link_libraries( @@ -89,6 +100,7 @@ target_link_libraries( ${CAPI_TARGET_LINK_LIBRARY} ${TFS_TARGET_LINK_LIBRARY} ${TS_TARGET_LINK_LIBRARY} + ${OPENAI_TARGET_LINK_LIBRARY} ) target_include_directories( @@ -97,4 +109,5 @@ target_include_directories( ${CAPI_TARGET_INCLUDE_DIRECTORY} ${TFS_TARGET_INCLUDE_DIRECTORY} ${TS_TARGET_INCLUDE_DIRECTORY} + ${OPENAI_TARGET_INCLUDE_DIRECTORY} ) diff --git a/src/c++/perf_analyzer/client_backend/client_backend.h b/src/c++/perf_analyzer/client_backend/client_backend.h index 988957e98..487c215ce 100644 --- a/src/c++/perf_analyzer/client_backend/client_backend.h +++ b/src/c++/perf_analyzer/client_backend/client_backend.h @@ -1,4 +1,4 @@ -// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions diff --git a/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt b/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt new file mode 100644 index 000000000..d9b5db33f --- /dev/null +++ b/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt @@ -0,0 +1,56 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cmake_minimum_required (VERSION 3.18) + +set( + OPENAI_CLIENT_BACKEND_SRCS + openai_client_backend.cc +) + +set( + OPENAI_CLIENT_BACKEND_HDRS + openai_client_backend.h + openai_http_client.h +) + +add_library( + openai-client-backend-library EXCLUDE_FROM_ALL OBJECT + ${OPENAI_CLIENT_BACKEND_SRCS} + ${OPENAI_CLIENT_BACKEND_HDRS} +) + +target_link_libraries( + openai-client-backend-library + # TODO: Assuming we'll need curl libs + PUBLIC CURL::libcurl + PUBLIC httpclient_static +) + +if(${TRITON_ENABLE_GPU}) + target_include_directories(openai-client-backend-library PUBLIC ${CUDA_INCLUDE_DIRS}) + target_link_libraries(openai-client-backend-library PRIVATE ${CUDA_LIBRARIES}) +endif() # TRITON_ENABLE_GPU diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc new file mode 100644 index 000000000..d9cca25d9 --- /dev/null +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "openai_client_backend.h" + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +//============================================================================== + +Error +OpenAiClientBackend::Create( + const std::string& url, const ProtocolType protocol, + std::shared_ptr http_headers, const bool verbose, + std::unique_ptr* client_backend) +{ + if (protocol == ProtocolType::GRPC) { + return Error( + "perf_analyzer does not support gRPC protocol with OpenAI endpoints"); + } + std::unique_ptr openai_client_backend( + new OpenAiClientBackend(http_headers)); + + // TODO: Adjust as needed + RETURN_IF_CB_ERROR(HttpClient::Create( + &(openai_client_backend->http_client_), url, verbose)); + + *client_backend = std::move(openai_client_backend); + + return Error::Success; +} + +Error +OpenAiClientBackend::AsyncInfer( + OnCompleteFn callback, const InferOptions& options, + const std::vector& inputs, + const std::vector& outputs) +{ + auto wrapped_callback = [callback](cb::openai::InferResult* client_result) { + cb::InferResult* result = new OpenAiInferResult(client_result); + callback(result); + }; + + // TODO: make an async infer call + //RETURN_IF_CB_ERROR(http_client_->AsyncInfer(...)); + + return Error::Success; +} + + +Error +OpenAiClientBackend::ClientInferStat(InferStat* infer_stat) +{ + // Reusing the common library utilities to collect and report the + // client side statistics. 
+ tc::InferStat client_infer_stat; + + RETURN_IF_TRITON_ERROR(http_client_->ClientInferStat(&client_infer_stat)); + + ParseInferStat(client_infer_stat, infer_stat); + + return Error::Success; +} + +void +OpenAiClientBackend::ParseInferStat( + const tc::InferStat& tfserve_infer_stat, InferStat* infer_stat) +{ + // TODO: Implement + return; +} + +//============================================================================== + +Error +OpenAiInferRequestedOutput::Create( + InferRequestedOutput** infer_output, const std::string& name) +{ + OpenAiInferRequestedOutput* local_infer_output = + new OpenAiInferRequestedOutput(name); + + tc::InferRequestedOutput* openai_infer_output; + RETURN_IF_TRITON_ERROR( + tc::InferRequestedOutput::Create(&openai_infer_output, name)); + local_infer_output->output_.reset(openai_infer_output); + + *infer_output = local_infer_output; + + return Error::Success; +} + +OpenAiInferRequestedOutput::OpenAiInferRequestedOutput( + const std::string& name) + : InferRequestedOutput(BackendKind::OPENAI, name) +{ +} + +//============================================================================== + +OpenAiInferResult::OpenAiInferResult(cb::openai::InferResult* result) +{ + result_.reset(result); +} + +Error +OpenAiInferResult::Id(std::string* id) const +{ + id->clear(); + return Error::Success; +} + +Error +OpenAiInferResult::RequestStatus() const +{ + RETURN_IF_CB_ERROR(result_->RequestStatus()); + return Error::Success; +} + +Error +OpenAiInferResult::RawData( + const std::string& output_name, const uint8_t** buf, + size_t* byte_size) const +{ + return Error( + "Output retrieval is not currently supported for OpenAi client backend"); +} + +//============================================================================== + + +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h new file mode 100644 index 000000000..c6c83222f --- /dev/null +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h @@ -0,0 +1,130 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include + +#include "../../perf_utils.h" +#include "../client_backend.h" +#include "openai_http_client.h" + +#define RETURN_IF_TRITON_ERROR(S) \ + do { \ + const tc::Error& status__ = (S); \ + if (!status__.IsOk()) { \ + return Error(status__.Message()); \ + } \ + } while (false) + +namespace tc = triton::client; +namespace cb = triton::perfanalyzer::clientbackend; + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + + +//============================================================================== +/// OpenAiClientBackend is used to generate load on the serving instance, +/// which supports OpenAI Chat Completions API +/// +class OpenAiClientBackend : public ClientBackend { + public: + /// Create an OpenAI client backend which can be used to interact with the + /// server. + /// \param url The inference server url and port. + /// \param protocol The protocol type used. + /// \param http_headers Map of HTTP headers. The map key/value indicates + /// the header name/value. + /// \param verbose Enables the verbose mode. + /// \param client_backend Returns a new OpenAiClientBackend + /// object. + /// \return Error object indicating success or failure. + static Error Create( + const std::string& url, const ProtocolType protocol, + std::shared_ptr http_headers, const bool verbose, + std::unique_ptr* client_backend); + + /// See ClientBackend::AsyncInfer() + Error AsyncInfer( + OnCompleteFn callback, const InferOptions& options, + const std::vector& inputs, + const std::vector& outputs) override; + + /// See ClientBackend::ClientInferStat() + Error ClientInferStat(InferStat* infer_stat) override; + + private: + OpenAiClientBackend(std::shared_ptr http_headers) + : ClientBackend(BackendKind::OPENAI), http_headers_(http_headers) + { + } + + void ParseInferStat( + const tc::InferStat& openai_infer_stat, InferStat* infer_stat); + + std::unique_ptr http_client_; + std::shared_ptr http_headers_; +}; + +//============================================================== +/// OpenAiInferRequestedOutput is a wrapper around +/// InferRequestedOutput object of triton common client library. +/// +class OpenAiInferRequestedOutput : public InferRequestedOutput { + public: + static Error Create( + InferRequestedOutput** infer_output, const std::string& name); + /// Returns the raw InferRequestedOutput object required by OpenAi client + /// library. + tc::InferRequestedOutput* Get() const { return output_.get(); } + + private: + explicit OpenAiInferRequestedOutput(const std::string& name); + + std::unique_ptr output_; +}; + +//============================================================== +/// OpenAiInferResult is a wrapper around InferResult object of +/// OpenAi InferResult object. 
+/// +class OpenAiInferResult : public cb::InferResult { + public: + explicit OpenAiInferResult(cb::openai::InferResult* result); + /// See InferResult::Id() + Error Id(std::string* id) const override; + /// See InferResult::RequestStatus() + Error RequestStatus() const override; + /// See InferResult::RawData() + Error RawData( + const std::string& output_name, const uint8_t** buf, + size_t* byte_size) const override; + + private: + std::unique_ptr result_; +}; + +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h new file mode 100644 index 000000000..03e3f489f --- /dev/null +++ b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h @@ -0,0 +1,94 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include "../client_backend.h" +#include "common.h" + + +namespace tc = triton::client; + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +class InferResult; +class HttpInferRequest; + +//============================================================================== +/// An HttpClient object is used to perform any kind of communication with the +/// OpenAi service using +/// +/// \code +/// std::unique_ptr client; +/// HttpClient::Create(&client, "localhost:8080"); +/// ... +/// ... +/// \endcode +/// +class HttpClient : public tc::InferenceServerClient { + public: + ~HttpClient(); + + /// TODO: Adjust as needed + /// Create a client that can be used to communicate with the server. + /// \param client Returns a new InferenceServerHttpClient object. + /// \param server_url The inference server name and port. + /// \param verbose If true generate verbose output when contacting + /// the inference server. + /// \return Error object indicating success or failure. 
+ static Error Create( + std::unique_ptr* client, const std::string& server_url, + const bool verbose); + + private: + HttpClient(const std::string& url, bool verbose); + + // The server url + const std::string url_; +}; + +//====================================================================== + +class InferResult { + public: + static Error Create( + InferResult** infer_result, + std::shared_ptr infer_request); + Error RequestStatus() const; + Error Id(std::string* id) const; + + private: + InferResult(std::shared_ptr infer_request); + + // The status of the inference + Error status_; + // The pointer to the HttpInferRequest object + std::shared_ptr infer_request_; +}; + +//====================================================================== + +}}}} // namespace triton::perfanalyzer::clientbackend::openai From 36e6cebf6dc37d80f6b5ba139e4e2165311b4fab Mon Sep 17 00:00:00 2001 From: tgerdes Date: Wed, 28 Feb 2024 13:13:38 -0600 Subject: [PATCH 04/23] Create OpenAI backend --- .../client_backend/client_backend.cc | 20 +++++++ .../client_backend/openai/CMakeLists.txt | 2 +- .../openai/openai_http_client.cc | 60 +++++++++++++++++++ .../openai/openai_http_client.h | 8 +-- 4 files changed, 85 insertions(+), 5 deletions(-) create mode 100644 src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc diff --git a/src/c++/perf_analyzer/client_backend/client_backend.cc b/src/c++/perf_analyzer/client_backend/client_backend.cc index 95b3ae0b6..282c6e181 100644 --- a/src/c++/perf_analyzer/client_backend/client_backend.cc +++ b/src/c++/perf_analyzer/client_backend/client_backend.cc @@ -32,6 +32,10 @@ #include "triton_c_api/triton_c_api_backend.h" #endif // TRITON_ENABLE_PERF_ANALYZER_C_API +#ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI +#include "openai/openai_client_backend.h" +#endif // TRITON_ENABLE_PERF_ANALYZER_OPENAI + #ifdef TRITON_ENABLE_PERF_ANALYZER_TFS #include "tensorflow_serving/tfserve_client_backend.h" #endif // TRITON_ENABLE_PERF_ANALYZER_TFS @@ -172,6 +176,13 @@ ClientBackend::Create( metrics_url, input_tensor_format, output_tensor_format, &local_backend)); } +#ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI + // TODO -- I think this needs endpoint to be passed in? 
+ else if (kind == OPENAI) { + RETURN_IF_CB_ERROR(openai::OpenAiClientBackend::Create( + url, protocol, http_headers, verbose, &local_backend)); + } +#endif // TRITON_ENABLE_PERF_ANALYZER_OPENAI #ifdef TRITON_ENABLE_PERF_ANALYZER_TFS else if (kind == TENSORFLOW_SERVING) { RETURN_IF_CB_ERROR(tfserving::TFServeClientBackend::Create( @@ -421,6 +432,15 @@ InferInput::Create( RETURN_IF_CB_ERROR(tritonremote::TritonInferInput::Create( infer_input, name, dims, datatype)); } +#ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI + else if (kind == OPENAI) { + RETURN_IF_CB_ERROR( + // FIXME TODO TKG + // openai::OpenAiInferInput::Create(infer_input, name, dims, datatype)); + tritonremote::TritonInferInput::Create( + infer_input, name, dims, datatype)); + } +#endif // TRITON_ENABLE_PERF_ANALYZER_OPENAI #ifdef TRITON_ENABLE_PERF_ANALYZER_TFS else if (kind == TENSORFLOW_SERVING) { RETURN_IF_CB_ERROR(tfserving::TFServeInferInput::Create( diff --git a/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt b/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt index d9b5db33f..ec839a2b3 100644 --- a/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt +++ b/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt @@ -29,6 +29,7 @@ cmake_minimum_required (VERSION 3.18) set( OPENAI_CLIENT_BACKEND_SRCS openai_client_backend.cc + openai_http_client.cc ) set( @@ -45,7 +46,6 @@ add_library( target_link_libraries( openai-client-backend-library - # TODO: Assuming we'll need curl libs PUBLIC CURL::libcurl PUBLIC httpclient_static ) diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc new file mode 100644 index 000000000..5263407c3 --- /dev/null +++ b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include "openai_http_client.h" + + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + + +Error +HttpClient::Create( + std::unique_ptr* client, const std::string& server_url, + bool verbose) +{ + client->reset(new HttpClient(server_url, verbose)); + return Error::Success; +} + + +HttpClient::HttpClient(const std::string& url, bool verbose) + : InferenceServerClient(verbose), url_(url) +// ,easy_handle_(reinterpret_cast(curl_easy_init()) // TODO FIXME TKG +{ +} + +HttpClient::~HttpClient() +{ + exiting_ = true; + + // FIXME TODO TKG + // if (easy_handle_ != nullptr) { + // curl_easy_cleanup(reinterpret_cast(easy_handle_)); + //} +} + +}}}} // namespace triton::perfanalyzer::clientbackend::openai \ No newline at end of file diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h index 03e3f489f..67f7d9144 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h +++ b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h @@ -1,4 +1,4 @@ -// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -65,7 +65,7 @@ class HttpClient : public tc::InferenceServerClient { private: HttpClient(const std::string& url, bool verbose); - + // The server url const std::string url_; }; @@ -77,8 +77,8 @@ class InferResult { static Error Create( InferResult** infer_result, std::shared_ptr infer_request); - Error RequestStatus() const; - Error Id(std::string* id) const; + Error RequestStatus() const { return Error::Success; } // TODO FIXME TKG + Error Id(std::string* id) const { return Error::Success; } // TODO FIXME TKG private: InferResult(std::shared_ptr infer_request); From ccd0b6876c46b17ab7212333b5c0cfbbbf25696a Mon Sep 17 00:00:00 2001 From: tgerdes Date: Wed, 28 Feb 2024 17:43:11 -0600 Subject: [PATCH 05/23] New JSON datatype for PA. 
Show json data available at http_client level --- .../client_backend/client_backend.cc | 5 +- .../client_backend/openai/CMakeLists.txt | 2 + .../openai/openai_client_backend.cc | 14 +-- .../openai/openai_http_client.cc | 60 ++++++++++ .../openai/openai_http_client.h | 12 ++ .../openai/openai_infer_input.cc | 112 ++++++++++++++++++ .../openai/openai_infer_input.h | 76 ++++++++++++ src/c++/perf_analyzer/perf_utils.cc | 23 ++++ 8 files changed, 292 insertions(+), 12 deletions(-) create mode 100644 src/c++/perf_analyzer/client_backend/openai/openai_infer_input.cc create mode 100644 src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h diff --git a/src/c++/perf_analyzer/client_backend/client_backend.cc b/src/c++/perf_analyzer/client_backend/client_backend.cc index 282c6e181..869762942 100644 --- a/src/c++/perf_analyzer/client_backend/client_backend.cc +++ b/src/c++/perf_analyzer/client_backend/client_backend.cc @@ -435,10 +435,7 @@ InferInput::Create( #ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI else if (kind == OPENAI) { RETURN_IF_CB_ERROR( - // FIXME TODO TKG - // openai::OpenAiInferInput::Create(infer_input, name, dims, datatype)); - tritonremote::TritonInferInput::Create( - infer_input, name, dims, datatype)); + openai::OpenAiInferInput::Create(infer_input, name, dims, datatype)); } #endif // TRITON_ENABLE_PERF_ANALYZER_OPENAI #ifdef TRITON_ENABLE_PERF_ANALYZER_TFS diff --git a/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt b/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt index ec839a2b3..3ef867e9f 100644 --- a/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt +++ b/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt @@ -30,12 +30,14 @@ set( OPENAI_CLIENT_BACKEND_SRCS openai_client_backend.cc openai_http_client.cc + openai_infer_input.cc ) set( OPENAI_CLIENT_BACKEND_HDRS openai_client_backend.h openai_http_client.h + openai_infer_input.h ) add_library( diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc index d9cca25d9..d017b8b23 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc @@ -42,11 +42,10 @@ OpenAiClientBackend::Create( "perf_analyzer does not support gRPC protocol with OpenAI endpoints"); } std::unique_ptr openai_client_backend( - new OpenAiClientBackend(http_headers)); + new OpenAiClientBackend(http_headers)); - // TODO: Adjust as needed - RETURN_IF_CB_ERROR(HttpClient::Create( - &(openai_client_backend->http_client_), url, verbose)); + RETURN_IF_CB_ERROR( + HttpClient::Create(&(openai_client_backend->http_client_), url, verbose)); *client_backend = std::move(openai_client_backend); @@ -64,8 +63,8 @@ OpenAiClientBackend::AsyncInfer( callback(result); }; - // TODO: make an async infer call - //RETURN_IF_CB_ERROR(http_client_->AsyncInfer(...)); + RETURN_IF_CB_ERROR(http_client_->AsyncInfer( + wrapped_callback, options, inputs, outputs, *http_headers_)); return Error::Success; } @@ -112,8 +111,7 @@ OpenAiInferRequestedOutput::Create( return Error::Success; } -OpenAiInferRequestedOutput::OpenAiInferRequestedOutput( - const std::string& name) +OpenAiInferRequestedOutput::OpenAiInferRequestedOutput(const std::string& name) : InferRequestedOutput(BackendKind::OPENAI, name) { } diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc index 
5263407c3..151eca2a6 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc @@ -26,6 +26,8 @@ #include "openai_http_client.h" +#include + namespace triton { namespace perfanalyzer { namespace clientbackend { namespace openai { @@ -40,6 +42,64 @@ HttpClient::Create( return Error::Success; } +Error +HttpClient::AsyncInfer( + OpenAiOnCompleteFn callback, const InferOptions& options, + const std::vector& inputs, + const std::vector& outputs, + const Headers& headers) +{ + // TODO FIXME implement + + // TODO FIXME cleanup or remove this. It just proves the json data arrives + rapidjson::Document d{}; + + if (inputs.size() != 1) { + return Error("Only expecting one input"); + } + + auto raw_input = dynamic_cast(inputs[0]); + + raw_input->PrepareForRequest(); + bool end_of_input = false; + const uint8_t* buf; + size_t buf_size; + raw_input->GetNext(&buf, &buf_size, &end_of_input); + if (!end_of_input) { + return Error("Unexpected multiple json data inputs"); + } + if (buf == nullptr) { + return Error("Unexpected null json data"); + } + + std::string json_str(reinterpret_cast(buf), buf_size); + std::cout << "FIXME TODO: JSON data string is " << json_str << std::endl; + + + if (d.Parse(json_str.c_str()).HasParseError()) { + return Error("Unable to parse json string: " + json_str); + } + + // FIXME TKG -- where/how would the 'streaming' option get plugged in? + + // FIXME TKG -- GOOD GOD! Is it this hard to add a single value into a json + // object?? + // FIXME TKG -- what if the user supplied this in the input json file? + d.AddMember( + "model", + rapidjson::Value().SetString( + options.model_name_.c_str(), + static_cast(options.model_name_.length()), + d.GetAllocator()), + d.GetAllocator()); + + for (auto itr = d.MemberBegin(); itr != d.MemberEnd(); ++itr) { + std::cout << "FIXME TODO: valid JSON object has key " + << itr->name.GetString() << std::endl; + } + + return Error::Success; +} HttpClient::HttpClient(const std::string& url, bool verbose) : InferenceServerClient(verbose), url_(url) diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h index 67f7d9144..bbdaddfe9 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h +++ b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h @@ -27,6 +27,7 @@ #include "../client_backend.h" #include "common.h" +#include "openai_infer_input.h" namespace tc = triton::client; @@ -37,6 +38,8 @@ namespace openai { class InferResult; class HttpInferRequest; +using OpenAiOnCompleteFn = std::function; + //============================================================================== /// An HttpClient object is used to perform any kind of communication with the /// OpenAi service using @@ -63,6 +66,15 @@ class HttpClient : public tc::InferenceServerClient { std::unique_ptr* client, const std::string& server_url, const bool verbose); + /// TODO FIXME: Update + /// Run asynchronous inference on server. 
+ Error AsyncInfer( + OpenAiOnCompleteFn callback, const InferOptions& options, + const std::vector& inputs, + const std::vector& outputs = + std::vector(), + const Headers& headers = Headers()); + private: HttpClient(const std::string& url, bool verbose); diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.cc b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.cc new file mode 100644 index 000000000..70d827e85 --- /dev/null +++ b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.cc @@ -0,0 +1,112 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "openai_infer_input.h" + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +Error +OpenAiInferInput::Create( + InferInput** infer_input, const std::string& name, + const std::vector& dims, const std::string& datatype) +{ + OpenAiInferInput* local_infer_input = + new OpenAiInferInput(name, dims, datatype); + + *infer_input = local_infer_input; + return Error::Success; +} + +Error +OpenAiInferInput::SetShape(const std::vector& shape) +{ + shape_ = shape; + return Error::Success; +} + +Error +OpenAiInferInput::Reset() +{ + bufs_.clear(); + buf_byte_sizes_.clear(); + bufs_idx_ = 0; + byte_size_ = 0; + return Error::Success; +} + +Error +OpenAiInferInput::AppendRaw(const uint8_t* input, size_t input_byte_size) +{ + byte_size_ += input_byte_size; + + bufs_.push_back(input); + buf_byte_sizes_.push_back(input_byte_size); + + return Error::Success; +} + +Error +OpenAiInferInput::ByteSize(size_t* byte_size) const +{ + *byte_size = byte_size_; + return Error::Success; +} + +Error +OpenAiInferInput::PrepareForRequest() +{ + // Reset position so request sends entire input. 
+ bufs_idx_ = 0; + buf_pos_ = 0; + return Error::Success; +} + +Error +OpenAiInferInput::GetNext( + const uint8_t** buf, size_t* input_bytes, bool* end_of_input) +{ + if (bufs_idx_ < bufs_.size()) { + *buf = bufs_[bufs_idx_]; + *input_bytes = buf_byte_sizes_[bufs_idx_]; + bufs_idx_++; + } else { + *buf = nullptr; + *input_bytes = 0; + } + *end_of_input = (bufs_idx_ >= bufs_.size()); + + return Error::Success; +} + +OpenAiInferInput::OpenAiInferInput( + const std::string& name, const std::vector& dims, + const std::string& datatype) + : InferInput(BackendKind::TENSORFLOW_SERVING, name, datatype), shape_(dims) +{ +} + +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h new file mode 100644 index 000000000..a10b9312f --- /dev/null +++ b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h @@ -0,0 +1,76 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include + +#include "../../perf_utils.h" +#include "../client_backend.h" + + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +//============================================================== +/// OpenAiInferInput instance holds the information regarding +/// model input tensors and their corresponding generated data. +/// +class OpenAiInferInput : public InferInput { + public: + static Error Create( + InferInput** infer_input, const std::string& name, + const std::vector& dims, const std::string& datatype); + /// See InferInput::Shape() + const std::vector& Shape() const override { return shape_; } + /// See InferInput::SetShape() + Error SetShape(const std::vector& shape) override; + /// See InferInput::Reset() + Error Reset() override; + /// See InferInput::AppendRaw() + Error AppendRaw(const uint8_t* input, size_t input_byte_size) override; + /// Gets the size of data added into this input in bytes. 
+ /// \param byte_size The size of data added in bytes. + /// \return Error object indicating success or failure. + Error ByteSize(size_t* byte_size) const; + /// Resets the heads to start providing data from the beginning. + Error PrepareForRequest(); + /// Get the next chunk of data if available. + Error GetNext(const uint8_t** buf, size_t* input_bytes, bool* end_of_input); + + private: + explicit OpenAiInferInput( + const std::string& name, const std::vector& dims, + const std::string& datatype); + + std::vector shape_; + size_t byte_size_{0}; + + size_t bufs_idx_, buf_pos_; + std::vector bufs_; + std::vector buf_byte_sizes_; +}; + +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/src/c++/perf_analyzer/perf_utils.cc b/src/c++/perf_analyzer/perf_utils.cc index d6c0a8a37..4c02f56ca 100644 --- a/src/c++/perf_analyzer/perf_utils.cc +++ b/src/c++/perf_analyzer/perf_utils.cc @@ -27,6 +27,8 @@ #include "perf_utils.h" #include +#include +#include #include #include @@ -200,6 +202,25 @@ SerializeExplicitTensor( std::copy( serialized.begin(), serialized.end(), std::back_inserter(*decoded_data)); + } else if (dt.compare("JSON") == 0) { + std::string serialized = ""; + + for (const auto& value : tensor.GetArray()) { + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + value.Accept(writer); + + std::string element = buffer.GetString(); + uint32_t len = element.size(); + // FIXME TODO - for BYTES we add the length. Is there any reason that + // would be needed here? + // serialized.append(reinterpret_cast(&len), + // sizeof(uint32_t)); + serialized.append(element); + } + std::copy( + serialized.begin(), serialized.end(), + std::back_inserter(*decoded_data)); } else { for (const auto& value : tensor.GetArray()) { if (dt.compare("BOOL") == 0) { @@ -298,6 +319,8 @@ SerializeExplicitTensor( double element(value.GetDouble()); const char* src = reinterpret_cast(&element); decoded_data->insert(decoded_data->end(), src, src + sizeof(double)); + } else { + return cb::Error("Unexpected type " + dt); } } } From 4b841b331ea64a3fe5163327ed71eaf76f1e5efd Mon Sep 17 00:00:00 2001 From: tgerdes Date: Fri, 1 Mar 2024 15:16:15 -0600 Subject: [PATCH 06/23] Add an output to OpenAI models --- .../perf_analyzer/client_backend/client_backend.cc | 6 ++++++ src/c++/perf_analyzer/model_parser.cc | 14 ++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/c++/perf_analyzer/client_backend/client_backend.cc b/src/c++/perf_analyzer/client_backend/client_backend.cc index 869762942..04a68fefb 100644 --- a/src/c++/perf_analyzer/client_backend/client_backend.cc +++ b/src/c++/perf_analyzer/client_backend/client_backend.cc @@ -522,6 +522,12 @@ InferRequestedOutput::Create( RETURN_IF_CB_ERROR(tritonremote::TritonInferRequestedOutput::Create( infer_output, name, class_count)); } +#ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI + else if (kind == OPENAI) { + RETURN_IF_CB_ERROR( + openai::OpenAiInferRequestedOutput::Create(infer_output, name)); + } +#endif // TRITON_ENABLE_PERF_ANALYZER_OPENAI #ifdef TRITON_ENABLE_PERF_ANALYZER_TFS else if (kind == TENSORFLOW_SERVING) { RETURN_IF_CB_ERROR( diff --git a/src/c++/perf_analyzer/model_parser.cc b/src/c++/perf_analyzer/model_parser.cc index 7dcd59819..30e149c0c 100644 --- a/src/c++/perf_analyzer/model_parser.cc +++ b/src/c++/perf_analyzer/model_parser.cc @@ -277,10 +277,16 @@ ModelParser::InitOpenAI( max_batch_size_ = batch_size; // OpenAI will take a single json input with a fully formed payload - auto it = 
inputs_->emplace("payload", ModelTensor()).first; - it->second.name_ = "payload"; - it->second.datatype_ = "JSON"; - it->second.shape_.push_back(1); + auto in_it = inputs_->emplace("payload", ModelTensor()).first; + in_it->second.name_ = "payload"; + in_it->second.datatype_ = "JSON"; + in_it->second.shape_.push_back(1); + + // OpenAI will reply with a single json output + auto out_it = outputs_->emplace("response", ModelTensor()).first; + out_it->second.name_ = "response"; + out_it->second.datatype_ = "JSON"; + out_it->second.shape_.push_back(1); return cb::Error::Success; } From e9f4a221ce254cdd83a50a2634fa7624db96fcf9 Mon Sep 17 00:00:00 2001 From: GuanLuo <41310872+GuanLuo@users.noreply.github.com> Date: Mon, 4 Mar 2024 08:00:27 -0800 Subject: [PATCH 07/23] Add OpenAI client (#482) * Add OpenAI client * Address comment --- .../client_backend/openai/CMakeLists.txt | 8 +- .../client_backend/openai/http_client.cc | 267 ++++++++++++++++ .../client_backend/openai/http_client.h | 191 +++++++++++ .../client_backend/openai/openai_client.cc | 298 ++++++++++++++++++ .../client_backend/openai/openai_client.h | 181 +++++++++++ .../openai/openai_client_backend.cc | 65 +--- .../openai/openai_client_backend.h | 25 +- .../openai/openai_http_client.cc | 120 ------- .../openai/openai_http_client.h | 106 ------- .../openai/openai_infer_input.cc | 38 +-- .../openai/openai_infer_input.h | 13 +- 11 files changed, 971 insertions(+), 341 deletions(-) create mode 100644 src/c++/perf_analyzer/client_backend/openai/http_client.cc create mode 100644 src/c++/perf_analyzer/client_backend/openai/http_client.h create mode 100644 src/c++/perf_analyzer/client_backend/openai/openai_client.cc create mode 100644 src/c++/perf_analyzer/client_backend/openai/openai_client.h delete mode 100644 src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc delete mode 100644 src/c++/perf_analyzer/client_backend/openai/openai_http_client.h diff --git a/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt b/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt index 3ef867e9f..93963e378 100644 --- a/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt +++ b/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt @@ -28,15 +28,17 @@ cmake_minimum_required (VERSION 3.18) set( OPENAI_CLIENT_BACKEND_SRCS + http_client.cc openai_client_backend.cc - openai_http_client.cc + openai_client.cc openai_infer_input.cc ) set( OPENAI_CLIENT_BACKEND_HDRS + http_client.h openai_client_backend.h - openai_http_client.h + openai_client.h openai_infer_input.h ) @@ -48,7 +50,7 @@ add_library( target_link_libraries( openai-client-backend-library - PUBLIC CURL::libcurl + PUBLIC CURL::libcurl PUBLIC httpclient_static ) diff --git a/src/c++/perf_analyzer/client_backend/openai/http_client.cc b/src/c++/perf_analyzer/client_backend/openai/http_client.cc new file mode 100644 index 000000000..4c8632c52 --- /dev/null +++ b/src/c++/perf_analyzer/client_backend/openai/http_client.cc @@ -0,0 +1,267 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "http_client.h" + +#include +#include + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +HttpRequest::HttpRequest( + std::function&& completion_callback, const bool verbose) + : completion_callback_(std::move(completion_callback)), verbose_(verbose) +{ +} + +HttpRequest::~HttpRequest() +{ + if (header_list_ != nullptr) { + curl_slist_free_all(header_list_); + header_list_ = nullptr; + } +} + +void +HttpRequest::AddInput(uint8_t* buf, size_t byte_size) +{ + data_buffers_.push_back(std::pair(buf, byte_size)); + total_input_byte_size_ += byte_size; +} + +void +HttpRequest::GetNextInput(uint8_t* buf, size_t size, size_t* input_bytes) +{ + *input_bytes = 0; + + while (!data_buffers_.empty() && size > 0) { + const size_t csz = std::min(data_buffers_.front().second, size); + if (csz > 0) { + const uint8_t* input_ptr = data_buffers_.front().first; + std::copy(input_ptr, input_ptr + csz, buf); + size -= csz; + buf += csz; + *input_bytes += csz; + + data_buffers_.front().first += csz; + data_buffers_.front().second -= csz; + } + if (data_buffers_.front().second == 0) { + data_buffers_.pop_front(); + } + } +} + +HttpClient::HttpClient( + const std::string& server_url, bool verbose, + const HttpSslOptions& ssl_options) + : url_(server_url), verbose_(verbose), ssl_options_(ssl_options) +{ + auto* ver = curl_version_info(CURLVERSION_NOW); + if (ver->features & CURL_VERSION_THREADSAFE == 0) { + throw std::runtime_error( + "HTTP client has dependency on CURL library to have thread-safe " + "support (CURL_VERSION_THREADSAFE set)"); + } + if (curl_global_init(CURL_GLOBAL_ALL) != 0) { + throw std::runtime_error("CURL global initialization failed"); + } + + multi_handle_ = curl_multi_init(); + + worker_ = std::thread(&HttpClient::AsyncTransfer, this); +} + +HttpClient::~HttpClient() +{ + exiting_ = true; + + // thread not joinable if AsyncInfer() is not called + // (it is default constructed thread before the first AsyncInfer() call) + if (worker_.joinable()) { + cv_.notify_all(); + worker_.join(); + } + + for (auto& request : ongoing_async_requests_) { + CURL* easy_handle = reinterpret_cast(request.first); + curl_multi_remove_handle(multi_handle_, easy_handle); + curl_easy_cleanup(easy_handle); + } + curl_multi_cleanup(multi_handle_); + + curl_global_cleanup(); +} + 
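The constructor and destructor above establish the lifecycle of the shared curl multi handle and of the worker thread that runs AsyncTransfer(). For orientation, the following is a minimal, illustrative sketch (not part of this patch) of how a specialized client is expected to build on this base class: construct an HttpRequest with a completion callback, attach the body with AddInput(), configure a curl easy handle, and hand both to the non-blocking Send(). The names MinimalClient, Post, and ReadBody are hypothetical; the real derived client in this series is ChatCompletionClient.

// Illustrative sketch only -- not part of this patch. Response handling is
// omitted (see ChatCompletionClient's ResponseHandler later in this series).
class MinimalClient : public HttpClient {
 public:
  explicit MinimalClient(const std::string& url, bool verbose = false)
      : HttpClient(url, verbose)
  {
  }

  void Post(std::string& body, std::function<void(HttpRequest*)> on_done)
  {
    // HttpRequest stores raw pointers only, so 'body' must outlive the
    // transfer; 'on_done' is invoked from the worker thread started above.
    std::unique_ptr<HttpRequest> request(
        new HttpRequest(std::move(on_done), verbose_));
    request->AddInput(
        reinterpret_cast<uint8_t*>(body.data()), body.size());

    CURL* handle = curl_easy_init();
    curl_easy_setopt(handle, CURLOPT_URL, url_.c_str());
    curl_easy_setopt(handle, CURLOPT_POST, 1L);
    curl_easy_setopt(
        handle, CURLOPT_POSTFIELDSIZE_LARGE,
        static_cast<curl_off_t>(request->total_input_byte_size_));
    // Feed the body through HttpRequest::GetNextInput().
    curl_easy_setopt(handle, CURLOPT_READFUNCTION, ReadBody);
    curl_easy_setopt(handle, CURLOPT_READDATA, request.get());
    SetSSLCurlOptions(handle);

    // Send() does not block; completion is reported via 'on_done'.
    Send(handle, std::move(request));
  }

 private:
  static size_t ReadBody(void* dst, size_t size, size_t nmemb, void* userp)
  {
    size_t copied = 0;
    reinterpret_cast<HttpRequest*>(userp)->GetNextInput(
        reinterpret_cast<uint8_t*>(dst), size * nmemb, &copied);
    return copied;
  }
};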
+const std::string& +HttpClient::ParseSslCertType(HttpSslOptions::CERTTYPE cert_type) +{ + static std::string pem_str{"PEM"}; + static std::string der_str{"DER"}; + switch (cert_type) { + case HttpSslOptions::CERTTYPE::CERT_PEM: + return pem_str; + case HttpSslOptions::CERTTYPE::CERT_DER: + return der_str; + } + throw std::runtime_error( + "Unexpected SSL certificate type encountered. Only PEM and DER are " + "supported."); +} + +const std::string& +HttpClient::ParseSslKeyType(HttpSslOptions::KEYTYPE key_type) +{ + static std::string pem_str{"PEM"}; + static std::string der_str{"DER"}; + switch (key_type) { + case HttpSslOptions::KEYTYPE::KEY_PEM: + return pem_str; + case HttpSslOptions::KEYTYPE::KEY_DER: + return der_str; + } + throw std::runtime_error( + "unsupported SSL key type encountered. Only PEM and DER are " + "supported."); +} + +void +HttpClient::SetSSLCurlOptions(CURL* curl_handle) +{ + curl_easy_setopt( + curl_handle, CURLOPT_SSL_VERIFYPEER, ssl_options_.verify_peer); + curl_easy_setopt( + curl_handle, CURLOPT_SSL_VERIFYHOST, ssl_options_.verify_host); + if (!ssl_options_.ca_info.empty()) { + curl_easy_setopt(curl_handle, CURLOPT_CAINFO, ssl_options_.ca_info.c_str()); + } + const auto& curl_cert_type = ParseSslCertType(ssl_options_.cert_type); + curl_easy_setopt(curl_handle, CURLOPT_SSLCERTTYPE, curl_cert_type.c_str()); + if (!ssl_options_.cert.empty()) { + curl_easy_setopt(curl_handle, CURLOPT_SSLCERT, ssl_options_.cert.c_str()); + } + const auto& curl_key_type = ParseSslKeyType(ssl_options_.key_type); + curl_easy_setopt(curl_handle, CURLOPT_SSLKEYTYPE, curl_key_type.c_str()); + if (!ssl_options_.key.empty()) { + curl_easy_setopt(curl_handle, CURLOPT_SSLKEY, ssl_options_.key.c_str()); + } +} + +void +HttpClient::Send(CURL* handle, std::unique_ptr&& request) +{ + std::lock_guard lock(mutex_); + + auto insert_result = ongoing_async_requests_.emplace( + std::make_pair(reinterpret_cast(handle), std::move(request))); + if (!insert_result.second) { + curl_easy_cleanup(handle); + throw std::runtime_error( + "Failed to insert new asynchronous request context."); + } + curl_multi_add_handle(multi_handle_, handle); + cv_.notify_all(); +} + +void +HttpClient::AsyncTransfer() +{ + int place_holder = 0; + CURLMsg* msg = nullptr; + do { + std::vector> request_list; + + // sleep if no work is available + std::unique_lock lock(mutex_); + cv_.wait(lock, [this] { + if (this->exiting_) { + return true; + } + // wake up if an async request has been generated + return !this->ongoing_async_requests_.empty(); + }); + + CURLMcode mc = curl_multi_perform(multi_handle_, &place_holder); + int numfds; + if (mc == CURLM_OK) { + // Wait for activity. 
If there are no descriptors in the multi_handle_ + // then curl_multi_wait will return immediately + mc = curl_multi_wait(multi_handle_, NULL, 0, INT_MAX, &numfds); + if (mc == CURLM_OK) { + while ((msg = curl_multi_info_read(multi_handle_, &place_holder))) { + uintptr_t identifier = reinterpret_cast(msg->easy_handle); + auto itr = ongoing_async_requests_.find(identifier); + // This shouldn't happen + if (itr == ongoing_async_requests_.end()) { + std::cerr + << "Unexpected error: received completed request that is not " + "in the list of asynchronous requests" + << std::endl; + curl_multi_remove_handle(multi_handle_, msg->easy_handle); + curl_easy_cleanup(msg->easy_handle); + continue; + } + + long http_code = 400; + if (msg->data.result == CURLE_OK) { + curl_easy_getinfo( + msg->easy_handle, CURLINFO_RESPONSE_CODE, &http_code); + } else if (msg->data.result == CURLE_OPERATION_TIMEDOUT) { + http_code = 499; + } + + request_list.emplace_back(std::move(itr->second)); + ongoing_async_requests_.erase(itr); + curl_multi_remove_handle(multi_handle_, msg->easy_handle); + curl_easy_cleanup(msg->easy_handle); + + std::unique_ptr& async_request = request_list.back(); + async_request->http_code_ = http_code; + + if (msg->msg != CURLMSG_DONE) { + // Something wrong happened. + std::cerr << "Unexpected error: received CURLMsg=" << msg->msg + << std::endl; + } + } + } else { + std::cerr << "Unexpected error: curl_multi failed. Code:" << mc + << std::endl; + } + } else { + std::cerr << "Unexpected error: curl_multi failed. Code:" << mc + << std::endl; + } + lock.unlock(); + + for (auto& this_request : request_list) { + this_request->completion_callback_(this_request.get()); + } + } while (!exiting_); +} + +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/src/c++/perf_analyzer/client_backend/openai/http_client.h b/src/c++/perf_analyzer/client_backend/openai/http_client.h new file mode 100644 index 000000000..c6acfd524 --- /dev/null +++ b/src/c++/perf_analyzer/client_backend/openai/http_client.h @@ -0,0 +1,191 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +// [TODO] Below should already be a generic class for any HTTP use, +// relocate it so that it can be used elsewhere +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +// [FIXME] add back "parameter" handling +// [FIXME] add back "compression" handling + +/// The key-value map type to be included in the request +/// as custom headers. +typedef std::map Headers; +/// The key-value map type to be included as URL parameters. +typedef std::map Parameters; + +// The options for authorizing and authenticating SSL/TLS connections. +struct HttpSslOptions { + enum CERTTYPE { CERT_PEM = 0, CERT_DER = 1 }; + enum KEYTYPE { + KEY_PEM = 0, + KEY_DER = 1 + // TODO: Support loading private key from crypto engine + // KEY_ENG = 2 + }; + explicit HttpSslOptions() + : verify_peer(1), verify_host(2), cert_type(CERTTYPE::CERT_PEM), + key_type(KEYTYPE::KEY_PEM) + { + } + // This option determines whether curl verifies the authenticity of the peer's + // certificate. A value of 1 means curl verifies; 0 (zero) means it does not. + // Default value is 1. See here for more details: + // https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYPEER.html + long verify_peer; + // This option determines whether libcurl verifies that the server cert is for + // the server it is known as. The default value for this option is 2 which + // means that certificate must indicate that the server is the server to which + // you meant to connect, or the connection fails. See here for more details: + // https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYHOST.html + long verify_host; + // File holding one or more certificates to verify the peer with. If not + // specified, client will look for the system path where cacert bundle is + // assumed to be stored, as established at build time. See here for more + // information: https://curl.se/libcurl/c/CURLOPT_CAINFO.html + std::string ca_info; + // The format of client certificate. By default it is CERT_PEM. See here for + // more details: https://curl.se/libcurl/c/CURLOPT_SSLCERTTYPE.html + CERTTYPE cert_type; + // The file name of your client certificate. See here for more details: + // https://curl.se/libcurl/c/CURLOPT_SSLCERT.html + std::string cert; + // The format of the private key. By default it is KEY_PEM. See here for more + // details: https://curl.se/libcurl/c/CURLOPT_SSLKEYTYPE.html. + KEYTYPE key_type; + // The private key. See here for more details: + // https://curl.se/libcurl/c/CURLOPT_SSLKEY.html. + std::string key; +}; + +// HttpRequest object representing the context of a HTTP transaction. Currently +// it is also designed to be the placeholder for response data, but how the +// response is stored can be revisited later. 
+// 'completion_callback' doesn't transfer ownership of HttpRequest, caller must +// not keep the reference and access HttpRequest object after +// 'completion_callback' returns +class HttpRequest { + public: + HttpRequest( + std::function&& completion_callback, + const bool verbose = false); + virtual ~HttpRequest(); + + // Adds the input data to be delivered to the server, note that the HTTP + // request does not own the buffer. + void AddInput(uint8_t* buf, size_t byte_size); + + // Helper function for CURL + // Copy into 'buf' up to 'size' bytes of input data. Return the + // actual amount copied in 'input_bytes'. + void GetNextInput(uint8_t* buf, size_t size, size_t* input_bytes); + + // [FIXME] define default callback like + // CURLOPT_READFUNCTION, CURLOPT_WRITEFUNCTION here? + // the specialized HttpRequest can override the callbacks when read / write + // schema has changed. + + // Buffer that accumulates the response body. + std::string response_buffer_; + + size_t total_input_byte_size_{0}; + + // HTTP response code for the inference request + uint32_t http_code_{200}; + + std::function completion_callback_{nullptr}; + + // Pointer to the list of the HTTP request header, keep it such that it will + // be valid during the transfer and can be freed once transfer is completed. + struct curl_slist* header_list_{nullptr}; + + protected: + const bool verbose_{false}; + + // Pointers to the input data. + std::deque> data_buffers_; +}; + +// Base class for common HTTP functionalities +class HttpClient { + public: + enum class CompressionType { NONE, DEFLATE, GZIP }; + + virtual ~HttpClient(); + + protected: + void SetSSLCurlOptions(CURL* curl_handle); + + HttpClient( + const std::string& server_url, bool verbose = false, + const HttpSslOptions& ssl_options = HttpSslOptions()); + + // Note that this function does not block + void Send(CURL* handle, std::unique_ptr&& request); + + // [FIXME] provide more helper functions to encapsulate CURL detail + + protected: + void AsyncTransfer(); + + bool exiting_{false}; + + std::thread worker_; + std::mutex mutex_; + std::condition_variable cv_; + + // The server url + const std::string url_; + // The options for authorizing and authenticating SSL/TLS connections + HttpSslOptions ssl_options_; + + using AsyncReqMap = std::map>; + // curl multi handle for processing asynchronous requests + void* multi_handle_; + // map to record ongoing asynchronous requests with pointer to easy handle + // or tag id as key + AsyncReqMap ongoing_async_requests_; + + bool verbose_; + + private: + // [FIXME] should belong to SSL option struct as helper function + const std::string& ParseSslKeyType(HttpSslOptions::KEYTYPE key_type); + const std::string& ParseSslCertType(HttpSslOptions::CERTTYPE cert_type); +}; +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc new file mode 100644 index 000000000..f83c3976b --- /dev/null +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc @@ -0,0 +1,298 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Include this first to make sure we are a friend of common classes. +#define TRITON_INFERENCE_SERVER_CLIENT_CLASS InferenceServerHttpClient +#include "openai_client.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#ifdef TRITON_ENABLE_ZLIB +#include +#endif + +extern "C" { +#include "cencode.h" +} + +#ifdef _WIN32 +#define strncasecmp(x, y, z) _strnicmp(x, y, z) +#undef min // NOMINMAX did not resolve std::min compile error +#endif //_WIN32 + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +//============================================================================== + +void +ChatCompletionRequest::SendResponse(bool is_final, bool is_null) +{ + response_callback_(new ChatCompletionResult( + http_code_, std::move(response_buffer_), is_final, is_null, request_id_)); +} + +ChatCompletionClient::ChatCompletionClient( + const std::string& url, bool verbose, const HttpSslOptions& ssl_options) + : HttpClient( + std::string(url + "/v1/chat/completions"), verbose, ssl_options) +{ +} + +size_t +ChatCompletionClient::RequestProvider( + void* contents, size_t size, size_t nmemb, void* userp) +{ + auto request = reinterpret_cast(userp); + + size_t input_bytes = 0; + request->GetNextInput( + reinterpret_cast(contents), size * nmemb, &input_bytes); + + if (input_bytes == 0) { + request->timer_.CaptureTimestamp( + triton::client::RequestTimers::Kind::SEND_END); + } + + return input_bytes; +} + +size_t +ChatCompletionClient::ResponseHeaderHandler( + void* contents, size_t size, size_t nmemb, void* userp) +{ + auto request = reinterpret_cast(userp); + + char* buf = reinterpret_cast(contents); + size_t byte_size = size * nmemb; + + std::string hdr(buf, byte_size); + std::transform(hdr.begin(), hdr.end(), hdr.begin(), [](unsigned char c) { + return std::tolower(c); + }); + if (hdr.find("content-type") != std::string::npos) { + request->is_stream_ = (hdr.find("text/event-stream") != std::string::npos); + } + + return byte_size; +} + +size_t +ChatCompletionClient::ResponseHandler( + void* contents, size_t size, size_t nmemb, void* userp) +{ + // [WIP] verify if the SSE responses received are complete, or the response + // need to be stitched first + auto request = reinterpret_cast(userp); + if 
(request->timer_.Timestamp( + triton::client::RequestTimers::Kind::RECV_START) == 0) { + request->timer_.CaptureTimestamp( + triton::client::RequestTimers::Kind::RECV_START); + } + + char* buf = reinterpret_cast(contents); + size_t result_bytes = size * nmemb; + request->response_buffer_.append(buf, result_bytes); + // Send response now if streaming, otherwise wait until request has been + // completed + if (request->is_stream_) { + // [FIXME] assume it is proper chunked of response + auto done_signal = + (request->response_buffer_.find("data: [DONE]") != std::string::npos); + request->SendResponse( + done_signal /* is_final */, done_signal /* is_null */); + } + + // ResponseHandler may be called multiple times so we overwrite + // RECV_END so that we always have the time of the last. + request->timer_.CaptureTimestamp( + triton::client::RequestTimers::Kind::RECV_END); + + return result_bytes; +} + + +Error +ChatCompletionClient::AsyncInfer( + std::function callback, + std::string& serialized_request_body, const std::string& request_id) +{ + if (callback == nullptr) { + return Error( + "Callback function must be provided along with AsyncInfer() call."); + } + + auto completion_callback = [this](HttpRequest* req) { + auto request = static_cast(req); + if (!request->is_stream_) { + request->SendResponse(true /* is_final */, false /* is_null */); + } + request->timer_.CaptureTimestamp( + triton::client::RequestTimers::Kind::REQUEST_END); + UpdateInferStat(request->timer_); + }; + std::unique_ptr request(new ChatCompletionRequest( + std::move(completion_callback), std::move(callback), request_id, + verbose_)); + auto raw_request = static_cast(request.get()); + raw_request->timer_.CaptureTimestamp( + triton::client::RequestTimers::Kind::REQUEST_START); + request->AddInput( + reinterpret_cast(serialized_request_body.data()), + serialized_request_body.size()); + + CURL* multi_easy_handle = curl_easy_init(); + Error err = PreRunProcessing(multi_easy_handle, raw_request); + if (!err.IsOk()) { + curl_easy_cleanup(multi_easy_handle); + return err; + } + + raw_request->timer_.CaptureTimestamp( + triton::client::RequestTimers::Kind::SEND_START); + Send(multi_easy_handle, std::move(request)); + return Error::Success; +} + +Error +ChatCompletionClient::PreRunProcessing( + CURL* curl, ChatCompletionRequest* request) +{ + curl_easy_setopt(curl, CURLOPT_URL, url_.c_str()); + curl_easy_setopt(curl, CURLOPT_USERAGENT, "libcurl-agent/1.0"); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1L); + + if (verbose_) { + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); + } + + const long buffer_byte_size = 16 * 1024 * 1024; + curl_easy_setopt(curl, CURLOPT_UPLOAD_BUFFERSIZE, buffer_byte_size); + curl_easy_setopt(curl, CURLOPT_BUFFERSIZE, buffer_byte_size); + + // request data provided by RequestProvider() + curl_easy_setopt(curl, CURLOPT_READFUNCTION, RequestProvider); + curl_easy_setopt(curl, CURLOPT_READDATA, request); + + // response headers handled by ResponseHeaderHandler() + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, ResponseHeaderHandler); + curl_easy_setopt(curl, CURLOPT_HEADERDATA, request); + + // response data handled by ResponseHandler() + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, ResponseHandler); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, request); + + const curl_off_t post_byte_size = request->total_input_byte_size_; + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE_LARGE, post_byte_size); + + SetSSLCurlOptions(curl); + + struct curl_slist* list = nullptr; + 
list = curl_slist_append(list, "Expect:"); + list = curl_slist_append(list, "Content-Type: application/json"); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, list); + + // The list will be freed when the request is destructed + request->header_list_ = list; + + return Error::Success; +} + +Error +ChatCompletionClient::UpdateInferStat( + const triton::client::RequestTimers& timer) +{ + const uint64_t request_time_ns = timer.Duration( + triton::client::RequestTimers::Kind::REQUEST_START, + triton::client::RequestTimers::Kind::REQUEST_END); + const uint64_t send_time_ns = timer.Duration( + triton::client::RequestTimers::Kind::SEND_START, + triton::client::RequestTimers::Kind::SEND_END); + const uint64_t recv_time_ns = timer.Duration( + triton::client::RequestTimers::Kind::RECV_START, + triton::client::RequestTimers::Kind::RECV_END); + + if ((request_time_ns == std::numeric_limits::max()) || + (send_time_ns == std::numeric_limits::max()) || + (recv_time_ns == std::numeric_limits::max())) { + return Error( + "Timer not set correctly." + + ((timer.Timestamp(triton::client::RequestTimers::Kind::REQUEST_START) > + timer.Timestamp(triton::client::RequestTimers::Kind::REQUEST_END)) + ? (" Request time from " + + std::to_string(timer.Timestamp( + triton::client::RequestTimers::Kind::REQUEST_START)) + + " to " + + std::to_string(timer.Timestamp( + triton::client::RequestTimers::Kind::REQUEST_END)) + + ".") + : "") + + ((timer.Timestamp(triton::client::RequestTimers::Kind::SEND_START) > + timer.Timestamp(triton::client::RequestTimers::Kind::SEND_END)) + ? (" Send time from " + + std::to_string(timer.Timestamp( + triton::client::RequestTimers::Kind::SEND_START)) + + " to " + + std::to_string(timer.Timestamp( + triton::client::RequestTimers::Kind::SEND_END)) + + ".") + : "") + + ((timer.Timestamp(triton::client::RequestTimers::Kind::RECV_START) > + timer.Timestamp(triton::client::RequestTimers::Kind::RECV_END)) + ? (" Receive time from " + + std::to_string(timer.Timestamp( + triton::client::RequestTimers::Kind::RECV_START)) + + " to " + + std::to_string(timer.Timestamp( + triton::client::RequestTimers::Kind::RECV_END)) + + ".") + : "")); + } + + infer_stat_.completed_request_count++; + infer_stat_.cumulative_total_request_time_ns += request_time_ns; + infer_stat_.cumulative_send_time_ns += send_time_ns; + infer_stat_.cumulative_receive_time_ns += recv_time_ns; + + return Error::Success; +} + +//============================================================================== + +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.h b/src/c++/perf_analyzer/client_backend/openai/openai_client.h new file mode 100644 index 000000000..bff2d299f --- /dev/null +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.h @@ -0,0 +1,181 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +/// \file + +#include +#include + +#include "../client_backend.h" +#include "common.h" +#include "http_client.h" + + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +class ChatCompletionResult : public InferResult { + public: + ChatCompletionResult( + uint32_t http_code, std::string&& serialized_response, bool is_final, + bool is_null, const std::string& request_id) + : http_code_(http_code), + serialized_response_(std::move(serialized_response)), + is_final_(is_final), is_null_(is_null), request_id_(request_id) + { + } + virtual ~ChatCompletionResult() = default; + + /// Get the id of the request which generated this response. + /// \param id Returns the request id that generated the result. + /// \return Error object indicating success or failure. + Error Id(std::string* id) const override + { + *id = request_id_; + return Error::Success; + } + + + /// Returns the status of the request. + /// \return Error object indicating the success or failure of the + /// request. + Error RequestStatus() const override + { + if ((http_code_ >= 400) && (http_code_ <= 599)) { + return Error( + "OpenAI response returns HTTP code" + std::to_string(http_code_)); + } + return Error::Success; + } + + /// Returns the raw data of the output. + /// \return Error object indicating the success or failure of the + /// request. + Error RawData( + const std::string& output_name, const uint8_t** buf, + size_t* byte_size) const override + { + // [FIXME] disregard "output_name" which is not compatible to + // OpenAI protocol + *buf = reinterpret_cast(serialized_response_.c_str()); + *byte_size = serialized_response_.size(); + return Error::Success; + } + + /// Get final response bool for this response. + /// \return Error object indicating the success or failure. + Error IsFinalResponse(bool* is_final_response) const override + { + *is_final_response = is_final_; + return Error::Success; + }; + + /// Get null response bool for this response. + /// \return Error object indicating the success or failure. 
+ Error IsNullResponse(bool* is_null_response) const override + { + *is_null_response = is_null_; + return Error::Success; + }; + + private: + const uint32_t http_code_{200}; + const std::string serialized_response_; + const bool is_final_{false}; + const bool is_null_{false}; + const std::string request_id_; +}; + + +class ChatCompletionRequest : public HttpRequest { + public: + virtual ~ChatCompletionRequest() {} + ChatCompletionRequest( + std::function&& completion_callback, + std::function&& response_callback, + const std::string& request_id, const bool verbose = false) + : HttpRequest(std::move(completion_callback), verbose), + response_callback_(std::move(response_callback)), + request_id_(request_id) + { + } + void SendResponse(bool is_final, bool is_null); + bool is_stream_{false}; + std::function response_callback_{nullptr}; + // The timers for infer request. + triton::client::RequestTimers timer_; + const std::string request_id_; +}; + +class ChatCompletionClient : public HttpClient { + public: + virtual ~ChatCompletionClient() = default; + + /// Create a client that can be used to communicate with the server. + /// \param server_url The inference server name, port, optional + /// scheme and optional base path in the following format: + /// host:port/. + /// \param verbose If true generate verbose output when contacting + /// the inference server. + /// \param ssl_options Specifies the settings for configuring + /// SSL encryption and authorization. Providing these options + /// do not ensure that SSL/TLS will be used in communication. + /// The use of SSL/TLS depends entirely on the server endpoint. + /// These options will be ignored if the server_url does not + /// expose `https://` scheme. + ChatCompletionClient( + const std::string& server_url, bool verbose = false, + const HttpSslOptions& ssl_options = HttpSslOptions()); + + /// Simplified AsyncInfer() where the request body is expected to be + /// prepared by the caller, the client here is responsible to communicate + /// with a OpenAI-compatible server in both streaming and non-streaming case. + Error AsyncInfer( + std::function callback, + std::string& serialized_request_body, const std::string& request_id); + + const InferStat& ClientInferStat() { return infer_stat_; } + + /// [TODO?] Add AsyncInfer() variant that prepare the request body from + /// function arguments. Similar to Triton client library. 
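For orientation, here is a minimal usage sketch (not part of this patch) of the AsyncInfer() declared above, assuming an OpenAI-compatible server at localhost:9000. The model name, request body, and the assumption that the callback owns and deletes the heap-allocated result are illustrative only; in perf_analyzer the call is made through OpenAiClientBackend.

// Illustrative usage sketch only -- not part of this patch.
#include <future>
#include <iostream>
#include <string>

#include "openai_client.h"

namespace cb = triton::perfanalyzer::clientbackend;

int main()
{
  // This client appends the chat-completions path internally, so only
  // host:port is given here.
  cb::openai::ChatCompletionClient client("localhost:9000");

  // The caller prepares the fully formed request body.
  std::string body =
      R"({"model": "my_model", "messages": [{"role": "user", "content": "hi"}]})";

  std::promise<void> done;
  cb::Error err = client.AsyncInfer(
      [&done](cb::InferResult* result) {
        const uint8_t* buf = nullptr;
        size_t byte_size = 0;
        result->RawData("" /* output name is ignored */, &buf, &byte_size);
        std::cout << std::string(
                         reinterpret_cast<const char*>(buf), byte_size)
                  << std::endl;
        bool is_final = false;
        result->IsFinalResponse(&is_final);
        if (is_final) {
          done.set_value();
        }
        // Assumption: the callback owns the heap-allocated result.
        delete result;
      },
      body, "request-0");
  if (!err.IsOk()) {
    std::cerr << err.Message() << std::endl;
    return 1;
  }
  done.get_future().wait();
  return 0;
}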
+ + private: + // setup curl handle + Error PreRunProcessing(CURL* curl, ChatCompletionRequest* request); + + static size_t ResponseHandler( + void* contents, size_t size, size_t nmemb, void* userp); + static size_t RequestProvider( + void* contents, size_t size, size_t nmemb, void* userp); + static size_t ResponseHeaderHandler( + void* contents, size_t size, size_t nmemb, void* userp); + + Error UpdateInferStat(const triton::client::RequestTimers& timer); + InferStat infer_stat_; +}; + +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc index d017b8b23..9f62beb29 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc @@ -26,6 +26,8 @@ #include "openai_client_backend.h" +#include "openai_infer_input.h" + namespace triton { namespace perfanalyzer { namespace clientbackend { namespace openai { @@ -44,8 +46,8 @@ OpenAiClientBackend::Create( std::unique_ptr openai_client_backend( new OpenAiClientBackend(http_headers)); - RETURN_IF_CB_ERROR( - HttpClient::Create(&(openai_client_backend->http_client_), url, verbose)); + openai_client_backend->http_client_.reset( + new ChatCompletionClient(url, verbose)); *client_backend = std::move(openai_client_backend); @@ -58,14 +60,14 @@ OpenAiClientBackend::AsyncInfer( const std::vector& inputs, const std::vector& outputs) { - auto wrapped_callback = [callback](cb::openai::InferResult* client_result) { - cb::InferResult* result = new OpenAiInferResult(client_result); - callback(result); - }; + if (inputs.size() != 1) { + return Error("Only expecting one input"); + } + auto raw_input = dynamic_cast(inputs[0]); + raw_input->PrepareForRequest(); RETURN_IF_CB_ERROR(http_client_->AsyncInfer( - wrapped_callback, options, inputs, outputs, *http_headers_)); - + callback, raw_input->DataString(), options.request_id_)); return Error::Success; } @@ -73,25 +75,10 @@ OpenAiClientBackend::AsyncInfer( Error OpenAiClientBackend::ClientInferStat(InferStat* infer_stat) { - // Reusing the common library utilities to collect and report the - // client side statistics. 
- tc::InferStat client_infer_stat; - - RETURN_IF_TRITON_ERROR(http_client_->ClientInferStat(&client_infer_stat)); - - ParseInferStat(client_infer_stat, infer_stat); - + *infer_stat = http_client_->ClientInferStat(); return Error::Success; } -void -OpenAiClientBackend::ParseInferStat( - const tc::InferStat& tfserve_infer_stat, InferStat* infer_stat) -{ - // TODO: Implement - return; -} - //============================================================================== Error @@ -118,35 +105,5 @@ OpenAiInferRequestedOutput::OpenAiInferRequestedOutput(const std::string& name) //============================================================================== -OpenAiInferResult::OpenAiInferResult(cb::openai::InferResult* result) -{ - result_.reset(result); -} - -Error -OpenAiInferResult::Id(std::string* id) const -{ - id->clear(); - return Error::Success; -} - -Error -OpenAiInferResult::RequestStatus() const -{ - RETURN_IF_CB_ERROR(result_->RequestStatus()); - return Error::Success; -} - -Error -OpenAiInferResult::RawData( - const std::string& output_name, const uint8_t** buf, - size_t* byte_size) const -{ - return Error( - "Output retrieval is not currently supported for OpenAi client backend"); -} - -//============================================================================== - }}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h index c6c83222f..ea9a49a82 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h @@ -29,7 +29,8 @@ #include "../../perf_utils.h" #include "../client_backend.h" -#include "openai_http_client.h" +#include "openai_client.h" +#include "openai_infer_input.h" #define RETURN_IF_TRITON_ERROR(S) \ do { \ @@ -85,7 +86,7 @@ class OpenAiClientBackend : public ClientBackend { void ParseInferStat( const tc::InferStat& openai_infer_stat, InferStat* infer_stat); - std::unique_ptr http_client_; + std::unique_ptr http_client_; std::shared_ptr http_headers_; }; @@ -107,24 +108,4 @@ class OpenAiInferRequestedOutput : public InferRequestedOutput { std::unique_ptr output_; }; -//============================================================== -/// OpenAiInferResult is a wrapper around InferResult object of -/// OpenAi InferResult object. -/// -class OpenAiInferResult : public cb::InferResult { - public: - explicit OpenAiInferResult(cb::openai::InferResult* result); - /// See InferResult::Id() - Error Id(std::string* id) const override; - /// See InferResult::RequestStatus() - Error RequestStatus() const override; - /// See InferResult::RawData() - Error RawData( - const std::string& output_name, const uint8_t** buf, - size_t* byte_size) const override; - - private: - std::unique_ptr result_; -}; - }}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc deleted file mode 100644 index 151eca2a6..000000000 --- a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "openai_http_client.h" - -#include - - -namespace triton { namespace perfanalyzer { namespace clientbackend { -namespace openai { - - -Error -HttpClient::Create( - std::unique_ptr* client, const std::string& server_url, - bool verbose) -{ - client->reset(new HttpClient(server_url, verbose)); - return Error::Success; -} - -Error -HttpClient::AsyncInfer( - OpenAiOnCompleteFn callback, const InferOptions& options, - const std::vector& inputs, - const std::vector& outputs, - const Headers& headers) -{ - // TODO FIXME implement - - // TODO FIXME cleanup or remove this. It just proves the json data arrives - rapidjson::Document d{}; - - if (inputs.size() != 1) { - return Error("Only expecting one input"); - } - - auto raw_input = dynamic_cast(inputs[0]); - - raw_input->PrepareForRequest(); - bool end_of_input = false; - const uint8_t* buf; - size_t buf_size; - raw_input->GetNext(&buf, &buf_size, &end_of_input); - if (!end_of_input) { - return Error("Unexpected multiple json data inputs"); - } - if (buf == nullptr) { - return Error("Unexpected null json data"); - } - - std::string json_str(reinterpret_cast(buf), buf_size); - std::cout << "FIXME TODO: JSON data string is " << json_str << std::endl; - - - if (d.Parse(json_str.c_str()).HasParseError()) { - return Error("Unable to parse json string: " + json_str); - } - - // FIXME TKG -- where/how would the 'streaming' option get plugged in? - - // FIXME TKG -- GOOD GOD! Is it this hard to add a single value into a json - // object?? - // FIXME TKG -- what if the user supplied this in the input json file? 
- d.AddMember( - "model", - rapidjson::Value().SetString( - options.model_name_.c_str(), - static_cast(options.model_name_.length()), - d.GetAllocator()), - d.GetAllocator()); - - for (auto itr = d.MemberBegin(); itr != d.MemberEnd(); ++itr) { - std::cout << "FIXME TODO: valid JSON object has key " - << itr->name.GetString() << std::endl; - } - - return Error::Success; -} - -HttpClient::HttpClient(const std::string& url, bool verbose) - : InferenceServerClient(verbose), url_(url) -// ,easy_handle_(reinterpret_cast(curl_easy_init()) // TODO FIXME TKG -{ -} - -HttpClient::~HttpClient() -{ - exiting_ = true; - - // FIXME TODO TKG - // if (easy_handle_ != nullptr) { - // curl_easy_cleanup(reinterpret_cast(easy_handle_)); - //} -} - -}}}} // namespace triton::perfanalyzer::clientbackend::openai \ No newline at end of file diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h deleted file mode 100644 index bbdaddfe9..000000000 --- a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#pragma once - -#include "../client_backend.h" -#include "common.h" -#include "openai_infer_input.h" - - -namespace tc = triton::client; - -namespace triton { namespace perfanalyzer { namespace clientbackend { -namespace openai { - -class InferResult; -class HttpInferRequest; - -using OpenAiOnCompleteFn = std::function; - -//============================================================================== -/// An HttpClient object is used to perform any kind of communication with the -/// OpenAi service using -/// -/// \code -/// std::unique_ptr client; -/// HttpClient::Create(&client, "localhost:8080"); -/// ... -/// ... -/// \endcode -/// -class HttpClient : public tc::InferenceServerClient { - public: - ~HttpClient(); - - /// TODO: Adjust as needed - /// Create a client that can be used to communicate with the server. 
- /// \param client Returns a new InferenceServerHttpClient object. - /// \param server_url The inference server name and port. - /// \param verbose If true generate verbose output when contacting - /// the inference server. - /// \return Error object indicating success or failure. - static Error Create( - std::unique_ptr* client, const std::string& server_url, - const bool verbose); - - /// TODO FIXME: Update - /// Run asynchronous inference on server. - Error AsyncInfer( - OpenAiOnCompleteFn callback, const InferOptions& options, - const std::vector& inputs, - const std::vector& outputs = - std::vector(), - const Headers& headers = Headers()); - - private: - HttpClient(const std::string& url, bool verbose); - - // The server url - const std::string url_; -}; - -//====================================================================== - -class InferResult { - public: - static Error Create( - InferResult** infer_result, - std::shared_ptr infer_request); - Error RequestStatus() const { return Error::Success; } // TODO FIXME TKG - Error Id(std::string* id) const { return Error::Success; } // TODO FIXME TKG - - private: - InferResult(std::shared_ptr infer_request); - - // The status of the inference - Error status_; - // The pointer to the HttpInferRequest object - std::shared_ptr infer_request_; -}; - -//====================================================================== - -}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.cc b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.cc index 70d827e85..834e27788 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.cc @@ -51,9 +51,10 @@ OpenAiInferInput::SetShape(const std::vector& shape) Error OpenAiInferInput::Reset() { + data_str_.clear(); + bufs_.clear(); buf_byte_sizes_.clear(); - bufs_idx_ = 0; byte_size_ = 0; return Error::Success; } @@ -61,18 +62,12 @@ OpenAiInferInput::Reset() Error OpenAiInferInput::AppendRaw(const uint8_t* input, size_t input_byte_size) { + data_str_.clear(); + byte_size_ += input_byte_size; bufs_.push_back(input); buf_byte_sizes_.push_back(input_byte_size); - - return Error::Success; -} - -Error -OpenAiInferInput::ByteSize(size_t* byte_size) const -{ - *byte_size = byte_size_; return Error::Success; } @@ -80,32 +75,19 @@ Error OpenAiInferInput::PrepareForRequest() { // Reset position so request sends entire input. 
- bufs_idx_ = 0; - buf_pos_ = 0; - return Error::Success; -} - -Error -OpenAiInferInput::GetNext( - const uint8_t** buf, size_t* input_bytes, bool* end_of_input) -{ - if (bufs_idx_ < bufs_.size()) { - *buf = bufs_[bufs_idx_]; - *input_bytes = buf_byte_sizes_[bufs_idx_]; - bufs_idx_++; - } else { - *buf = nullptr; - *input_bytes = 0; + if (data_str_.empty() && (byte_size_ != 0)) { + for (size_t i = 0; i < bufs_.size(); ++i) { + data_str_.append( + reinterpret_cast(bufs_[i]), buf_byte_sizes_[i]); + } } - *end_of_input = (bufs_idx_ >= bufs_.size()); - return Error::Success; } OpenAiInferInput::OpenAiInferInput( const std::string& name, const std::vector& dims, const std::string& datatype) - : InferInput(BackendKind::TENSORFLOW_SERVING, name, datatype), shape_(dims) + : InferInput(BackendKind::OPENAI, name, datatype), shape_(dims) { } diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h index a10b9312f..0c192cfad 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h +++ b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h @@ -51,14 +51,11 @@ class OpenAiInferInput : public InferInput { Error Reset() override; /// See InferInput::AppendRaw() Error AppendRaw(const uint8_t* input, size_t input_byte_size) override; - /// Gets the size of data added into this input in bytes. - /// \param byte_size The size of data added in bytes. - /// \return Error object indicating success or failure. - Error ByteSize(size_t* byte_size) const; - /// Resets the heads to start providing data from the beginning. + /// Prepare the input to be in the form expected by an OpenAI client, + /// must call before accessing the data. Error PrepareForRequest(); - /// Get the next chunk of data if available. - Error GetNext(const uint8_t** buf, size_t* input_bytes, bool* end_of_input); + /// Get the contiguous data in string. 
+ std::string& DataString() { return data_str_; } private: explicit OpenAiInferInput( @@ -68,9 +65,9 @@ class OpenAiInferInput : public InferInput { std::vector shape_; size_t byte_size_{0}; - size_t bufs_idx_, buf_pos_; std::vector bufs_; std::vector buf_byte_sizes_; + std::string data_str_; }; }}}} // namespace triton::perfanalyzer::clientbackend::openai From 4ab2fc11c1d002239ca66ad7c257da27f5745d4d Mon Sep 17 00:00:00 2001 From: tgerdes Date: Mon, 4 Mar 2024 10:40:00 -0600 Subject: [PATCH 08/23] Pass endpoint to openai client --- .../client_backend/client_backend.cc | 20 +++++++++---------- .../client_backend/client_backend.h | 15 +++++++++----- .../client_backend/openai/openai_client.cc | 6 +++--- .../client_backend/openai/openai_client.h | 6 ++++-- .../openai/openai_client_backend.cc | 8 ++++---- .../openai/openai_client_backend.h | 7 ++++--- src/c++/perf_analyzer/perf_analyzer.cc | 13 ++++++------ 7 files changed, 42 insertions(+), 33 deletions(-) diff --git a/src/c++/perf_analyzer/client_backend/client_backend.cc b/src/c++/perf_analyzer/client_backend/client_backend.cc index 04a68fefb..c665390bb 100644 --- a/src/c++/perf_analyzer/client_backend/client_backend.cc +++ b/src/c++/perf_analyzer/client_backend/client_backend.cc @@ -116,8 +116,8 @@ BackendToGrpcType(const GrpcCompressionAlgorithm compression_algorithm) // Error ClientBackendFactory::Create( - const BackendKind kind, const std::string& url, const ProtocolType protocol, - const SslOptionsBase& ssl_options, + const BackendKind kind, const std::string& url, const std::string& endpoint, + const ProtocolType protocol, const SslOptionsBase& ssl_options, const std::map> trace_options, const GrpcCompressionAlgorithm compression_algorithm, std::shared_ptr http_headers, @@ -128,9 +128,10 @@ ClientBackendFactory::Create( std::shared_ptr* factory) { factory->reset(new ClientBackendFactory( - kind, url, protocol, ssl_options, trace_options, compression_algorithm, - http_headers, triton_server_path, model_repository_path, verbose, - metrics_url, input_tensor_format, output_tensor_format)); + kind, url, endpoint, protocol, ssl_options, trace_options, + compression_algorithm, http_headers, triton_server_path, + model_repository_path, verbose, metrics_url, input_tensor_format, + output_tensor_format)); return Error::Success; } @@ -139,7 +140,7 @@ ClientBackendFactory::CreateClientBackend( std::unique_ptr* client_backend) { RETURN_IF_CB_ERROR(ClientBackend::Create( - kind_, url_, protocol_, ssl_options_, trace_options_, + kind_, url_, endpoint_, protocol_, ssl_options_, trace_options_, compression_algorithm_, http_headers_, verbose_, triton_server_path, model_repository_path_, metrics_url_, input_tensor_format_, output_tensor_format_, client_backend)); @@ -157,8 +158,8 @@ ClientBackendFactory::Kind() // Error ClientBackend::Create( - const BackendKind kind, const std::string& url, const ProtocolType protocol, - const SslOptionsBase& ssl_options, + const BackendKind kind, const std::string& url, const std::string& endpoint, + const ProtocolType protocol, const SslOptionsBase& ssl_options, const std::map> trace_options, const GrpcCompressionAlgorithm compression_algorithm, std::shared_ptr http_headers, const bool verbose, @@ -177,10 +178,9 @@ ClientBackend::Create( &local_backend)); } #ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI - // TODO -- I think this needs endpoint to be passed in? 
else if (kind == OPENAI) { RETURN_IF_CB_ERROR(openai::OpenAiClientBackend::Create( - url, protocol, http_headers, verbose, &local_backend)); + url, endpoint, protocol, http_headers, verbose, &local_backend)); } #endif // TRITON_ENABLE_PERF_ANALYZER_OPENAI #ifdef TRITON_ENABLE_PERF_ANALYZER_TFS diff --git a/src/c++/perf_analyzer/client_backend/client_backend.h b/src/c++/perf_analyzer/client_backend/client_backend.h index 487c215ce..3d1f3e89c 100644 --- a/src/c++/perf_analyzer/client_backend/client_backend.h +++ b/src/c++/perf_analyzer/client_backend/client_backend.h @@ -268,6 +268,7 @@ class ClientBackendFactory { /// Create a factory that can be used to construct Client Backends. /// \param kind The kind of client backend to create. /// \param url The inference server url and port. + /// \param endpoint The endpoint on the inference server to send requests to /// \param protocol The protocol type used. /// \param ssl_options The SSL options used with client backend. /// \param compression_algorithm The compression algorithm to be used @@ -290,7 +291,8 @@ class ClientBackendFactory { /// \return Error object indicating success or failure. static Error Create( const BackendKind kind, const std::string& url, - const ProtocolType protocol, const SslOptionsBase& ssl_options, + const std::string& endpoint, const ProtocolType protocol, + const SslOptionsBase& ssl_options, const std::map> trace_options, const GrpcCompressionAlgorithm compression_algorithm, std::shared_ptr http_headers, @@ -309,7 +311,8 @@ class ClientBackendFactory { private: ClientBackendFactory( const BackendKind kind, const std::string& url, - const ProtocolType protocol, const SslOptionsBase& ssl_options, + const std::string& endpoint, const ProtocolType protocol, + const SslOptionsBase& ssl_options, const std::map> trace_options, const GrpcCompressionAlgorithm compression_algorithm, const std::shared_ptr http_headers, @@ -317,8 +320,8 @@ class ClientBackendFactory { const std::string& model_repository_path, const bool verbose, const std::string& metrics_url, const TensorFormat input_tensor_format, const TensorFormat output_tensor_format) - : kind_(kind), url_(url), protocol_(protocol), ssl_options_(ssl_options), - trace_options_(trace_options), + : kind_(kind), url_(url), endpoint_(endpoint), protocol_(protocol), + ssl_options_(ssl_options), trace_options_(trace_options), compression_algorithm_(compression_algorithm), http_headers_(http_headers), triton_server_path(triton_server_path), model_repository_path_(model_repository_path), verbose_(verbose), @@ -329,6 +332,7 @@ class ClientBackendFactory { const BackendKind kind_; const std::string url_; + const std::string endpoint_; const ProtocolType protocol_; const SslOptionsBase& ssl_options_; const std::map> trace_options_; @@ -361,7 +365,8 @@ class ClientBackend { public: static Error Create( const BackendKind kind, const std::string& url, - const ProtocolType protocol, const SslOptionsBase& ssl_options, + const std::string& endpoint, const ProtocolType protocol, + const SslOptionsBase& ssl_options, const std::map> trace_options, const GrpcCompressionAlgorithm compression_algorithm, std::shared_ptr http_headers, const bool verbose, diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc index f83c3976b..1bab51bd6 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc @@ -68,9 +68,9 @@ 
ChatCompletionRequest::SendResponse(bool is_final, bool is_null) } ChatCompletionClient::ChatCompletionClient( - const std::string& url, bool verbose, const HttpSslOptions& ssl_options) - : HttpClient( - std::string(url + "/v1/chat/completions"), verbose, ssl_options) + const std::string& url, const std::string& endpoint, bool verbose, + const HttpSslOptions& ssl_options) + : HttpClient(std::string(url + "/" + endpoint), verbose, ssl_options) { } diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.h b/src/c++/perf_analyzer/client_backend/openai/openai_client.h index bff2d299f..5ede83143 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.h +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.h @@ -67,7 +67,7 @@ class ChatCompletionResult : public InferResult { { if ((http_code_ >= 400) && (http_code_ <= 599)) { return Error( - "OpenAI response returns HTTP code" + std::to_string(http_code_)); + "OpenAI response returns HTTP code " + std::to_string(http_code_)); } return Error::Success; } @@ -139,6 +139,7 @@ class ChatCompletionClient : public HttpClient { /// \param server_url The inference server name, port, optional /// scheme and optional base path in the following format: /// host:port/. + /// \param endpoint The name of the endpoint to send requests to /// \param verbose If true generate verbose output when contacting /// the inference server. /// \param ssl_options Specifies the settings for configuring @@ -148,7 +149,8 @@ class ChatCompletionClient : public HttpClient { /// These options will be ignored if the server_url does not /// expose `https://` scheme. ChatCompletionClient( - const std::string& server_url, bool verbose = false, + const std::string& server_url, const std::string& endpoint, + bool verbose = false, const HttpSslOptions& ssl_options = HttpSslOptions()); /// Simplified AsyncInfer() where the request body is expected to be diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc index 9f62beb29..bff94fc70 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc @@ -35,9 +35,9 @@ namespace openai { Error OpenAiClientBackend::Create( - const std::string& url, const ProtocolType protocol, - std::shared_ptr http_headers, const bool verbose, - std::unique_ptr* client_backend) + const std::string& url, const std::string& endpoint, + const ProtocolType protocol, std::shared_ptr http_headers, + const bool verbose, std::unique_ptr* client_backend) { if (protocol == ProtocolType::GRPC) { return Error( @@ -47,7 +47,7 @@ OpenAiClientBackend::Create( new OpenAiClientBackend(http_headers)); openai_client_backend->http_client_.reset( - new ChatCompletionClient(url, verbose)); + new ChatCompletionClient(url, endpoint, verbose)); *client_backend = std::move(openai_client_backend); diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h index ea9a49a82..94dbd9729 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h @@ -56,6 +56,7 @@ class OpenAiClientBackend : public ClientBackend { /// Create an OpenAI client backend which can be used to interact with the /// server. /// \param url The inference server url and port. 
+ /// \param endpoint The endpoint on the inference server to send requests to /// \param protocol The protocol type used. /// \param http_headers Map of HTTP headers. The map key/value indicates /// the header name/value. @@ -64,9 +65,9 @@ class OpenAiClientBackend : public ClientBackend { /// object. /// \return Error object indicating success or failure. static Error Create( - const std::string& url, const ProtocolType protocol, - std::shared_ptr http_headers, const bool verbose, - std::unique_ptr* client_backend); + const std::string& url, const std::string& endpoint, + const ProtocolType protocol, std::shared_ptr http_headers, + const bool verbose, std::unique_ptr* client_backend); /// See ClientBackend::AsyncInfer() Error AsyncInfer( diff --git a/src/c++/perf_analyzer/perf_analyzer.cc b/src/c++/perf_analyzer/perf_analyzer.cc index a1a5ab635..1928772fb 100644 --- a/src/c++/perf_analyzer/perf_analyzer.cc +++ b/src/c++/perf_analyzer/perf_analyzer.cc @@ -77,12 +77,13 @@ PerfAnalyzer::CreateAnalyzerObjects() std::shared_ptr factory; FAIL_IF_ERR( cb::ClientBackendFactory::Create( - params_->kind, params_->url, params_->protocol, params_->ssl_options, - params_->trace_options, params_->compression_algorithm, - params_->http_headers, params_->triton_server_path, - params_->model_repository_path, params_->extra_verbose, - params_->metrics_url, params_->input_tensor_format, - params_->output_tensor_format, &factory), + params_->kind, params_->url, params_->endpoint, params_->protocol, + params_->ssl_options, params_->trace_options, + params_->compression_algorithm, params_->http_headers, + params_->triton_server_path, params_->model_repository_path, + params_->extra_verbose, params_->metrics_url, + params_->input_tensor_format, params_->output_tensor_format, + &factory), "failed to create client factory"); FAIL_IF_ERR( From ffbf1541b252c179c85420d6c2f47ded17787e93 Mon Sep 17 00:00:00 2001 From: tgerdes Date: Mon, 4 Mar 2024 11:16:19 -0600 Subject: [PATCH 09/23] Resolve fixmes --- src/c++/library/http_client.h | 2 +- .../client_backend/openai/http_client.h | 17 +---------------- .../client_backend/openai/openai_client.cc | 1 - .../client_backend/openai/openai_client.h | 7 ++----- src/c++/perf_analyzer/perf_utils.cc | 4 ---- 5 files changed, 4 insertions(+), 27 deletions(-) diff --git a/src/c++/library/http_client.h b/src/c++/library/http_client.h index d252b40f1..532ea10fb 100644 --- a/src/c++/library/http_client.h +++ b/src/c++/library/http_client.h @@ -49,7 +49,7 @@ struct HttpSslOptions { enum KEYTYPE { KEY_PEM = 0, KEY_DER = 1 - // TODO: Support loading private key from crypto engine + // TODO TMA-1645: Support loading private key from crypto engine // KEY_ENG = 2 }; explicit HttpSslOptions() diff --git a/src/c++/perf_analyzer/client_backend/openai/http_client.h b/src/c++/perf_analyzer/client_backend/openai/http_client.h index c6acfd524..13a0d2e05 100644 --- a/src/c++/perf_analyzer/client_backend/openai/http_client.h +++ b/src/c++/perf_analyzer/client_backend/openai/http_client.h @@ -35,19 +35,12 @@ #include #include -// [TODO] Below should already be a generic class for any HTTP use, -// relocate it so that it can be used elsewhere namespace triton { namespace perfanalyzer { namespace clientbackend { namespace openai { -// [FIXME] add back "parameter" handling -// [FIXME] add back "compression" handling - /// The key-value map type to be included in the request /// as custom headers. typedef std::map Headers; -/// The key-value map type to be included as URL parameters. 
-typedef std::map Parameters; // The options for authorizing and authenticating SSL/TLS connections. struct HttpSslOptions { @@ -55,7 +48,7 @@ struct HttpSslOptions { enum KEYTYPE { KEY_PEM = 0, KEY_DER = 1 - // TODO: Support loading private key from crypto engine + // TODO TMA-1645: Support loading private key from crypto engine // KEY_ENG = 2 }; explicit HttpSslOptions() @@ -115,11 +108,6 @@ class HttpRequest { // actual amount copied in 'input_bytes'. void GetNextInput(uint8_t* buf, size_t size, size_t* input_bytes); - // [FIXME] define default callback like - // CURLOPT_READFUNCTION, CURLOPT_WRITEFUNCTION here? - // the specialized HttpRequest can override the callbacks when read / write - // schema has changed. - // Buffer that accumulates the response body. std::string response_buffer_; @@ -158,8 +146,6 @@ class HttpClient { // Note that this function does not block void Send(CURL* handle, std::unique_ptr&& request); - // [FIXME] provide more helper functions to encapsulate CURL detail - protected: void AsyncTransfer(); @@ -184,7 +170,6 @@ class HttpClient { bool verbose_; private: - // [FIXME] should belong to SSL option struct as helper function const std::string& ParseSslKeyType(HttpSslOptions::KEYTYPE key_type); const std::string& ParseSslCertType(HttpSslOptions::CERTTYPE cert_type); }; diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc index 1bab51bd6..c7502657a 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc @@ -131,7 +131,6 @@ ChatCompletionClient::ResponseHandler( // Send response now if streaming, otherwise wait until request has been // completed if (request->is_stream_) { - // [FIXME] assume it is proper chunked of response auto done_signal = (request->response_buffer_.find("data: [DONE]") != std::string::npos); request->SendResponse( diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.h b/src/c++/perf_analyzer/client_backend/openai/openai_client.h index 5ede83143..a3b1853e3 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.h +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.h @@ -79,8 +79,8 @@ class ChatCompletionResult : public InferResult { const std::string& output_name, const uint8_t** buf, size_t* byte_size) const override { - // [FIXME] disregard "output_name" which is not compatible to - // OpenAI protocol + // There is only a single output (and it has no defined name), so we can + // disregard output_name *buf = reinterpret_cast(serialized_response_.c_str()); *byte_size = serialized_response_.size(); return Error::Success; @@ -162,9 +162,6 @@ class ChatCompletionClient : public HttpClient { const InferStat& ClientInferStat() { return infer_stat_; } - /// [TODO?] Add AsyncInfer() variant that prepare the request body from - /// function arguments. Similar to Triton client library. - private: // setup curl handle Error PreRunProcessing(CURL* curl, ChatCompletionRequest* request); diff --git a/src/c++/perf_analyzer/perf_utils.cc b/src/c++/perf_analyzer/perf_utils.cc index 4c02f56ca..144eaa48a 100644 --- a/src/c++/perf_analyzer/perf_utils.cc +++ b/src/c++/perf_analyzer/perf_utils.cc @@ -212,10 +212,6 @@ SerializeExplicitTensor( std::string element = buffer.GetString(); uint32_t len = element.size(); - // FIXME TODO - for BYTES we add the length. Is there any reason that - // would be needed here? 
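// Illustrative, self-contained sketch of the JSON input path shown above,
// i.e. how a single rapidjson value from --input-data becomes the string that
// is appended to the OpenAI request body. The function name is hypothetical
// and not part of the patch; it assumes rapidjson is available.
#include <rapidjson/document.h>
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>

#include <string>

std::string SerializeJsonElement(const rapidjson::Value& value)
{
  rapidjson::StringBuffer buffer;
  rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
  value.Accept(writer);       // serialize the JSON object into 'buffer'
  return buffer.GetString();  // e.g. {"model":"gpt","messages":[...]}
}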
- // serialized.append(reinterpret_cast(&len), - // sizeof(uint32_t)); serialized.append(element); } std::copy( From 4a8684ed79a3ef977275c977ddb13941694a0678 Mon Sep 17 00:00:00 2001 From: tgerdes Date: Mon, 4 Mar 2024 11:36:34 -0600 Subject: [PATCH 10/23] update copyright years --- src/c++/perf_analyzer/client_backend/client_backend.cc | 2 +- src/c++/perf_analyzer/command_line_parser.cc | 2 +- src/c++/perf_analyzer/command_line_parser.h | 2 +- src/c++/perf_analyzer/model_parser.cc | 2 +- src/c++/perf_analyzer/model_parser.h | 2 +- src/c++/perf_analyzer/perf_analyzer.cc | 2 +- src/c++/perf_analyzer/perf_utils.cc | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/c++/perf_analyzer/client_backend/client_backend.cc b/src/c++/perf_analyzer/client_backend/client_backend.cc index c665390bb..01585281b 100644 --- a/src/c++/perf_analyzer/client_backend/client_backend.cc +++ b/src/c++/perf_analyzer/client_backend/client_backend.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions diff --git a/src/c++/perf_analyzer/command_line_parser.cc b/src/c++/perf_analyzer/command_line_parser.cc index 9bcc5d46f..18af4d994 100644 --- a/src/c++/perf_analyzer/command_line_parser.cc +++ b/src/c++/perf_analyzer/command_line_parser.cc @@ -1,4 +1,4 @@ -// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions diff --git a/src/c++/perf_analyzer/command_line_parser.h b/src/c++/perf_analyzer/command_line_parser.h index 79d387811..cbd807eb4 100644 --- a/src/c++/perf_analyzer/command_line_parser.h +++ b/src/c++/perf_analyzer/command_line_parser.h @@ -1,4 +1,4 @@ -// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions diff --git a/src/c++/perf_analyzer/model_parser.cc b/src/c++/perf_analyzer/model_parser.cc index 30e149c0c..1ab9f7a6d 100644 --- a/src/c++/perf_analyzer/model_parser.cc +++ b/src/c++/perf_analyzer/model_parser.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions diff --git a/src/c++/perf_analyzer/model_parser.h b/src/c++/perf_analyzer/model_parser.h index c1e16bac7..c1400d079 100644 --- a/src/c++/perf_analyzer/model_parser.h +++ b/src/c++/perf_analyzer/model_parser.h @@ -1,4 +1,4 @@ -// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions diff --git a/src/c++/perf_analyzer/perf_analyzer.cc b/src/c++/perf_analyzer/perf_analyzer.cc index 1928772fb..ced5fc991 100644 --- a/src/c++/perf_analyzer/perf_analyzer.cc +++ b/src/c++/perf_analyzer/perf_analyzer.cc @@ -1,4 +1,4 @@ -// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions diff --git a/src/c++/perf_analyzer/perf_utils.cc b/src/c++/perf_analyzer/perf_utils.cc index 144eaa48a..445dd7c54 100644 --- a/src/c++/perf_analyzer/perf_utils.cc +++ b/src/c++/perf_analyzer/perf_utils.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions From 9e8533e479800f557cc434cdade17624221e06d8 Mon Sep 17 00:00:00 2001 From: tgerdes Date: Mon, 4 Mar 2024 12:30:14 -0600 Subject: [PATCH 11/23] more cleanup --- src/c++/library/http_client.h | 2 +- src/c++/perf_analyzer/client_backend/openai/http_client.cc | 2 +- src/c++/perf_analyzer/test_command_line_parser.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/c++/library/http_client.h b/src/c++/library/http_client.h index 532ea10fb..3a94f3fde 100644 --- a/src/c++/library/http_client.h +++ b/src/c++/library/http_client.h @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions diff --git a/src/c++/perf_analyzer/client_backend/openai/http_client.cc b/src/c++/perf_analyzer/client_backend/openai/http_client.cc index 4c8632c52..ff636388b 100644 --- a/src/c++/perf_analyzer/client_backend/openai/http_client.cc +++ b/src/c++/perf_analyzer/client_backend/openai/http_client.cc @@ -226,7 +226,7 @@ HttpClient::AsyncTransfer() continue; } - long http_code = 400; + uint32_t http_code = 400; if (msg->data.result == CURLE_OK) { curl_easy_getinfo( msg->easy_handle, CURLINFO_RESPONSE_CODE, &http_code); diff --git a/src/c++/perf_analyzer/test_command_line_parser.cc b/src/c++/perf_analyzer/test_command_line_parser.cc index 6428a0f2f..2527d2b1b 100644 --- a/src/c++/perf_analyzer/test_command_line_parser.cc +++ b/src/c++/perf_analyzer/test_command_line_parser.cc @@ -1,4 +1,4 @@ -// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions From 381184873095234448b27d305967d1d92f942aa2 Mon Sep 17 00:00:00 2001 From: Timothy Gerdes <50968584+tgerdesnv@users.noreply.github.com> Date: Tue, 5 Mar 2024 09:21:29 -0600 Subject: [PATCH 12/23] Update src/c++/perf_analyzer/command_line_parser.cc Co-authored-by: dyastremsky <58150256+dyastremsky@users.noreply.github.com> --- src/c++/perf_analyzer/command_line_parser.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c++/perf_analyzer/command_line_parser.cc b/src/c++/perf_analyzer/command_line_parser.cc index 18af4d994..1154fc212 100644 --- a/src/c++/perf_analyzer/command_line_parser.cc +++ b/src/c++/perf_analyzer/command_line_parser.cc @@ -1922,7 +1922,7 @@ CLParser::VerifyOptions() if (params_->endpoint.empty()) { Usage( "Must supply --endpoint for OpenAI service kind. For example, " - "\"v1/chat/completions\""); + "\"v1/chat/completions\"."); } } From 305e96cc48e8e3fc5ea1e9e748e64b13fefc7168 Mon Sep 17 00:00:00 2001 From: tgerdes Date: Tue, 5 Mar 2024 09:25:58 -0600 Subject: [PATCH 13/23] remove 'file' from top of files --- src/c++/library/common.h | 2 -- src/c++/library/grpc_client.h | 2 -- src/c++/library/http_client.h | 2 -- src/c++/perf_analyzer/client_backend/openai/openai_client.h | 2 -- 4 files changed, 8 deletions(-) diff --git a/src/c++/library/common.h b/src/c++/library/common.h index 9cf99c478..133a32143 100644 --- a/src/c++/library/common.h +++ b/src/c++/library/common.h @@ -25,8 +25,6 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once -/// \file - #include #include #include diff --git a/src/c++/library/grpc_client.h b/src/c++/library/grpc_client.h index cc90b12de..7609c10b7 100644 --- a/src/c++/library/grpc_client.h +++ b/src/c++/library/grpc_client.h @@ -25,8 +25,6 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once -/// \file - #include #include diff --git a/src/c++/library/http_client.h b/src/c++/library/http_client.h index 3a94f3fde..e06b2eef3 100644 --- a/src/c++/library/http_client.h +++ b/src/c++/library/http_client.h @@ -25,8 +25,6 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once -/// \file - #include #include diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.h b/src/c++/perf_analyzer/client_backend/openai/openai_client.h index a3b1853e3..db58520d1 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.h +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.h @@ -25,8 +25,6 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma once -/// \file - #include #include From dd4c4ca84154ec45149e13ddffc3146bb3428b6c Mon Sep 17 00:00:00 2001 From: tgerdes Date: Tue, 5 Mar 2024 10:00:34 -0600 Subject: [PATCH 14/23] clean up help message and add endpoint to help --- .../openai/openai_client_backend.cc | 2 +- .../openai/openai_infer_input.h | 4 +- src/c++/perf_analyzer/command_line_parser.cc | 67 ++++++++++++------- 3 files changed, 44 insertions(+), 29 deletions(-) diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc index bff94fc70..1296a519c 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc @@ -67,7 +67,7 @@ OpenAiClientBackend::AsyncInfer( auto raw_input = dynamic_cast(inputs[0]); raw_input->PrepareForRequest(); RETURN_IF_CB_ERROR(http_client_->AsyncInfer( - callback, raw_input->DataString(), options.request_id_)); + callback, raw_input->GetRequestBody(), options.request_id_)); return Error::Success; } diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h index 0c192cfad..f5fd5ea42 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h +++ b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h @@ -54,8 +54,8 @@ class OpenAiInferInput : public InferInput { /// Prepare the input to be in the form expected by an OpenAI client, /// must call before accessing the data. Error PrepareForRequest(); - /// Get the contiguous data in string. - std::string& DataString() { return data_str_; } + /// Get the contiguous request body string + std::string& GetRequestBody() { return data_str_; } private: explicit OpenAiInferInput( diff --git a/src/c++/perf_analyzer/command_line_parser.cc b/src/c++/perf_analyzer/command_line_parser.cc index 1154fc212..a74df4e25 100644 --- a/src/c++/perf_analyzer/command_line_parser.cc +++ b/src/c++/perf_analyzer/command_line_parser.cc @@ -98,13 +98,15 @@ CLParser::Usage(const std::string& msg) std::cerr << "Usage: " << argv_[0] << " [options]" << std::endl; std::cerr << "==== SYNOPSIS ====\n \n"; std::cerr << "\t--version " << std::endl; - std::cerr << "\t--service-kind " - "<\"triton\"|\"tfserving\"|\"torchserve\"|\"triton_c_api\">" - << std::endl; std::cerr << "\t-m " << std::endl; std::cerr << "\t-x " << std::endl; - std::cerr << "\t--bls-composing-models=" << std::endl; + std::cerr << "\t--bls-composing-models " << std::endl; std::cerr << "\t--model-signature-name " << std::endl; + std::cerr + << "\t--service-kind " + "<\"triton\"|\"openai\"|\"tfserving\"|\"torchserve\"|\"triton_c_api\">" + << std::endl; + std::cerr << "\t--endpoint " << std::endl; std::cerr << "\t-v" << std::endl; std::cerr << std::endl; std::cerr << "I. 
MEASUREMENT PARAMETERS: " << std::endl; @@ -151,8 +153,8 @@ CLParser::Usage(const std::string& msg) std::cerr << "\t--sequence-id-range " << std::endl; std::cerr << "\t--string-length " << std::endl; std::cerr << "\t--string-data " << std::endl; - std::cerr << "\t--input-tensor-format=[binary|json]" << std::endl; - std::cerr << "\t--output-tensor-format=[binary|json]" << std::endl; + std::cerr << "\t--input-tensor-format [binary|json]" << std::endl; + std::cerr << "\t--output-tensor-format [binary|json]" << std::endl; std::cerr << "\tDEPRECATED OPTIONS" << std::endl; std::cerr << "\t-z" << std::endl; std::cerr << "\t--data-directory " << std::endl; @@ -196,21 +198,6 @@ CLParser::Usage(const std::string& msg) 18) << std::endl; - std::cerr - << FormatMessage( - " --service-kind: Describes the kind of service perf_analyzer to " - "generate load for. The options are \"triton\", \"triton_c_api\", " - "\"tfserving\" and \"torchserve\". Default value is \"triton\". " - "Note in order to use \"torchserve\" backend --input-data option " - "must point to a json file holding data in the following format " - "{\"data\" : [{\"TORCHSERVE_INPUT\" : [\"\"]}, {...}...]}. The type of file here will depend " - "on the model. In order to use \"triton_c_api\" you must specify " - "the Triton server install path and the model repository path via " - "the --triton-server-directory and --model-repository flags", - 18) - << std::endl; - std::cerr << std::setw(9) << std::left << " -m: " << FormatMessage( @@ -232,6 +219,33 @@ CLParser::Usage(const std::string& msg) "\"tfserving\".", 18) << std::endl; + + std::cerr + << FormatMessage( + " --service-kind: Describes the kind of service perf_analyzer to " + "generate load for. The options are \"triton\", \"openai\", " + "\"triton_c_api\", \"tfserving\" and \"torchserve\". Default " + "value is \"triton\". Note in order to use \"openai\" you must " + "specify an endpoint via --endpoint. " + "Note in order to use \"torchserve\" backend --input-data option " + "must point to a json file holding data in the following format " + "{\"data\" : [{\"TORCHSERVE_INPUT\" : [\"\"]}, {...}...]}. The type of file here will depend " + "on the model. In order to use \"triton_c_api\" you must specify " + "the Triton server install path and the model repository path via " + "the --triton-server-directory and --model-repository flags", + 18) + << std::endl; + + std::cerr + << FormatMessage( + " --endpoint: Describes what endpoint to send requests to on the " + "server. This is required when using \"openai\" service-kind, and " + "is ignored for all other cases. Currently only " + "\"v1/chat/completions\" is confirmed to work.", + 18) + << std::endl; + std::cerr << std::setw(9) << std::left << " -v: " << FormatMessage("Enables verbose mode.", 9) << std::endl; @@ -303,7 +317,7 @@ CLParser::Usage(const std::string& msg) << std::endl; std::cerr << FormatMessage( - "--periodic-concurrency-range : Determines the " + " --periodic-concurrency-range : Determines the " "range of concurrency levels in the similar but slightly " "different manner as the --concurrency-range. Perf Analyzer will " "start from the concurrency level of 'start' and increase by " @@ -323,7 +337,7 @@ CLParser::Usage(const std::string& msg) << std::endl; std::cerr << FormatMessage( - "--request-period : Indicates the number of responses that " + " --request-period : Indicates the number of responses that " "each request must receive before new, concurrent requests are " "sent when --periodic-concurrency-range is specified. 
Default " "value is 10.", @@ -331,7 +345,7 @@ CLParser::Usage(const std::string& msg) << std::endl; std::cerr << FormatMessage( - "--request-parameter : Specifies a custom " + " --request-parameter : Specifies a custom " "parameter that can be sent to a Triton backend as part of the " "request. For example, providing '--request-parameter " "max_tokens:256:int' to the command line will set an additional " @@ -382,7 +396,7 @@ CLParser::Usage(const std::string& msg) << std::endl; std::cerr << FormatMessage( - "--binary-search: Enables the binary search on the specified " + " --binary-search: Enables the binary search on the specified " "search range. This option requires 'start' and 'end' to be " "expilicitly specified in the --concurrency-range or " "--request-rate-range. When using this option, 'step' is more " @@ -393,7 +407,7 @@ CLParser::Usage(const std::string& msg) << std::endl; std::cerr << FormatMessage( - "--num-of-sequences: Sets the number of concurrent " + " --num-of-sequences: Sets the number of concurrent " "sequences for sequence models. This option is ignored when " "--request-rate-range is not specified. By default, its " "value is 4.", @@ -1613,6 +1627,7 @@ CLParser::ParseCommandLine(int argc, char** argv) } case 62: { params_->endpoint = optarg; + break; } case 'v': params_->extra_verbose = params_->verbose; From cac8bff625bb3cec4423130d70bb70c124f559f0 Mon Sep 17 00:00:00 2001 From: tgerdes Date: Tue, 5 Mar 2024 10:58:54 -0600 Subject: [PATCH 15/23] Fix client stats --- .../perf_analyzer/client_backend/openai/openai_client.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc index c7502657a..ccd23e6ff 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc @@ -158,12 +158,12 @@ ChatCompletionClient::AsyncInfer( auto completion_callback = [this](HttpRequest* req) { auto request = static_cast(req); - if (!request->is_stream_) { - request->SendResponse(true /* is_final */, false /* is_null */); - } request->timer_.CaptureTimestamp( triton::client::RequestTimers::Kind::REQUEST_END); UpdateInferStat(request->timer_); + if (!request->is_stream_) { + request->SendResponse(true /* is_final */, false /* is_null */); + } }; std::unique_ptr request(new ChatCompletionRequest( std::move(completion_callback), std::move(callback), request_id, @@ -185,6 +185,8 @@ ChatCompletionClient::AsyncInfer( raw_request->timer_.CaptureTimestamp( triton::client::RequestTimers::Kind::SEND_START); Send(multi_easy_handle, std::move(request)); + raw_request->timer_.CaptureTimestamp( + triton::client::RequestTimers::Kind::SEND_END); return Error::Success; } From 5b7434ae3fc3c72dfa4c42401961c381be548e6c Mon Sep 17 00:00:00 2001 From: tgerdes Date: Tue, 5 Mar 2024 11:46:03 -0600 Subject: [PATCH 16/23] remove unused fn --- .../client_backend/openai/openai_client_backend.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h index 94dbd9729..f6c6490c6 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.h @@ -84,9 +84,6 @@ class OpenAiClientBackend : public ClientBackend { { } - void ParseInferStat( - const tc::InferStat& openai_infer_stat, 
InferStat* infer_stat); - std::unique_ptr http_client_; std::shared_ptr http_headers_; }; From 5eab7b53ead585e0ea96f91d52ec57b25d538240 Mon Sep 17 00:00:00 2001 From: tgerdes Date: Tue, 5 Mar 2024 11:58:23 -0600 Subject: [PATCH 17/23] Assert on json input format --- src/c++/perf_analyzer/perf_utils.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/c++/perf_analyzer/perf_utils.cc b/src/c++/perf_analyzer/perf_utils.cc index 445dd7c54..6088c1b6b 100644 --- a/src/c++/perf_analyzer/perf_utils.cc +++ b/src/c++/perf_analyzer/perf_utils.cc @@ -205,7 +205,13 @@ SerializeExplicitTensor( } else if (dt.compare("JSON") == 0) { std::string serialized = ""; - for (const auto& value : tensor.GetArray()) { + auto values = tensor.GetArray(); + if (values.Size() != 1) { + return cb::Error( + "JSON format does not yet support multiple json objects in the " + "input"); + } + for (const auto& value : values) { rapidjson::StringBuffer buffer; rapidjson::Writer writer(buffer); value.Accept(writer); From 603631a9a48081df7fedcbaae4faee6462a164d3 Mon Sep 17 00:00:00 2001 From: tgerdes Date: Tue, 5 Mar 2024 12:41:41 -0600 Subject: [PATCH 18/23] Use a single SEND_END point --- .../perf_analyzer/client_backend/openai/openai_client.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc index ccd23e6ff..362278436 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc @@ -84,10 +84,8 @@ ChatCompletionClient::RequestProvider( request->GetNextInput( reinterpret_cast(contents), size * nmemb, &input_bytes); - if (input_bytes == 0) { - request->timer_.CaptureTimestamp( - triton::client::RequestTimers::Kind::SEND_END); - } + request->timer_.CaptureTimestamp( + triton::client::RequestTimers::Kind::SEND_END); return input_bytes; } @@ -185,8 +183,6 @@ ChatCompletionClient::AsyncInfer( raw_request->timer_.CaptureTimestamp( triton::client::RequestTimers::Kind::SEND_START); Send(multi_easy_handle, std::move(request)); - raw_request->timer_.CaptureTimestamp( - triton::client::RequestTimers::Kind::SEND_END); return Error::Success; } From 40d64654ca125d912491cd92c38dc01ea113b062 Mon Sep 17 00:00:00 2001 From: tgerdes Date: Tue, 5 Mar 2024 13:31:56 -0600 Subject: [PATCH 19/23] Add sync assert. Add OPENAI to helper fn --- src/c++/perf_analyzer/client_backend/client_backend.cc | 3 +++ src/c++/perf_analyzer/command_line_parser.cc | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/c++/perf_analyzer/client_backend/client_backend.cc b/src/c++/perf_analyzer/client_backend/client_backend.cc index 01585281b..92546d36d 100644 --- a/src/c++/perf_analyzer/client_backend/client_backend.cc +++ b/src/c++/perf_analyzer/client_backend/client_backend.cc @@ -90,6 +90,9 @@ BackendKindToString(const BackendKind kind) case TRITON_C_API: return std::string("TRITON_C_API"); break; + case OPENAI: + return std::string("OPENAI"); + break; default: return std::string("UNKNOWN"); break; diff --git a/src/c++/perf_analyzer/command_line_parser.cc b/src/c++/perf_analyzer/command_line_parser.cc index a74df4e25..42f9044c7 100644 --- a/src/c++/perf_analyzer/command_line_parser.cc +++ b/src/c++/perf_analyzer/command_line_parser.cc @@ -1939,6 +1939,9 @@ CLParser::VerifyOptions() "Must supply --endpoint for OpenAI service kind. 
For example, " "\"v1/chat/completions\"."); } + if (!params_->async) { + Usage("Only async mode is currently supported for OpenAI service-kind"); + } } if (params_->should_collect_metrics && From 7827fee88959a1f06f1699357c59e52edef967d5 Mon Sep 17 00:00:00 2001 From: tgerdes Date: Wed, 6 Mar 2024 08:59:29 -0600 Subject: [PATCH 20/23] remove unused typedef --- src/c++/perf_analyzer/client_backend/openai/http_client.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/c++/perf_analyzer/client_backend/openai/http_client.h b/src/c++/perf_analyzer/client_backend/openai/http_client.h index 13a0d2e05..3c311569e 100644 --- a/src/c++/perf_analyzer/client_backend/openai/http_client.h +++ b/src/c++/perf_analyzer/client_backend/openai/http_client.h @@ -38,10 +38,6 @@ namespace triton { namespace perfanalyzer { namespace clientbackend { namespace openai { -/// The key-value map type to be included in the request -/// as custom headers. -typedef std::map Headers; - // The options for authorizing and authenticating SSL/TLS connections. struct HttpSslOptions { enum CERTTYPE { CERT_PEM = 0, CERT_DER = 1 }; From 7df09ef48ea9aae128c4a625cf62144acac9a5f6 Mon Sep 17 00:00:00 2001 From: tgerdes Date: Wed, 6 Mar 2024 15:27:24 -0600 Subject: [PATCH 21/23] Add batch size assert --- src/c++/perf_analyzer/command_line_parser.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/c++/perf_analyzer/command_line_parser.cc b/src/c++/perf_analyzer/command_line_parser.cc index 42f9044c7..9c8ebacac 100644 --- a/src/c++/perf_analyzer/command_line_parser.cc +++ b/src/c++/perf_analyzer/command_line_parser.cc @@ -1942,6 +1942,9 @@ CLParser::VerifyOptions() if (!params_->async) { Usage("Only async mode is currently supported for OpenAI service-kind"); } + if (params_->batch_size != 1) { + Usage("Batching is not currently supported with OpenAI service-kind"); + } } if (params_->should_collect_metrics && From 605217633fcb4736e62876eb46ccff688a28370d Mon Sep 17 00:00:00 2001 From: GuanLuo <41310872+GuanLuo@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:33:39 -0800 Subject: [PATCH 22/23] Address comment (#487) * Address comment * Update src/c++/perf_analyzer/client_backend/openai/openai_client.cc * Update src/c++/perf_analyzer/client_backend/openai/http_client.cc * formatting --------- Co-authored-by: Timothy Gerdes <50968584+tgerdesnv@users.noreply.github.com> Co-authored-by: tgerdes --- .../client_backend/openai/http_client.cc | 27 ++++++++++++------- .../client_backend/openai/http_client.h | 1 + .../client_backend/openai/openai_client.cc | 18 ++++++++++--- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/src/c++/perf_analyzer/client_backend/openai/http_client.cc b/src/c++/perf_analyzer/client_backend/openai/http_client.cc index ff636388b..08e4b4b3c 100644 --- a/src/c++/perf_analyzer/client_backend/openai/http_client.cc +++ b/src/c++/perf_analyzer/client_backend/openai/http_client.cc @@ -76,19 +76,25 @@ HttpRequest::GetNextInput(uint8_t* buf, size_t size, size_t* input_bytes) } } +std::mutex HttpClient::curl_init_mtx_{}; HttpClient::HttpClient( const std::string& server_url, bool verbose, const HttpSslOptions& ssl_options) : url_(server_url), verbose_(verbose), ssl_options_(ssl_options) { - auto* ver = curl_version_info(CURLVERSION_NOW); - if (ver->features & CURL_VERSION_THREADSAFE == 0) { - throw std::runtime_error( - "HTTP client has dependency on CURL library to have thread-safe " - "support (CURL_VERSION_THREADSAFE set)"); - } - if (curl_global_init(CURL_GLOBAL_ALL) != 0) { - throw 
std::runtime_error("CURL global initialization failed"); + // [TODO TMA-1670] uncomment below and remove class-wise mutex once confirm + // curl >= 7.84.0 will always be used + // auto* ver = curl_version_info(CURLVERSION_NOW); + // if (ver->features & CURL_VERSION_THREADSAFE == 0) { + // throw std::runtime_error( + // "HTTP client has dependency on CURL library to have thread-safe " + // "support (CURL_VERSION_THREADSAFE set)"); + // } + { + std::lock_guard lk(curl_init_mtx_); + if (curl_global_init(CURL_GLOBAL_ALL) != 0) { + throw std::runtime_error("CURL global initialization failed"); + } } multi_handle_ = curl_multi_init(); @@ -114,7 +120,10 @@ HttpClient::~HttpClient() } curl_multi_cleanup(multi_handle_); - curl_global_cleanup(); + { + std::lock_guard lk(curl_init_mtx_); + curl_global_cleanup(); + } } const std::string& diff --git a/src/c++/perf_analyzer/client_backend/openai/http_client.h b/src/c++/perf_analyzer/client_backend/openai/http_client.h index 3c311569e..6b78d836e 100644 --- a/src/c++/perf_analyzer/client_backend/openai/http_client.h +++ b/src/c++/perf_analyzer/client_backend/openai/http_client.h @@ -168,5 +168,6 @@ class HttpClient { private: const std::string& ParseSslKeyType(HttpSslOptions::KEYTYPE key_type); const std::string& ParseSslCertType(HttpSslOptions::CERTTYPE cert_type); + static std::mutex curl_init_mtx_; }; }}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc index 362278436..28e55f3c0 100644 --- a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc +++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc @@ -114,8 +114,21 @@ size_t ChatCompletionClient::ResponseHandler( void* contents, size_t size, size_t nmemb, void* userp) { - // [WIP] verify if the SSE responses received are complete, or the response - // need to be stitched first + // [TODO TMA-1666] verify if the SSE responses received are complete, or the + // response need to be stitched first. To verify, print out the received + // responses from SendResponse() to make sure the OpenAI server doesn't chunk + // the HTTP responses in the way that misaligns with the SSE responses. Reason + // of not stitching responses now is that it is a bit complicated that to make + // the write callback bulletproof is to assume the response can be chunked at + // arbitrary position, then bake in checking for SSE style (data:.*\n\n) by + // iterating all received buffer character by character. + size_t result_bytes = size * nmemb; + // return early if the response is empty as the response handling is + // triggered by the content of the response. 
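// Illustrative sketch, not part of the patch: in streaming mode the OpenAI
// endpoint replies with Server-Sent Events ("data: {...}\n\n") and signals
// the end of the stream with a final "data: [DONE]" event, which is the
// sentinel the response handling keys off of. The helper name below is
// hypothetical.
#include <string>

bool IsSseStreamComplete(const std::string& response_buffer)
{
  // A terminal "[DONE]" data event marks the last SSE chunk of the response.
  return response_buffer.find("data: [DONE]") != std::string::npos;
}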
+ if (result_bytes == 0) { + return result_bytes; + } + auto request = reinterpret_cast(userp); if (request->timer_.Timestamp( triton::client::RequestTimers::Kind::RECV_START) == 0) { @@ -124,7 +137,6 @@ ChatCompletionClient::ResponseHandler( } char* buf = reinterpret_cast(contents); - size_t result_bytes = size * nmemb; request->response_buffer_.append(buf, result_bytes); // Send response now if streaming, otherwise wait until request has been // completed From 46a03db6eef16b47362ee620b993e05367b6cca8 Mon Sep 17 00:00:00 2001 From: Timothy Gerdes <50968584+tgerdesnv@users.noreply.github.com> Date: Wed, 6 Mar 2024 16:09:42 -0600 Subject: [PATCH 23/23] Make copy of exported data so it isn't corrupted (#488) --- src/c++/perf_analyzer/infer_context.cc | 2 +- .../perf_analyzer/profile_data_exporter.cc | 4 +-- src/c++/perf_analyzer/request_record.h | 30 +++++++++++++++---- .../test_profile_data_collector.cc | 13 +++++--- 4 files changed, 37 insertions(+), 12 deletions(-) diff --git a/src/c++/perf_analyzer/infer_context.cc b/src/c++/perf_analyzer/infer_context.cc index 4e998428b..6da86fef3 100644 --- a/src/c++/perf_analyzer/infer_context.cc +++ b/src/c++/perf_analyzer/infer_context.cc @@ -188,7 +188,7 @@ InferContext::GetOutput(const cb::InferResult& infer_result) const uint8_t* buf{nullptr}; size_t byte_size{0}; infer_result.RawData(requested_output->Name(), &buf, &byte_size); - output[requested_output->Name()] = {buf, byte_size}; + output.emplace(requested_output->Name(), ResponseData(buf, byte_size)); } return output; } diff --git a/src/c++/perf_analyzer/profile_data_exporter.cc b/src/c++/perf_analyzer/profile_data_exporter.cc index 3bcd6f83e..d840c460d 100644 --- a/src/c++/perf_analyzer/profile_data_exporter.cc +++ b/src/c++/perf_analyzer/profile_data_exporter.cc @@ -160,8 +160,8 @@ ProfileDataExporter::AddResponseOutputs( rapidjson::Value response_output_json(rapidjson::kObjectType); for (const auto& output : response_output) { const auto& name{output.first}; - const auto& buf{output.second.first}; - const auto& byte_size{output.second.second}; + const auto& buf{output.second.data_.get()}; + const auto& byte_size{output.second.size_}; rapidjson::Value name_json(name.c_str(), document_.GetAllocator()); rapidjson::Value output_json{}; if (buf != nullptr) { diff --git a/src/c++/perf_analyzer/request_record.h b/src/c++/perf_analyzer/request_record.h index fd6252a57..b4d122eb7 100644 --- a/src/c++/perf_analyzer/request_record.h +++ b/src/c++/perf_analyzer/request_record.h @@ -33,20 +33,40 @@ namespace triton { namespace perfanalyzer { +/// A record containing the data of a single response +struct ResponseData { + ResponseData(const uint8_t* buf, size_t size) + { + uint8_t* array = new uint8_t[size]; + std::memcpy(array, buf, size); + data_ = std::shared_ptr(array, [](uint8_t* p) { delete[] p; }); + size_ = size; + } + + // Define equality comparison operator so it can be inserted into maps + bool operator==(const ResponseData& other) const + { + if (size_ != other.size_) + return false; + // Compare the contents of the arrays + return std::memcmp(data_.get(), other.data_.get(), size_) == 0; + } + + std::shared_ptr data_; + size_t size_; +}; + /// A record of an individual request struct RequestRecord { - using ResponseOutput = - std::unordered_map>; + using ResponseOutput = std::unordered_map; RequestRecord( std::chrono::time_point start_time = std::chrono::time_point(), std::vector> response_timestamps = {}, - std::vector< - std::unordered_map>> - response_outputs = {}, + std::vector 
response_outputs = {}, bool sequence_end = true, bool delayed = false, uint64_t sequence_id = 0, bool has_null_last_response = false) : start_time_(start_time), response_timestamps_(response_timestamps), diff --git a/src/c++/perf_analyzer/test_profile_data_collector.cc b/src/c++/perf_analyzer/test_profile_data_collector.cc index b6ce7ffab..dfed394ac 100644 --- a/src/c++/perf_analyzer/test_profile_data_collector.cc +++ b/src/c++/perf_analyzer/test_profile_data_collector.cc @@ -63,10 +63,13 @@ TEST_CASE("profile_data_collector: AddData") auto request1_timestamp{clock_epoch + std::chrono::nanoseconds(1)}; auto request1_response1_timestamp{clock_epoch + std::chrono::nanoseconds(2)}; auto request1_response2_timestamp{clock_epoch + std::chrono::nanoseconds(3)}; + uint8_t fake_data[] = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}; RequestRecord::ResponseOutput request1_response1_output{ - {"key1", {nullptr, 1}}, {"key2", {nullptr, 2}}}; + {"key1", ResponseData(fake_data, 1)}, + {"key2", ResponseData(fake_data, 2)}}; RequestRecord::ResponseOutput request1_response2_output{ - {"key3", {nullptr, 3}}, {"key4", {nullptr, 4}}}; + {"key3", ResponseData(fake_data, 3)}, + {"key4", ResponseData(fake_data, 4)}}; RequestRecord request_record1{ request1_timestamp, @@ -83,9 +86,11 @@ TEST_CASE("profile_data_collector: AddData") auto request2_response1_timestamp{clock_epoch + std::chrono::nanoseconds(5)}; auto request2_response2_timestamp{clock_epoch + std::chrono::nanoseconds(6)}; RequestRecord::ResponseOutput request2_response1_output{ - {"key5", {nullptr, 5}}, {"key6", {nullptr, 6}}}; + {"key5", ResponseData(fake_data, 5)}, + {"key6", ResponseData(fake_data, 6)}}; RequestRecord::ResponseOutput request2_response2_output{ - {"key7", {nullptr, 7}}, {"key8", {nullptr, 8}}}; + {"key7", ResponseData(fake_data, 7)}, + {"key8", ResponseData(fake_data, 8)}}; RequestRecord request_record2{ request2_timestamp,