Skip to content

Commit

Permalink
OpenAI endpoint support (#476)
Browse files Browse the repository at this point in the history
* Add openai service-kind and add endpoint to CLI

* Add openai to model parser

* OpenAI client backend + cmake

* Create OpenAI backend

* New JSON datatype for PA. Show json data available at http_client level

* Add an output to OpenAI models

* Add OpenAI client (#482)

* Add OpenAI client

* Address comment

* Pass endpoint to openai client

* Resolve fixmes

* update copyright years

* more cleanup

* Update src/c++/perf_analyzer/command_line_parser.cc

Co-authored-by: dyastremsky <[email protected]>

* remove 'file' from top of files

* clean up help message and add endpoint to help

* Fix client stats

* remove unused fn

* Assert on json input format

* Use a single SEND_END point

* Add sync assert. Add OPENAI to helper fn

* remove unused typedef

* Add batch size assert

* Address comment (#487)

* Address comment

* Update src/c++/perf_analyzer/client_backend/openai/openai_client.cc

* Update src/c++/perf_analyzer/client_backend/openai/http_client.cc

* formatting

---------

Co-authored-by: Timothy Gerdes <[email protected]>
Co-authored-by: tgerdes <[email protected]>

* Make copy of exported data so it isn't corrupted (#488)

---------

Co-authored-by: oandreeva-nv <[email protected]>
Co-authored-by: GuanLuo <[email protected]>
Co-authored-by: dyastremsky <[email protected]>
  • Loading branch information
4 people authored Mar 6, 2024
1 parent 6225a3f commit a11ffa2
Show file tree
Hide file tree
Showing 28 changed files with 1,635 additions and 78 deletions.
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -45,6 +45,7 @@ option(TRITON_ENABLE_PERF_ANALYZER "Enable Performance Analyzer" OFF)
option(TRITON_ENABLE_PERF_ANALYZER_C_API "Enable Performance Analyzer C API" OFF)
option(TRITON_ENABLE_PERF_ANALYZER_TFS "Enable TensorFlow Serving support for Performance Analyzer" OFF)
option(TRITON_ENABLE_PERF_ANALYZER_TS "Enable TorchServe support for Performance Analyzer" OFF)
option(TRITON_ENABLE_PERF_ANALYZER_OPENAI "Enable OpenAI support for Performance Analyzer" OFF)
option(TRITON_ENABLE_EXAMPLES "Include examples in build" OFF)
option(TRITON_ENABLE_TESTS "Include tests in build" OFF)
option(TRITON_ENABLE_GPU "Enable GPU support in libraries" OFF)
Expand Down Expand Up @@ -142,6 +143,9 @@ if(TRITON_ENABLE_CC_HTTP OR TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER
if(NOT ${TRITON_ENABLE_PERF_ANALYZER} AND ${TRITON_ENABLE_PERF_ANALYZER_TS})
message(FATAL_ERROR "TRITON_ENABLE_PERF_ANALYZER_TS=ON requires TRITON_ENABLE_PERF_ANALYZER=ON")
endif() # NOT TRITON_ENABLE_PERF_ANALYZER AND TRITON_ENABLE_PERF_ANALYZER_TS
if(NOT ${TRITON_ENABLE_PERF_ANALYZER} AND ${TRITON_ENABLE_PERF_ANALYZER_OPENAI})
message(FATAL_ERROR "TRITON_ENABLE_PERF_ANALYZER_OPENAI=ON requires TRITON_ENABLE_PERF_ANALYZER=ON")
endif() # NOT TRITON_ENABLE_PERF_ANALYZER AND TRITON_ENABLE_PERF_ANALYZER_OPENAI

ExternalProject_Add(cc-clients
PREFIX cc-clients
Expand All @@ -167,6 +171,7 @@ if(TRITON_ENABLE_CC_HTTP OR TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER
-DTRITON_ENABLE_PERF_ANALYZER_C_API:BOOL=${TRITON_ENABLE_PERF_ANALYZER_C_API}
-DTRITON_ENABLE_PERF_ANALYZER_TFS:BOOL=${TRITON_ENABLE_PERF_ANALYZER_TFS}
-DTRITON_ENABLE_PERF_ANALYZER_TS:BOOL=${TRITON_ENABLE_PERF_ANALYZER_TS}
-DTRITON_ENABLE_PERF_ANALYZER_OPENAI:BOOL=${TRITON_ENABLE_PERF_ANALYZER_OPENAI}
-DTRITON_ENABLE_EXAMPLES:BOOL=${TRITON_ENABLE_EXAMPLES}
-DTRITON_ENABLE_TESTS:BOOL=${TRITON_ENABLE_TESTS}
-DTRITON_ENABLE_GPU:BOOL=${TRITON_ENABLE_GPU}
Expand Down Expand Up @@ -209,6 +214,7 @@ if(TRITON_ENABLE_PYTHON_HTTP OR TRITON_ENABLE_PYTHON_GRPC)
-DTRITON_ENABLE_PERF_ANALYZER_C_API:BOOL=${TRITON_ENABLE_PERF_ANALYZER_C_API}
-DTRITON_ENABLE_PERF_ANALYZER_TFS:BOOL=${TRITON_ENABLE_PERF_ANALYZER_TFS}
-DTRITON_ENABLE_PERF_ANALYZER_TS:BOOL=${TRITON_ENABLE_PERF_ANALYZER_TS}
-DTRITON_ENABLE_PERF_ANALYZER_OPENAI:BOOL=${TRITON_ENABLE_PERF_ANALYZER_OPENAI}
-DTRITON_ENABLE_EXAMPLES:BOOL=${TRITON_ENABLE_EXAMPLES}
-DTRITON_ENABLE_TESTS:BOOL=${TRITON_ENABLE_TESTS}
-DTRITON_ENABLE_GPU:BOOL=${TRITON_ENABLE_GPU}
Expand Down
2 changes: 0 additions & 2 deletions src/c++/library/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

/// \file

#include <algorithm>
#include <chrono>
#include <condition_variable>
Expand Down
2 changes: 0 additions & 2 deletions src/c++/library/grpc_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

/// \file

#include <grpcpp/grpcpp.h>

#include <queue>
Expand Down
6 changes: 2 additions & 4 deletions src/c++/library/http_client.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand All @@ -25,8 +25,6 @@
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

/// \file

#include <map>
#include <memory>

Expand All @@ -49,7 +47,7 @@ struct HttpSslOptions {
enum KEYTYPE {
KEY_PEM = 0,
KEY_DER = 1
// TODO: Support loading private key from crypto engine
// TODO TMA-1645: Support loading private key from crypto engine
// KEY_ENG = 2
};
explicit HttpSslOptions()
Expand Down
9 changes: 8 additions & 1 deletion src/c++/perf_analyzer/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -170,6 +170,13 @@ if(TRITON_ENABLE_PERF_ANALYZER_TS)
)
endif()

if(TRITON_ENABLE_PERF_ANALYZER_OPENAI)
target_compile_definitions(
client-backend-library
PUBLIC TRITON_ENABLE_PERF_ANALYZER_OPENAI=1
)
endif()

install(
TARGETS perf_analyzer
RUNTIME DESTINATION bin
Expand Down
15 changes: 14 additions & 1 deletion src/c++/perf_analyzer/client_backend/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -43,6 +43,10 @@ if(TRITON_ENABLE_PERF_ANALYZER_TS)
add_subdirectory(torchserve)
endif()

if(TRITON_ENABLE_PERF_ANALYZER_OPENAI)
add_subdirectory(openai)
endif()

set(
CLIENT_BACKEND_SRCS
client_backend.cc
Expand Down Expand Up @@ -71,6 +75,12 @@ if(TRITON_ENABLE_PERF_ANALYZER_TS)
set(TS_TARGET_INCLUDE_DIRECTORY PRIVATE $<TARGET_PROPERTY:ts-client-backend-library,INCLUDE_DIRECTORIES>)
endif()

if(TRITON_ENABLE_PERF_ANALYZER_OPENAI)
set(OPENAI_LIBRARY $<TARGET_OBJECTS:openai-client-backend-library>)
set(OPENAI_TARGET_LINK_LIBRARY PUBLIC $<TARGET_PROPERTY:openai-client-backend-library,LINK_LIBRARIES>)
set(OPENAI_TARGET_INCLUDE_DIRECTORY PRIVATE $<TARGET_PROPERTY:openai-client-backend-library,INCLUDE_DIRECTORIES>)
endif()

add_library(
client-backend-library
${CLIENT_BACKEND_SRCS}
Expand All @@ -80,6 +90,7 @@ add_library(
${CAPI_LIBRARY}
${TFS_LIBRARY}
${TS_LIBRARY}
${OPENAI_LIBRARY}
)

target_link_libraries(
Expand All @@ -89,6 +100,7 @@ target_link_libraries(
${CAPI_TARGET_LINK_LIBRARY}
${TFS_TARGET_LINK_LIBRARY}
${TS_TARGET_LINK_LIBRARY}
${OPENAI_TARGET_LINK_LIBRARY}
)

target_include_directories(
Expand All @@ -97,4 +109,5 @@ target_include_directories(
${CAPI_TARGET_INCLUDE_DIRECTORY}
${TFS_TARGET_INCLUDE_DIRECTORY}
${TS_TARGET_INCLUDE_DIRECTORY}
${OPENAI_TARGET_INCLUDE_DIRECTORY}
)
44 changes: 35 additions & 9 deletions src/c++/perf_analyzer/client_backend/client_backend.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -32,6 +32,10 @@
#include "triton_c_api/triton_c_api_backend.h"
#endif // TRITON_ENABLE_PERF_ANALYZER_C_API

#ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI
#include "openai/openai_client_backend.h"
#endif // TRITON_ENABLE_PERF_ANALYZER_OPENAI

#ifdef TRITON_ENABLE_PERF_ANALYZER_TFS
#include "tensorflow_serving/tfserve_client_backend.h"
#endif // TRITON_ENABLE_PERF_ANALYZER_TFS
Expand Down Expand Up @@ -86,6 +90,9 @@ BackendKindToString(const BackendKind kind)
case TRITON_C_API:
return std::string("TRITON_C_API");
break;
case OPENAI:
return std::string("OPENAI");
break;
default:
return std::string("UNKNOWN");
break;
Expand All @@ -112,8 +119,8 @@ BackendToGrpcType(const GrpcCompressionAlgorithm compression_algorithm)
//
Error
ClientBackendFactory::Create(
const BackendKind kind, const std::string& url, const ProtocolType protocol,
const SslOptionsBase& ssl_options,
const BackendKind kind, const std::string& url, const std::string& endpoint,
const ProtocolType protocol, const SslOptionsBase& ssl_options,
const std::map<std::string, std::vector<std::string>> trace_options,
const GrpcCompressionAlgorithm compression_algorithm,
std::shared_ptr<Headers> http_headers,
Expand All @@ -124,9 +131,10 @@ ClientBackendFactory::Create(
std::shared_ptr<ClientBackendFactory>* factory)
{
factory->reset(new ClientBackendFactory(
kind, url, protocol, ssl_options, trace_options, compression_algorithm,
http_headers, triton_server_path, model_repository_path, verbose,
metrics_url, input_tensor_format, output_tensor_format));
kind, url, endpoint, protocol, ssl_options, trace_options,
compression_algorithm, http_headers, triton_server_path,
model_repository_path, verbose, metrics_url, input_tensor_format,
output_tensor_format));
return Error::Success;
}

Expand All @@ -135,7 +143,7 @@ ClientBackendFactory::CreateClientBackend(
std::unique_ptr<ClientBackend>* client_backend)
{
RETURN_IF_CB_ERROR(ClientBackend::Create(
kind_, url_, protocol_, ssl_options_, trace_options_,
kind_, url_, endpoint_, protocol_, ssl_options_, trace_options_,
compression_algorithm_, http_headers_, verbose_, triton_server_path,
model_repository_path_, metrics_url_, input_tensor_format_,
output_tensor_format_, client_backend));
Expand All @@ -153,8 +161,8 @@ ClientBackendFactory::Kind()
//
Error
ClientBackend::Create(
const BackendKind kind, const std::string& url, const ProtocolType protocol,
const SslOptionsBase& ssl_options,
const BackendKind kind, const std::string& url, const std::string& endpoint,
const ProtocolType protocol, const SslOptionsBase& ssl_options,
const std::map<std::string, std::vector<std::string>> trace_options,
const GrpcCompressionAlgorithm compression_algorithm,
std::shared_ptr<Headers> http_headers, const bool verbose,
Expand All @@ -172,6 +180,12 @@ ClientBackend::Create(
metrics_url, input_tensor_format, output_tensor_format,
&local_backend));
}
#ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI
else if (kind == OPENAI) {
RETURN_IF_CB_ERROR(openai::OpenAiClientBackend::Create(
url, endpoint, protocol, http_headers, verbose, &local_backend));
}
#endif // TRITON_ENABLE_PERF_ANALYZER_OPENAI
#ifdef TRITON_ENABLE_PERF_ANALYZER_TFS
else if (kind == TENSORFLOW_SERVING) {
RETURN_IF_CB_ERROR(tfserving::TFServeClientBackend::Create(
Expand Down Expand Up @@ -421,6 +435,12 @@ InferInput::Create(
RETURN_IF_CB_ERROR(tritonremote::TritonInferInput::Create(
infer_input, name, dims, datatype));
}
#ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI
else if (kind == OPENAI) {
RETURN_IF_CB_ERROR(
openai::OpenAiInferInput::Create(infer_input, name, dims, datatype));
}
#endif // TRITON_ENABLE_PERF_ANALYZER_OPENAI
#ifdef TRITON_ENABLE_PERF_ANALYZER_TFS
else if (kind == TENSORFLOW_SERVING) {
RETURN_IF_CB_ERROR(tfserving::TFServeInferInput::Create(
Expand Down Expand Up @@ -505,6 +525,12 @@ InferRequestedOutput::Create(
RETURN_IF_CB_ERROR(tritonremote::TritonInferRequestedOutput::Create(
infer_output, name, class_count));
}
#ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI
else if (kind == OPENAI) {
RETURN_IF_CB_ERROR(
openai::OpenAiInferRequestedOutput::Create(infer_output, name));
}
#endif // TRITON_ENABLE_PERF_ANALYZER_OPENAI
#ifdef TRITON_ENABLE_PERF_ANALYZER_TFS
else if (kind == TENSORFLOW_SERVING) {
RETURN_IF_CB_ERROR(
Expand Down
20 changes: 13 additions & 7 deletions src/c++/perf_analyzer/client_backend/client_backend.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -135,7 +135,8 @@ enum BackendKind {
TRITON = 0,
TENSORFLOW_SERVING = 1,
TORCHSERVE = 2,
TRITON_C_API = 3
TRITON_C_API = 3,
OPENAI = 4
};
enum ProtocolType { HTTP = 0, GRPC = 1, UNKNOWN = 2 };
enum GrpcCompressionAlgorithm {
Expand Down Expand Up @@ -267,6 +268,7 @@ class ClientBackendFactory {
/// Create a factory that can be used to construct Client Backends.
/// \param kind The kind of client backend to create.
/// \param url The inference server url and port.
/// \param endpoint The endpoint on the inference server to send requests to
/// \param protocol The protocol type used.
/// \param ssl_options The SSL options used with client backend.
/// \param compression_algorithm The compression algorithm to be used
Expand All @@ -289,7 +291,8 @@ class ClientBackendFactory {
/// \return Error object indicating success or failure.
static Error Create(
const BackendKind kind, const std::string& url,
const ProtocolType protocol, const SslOptionsBase& ssl_options,
const std::string& endpoint, const ProtocolType protocol,
const SslOptionsBase& ssl_options,
const std::map<std::string, std::vector<std::string>> trace_options,
const GrpcCompressionAlgorithm compression_algorithm,
std::shared_ptr<Headers> http_headers,
Expand All @@ -308,16 +311,17 @@ class ClientBackendFactory {
private:
ClientBackendFactory(
const BackendKind kind, const std::string& url,
const ProtocolType protocol, const SslOptionsBase& ssl_options,
const std::string& endpoint, const ProtocolType protocol,
const SslOptionsBase& ssl_options,
const std::map<std::string, std::vector<std::string>> trace_options,
const GrpcCompressionAlgorithm compression_algorithm,
const std::shared_ptr<Headers> http_headers,
const std::string& triton_server_path,
const std::string& model_repository_path, const bool verbose,
const std::string& metrics_url, const TensorFormat input_tensor_format,
const TensorFormat output_tensor_format)
: kind_(kind), url_(url), protocol_(protocol), ssl_options_(ssl_options),
trace_options_(trace_options),
: kind_(kind), url_(url), endpoint_(endpoint), protocol_(protocol),
ssl_options_(ssl_options), trace_options_(trace_options),
compression_algorithm_(compression_algorithm),
http_headers_(http_headers), triton_server_path(triton_server_path),
model_repository_path_(model_repository_path), verbose_(verbose),
Expand All @@ -328,6 +332,7 @@ class ClientBackendFactory {

const BackendKind kind_;
const std::string url_;
const std::string endpoint_;
const ProtocolType protocol_;
const SslOptionsBase& ssl_options_;
const std::map<std::string, std::vector<std::string>> trace_options_;
Expand Down Expand Up @@ -360,7 +365,8 @@ class ClientBackend {
public:
static Error Create(
const BackendKind kind, const std::string& url,
const ProtocolType protocol, const SslOptionsBase& ssl_options,
const std::string& endpoint, const ProtocolType protocol,
const SslOptionsBase& ssl_options,
const std::map<std::string, std::vector<std::string>> trace_options,
const GrpcCompressionAlgorithm compression_algorithm,
std::shared_ptr<Headers> http_headers, const bool verbose,
Expand Down
60 changes: 60 additions & 0 deletions src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

cmake_minimum_required (VERSION 3.18)

# Sources and headers for the OpenAI client backend used by Perf Analyzer.
set(
  OPENAI_CLIENT_BACKEND_SRCS
  http_client.cc
  openai_client_backend.cc
  openai_client.cc
  openai_infer_input.cc
)

set(
  OPENAI_CLIENT_BACKEND_HDRS
  http_client.h
  openai_client_backend.h
  openai_client.h
  openai_infer_input.h
)

# OBJECT library: its objects are folded directly into the parent
# client-backend-library target. EXCLUDE_FROM_ALL keeps it out of the
# default build so it is only compiled when this subdirectory is added
# (i.e. when TRITON_ENABLE_PERF_ANALYZER_OPENAI=ON).
add_library(
  openai-client-backend-library EXCLUDE_FROM_ALL OBJECT
  ${OPENAI_CLIENT_BACKEND_SRCS}
  ${OPENAI_CLIENT_BACKEND_HDRS}
)

# PUBLIC so consumers of the object library inherit the curl and
# httpclient usage requirements (the parent CMakeLists reads the
# LINK_LIBRARIES property of this target).
target_link_libraries(
  openai-client-backend-library
  PUBLIC CURL::libcurl
  PUBLIC httpclient_static
)

# Test the variable by name. The previous form `if(${TRITON_ENABLE_GPU})`
# expands the variable first: when it is undefined/empty this becomes the
# malformed `if()` and configuration fails, and if its value happens to
# name another variable it is dereferenced a second time (CMP0054 footgun).
if(TRITON_ENABLE_GPU)
  target_include_directories(openai-client-backend-library PUBLIC ${CUDA_INCLUDE_DIRS})
  target_link_libraries(openai-client-backend-library PRIVATE ${CUDA_LIBRARIES})
endif() # TRITON_ENABLE_GPU
Loading

0 comments on commit a11ffa2

Please sign in to comment.