diff --git a/src/c++/perf_analyzer/client_backend/client_backend.cc b/src/c++/perf_analyzer/client_backend/client_backend.cc
index 282c6e181..869762942 100644
--- a/src/c++/perf_analyzer/client_backend/client_backend.cc
+++ b/src/c++/perf_analyzer/client_backend/client_backend.cc
@@ -435,10 +435,7 @@ InferInput::Create(
 #ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI
   else if (kind == OPENAI) {
     RETURN_IF_CB_ERROR(
-        // FIXME TODO TKG
-        // openai::OpenAiInferInput::Create(infer_input, name, dims, datatype));
-        tritonremote::TritonInferInput::Create(
-            infer_input, name, dims, datatype));
+        openai::OpenAiInferInput::Create(infer_input, name, dims, datatype));
   }
 #endif  // TRITON_ENABLE_PERF_ANALYZER_OPENAI
 #ifdef TRITON_ENABLE_PERF_ANALYZER_TFS
diff --git a/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt b/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt
index ec839a2b3..3ef867e9f 100644
--- a/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt
+++ b/src/c++/perf_analyzer/client_backend/openai/CMakeLists.txt
@@ -30,12 +30,14 @@ set(
   OPENAI_CLIENT_BACKEND_SRCS
   openai_client_backend.cc
   openai_http_client.cc
+  openai_infer_input.cc
 )
 
 set(
   OPENAI_CLIENT_BACKEND_HDRS
   openai_client_backend.h
   openai_http_client.h
+  openai_infer_input.h
 )
 
 add_library(
diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc
index d9cca25d9..d017b8b23 100644
--- a/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc
+++ b/src/c++/perf_analyzer/client_backend/openai/openai_client_backend.cc
@@ -42,11 +42,10 @@ OpenAiClientBackend::Create(
         "perf_analyzer does not support gRPC protocol with OpenAI endpoints");
   }
   std::unique_ptr<OpenAiClientBackend> openai_client_backend(
-          new OpenAiClientBackend(http_headers));
+      new OpenAiClientBackend(http_headers));
 
-  // TODO: Adjust as needed
-  RETURN_IF_CB_ERROR(HttpClient::Create(
-      &(openai_client_backend->http_client_), url, verbose));
+  RETURN_IF_CB_ERROR(
+      HttpClient::Create(&(openai_client_backend->http_client_), url, verbose));
 
   *client_backend = std::move(openai_client_backend);
 
@@ -64,8 +63,8 @@ OpenAiClientBackend::AsyncInfer(
     callback(result);
   };
 
-  // TODO: make an async infer call
-  //RETURN_IF_CB_ERROR(http_client_->AsyncInfer(...));
+  RETURN_IF_CB_ERROR(http_client_->AsyncInfer(
+      wrapped_callback, options, inputs, outputs, *http_headers_));
 
   return Error::Success;
 }
@@ -112,8 +111,7 @@ OpenAiInferRequestedOutput::Create(
   return Error::Success;
 }
 
-OpenAiInferRequestedOutput::OpenAiInferRequestedOutput(
-    const std::string& name)
+OpenAiInferRequestedOutput::OpenAiInferRequestedOutput(const std::string& name)
     : InferRequestedOutput(BackendKind::OPENAI, name)
 {
 }
diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc
index 5263407c3..a9d4029cb 100644
--- a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc
+++ b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.cc
@@ -26,6 +26,8 @@
 
 #include "openai_http_client.h"
 
+#include <rapidjson/document.h>
+
 namespace triton { namespace perfanalyzer { namespace clientbackend {
 namespace openai {
 
@@ -40,6 +42,63 @@ HttpClient::Create(
   return Error::Success;
 }
 
+Error
+HttpClient::AsyncInfer(
+    OpenAiOnCompleteFn callback, const InferOptions& options,
+    const std::vector<InferInput*>& inputs,
+    const std::vector<const InferRequestedOutput*>& outputs,
+    const Headers& headers)
+{
+  // TODO FIXME implement
+
+  // TODO FIXME cleanup or remove this. It just proves the json data arrives
+  rapidjson::Document d{};
+
+  if (inputs.size() != 1) {
+    return Error("Only expecting one input");
+  }
+
+  auto raw_input = dynamic_cast<OpenAiInferInput*>(inputs[0]);
+
+  raw_input->PrepareForRequest();
+  bool end_of_input = false;
+  const uint8_t* buf;
+  size_t buf_size;
+  raw_input->GetNext(&buf, &buf_size, &end_of_input);
+  if (!end_of_input) {
+    return Error("Unexpected multiple json data inputs");
+  }
+  if (buf == nullptr) {
+    return Error("Unexpected null json data");
+  }
+
+  std::string json_str(reinterpret_cast<const char*>(buf), buf_size);
+  std::cout << "FIXME TODO: JSON data string is " << json_str << std::endl;
+
+
+  if (d.Parse(json_str.c_str()).HasParseError()) {
+    return Error("Unable to parse json string: " + json_str);
+  }
+
+  // FIXME TKG -- where/how would the 'streaming' option get plugged in?
+
+  // FIXME TKG -- GOOD GOD! Is it this hard to add a single value into a json
+  // object??
+  d.AddMember(
+      "model",
+      rapidjson::Value().SetString(
+          options.model_name_.c_str(),
+          static_cast<rapidjson::SizeType>(options.model_name_.length()),
+          d.GetAllocator()),
+      d.GetAllocator());
+
+  for (auto itr = d.MemberBegin(); itr != d.MemberEnd(); ++itr) {
+    std::cout << "FIXME TODO: valid JSON object has key "
+              << itr->name.GetString() << std::endl;
+  }
+
+  return Error::Success;
+}
 
 HttpClient::HttpClient(const std::string& url, bool verbose)
     : InferenceServerClient(verbose), url_(url)
diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h
index 67f7d9144..bbdaddfe9 100644
--- a/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h
+++ b/src/c++/perf_analyzer/client_backend/openai/openai_http_client.h
@@ -27,6 +27,7 @@
 
 #include "../client_backend.h"
 #include "common.h"
+#include "openai_infer_input.h"
 
 namespace tc = triton::client;
 
@@ -37,6 +38,8 @@ namespace openai {
 class InferResult;
 class HttpInferRequest;
 
+using OpenAiOnCompleteFn = std::function<void(InferResult*)>;
+
 //==============================================================================
 /// An HttpClient object is used to perform any kind of communication with the
 /// OpenAi service using
@@ -63,6 +66,15 @@ class HttpClient : public tc::InferenceServerClient {
       std::unique_ptr<HttpClient>* client, const std::string& server_url,
       const bool verbose);
 
+  /// TODO FIXME: Update
+  /// Run asynchronous inference on server.
+  Error AsyncInfer(
+      OpenAiOnCompleteFn callback, const InferOptions& options,
+      const std::vector<InferInput*>& inputs,
+      const std::vector<const InferRequestedOutput*>& outputs =
+          std::vector<const InferRequestedOutput*>(),
+      const Headers& headers = Headers());
+
  private:
   HttpClient(const std::string& url, bool verbose);
diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.cc b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.cc
new file mode 100644
index 000000000..70d827e85
--- /dev/null
+++ b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.cc
@@ -0,0 +1,112 @@
+// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "openai_infer_input.h"
+
+namespace triton { namespace perfanalyzer { namespace clientbackend {
+namespace openai {
+
+Error
+OpenAiInferInput::Create(
+    InferInput** infer_input, const std::string& name,
+    const std::vector<int64_t>& dims, const std::string& datatype)
+{
+  OpenAiInferInput* local_infer_input =
+      new OpenAiInferInput(name, dims, datatype);
+
+  *infer_input = local_infer_input;
+  return Error::Success;
+}
+
+Error
+OpenAiInferInput::SetShape(const std::vector<int64_t>& shape)
+{
+  shape_ = shape;
+  return Error::Success;
+}
+
+Error
+OpenAiInferInput::Reset()
+{
+  bufs_.clear();
+  buf_byte_sizes_.clear();
+  bufs_idx_ = 0;
+  byte_size_ = 0;
+  return Error::Success;
+}
+
+Error
+OpenAiInferInput::AppendRaw(const uint8_t* input, size_t input_byte_size)
+{
+  byte_size_ += input_byte_size;
+
+  bufs_.push_back(input);
+  buf_byte_sizes_.push_back(input_byte_size);
+
+  return Error::Success;
+}
+
+Error
+OpenAiInferInput::ByteSize(size_t* byte_size) const
+{
+  *byte_size = byte_size_;
+  return Error::Success;
+}
+
+Error
+OpenAiInferInput::PrepareForRequest()
+{
+  // Reset position so request sends entire input.
+  bufs_idx_ = 0;
+  buf_pos_ = 0;
+  return Error::Success;
+}
+
+Error
+OpenAiInferInput::GetNext(
+    const uint8_t** buf, size_t* input_bytes, bool* end_of_input)
+{
+  if (bufs_idx_ < bufs_.size()) {
+    *buf = bufs_[bufs_idx_];
+    *input_bytes = buf_byte_sizes_[bufs_idx_];
+    bufs_idx_++;
+  } else {
+    *buf = nullptr;
+    *input_bytes = 0;
+  }
+  *end_of_input = (bufs_idx_ >= bufs_.size());
+
+  return Error::Success;
+}
+
+OpenAiInferInput::OpenAiInferInput(
+    const std::string& name, const std::vector<int64_t>& dims,
+    const std::string& datatype)
+    : InferInput(BackendKind::OPENAI, name, datatype), shape_(dims)
+{
+}
+
+}}}}  // namespace triton::perfanalyzer::clientbackend::openai
diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h
new file mode 100644
index 000000000..a10b9312f
--- /dev/null
+++ b/src/c++/perf_analyzer/client_backend/openai/openai_infer_input.h
@@ -0,0 +1,76 @@
+// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+
+#include <string>
+
+#include "../../perf_utils.h"
+#include "../client_backend.h"
+
+
+namespace triton { namespace perfanalyzer { namespace clientbackend {
+namespace openai {
+
+//==============================================================
+/// OpenAiInferInput instance holds the information regarding
+/// model input tensors and their corresponding generated data.
+///
+class OpenAiInferInput : public InferInput {
+ public:
+  static Error Create(
+      InferInput** infer_input, const std::string& name,
+      const std::vector<int64_t>& dims, const std::string& datatype);
+  /// See InferInput::Shape()
+  const std::vector<int64_t>& Shape() const override { return shape_; }
+  /// See InferInput::SetShape()
+  Error SetShape(const std::vector<int64_t>& shape) override;
+  /// See InferInput::Reset()
+  Error Reset() override;
+  /// See InferInput::AppendRaw()
+  Error AppendRaw(const uint8_t* input, size_t input_byte_size) override;
+  /// Gets the size of data added into this input in bytes.
+  /// \param byte_size The size of data added in bytes.
+  /// \return Error object indicating success or failure.
+  Error ByteSize(size_t* byte_size) const;
+  /// Resets the heads to start providing data from the beginning.
+  Error PrepareForRequest();
+  /// Get the next chunk of data if available.
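+  /// \param buf Returns the pointer to the next chunk of data.
+  /// \param input_bytes Returns the byte size of the next chunk of data.
+  /// \param end_of_input Returns true if no more chunks remain after this
+  /// call.
+  /// \return Error object indicating success or failure.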
+  Error GetNext(const uint8_t** buf, size_t* input_bytes, bool* end_of_input);
+
+ private:
+  explicit OpenAiInferInput(
+      const std::string& name, const std::vector<int64_t>& dims,
+      const std::string& datatype);
+
+  std::vector<int64_t> shape_;
+  size_t byte_size_{0};
+
+  size_t bufs_idx_{0}, buf_pos_{0};
+  std::vector<const uint8_t*> bufs_;
+  std::vector<size_t> buf_byte_sizes_;
+};
+
+}}}}  // namespace triton::perfanalyzer::clientbackend::openai
diff --git a/src/c++/perf_analyzer/perf_utils.cc b/src/c++/perf_analyzer/perf_utils.cc
index d6c0a8a37..4c02f56ca 100644
--- a/src/c++/perf_analyzer/perf_utils.cc
+++ b/src/c++/perf_analyzer/perf_utils.cc
@@ -27,6 +27,8 @@
 #include "perf_utils.h"
 
 #include
+#include <rapidjson/stringbuffer.h>
+#include <rapidjson/writer.h>
 #include
 #include
 
@@ -200,6 +202,25 @@ SerializeExplicitTensor(
       std::copy(
           serialized.begin(), serialized.end(),
           std::back_inserter(*decoded_data));
+    } else if (dt.compare("JSON") == 0) {
+      std::string serialized = "";
+
+      for (const auto& value : tensor.GetArray()) {
+        rapidjson::StringBuffer buffer;
+        rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+        value.Accept(writer);
+
+        std::string element = buffer.GetString();
+        uint32_t len = element.size();
+        // FIXME TODO - for BYTES we add the length. Is there any reason that
+        // would be needed here?
+        // serialized.append(reinterpret_cast<const char*>(&len),
+        // sizeof(uint32_t));
+        serialized.append(element);
+      }
+      std::copy(
+          serialized.begin(), serialized.end(),
+          std::back_inserter(*decoded_data));
     } else {
       for (const auto& value : tensor.GetArray()) {
         if (dt.compare("BOOL") == 0) {
@@ -298,6 +319,8 @@ SerializeExplicitTensor(
           double element(value.GetDouble());
           const char* src = reinterpret_cast<const char*>(&element);
           decoded_data->insert(decoded_data->end(), src, src + sizeof(double));
+        } else {
+          return cb::Error("Unexpected type " + dt);
         }
       }
     }