diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc
index cd517f6a6..9b167fae1 100644
--- a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc
+++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc
@@ -63,6 +63,7 @@ namespace openai {
 void
 ChatCompletionRequest::SendResponse(bool is_final, bool is_null)
 {
+  final_response_sent_ = is_final;
   response_callback_(new ChatCompletionResult(
       http_code_, std::move(response_buffer_), is_final, is_null, request_id_));
 }
@@ -172,7 +173,11 @@ ChatCompletionClient::AsyncInfer(
     request->timer_.CaptureTimestamp(
         triton::client::RequestTimers::Kind::REQUEST_END);
     UpdateInferStat(request->timer_);
-    if (!request->is_stream_) {
+
+    // Send final response on request completion
+    // if it has not already been sent.
+    // (e.g. in the case of seeing [DONE] in streaming case)
+    if (!request->IsFinalResponseSent()) {
       request->SendResponse(true /* is_final */, false /* is_null */);
     }
   };
diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.h b/src/c++/perf_analyzer/client_backend/openai/openai_client.h
index aadcb3252..00ccbd5fa 100644
--- a/src/c++/perf_analyzer/client_backend/openai/openai_client.h
+++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.h
@@ -121,12 +121,14 @@ class ChatCompletionRequest : public HttpRequest {
         request_id_(request_id)
   {
   }
+  bool IsFinalResponseSent() { return final_response_sent_; };
   void SendResponse(bool is_final, bool is_null);
   bool is_stream_{false};
   std::function<void(InferResult*)> response_callback_{nullptr};
   // The timers for infer request.
   triton::client::RequestTimers timer_;
   const std::string request_id_;
+  bool final_response_sent_{false};
 };

 class ChatCompletionClient : public HttpClient {