diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc
index cd517f6a6..e835f988e 100644
--- a/src/c++/perf_analyzer/client_backend/openai/openai_client.cc
+++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.cc
@@ -63,6 +63,14 @@ namespace openai {
 void
 ChatCompletionRequest::SendResponse(bool is_final, bool is_null)
 {
+  // Once a final response has been sent (e.g. on detecting [DONE]),
+  // drop every later call, including the one made on request
+  // completion, so that a final response is delivered only once.
+  if (final_response_sent_) {
+    return;
+  }
+
+  final_response_sent_ = is_final;
   response_callback_(new ChatCompletionResult(
       http_code_, std::move(response_buffer_), is_final, is_null, request_id_));
 }
@@ -172,9 +180,11 @@ ChatCompletionClient::AsyncInfer(
     request->timer_.CaptureTimestamp(
         triton::client::RequestTimers::Kind::REQUEST_END);
     UpdateInferStat(request->timer_);
-    if (!request->is_stream_) {
-      request->SendResponse(true /* is_final */, false /* is_null */);
-    }
+
+    // SendResponse() returns early when a final response has already
+    // been sent (e.g. after [DONE] was seen in the streaming case),
+    // so it is called unconditionally here.
+    request->SendResponse(true /* is_final */, false /* is_null */);
   };
   std::unique_ptr request(new ChatCompletionRequest(
       std::move(completion_callback), std::move(callback), request_id,
diff --git a/src/c++/perf_analyzer/client_backend/openai/openai_client.h b/src/c++/perf_analyzer/client_backend/openai/openai_client.h
index aadcb3252..e63728fc4 100644
--- a/src/c++/perf_analyzer/client_backend/openai/openai_client.h
+++ b/src/c++/perf_analyzer/client_backend/openai/openai_client.h
@@ -127,6 +127,7 @@ class ChatCompletionRequest : public HttpRequest {
   // The timers for infer request.
   triton::client::RequestTimers timer_;
   const std::string request_id_;
+  bool final_response_sent_{false};  // True once a final response was sent.
 };
 
 class ChatCompletionClient : public HttpClient {