diff --git a/src/model.py b/src/model.py
index f21f2a35..80f51320 100644
--- a/src/model.py
+++ b/src/model.py
@@ -264,12 +264,21 @@ async def generate(self, request):
                     self.logger.log_info("[vllm] Successfully cancelled the request")
                     break
                 if stream:
-                    response_sender.send(self.create_response(output))
+                    if output.finished:
+                        response_sender.send(
+                            self.create_response(output),
+                            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
+                        )
+                    else:
+                        response_sender.send(self.create_response(output))
                 else:
                     last_output = output

             if not stream:
-                response_sender.send(self.create_response(last_output))
+                response_sender.send(
+                    self.create_response(last_output),
+                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
+                )

         except Exception as e:
             self.logger.log_info(f"[vllm] Error generating stream: {e}")
@@ -280,10 +289,11 @@ async def generate(self, request):
             response = pb_utils.InferenceResponse(
                 output_tensors=[triton_output_tensor], error=error
             )
-            response_sender.send(response)
+            response_sender.send(
+                response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
+            )
             raise e
         finally:
-            response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
             self.ongoing_request_count -= 1

     def execute(self, requests):
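
For reference, the pattern this patch adopts: in Triton's decoupled mode, `TRITONSERVER_RESPONSE_COMPLETE_FINAL` is attached to the last substantive response rather than emitted as a trailing, empty flags-only send from the `finally` block. Below is a minimal sketch of that send loop; `stream_responses`, `outputs`, and `create_response` are hypothetical stand-ins for the code surrounding the diff, not part of the patch itself:

```python
# Minimal sketch, assuming a decoupled Triton Python backend model.
# `outputs` (an iterable of vLLM RequestOutput-like objects) and
# `create_response` are hypothetical stand-ins for the diff's context;
# pb_utils is only importable inside a Triton Python backend process.
import triton_python_backend_utils as pb_utils


def stream_responses(response_sender, outputs, create_response):
    for output in outputs:
        if output.finished:
            # Mark the last real response as final instead of sending a
            # separate, empty flags-only response afterwards.
            response_sender.send(
                create_response(output),
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
            )
        else:
            response_sender.send(create_response(output))
```

Folding the flag into the final `send` avoids delivering an extra empty response to clients, while the error path still marks its `InferenceResponse` as final so the stream is always terminated exactly once.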