Piggyback final flag as a part of final response (#28)
Tabrizian authored Jan 14, 2024
1 parent 6e084cd commit 52c1c3c
Showing 1 changed file with 14 additions and 4 deletions.
18 changes: 14 additions & 4 deletions src/model.py
@@ -264,12 +264,21 @@ async def generate(self, request):
self.logger.log_info("[vllm] Successfully cancelled the request")
break
if stream:
response_sender.send(self.create_response(output))
if output.finished:
response_sender.send(
self.create_response(output),
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
)
else:
response_sender.send(self.create_response(output))
else:
last_output = output

if not stream:
response_sender.send(self.create_response(last_output))
response_sender.send(
self.create_response(last_output),
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
)

except Exception as e:
self.logger.log_info(f"[vllm] Error generating stream: {e}")
@@ -280,10 +289,11 @@ async def generate(self, request):
             response = pb_utils.InferenceResponse(
                 output_tensors=[triton_output_tensor], error=error
             )
-            response_sender.send(response)
+            response_sender.send(
+                response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
+            )
             raise e
         finally:
-            response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
             self.ongoing_request_count -= 1

     def execute(self, requests):
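The change above replaces the standalone final-flag send that used to live in the finally block with a final flag attached directly to the last data-bearing response. Below is a minimal sketch of the resulting send pattern, assuming it runs inside a Triton Python backend where triton_python_backend_utils is importable; the function name send_outputs and the response_iterator, create_response, and stream parameters are hypothetical stand-ins for the objects the real generate() method works with.

import triton_python_backend_utils as pb_utils


async def send_outputs(response_sender, response_iterator, create_response, stream):
    # Hypothetical helper mirroring the send pattern this commit adopts.
    last_output = None
    async for output in response_iterator:
        if stream:
            if output.finished:
                # Last streamed chunk: attach the final flag to the response
                # itself instead of sending a separate, empty final response.
                response_sender.send(
                    create_response(output),
                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
                )
            else:
                response_sender.send(create_response(output))
        else:
            last_output = output

    if not stream:
        # Non-streaming: the single response doubles as the final response.
        response_sender.send(
            create_response(last_output),
            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
        )

With the flag piggybacked this way, no extra empty response is needed to close the stream, and the finally block only has to decrement ongoing_request_count.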
