Skip to content

Commit

Permalink
fix: Include garbage collection for non-streaming and improve triggering…
Browse files Browse the repository at this point in the history
… logic (#52)
  • Loading branch information
kthui authored Jul 26, 2024
1 parent fab8e86 commit c54dfef
Showing 1 changed file with 6 additions and 3 deletions.
9 changes: 6 additions & 3 deletions src/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def response_loop(self):
if response_flag == pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL:
self.ongoing_request_count -= 1
del response_sender
if self._response_queue.empty():
if self.ongoing_request_count == 0:
gc.collect()

def create_response(self, vllm_output, prepend_input):
Expand Down Expand Up @@ -343,6 +343,7 @@ async def generate(self, request):
"""
response_sender = request.get_response_sender()
self.ongoing_request_count += 1
decrement_ongoing_request_count = True
try:
request_id = random_uuid()
prompt = pb_utils.get_input_tensor_by_name(
Expand Down Expand Up @@ -398,9 +399,8 @@ async def generate(self, request):
lora_request = LoRARequest(lora_id, lora_int_id, lora_local_path)

response_iterator = await self.llm_engine.add_request(
request_id, prompt, sampling_params
request_id, prompt, sampling_params, lora_request=lora_request
)
decrement_ongoing_request_count = True

async for output in response_iterator:
if response_sender.is_cancelled():
Expand Down Expand Up @@ -447,6 +447,9 @@ async def generate(self, request):
finally:
if decrement_ongoing_request_count:
self.ongoing_request_count -= 1
del response_sender
if self.ongoing_request_count == 0:
gc.collect()

def verify_loras(self, request):
# We will check if the requested lora exists here, if not we will send a
Expand Down

0 comments on commit c54dfef

Please sign in to comment.