Skip to content

Commit

Permalink
fix(summaries) make run_job more resilient
Browse files Browse the repository at this point in the history
Consider the timeout over the entire process, not only inference.
  • Loading branch information
saghul committed Sep 5, 2024
1 parent 044446b commit fef096e
Showing 1 changed file with 9 additions and 4 deletions.
13 changes: 9 additions & 4 deletions skynet/modules/ttt/summaries/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,15 @@ async def update_job(job_id: str, expires: int = None, **kwargs) -> Job:


async def run_job(job: Job) -> None:
    """Run ``job`` with a ``restart_on_timeout`` task active alongside it.

    The timeout task is scheduled before ``_run_job`` is awaited, so it spans
    the whole run. Cancellation of that task is requested once ``_run_job``
    finishes, whether it returned normally or raised.
    """
    watchdog = asyncio.create_task(restart_on_timeout(job))

    try:
        await _run_job(job)
    finally:
        # Request cancellation in every case, success or failure.
        watchdog.cancel()


async def _run_job(job: Job) -> None:
has_failed = False
result = None
worker_id = await db.db.client_id()
Expand All @@ -137,8 +146,6 @@ async def run_job(job: Job) -> None:

result = job.payload.text
else:
exit_task = asyncio.create_task(restart_on_timeout(job))

try:
customer_id = job.metadata.customer_id
options = get_credentials(customer_id)
Expand Down Expand Up @@ -169,8 +176,6 @@ async def run_job(job: Job) -> None:
has_failed = True
result = str(e)

exit_task.cancel()

updated_job = await update_job(
expires=redis_exp_seconds if not has_failed else None,
job_id=job.id,
Expand Down

0 comments on commit fef096e

Please sign in to comment.