Skip to content

Commit

Permalink
Add some detached <-> scheduler edge cases
Browse files: browse the repository at this point in the history
  • Loading branch information
yngve-sk committed Oct 7, 2024
1 parent d1c3a88 commit c11b443
Showing 1 changed file with 25 additions and 6 deletions.
31 changes: 25 additions & 6 deletions src/everest/detached/__init__.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,7 +14,7 @@
from seba_sqlite.exceptions import ObjectNotFoundError
from seba_sqlite.snapshot import SebaSnapshot

from ert import BatchContext, BatchSimulator
from ert import BatchContext, BatchSimulator, JobState
from ert.config import ErtConfig, QueueSystem
from everest.config import EverestConfig
from everest.config_keys import ConfigKeys as CK
Expand Down Expand Up @@ -180,11 +180,30 @@ def wait_for_server(
)
# Job queueing may fail:
if context is not None and context.has_job_failed(0):
path = context.job_progress(0).steps[0].std_err_file
for err in extract_errors_from_file(path):
update_everserver_status(config, ServerStatus.failed, message=err)
logging.error(err)
raise SystemExit("Failed to start Everest server.")
job_progress = context.job_progress(0)

if job_progress is not None:
path = context.job_progress(0).steps[0].std_err_file
for err in extract_errors_from_file(path):
update_everserver_status(
config, ServerStatus.failed, message=err
)
logging.error(err)
raise SystemExit("Failed to start Everest server.")
else:
try:
state = context.get_job_state(0)

if state == JobState.WAITING:
# Job did fail, but is now in WAITING
logging.error("wait_for_server, job failing -> waiting")
except IndexError as e:
# Job is no longer registered in scheduler
logging.error(
f"wait_for_server, job removed from scheduler\n{e}"
)
raise SystemExit("Failed to start Everest server.") from e

sleep_time = sleep_time_increment * (2**retry_count)
time.sleep(sleep_time)
if server_is_running(config):
Expand Down

0 comments on commit c11b443

Please sign in to comment.