From 06b6103a42d6f6cff668b436ac067aa083409767 Mon Sep 17 00:00:00 2001
From: Marco Donadoni
Date: Wed, 9 Aug 2023 10:53:46 +0200
Subject: [PATCH 1/2] job-monitor: consider `OOMKilled` pods as failed

Closes #396
---
 CHANGES.rst                         |  1 +
 reana_job_controller/job_monitor.py | 29 ++++++++++++++++++++++++++---
 tests/test_job_monitor.py           |  1 +
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index af30f41c..bd7fd6c6 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -6,6 +6,7 @@ Version 0.9.1 (UNRELEASED)
 
 - Fixes intermittent Slurm connection issues by DNS-resolving the Slurm head node IPv4 address before establishing connections.
 - Fixes deletion of failed jobs not being performed when Kerberos is enabled.
+- Fixes job monitoring to consider OOM-killed jobs as failed.
 - Changes Paramiko to version 3.0.0.
 - Changes HTCondor to version 9.0.17 (LTS).
 
diff --git a/reana_job_controller/job_monitor.py b/reana_job_controller/job_monitor.py
index a4d0394e..dd23d912 100644
--- a/reana_job_controller/job_monitor.py
+++ b/reana_job_controller/job_monitor.py
@@ -171,14 +171,37 @@ def get_job_status(self, job_pod) -> Optional[str]:
         """Get Kubernetes based REANA job status."""
         status = None
         backend_job_id = self.get_backend_job_id(job_pod)
+        container_statuses = self._get_job_container_statuses(job_pod)
+
         if job_pod.status.phase == "Succeeded":
-            logging.info("Kubernetes job id: {} succeeded.".format(backend_job_id))
-            status = JobStatus.finished.name
+            # checking that all the containers are `Completed`, as sometimes there
+            # can be `OOMKilled` containers that are considered as successful
+            for container in container_statuses:
+                try:
+                    reason = container.state.terminated.reason
+                except AttributeError:
+                    reason = None
+                if not reason:
+                    logging.info(
+                        f"No termination reason for container {container.name} in "
+                        f"Kubernetes job {backend_job_id}, assuming successful."
+                    )
+                elif reason != "Completed":
+                    logging.info(
+                        f"Kubernetes job id: {backend_job_id} failed, phase 'Succeeded' but "
+                        f"container '{container.name}' was terminated because of '{reason}'."
+                    )
+                    status = JobStatus.failed.name
+
+            if not status:
+                logging.info("Kubernetes job id: {} succeeded.".format(backend_job_id))
+                status = JobStatus.finished.name
+
         elif job_pod.status.phase == "Failed":
             logging.info("Kubernetes job id: {} failed.".format(backend_job_id))
             status = JobStatus.failed.name
+
         elif job_pod.status.phase == "Pending":
-            container_statuses = self._get_job_container_statuses(job_pod)
             for container in container_statuses:
                 try:
                     reason = container.state.waiting.reason
diff --git a/tests/test_job_monitor.py b/tests/test_job_monitor.py
index 6751e24e..154da13d 100644
--- a/tests/test_job_monitor.py
+++ b/tests/test_job_monitor.py
@@ -72,6 +72,7 @@ def test_kubernetes_get_job_logs(
         ("Succeeded", "Completed", "finished"),
         ("Failed", "Error", "failed"),
         ("Pending", ["Running", "ErrImagePull"], "failed"),
+        ("Succeeded", "OOMKilled", "failed"),
     ],
 )
 def test_kubernetes_get_job_status(

From 0872804e2f4b8fc52bb526013276c76005624119 Mon Sep 17 00:00:00 2001
From: Marco Donadoni
Date: Wed, 9 Aug 2023 10:54:00 +0200
Subject: [PATCH 2/2] tests: add support for podman

---
 run-tests.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/run-tests.sh b/run-tests.sh
index b82a7a5c..7d2f0d7c 100755
--- a/run-tests.sh
+++ b/run-tests.sh
@@ -48,6 +48,10 @@ start_db_container () {
     docker run --rm --name postgres__reana-job-controller -p 5432:5432 -e POSTGRES_PASSWORD=mysecretpassword -d docker.io/library/postgres:12.13
     _check_ready "Postgres" _db_check
     db_container_ip=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' postgres__reana-job-controller)
+    if [[ -z $db_container_ip ]]; then
+        # container does not have an IP when using podman
+        db_container_ip="localhost"
+    fi
     export REANA_SQLALCHEMY_DATABASE_URI=postgresql+psycopg2://postgres:mysecretpassword@$db_container_ip/postgres
 }
 
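For context, the decision introduced by the first patch can be exercised without a Kubernetes cluster. The sketch below mimics the new `get_job_status` handling of the `Succeeded` pod phase using plain stand-in objects; `Terminated`, `State`, `Container`, and `classify` are hypothetical names invented only for this illustration and are not part of the patch or of the Kubernetes client API.

# Minimal sketch (assumption: simplified stand-ins for Kubernetes container
# status objects). A pod in phase "Succeeded" is only reported as finished
# if no container was terminated for a reason other than "Completed".
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Terminated:
    reason: Optional[str]


@dataclass
class State:
    terminated: Optional[Terminated] = None


@dataclass
class Container:
    name: str
    state: State


def classify(phase: str, containers: List[Container]) -> str:
    """Mirror the patched logic for the 'Succeeded' pod phase."""
    if phase == "Failed":
        return "failed"
    if phase != "Succeeded":
        return "unknown"
    for container in containers:
        try:
            reason = container.state.terminated.reason
        except AttributeError:
            # No termination details available: assume the container
            # ended successfully, as the patch does.
            reason = None
        # Phase 'Succeeded' can hide an OOM-killed container; any
        # termination reason other than 'Completed' means failure.
        if reason and reason != "Completed":
            return "failed"
    return "finished"


assert classify("Succeeded", [Container("job", State(Terminated("Completed")))]) == "finished"
assert classify("Succeeded", [Container("job", State(Terminated("OOMKilled")))]) == "failed"
assert classify("Succeeded", [Container("job", State())]) == "finished"

The last assertion corresponds to the `(Succeeded, OOMKilled, failed)` case added to the parametrized test in `tests/test_job_monitor.py`: without the patch, such a pod would have been reported as finished solely because its phase is `Succeeded`.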