From 693a54f74a017323206d552afd37b9935c436a5f Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Sat, 16 Dec 2023 11:05:07 +0100 Subject: [PATCH] Fix handling squeue invalid jobid --- docs/source/changelog.rst | 2 +- tardis/adapters/sites/slurm.py | 10 ++++++++++ tests/adapters_t/sites_t/test_slurm.py | 26 +++++++++++++++++++++++++- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index c6667ad5..a72b8fda 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,4 +1,4 @@ -.. Created by changelog.py at 2023-11-29, command +.. Created by changelog.py at 2023-12-16, command '/Users/giffler/.cache/pre-commit/repor6pnmwlm/py_env-python3.10/bin/changelog docs/source/changes compile --categories Added Changed Fixed Security Deprecated --output=docs/source/changelog.rst' based on the format of 'https://keepachangelog.com/' diff --git a/tardis/adapters/sites/slurm.py b/tardis/adapters/sites/slurm.py index ff6ed652..3c37e52b 100644 --- a/tardis/adapters/sites/slurm.py +++ b/tardis/adapters/sites/slurm.py @@ -45,8 +45,18 @@ async def squeue( try: slurm_status = await executor.run_command(cmd) except CommandExecutionFailure as cf: + # In case a job is already completed and + # only **one** non-existing job id is provided + # squeue is failing with exit code 1 + # and prints "Invalid job id specified" to stderr + if "Invalid job id specified" in cf.stderr and len(resource_attributes) == 1: + logger.info( + f"{remote_resource_ids} is not valid anymore. Assuming it is completed." + ) + return [{"State": "COMPLETED"}] logger.warning(f"Slurm status update has failed due to {cf}.") raise + else: for row in csv_parser( slurm_status.stdout, fieldnames=tuple(attributes.keys()), delimiter="|" diff --git a/tests/adapters_t/sites_t/test_slurm.py b/tests/adapters_t/sites_t/test_slurm.py index 3e391452..673d3d9c 100644 --- a/tests/adapters_t/sites_t/test_slurm.py +++ b/tests/adapters_t/sites_t/test_slurm.py @@ -304,7 +304,31 @@ def test_resource_state_translation(self): self.mock_executor.reset_mock() @mock_executor_run_command("") - def test_resource_status_of_completed_jobs(self): + def test_resource_status_of_completed_jobs_w_empty_reply(self): + response = run_async( + self.slurm_adapter.resource_status, + AttributeDict( + resource_id="1390065", + remote_resource_uuid="1351043", + ), + ) + + self.assertEqual(response.resource_status, ResourceStatus.Deleted) + + self.mock_executor.return_value.run_command.assert_called_with( + 'squeue -o "%A|%N|%T" -h -t all --job=1351043' + ) + + @mock_executor_run_command( + stdout="", + raise_exception=CommandExecutionFailure( + message="Run command squeue --job=1351043 via SSHExecutor failed", + stdout="", + stderr="slurm_load_jobs error: Invalid job id specified", + exit_code=1, + ), + ) + def test_resource_status_of_completed_jobs_w_raised_exception(self): response = run_async( self.slurm_adapter.resource_status, AttributeDict(