Skip to content

Commit

Permalink
Fix handling of squeue "Invalid job id specified" errors
Browse files Browse the repository at this point in the history
  • Loading branch information
giffels committed Dec 16, 2023
1 parent 685fda7 commit 693a54f
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 2 deletions.
2 changes: 1 addition & 1 deletion docs/source/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.. Created by changelog.py at 2023-11-29, command
.. Created by changelog.py at 2023-12-16, command
'/Users/giffler/.cache/pre-commit/repor6pnmwlm/py_env-python3.10/bin/changelog docs/source/changes compile --categories Added Changed Fixed Security Deprecated --output=docs/source/changelog.rst'
based on the format of 'https://keepachangelog.com/'
Expand Down
10 changes: 10 additions & 0 deletions tardis/adapters/sites/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,18 @@ async def squeue(
try:
slurm_status = await executor.run_command(cmd)
except CommandExecutionFailure as cf:
# If a job has already completed and
# only **one** non-existing job id is provided,
# squeue fails with exit code 1
# and prints "Invalid job id specified" to stderr.
if "Invalid job id specified" in cf.stderr and len(resource_attributes) == 1:
logger.info(
f"{remote_resource_ids} is not valid anymore. Assuming it is completed."
)
return [{"State": "COMPLETED"}]
logger.warning(f"Slurm status update has failed due to {cf}.")
raise

else:
for row in csv_parser(
slurm_status.stdout, fieldnames=tuple(attributes.keys()), delimiter="|"
Expand Down
26 changes: 25 additions & 1 deletion tests/adapters_t/sites_t/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,31 @@ def test_resource_state_translation(self):
self.mock_executor.reset_mock()

@mock_executor_run_command("")
def test_resource_status_of_completed_jobs(self):
def test_resource_status_of_completed_jobs_w_empty_reply(self):
response = run_async(
self.slurm_adapter.resource_status,
AttributeDict(
resource_id="1390065",
remote_resource_uuid="1351043",
),
)

self.assertEqual(response.resource_status, ResourceStatus.Deleted)

self.mock_executor.return_value.run_command.assert_called_with(
'squeue -o "%A|%N|%T" -h -t all --job=1351043'
)

@mock_executor_run_command(
stdout="",
raise_exception=CommandExecutionFailure(
message="Run command squeue --job=1351043 via SSHExecutor failed",
stdout="",
stderr="slurm_load_jobs error: Invalid job id specified",
exit_code=1,
),
)
def test_resource_status_of_completed_jobs_w_raised_exception(self):
response = run_async(
self.slurm_adapter.resource_status,
AttributeDict(
Expand Down

0 comments on commit 693a54f

Please sign in to comment.