From 036d4fafbd21e0e53c678e610a1200d6284e9439 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 10 May 2024 07:00:18 +0200 Subject: [PATCH 1/3] Don't log exception if cancelled slurm job doesn't have stderr file This seems entirely expected and not worth reporting at an ERROR level. Fixes https://sentry.galaxyproject.org/share/issue/c7332e0cf1554c33b2cf56283acfcd1d/: ``` FileNotFoundError [Errno 2] No such file or directory: '/corral4/main/jobs/057/797/57797989/galaxy_57797989.e' ``` --- lib/galaxy/jobs/runners/slurm.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/galaxy/jobs/runners/slurm.py b/lib/galaxy/jobs/runners/slurm.py index 2c0ab4af6b59..98c373d96645 100644 --- a/lib/galaxy/jobs/runners/slurm.py +++ b/lib/galaxy/jobs/runners/slurm.py @@ -222,6 +222,9 @@ def __check_memory_limit(self, efile_path): return OUT_OF_MEMORY_MSG elif any(_ in stripped_line for _ in SLURM_MEMORY_LIMIT_EXCEEDED_PARTIAL_WARNINGS): return PROBABLY_OUT_OF_MEMORY_MSG + except FileNotFoundError: + # Entirely expected, as __check_memory_limit is only called if the job state is CANCELLED + return False except Exception: log.exception("Error reading end of %s:", efile_path) From c2d6986cd02c2a9f8f3822afe4a12211d86343e0 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 10 May 2024 07:09:01 +0200 Subject: [PATCH 2/3] Include traceback when logging email PJA exception The exception that is logged currently is: ``` EmailAction PJA Failed, exception: 'NoneType' object has no attribute 'name' ``` The unexpected NoneType is probably job.history, but hard to know. 
--- lib/galaxy/job_execution/actions/post.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/lib/galaxy/job_execution/actions/post.py b/lib/galaxy/job_execution/actions/post.py index 75b03cd3e9e6..10c1e3a57071 100644 --- a/lib/galaxy/job_execution/actions/post.py +++ b/lib/galaxy/job_execution/actions/post.py @@ -9,10 +9,7 @@ from galaxy.model import PostJobActionAssociation from galaxy.model.base import transaction -from galaxy.util import ( - send_mail, - unicodify, -) +from galaxy.util import send_mail from galaxy.util.custom_logging import get_logger log = get_logger(__name__) @@ -70,8 +67,8 @@ def execute(cls, app, sa_session, action, job, replacement_dict, final_job_state if link_invocation: body += f"\n\nWorkflow Invocation Report:\n{link_invocation}" send_mail(app.config.email_from, to, subject, body, app.config) - except Exception as e: - log.error("EmailAction PJA Failed, exception: %s", unicodify(e)) + except Exception: + log.exception("EmailAction PJA Failed") @classmethod def get_short_str(cls, pja): From d5e5f7d62d5c9ddc68a6d99aec7ec89c87a44de8 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 10 May 2024 07:30:32 +0200 Subject: [PATCH 3/3] Downgrade missing output file in working directory for failed jobs That seems perfectly normal, since jobs might die before they're actually even sent to the DRM, for instance when this happens: ``` parameter 'input1': the previously selected dataset has been deleted. ``` Might even set this to debug ... 
--- lib/galaxy/jobs/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/galaxy/jobs/__init__.py b/lib/galaxy/jobs/__init__.py index 11e66477a942..084f0e9d62ec 100644 --- a/lib/galaxy/jobs/__init__.py +++ b/lib/galaxy/jobs/__init__.py @@ -1419,8 +1419,8 @@ def fail( try: shutil.move(dataset_path.false_path, dataset_path.real_path) log.debug("fail(): Moved %s to %s", dataset_path.false_path, dataset_path.real_path) - except OSError as e: - log.error("fail(): Missing output file in working directory: %s", unicodify(e)) + except FileNotFoundError as e: + log.warning("fail(): Missing output file in working directory: %s", unicodify(e)) except Exception as e: log.exception(str(e)) for dataset_assoc in job.output_datasets + job.output_library_datasets: