From 6edf2426bb1b18f6898accd956ca711b5a6efd8c Mon Sep 17 00:00:00 2001 From: Keith Suderman Date: Fri, 20 Oct 2023 10:19:54 -0400 Subject: [PATCH 1/5] Patch logging statements in lib/galaxy/jobs/runners --- lib/galaxy/jobs/runners/__init__.py | 18 +++--- lib/galaxy/jobs/runners/aws.py | 10 +-- lib/galaxy/jobs/runners/cli.py | 36 ++++------- lib/galaxy/jobs/runners/condor.py | 34 +++++----- lib/galaxy/jobs/runners/drmaa.py | 30 ++++----- lib/galaxy/jobs/runners/godocker.py | 24 +++---- lib/galaxy/jobs/runners/kubernetes.py | 22 +++---- lib/galaxy/jobs/runners/local.py | 8 +-- lib/galaxy/jobs/runners/pbs.py | 62 ++++++++----------- lib/galaxy/jobs/runners/pulsar.py | 36 +++++------ .../jobs/runners/state_handlers/resubmit.py | 2 +- lib/galaxy/jobs/runners/tasks.py | 27 ++++---- lib/galaxy/jobs/runners/univa.py | 40 ++++++------ lib/galaxy/jobs/runners/util/cli/job/lsf.py | 4 +- lib/galaxy/jobs/runners/util/cli/job/pbs.py | 2 +- lib/galaxy/jobs/runners/util/cli/job/slurm.py | 2 +- .../jobs/runners/util/cli/job/torque.py | 2 +- lib/galaxy/jobs/runners/util/pykube_util.py | 2 +- lib/galaxy/jobs/runners/util/sudo.py | 2 +- 19 files changed, 156 insertions(+), 207 deletions(-) diff --git a/lib/galaxy/jobs/runners/__init__.py b/lib/galaxy/jobs/runners/__init__.py index c14aea726f03..5154d4dff93f 100644 --- a/lib/galaxy/jobs/runners/__init__.py +++ b/lib/galaxy/jobs/runners/__init__.py @@ -116,7 +116,7 @@ def _init_worker_threads(self): """Start ``nworkers`` worker threads.""" self.work_queue = Queue() self.work_threads = [] - log.debug(f"Starting {self.nworkers} {self.runner_name} workers") + log.debug("Starting %s %s workers", self.nworkers, self.runner_name) for i in range(self.nworkers): worker = threading.Thread(name="%s.work_thread-%d" % (self.runner_name, i), target=self.run_next) worker.daemon = True @@ -201,11 +201,11 @@ def put(self, job_wrapper: "MinimalJobWrapper"): # Required for exceptions thrown by object store incompatiblity. 
# tested by test/integration/objectstore/test_private_handling.py job_wrapper.fail(str(e), exception=e) - log.debug(f"Job [{job_wrapper.job_id}] failed to queue {put_timer}") + log.debug("Job [%s] failed to queue %s", job_wrapper.job_id, put_timer) return if queue_job: self.mark_as_queued(job_wrapper) - log.debug(f"Job [{job_wrapper.job_id}] queued {put_timer}") + log.debug("Job [%s] queued %s", job_wrapper.job_id, put_timer) def mark_as_queued(self, job_wrapper: "MinimalJobWrapper"): self.work_queue.put((self.queue_job, job_wrapper)) @@ -278,12 +278,12 @@ def prepare_job( # Make sure the job hasn't been deleted if job_state == model.Job.states.DELETED: - log.debug(f"({job_id}) Job deleted by user before it entered the {self.runner_name} queue") + log.debug("(%s) Job deleted by user before it entered the %s queue", job_id, self.runner_name) if self.app.config.cleanup_job in ("always", "onsuccess"): job_wrapper.cleanup() return False elif job_state != model.Job.states.QUEUED: - log.info(f"({job_id}) Job is in state {job_state}, skipping execution") + log.info("(%s) Job is in state %s, skipping execution", job_id, job_state) # cleanup may not be safe in all states return False @@ -475,7 +475,7 @@ def _handle_metadata_externally(self, job_wrapper: "MinimalJobWrapper", resolve_ env=os.environ, preexec_fn=os.setpgrp, ) - log.debug("execution of external set_meta for job %d finished" % job_wrapper.job_id) + log.debug("execution of external set_meta for job %d finished", job_wrapper.job_id) def get_job_file(self, job_wrapper: "MinimalJobWrapper", **kwds) -> str: job_metrics = job_wrapper.app.job_metrics @@ -504,7 +504,7 @@ def get_job_file(self, job_wrapper: "MinimalJobWrapper", **kwds) -> str: # Additional logging to enable if debugging from_work_dir handling, metadata # commands, etc... (or just peak in the job script.) 
job_id = job_wrapper.job_id - log.debug(f"({job_id}) command is: {command_line}") + log.debug("(%s) command is: %s", job_id, command_line) options.update(**kwds) return job_script(**options) @@ -713,7 +713,7 @@ def cleanup(self): prefix = f"({self.job_wrapper.get_id_tag()})" else: prefix = f"({self.job_wrapper.get_id_tag()}/{self.job_id})" - log.debug(f"{prefix} Unable to cleanup {file}: {unicodify(e)}") + log.debug("%s Unable to cleanup %s: %s", prefix, file, unicodify(e)) class AsynchronousJobState(JobState): @@ -840,7 +840,7 @@ def monitor_job(self, job_state): def shutdown(self): """Attempts to gracefully shut down the monitor thread""" - log.info(f"{self.runner_name}: Sending stop signal to monitor thread") + log.info("%s: Sending stop signal to monitor thread", self.runner_name) self.monitor_queue.put(STOP_SIGNAL) # Call the parent's shutdown method to stop workers self.shutdown_monitor() diff --git a/lib/galaxy/jobs/runners/aws.py b/lib/galaxy/jobs/runners/aws.py index bed35ec19c37..c9c07982c1c2 100644 --- a/lib/galaxy/jobs/runners/aws.py +++ b/lib/galaxy/jobs/runners/aws.py @@ -217,9 +217,9 @@ def __init__(self, app, nworkers, **kwargs): self._batch_client = session.client("batch") def queue_job(self, job_wrapper): - log.debug(f"Starting queue_job for job {job_wrapper.get_id_tag()}") + log.debug("Starting queue_job for job %s", job_wrapper.get_id_tag()) if not self.prepare_job(job_wrapper, include_metadata=False, modify_command_for_container=False): - log.debug(f"Not ready {job_wrapper.get_id_tag()}") + log.debug("Not ready %s", job_wrapper.get_id_tag()) return job_destination = job_wrapper.job_destination @@ -252,7 +252,7 @@ def _get_job_definition(self, job_wrapper, destination_params): jd_arn = self._register_job_definition(jd_name, container_image, destination_params) else: jd_arn = res["jobDefinitions"][0]["jobDefinitionArn"] - log.debug(f"Found existing job definition: {jd_name}.") + log.debug("Found existing job definition: %s.", jd_name) return jd_arn @@ -325,7 +325,7 @@ def _get_retry_strategy(self, destination_params): return strategy def _register_job_definition(self, jd_name, container_image, destination_params): - log.debug(f"Registering a new job definition: {jd_name}.") + log.debug("Registering a new job definition: %s.", jd_name) platform = destination_params.get("platform") volumes, mount_points = self._get_mount_volumes(destination_params) @@ -375,7 +375,7 @@ def _submit_job(self, job_def, job_wrapper, destination_params): job_name = self.JOB_NAME_PREFIX + job_wrapper.get_id_tag() command_script_path = self.write_command(job_wrapper) - log.info(f"Submitting job {job_name} to AWS Batch.") + log.info("Submitting job %s to AWS Batch.", job_name) res = self._batch_client.submit_job( jobName=job_name, jobQueue=destination_params.get("job_queue"), diff --git a/lib/galaxy/jobs/runners/cli.py b/lib/galaxy/jobs/runners/cli.py index 3af7b71d696a..71fccd89bfa7 100644 --- a/lib/galaxy/jobs/runners/cli.py +++ b/lib/galaxy/jobs/runners/cli.py @@ -50,7 +50,7 @@ def url_to_destination(self, url): job_params = {f"job_{k}": v for k, v in [kv.split("=", 1) for kv in job_params.split("&")]} params.update(shell_params) params.update(job_params) - log.debug(f"Converted URL '{url}' to destination runner=cli, params={params}") + log.debug("Converted URL '%s' to destination runner=cli, params=%s", url, params) # Create a dynamic JobDestination return JobDestination(runner="cli", params=params) @@ -96,7 +96,7 @@ def queue_job(self, job_wrapper): job_wrapper.cleanup() return - 
log.debug(f"({galaxy_id_tag}) submitting file: {ajs.job_file}") + log.debug("(%s) submitting file: %s", galaxy_id_tag, ajs.job_file) returncode, stdout = self.submit(shell, job_interface, ajs.job_file, galaxy_id_tag, retry=MAX_SUBMIT_RETRY) if returncode != 0: @@ -107,11 +107,11 @@ def queue_job(self, job_wrapper): submit_stdout = stdout.strip() external_job_id = submit_stdout and submit_stdout.split()[-1] if not external_job_id: - log.error(f"({galaxy_id_tag}) submission did not return a job identifier, failing job") + log.error("(%s) submission did not return a job identifier, failing job", galaxy_id_tag) job_wrapper.fail("failure submitting job") return - log.info(f"({galaxy_id_tag}) queued with identifier: {external_job_id}") + log.info("(%s) queued with identifier: %s", galaxy_id_tag, external_job_id) # store runner information for tracking if Galaxy restarts job_wrapper.set_external_id(external_job_id) @@ -136,9 +136,7 @@ def submit(self, shell, job_interface, job_file, galaxy_id_tag, retry=MAX_SUBMIT if returncode == 0: stdout = cmd_out.stdout if not stdout or not stdout.strip(): - log.warning( - f"({galaxy_id_tag}) Execute returned a 0 exit code but no external identifier will be recovered from empty stdout - stderr is {cmd_out.stderr}" - ) + log.warning("(%s) Execute returned a 0 exit code but no external identifier will be recovered from empty stdout - stderr is %s", galaxy_id_tag, cmd_out.stderr) return returncode, stdout stdout = f"({galaxy_id_tag}) submission failed (stdout): {cmd_out.stdout}" stderr = f"({galaxy_id_tag}) submission failed (stderr): {cmd_out.stderr}" @@ -170,18 +168,16 @@ def check_watched_items(self): if ajs.job_wrapper.get_state() == model.Job.states.DELETED: continue - log.debug(f"({id_tag}/{external_job_id}) job not found in batch state check") + log.debug("(%s/%s) job not found in batch state check", id_tag, external_job_id) shell_params, job_params = self.parse_destination_params(ajs.job_destination.params) shell, job_interface = self.get_cli_plugins(shell_params, job_params) cmd_out = shell.execute(job_interface.get_single_status(external_job_id)) state = job_interface.parse_single_status(cmd_out.stdout, external_job_id) if not state == model.Job.states.OK: - log.warning( - f"({id_tag}/{external_job_id}) job not found in batch state check, but found in individual state check" - ) + log.warning("(%s/%s) job not found in batch state check, but found in individual state check", id_tag, external_job_id) job_state = ajs.job_wrapper.get_state() if state != old_state: - log.debug(f"({id_tag}/{external_job_id}) state change: from {old_state} to {state}") + log.debug("(%s/%s) state change: from %s to %s", id_tag, external_job_id, old_state, state) if state == model.Job.states.ERROR and job_state != model.Job.states.STOPPED: # Try to find out the reason for exiting - this needs to happen before change_state # otherwise jobs depending on resubmission outputs see that job as failed and pause. 
@@ -201,7 +197,7 @@ def check_watched_items(self): ) if external_metadata: self.work_queue.put((self.handle_metadata_externally, ajs)) - log.debug(f"({id_tag}/{external_job_id}) job execution finished, running job wrapper finish method") + log.debug("(%s/%s) job execution finished, running job wrapper finish method", id_tag, external_job_id) self.work_queue.put((self.finish_job, ajs)) else: new_watched.append(ajs) @@ -253,11 +249,9 @@ def stop_job(self, job_wrapper): shell, job_interface = self.get_cli_plugins(shell_params, job_params) cmd_out = shell.execute(job_interface.delete(job.job_runner_external_id)) assert cmd_out.returncode == 0, cmd_out.stderr - log.debug(f"({job.id}/{job.job_runner_external_id}) Terminated at user's request") + log.debug("(%s/%s) Terminated at user's request", job.id, job.job_runner_external_id) except Exception as e: - log.debug( - f"({job.id}/{job.job_runner_external_id}) User killed running job, but error encountered during termination: {e}" - ) + log.debug("(%s/%s) User killed running job, but error encountered during termination: %s", job.id, job.job_runner_external_id, e) def recover(self, job, job_wrapper): """Recovers jobs stuck in the queued/running state when Galaxy started""" @@ -273,16 +267,12 @@ def recover(self, job, job_wrapper): ) ajs.command_line = job.command_line if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug( - f"({job.id}/{job.job_runner_external_id}) is still in {job.state} state, adding to the runner monitor queue" - ) + log.debug("(%s/%s) is still in %s state, adding to the runner monitor queue", job.id, job.job_runner_external_id, job.state) ajs.old_state = model.Job.states.RUNNING ajs.running = True self.monitor_queue.put(ajs) elif job.state == model.Job.states.QUEUED: - log.debug( - f"({job.id}/{job.job_runner_external_id}) is still in queued state, adding to the runner monitor queue" - ) + log.debug("(%s/%s) is still in queued state, adding to the runner monitor queue", job.id, job.job_runner_external_id) ajs.old_state = model.Job.states.QUEUED ajs.running = False self.monitor_queue.put(ajs) diff --git a/lib/galaxy/jobs/runners/condor.py b/lib/galaxy/jobs/runners/condor.py index 6e9862ac9e02..4c20334ba7f0 100644 --- a/lib/galaxy/jobs/runners/condor.py +++ b/lib/galaxy/jobs/runners/condor.py @@ -132,11 +132,11 @@ def queue_job(self, job_wrapper): job_wrapper.cleanup() return - log.debug(f"({galaxy_id_tag}) submitting file {executable}") + log.debug("(%s) submitting file %s", galaxy_id_tag, executable) external_job_id, message = condor_submit(submit_file) if external_job_id is None: - log.debug(f"condor_submit failed for job {job_wrapper.get_id_tag()}: {message}") + log.debug("condor_submit failed for job %s: %s", job_wrapper.get_id_tag(), message) if self.app.config.cleanup_job == "always": os.unlink(submit_file) cjs.cleanup() @@ -145,7 +145,7 @@ def queue_job(self, job_wrapper): os.unlink(submit_file) - log.info(f"({galaxy_id_tag}) queued as {external_job_id}") + log.info("(%s) queued as %s", galaxy_id_tag, external_job_id) # store runner information for tracking if Galaxy restarts job_wrapper.set_external_id(external_job_id) @@ -181,7 +181,7 @@ def check_watched_items(self): except Exception: # so we don't kill the monitor thread log.exception(f"({galaxy_id_tag}/{job_id}) Unable to check job status") - log.warning(f"({galaxy_id_tag}/{job_id}) job will now be errored") + log.warning("(%s/%s) job will now be errored", galaxy_id_tag, job_id) cjs.fail_message = "Cluster could not complete job" 
self.work_queue.put((self.fail_job, cjs)) continue @@ -191,10 +191,10 @@ def check_watched_items(self): cjs.job_wrapper.check_for_entry_points() if job_running and not cjs.running: - log.debug(f"({galaxy_id_tag}/{job_id}) job is now running") + log.debug("(%s/%s) job is now running", galaxy_id_tag, job_id) cjs.job_wrapper.change_state(model.Job.states.RUNNING) if not job_running and cjs.running: - log.debug(f"({galaxy_id_tag}/{job_id}) job has stopped running") + log.debug("(%s/%s) job has stopped running", galaxy_id_tag, job_id) # Will switching from RUNNING to QUEUED confuse Galaxy? # cjs.job_wrapper.change_state( model.Job.states.QUEUED ) job_state = cjs.job_wrapper.get_state() @@ -205,11 +205,11 @@ def check_watched_items(self): ) if external_metadata: self._handle_metadata_externally(cjs.job_wrapper, resolve_requirements=True) - log.debug(f"({galaxy_id_tag}/{job_id}) job has completed") + log.debug("(%s/%s) job has completed", galaxy_id_tag, job_id) self.work_queue.put((self.finish_job, cjs)) continue if job_failed: - log.debug(f"({galaxy_id_tag}/{job_id}) job failed") + log.debug("(%s/%s) job failed", galaxy_id_tag, job_id) cjs.failed = True self.work_queue.put((self.fail_job, cjs)) continue @@ -225,7 +225,7 @@ def stop_job(self, job_wrapper): galaxy_id_tag = job_wrapper.get_id_tag() if job.container: try: - log.info(f"stop_job(): {job.id}: trying to stop container .... ({external_id})") + log.info("stop_job(): %s: trying to stop container .... (%s)", job.id, external_id) # self.watched = [cjs for cjs in self.watched if cjs.job_id != external_id] new_watch_list = list() cjs = None @@ -244,21 +244,21 @@ def stop_job(self, job_wrapper): ) if external_metadata: self._handle_metadata_externally(cjs.job_wrapper, resolve_requirements=True) - log.debug(f"({galaxy_id_tag}/{external_id}) job has completed") + log.debug("(%s/%s) job has completed", galaxy_id_tag, external_id) self.work_queue.put((self.finish_job, cjs)) except Exception as e: - log.warning(f"stop_job(): {job.id}: trying to stop container failed. ({e})") + log.warning("stop_job(): %s: trying to stop container failed. (%s)", job.id, e) try: self._kill_container(job_wrapper) except Exception as e: - log.warning(f"stop_job(): {job.id}: trying to kill container failed. ({e})") + log.warning("stop_job(): %s: trying to kill container failed. (%s)", job.id, e) failure_message = condor_stop(external_id) if failure_message: - log.debug(f"({external_id}). Failed to stop condor {failure_message}") + log.debug("(%s). Failed to stop condor %s", external_id, failure_message) else: failure_message = condor_stop(external_id) if failure_message: - log.debug(f"({external_id}). Failed to stop condor {failure_message}") + log.debug("(%s). 
Failed to stop condor %s", external_id, failure_message) def recover(self, job, job_wrapper): """Recovers jobs stuck in the queued/running state when Galaxy started""" @@ -276,13 +276,11 @@ def recover(self, job, job_wrapper): cjs.user_log = os.path.join(job_wrapper.working_directory, f"galaxy_{galaxy_id_tag}.condor.log") cjs.register_cleanup_file_attribute("user_log") if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in {job.state} state, adding to the DRM queue" - ) + log.debug("(%s/%s) is still in %s state, adding to the DRM queue", job.id, job.get_job_runner_external_id(), job.state) cjs.running = True self.monitor_queue.put(cjs) elif job.state == model.Job.states.QUEUED: - log.debug(f"({job.id}/{job.job_runner_external_id}) is still in DRM queued state, adding to the DRM queue") + log.debug("(%s/%s) is still in DRM queued state, adding to the DRM queue", job.id, job.job_runner_external_id) cjs.running = False self.monitor_queue.put(cjs) diff --git a/lib/galaxy/jobs/runners/drmaa.py b/lib/galaxy/jobs/runners/drmaa.py index 5c1d68f70490..8cbe1ecffba8 100644 --- a/lib/galaxy/jobs/runners/drmaa.py +++ b/lib/galaxy/jobs/runners/drmaa.py @@ -108,10 +108,10 @@ def url_to_destination(self, url): native_spec = url.split("/")[2] if native_spec: params = dict(nativeSpecification=native_spec) - log.debug(f"Converted URL '{url}' to destination runner=drmaa, params={params}") + log.debug("Converted URL '%s' to destination runner=drmaa, params=%s", url, params) return JobDestination(runner="drmaa", params=params) else: - log.debug(f"Converted URL '{url}' to destination runner=drmaa") + log.debug("Converted URL '%s' to destination runner=drmaa", url) return JobDestination(runner="drmaa") def get_native_spec(self, url): @@ -197,7 +197,7 @@ def queue_job(self, job_wrapper): log.exception("(%s) drmaa.Session.runJob() failed unconditionally", galaxy_id_tag) trynum = 5 else: - log.error(f"({galaxy_id_tag}) All attempts to submit job failed") + log.error("(%s) All attempts to submit job failed", galaxy_id_tag) if not fail_msg: fail_msg = DEFAULT_JOB_RUNNER_FAILURE_MESSAGE job_wrapper.fail(fail_msg) @@ -216,13 +216,13 @@ def queue_job(self, job_wrapper): job_wrapper.fail(fail_msg) return pwent = job_wrapper.galaxy_system_pwent - log.debug(f"({galaxy_id_tag}) submitting with credentials: {pwent[0]} [uid: {pwent[2]}]") + log.debug("(%s) submitting with credentials: %s [uid: %s]", galaxy_id_tag, pwent[0], pwent[2]) self.userid = pwent[2] external_job_id = self.external_runjob(external_runjob_script, filename, pwent[2]) if external_job_id is None: job_wrapper.fail(f"({galaxy_id_tag}) could not queue job") return - log.info(f"({galaxy_id_tag}) queued as {external_job_id}") + log.info("(%s) queued as %s", galaxy_id_tag, external_job_id) # store runner information for tracking if Galaxy restarts job_wrapper.set_external_id(external_job_id) @@ -327,7 +327,7 @@ def check_watched_item(self, ajs, new_watched): except Exception: # so we don't kill the monitor thread log.exception(f"({galaxy_id_tag}/{external_job_id}) unable to check job status") - log.warning(f"({galaxy_id_tag}/{external_job_id}) job will now be errored") + log.warning("(%s/%s) job will now be errored", galaxy_id_tag, external_job_id) ajs.fail_message = "Cluster could not complete job" self.work_queue.put((self.fail_job, ajs)) return None @@ -347,7 +347,7 @@ def check_watched_items(self): if state is None: continue if state != old_state: - 
log.debug(f"({galaxy_id_tag}/{external_job_id}) state change: {self.drmaa_job_state_strings[state]}") + log.debug("(%s/%s) state change: %s", galaxy_id_tag, external_job_id, self.drmaa_job_state_strings[state]) if state == drmaa.JobState.RUNNING and not ajs.running: ajs.running = True ajs.job_wrapper.change_state(model.Job.states.RUNNING) @@ -381,11 +381,11 @@ def stop_job(self, job_wrapper): cmd = shlex.split(kill_script) cmd.extend([str(ext_id), str(self.userid)]) commands.execute(cmd) - log.info(f"({job.id}/{ext_id}) Removed from DRM queue at user's request") + log.info("(%s/%s) Removed from DRM queue at user's request", job.id, ext_id) except drmaa.InvalidJobException: log.exception(f"({job.id}/{ext_id}) User killed running job, but it was already dead") except commands.CommandLineException as e: - log.error(f"({job.id}/{ext_id}) User killed running job, but command execution failed: {unicodify(e)}") + log.error("(%s/%s) User killed running job, but command execution failed: %s", job.id, ext_id, unicodify(e)) except Exception: log.exception(f"({job.id}/{ext_id}) User killed running job, but error encountered removing from DRM queue") @@ -403,16 +403,12 @@ def recover(self, job, job_wrapper): ) ajs.command_line = job.get_command_line() if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in {job.state} state, adding to the DRM queue" - ) + log.debug("(%s/%s) is still in %s state, adding to the DRM queue", job.id, job.get_job_runner_external_id(), job.state) ajs.old_state = drmaa.JobState.RUNNING ajs.running = True self.monitor_queue.put(ajs) elif job.get_state() == model.Job.states.QUEUED: - log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in DRM queued state, adding to the DRM queue" - ) + log.debug("(%s/%s) is still in DRM queued state, adding to the DRM queue", job.id, job.get_job_runner_external_id()) ajs.old_state = drmaa.JobState.QUEUED_ACTIVE ajs.running = False self.monitor_queue.put(ajs) @@ -422,7 +418,7 @@ def store_jobtemplate(self, job_wrapper, jt): filename = os.path.join(job_wrapper.working_directory, f"{job_wrapper.get_id_tag()}.jt_json") with open(filename, "w") as fp: json.dump(jt, fp) - log.debug(f"({job_wrapper.job_id}) Job script for external submission is: {filename}") + log.debug("(%s) Job script for external submission is: %s", job_wrapper.job_id, filename) return filename def external_runjob(self, external_runjob_script, jobtemplate_filename, username): @@ -432,7 +428,7 @@ def external_runjob(self, external_runjob_script, jobtemplate_filename, username """ cmd = shlex.split(external_runjob_script) cmd.extend([str(username), jobtemplate_filename]) - log.info(f"Running command: {' '.join(cmd)}") + log.info("Running command: %s", ' '.join(cmd)) try: stdoutdata = commands.execute(cmd).strip() except commands.CommandLineException: diff --git a/lib/galaxy/jobs/runners/godocker.py b/lib/galaxy/jobs/runners/godocker.py index bcacb84fb700..2ddfce1b09df 100644 --- a/lib/galaxy/jobs/runners/godocker.py +++ b/lib/galaxy/jobs/runners/godocker.py @@ -158,7 +158,7 @@ def queue_job(self, job_wrapper): log.error("Job creation failure. No Response from GoDocker") job_wrapper.fail("Not submitted") else: - log.debug(f"Starting queue_job for job {job_id}") + log.debug("Starting queue_job for job %s", job_id) # Create an object of AsynchronousJobState and add it to the monitor queue. 
ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, @@ -186,7 +186,7 @@ def check_watched_item(self, job_state): # Get task from GoDocker job_persisted_state = job_state.job_wrapper.get_state() job_status_god = self.get_task(job_state.job_id) - log.debug(f"Job ID: {str(job_state.job_id)} Job Status: {str(job_status_god['status']['primary'])}") + log.debug("Job ID: %s Job Status: %s", str(job_state.job_id), str(job_status_god['status']['primary'])) if job_status_god["status"]["primary"] == "over" or job_persisted_state == model.Job.states.STOPPED: job_state.running = False @@ -238,7 +238,7 @@ def stop_job(self, job_wrapper): # This function is called by fail_job() # No Return data expected job_id = job_wrapper.job_id - log.debug(f"STOP JOB EXECUTION OF JOB ID: {str(job_id)}") + log.debug("STOP JOB EXECUTION OF JOB ID: %s", str(job_id)) # Get task status from GoDocker. job_status_god = self.get_task_status(job_id) if job_status_god["status"]["primary"] != "over": @@ -260,17 +260,13 @@ def recover(self, job, job_wrapper): ) job_wrapper.command_line = job.command_line if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in {job.state} state, adding to the god queue" - ) + log.debug("(%s/%s) is still in %s state, adding to the god queue", job.id, job.get_job_runner_external_id(), job.state) ajs.old_state = "R" ajs.running = True self.monitor_queue.put(ajs) elif job.state == model.Job.states.QUEUED: - log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in god queued state, adding to the god queue" - ) + log.debug("(%s/%s) is still in god queued state, adding to the god queue", job.id, job.get_job_runner_external_id()) ajs.old_state = "Q" ajs.running = False self.monitor_queue.put(ajs) @@ -309,9 +305,9 @@ def create_log_file(self, job_state, job_status_god): log_file.write(out_log) log_file.close() f.close() - log.debug(f"CREATE OUTPUT FILE: {job_state.output_file}") - log.debug(f"CREATE ERROR FILE: {job_state.error_file}") - log.debug(f"CREATE EXIT CODE FILE: {job_state.exit_code_file}") + log.debug("CREATE OUTPUT FILE: %s", job_state.output_file) + log.debug("CREATE ERROR FILE: %s", job_state.error_file) + log.debug("CREATE EXIT CODE FILE: %s", job_state.exit_code_file) except OSError as e: log.error("Could not access task log file: %s", unicodify(e)) log.debug("IO Error occurred when accessing the files.") @@ -325,7 +321,7 @@ def login(self, apikey, login, server, noCert=False): Create Login model schema of GoDocker and call the http_post_request method. 
""" log.debug("LOGIN TASK TO BE EXECUTED \n") - log.debug(f"GODOCKER LOGIN: {str(login)}") + log.debug("GODOCKER LOGIN: %s", str(login)) data = json.dumps({"user": login, "apikey": apikey}) # Create object of Godocker class g_auth = Godocker(server, login, apikey, noCert) @@ -360,7 +356,7 @@ def post_task(self, job_wrapper): docker_image = self._find_container(job_wrapper).container_id log.debug("GoDocker runner using container %s.", docker_image) except Exception: - log.error(f"Unable to find docker_image for job {job_wrapper.job_id}, failing.") + log.error("Unable to find docker_image for job %s, failing.", job_wrapper.job_id) return False volumes = [] diff --git a/lib/galaxy/jobs/runners/kubernetes.py b/lib/galaxy/jobs/runners/kubernetes.py index 1f9e984605c3..aff56724d9bd 100644 --- a/lib/galaxy/jobs/runners/kubernetes.py +++ b/lib/galaxy/jobs/runners/kubernetes.py @@ -144,7 +144,7 @@ def queue_job(self, job_wrapper): # prepare the job # We currently don't need to include_metadata or include_work_dir_outputs, as working directory is the same # where galaxy will expect results. - log.debug(f"Starting queue_job for job {job_wrapper.get_id_tag()}") + log.debug("Starting queue_job for job %s", job_wrapper.get_id_tag()) ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, @@ -226,7 +226,7 @@ def __configure_port_routing(self, ajs): # Configure additional k8s service and ingress for tools with guest ports k8s_job_prefix = self.__produce_k8s_job_prefix() k8s_job_name = self.__get_k8s_job_name(k8s_job_prefix, ajs.job_wrapper) - log.debug(f"Configuring entry points and deploying service/ingress for job with ID {ajs.job_id}") + log.debug("Configuring entry points and deploying service/ingress for job with ID %s", ajs.job_id) k8s_service_obj = service_object_dict(self.runner_params, k8s_job_name, self.__get_k8s_service_spec(ajs)) k8s_ingress_obj = ingress_object_dict(self.runner_params, k8s_job_name, self.__get_k8s_ingress_spec(ajs)) @@ -879,7 +879,7 @@ def __job_pending_due_to_unschedulable_pod(self, job_state): def __cleanup_k8s_guest_ports(self, job_wrapper, k8s_job): k8s_job_prefix = self.__produce_k8s_job_prefix() k8s_job_name = f"{k8s_job_prefix}-{self.__force_label_conformity(job_wrapper.get_id_tag())}" - log.debug(f"Deleting service/ingress for job with ID {job_wrapper.get_id_tag()}") + log.debug("Deleting service/ingress for job with ID %s", job_wrapper.get_id_tag()) job_failed = k8s_job.obj["status"]["failed"] > 0 if "failed" in k8s_job.obj["status"] else False ingress_to_delete = find_ingress_object_by_name( self._pykube_api, k8s_job_name, self.runner_params["k8s_namespace"] @@ -908,7 +908,7 @@ def stop_job(self, job_wrapper): self.__cleanup_k8s_job(k8s_job) # TODO assert whether job parallelism == 0 # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists" - log.debug(f"({job.id}/{job.job_runner_external_id}) Terminated at user's request") + log.debug("(%s/%s) Terminated at user's request", job.id, job.job_runner_external_id) except Exception as e: log.exception( @@ -920,7 +920,7 @@ def stop_job(self, job_wrapper): def recover(self, job, job_wrapper): """Recovers jobs stuck in the queued/running state when Galaxy started""" job_id = job.get_job_runner_external_id() - log.debug(f"k8s trying to recover job: {job_id}") + log.debug("k8s trying to recover job: %s", job_id) if job_id is None: self.put(job_wrapper) return @@ -932,20 +932,12 @@ def recover(self, job, job_wrapper): ) ajs.command_line = job.command_line if 
job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug( - "({}/{}) is still in {} state, adding to the runner monitor queue".format( - job.id, job.job_runner_external_id, job.state - ) - ) + log.debug("(%s/%s) is still in %s state, adding to the runner monitor queue", job.id, job.job_runner_external_id, job.state) ajs.old_state = model.Job.states.RUNNING ajs.running = True self.monitor_queue.put(ajs) elif job.state == model.Job.states.QUEUED: - log.debug( - "({}/{}) is still in queued state, adding to the runner monitor queue".format( - job.id, job.job_runner_external_id - ) - ) + log.debug("(%s/%s) is still in queued state, adding to the runner monitor queue", job.id, job.job_runner_external_id) ajs.old_state = model.Job.states.QUEUED ajs.running = False self.monitor_queue.put(ajs) diff --git a/lib/galaxy/jobs/runners/local.py b/lib/galaxy/jobs/runners/local.py index d70768f8b856..1f46e6683187 100644 --- a/lib/galaxy/jobs/runners/local.py +++ b/lib/galaxy/jobs/runners/local.py @@ -95,7 +95,7 @@ def queue_job(self, job_wrapper): try: stdout_file = tempfile.NamedTemporaryFile(mode="wb+", suffix="_stdout", dir=job_wrapper.working_directory) stderr_file = tempfile.NamedTemporaryFile(mode="wb+", suffix="_stderr", dir=job_wrapper.working_directory) - log.debug(f"({job_id}) executing job script: {job_file}") + log.debug("(%s) executing job script: %s", job_id, job_file) # The preexec_fn argument of Popen() is used to call os.setpgrp() in # the child process just before the child is executed. This will set # the PGID of the child process to its PID (i.e. ensures that it is @@ -140,7 +140,7 @@ def queue_job(self, job_wrapper): stderr = self._job_io_for_db(stderr_file) stdout_file.close() stderr_file.close() - log.debug(f"execution finished: {job_file}") + log.debug("execution finished: %s", job_file) except Exception: log.exception("failure running job %d", job_wrapper.job_id) self._fail_job_local(job_wrapper, "failure running job") @@ -166,11 +166,11 @@ def stop_job(self, job_wrapper): # metadata internal or job not complete yet pid = job.get_job_runner_external_id() if pid in [None, ""]: - log.warning(f"stop_job(): {job.id}: no PID in database for job, unable to stop") + log.warning("stop_job(): %s: no PID in database for job, unable to stop", job.id) return pid = int(pid) if not check_pg(pid): - log.warning("stop_job(): %s: Process group %d was already dead or can't be signaled" % (job.id, pid)) + log.warning("stop_job(): %s: Process group %d was already dead or can't be signaled", (job.id, pid)) return log.debug("stop_job(): %s: Terminating process group %d", job.id, pid) kill_pg(pid) diff --git a/lib/galaxy/jobs/runners/pbs.py b/lib/galaxy/jobs/runners/pbs.py index fa22c11b1a70..26176f851325 100644 --- a/lib/galaxy/jobs/runners/pbs.py +++ b/lib/galaxy/jobs/runners/pbs.py @@ -113,7 +113,7 @@ def __init__(self, app, nworkers): def default_pbs_server(self): if self.__default_pbs_server is None: self.__default_pbs_server = pbs.pbs_default() - log.debug(f"Set default PBS server to {self.default_pbs_server}") + log.debug("Set default PBS server to %s", self.default_pbs_server) return self.__default_pbs_server def url_to_destination(self, url): @@ -151,7 +151,7 @@ def url_to_destination(self, url): param, value = opt.split(None, 1) params[param] = value - log.debug(f"Converted URL '{url}' to destination runner=pbs, params={params}") + log.debug("Converted URL '%s' to destination runner=pbs, params=%s", url, params) # Create a dynamic JobDestination return 
JobDestination(runner="pbs", params=params) @@ -171,7 +171,7 @@ def parse_destination_params(self, params): arg = arg.lstrip("-") args[arg] = value except Exception: - log.warning(f"Unrecognized long argument in destination params: {arg}") + log.warning("Unrecognized long argument in destination params: %s", arg) return self.__args_to_attrs(args) # Internal stuff @@ -234,7 +234,7 @@ def queue_job(self, job_wrapper): if c <= 0: errno, text = pbs.error() job_wrapper.fail("Unable to queue job for execution. Resubmitting the job may succeed.") - log.error(f"Connection to PBS server for submit failed: {errno}: {text}") + log.error("Connection to PBS server for submit failed: %s: %s", errno, text) return # define job attributes @@ -297,7 +297,7 @@ def queue_job(self, job_wrapper): self.write_executable_script(job_file, script, job_io=job_wrapper.job_io) # job was deleted while we were preparing it if job_wrapper.get_state() in (model.Job.states.DELETED, model.Job.states.STOPPED): - log.debug(f"Job {job_wrapper.job_id} deleted/stopped by user before it entered the PBS queue") + log.debug("Job %s deleted/stopped by user before it entered the PBS queue", job_wrapper.job_id) pbs.pbs_disconnect(c) if job_wrapper.cleanup_job in ("always", "onsuccess"): self.cleanup((ofile, efile, ecfile, job_file)) @@ -308,7 +308,7 @@ def queue_job(self, job_wrapper): # The job tag includes the job and the task identifier # (if a TaskWrapper was passed in): galaxy_job_id = job_wrapper.get_id_tag() - log.debug(f"({galaxy_job_id}) submitting file {job_file}") + log.debug("(%s) submitting file %s", galaxy_job_id, job_file) tries = 0 while tries < 5: @@ -318,17 +318,17 @@ def queue_job(self, job_wrapper): pbs.pbs_disconnect(c) break errno, text = pbs.error() - log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text)) + log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s", (galaxy_job_id, tries, errno, text)) time.sleep(2) else: - log.error(f"({galaxy_job_id}) All attempts to submit job failed") + log.error("(%s) All attempts to submit job failed", galaxy_job_id) job_wrapper.fail("Unable to run this job due to a cluster error, please retry it later") return if pbs_queue_name is None: - log.debug(f"({galaxy_job_id}) queued in default queue as {job_id}") + log.debug("(%s) queued in default queue as %s", galaxy_job_id, job_id) else: - log.debug(f"({galaxy_job_id}) queued in {pbs_queue_name} queue as {job_id}") + log.debug("(%s) queued in %s queue as %s", galaxy_job_id, pbs_queue_name, job_id) # persist destination job_wrapper.set_job_destination(job_destination, job_id) @@ -363,7 +363,7 @@ def check_watched_items(self): old_state = pbs_job_state.old_state pbs_server_name = self.__get_pbs_server(pbs_job_state.job_destination.params) if pbs_server_name in failures: - log.debug(f"({galaxy_job_id}/{job_id}) Skipping state check because PBS server connection failed") + log.debug("(%s/%s) Skipping state check because PBS server connection failed", galaxy_job_id, job_id) new_watched.append(pbs_job_state) continue try: @@ -374,25 +374,21 @@ def check_watched_items(self): try: # Recheck to make sure it wasn't a communication problem self.check_single_job(pbs_server_name, job_id) - log.warning( - f"({galaxy_job_id}/{job_id}) PBS job was not in state check list, but was found with individual state check" - ) + log.warning("(%s/%s) PBS job was not in state check list, but was found with individual state check", galaxy_job_id, job_id) new_watched.append(pbs_job_state) except 
Exception: errno, text = pbs.error() if errno == 15001: # 15001 == job not in queue - log.debug(f"({galaxy_job_id}/{job_id}) PBS job has left queue") + log.debug("(%s/%s) PBS job has left queue", galaxy_job_id, job_id) self.work_queue.put((self.finish_job, pbs_job_state)) else: # Unhandled error, continue to monitor - log.info( - "(%s/%s) PBS state check resulted in error (%d): %s" % (galaxy_job_id, job_id, errno, text) - ) + log.info("(%s/%s) PBS state check resulted in error (%d): %s", (galaxy_job_id, job_id, errno, text)) new_watched.append(pbs_job_state) continue if status.job_state != old_state: - log.debug(f"({galaxy_job_id}/{job_id}) PBS job state changed from {old_state} to {status.job_state}") + log.debug("(%s/%s) PBS job state changed from %s to %s", galaxy_job_id, job_id, old_state, status.job_state) if status.job_state == "R" and not pbs_job_state.running: pbs_job_state.running = True pbs_job_state.job_wrapper.change_state(model.Job.states.RUNNING) @@ -410,18 +406,18 @@ def check_watched_items(self): int(status.exit_status) == 0 or pbs_job_state.job_wrapper.get_state() == model.Job.states.STOPPED ) - log.debug(f"({galaxy_job_id}/{job_id}) PBS job has completed successfully") + log.debug("(%s/%s) PBS job has completed successfully", galaxy_job_id, job_id) except AssertionError: exit_status = int(status.exit_status) error_message = JOB_EXIT_STATUS.get(exit_status, f"Unknown error: {status.exit_status}") pbs_job_state.fail_message = CLUSTER_ERROR_MESSAGE % error_message - log.error(f"({galaxy_job_id}/{job_id}) PBS job failed: {error_message}") + log.error("(%s/%s) PBS job failed: %s", galaxy_job_id, job_id, error_message) pbs_job_state.stop_job = False self.work_queue.put((self.fail_job, pbs_job_state)) continue except AttributeError: # No exit_status, can't verify proper completion so we just have to assume success. 
- log.debug(f"({galaxy_job_id}/{job_id}) PBS job has completed") + log.debug("(%s/%s) PBS job has completed", galaxy_job_id, job_id) self.work_queue.put((self.finish_job, pbs_job_state)) continue pbs_job_state.old_state = status.job_state @@ -446,7 +442,7 @@ def check_all_jobs(self): for pbs_server_name in servers: c = pbs.pbs_connect(util.smart_str(pbs_server_name)) if c <= 0: - log.debug(f"connection to PBS server {pbs_server_name} for state check failed") + log.debug("connection to PBS server %s for state check failed", pbs_server_name) failures.append(pbs_server_name) continue stat_attrl = pbs.new_attrl(3) @@ -479,7 +475,7 @@ def check_single_job(self, pbs_server_name, job_id): """ c = pbs.pbs_connect(util.smart_str(pbs_server_name)) if c <= 0: - log.debug(f"connection to PBS server {pbs_server_name} for state check failed") + log.debug("connection to PBS server %s for state check failed", pbs_server_name) return None stat_attrl = pbs.new_attrl(1) stat_attrl[0].name = pbs.ATTR_state @@ -507,7 +503,7 @@ def stop_job(self, job_wrapper): job = job_wrapper.get_job() job_id = job.get_job_runner_external_id().encode("utf-8") job_tag = f"({job.get_id_tag()}/{job_id})" - log.debug(f"{job_tag} Stopping PBS job") + log.debug("%s Stopping PBS job", job_tag) # Declare the connection handle c so that it can be cleaned up: c = None @@ -515,17 +511,17 @@ def stop_job(self, job_wrapper): try: pbs_server_name = self.__get_pbs_server(job.destination_params) if pbs_server_name is None: - log.debug("(%s) Job queued but no destination stored in job params, cannot delete" % job_tag) + log.debug("(%s) Job queued but no destination stored in job params, cannot delete", job_tag) return c = pbs.pbs_connect(util.smart_str(pbs_server_name)) if c <= 0: - log.debug(f"({job_tag}) Connection to PBS server for job delete failed") + log.debug("(%s) Connection to PBS server for job delete failed", job_tag) return pbs.pbs_deljob(c, job_id, "") - log.debug(f"{job_tag} Removed from PBS queue before job completion") + log.debug("%s Removed from PBS queue before job completion", job_tag) except Exception: e = traceback.format_exc() - log.debug(f"{job_tag} Unable to stop job: {e}") + log.debug("%s Unable to stop job: %s", job_tag, e) finally: # Cleanup: disconnect from the server. 
if None is not c: @@ -546,16 +542,12 @@ def recover(self, job, job_wrapper): pbs_job_state.runner_url = job_wrapper.get_job_runner_url() job_wrapper.command_line = job.command_line if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in {job.state} state, adding to the PBS queue" - ) + log.debug("(%s/%s) is still in %s state, adding to the PBS queue", job.id, job.get_job_runner_external_id(), job.state) pbs_job_state.old_state = "R" pbs_job_state.running = True self.monitor_queue.put(pbs_job_state) elif job.state == model.Job.states.QUEUED: - log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in PBS queued state, adding to the PBS queue" - ) + log.debug("(%s/%s) is still in PBS queued state, adding to the PBS queue", job.id, job.get_job_runner_external_id()) pbs_job_state.old_state = "Q" pbs_job_state.running = False self.monitor_queue.put(pbs_job_state) diff --git a/lib/galaxy/jobs/runners/pulsar.py b/lib/galaxy/jobs/runners/pulsar.py index 2bba3504024a..4ec459845671 100644 --- a/lib/galaxy/jobs/runners/pulsar.py +++ b/lib/galaxy/jobs/runners/pulsar.py @@ -255,7 +255,7 @@ def __init_pulsar_app(self, conf, pulsar_conf_path): if pulsar_conf_path is None: log.info("Creating a Pulsar app with default configuration (no pulsar_conf specified).") else: - log.info(f"Loading Pulsar app configuration from {pulsar_conf_path}") + log.info("Loading Pulsar app configuration from %s", pulsar_conf_path) with open(pulsar_conf_path) as f: conf.update(yaml.safe_load(f) or {}) if "job_metrics_config_file" not in conf: @@ -403,7 +403,7 @@ def queue_job(self, job_wrapper): config_files = job_wrapper.extra_filenames tool_script = os.path.join(job_wrapper.working_directory, "tool_script.sh") if os.path.exists(tool_script): - log.debug(f"Registering tool_script for Pulsar transfer [{tool_script}]") + log.debug("Registering tool_script for Pulsar transfer [%s]", tool_script) job_directory_files.append(tool_script) config_files.append(tool_script) # Following is job destination environment variables @@ -437,7 +437,7 @@ def queue_job(self, job_wrapper): tool_directory_required_files=tool_directory_required_files, ) external_job_id = pulsar_submit_job(client, client_job_description, remote_job_config) - log.info(f"Pulsar job submitted with job_id {external_job_id}") + log.info("Pulsar job submitted with job_id %s", external_job_id) job = job_wrapper.get_job() # Set the job destination here (unlike other runners) because there are likely additional job destination # params from the Pulsar client. 
@@ -567,7 +567,7 @@ def _populate_parameter_defaults(self, job_destination): for key, value in self.destination_defaults.items(): if key in params: if value is PARAMETER_SPECIFICATION_IGNORED: - log.warning(f"Pulsar runner in selected configuration ignores parameter {key}") + log.warning("Pulsar runner in selected configuration ignores parameter %s", key) continue # if self.runner_params.get( key, None ): # # Let plugin define defaults for some parameters - @@ -711,12 +711,9 @@ def check_pid(self, pid): return True except OSError as e: if e.errno == errno.ESRCH: - log.debug("check_pid(): PID %d is dead" % pid) + log.debug("check_pid(): PID %d is dead", pid) else: - log.warning( - "check_pid(): Got errno %s when attempting to check PID %d: %s" - % (errno.errorcode[e.errno], pid, e.strerror) - ) + log.warning("check_pid(): Got errno %s when attempting to check PID %d: %s", (errno.errorcode[e.errno], pid, e.strerror)) return False def stop_job(self, job_wrapper): @@ -731,32 +728,29 @@ def stop_job(self, job_wrapper): 0 ].job_runner_external_pid # every JobExternalOutputMetadata has a pid set, we just need to take from one of them if pid in [None, ""]: - log.warning(f"stop_job(): {job.id}: no PID in database for job, unable to stop") + log.warning("stop_job(): %s: no PID in database for job, unable to stop", job.id) return pid = int(pid) if not self.check_pid(pid): - log.warning("stop_job(): %s: PID %d was already dead or can't be signaled" % (job.id, pid)) + log.warning("stop_job(): %s: PID %d was already dead or can't be signaled", (job.id, pid)) return for sig in [15, 9]: try: os.killpg(pid, sig) except OSError as e: - log.warning( - "stop_job(): %s: Got errno %s when attempting to signal %d to PID %d: %s" - % (job.id, errno.errorcode[e.errno], sig, pid, e.strerror) - ) + log.warning("stop_job(): %s: Got errno %s when attempting to signal %d to PID %d: %s", (job.id, errno.errorcode[e.errno], sig, pid, e.strerror)) return # give up sleep(2) if not self.check_pid(pid): - log.debug("stop_job(): %s: PID %d successfully killed with signal %d" % (job.id, pid, sig)) + log.debug("stop_job(): %s: PID %d successfully killed with signal %d", (job.id, pid, sig)) return else: - log.warning("stop_job(): %s: PID %d refuses to die after signaling TERM/KILL" % (job.id, pid)) + log.warning("stop_job(): %s: PID %d refuses to die after signaling TERM/KILL", (job.id, pid)) else: # Remote kill pulsar_url = job.job_runner_name job_id = job.job_runner_external_id - log.debug(f"Attempt remote Pulsar kill of job with url {pulsar_url} and id {job_id}") + log.debug("Attempt remote Pulsar kill of job with url %s and id %s", pulsar_url, job_id) client = self.get_client(job.destination_params, job_id) client.kill() @@ -766,7 +760,7 @@ def recover(self, job, job_wrapper): job_wrapper.command_line = job.get_command_line() state = job.get_state() if state in [model.Job.states.RUNNING, model.Job.states.QUEUED, model.Job.states.STOPPED]: - log.debug(f"(Pulsar/{job.id}) is still in {state} state, adding to the Pulsar queue") + log.debug("(Pulsar/%s) is still in %s state, adding to the Pulsar queue", job.id, state) job_state.old_state = True job_state.running = state == model.Job.states.RUNNING self.monitor_queue.put(job_state) @@ -838,7 +832,7 @@ def check_job_config(remote_job_config, check_features=None): # 0.6.0 was newest Pulsar version that did not report it's version. 
pulsar_version = PulsarJobRunner.pulsar_version(remote_job_config) needed_version = Version("0.0.0") - log.info(f"pulsar_version is {pulsar_version}") + log.info("pulsar_version is %s", pulsar_version) for feature, needed in list(check_features.items()) + [("_default_", True)]: if not needed: continue @@ -1153,7 +1147,7 @@ def input_metadata_rewrite(self, dataset, metadata_val): metadata_val, client_input_path_type=CLIENT_INPUT_PATH_TYPES.INPUT_METADATA_PATH ) if remote_input_path: - log.info(f"input_metadata_rewrite is {remote_input_path} from {metadata_val}") + log.info("input_metadata_rewrite is %s from %s", remote_input_path, metadata_val) self.path_rewrites_input_metadata[metadata_val] = remote_input_path return remote_input_path diff --git a/lib/galaxy/jobs/runners/state_handlers/resubmit.py b/lib/galaxy/jobs/runners/state_handlers/resubmit.py index b6bea8e3f467..cd763a833c95 100644 --- a/lib/galaxy/jobs/runners/state_handlers/resubmit.py +++ b/lib/galaxy/jobs/runners/state_handlers/resubmit.py @@ -98,7 +98,7 @@ def _handle_resubmit_definitions(resubmit_definitions, app, job_runner, job_stat float(delay) new_destination.params["__resubmit_delay_seconds"] = str(delay) except ValueError: - log.warning(f"Cannot delay job with delay [{delay}], does not appear to be a number.") + log.warning("Cannot delay job with delay [%s], does not appear to be a number.", delay) job_state.job_wrapper.set_job_destination(new_destination) # Clear external ID (state change below flushes the change) job.job_runner_external_id = None diff --git a/lib/galaxy/jobs/runners/tasks.py b/lib/galaxy/jobs/runners/tasks.py index 70088bb08e8d..a39fe00448ea 100644 --- a/lib/galaxy/jobs/runners/tasks.py +++ b/lib/galaxy/jobs/runners/tasks.py @@ -100,7 +100,7 @@ def queue_job(self, job_wrapper): task_state = tw.get_state() if model.Task.states.ERROR == task_state: job_exit_code = tw.get_exit_code() - log.debug("Canceling job %d: Task %s returned an error" % (tw.job_id, tw.task_id)) + log.debug("Canceling job %d: Task %s returned an error", (tw.job_id, tw.task_id)) self._cancel_job(job_wrapper, task_wrappers) tasks_complete = True break @@ -114,7 +114,7 @@ def queue_job(self, job_wrapper): if sleep_time < 8: sleep_time *= 2 job_wrapper.reclaim_ownership() # if running as the actual user, change ownership before merging. - log.debug(f"execution finished - beginning merge: {command_line}") + log.debug("execution finished - beginning merge: %s", command_line) stdout, stderr = splitter.do_merge(job_wrapper, task_wrappers) except Exception: job_wrapper.fail("failure running job", exception=True) @@ -141,7 +141,7 @@ def stop_job(self, job_wrapper): tasks = job.get_tasks() if len(tasks) > 0: for task in tasks: - log.debug(f"Killing task's job {task.id}") + log.debug("Killing task's job %s", task.id) self.app.job_manager.job_handler.dispatcher.stop(task) # There were no subtasks, so just kill the job. 
We'll touch @@ -157,7 +157,7 @@ def stop_job(self, job_wrapper): else: pid = job.job_runner_external_id if pid in [None, ""]: - log.warning(f"stop_job(): {job.id}: no PID in database for job, unable to stop") + log.warning("stop_job(): %s: no PID in database for job, unable to stop", job.id) return self._stop_pid(pid, job.id) @@ -196,9 +196,7 @@ def _cancel_job(self, job_wrapper, task_wrappers): task = task_wrapper.get_task() task_state = task.get_state() if model.Task.states.QUEUED == task_state: - log.debug( - "_cancel_job for job %d: Task %d is not running; setting state to DELETED" % (job.id, task.id) - ) + log.debug("_cancel_job for job %d: Task %d is not running; setting state to DELETED", (job.id, task.id)) task_wrapper.change_state(task.states.DELETED) # If a task failed, then the caller will have waited a few seconds # before recognizing the failure. In that time, a queued task could @@ -209,7 +207,7 @@ def _cancel_job(self, job_wrapper, task_wrappers): for task_wrapper in task_wrappers: if model.Task.states.RUNNING == task_wrapper.get_state(): task = task_wrapper.get_task() - log.debug("_cancel_job for job %d: Stopping running task %d" % (job.id, task.id)) + log.debug("_cancel_job for job %d: Stopping running task %d", (job.id, task.id)) job_wrapper.app.job_manager.job_handler.dispatcher.stop(task) def _check_pid(self, pid): @@ -224,9 +222,9 @@ def _stop_pid(self, pid, job_id): job's id (which is used for logging messages only right now). """ pid = int(pid) - log.debug(f"Stopping pid {pid}") + log.debug("Stopping pid %s", pid) if not self._check_pid(pid): - log.warning("_stop_pid(): %s: PID %d was already dead or can't be signaled" % (job_id, pid)) + log.warning("_stop_pid(): %s: PID %d was already dead or can't be signaled", (job_id, pid)) return for sig in [15, 9]: try: @@ -234,16 +232,13 @@ def _stop_pid(self, pid, job_id): except OSError as e: # This warning could be bogus; many tasks are stopped with # SIGTERM (signal 15), but ymmv depending on the platform. - log.warning( - "_stop_pid(): %s: Got errno %s when attempting to signal %d to PID %d: %s" - % (job_id, errno.errorcode[e.errno], sig, pid, e.strerror) - ) + log.warning("_stop_pid(): %s: Got errno %s when attempting to signal %d to PID %d: %s", (job_id, errno.errorcode[e.errno], sig, pid, e.strerror)) return # TODO: If we're stopping lots of tasks, then we will want to put this # avoid a two-second overhead using some other asynchronous method. sleep(2) if not self._check_pid(pid): - log.debug("_stop_pid(): %s: PID %d successfully killed with signal %d" % (job_id, pid, sig)) + log.debug("_stop_pid(): %s: PID %d successfully killed with signal %d", (job_id, pid, sig)) return else: - log.warning("_stop_pid(): %s: PID %d refuses to die after signaling TERM/KILL" % (job_id, pid)) + log.warning("_stop_pid(): %s: PID %d refuses to die after signaling TERM/KILL", (job_id, pid)) diff --git a/lib/galaxy/jobs/runners/univa.py b/lib/galaxy/jobs/runners/univa.py index c0d2ecb730c3..870a0265a74e 100644 --- a/lib/galaxy/jobs/runners/univa.py +++ b/lib/galaxy/jobs/runners/univa.py @@ -96,7 +96,7 @@ def _complete_terminal_job(self, ajs, drmaa_state, **kwargs): ajs.fail_message = "This job failed because it was cancelled." 
drmaa_state = self.drmaa.JobState.FAILED elif ("signal" in extinfo and extinfo["signal"] == "SIGKILL") and time_wasted > time_granted: - log.error(f"({ajs.job_wrapper.get_id_tag()}/{ajs.job_id}) Job hit walltime") + log.error("(%s/%s) Job hit walltime", ajs.job_wrapper.get_id_tag(), ajs.job_id) ajs.fail_message = ( "This job was terminated because it ran longer than the maximum allowed job run time." ) @@ -104,9 +104,7 @@ def _complete_terminal_job(self, ajs, drmaa_state, **kwargs): drmaa_state = self.drmaa.JobState.FAILED # test wasted>granted memory only if failed != 0 and exit_status != 0, ie if marked as failed elif state == self.drmaa.JobState.FAILED and mem_wasted > mem_granted * slots: - log.error( - f"({ajs.job_wrapper.get_id_tag()}/{ajs.job_id}) Job hit memory limit ({mem_wasted}>{mem_granted})" - ) + log.error("(%s/%s) Job hit memory limit (%s>%s)", ajs.job_wrapper.get_id_tag(), ajs.job_id, mem_wasted, mem_granted) ajs.fail_message = "This job was terminated because it used more than the maximum allowed memory." ajs.runner_state = ajs.runner_states.MEMORY_LIMIT_REACHED drmaa_state = self.drmaa_job_states.FAILED @@ -119,16 +117,14 @@ def _complete_terminal_job(self, ajs, drmaa_state, **kwargs): self.drmaa.JobState.SYSTEM_SUSPENDED, self.drmaa.JobState.USER_SUSPENDED, ]: - log.warning( - f"({ajs.job_wrapper.get_id_tag()}/{ajs.job_id}) Job is {self.drmaa_job_state_strings[state]}, returning to monitor queue" - ) + log.warning("(%s/%s) Job is %s, returning to monitor queue", ajs.job_wrapper.get_id_tag(), ajs.job_id, self.drmaa_job_state_strings[state]) # TODO return True? return True # job was not actually terminal elif state == self.drmaa.JobState.UNDETERMINED: - log.warning(f"({ajs.job_wrapper.get_id_tag()}/{ajs.job_id}) Job state could not be determined") + log.warning("(%s/%s) Job state could not be determined", ajs.job_wrapper.get_id_tag(), ajs.job_id) drmaa_state = self.drmaa_job_states.FAILED else: - log.error(f"DRMAAUniva: job {ajs.job_id} determined unknown state {state}") + log.error("DRMAAUniva: job %s determined unknown state %s", ajs.job_id, state) drmaa_state = self.drmaa_job_states.FAILED # by default, finish the job with the state from drmaa return super()._complete_terminal_job(ajs, drmaa_state=drmaa_state) @@ -275,7 +271,7 @@ def _get_drmaa_state_qacct(self, job_id, extinfo): # "NONE". If qdel was called multiple times, every invocation is recorded in a comma # separated list. 
if "deleted_by" in qacct and qacct["deleted_by"] != "NONE": - log.info(f"DRMAAUniva: job {job_id} was aborted by {qacct['deleted_by']}") + log.info("DRMAAUniva: job %s was aborted by %s", job_id, qacct['deleted_by']) extinfo["deleted"] = True return self.drmaa.JobState.FAILED @@ -291,14 +287,14 @@ def _get_drmaa_state_qacct(self, job_id, extinfo): if "exit_status" in qacct: qacct["exit_status"] = int(qacct["exit_status"]) if qacct["exit_status"] < 1: - log.error(f"DRMAAUniva: job {job_id} has exit status {qacct['exit_status']}") + log.error("DRMAAUniva: job %s has exit status %s", job_id, qacct['exit_status']) state = self.drmaa.JobState.DONE elif 0 < qacct["exit_status"] < 129: - log.error(f"DRMAAUniva: job {job_id} has exit status {qacct['exit_status']}") + log.error("DRMAAUniva: job %s has exit status %s", job_id, qacct['exit_status']) extinfo["exit_status"] = qacct["exit_status"] state = self.drmaa.JobState.FAILED else: - log.error(f"DRMAAUniva: job {job_id} was killed by signal {qacct['exit_status'] - 128}") + log.error("DRMAAUniva: job %s was killed by signal %s", job_id, qacct['exit_status'] - 128) state = self.drmaa.JobState.FAILED extinfo["signal"] = signals[qacct["exit_status"] - 128] @@ -318,7 +314,7 @@ def _get_drmaa_state_qacct(self, job_id, extinfo): elif code in [24, 25]: state = self.drmaa.JobState.RUNNING else: - log.error(f"DRMAAUniva: job {job_id} failed with failure {qacct['failed']}") + log.error("DRMAAUniva: job %s failed with failure %s", job_id, qacct['failed']) state = self.drmaa.JobState.FAILED # log.debug("UnivaJobRunner._get_drmaa_state_qacct ({jobid}) -> {state}".format(jobid=job_id, state=self.drmaa_job_state_strings[state])) return state @@ -419,7 +415,7 @@ def _get_drmaa_state_wait(self, job_id, ds, extinfo): extinfo["slots"] = float(rv.resourceUsage["slots"]) # log.debug("wait -> \texitStatus {0}\thasCoreDump {1}\thasExited {2}\thasSignal {3}\tjobId {4}\t\tterminatedSignal {5}\twasAborted {6}\tresourceUsage {7}".format(rv.exitStatus, rv.hasCoreDump, rv.hasExited, rv.hasSignal, rv.jobId, rv.terminatedSignal, rv.wasAborted, rv.resourceUsage)) if rv.wasAborted: - log.error(f"DRMAAUniva: job {job_id} was aborted according to wait()") + log.error("DRMAAUniva: job %s was aborted according to wait()", job_id) extinfo["deleted"] = True return self.drmaa.JobState.FAILED @@ -427,19 +423,19 @@ def _get_drmaa_state_wait(self, job_id, ds, extinfo): # but also violation of scheduler constraints state = self.drmaa.JobState.DONE if rv.exitStatus != 0: - log.error(f"DRMAAUniva: job {job_id} has exit status {rv.exitStatus}") + log.error("DRMAAUniva: job %s has exit status %s", job_id, rv.exitStatus) extinfo["state"] = self.drmaa.JobState.FAILED if not rv.hasExited or rv.hasSignal: if rv.hasCoreDump != 0: - log.error(f"DRMAAUniva: job {job_id} has core dump") + log.error("DRMAAUniva: job %s has core dump", job_id) extinfo["state"] = self.drmaa.JobState.FAILED elif len(rv.terminatedSignal) > 0: - log.error(f"DRMAAUniva: job {job_id} was kill by signal {rv.terminatedSignal}") + log.error("DRMAAUniva: job %s was kill by signal %s", job_id, rv.terminatedSignal) state = self.drmaa.JobState.FAILED extinfo["signal"] = rv.terminatedSignal elif rv.wasAborted == 0: - log.error(f"DRMAAUniva: job {job_id} has finished in unclear condition") + log.error("DRMAAUniva: job %s has finished in unclear condition", job_id) state = self.drmaa.JobState.FAILED # log.debug("UnivaJobRunner._get_drmaa_state_wait ({jobid}) -> {state}".format(jobid=job_id, state=self.drmaa_job_state_strings[state])) 
return state @@ -561,7 +557,7 @@ def _map_qstat_drmaa_states(self, job_id, state, extinfo): elif "w" in state: return self.drmaa.JobState.QUEUED_ACTIVE else: - log.error(f"DRMAAUniva: job {job_id} unknown state from qstat: {state}") + log.error("DRMAAUniva: job %s unknown state from qstat: %s", job_id, state) return self.drmaa.JobState.UNDETERMINED @@ -595,12 +591,12 @@ def _parse_native_specs(job_id, native_spec): if m is not None: tme = _parse_time(m.group(1)) if tme is None: - log.error(f"DRMAAUniva: job {job_id} has unparsable time native spec {native_spec}") + log.error("DRMAAUniva: job %s has unparsable time native spec %s", job_id, native_spec) # parse memory m = re.search(r"mem=([\d.]+[KGMT]?)[\s,]*", native_spec) if m is not None: mem = size_to_bytes(m.group(1)) # mem = _parse_mem(m.group(1)) if mem is None: - log.error(f"DRMAAUniva: job {job_id} has unparsable memory native spec {native_spec}") + log.error("DRMAAUniva: job %s has unparsable memory native spec %s", job_id, native_spec) return tme, mem diff --git a/lib/galaxy/jobs/runners/util/cli/job/lsf.py b/lib/galaxy/jobs/runners/util/cli/job/lsf.py index cdaedf447a42..272e198c0451 100644 --- a/lib/galaxy/jobs/runners/util/cli/job/lsf.py +++ b/lib/galaxy/jobs/runners/util/cli/job/lsf.py @@ -36,7 +36,7 @@ def job_script_kwargs(self, ofile, efile, job_name): k = argmap[k] scriptargs[k] = v except Exception: - log.warning(f"Unrecognized long argument passed to LSF CLI plugin: {k}") + log.warning("Unrecognized long argument passed to LSF CLI plugin: %s", k) # Generated template. template_scriptargs = "" @@ -81,7 +81,7 @@ def parse_single_status(self, status, job_id): # which would be badly handled here. So this only works well when Galaxy # is constantly monitoring the jobs. The logic here is that DONE jobs get forgotten # faster than failed jobs. - log.warning(f"Job id '{job_id}' not found LSF status check") + log.warning("Job id '%s' not found LSF status check", job_id) return job_states.OK return self._get_job_state(status) diff --git a/lib/galaxy/jobs/runners/util/cli/job/pbs.py b/lib/galaxy/jobs/runners/util/cli/job/pbs.py index 5e2cc06d8a8a..e4851db28130 100644 --- a/lib/galaxy/jobs/runners/util/cli/job/pbs.py +++ b/lib/galaxy/jobs/runners/util/cli/job/pbs.py @@ -19,7 +19,7 @@ def parse_status(self, status, job_ids): try: data = json.loads(status) except Exception: - log.warning(f"No valid qstat JSON return from `qstat -f -F json`, got the following: {status}") + log.warning("No valid qstat JSON return from `qstat -f -F json`, got the following: %s", status) rval = {} for job_id, job in data.get("Jobs", {}).items(): if job_id in job_ids: diff --git a/lib/galaxy/jobs/runners/util/cli/job/slurm.py b/lib/galaxy/jobs/runners/util/cli/job/slurm.py index 5c1626d7315f..996a680d5183 100644 --- a/lib/galaxy/jobs/runners/util/cli/job/slurm.py +++ b/lib/galaxy/jobs/runners/util/cli/job/slurm.py @@ -25,7 +25,7 @@ def job_script_kwargs(self, ofile, efile, job_name): k = argmap[k] scriptargs[k] = v except Exception: - log.warning(f"Unrecognized long argument passed to Slurm CLI plugin: {k}") + log.warning("Unrecognized long argument passed to Slurm CLI plugin: %s", k) # Generated template. 
template_scriptargs = "" diff --git a/lib/galaxy/jobs/runners/util/cli/job/torque.py b/lib/galaxy/jobs/runners/util/cli/job/torque.py index 56754b142b58..d48fefd8f0b3 100644 --- a/lib/galaxy/jobs/runners/util/cli/job/torque.py +++ b/lib/galaxy/jobs/runners/util/cli/job/torque.py @@ -76,7 +76,7 @@ def parse_status(self, status, job_ids): except Exception: tree = None if tree is None: - log.warning(f"No valid qstat XML return from `qstat -x`, got the following: {status}") + log.warning("No valid qstat XML return from `qstat -x`, got the following: %s", status) return None else: for job in tree.findall("Job"): diff --git a/lib/galaxy/jobs/runners/util/pykube_util.py b/lib/galaxy/jobs/runners/util/pykube_util.py index 7c3f32d87b09..45cb26db3473 100644 --- a/lib/galaxy/jobs/runners/util/pykube_util.py +++ b/lib/galaxy/jobs/runners/util/pykube_util.py @@ -295,7 +295,7 @@ def galaxy_instance_id(params): if re.match(r"(?!-)[a-z\d-]{1,20}(? Date: Fri, 20 Oct 2023 12:20:19 -0400 Subject: [PATCH 2/5] Re-ran patching --- lib/galaxy/jobs/runners/__init__.py | 18 +++--- lib/galaxy/jobs/runners/aws.py | 10 +-- lib/galaxy/jobs/runners/cli.py | 36 ++++------- lib/galaxy/jobs/runners/condor.py | 34 +++++----- lib/galaxy/jobs/runners/drmaa.py | 30 ++++----- lib/galaxy/jobs/runners/godocker.py | 24 +++---- lib/galaxy/jobs/runners/kubernetes.py | 22 +++---- lib/galaxy/jobs/runners/local.py | 8 +-- lib/galaxy/jobs/runners/pbs.py | 62 ++++++++----------- lib/galaxy/jobs/runners/pulsar.py | 36 +++++------ .../jobs/runners/state_handlers/resubmit.py | 2 +- lib/galaxy/jobs/runners/tasks.py | 27 ++++---- lib/galaxy/jobs/runners/univa.py | 40 ++++++------ lib/galaxy/jobs/runners/util/cli/job/lsf.py | 4 +- lib/galaxy/jobs/runners/util/cli/job/pbs.py | 2 +- lib/galaxy/jobs/runners/util/cli/job/slurm.py | 2 +- .../jobs/runners/util/cli/job/torque.py | 2 +- lib/galaxy/jobs/runners/util/pykube_util.py | 2 +- lib/galaxy/jobs/runners/util/sudo.py | 2 +- 19 files changed, 156 insertions(+), 207 deletions(-) diff --git a/lib/galaxy/jobs/runners/__init__.py b/lib/galaxy/jobs/runners/__init__.py index c14aea726f03..5154d4dff93f 100644 --- a/lib/galaxy/jobs/runners/__init__.py +++ b/lib/galaxy/jobs/runners/__init__.py @@ -116,7 +116,7 @@ def _init_worker_threads(self): """Start ``nworkers`` worker threads.""" self.work_queue = Queue() self.work_threads = [] - log.debug(f"Starting {self.nworkers} {self.runner_name} workers") + log.debug("Starting %s %s workers", self.nworkers, self.runner_name) for i in range(self.nworkers): worker = threading.Thread(name="%s.work_thread-%d" % (self.runner_name, i), target=self.run_next) worker.daemon = True @@ -201,11 +201,11 @@ def put(self, job_wrapper: "MinimalJobWrapper"): # Required for exceptions thrown by object store incompatiblity. 
# tested by test/integration/objectstore/test_private_handling.py job_wrapper.fail(str(e), exception=e) - log.debug(f"Job [{job_wrapper.job_id}] failed to queue {put_timer}") + log.debug("Job [%s] failed to queue %s", job_wrapper.job_id, put_timer) return if queue_job: self.mark_as_queued(job_wrapper) - log.debug(f"Job [{job_wrapper.job_id}] queued {put_timer}") + log.debug("Job [%s] queued %s", job_wrapper.job_id, put_timer) def mark_as_queued(self, job_wrapper: "MinimalJobWrapper"): self.work_queue.put((self.queue_job, job_wrapper)) @@ -278,12 +278,12 @@ def prepare_job( # Make sure the job hasn't been deleted if job_state == model.Job.states.DELETED: - log.debug(f"({job_id}) Job deleted by user before it entered the {self.runner_name} queue") + log.debug("(%s) Job deleted by user before it entered the %s queue", job_id, self.runner_name) if self.app.config.cleanup_job in ("always", "onsuccess"): job_wrapper.cleanup() return False elif job_state != model.Job.states.QUEUED: - log.info(f"({job_id}) Job is in state {job_state}, skipping execution") + log.info("(%s) Job is in state %s, skipping execution", job_id, job_state) # cleanup may not be safe in all states return False @@ -475,7 +475,7 @@ def _handle_metadata_externally(self, job_wrapper: "MinimalJobWrapper", resolve_ env=os.environ, preexec_fn=os.setpgrp, ) - log.debug("execution of external set_meta for job %d finished" % job_wrapper.job_id) + log.debug("execution of external set_meta for job %d finished", job_wrapper.job_id) def get_job_file(self, job_wrapper: "MinimalJobWrapper", **kwds) -> str: job_metrics = job_wrapper.app.job_metrics @@ -504,7 +504,7 @@ def get_job_file(self, job_wrapper: "MinimalJobWrapper", **kwds) -> str: # Additional logging to enable if debugging from_work_dir handling, metadata # commands, etc... (or just peak in the job script.) 
job_id = job_wrapper.job_id - log.debug(f"({job_id}) command is: {command_line}") + log.debug("(%s) command is: %s", job_id, command_line) options.update(**kwds) return job_script(**options) @@ -713,7 +713,7 @@ def cleanup(self): prefix = f"({self.job_wrapper.get_id_tag()})" else: prefix = f"({self.job_wrapper.get_id_tag()}/{self.job_id})" - log.debug(f"{prefix} Unable to cleanup {file}: {unicodify(e)}") + log.debug("%s Unable to cleanup %s: %s", prefix, file, unicodify(e)) class AsynchronousJobState(JobState): @@ -840,7 +840,7 @@ def monitor_job(self, job_state): def shutdown(self): """Attempts to gracefully shut down the monitor thread""" - log.info(f"{self.runner_name}: Sending stop signal to monitor thread") + log.info("%s: Sending stop signal to monitor thread", self.runner_name) self.monitor_queue.put(STOP_SIGNAL) # Call the parent's shutdown method to stop workers self.shutdown_monitor() diff --git a/lib/galaxy/jobs/runners/aws.py b/lib/galaxy/jobs/runners/aws.py index bed35ec19c37..c9c07982c1c2 100644 --- a/lib/galaxy/jobs/runners/aws.py +++ b/lib/galaxy/jobs/runners/aws.py @@ -217,9 +217,9 @@ def __init__(self, app, nworkers, **kwargs): self._batch_client = session.client("batch") def queue_job(self, job_wrapper): - log.debug(f"Starting queue_job for job {job_wrapper.get_id_tag()}") + log.debug("Starting queue_job for job %s", job_wrapper.get_id_tag()) if not self.prepare_job(job_wrapper, include_metadata=False, modify_command_for_container=False): - log.debug(f"Not ready {job_wrapper.get_id_tag()}") + log.debug("Not ready %s", job_wrapper.get_id_tag()) return job_destination = job_wrapper.job_destination @@ -252,7 +252,7 @@ def _get_job_definition(self, job_wrapper, destination_params): jd_arn = self._register_job_definition(jd_name, container_image, destination_params) else: jd_arn = res["jobDefinitions"][0]["jobDefinitionArn"] - log.debug(f"Found existing job definition: {jd_name}.") + log.debug("Found existing job definition: %s.", jd_name) return jd_arn @@ -325,7 +325,7 @@ def _get_retry_strategy(self, destination_params): return strategy def _register_job_definition(self, jd_name, container_image, destination_params): - log.debug(f"Registering a new job definition: {jd_name}.") + log.debug("Registering a new job definition: %s.", jd_name) platform = destination_params.get("platform") volumes, mount_points = self._get_mount_volumes(destination_params) @@ -375,7 +375,7 @@ def _submit_job(self, job_def, job_wrapper, destination_params): job_name = self.JOB_NAME_PREFIX + job_wrapper.get_id_tag() command_script_path = self.write_command(job_wrapper) - log.info(f"Submitting job {job_name} to AWS Batch.") + log.info("Submitting job %s to AWS Batch.", job_name) res = self._batch_client.submit_job( jobName=job_name, jobQueue=destination_params.get("job_queue"), diff --git a/lib/galaxy/jobs/runners/cli.py b/lib/galaxy/jobs/runners/cli.py index 3af7b71d696a..71fccd89bfa7 100644 --- a/lib/galaxy/jobs/runners/cli.py +++ b/lib/galaxy/jobs/runners/cli.py @@ -50,7 +50,7 @@ def url_to_destination(self, url): job_params = {f"job_{k}": v for k, v in [kv.split("=", 1) for kv in job_params.split("&")]} params.update(shell_params) params.update(job_params) - log.debug(f"Converted URL '{url}' to destination runner=cli, params={params}") + log.debug("Converted URL '%s' to destination runner=cli, params=%s", url, params) # Create a dynamic JobDestination return JobDestination(runner="cli", params=params) @@ -96,7 +96,7 @@ def queue_job(self, job_wrapper): job_wrapper.cleanup() return - 
log.debug(f"({galaxy_id_tag}) submitting file: {ajs.job_file}") + log.debug("(%s) submitting file: %s", galaxy_id_tag, ajs.job_file) returncode, stdout = self.submit(shell, job_interface, ajs.job_file, galaxy_id_tag, retry=MAX_SUBMIT_RETRY) if returncode != 0: @@ -107,11 +107,11 @@ def queue_job(self, job_wrapper): submit_stdout = stdout.strip() external_job_id = submit_stdout and submit_stdout.split()[-1] if not external_job_id: - log.error(f"({galaxy_id_tag}) submission did not return a job identifier, failing job") + log.error("(%s) submission did not return a job identifier, failing job", galaxy_id_tag) job_wrapper.fail("failure submitting job") return - log.info(f"({galaxy_id_tag}) queued with identifier: {external_job_id}") + log.info("(%s) queued with identifier: %s", galaxy_id_tag, external_job_id) # store runner information for tracking if Galaxy restarts job_wrapper.set_external_id(external_job_id) @@ -136,9 +136,7 @@ def submit(self, shell, job_interface, job_file, galaxy_id_tag, retry=MAX_SUBMIT if returncode == 0: stdout = cmd_out.stdout if not stdout or not stdout.strip(): - log.warning( - f"({galaxy_id_tag}) Execute returned a 0 exit code but no external identifier will be recovered from empty stdout - stderr is {cmd_out.stderr}" - ) + log.warning("(%s) Execute returned a 0 exit code but no external identifier will be recovered from empty stdout - stderr is %s", galaxy_id_tag, cmd_out.stderr) return returncode, stdout stdout = f"({galaxy_id_tag}) submission failed (stdout): {cmd_out.stdout}" stderr = f"({galaxy_id_tag}) submission failed (stderr): {cmd_out.stderr}" @@ -170,18 +168,16 @@ def check_watched_items(self): if ajs.job_wrapper.get_state() == model.Job.states.DELETED: continue - log.debug(f"({id_tag}/{external_job_id}) job not found in batch state check") + log.debug("(%s/%s) job not found in batch state check", id_tag, external_job_id) shell_params, job_params = self.parse_destination_params(ajs.job_destination.params) shell, job_interface = self.get_cli_plugins(shell_params, job_params) cmd_out = shell.execute(job_interface.get_single_status(external_job_id)) state = job_interface.parse_single_status(cmd_out.stdout, external_job_id) if not state == model.Job.states.OK: - log.warning( - f"({id_tag}/{external_job_id}) job not found in batch state check, but found in individual state check" - ) + log.warning("(%s/%s) job not found in batch state check, but found in individual state check", id_tag, external_job_id) job_state = ajs.job_wrapper.get_state() if state != old_state: - log.debug(f"({id_tag}/{external_job_id}) state change: from {old_state} to {state}") + log.debug("(%s/%s) state change: from %s to %s", id_tag, external_job_id, old_state, state) if state == model.Job.states.ERROR and job_state != model.Job.states.STOPPED: # Try to find out the reason for exiting - this needs to happen before change_state # otherwise jobs depending on resubmission outputs see that job as failed and pause. 
@@ -201,7 +197,7 @@ def check_watched_items(self): ) if external_metadata: self.work_queue.put((self.handle_metadata_externally, ajs)) - log.debug(f"({id_tag}/{external_job_id}) job execution finished, running job wrapper finish method") + log.debug("(%s/%s) job execution finished, running job wrapper finish method", id_tag, external_job_id) self.work_queue.put((self.finish_job, ajs)) else: new_watched.append(ajs) @@ -253,11 +249,9 @@ def stop_job(self, job_wrapper): shell, job_interface = self.get_cli_plugins(shell_params, job_params) cmd_out = shell.execute(job_interface.delete(job.job_runner_external_id)) assert cmd_out.returncode == 0, cmd_out.stderr - log.debug(f"({job.id}/{job.job_runner_external_id}) Terminated at user's request") + log.debug("(%s/%s) Terminated at user's request", job.id, job.job_runner_external_id) except Exception as e: - log.debug( - f"({job.id}/{job.job_runner_external_id}) User killed running job, but error encountered during termination: {e}" - ) + log.debug("(%s/%s) User killed running job, but error encountered during termination: %s", job.id, job.job_runner_external_id, e) def recover(self, job, job_wrapper): """Recovers jobs stuck in the queued/running state when Galaxy started""" @@ -273,16 +267,12 @@ def recover(self, job, job_wrapper): ) ajs.command_line = job.command_line if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug( - f"({job.id}/{job.job_runner_external_id}) is still in {job.state} state, adding to the runner monitor queue" - ) + log.debug("(%s/%s) is still in %s state, adding to the runner monitor queue", job.id, job.job_runner_external_id, job.state) ajs.old_state = model.Job.states.RUNNING ajs.running = True self.monitor_queue.put(ajs) elif job.state == model.Job.states.QUEUED: - log.debug( - f"({job.id}/{job.job_runner_external_id}) is still in queued state, adding to the runner monitor queue" - ) + log.debug("(%s/%s) is still in queued state, adding to the runner monitor queue", job.id, job.job_runner_external_id) ajs.old_state = model.Job.states.QUEUED ajs.running = False self.monitor_queue.put(ajs) diff --git a/lib/galaxy/jobs/runners/condor.py b/lib/galaxy/jobs/runners/condor.py index 6e9862ac9e02..4c20334ba7f0 100644 --- a/lib/galaxy/jobs/runners/condor.py +++ b/lib/galaxy/jobs/runners/condor.py @@ -132,11 +132,11 @@ def queue_job(self, job_wrapper): job_wrapper.cleanup() return - log.debug(f"({galaxy_id_tag}) submitting file {executable}") + log.debug("(%s) submitting file %s", galaxy_id_tag, executable) external_job_id, message = condor_submit(submit_file) if external_job_id is None: - log.debug(f"condor_submit failed for job {job_wrapper.get_id_tag()}: {message}") + log.debug("condor_submit failed for job %s: %s", job_wrapper.get_id_tag(), message) if self.app.config.cleanup_job == "always": os.unlink(submit_file) cjs.cleanup() @@ -145,7 +145,7 @@ def queue_job(self, job_wrapper): os.unlink(submit_file) - log.info(f"({galaxy_id_tag}) queued as {external_job_id}") + log.info("(%s) queued as %s", galaxy_id_tag, external_job_id) # store runner information for tracking if Galaxy restarts job_wrapper.set_external_id(external_job_id) @@ -181,7 +181,7 @@ def check_watched_items(self): except Exception: # so we don't kill the monitor thread log.exception(f"({galaxy_id_tag}/{job_id}) Unable to check job status") - log.warning(f"({galaxy_id_tag}/{job_id}) job will now be errored") + log.warning("(%s/%s) job will now be errored", galaxy_id_tag, job_id) cjs.fail_message = "Cluster could not complete job" 
self.work_queue.put((self.fail_job, cjs)) continue @@ -191,10 +191,10 @@ def check_watched_items(self): cjs.job_wrapper.check_for_entry_points() if job_running and not cjs.running: - log.debug(f"({galaxy_id_tag}/{job_id}) job is now running") + log.debug("(%s/%s) job is now running", galaxy_id_tag, job_id) cjs.job_wrapper.change_state(model.Job.states.RUNNING) if not job_running and cjs.running: - log.debug(f"({galaxy_id_tag}/{job_id}) job has stopped running") + log.debug("(%s/%s) job has stopped running", galaxy_id_tag, job_id) # Will switching from RUNNING to QUEUED confuse Galaxy? # cjs.job_wrapper.change_state( model.Job.states.QUEUED ) job_state = cjs.job_wrapper.get_state() @@ -205,11 +205,11 @@ def check_watched_items(self): ) if external_metadata: self._handle_metadata_externally(cjs.job_wrapper, resolve_requirements=True) - log.debug(f"({galaxy_id_tag}/{job_id}) job has completed") + log.debug("(%s/%s) job has completed", galaxy_id_tag, job_id) self.work_queue.put((self.finish_job, cjs)) continue if job_failed: - log.debug(f"({galaxy_id_tag}/{job_id}) job failed") + log.debug("(%s/%s) job failed", galaxy_id_tag, job_id) cjs.failed = True self.work_queue.put((self.fail_job, cjs)) continue @@ -225,7 +225,7 @@ def stop_job(self, job_wrapper): galaxy_id_tag = job_wrapper.get_id_tag() if job.container: try: - log.info(f"stop_job(): {job.id}: trying to stop container .... ({external_id})") + log.info("stop_job(): %s: trying to stop container .... (%s)", job.id, external_id) # self.watched = [cjs for cjs in self.watched if cjs.job_id != external_id] new_watch_list = list() cjs = None @@ -244,21 +244,21 @@ def stop_job(self, job_wrapper): ) if external_metadata: self._handle_metadata_externally(cjs.job_wrapper, resolve_requirements=True) - log.debug(f"({galaxy_id_tag}/{external_id}) job has completed") + log.debug("(%s/%s) job has completed", galaxy_id_tag, external_id) self.work_queue.put((self.finish_job, cjs)) except Exception as e: - log.warning(f"stop_job(): {job.id}: trying to stop container failed. ({e})") + log.warning("stop_job(): %s: trying to stop container failed. (%s)", job.id, e) try: self._kill_container(job_wrapper) except Exception as e: - log.warning(f"stop_job(): {job.id}: trying to kill container failed. ({e})") + log.warning("stop_job(): %s: trying to kill container failed. (%s)", job.id, e) failure_message = condor_stop(external_id) if failure_message: - log.debug(f"({external_id}). Failed to stop condor {failure_message}") + log.debug("(%s). Failed to stop condor %s", external_id, failure_message) else: failure_message = condor_stop(external_id) if failure_message: - log.debug(f"({external_id}). Failed to stop condor {failure_message}") + log.debug("(%s). 
Failed to stop condor %s", external_id, failure_message) def recover(self, job, job_wrapper): """Recovers jobs stuck in the queued/running state when Galaxy started""" @@ -276,13 +276,11 @@ def recover(self, job, job_wrapper): cjs.user_log = os.path.join(job_wrapper.working_directory, f"galaxy_{galaxy_id_tag}.condor.log") cjs.register_cleanup_file_attribute("user_log") if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in {job.state} state, adding to the DRM queue" - ) + log.debug("(%s/%s) is still in %s state, adding to the DRM queue", job.id, job.get_job_runner_external_id(), job.state) cjs.running = True self.monitor_queue.put(cjs) elif job.state == model.Job.states.QUEUED: - log.debug(f"({job.id}/{job.job_runner_external_id}) is still in DRM queued state, adding to the DRM queue") + log.debug("(%s/%s) is still in DRM queued state, adding to the DRM queue", job.id, job.job_runner_external_id) cjs.running = False self.monitor_queue.put(cjs) diff --git a/lib/galaxy/jobs/runners/drmaa.py b/lib/galaxy/jobs/runners/drmaa.py index 5c1d68f70490..8cbe1ecffba8 100644 --- a/lib/galaxy/jobs/runners/drmaa.py +++ b/lib/galaxy/jobs/runners/drmaa.py @@ -108,10 +108,10 @@ def url_to_destination(self, url): native_spec = url.split("/")[2] if native_spec: params = dict(nativeSpecification=native_spec) - log.debug(f"Converted URL '{url}' to destination runner=drmaa, params={params}") + log.debug("Converted URL '%s' to destination runner=drmaa, params=%s", url, params) return JobDestination(runner="drmaa", params=params) else: - log.debug(f"Converted URL '{url}' to destination runner=drmaa") + log.debug("Converted URL '%s' to destination runner=drmaa", url) return JobDestination(runner="drmaa") def get_native_spec(self, url): @@ -197,7 +197,7 @@ def queue_job(self, job_wrapper): log.exception("(%s) drmaa.Session.runJob() failed unconditionally", galaxy_id_tag) trynum = 5 else: - log.error(f"({galaxy_id_tag}) All attempts to submit job failed") + log.error("(%s) All attempts to submit job failed", galaxy_id_tag) if not fail_msg: fail_msg = DEFAULT_JOB_RUNNER_FAILURE_MESSAGE job_wrapper.fail(fail_msg) @@ -216,13 +216,13 @@ def queue_job(self, job_wrapper): job_wrapper.fail(fail_msg) return pwent = job_wrapper.galaxy_system_pwent - log.debug(f"({galaxy_id_tag}) submitting with credentials: {pwent[0]} [uid: {pwent[2]}]") + log.debug("(%s) submitting with credentials: %s [uid: %s]", galaxy_id_tag, pwent[0], pwent[2]) self.userid = pwent[2] external_job_id = self.external_runjob(external_runjob_script, filename, pwent[2]) if external_job_id is None: job_wrapper.fail(f"({galaxy_id_tag}) could not queue job") return - log.info(f"({galaxy_id_tag}) queued as {external_job_id}") + log.info("(%s) queued as %s", galaxy_id_tag, external_job_id) # store runner information for tracking if Galaxy restarts job_wrapper.set_external_id(external_job_id) @@ -327,7 +327,7 @@ def check_watched_item(self, ajs, new_watched): except Exception: # so we don't kill the monitor thread log.exception(f"({galaxy_id_tag}/{external_job_id}) unable to check job status") - log.warning(f"({galaxy_id_tag}/{external_job_id}) job will now be errored") + log.warning("(%s/%s) job will now be errored", galaxy_id_tag, external_job_id) ajs.fail_message = "Cluster could not complete job" self.work_queue.put((self.fail_job, ajs)) return None @@ -347,7 +347,7 @@ def check_watched_items(self): if state is None: continue if state != old_state: - 
log.debug(f"({galaxy_id_tag}/{external_job_id}) state change: {self.drmaa_job_state_strings[state]}") + log.debug("(%s/%s) state change: %s", galaxy_id_tag, external_job_id, self.drmaa_job_state_strings[state]) if state == drmaa.JobState.RUNNING and not ajs.running: ajs.running = True ajs.job_wrapper.change_state(model.Job.states.RUNNING) @@ -381,11 +381,11 @@ def stop_job(self, job_wrapper): cmd = shlex.split(kill_script) cmd.extend([str(ext_id), str(self.userid)]) commands.execute(cmd) - log.info(f"({job.id}/{ext_id}) Removed from DRM queue at user's request") + log.info("(%s/%s) Removed from DRM queue at user's request", job.id, ext_id) except drmaa.InvalidJobException: log.exception(f"({job.id}/{ext_id}) User killed running job, but it was already dead") except commands.CommandLineException as e: - log.error(f"({job.id}/{ext_id}) User killed running job, but command execution failed: {unicodify(e)}") + log.error("(%s/%s) User killed running job, but command execution failed: %s", job.id, ext_id, unicodify(e)) except Exception: log.exception(f"({job.id}/{ext_id}) User killed running job, but error encountered removing from DRM queue") @@ -403,16 +403,12 @@ def recover(self, job, job_wrapper): ) ajs.command_line = job.get_command_line() if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in {job.state} state, adding to the DRM queue" - ) + log.debug("(%s/%s) is still in %s state, adding to the DRM queue", job.id, job.get_job_runner_external_id(), job.state) ajs.old_state = drmaa.JobState.RUNNING ajs.running = True self.monitor_queue.put(ajs) elif job.get_state() == model.Job.states.QUEUED: - log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in DRM queued state, adding to the DRM queue" - ) + log.debug("(%s/%s) is still in DRM queued state, adding to the DRM queue", job.id, job.get_job_runner_external_id()) ajs.old_state = drmaa.JobState.QUEUED_ACTIVE ajs.running = False self.monitor_queue.put(ajs) @@ -422,7 +418,7 @@ def store_jobtemplate(self, job_wrapper, jt): filename = os.path.join(job_wrapper.working_directory, f"{job_wrapper.get_id_tag()}.jt_json") with open(filename, "w") as fp: json.dump(jt, fp) - log.debug(f"({job_wrapper.job_id}) Job script for external submission is: {filename}") + log.debug("(%s) Job script for external submission is: %s", job_wrapper.job_id, filename) return filename def external_runjob(self, external_runjob_script, jobtemplate_filename, username): @@ -432,7 +428,7 @@ def external_runjob(self, external_runjob_script, jobtemplate_filename, username """ cmd = shlex.split(external_runjob_script) cmd.extend([str(username), jobtemplate_filename]) - log.info(f"Running command: {' '.join(cmd)}") + log.info("Running command: %s", ' '.join(cmd)) try: stdoutdata = commands.execute(cmd).strip() except commands.CommandLineException: diff --git a/lib/galaxy/jobs/runners/godocker.py b/lib/galaxy/jobs/runners/godocker.py index bcacb84fb700..2ddfce1b09df 100644 --- a/lib/galaxy/jobs/runners/godocker.py +++ b/lib/galaxy/jobs/runners/godocker.py @@ -158,7 +158,7 @@ def queue_job(self, job_wrapper): log.error("Job creation failure. No Response from GoDocker") job_wrapper.fail("Not submitted") else: - log.debug(f"Starting queue_job for job {job_id}") + log.debug("Starting queue_job for job %s", job_id) # Create an object of AsynchronousJobState and add it to the monitor queue. 
ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, @@ -186,7 +186,7 @@ def check_watched_item(self, job_state): # Get task from GoDocker job_persisted_state = job_state.job_wrapper.get_state() job_status_god = self.get_task(job_state.job_id) - log.debug(f"Job ID: {str(job_state.job_id)} Job Status: {str(job_status_god['status']['primary'])}") + log.debug("Job ID: %s Job Status: %s", str(job_state.job_id), str(job_status_god['status']['primary'])) if job_status_god["status"]["primary"] == "over" or job_persisted_state == model.Job.states.STOPPED: job_state.running = False @@ -238,7 +238,7 @@ def stop_job(self, job_wrapper): # This function is called by fail_job() # No Return data expected job_id = job_wrapper.job_id - log.debug(f"STOP JOB EXECUTION OF JOB ID: {str(job_id)}") + log.debug("STOP JOB EXECUTION OF JOB ID: %s", str(job_id)) # Get task status from GoDocker. job_status_god = self.get_task_status(job_id) if job_status_god["status"]["primary"] != "over": @@ -260,17 +260,13 @@ def recover(self, job, job_wrapper): ) job_wrapper.command_line = job.command_line if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in {job.state} state, adding to the god queue" - ) + log.debug("(%s/%s) is still in %s state, adding to the god queue", job.id, job.get_job_runner_external_id(), job.state) ajs.old_state = "R" ajs.running = True self.monitor_queue.put(ajs) elif job.state == model.Job.states.QUEUED: - log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in god queued state, adding to the god queue" - ) + log.debug("(%s/%s) is still in god queued state, adding to the god queue", job.id, job.get_job_runner_external_id()) ajs.old_state = "Q" ajs.running = False self.monitor_queue.put(ajs) @@ -309,9 +305,9 @@ def create_log_file(self, job_state, job_status_god): log_file.write(out_log) log_file.close() f.close() - log.debug(f"CREATE OUTPUT FILE: {job_state.output_file}") - log.debug(f"CREATE ERROR FILE: {job_state.error_file}") - log.debug(f"CREATE EXIT CODE FILE: {job_state.exit_code_file}") + log.debug("CREATE OUTPUT FILE: %s", job_state.output_file) + log.debug("CREATE ERROR FILE: %s", job_state.error_file) + log.debug("CREATE EXIT CODE FILE: %s", job_state.exit_code_file) except OSError as e: log.error("Could not access task log file: %s", unicodify(e)) log.debug("IO Error occurred when accessing the files.") @@ -325,7 +321,7 @@ def login(self, apikey, login, server, noCert=False): Create Login model schema of GoDocker and call the http_post_request method. 
""" log.debug("LOGIN TASK TO BE EXECUTED \n") - log.debug(f"GODOCKER LOGIN: {str(login)}") + log.debug("GODOCKER LOGIN: %s", str(login)) data = json.dumps({"user": login, "apikey": apikey}) # Create object of Godocker class g_auth = Godocker(server, login, apikey, noCert) @@ -360,7 +356,7 @@ def post_task(self, job_wrapper): docker_image = self._find_container(job_wrapper).container_id log.debug("GoDocker runner using container %s.", docker_image) except Exception: - log.error(f"Unable to find docker_image for job {job_wrapper.job_id}, failing.") + log.error("Unable to find docker_image for job %s, failing.", job_wrapper.job_id) return False volumes = [] diff --git a/lib/galaxy/jobs/runners/kubernetes.py b/lib/galaxy/jobs/runners/kubernetes.py index 1f9e984605c3..aff56724d9bd 100644 --- a/lib/galaxy/jobs/runners/kubernetes.py +++ b/lib/galaxy/jobs/runners/kubernetes.py @@ -144,7 +144,7 @@ def queue_job(self, job_wrapper): # prepare the job # We currently don't need to include_metadata or include_work_dir_outputs, as working directory is the same # where galaxy will expect results. - log.debug(f"Starting queue_job for job {job_wrapper.get_id_tag()}") + log.debug("Starting queue_job for job %s", job_wrapper.get_id_tag()) ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, @@ -226,7 +226,7 @@ def __configure_port_routing(self, ajs): # Configure additional k8s service and ingress for tools with guest ports k8s_job_prefix = self.__produce_k8s_job_prefix() k8s_job_name = self.__get_k8s_job_name(k8s_job_prefix, ajs.job_wrapper) - log.debug(f"Configuring entry points and deploying service/ingress for job with ID {ajs.job_id}") + log.debug("Configuring entry points and deploying service/ingress for job with ID %s", ajs.job_id) k8s_service_obj = service_object_dict(self.runner_params, k8s_job_name, self.__get_k8s_service_spec(ajs)) k8s_ingress_obj = ingress_object_dict(self.runner_params, k8s_job_name, self.__get_k8s_ingress_spec(ajs)) @@ -879,7 +879,7 @@ def __job_pending_due_to_unschedulable_pod(self, job_state): def __cleanup_k8s_guest_ports(self, job_wrapper, k8s_job): k8s_job_prefix = self.__produce_k8s_job_prefix() k8s_job_name = f"{k8s_job_prefix}-{self.__force_label_conformity(job_wrapper.get_id_tag())}" - log.debug(f"Deleting service/ingress for job with ID {job_wrapper.get_id_tag()}") + log.debug("Deleting service/ingress for job with ID %s", job_wrapper.get_id_tag()) job_failed = k8s_job.obj["status"]["failed"] > 0 if "failed" in k8s_job.obj["status"] else False ingress_to_delete = find_ingress_object_by_name( self._pykube_api, k8s_job_name, self.runner_params["k8s_namespace"] @@ -908,7 +908,7 @@ def stop_job(self, job_wrapper): self.__cleanup_k8s_job(k8s_job) # TODO assert whether job parallelism == 0 # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists" - log.debug(f"({job.id}/{job.job_runner_external_id}) Terminated at user's request") + log.debug("(%s/%s) Terminated at user's request", job.id, job.job_runner_external_id) except Exception as e: log.exception( @@ -920,7 +920,7 @@ def stop_job(self, job_wrapper): def recover(self, job, job_wrapper): """Recovers jobs stuck in the queued/running state when Galaxy started""" job_id = job.get_job_runner_external_id() - log.debug(f"k8s trying to recover job: {job_id}") + log.debug("k8s trying to recover job: %s", job_id) if job_id is None: self.put(job_wrapper) return @@ -932,20 +932,12 @@ def recover(self, job, job_wrapper): ) ajs.command_line = job.command_line if 
job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug( - "({}/{}) is still in {} state, adding to the runner monitor queue".format( - job.id, job.job_runner_external_id, job.state - ) - ) + log.debug("(%s/%s) is still in %s state, adding to the runner monitor queue", job.id, job.job_runner_external_id, job.state) ajs.old_state = model.Job.states.RUNNING ajs.running = True self.monitor_queue.put(ajs) elif job.state == model.Job.states.QUEUED: - log.debug( - "({}/{}) is still in queued state, adding to the runner monitor queue".format( - job.id, job.job_runner_external_id - ) - ) + log.debug("(%s/%s) is still in queued state, adding to the runner monitor queue", job.id, job.job_runner_external_id) ajs.old_state = model.Job.states.QUEUED ajs.running = False self.monitor_queue.put(ajs) diff --git a/lib/galaxy/jobs/runners/local.py b/lib/galaxy/jobs/runners/local.py index d70768f8b856..27bdbb33190f 100644 --- a/lib/galaxy/jobs/runners/local.py +++ b/lib/galaxy/jobs/runners/local.py @@ -95,7 +95,7 @@ def queue_job(self, job_wrapper): try: stdout_file = tempfile.NamedTemporaryFile(mode="wb+", suffix="_stdout", dir=job_wrapper.working_directory) stderr_file = tempfile.NamedTemporaryFile(mode="wb+", suffix="_stderr", dir=job_wrapper.working_directory) - log.debug(f"({job_id}) executing job script: {job_file}") + log.debug("(%s) executing job script: %s", job_id, job_file) # The preexec_fn argument of Popen() is used to call os.setpgrp() in # the child process just before the child is executed. This will set # the PGID of the child process to its PID (i.e. ensures that it is @@ -140,7 +140,7 @@ def queue_job(self, job_wrapper): stderr = self._job_io_for_db(stderr_file) stdout_file.close() stderr_file.close() - log.debug(f"execution finished: {job_file}") + log.debug("execution finished: %s", job_file) except Exception: log.exception("failure running job %d", job_wrapper.job_id) self._fail_job_local(job_wrapper, "failure running job") @@ -166,11 +166,11 @@ def stop_job(self, job_wrapper): # metadata internal or job not complete yet pid = job.get_job_runner_external_id() if pid in [None, ""]: - log.warning(f"stop_job(): {job.id}: no PID in database for job, unable to stop") + log.warning("stop_job(): %s: no PID in database for job, unable to stop", job.id) return pid = int(pid) if not check_pg(pid): - log.warning("stop_job(): %s: Process group %d was already dead or can't be signaled" % (job.id, pid)) + log.warning("stop_job(): %s: Process group %d was already dead or can't be signaled", job.id, pid) return log.debug("stop_job(): %s: Terminating process group %d", job.id, pid) kill_pg(pid) diff --git a/lib/galaxy/jobs/runners/pbs.py b/lib/galaxy/jobs/runners/pbs.py index fa22c11b1a70..d4a0f3d3c8e7 100644 --- a/lib/galaxy/jobs/runners/pbs.py +++ b/lib/galaxy/jobs/runners/pbs.py @@ -113,7 +113,7 @@ def __init__(self, app, nworkers): def default_pbs_server(self): if self.__default_pbs_server is None: self.__default_pbs_server = pbs.pbs_default() - log.debug(f"Set default PBS server to {self.default_pbs_server}") + log.debug("Set default PBS server to %s", self.default_pbs_server) return self.__default_pbs_server def url_to_destination(self, url): @@ -151,7 +151,7 @@ def url_to_destination(self, url): param, value = opt.split(None, 1) params[param] = value - log.debug(f"Converted URL '{url}' to destination runner=pbs, params={params}") + log.debug("Converted URL '%s' to destination runner=pbs, params=%s", url, params) # Create a dynamic JobDestination return 
JobDestination(runner="pbs", params=params) @@ -171,7 +171,7 @@ def parse_destination_params(self, params): arg = arg.lstrip("-") args[arg] = value except Exception: - log.warning(f"Unrecognized long argument in destination params: {arg}") + log.warning("Unrecognized long argument in destination params: %s", arg) return self.__args_to_attrs(args) # Internal stuff @@ -234,7 +234,7 @@ def queue_job(self, job_wrapper): if c <= 0: errno, text = pbs.error() job_wrapper.fail("Unable to queue job for execution. Resubmitting the job may succeed.") - log.error(f"Connection to PBS server for submit failed: {errno}: {text}") + log.error("Connection to PBS server for submit failed: %s: %s", errno, text) return # define job attributes @@ -297,7 +297,7 @@ def queue_job(self, job_wrapper): self.write_executable_script(job_file, script, job_io=job_wrapper.job_io) # job was deleted while we were preparing it if job_wrapper.get_state() in (model.Job.states.DELETED, model.Job.states.STOPPED): - log.debug(f"Job {job_wrapper.job_id} deleted/stopped by user before it entered the PBS queue") + log.debug("Job %s deleted/stopped by user before it entered the PBS queue", job_wrapper.job_id) pbs.pbs_disconnect(c) if job_wrapper.cleanup_job in ("always", "onsuccess"): self.cleanup((ofile, efile, ecfile, job_file)) @@ -308,7 +308,7 @@ def queue_job(self, job_wrapper): # The job tag includes the job and the task identifier # (if a TaskWrapper was passed in): galaxy_job_id = job_wrapper.get_id_tag() - log.debug(f"({galaxy_job_id}) submitting file {job_file}") + log.debug("(%s) submitting file %s", galaxy_job_id, job_file) tries = 0 while tries < 5: @@ -318,17 +318,17 @@ def queue_job(self, job_wrapper): pbs.pbs_disconnect(c) break errno, text = pbs.error() - log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text)) + log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s", galaxy_job_id, tries, errno, text) time.sleep(2) else: - log.error(f"({galaxy_job_id}) All attempts to submit job failed") + log.error("(%s) All attempts to submit job failed", galaxy_job_id) job_wrapper.fail("Unable to run this job due to a cluster error, please retry it later") return if pbs_queue_name is None: - log.debug(f"({galaxy_job_id}) queued in default queue as {job_id}") + log.debug("(%s) queued in default queue as %s", galaxy_job_id, job_id) else: - log.debug(f"({galaxy_job_id}) queued in {pbs_queue_name} queue as {job_id}") + log.debug("(%s) queued in %s queue as %s", galaxy_job_id, pbs_queue_name, job_id) # persist destination job_wrapper.set_job_destination(job_destination, job_id) @@ -363,7 +363,7 @@ def check_watched_items(self): old_state = pbs_job_state.old_state pbs_server_name = self.__get_pbs_server(pbs_job_state.job_destination.params) if pbs_server_name in failures: - log.debug(f"({galaxy_job_id}/{job_id}) Skipping state check because PBS server connection failed") + log.debug("(%s/%s) Skipping state check because PBS server connection failed", galaxy_job_id, job_id) new_watched.append(pbs_job_state) continue try: @@ -374,25 +374,21 @@ def check_watched_items(self): try: # Recheck to make sure it wasn't a communication problem self.check_single_job(pbs_server_name, job_id) - log.warning( - f"({galaxy_job_id}/{job_id}) PBS job was not in state check list, but was found with individual state check" - ) + log.warning("(%s/%s) PBS job was not in state check list, but was found with individual state check", galaxy_job_id, job_id) new_watched.append(pbs_job_state) except 
Exception: errno, text = pbs.error() if errno == 15001: # 15001 == job not in queue - log.debug(f"({galaxy_job_id}/{job_id}) PBS job has left queue") + log.debug("(%s/%s) PBS job has left queue", galaxy_job_id, job_id) self.work_queue.put((self.finish_job, pbs_job_state)) else: # Unhandled error, continue to monitor - log.info( - "(%s/%s) PBS state check resulted in error (%d): %s" % (galaxy_job_id, job_id, errno, text) - ) + log.info("(%s/%s) PBS state check resulted in error (%d): %s", galaxy_job_id, job_id, errno, text) new_watched.append(pbs_job_state) continue if status.job_state != old_state: - log.debug(f"({galaxy_job_id}/{job_id}) PBS job state changed from {old_state} to {status.job_state}") + log.debug("(%s/%s) PBS job state changed from %s to %s", galaxy_job_id, job_id, old_state, status.job_state) if status.job_state == "R" and not pbs_job_state.running: pbs_job_state.running = True pbs_job_state.job_wrapper.change_state(model.Job.states.RUNNING) @@ -410,18 +406,18 @@ def check_watched_items(self): int(status.exit_status) == 0 or pbs_job_state.job_wrapper.get_state() == model.Job.states.STOPPED ) - log.debug(f"({galaxy_job_id}/{job_id}) PBS job has completed successfully") + log.debug("(%s/%s) PBS job has completed successfully", galaxy_job_id, job_id) except AssertionError: exit_status = int(status.exit_status) error_message = JOB_EXIT_STATUS.get(exit_status, f"Unknown error: {status.exit_status}") pbs_job_state.fail_message = CLUSTER_ERROR_MESSAGE % error_message - log.error(f"({galaxy_job_id}/{job_id}) PBS job failed: {error_message}") + log.error("(%s/%s) PBS job failed: %s", galaxy_job_id, job_id, error_message) pbs_job_state.stop_job = False self.work_queue.put((self.fail_job, pbs_job_state)) continue except AttributeError: # No exit_status, can't verify proper completion so we just have to assume success. 
- log.debug(f"({galaxy_job_id}/{job_id}) PBS job has completed") + log.debug("(%s/%s) PBS job has completed", galaxy_job_id, job_id) self.work_queue.put((self.finish_job, pbs_job_state)) continue pbs_job_state.old_state = status.job_state @@ -446,7 +442,7 @@ def check_all_jobs(self): for pbs_server_name in servers: c = pbs.pbs_connect(util.smart_str(pbs_server_name)) if c <= 0: - log.debug(f"connection to PBS server {pbs_server_name} for state check failed") + log.debug("connection to PBS server %s for state check failed", pbs_server_name) failures.append(pbs_server_name) continue stat_attrl = pbs.new_attrl(3) @@ -479,7 +475,7 @@ def check_single_job(self, pbs_server_name, job_id): """ c = pbs.pbs_connect(util.smart_str(pbs_server_name)) if c <= 0: - log.debug(f"connection to PBS server {pbs_server_name} for state check failed") + log.debug("connection to PBS server %s for state check failed", pbs_server_name) return None stat_attrl = pbs.new_attrl(1) stat_attrl[0].name = pbs.ATTR_state @@ -507,7 +503,7 @@ def stop_job(self, job_wrapper): job = job_wrapper.get_job() job_id = job.get_job_runner_external_id().encode("utf-8") job_tag = f"({job.get_id_tag()}/{job_id})" - log.debug(f"{job_tag} Stopping PBS job") + log.debug("%s Stopping PBS job", job_tag) # Declare the connection handle c so that it can be cleaned up: c = None @@ -515,17 +511,17 @@ def stop_job(self, job_wrapper): try: pbs_server_name = self.__get_pbs_server(job.destination_params) if pbs_server_name is None: - log.debug("(%s) Job queued but no destination stored in job params, cannot delete" % job_tag) + log.debug("(%s) Job queued but no destination stored in job params, cannot delete", job_tag) return c = pbs.pbs_connect(util.smart_str(pbs_server_name)) if c <= 0: - log.debug(f"({job_tag}) Connection to PBS server for job delete failed") + log.debug("(%s) Connection to PBS server for job delete failed", job_tag) return pbs.pbs_deljob(c, job_id, "") - log.debug(f"{job_tag} Removed from PBS queue before job completion") + log.debug("%s Removed from PBS queue before job completion", job_tag) except Exception: e = traceback.format_exc() - log.debug(f"{job_tag} Unable to stop job: {e}") + log.debug("%s Unable to stop job: %s", job_tag, e) finally: # Cleanup: disconnect from the server. 
if None is not c: @@ -546,16 +542,12 @@ def recover(self, job, job_wrapper): pbs_job_state.runner_url = job_wrapper.get_job_runner_url() job_wrapper.command_line = job.command_line if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in {job.state} state, adding to the PBS queue" - ) + log.debug("(%s/%s) is still in %s state, adding to the PBS queue", job.id, job.get_job_runner_external_id(), job.state) pbs_job_state.old_state = "R" pbs_job_state.running = True self.monitor_queue.put(pbs_job_state) elif job.state == model.Job.states.QUEUED: - log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in PBS queued state, adding to the PBS queue" - ) + log.debug("(%s/%s) is still in PBS queued state, adding to the PBS queue", job.id, job.get_job_runner_external_id()) pbs_job_state.old_state = "Q" pbs_job_state.running = False self.monitor_queue.put(pbs_job_state) diff --git a/lib/galaxy/jobs/runners/pulsar.py b/lib/galaxy/jobs/runners/pulsar.py index 2bba3504024a..5a574d4c3fc7 100644 --- a/lib/galaxy/jobs/runners/pulsar.py +++ b/lib/galaxy/jobs/runners/pulsar.py @@ -255,7 +255,7 @@ def __init_pulsar_app(self, conf, pulsar_conf_path): if pulsar_conf_path is None: log.info("Creating a Pulsar app with default configuration (no pulsar_conf specified).") else: - log.info(f"Loading Pulsar app configuration from {pulsar_conf_path}") + log.info("Loading Pulsar app configuration from %s", pulsar_conf_path) with open(pulsar_conf_path) as f: conf.update(yaml.safe_load(f) or {}) if "job_metrics_config_file" not in conf: @@ -403,7 +403,7 @@ def queue_job(self, job_wrapper): config_files = job_wrapper.extra_filenames tool_script = os.path.join(job_wrapper.working_directory, "tool_script.sh") if os.path.exists(tool_script): - log.debug(f"Registering tool_script for Pulsar transfer [{tool_script}]") + log.debug("Registering tool_script for Pulsar transfer [%s]", tool_script) job_directory_files.append(tool_script) config_files.append(tool_script) # Following is job destination environment variables @@ -437,7 +437,7 @@ def queue_job(self, job_wrapper): tool_directory_required_files=tool_directory_required_files, ) external_job_id = pulsar_submit_job(client, client_job_description, remote_job_config) - log.info(f"Pulsar job submitted with job_id {external_job_id}") + log.info("Pulsar job submitted with job_id %s", external_job_id) job = job_wrapper.get_job() # Set the job destination here (unlike other runners) because there are likely additional job destination # params from the Pulsar client. 
@@ -567,7 +567,7 @@ def _populate_parameter_defaults(self, job_destination): for key, value in self.destination_defaults.items(): if key in params: if value is PARAMETER_SPECIFICATION_IGNORED: - log.warning(f"Pulsar runner in selected configuration ignores parameter {key}") + log.warning("Pulsar runner in selected configuration ignores parameter %s", key) continue # if self.runner_params.get( key, None ): # # Let plugin define defaults for some parameters - @@ -711,12 +711,9 @@ def check_pid(self, pid): return True except OSError as e: if e.errno == errno.ESRCH: - log.debug("check_pid(): PID %d is dead" % pid) + log.debug("check_pid(): PID %d is dead", pid) else: - log.warning( - "check_pid(): Got errno %s when attempting to check PID %d: %s" - % (errno.errorcode[e.errno], pid, e.strerror) - ) + log.warning("check_pid(): Got errno %s when attempting to check PID %d: %s", errno.errorcode[e.errno], pid, e.strerror) return False def stop_job(self, job_wrapper): @@ -731,32 +728,29 @@ def stop_job(self, job_wrapper): 0 ].job_runner_external_pid # every JobExternalOutputMetadata has a pid set, we just need to take from one of them if pid in [None, ""]: - log.warning(f"stop_job(): {job.id}: no PID in database for job, unable to stop") + log.warning("stop_job(): %s: no PID in database for job, unable to stop", job.id) return pid = int(pid) if not self.check_pid(pid): - log.warning("stop_job(): %s: PID %d was already dead or can't be signaled" % (job.id, pid)) + log.warning("stop_job(): %s: PID %d was already dead or can't be signaled", job.id, pid) return for sig in [15, 9]: try: os.killpg(pid, sig) except OSError as e: - log.warning( - "stop_job(): %s: Got errno %s when attempting to signal %d to PID %d: %s" - % (job.id, errno.errorcode[e.errno], sig, pid, e.strerror) - ) + log.warning("stop_job(): %s: Got errno %s when attempting to signal %d to PID %d: %s", job.id, errno.errorcode[e.errno], sig, pid, e.strerror) return # give up sleep(2) if not self.check_pid(pid): - log.debug("stop_job(): %s: PID %d successfully killed with signal %d" % (job.id, pid, sig)) + log.debug("stop_job(): %s: PID %d successfully killed with signal %d", job.id, pid, sig) return else: - log.warning("stop_job(): %s: PID %d refuses to die after signaling TERM/KILL" % (job.id, pid)) + log.warning("stop_job(): %s: PID %d refuses to die after signaling TERM/KILL", job.id, pid) else: # Remote kill pulsar_url = job.job_runner_name job_id = job.job_runner_external_id - log.debug(f"Attempt remote Pulsar kill of job with url {pulsar_url} and id {job_id}") + log.debug("Attempt remote Pulsar kill of job with url %s and id %s", pulsar_url, job_id) client = self.get_client(job.destination_params, job_id) client.kill() @@ -766,7 +760,7 @@ def recover(self, job, job_wrapper): job_wrapper.command_line = job.get_command_line() state = job.get_state() if state in [model.Job.states.RUNNING, model.Job.states.QUEUED, model.Job.states.STOPPED]: - log.debug(f"(Pulsar/{job.id}) is still in {state} state, adding to the Pulsar queue") + log.debug("(Pulsar/%s) is still in %s state, adding to the Pulsar queue", job.id, state) job_state.old_state = True job_state.running = state == model.Job.states.RUNNING self.monitor_queue.put(job_state) @@ -838,7 +832,7 @@ def check_job_config(remote_job_config, check_features=None): # 0.6.0 was newest Pulsar version that did not report it's version. 
pulsar_version = PulsarJobRunner.pulsar_version(remote_job_config) needed_version = Version("0.0.0") - log.info(f"pulsar_version is {pulsar_version}") + log.info("pulsar_version is %s", pulsar_version) for feature, needed in list(check_features.items()) + [("_default_", True)]: if not needed: continue @@ -1153,7 +1147,7 @@ def input_metadata_rewrite(self, dataset, metadata_val): metadata_val, client_input_path_type=CLIENT_INPUT_PATH_TYPES.INPUT_METADATA_PATH ) if remote_input_path: - log.info(f"input_metadata_rewrite is {remote_input_path} from {metadata_val}") + log.info("input_metadata_rewrite is %s from %s", remote_input_path, metadata_val) self.path_rewrites_input_metadata[metadata_val] = remote_input_path return remote_input_path diff --git a/lib/galaxy/jobs/runners/state_handlers/resubmit.py b/lib/galaxy/jobs/runners/state_handlers/resubmit.py index b6bea8e3f467..cd763a833c95 100644 --- a/lib/galaxy/jobs/runners/state_handlers/resubmit.py +++ b/lib/galaxy/jobs/runners/state_handlers/resubmit.py @@ -98,7 +98,7 @@ def _handle_resubmit_definitions(resubmit_definitions, app, job_runner, job_stat float(delay) new_destination.params["__resubmit_delay_seconds"] = str(delay) except ValueError: - log.warning(f"Cannot delay job with delay [{delay}], does not appear to be a number.") + log.warning("Cannot delay job with delay [%s], does not appear to be a number.", delay) job_state.job_wrapper.set_job_destination(new_destination) # Clear external ID (state change below flushes the change) job.job_runner_external_id = None diff --git a/lib/galaxy/jobs/runners/tasks.py b/lib/galaxy/jobs/runners/tasks.py index 70088bb08e8d..00598fa1c449 100644 --- a/lib/galaxy/jobs/runners/tasks.py +++ b/lib/galaxy/jobs/runners/tasks.py @@ -100,7 +100,7 @@ def queue_job(self, job_wrapper): task_state = tw.get_state() if model.Task.states.ERROR == task_state: job_exit_code = tw.get_exit_code() - log.debug("Canceling job %d: Task %s returned an error" % (tw.job_id, tw.task_id)) + log.debug("Canceling job %d: Task %s returned an error", tw.job_id, tw.task_id) self._cancel_job(job_wrapper, task_wrappers) tasks_complete = True break @@ -114,7 +114,7 @@ def queue_job(self, job_wrapper): if sleep_time < 8: sleep_time *= 2 job_wrapper.reclaim_ownership() # if running as the actual user, change ownership before merging. - log.debug(f"execution finished - beginning merge: {command_line}") + log.debug("execution finished - beginning merge: %s", command_line) stdout, stderr = splitter.do_merge(job_wrapper, task_wrappers) except Exception: job_wrapper.fail("failure running job", exception=True) @@ -141,7 +141,7 @@ def stop_job(self, job_wrapper): tasks = job.get_tasks() if len(tasks) > 0: for task in tasks: - log.debug(f"Killing task's job {task.id}") + log.debug("Killing task's job %s", task.id) self.app.job_manager.job_handler.dispatcher.stop(task) # There were no subtasks, so just kill the job. 
We'll touch @@ -157,7 +157,7 @@ def stop_job(self, job_wrapper): else: pid = job.job_runner_external_id if pid in [None, ""]: - log.warning(f"stop_job(): {job.id}: no PID in database for job, unable to stop") + log.warning("stop_job(): %s: no PID in database for job, unable to stop", job.id) return self._stop_pid(pid, job.id) @@ -196,9 +196,7 @@ def _cancel_job(self, job_wrapper, task_wrappers): task = task_wrapper.get_task() task_state = task.get_state() if model.Task.states.QUEUED == task_state: - log.debug( - "_cancel_job for job %d: Task %d is not running; setting state to DELETED" % (job.id, task.id) - ) + log.debug("_cancel_job for job %d: Task %d is not running; setting state to DELETED", job.id, task.id) task_wrapper.change_state(task.states.DELETED) # If a task failed, then the caller will have waited a few seconds # before recognizing the failure. In that time, a queued task could @@ -209,7 +207,7 @@ def _cancel_job(self, job_wrapper, task_wrappers): for task_wrapper in task_wrappers: if model.Task.states.RUNNING == task_wrapper.get_state(): task = task_wrapper.get_task() - log.debug("_cancel_job for job %d: Stopping running task %d" % (job.id, task.id)) + log.debug("_cancel_job for job %d: Stopping running task %d", job.id, task.id) job_wrapper.app.job_manager.job_handler.dispatcher.stop(task) def _check_pid(self, pid): @@ -224,9 +222,9 @@ def _stop_pid(self, pid, job_id): job's id (which is used for logging messages only right now). """ pid = int(pid) - log.debug(f"Stopping pid {pid}") + log.debug("Stopping pid %s", pid) if not self._check_pid(pid): - log.warning("_stop_pid(): %s: PID %d was already dead or can't be signaled" % (job_id, pid)) + log.warning("_stop_pid(): %s: PID %d was already dead or can't be signaled", job_id, pid) return for sig in [15, 9]: try: @@ -234,16 +232,13 @@ def _stop_pid(self, pid, job_id): except OSError as e: # This warning could be bogus; many tasks are stopped with # SIGTERM (signal 15), but ymmv depending on the platform. - log.warning( - "_stop_pid(): %s: Got errno %s when attempting to signal %d to PID %d: %s" - % (job_id, errno.errorcode[e.errno], sig, pid, e.strerror) - ) + log.warning("_stop_pid(): %s: Got errno %s when attempting to signal %d to PID %d: %s", job_id, errno.errorcode[e.errno], sig, pid, e.strerror) return # TODO: If we're stopping lots of tasks, then we will want to put this # avoid a two-second overhead using some other asynchronous method. sleep(2) if not self._check_pid(pid): - log.debug("_stop_pid(): %s: PID %d successfully killed with signal %d" % (job_id, pid, sig)) + log.debug("_stop_pid(): %s: PID %d successfully killed with signal %d", job_id, pid, sig) return else: - log.warning("_stop_pid(): %s: PID %d refuses to die after signaling TERM/KILL" % (job_id, pid)) + log.warning("_stop_pid(): %s: PID %d refuses to die after signaling TERM/KILL", job_id, pid) diff --git a/lib/galaxy/jobs/runners/univa.py b/lib/galaxy/jobs/runners/univa.py index c0d2ecb730c3..870a0265a74e 100644 --- a/lib/galaxy/jobs/runners/univa.py +++ b/lib/galaxy/jobs/runners/univa.py @@ -96,7 +96,7 @@ def _complete_terminal_job(self, ajs, drmaa_state, **kwargs): ajs.fail_message = "This job failed because it was cancelled." 
drmaa_state = self.drmaa.JobState.FAILED elif ("signal" in extinfo and extinfo["signal"] == "SIGKILL") and time_wasted > time_granted: - log.error(f"({ajs.job_wrapper.get_id_tag()}/{ajs.job_id}) Job hit walltime") + log.error("(%s/%s) Job hit walltime", ajs.job_wrapper.get_id_tag(), ajs.job_id) ajs.fail_message = ( "This job was terminated because it ran longer than the maximum allowed job run time." ) @@ -104,9 +104,7 @@ def _complete_terminal_job(self, ajs, drmaa_state, **kwargs): drmaa_state = self.drmaa.JobState.FAILED # test wasted>granted memory only if failed != 0 and exit_status != 0, ie if marked as failed elif state == self.drmaa.JobState.FAILED and mem_wasted > mem_granted * slots: - log.error( - f"({ajs.job_wrapper.get_id_tag()}/{ajs.job_id}) Job hit memory limit ({mem_wasted}>{mem_granted})" - ) + log.error("(%s/%s) Job hit memory limit (%s>%s)", ajs.job_wrapper.get_id_tag(), ajs.job_id, mem_wasted, mem_granted) ajs.fail_message = "This job was terminated because it used more than the maximum allowed memory." ajs.runner_state = ajs.runner_states.MEMORY_LIMIT_REACHED drmaa_state = self.drmaa_job_states.FAILED @@ -119,16 +117,14 @@ def _complete_terminal_job(self, ajs, drmaa_state, **kwargs): self.drmaa.JobState.SYSTEM_SUSPENDED, self.drmaa.JobState.USER_SUSPENDED, ]: - log.warning( - f"({ajs.job_wrapper.get_id_tag()}/{ajs.job_id}) Job is {self.drmaa_job_state_strings[state]}, returning to monitor queue" - ) + log.warning("(%s/%s) Job is %s, returning to monitor queue", ajs.job_wrapper.get_id_tag(), ajs.job_id, self.drmaa_job_state_strings[state]) # TODO return True? return True # job was not actually terminal elif state == self.drmaa.JobState.UNDETERMINED: - log.warning(f"({ajs.job_wrapper.get_id_tag()}/{ajs.job_id}) Job state could not be determined") + log.warning("(%s/%s) Job state could not be determined", ajs.job_wrapper.get_id_tag(), ajs.job_id) drmaa_state = self.drmaa_job_states.FAILED else: - log.error(f"DRMAAUniva: job {ajs.job_id} determined unknown state {state}") + log.error("DRMAAUniva: job %s determined unknown state %s", ajs.job_id, state) drmaa_state = self.drmaa_job_states.FAILED # by default, finish the job with the state from drmaa return super()._complete_terminal_job(ajs, drmaa_state=drmaa_state) @@ -275,7 +271,7 @@ def _get_drmaa_state_qacct(self, job_id, extinfo): # "NONE". If qdel was called multiple times, every invocation is recorded in a comma # separated list. 
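
A note on placeholder choice in this series: %s is the forgiving default because it goes through str() and accepts any value, while %d (kept where the original code already used it, mostly for PIDs and job numbers) insists on a number. If the types ever drift, the mismatch does not raise at the call site; by default logging reports the problem on stderr and the runner keeps going. A short sketch, independent of the Galaxy code:

import logging

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger("placeholder-demo")

pid = 4242
log.debug("Stopping pid %s", pid)  # %s stringifies anything
log.debug("Stopping pid %d", pid)  # %d is fine for an int PID

# If a PID read back from the database arrived as the string "4242", the %d
# template could no longer be rendered; logging prints an error report to
# stderr instead of raising, so the runner survives but that message is lost.
log.debug("Stopping pid %d", "4242")
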
if "deleted_by" in qacct and qacct["deleted_by"] != "NONE": - log.info(f"DRMAAUniva: job {job_id} was aborted by {qacct['deleted_by']}") + log.info("DRMAAUniva: job %s was aborted by %s", job_id, qacct['deleted_by']) extinfo["deleted"] = True return self.drmaa.JobState.FAILED @@ -291,14 +287,14 @@ def _get_drmaa_state_qacct(self, job_id, extinfo): if "exit_status" in qacct: qacct["exit_status"] = int(qacct["exit_status"]) if qacct["exit_status"] < 1: - log.error(f"DRMAAUniva: job {job_id} has exit status {qacct['exit_status']}") + log.error("DRMAAUniva: job %s has exit status %s", job_id, qacct['exit_status']) state = self.drmaa.JobState.DONE elif 0 < qacct["exit_status"] < 129: - log.error(f"DRMAAUniva: job {job_id} has exit status {qacct['exit_status']}") + log.error("DRMAAUniva: job %s has exit status %s", job_id, qacct['exit_status']) extinfo["exit_status"] = qacct["exit_status"] state = self.drmaa.JobState.FAILED else: - log.error(f"DRMAAUniva: job {job_id} was killed by signal {qacct['exit_status'] - 128}") + log.error("DRMAAUniva: job %s was killed by signal %s", job_id, qacct['exit_status'] - 128) state = self.drmaa.JobState.FAILED extinfo["signal"] = signals[qacct["exit_status"] - 128] @@ -318,7 +314,7 @@ def _get_drmaa_state_qacct(self, job_id, extinfo): elif code in [24, 25]: state = self.drmaa.JobState.RUNNING else: - log.error(f"DRMAAUniva: job {job_id} failed with failure {qacct['failed']}") + log.error("DRMAAUniva: job %s failed with failure %s", job_id, qacct['failed']) state = self.drmaa.JobState.FAILED # log.debug("UnivaJobRunner._get_drmaa_state_qacct ({jobid}) -> {state}".format(jobid=job_id, state=self.drmaa_job_state_strings[state])) return state @@ -419,7 +415,7 @@ def _get_drmaa_state_wait(self, job_id, ds, extinfo): extinfo["slots"] = float(rv.resourceUsage["slots"]) # log.debug("wait -> \texitStatus {0}\thasCoreDump {1}\thasExited {2}\thasSignal {3}\tjobId {4}\t\tterminatedSignal {5}\twasAborted {6}\tresourceUsage {7}".format(rv.exitStatus, rv.hasCoreDump, rv.hasExited, rv.hasSignal, rv.jobId, rv.terminatedSignal, rv.wasAborted, rv.resourceUsage)) if rv.wasAborted: - log.error(f"DRMAAUniva: job {job_id} was aborted according to wait()") + log.error("DRMAAUniva: job %s was aborted according to wait()", job_id) extinfo["deleted"] = True return self.drmaa.JobState.FAILED @@ -427,19 +423,19 @@ def _get_drmaa_state_wait(self, job_id, ds, extinfo): # but also violation of scheduler constraints state = self.drmaa.JobState.DONE if rv.exitStatus != 0: - log.error(f"DRMAAUniva: job {job_id} has exit status {rv.exitStatus}") + log.error("DRMAAUniva: job %s has exit status %s", job_id, rv.exitStatus) extinfo["state"] = self.drmaa.JobState.FAILED if not rv.hasExited or rv.hasSignal: if rv.hasCoreDump != 0: - log.error(f"DRMAAUniva: job {job_id} has core dump") + log.error("DRMAAUniva: job %s has core dump", job_id) extinfo["state"] = self.drmaa.JobState.FAILED elif len(rv.terminatedSignal) > 0: - log.error(f"DRMAAUniva: job {job_id} was kill by signal {rv.terminatedSignal}") + log.error("DRMAAUniva: job %s was kill by signal %s", job_id, rv.terminatedSignal) state = self.drmaa.JobState.FAILED extinfo["signal"] = rv.terminatedSignal elif rv.wasAborted == 0: - log.error(f"DRMAAUniva: job {job_id} has finished in unclear condition") + log.error("DRMAAUniva: job %s has finished in unclear condition", job_id) state = self.drmaa.JobState.FAILED # log.debug("UnivaJobRunner._get_drmaa_state_wait ({jobid}) -> {state}".format(jobid=job_id, state=self.drmaa_job_state_strings[state])) 
return state @@ -561,7 +557,7 @@ def _map_qstat_drmaa_states(self, job_id, state, extinfo): elif "w" in state: return self.drmaa.JobState.QUEUED_ACTIVE else: - log.error(f"DRMAAUniva: job {job_id} unknown state from qstat: {state}") + log.error("DRMAAUniva: job %s unknown state from qstat: %s", job_id, state) return self.drmaa.JobState.UNDETERMINED @@ -595,12 +591,12 @@ def _parse_native_specs(job_id, native_spec): if m is not None: tme = _parse_time(m.group(1)) if tme is None: - log.error(f"DRMAAUniva: job {job_id} has unparsable time native spec {native_spec}") + log.error("DRMAAUniva: job %s has unparsable time native spec %s", job_id, native_spec) # parse memory m = re.search(r"mem=([\d.]+[KGMT]?)[\s,]*", native_spec) if m is not None: mem = size_to_bytes(m.group(1)) # mem = _parse_mem(m.group(1)) if mem is None: - log.error(f"DRMAAUniva: job {job_id} has unparsable memory native spec {native_spec}") + log.error("DRMAAUniva: job %s has unparsable memory native spec %s", job_id, native_spec) return tme, mem diff --git a/lib/galaxy/jobs/runners/util/cli/job/lsf.py b/lib/galaxy/jobs/runners/util/cli/job/lsf.py index cdaedf447a42..272e198c0451 100644 --- a/lib/galaxy/jobs/runners/util/cli/job/lsf.py +++ b/lib/galaxy/jobs/runners/util/cli/job/lsf.py @@ -36,7 +36,7 @@ def job_script_kwargs(self, ofile, efile, job_name): k = argmap[k] scriptargs[k] = v except Exception: - log.warning(f"Unrecognized long argument passed to LSF CLI plugin: {k}") + log.warning("Unrecognized long argument passed to LSF CLI plugin: %s", k) # Generated template. template_scriptargs = "" @@ -81,7 +81,7 @@ def parse_single_status(self, status, job_id): # which would be badly handled here. So this only works well when Galaxy # is constantly monitoring the jobs. The logic here is that DONE jobs get forgotten # faster than failed jobs. - log.warning(f"Job id '{job_id}' not found LSF status check") + log.warning("Job id '%s' not found LSF status check", job_id) return job_states.OK return self._get_job_state(status) diff --git a/lib/galaxy/jobs/runners/util/cli/job/pbs.py b/lib/galaxy/jobs/runners/util/cli/job/pbs.py index 5e2cc06d8a8a..e4851db28130 100644 --- a/lib/galaxy/jobs/runners/util/cli/job/pbs.py +++ b/lib/galaxy/jobs/runners/util/cli/job/pbs.py @@ -19,7 +19,7 @@ def parse_status(self, status, job_ids): try: data = json.loads(status) except Exception: - log.warning(f"No valid qstat JSON return from `qstat -f -F json`, got the following: {status}") + log.warning("No valid qstat JSON return from `qstat -f -F json`, got the following: %s", status) rval = {} for job_id, job in data.get("Jobs", {}).items(): if job_id in job_ids: diff --git a/lib/galaxy/jobs/runners/util/cli/job/slurm.py b/lib/galaxy/jobs/runners/util/cli/job/slurm.py index 5c1626d7315f..996a680d5183 100644 --- a/lib/galaxy/jobs/runners/util/cli/job/slurm.py +++ b/lib/galaxy/jobs/runners/util/cli/job/slurm.py @@ -25,7 +25,7 @@ def job_script_kwargs(self, ofile, efile, job_name): k = argmap[k] scriptargs[k] = v except Exception: - log.warning(f"Unrecognized long argument passed to Slurm CLI plugin: {k}") + log.warning("Unrecognized long argument passed to Slurm CLI plugin: %s", k) # Generated template. 
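
_parse_native_specs above recovers the granted runtime and memory from the job's native specification with regular expressions (the mem=([\d.]+[KGMT]?) pattern is visible in the hunk). A stripped-down version of the same idea follows; the h_rt=HH:MM:SS time format and the unit table are assumptions made for the example, while the real code delegates to Galaxy's size_to_bytes and its own time parser:

import re

_UNITS = {"": 1, "K": 1024, "M": 1024**2, "G": 1024**3, "T": 1024**4}


def parse_native_spec(native_spec):
    """Return (runtime_seconds, memory_bytes); either is None when absent."""
    runtime = memory = None

    # Illustrative h_rt=HH:MM:SS runtime request.
    m = re.search(r"h_rt=(\d+):(\d+):(\d+)", native_spec)
    if m:
        hours, minutes, seconds = (int(g) for g in m.groups())
        runtime = hours * 3600 + minutes * 60 + seconds

    # Memory request, e.g. mem=4G, following the pattern in the hunk above.
    m = re.search(r"mem=([\d.]+)([KGMT]?)", native_spec)
    if m:
        memory = int(float(m.group(1)) * _UNITS[m.group(2)])

    return runtime, memory


print(parse_native_spec("-q galaxy.q -l h_rt=24:00:00,mem=4G"))
# (86400, 4294967296)
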
template_scriptargs = "" diff --git a/lib/galaxy/jobs/runners/util/cli/job/torque.py b/lib/galaxy/jobs/runners/util/cli/job/torque.py index 56754b142b58..d48fefd8f0b3 100644 --- a/lib/galaxy/jobs/runners/util/cli/job/torque.py +++ b/lib/galaxy/jobs/runners/util/cli/job/torque.py @@ -76,7 +76,7 @@ def parse_status(self, status, job_ids): except Exception: tree = None if tree is None: - log.warning(f"No valid qstat XML return from `qstat -x`, got the following: {status}") + log.warning("No valid qstat XML return from `qstat -x`, got the following: %s", status) return None else: for job in tree.findall("Job"): diff --git a/lib/galaxy/jobs/runners/util/pykube_util.py b/lib/galaxy/jobs/runners/util/pykube_util.py index 7c3f32d87b09..45cb26db3473 100644 --- a/lib/galaxy/jobs/runners/util/pykube_util.py +++ b/lib/galaxy/jobs/runners/util/pykube_util.py @@ -295,7 +295,7 @@ def galaxy_instance_id(params): if re.match(r"(?!-)[a-z\d-]{1,20}(? Date: Fri, 20 Oct 2023 12:32:02 -0400 Subject: [PATCH 3/5] Removed extraneous back slash character --- lib/galaxy/jobs/runners/pulsar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/jobs/runners/pulsar.py b/lib/galaxy/jobs/runners/pulsar.py index dcbd08e45d2f..5a574d4c3fc7 100644 --- a/lib/galaxy/jobs/runners/pulsar.py +++ b/lib/galaxy/jobs/runners/pulsar.py @@ -738,7 +738,7 @@ def stop_job(self, job_wrapper): try: os.killpg(pid, sig) except OSError as e: -\ log.warning("stop_job(): %s: Got errno %s when attempting to signal %d to PID %d: %s", job.id, errno.errorcode[e.errno], sig, pid, e.strerror) + log.warning("stop_job(): %s: Got errno %s when attempting to signal %d to PID %d: %s", job.id, errno.errorcode[e.errno], sig, pid, e.strerror) return # give up sleep(2) if not self.check_pid(pid): From a5af51bec2bb85a6e64de87eda4e97ee9395487c Mon Sep 17 00:00:00 2001 From: Keith Suderman Date: Fri, 20 Oct 2023 14:06:34 -0400 Subject: [PATCH 4/5] Format changes --- lib/galaxy/jobs/runners/cli.py | 32 ++++++++++++++++++++++----- lib/galaxy/jobs/runners/condor.py | 11 +++++++-- lib/galaxy/jobs/runners/drmaa.py | 19 ++++++++++++---- lib/galaxy/jobs/runners/godocker.py | 15 ++++++++++--- lib/galaxy/jobs/runners/kubernetes.py | 13 +++++++++-- lib/galaxy/jobs/runners/pbs.py | 27 +++++++++++++++++----- lib/galaxy/jobs/runners/pulsar.py | 16 ++++++++++++-- lib/galaxy/jobs/runners/tasks.py | 9 +++++++- lib/galaxy/jobs/runners/univa.py | 25 +++++++++++++++------ lib/galaxy/jobs/runners/util/sudo.py | 2 +- 10 files changed, 137 insertions(+), 32 deletions(-) diff --git a/lib/galaxy/jobs/runners/cli.py b/lib/galaxy/jobs/runners/cli.py index 71fccd89bfa7..5882d3d4ece1 100644 --- a/lib/galaxy/jobs/runners/cli.py +++ b/lib/galaxy/jobs/runners/cli.py @@ -136,7 +136,11 @@ def submit(self, shell, job_interface, job_file, galaxy_id_tag, retry=MAX_SUBMIT if returncode == 0: stdout = cmd_out.stdout if not stdout or not stdout.strip(): - log.warning("(%s) Execute returned a 0 exit code but no external identifier will be recovered from empty stdout - stderr is %s", galaxy_id_tag, cmd_out.stderr) + log.warning( + "(%s) Execute returned a 0 exit code but no external identifier will be recovered from empty stdout - stderr is %s", + galaxy_id_tag, + cmd_out.stderr, + ) return returncode, stdout stdout = f"({galaxy_id_tag}) submission failed (stdout): {cmd_out.stdout}" stderr = f"({galaxy_id_tag}) submission failed (stderr): {cmd_out.stderr}" @@ -174,7 +178,11 @@ def check_watched_items(self): cmd_out = 
shell.execute(job_interface.get_single_status(external_job_id)) state = job_interface.parse_single_status(cmd_out.stdout, external_job_id) if not state == model.Job.states.OK: - log.warning("(%s/%s) job not found in batch state check, but found in individual state check", id_tag, external_job_id) + log.warning( + "(%s/%s) job not found in batch state check, but found in individual state check", + id_tag, + external_job_id, + ) job_state = ajs.job_wrapper.get_state() if state != old_state: log.debug("(%s/%s) state change: from %s to %s", id_tag, external_job_id, old_state, state) @@ -251,7 +259,12 @@ def stop_job(self, job_wrapper): assert cmd_out.returncode == 0, cmd_out.stderr log.debug("(%s/%s) Terminated at user's request", job.id, job.job_runner_external_id) except Exception as e: - log.debug("(%s/%s) User killed running job, but error encountered during termination: %s", job.id, job.job_runner_external_id, e) + log.debug( + "(%s/%s) User killed running job, but error encountered during termination: %s", + job.id, + job.job_runner_external_id, + e, + ) def recover(self, job, job_wrapper): """Recovers jobs stuck in the queued/running state when Galaxy started""" @@ -267,12 +280,21 @@ def recover(self, job, job_wrapper): ) ajs.command_line = job.command_line if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug("(%s/%s) is still in %s state, adding to the runner monitor queue", job.id, job.job_runner_external_id, job.state) + log.debug( + "(%s/%s) is still in %s state, adding to the runner monitor queue", + job.id, + job.job_runner_external_id, + job.state, + ) ajs.old_state = model.Job.states.RUNNING ajs.running = True self.monitor_queue.put(ajs) elif job.state == model.Job.states.QUEUED: - log.debug("(%s/%s) is still in queued state, adding to the runner monitor queue", job.id, job.job_runner_external_id) + log.debug( + "(%s/%s) is still in queued state, adding to the runner monitor queue", + job.id, + job.job_runner_external_id, + ) ajs.old_state = model.Job.states.QUEUED ajs.running = False self.monitor_queue.put(ajs) diff --git a/lib/galaxy/jobs/runners/condor.py b/lib/galaxy/jobs/runners/condor.py index 4c20334ba7f0..e148c2dae584 100644 --- a/lib/galaxy/jobs/runners/condor.py +++ b/lib/galaxy/jobs/runners/condor.py @@ -276,11 +276,18 @@ def recover(self, job, job_wrapper): cjs.user_log = os.path.join(job_wrapper.working_directory, f"galaxy_{galaxy_id_tag}.condor.log") cjs.register_cleanup_file_attribute("user_log") if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug("(%s/%s) is still in %s state, adding to the DRM queue", job.id, job.get_job_runner_external_id(), job.state) + log.debug( + "(%s/%s) is still in %s state, adding to the DRM queue", + job.id, + job.get_job_runner_external_id(), + job.state, + ) cjs.running = True self.monitor_queue.put(cjs) elif job.state == model.Job.states.QUEUED: - log.debug("(%s/%s) is still in DRM queued state, adding to the DRM queue", job.id, job.job_runner_external_id) + log.debug( + "(%s/%s) is still in DRM queued state, adding to the DRM queue", job.id, job.job_runner_external_id + ) cjs.running = False self.monitor_queue.put(cjs) diff --git a/lib/galaxy/jobs/runners/drmaa.py b/lib/galaxy/jobs/runners/drmaa.py index 8cbe1ecffba8..c034bcdd9991 100644 --- a/lib/galaxy/jobs/runners/drmaa.py +++ b/lib/galaxy/jobs/runners/drmaa.py @@ -347,7 +347,9 @@ def check_watched_items(self): if state is None: continue if state != old_state: - log.debug("(%s/%s) state change: %s", galaxy_id_tag, 
external_job_id, self.drmaa_job_state_strings[state]) + log.debug( + "(%s/%s) state change: %s", galaxy_id_tag, external_job_id, self.drmaa_job_state_strings[state] + ) if state == drmaa.JobState.RUNNING and not ajs.running: ajs.running = True ajs.job_wrapper.change_state(model.Job.states.RUNNING) @@ -403,12 +405,21 @@ def recover(self, job, job_wrapper): ) ajs.command_line = job.get_command_line() if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug("(%s/%s) is still in %s state, adding to the DRM queue", job.id, job.get_job_runner_external_id(), job.state) + log.debug( + "(%s/%s) is still in %s state, adding to the DRM queue", + job.id, + job.get_job_runner_external_id(), + job.state, + ) ajs.old_state = drmaa.JobState.RUNNING ajs.running = True self.monitor_queue.put(ajs) elif job.get_state() == model.Job.states.QUEUED: - log.debug("(%s/%s) is still in DRM queued state, adding to the DRM queue", job.id, job.get_job_runner_external_id()) + log.debug( + "(%s/%s) is still in DRM queued state, adding to the DRM queue", + job.id, + job.get_job_runner_external_id(), + ) ajs.old_state = drmaa.JobState.QUEUED_ACTIVE ajs.running = False self.monitor_queue.put(ajs) @@ -428,7 +439,7 @@ def external_runjob(self, external_runjob_script, jobtemplate_filename, username """ cmd = shlex.split(external_runjob_script) cmd.extend([str(username), jobtemplate_filename]) - log.info("Running command: %s", ' '.join(cmd)) + log.info("Running command: %s", " ".join(cmd)) try: stdoutdata = commands.execute(cmd).strip() except commands.CommandLineException: diff --git a/lib/galaxy/jobs/runners/godocker.py b/lib/galaxy/jobs/runners/godocker.py index 2ddfce1b09df..aae7e906ef4f 100644 --- a/lib/galaxy/jobs/runners/godocker.py +++ b/lib/galaxy/jobs/runners/godocker.py @@ -186,7 +186,7 @@ def check_watched_item(self, job_state): # Get task from GoDocker job_persisted_state = job_state.job_wrapper.get_state() job_status_god = self.get_task(job_state.job_id) - log.debug("Job ID: %s Job Status: %s", str(job_state.job_id), str(job_status_god['status']['primary'])) + log.debug("Job ID: %s Job Status: %s", str(job_state.job_id), str(job_status_god["status"]["primary"])) if job_status_god["status"]["primary"] == "over" or job_persisted_state == model.Job.states.STOPPED: job_state.running = False @@ -260,13 +260,22 @@ def recover(self, job, job_wrapper): ) job_wrapper.command_line = job.command_line if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug("(%s/%s) is still in %s state, adding to the god queue", job.id, job.get_job_runner_external_id(), job.state) + log.debug( + "(%s/%s) is still in %s state, adding to the god queue", + job.id, + job.get_job_runner_external_id(), + job.state, + ) ajs.old_state = "R" ajs.running = True self.monitor_queue.put(ajs) elif job.state == model.Job.states.QUEUED: - log.debug("(%s/%s) is still in god queued state, adding to the god queue", job.id, job.get_job_runner_external_id()) + log.debug( + "(%s/%s) is still in god queued state, adding to the god queue", + job.id, + job.get_job_runner_external_id(), + ) ajs.old_state = "Q" ajs.running = False self.monitor_queue.put(ajs) diff --git a/lib/galaxy/jobs/runners/kubernetes.py b/lib/galaxy/jobs/runners/kubernetes.py index aff56724d9bd..319fd6c08afa 100644 --- a/lib/galaxy/jobs/runners/kubernetes.py +++ b/lib/galaxy/jobs/runners/kubernetes.py @@ -932,12 +932,21 @@ def recover(self, job, job_wrapper): ) ajs.command_line = job.command_line if job.state in (model.Job.states.RUNNING, 
model.Job.states.STOPPED): - log.debug("(%s/%s) is still in %s state, adding to the runner monitor queue", job.id, job.job_runner_external_id, job.state) + log.debug( + "(%s/%s) is still in %s state, adding to the runner monitor queue", + job.id, + job.job_runner_external_id, + job.state, + ) ajs.old_state = model.Job.states.RUNNING ajs.running = True self.monitor_queue.put(ajs) elif job.state == model.Job.states.QUEUED: - log.debug("(%s/%s) is still in queued state, adding to the runner monitor queue", job.id, job.job_runner_external_id) + log.debug( + "(%s/%s) is still in queued state, adding to the runner monitor queue", + job.id, + job.job_runner_external_id, + ) ajs.old_state = model.Job.states.QUEUED ajs.running = False self.monitor_queue.put(ajs) diff --git a/lib/galaxy/jobs/runners/pbs.py b/lib/galaxy/jobs/runners/pbs.py index d4a0f3d3c8e7..7c43b6f489ef 100644 --- a/lib/galaxy/jobs/runners/pbs.py +++ b/lib/galaxy/jobs/runners/pbs.py @@ -374,7 +374,11 @@ def check_watched_items(self): try: # Recheck to make sure it wasn't a communication problem self.check_single_job(pbs_server_name, job_id) - log.warning("(%s/%s) PBS job was not in state check list, but was found with individual state check", galaxy_job_id, job_id) + log.warning( + "(%s/%s) PBS job was not in state check list, but was found with individual state check", + galaxy_job_id, + job_id, + ) new_watched.append(pbs_job_state) except Exception: errno, text = pbs.error() @@ -384,11 +388,15 @@ def check_watched_items(self): self.work_queue.put((self.finish_job, pbs_job_state)) else: # Unhandled error, continue to monitor - log.info("(%s/%s) PBS state check resulted in error (%d): %s", galaxy_job_id, job_id, errno, text) + log.info( + "(%s/%s) PBS state check resulted in error (%d): %s", galaxy_job_id, job_id, errno, text + ) new_watched.append(pbs_job_state) continue if status.job_state != old_state: - log.debug("(%s/%s) PBS job state changed from %s to %s", galaxy_job_id, job_id, old_state, status.job_state) + log.debug( + "(%s/%s) PBS job state changed from %s to %s", galaxy_job_id, job_id, old_state, status.job_state + ) if status.job_state == "R" and not pbs_job_state.running: pbs_job_state.running = True pbs_job_state.job_wrapper.change_state(model.Job.states.RUNNING) @@ -542,12 +550,21 @@ def recover(self, job, job_wrapper): pbs_job_state.runner_url = job_wrapper.get_job_runner_url() job_wrapper.command_line = job.command_line if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): - log.debug("(%s/%s) is still in %s state, adding to the PBS queue", job.id, job.get_job_runner_external_id(), job.state) + log.debug( + "(%s/%s) is still in %s state, adding to the PBS queue", + job.id, + job.get_job_runner_external_id(), + job.state, + ) pbs_job_state.old_state = "R" pbs_job_state.running = True self.monitor_queue.put(pbs_job_state) elif job.state == model.Job.states.QUEUED: - log.debug("(%s/%s) is still in PBS queued state, adding to the PBS queue", job.id, job.get_job_runner_external_id()) + log.debug( + "(%s/%s) is still in PBS queued state, adding to the PBS queue", + job.id, + job.get_job_runner_external_id(), + ) pbs_job_state.old_state = "Q" pbs_job_state.running = False self.monitor_queue.put(pbs_job_state) diff --git a/lib/galaxy/jobs/runners/pulsar.py b/lib/galaxy/jobs/runners/pulsar.py index 5a574d4c3fc7..545e11e6b872 100644 --- a/lib/galaxy/jobs/runners/pulsar.py +++ b/lib/galaxy/jobs/runners/pulsar.py @@ -713,7 +713,12 @@ def check_pid(self, pid): if e.errno == errno.ESRCH: log.debug("check_pid(): 
PID %d is dead", pid) else: - log.warning("check_pid(): Got errno %s when attempting to check PID %d: %s", errno.errorcode[e.errno], pid, e.strerror) + log.warning( + "check_pid(): Got errno %s when attempting to check PID %d: %s", + errno.errorcode[e.errno], + pid, + e.strerror, + ) return False def stop_job(self, job_wrapper): @@ -738,7 +743,14 @@ def stop_job(self, job_wrapper): try: os.killpg(pid, sig) except OSError as e: - log.warning("stop_job(): %s: Got errno %s when attempting to signal %d to PID %d: %s", job.id, errno.errorcode[e.errno], sig, pid, e.strerror) + log.warning( + "stop_job(): %s: Got errno %s when attempting to signal %d to PID %d: %s", + job.id, + errno.errorcode[e.errno], + sig, + pid, + e.strerror, + ) return # give up sleep(2) if not self.check_pid(pid): diff --git a/lib/galaxy/jobs/runners/tasks.py b/lib/galaxy/jobs/runners/tasks.py index 00598fa1c449..73ca1cf158dc 100644 --- a/lib/galaxy/jobs/runners/tasks.py +++ b/lib/galaxy/jobs/runners/tasks.py @@ -232,7 +232,14 @@ def _stop_pid(self, pid, job_id): except OSError as e: # This warning could be bogus; many tasks are stopped with # SIGTERM (signal 15), but ymmv depending on the platform. - log.warning("_stop_pid(): %s: Got errno %s when attempting to signal %d to PID %d: %s", job_id, errno.errorcode[e.errno], sig, pid, e.strerror) + log.warning( + "_stop_pid(): %s: Got errno %s when attempting to signal %d to PID %d: %s", + job_id, + errno.errorcode[e.errno], + sig, + pid, + e.strerror, + ) return # TODO: If we're stopping lots of tasks, then we will want to put this # avoid a two-second overhead using some other asynchronous method. diff --git a/lib/galaxy/jobs/runners/univa.py b/lib/galaxy/jobs/runners/univa.py index 870a0265a74e..d2845ce5b97c 100644 --- a/lib/galaxy/jobs/runners/univa.py +++ b/lib/galaxy/jobs/runners/univa.py @@ -104,7 +104,13 @@ def _complete_terminal_job(self, ajs, drmaa_state, **kwargs): drmaa_state = self.drmaa.JobState.FAILED # test wasted>granted memory only if failed != 0 and exit_status != 0, ie if marked as failed elif state == self.drmaa.JobState.FAILED and mem_wasted > mem_granted * slots: - log.error("(%s/%s) Job hit memory limit (%s>%s)", ajs.job_wrapper.get_id_tag(), ajs.job_id, mem_wasted, mem_granted) + log.error( + "(%s/%s) Job hit memory limit (%s>%s)", + ajs.job_wrapper.get_id_tag(), + ajs.job_id, + mem_wasted, + mem_granted, + ) ajs.fail_message = "This job was terminated because it used more than the maximum allowed memory." ajs.runner_state = ajs.runner_states.MEMORY_LIMIT_REACHED drmaa_state = self.drmaa_job_states.FAILED @@ -117,7 +123,12 @@ def _complete_terminal_job(self, ajs, drmaa_state, **kwargs): self.drmaa.JobState.SYSTEM_SUSPENDED, self.drmaa.JobState.USER_SUSPENDED, ]: - log.warning("(%s/%s) Job is %s, returning to monitor queue", ajs.job_wrapper.get_id_tag(), ajs.job_id, self.drmaa_job_state_strings[state]) + log.warning( + "(%s/%s) Job is %s, returning to monitor queue", + ajs.job_wrapper.get_id_tag(), + ajs.job_id, + self.drmaa_job_state_strings[state], + ) # TODO return True? return True # job was not actually terminal elif state == self.drmaa.JobState.UNDETERMINED: @@ -271,7 +282,7 @@ def _get_drmaa_state_qacct(self, job_id, extinfo): # "NONE". If qdel was called multiple times, every invocation is recorded in a comma # separated list. 
if "deleted_by" in qacct and qacct["deleted_by"] != "NONE": - log.info("DRMAAUniva: job %s was aborted by %s", job_id, qacct['deleted_by']) + log.info("DRMAAUniva: job %s was aborted by %s", job_id, qacct["deleted_by"]) extinfo["deleted"] = True return self.drmaa.JobState.FAILED @@ -287,14 +298,14 @@ def _get_drmaa_state_qacct(self, job_id, extinfo): if "exit_status" in qacct: qacct["exit_status"] = int(qacct["exit_status"]) if qacct["exit_status"] < 1: - log.error("DRMAAUniva: job %s has exit status %s", job_id, qacct['exit_status']) + log.error("DRMAAUniva: job %s has exit status %s", job_id, qacct["exit_status"]) state = self.drmaa.JobState.DONE elif 0 < qacct["exit_status"] < 129: - log.error("DRMAAUniva: job %s has exit status %s", job_id, qacct['exit_status']) + log.error("DRMAAUniva: job %s has exit status %s", job_id, qacct["exit_status"]) extinfo["exit_status"] = qacct["exit_status"] state = self.drmaa.JobState.FAILED else: - log.error("DRMAAUniva: job %s was killed by signal %s", job_id, qacct['exit_status'] - 128) + log.error("DRMAAUniva: job %s was killed by signal %s", job_id, qacct["exit_status"] - 128) state = self.drmaa.JobState.FAILED extinfo["signal"] = signals[qacct["exit_status"] - 128] @@ -314,7 +325,7 @@ def _get_drmaa_state_qacct(self, job_id, extinfo): elif code in [24, 25]: state = self.drmaa.JobState.RUNNING else: - log.error("DRMAAUniva: job %s failed with failure %s", job_id, qacct['failed']) + log.error("DRMAAUniva: job %s failed with failure %s", job_id, qacct["failed"]) state = self.drmaa.JobState.FAILED # log.debug("UnivaJobRunner._get_drmaa_state_qacct ({jobid}) -> {state}".format(jobid=job_id, state=self.drmaa_job_state_strings[state])) return state diff --git a/lib/galaxy/jobs/runners/util/sudo.py b/lib/galaxy/jobs/runners/util/sudo.py index 2d4b301c982e..3e27bd20af33 100644 --- a/lib/galaxy/jobs/runners/util/sudo.py +++ b/lib/galaxy/jobs/runners/util/sudo.py @@ -21,6 +21,6 @@ def sudo_popen(*args, **kwargs): if user: full_command.extend([SUDO_USER_ARG, user]) full_command.extend(args) - log.info("About to execute the following sudo command - [%s]", ' '.join(full_command)) + log.info("About to execute the following sudo command - [%s]", " ".join(full_command)) p = Popen(full_command, shell=False, stdout=PIPE, stderr=PIPE) return p From 043667e1ab4b2c1b7431ded414ec35804f34a840 Mon Sep 17 00:00:00 2001 From: Keith Suderman Date: Mon, 23 Oct 2023 12:10:54 -0400 Subject: [PATCH 5/5] Rerun logfix --- lib/galaxy/jobs/runners/__init__.py | 18 +++--- lib/galaxy/jobs/runners/aws.py | 10 +-- lib/galaxy/jobs/runners/cli.py | 38 +++++++---- lib/galaxy/jobs/runners/condor.py | 37 ++++++----- lib/galaxy/jobs/runners/drmaa.py | 33 ++++++---- lib/galaxy/jobs/runners/godocker.py | 25 +++++--- lib/galaxy/jobs/runners/kubernetes.py | 23 +++---- lib/galaxy/jobs/runners/local.py | 8 +-- lib/galaxy/jobs/runners/pbs.py | 63 +++++++++++-------- lib/galaxy/jobs/runners/pulsar.py | 40 +++++++----- .../jobs/runners/state_handlers/resubmit.py | 2 +- lib/galaxy/jobs/runners/tasks.py | 30 ++++----- lib/galaxy/jobs/runners/univa.py | 43 +++++++------ lib/galaxy/jobs/runners/util/cli/job/lsf.py | 4 +- lib/galaxy/jobs/runners/util/cli/job/pbs.py | 2 +- lib/galaxy/jobs/runners/util/cli/job/slurm.py | 2 +- .../jobs/runners/util/cli/job/torque.py | 2 +- lib/galaxy/jobs/runners/util/pykube_util.py | 2 +- lib/galaxy/jobs/runners/util/sudo.py | 2 +- 19 files changed, 219 insertions(+), 165 deletions(-) diff --git a/lib/galaxy/jobs/runners/__init__.py b/lib/galaxy/jobs/runners/__init__.py index 
c14aea726f03..5154d4dff93f 100644 --- a/lib/galaxy/jobs/runners/__init__.py +++ b/lib/galaxy/jobs/runners/__init__.py @@ -116,7 +116,7 @@ def _init_worker_threads(self): """Start ``nworkers`` worker threads.""" self.work_queue = Queue() self.work_threads = [] - log.debug(f"Starting {self.nworkers} {self.runner_name} workers") + log.debug("Starting %s %s workers", self.nworkers, self.runner_name) for i in range(self.nworkers): worker = threading.Thread(name="%s.work_thread-%d" % (self.runner_name, i), target=self.run_next) worker.daemon = True @@ -201,11 +201,11 @@ def put(self, job_wrapper: "MinimalJobWrapper"): # Required for exceptions thrown by object store incompatiblity. # tested by test/integration/objectstore/test_private_handling.py job_wrapper.fail(str(e), exception=e) - log.debug(f"Job [{job_wrapper.job_id}] failed to queue {put_timer}") + log.debug("Job [%s] failed to queue %s", job_wrapper.job_id, put_timer) return if queue_job: self.mark_as_queued(job_wrapper) - log.debug(f"Job [{job_wrapper.job_id}] queued {put_timer}") + log.debug("Job [%s] queued %s", job_wrapper.job_id, put_timer) def mark_as_queued(self, job_wrapper: "MinimalJobWrapper"): self.work_queue.put((self.queue_job, job_wrapper)) @@ -278,12 +278,12 @@ def prepare_job( # Make sure the job hasn't been deleted if job_state == model.Job.states.DELETED: - log.debug(f"({job_id}) Job deleted by user before it entered the {self.runner_name} queue") + log.debug("(%s) Job deleted by user before it entered the %s queue", job_id, self.runner_name) if self.app.config.cleanup_job in ("always", "onsuccess"): job_wrapper.cleanup() return False elif job_state != model.Job.states.QUEUED: - log.info(f"({job_id}) Job is in state {job_state}, skipping execution") + log.info("(%s) Job is in state %s, skipping execution", job_id, job_state) # cleanup may not be safe in all states return False @@ -475,7 +475,7 @@ def _handle_metadata_externally(self, job_wrapper: "MinimalJobWrapper", resolve_ env=os.environ, preexec_fn=os.setpgrp, ) - log.debug("execution of external set_meta for job %d finished" % job_wrapper.job_id) + log.debug("execution of external set_meta for job %d finished", job_wrapper.job_id) def get_job_file(self, job_wrapper: "MinimalJobWrapper", **kwds) -> str: job_metrics = job_wrapper.app.job_metrics @@ -504,7 +504,7 @@ def get_job_file(self, job_wrapper: "MinimalJobWrapper", **kwds) -> str: # Additional logging to enable if debugging from_work_dir handling, metadata # commands, etc... (or just peak in the job script.) 
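
A convenient property of the converted calls is that the emitted record keeps a constant template in msg and the dynamic values in args, which is also what log aggregators group on. A small pytest-style check along these lines (the test and logger names are arbitrary):

import logging


def test_template_is_not_preformatted(caplog):
    log = logging.getLogger("galaxy.jobs.runners.demo")
    with caplog.at_level(logging.DEBUG, logger="galaxy.jobs.runners.demo"):
        log.debug("Job [%s] queued %s", 42, "(0.003 s)")

    record = caplog.records[0]
    assert record.msg == "Job [%s] queued %s"    # constant, groupable template
    assert record.args == (42, "(0.003 s)")      # dynamic values travel separately
    assert record.getMessage() == "Job [42] queued (0.003 s)"
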
job_id = job_wrapper.job_id - log.debug(f"({job_id}) command is: {command_line}") + log.debug("(%s) command is: %s", job_id, command_line) options.update(**kwds) return job_script(**options) @@ -713,7 +713,7 @@ def cleanup(self): prefix = f"({self.job_wrapper.get_id_tag()})" else: prefix = f"({self.job_wrapper.get_id_tag()}/{self.job_id})" - log.debug(f"{prefix} Unable to cleanup {file}: {unicodify(e)}") + log.debug("%s Unable to cleanup %s: %s", prefix, file, unicodify(e)) class AsynchronousJobState(JobState): @@ -840,7 +840,7 @@ def monitor_job(self, job_state): def shutdown(self): """Attempts to gracefully shut down the monitor thread""" - log.info(f"{self.runner_name}: Sending stop signal to monitor thread") + log.info("%s: Sending stop signal to monitor thread", self.runner_name) self.monitor_queue.put(STOP_SIGNAL) # Call the parent's shutdown method to stop workers self.shutdown_monitor() diff --git a/lib/galaxy/jobs/runners/aws.py b/lib/galaxy/jobs/runners/aws.py index bed35ec19c37..c9c07982c1c2 100644 --- a/lib/galaxy/jobs/runners/aws.py +++ b/lib/galaxy/jobs/runners/aws.py @@ -217,9 +217,9 @@ def __init__(self, app, nworkers, **kwargs): self._batch_client = session.client("batch") def queue_job(self, job_wrapper): - log.debug(f"Starting queue_job for job {job_wrapper.get_id_tag()}") + log.debug("Starting queue_job for job %s", job_wrapper.get_id_tag()) if not self.prepare_job(job_wrapper, include_metadata=False, modify_command_for_container=False): - log.debug(f"Not ready {job_wrapper.get_id_tag()}") + log.debug("Not ready %s", job_wrapper.get_id_tag()) return job_destination = job_wrapper.job_destination @@ -252,7 +252,7 @@ def _get_job_definition(self, job_wrapper, destination_params): jd_arn = self._register_job_definition(jd_name, container_image, destination_params) else: jd_arn = res["jobDefinitions"][0]["jobDefinitionArn"] - log.debug(f"Found existing job definition: {jd_name}.") + log.debug("Found existing job definition: %s.", jd_name) return jd_arn @@ -325,7 +325,7 @@ def _get_retry_strategy(self, destination_params): return strategy def _register_job_definition(self, jd_name, container_image, destination_params): - log.debug(f"Registering a new job definition: {jd_name}.") + log.debug("Registering a new job definition: %s.", jd_name) platform = destination_params.get("platform") volumes, mount_points = self._get_mount_volumes(destination_params) @@ -375,7 +375,7 @@ def _submit_job(self, job_def, job_wrapper, destination_params): job_name = self.JOB_NAME_PREFIX + job_wrapper.get_id_tag() command_script_path = self.write_command(job_wrapper) - log.info(f"Submitting job {job_name} to AWS Batch.") + log.info("Submitting job %s to AWS Batch.", job_name) res = self._batch_client.submit_job( jobName=job_name, jobQueue=destination_params.get("job_queue"), diff --git a/lib/galaxy/jobs/runners/cli.py b/lib/galaxy/jobs/runners/cli.py index 3af7b71d696a..5882d3d4ece1 100644 --- a/lib/galaxy/jobs/runners/cli.py +++ b/lib/galaxy/jobs/runners/cli.py @@ -50,7 +50,7 @@ def url_to_destination(self, url): job_params = {f"job_{k}": v for k, v in [kv.split("=", 1) for kv in job_params.split("&")]} params.update(shell_params) params.update(job_params) - log.debug(f"Converted URL '{url}' to destination runner=cli, params={params}") + log.debug("Converted URL '%s' to destination runner=cli, params=%s", url, params) # Create a dynamic JobDestination return JobDestination(runner="cli", params=params) @@ -96,7 +96,7 @@ def queue_job(self, job_wrapper): job_wrapper.cleanup() return - 
log.debug(f"({galaxy_id_tag}) submitting file: {ajs.job_file}") + log.debug("(%s) submitting file: %s", galaxy_id_tag, ajs.job_file) returncode, stdout = self.submit(shell, job_interface, ajs.job_file, galaxy_id_tag, retry=MAX_SUBMIT_RETRY) if returncode != 0: @@ -107,11 +107,11 @@ def queue_job(self, job_wrapper): submit_stdout = stdout.strip() external_job_id = submit_stdout and submit_stdout.split()[-1] if not external_job_id: - log.error(f"({galaxy_id_tag}) submission did not return a job identifier, failing job") + log.error("(%s) submission did not return a job identifier, failing job", galaxy_id_tag) job_wrapper.fail("failure submitting job") return - log.info(f"({galaxy_id_tag}) queued with identifier: {external_job_id}") + log.info("(%s) queued with identifier: %s", galaxy_id_tag, external_job_id) # store runner information for tracking if Galaxy restarts job_wrapper.set_external_id(external_job_id) @@ -137,7 +137,9 @@ def submit(self, shell, job_interface, job_file, galaxy_id_tag, retry=MAX_SUBMIT stdout = cmd_out.stdout if not stdout or not stdout.strip(): log.warning( - f"({galaxy_id_tag}) Execute returned a 0 exit code but no external identifier will be recovered from empty stdout - stderr is {cmd_out.stderr}" + "(%s) Execute returned a 0 exit code but no external identifier will be recovered from empty stdout - stderr is %s", + galaxy_id_tag, + cmd_out.stderr, ) return returncode, stdout stdout = f"({galaxy_id_tag}) submission failed (stdout): {cmd_out.stdout}" @@ -170,18 +172,20 @@ def check_watched_items(self): if ajs.job_wrapper.get_state() == model.Job.states.DELETED: continue - log.debug(f"({id_tag}/{external_job_id}) job not found in batch state check") + log.debug("(%s/%s) job not found in batch state check", id_tag, external_job_id) shell_params, job_params = self.parse_destination_params(ajs.job_destination.params) shell, job_interface = self.get_cli_plugins(shell_params, job_params) cmd_out = shell.execute(job_interface.get_single_status(external_job_id)) state = job_interface.parse_single_status(cmd_out.stdout, external_job_id) if not state == model.Job.states.OK: log.warning( - f"({id_tag}/{external_job_id}) job not found in batch state check, but found in individual state check" + "(%s/%s) job not found in batch state check, but found in individual state check", + id_tag, + external_job_id, ) job_state = ajs.job_wrapper.get_state() if state != old_state: - log.debug(f"({id_tag}/{external_job_id}) state change: from {old_state} to {state}") + log.debug("(%s/%s) state change: from %s to %s", id_tag, external_job_id, old_state, state) if state == model.Job.states.ERROR and job_state != model.Job.states.STOPPED: # Try to find out the reason for exiting - this needs to happen before change_state # otherwise jobs depending on resubmission outputs see that job as failed and pause. 
@@ -201,7 +205,7 @@ def check_watched_items(self): ) if external_metadata: self.work_queue.put((self.handle_metadata_externally, ajs)) - log.debug(f"({id_tag}/{external_job_id}) job execution finished, running job wrapper finish method") + log.debug("(%s/%s) job execution finished, running job wrapper finish method", id_tag, external_job_id) self.work_queue.put((self.finish_job, ajs)) else: new_watched.append(ajs) @@ -253,10 +257,13 @@ def stop_job(self, job_wrapper): shell, job_interface = self.get_cli_plugins(shell_params, job_params) cmd_out = shell.execute(job_interface.delete(job.job_runner_external_id)) assert cmd_out.returncode == 0, cmd_out.stderr - log.debug(f"({job.id}/{job.job_runner_external_id}) Terminated at user's request") + log.debug("(%s/%s) Terminated at user's request", job.id, job.job_runner_external_id) except Exception as e: log.debug( - f"({job.id}/{job.job_runner_external_id}) User killed running job, but error encountered during termination: {e}" + "(%s/%s) User killed running job, but error encountered during termination: %s", + job.id, + job.job_runner_external_id, + e, ) def recover(self, job, job_wrapper): @@ -274,14 +281,19 @@ def recover(self, job, job_wrapper): ajs.command_line = job.command_line if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): log.debug( - f"({job.id}/{job.job_runner_external_id}) is still in {job.state} state, adding to the runner monitor queue" + "(%s/%s) is still in %s state, adding to the runner monitor queue", + job.id, + job.job_runner_external_id, + job.state, ) ajs.old_state = model.Job.states.RUNNING ajs.running = True self.monitor_queue.put(ajs) elif job.state == model.Job.states.QUEUED: log.debug( - f"({job.id}/{job.job_runner_external_id}) is still in queued state, adding to the runner monitor queue" + "(%s/%s) is still in queued state, adding to the runner monitor queue", + job.id, + job.job_runner_external_id, ) ajs.old_state = model.Job.states.QUEUED ajs.running = False diff --git a/lib/galaxy/jobs/runners/condor.py b/lib/galaxy/jobs/runners/condor.py index 6e9862ac9e02..e148c2dae584 100644 --- a/lib/galaxy/jobs/runners/condor.py +++ b/lib/galaxy/jobs/runners/condor.py @@ -132,11 +132,11 @@ def queue_job(self, job_wrapper): job_wrapper.cleanup() return - log.debug(f"({galaxy_id_tag}) submitting file {executable}") + log.debug("(%s) submitting file %s", galaxy_id_tag, executable) external_job_id, message = condor_submit(submit_file) if external_job_id is None: - log.debug(f"condor_submit failed for job {job_wrapper.get_id_tag()}: {message}") + log.debug("condor_submit failed for job %s: %s", job_wrapper.get_id_tag(), message) if self.app.config.cleanup_job == "always": os.unlink(submit_file) cjs.cleanup() @@ -145,7 +145,7 @@ def queue_job(self, job_wrapper): os.unlink(submit_file) - log.info(f"({galaxy_id_tag}) queued as {external_job_id}") + log.info("(%s) queued as %s", galaxy_id_tag, external_job_id) # store runner information for tracking if Galaxy restarts job_wrapper.set_external_id(external_job_id) @@ -181,7 +181,7 @@ def check_watched_items(self): except Exception: # so we don't kill the monitor thread log.exception(f"({galaxy_id_tag}/{job_id}) Unable to check job status") - log.warning(f"({galaxy_id_tag}/{job_id}) job will now be errored") + log.warning("(%s/%s) job will now be errored", galaxy_id_tag, job_id) cjs.fail_message = "Cluster could not complete job" self.work_queue.put((self.fail_job, cjs)) continue @@ -191,10 +191,10 @@ def check_watched_items(self): 
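
The log.exception call just above is still an f-string; exception() accepts the same deferred %-style arguments as the other logging methods and additionally attaches the current traceback, since it logs at ERROR with exc_info=True. A standalone illustration:

import logging

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger("exception-demo")

galaxy_id_tag, job_id = "123", "456.pbs.example.org"

try:
    raise RuntimeError("qstat unreachable")
except Exception:
    # Same deferred %-style arguments; the traceback is appended automatically.
    log.exception("(%s/%s) Unable to check job status", galaxy_id_tag, job_id)
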
cjs.job_wrapper.check_for_entry_points() if job_running and not cjs.running: - log.debug(f"({galaxy_id_tag}/{job_id}) job is now running") + log.debug("(%s/%s) job is now running", galaxy_id_tag, job_id) cjs.job_wrapper.change_state(model.Job.states.RUNNING) if not job_running and cjs.running: - log.debug(f"({galaxy_id_tag}/{job_id}) job has stopped running") + log.debug("(%s/%s) job has stopped running", galaxy_id_tag, job_id) # Will switching from RUNNING to QUEUED confuse Galaxy? # cjs.job_wrapper.change_state( model.Job.states.QUEUED ) job_state = cjs.job_wrapper.get_state() @@ -205,11 +205,11 @@ def check_watched_items(self): ) if external_metadata: self._handle_metadata_externally(cjs.job_wrapper, resolve_requirements=True) - log.debug(f"({galaxy_id_tag}/{job_id}) job has completed") + log.debug("(%s/%s) job has completed", galaxy_id_tag, job_id) self.work_queue.put((self.finish_job, cjs)) continue if job_failed: - log.debug(f"({galaxy_id_tag}/{job_id}) job failed") + log.debug("(%s/%s) job failed", galaxy_id_tag, job_id) cjs.failed = True self.work_queue.put((self.fail_job, cjs)) continue @@ -225,7 +225,7 @@ def stop_job(self, job_wrapper): galaxy_id_tag = job_wrapper.get_id_tag() if job.container: try: - log.info(f"stop_job(): {job.id}: trying to stop container .... ({external_id})") + log.info("stop_job(): %s: trying to stop container .... (%s)", job.id, external_id) # self.watched = [cjs for cjs in self.watched if cjs.job_id != external_id] new_watch_list = list() cjs = None @@ -244,21 +244,21 @@ def stop_job(self, job_wrapper): ) if external_metadata: self._handle_metadata_externally(cjs.job_wrapper, resolve_requirements=True) - log.debug(f"({galaxy_id_tag}/{external_id}) job has completed") + log.debug("(%s/%s) job has completed", galaxy_id_tag, external_id) self.work_queue.put((self.finish_job, cjs)) except Exception as e: - log.warning(f"stop_job(): {job.id}: trying to stop container failed. ({e})") + log.warning("stop_job(): %s: trying to stop container failed. (%s)", job.id, e) try: self._kill_container(job_wrapper) except Exception as e: - log.warning(f"stop_job(): {job.id}: trying to kill container failed. ({e})") + log.warning("stop_job(): %s: trying to kill container failed. (%s)", job.id, e) failure_message = condor_stop(external_id) if failure_message: - log.debug(f"({external_id}). Failed to stop condor {failure_message}") + log.debug("(%s). Failed to stop condor %s", external_id, failure_message) else: failure_message = condor_stop(external_id) if failure_message: - log.debug(f"({external_id}). Failed to stop condor {failure_message}") + log.debug("(%s). 
Failed to stop condor %s", external_id, failure_message) def recover(self, job, job_wrapper): """Recovers jobs stuck in the queued/running state when Galaxy started""" @@ -277,12 +277,17 @@ def recover(self, job, job_wrapper): cjs.register_cleanup_file_attribute("user_log") if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in {job.state} state, adding to the DRM queue" + "(%s/%s) is still in %s state, adding to the DRM queue", + job.id, + job.get_job_runner_external_id(), + job.state, ) cjs.running = True self.monitor_queue.put(cjs) elif job.state == model.Job.states.QUEUED: - log.debug(f"({job.id}/{job.job_runner_external_id}) is still in DRM queued state, adding to the DRM queue") + log.debug( + "(%s/%s) is still in DRM queued state, adding to the DRM queue", job.id, job.job_runner_external_id + ) cjs.running = False self.monitor_queue.put(cjs) diff --git a/lib/galaxy/jobs/runners/drmaa.py b/lib/galaxy/jobs/runners/drmaa.py index 5c1d68f70490..c034bcdd9991 100644 --- a/lib/galaxy/jobs/runners/drmaa.py +++ b/lib/galaxy/jobs/runners/drmaa.py @@ -108,10 +108,10 @@ def url_to_destination(self, url): native_spec = url.split("/")[2] if native_spec: params = dict(nativeSpecification=native_spec) - log.debug(f"Converted URL '{url}' to destination runner=drmaa, params={params}") + log.debug("Converted URL '%s' to destination runner=drmaa, params=%s", url, params) return JobDestination(runner="drmaa", params=params) else: - log.debug(f"Converted URL '{url}' to destination runner=drmaa") + log.debug("Converted URL '%s' to destination runner=drmaa", url) return JobDestination(runner="drmaa") def get_native_spec(self, url): @@ -197,7 +197,7 @@ def queue_job(self, job_wrapper): log.exception("(%s) drmaa.Session.runJob() failed unconditionally", galaxy_id_tag) trynum = 5 else: - log.error(f"({galaxy_id_tag}) All attempts to submit job failed") + log.error("(%s) All attempts to submit job failed", galaxy_id_tag) if not fail_msg: fail_msg = DEFAULT_JOB_RUNNER_FAILURE_MESSAGE job_wrapper.fail(fail_msg) @@ -216,13 +216,13 @@ def queue_job(self, job_wrapper): job_wrapper.fail(fail_msg) return pwent = job_wrapper.galaxy_system_pwent - log.debug(f"({galaxy_id_tag}) submitting with credentials: {pwent[0]} [uid: {pwent[2]}]") + log.debug("(%s) submitting with credentials: %s [uid: %s]", galaxy_id_tag, pwent[0], pwent[2]) self.userid = pwent[2] external_job_id = self.external_runjob(external_runjob_script, filename, pwent[2]) if external_job_id is None: job_wrapper.fail(f"({galaxy_id_tag}) could not queue job") return - log.info(f"({galaxy_id_tag}) queued as {external_job_id}") + log.info("(%s) queued as %s", galaxy_id_tag, external_job_id) # store runner information for tracking if Galaxy restarts job_wrapper.set_external_id(external_job_id) @@ -327,7 +327,7 @@ def check_watched_item(self, ajs, new_watched): except Exception: # so we don't kill the monitor thread log.exception(f"({galaxy_id_tag}/{external_job_id}) unable to check job status") - log.warning(f"({galaxy_id_tag}/{external_job_id}) job will now be errored") + log.warning("(%s/%s) job will now be errored", galaxy_id_tag, external_job_id) ajs.fail_message = "Cluster could not complete job" self.work_queue.put((self.fail_job, ajs)) return None @@ -347,7 +347,9 @@ def check_watched_items(self): if state is None: continue if state != old_state: - log.debug(f"({galaxy_id_tag}/{external_job_id}) state change: {self.drmaa_job_state_strings[state]}") + log.debug( + 
"(%s/%s) state change: %s", galaxy_id_tag, external_job_id, self.drmaa_job_state_strings[state] + ) if state == drmaa.JobState.RUNNING and not ajs.running: ajs.running = True ajs.job_wrapper.change_state(model.Job.states.RUNNING) @@ -381,11 +383,11 @@ def stop_job(self, job_wrapper): cmd = shlex.split(kill_script) cmd.extend([str(ext_id), str(self.userid)]) commands.execute(cmd) - log.info(f"({job.id}/{ext_id}) Removed from DRM queue at user's request") + log.info("(%s/%s) Removed from DRM queue at user's request", job.id, ext_id) except drmaa.InvalidJobException: log.exception(f"({job.id}/{ext_id}) User killed running job, but it was already dead") except commands.CommandLineException as e: - log.error(f"({job.id}/{ext_id}) User killed running job, but command execution failed: {unicodify(e)}") + log.error("(%s/%s) User killed running job, but command execution failed: %s", job.id, ext_id, unicodify(e)) except Exception: log.exception(f"({job.id}/{ext_id}) User killed running job, but error encountered removing from DRM queue") @@ -404,14 +406,19 @@ def recover(self, job, job_wrapper): ajs.command_line = job.get_command_line() if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in {job.state} state, adding to the DRM queue" + "(%s/%s) is still in %s state, adding to the DRM queue", + job.id, + job.get_job_runner_external_id(), + job.state, ) ajs.old_state = drmaa.JobState.RUNNING ajs.running = True self.monitor_queue.put(ajs) elif job.get_state() == model.Job.states.QUEUED: log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in DRM queued state, adding to the DRM queue" + "(%s/%s) is still in DRM queued state, adding to the DRM queue", + job.id, + job.get_job_runner_external_id(), ) ajs.old_state = drmaa.JobState.QUEUED_ACTIVE ajs.running = False @@ -422,7 +429,7 @@ def store_jobtemplate(self, job_wrapper, jt): filename = os.path.join(job_wrapper.working_directory, f"{job_wrapper.get_id_tag()}.jt_json") with open(filename, "w") as fp: json.dump(jt, fp) - log.debug(f"({job_wrapper.job_id}) Job script for external submission is: {filename}") + log.debug("(%s) Job script for external submission is: %s", job_wrapper.job_id, filename) return filename def external_runjob(self, external_runjob_script, jobtemplate_filename, username): @@ -432,7 +439,7 @@ def external_runjob(self, external_runjob_script, jobtemplate_filename, username """ cmd = shlex.split(external_runjob_script) cmd.extend([str(username), jobtemplate_filename]) - log.info(f"Running command: {' '.join(cmd)}") + log.info("Running command: %s", " ".join(cmd)) try: stdoutdata = commands.execute(cmd).strip() except commands.CommandLineException: diff --git a/lib/galaxy/jobs/runners/godocker.py b/lib/galaxy/jobs/runners/godocker.py index bcacb84fb700..aae7e906ef4f 100644 --- a/lib/galaxy/jobs/runners/godocker.py +++ b/lib/galaxy/jobs/runners/godocker.py @@ -158,7 +158,7 @@ def queue_job(self, job_wrapper): log.error("Job creation failure. No Response from GoDocker") job_wrapper.fail("Not submitted") else: - log.debug(f"Starting queue_job for job {job_id}") + log.debug("Starting queue_job for job %s", job_id) # Create an object of AsynchronousJobState and add it to the monitor queue. 
ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, @@ -186,7 +186,7 @@ def check_watched_item(self, job_state): # Get task from GoDocker job_persisted_state = job_state.job_wrapper.get_state() job_status_god = self.get_task(job_state.job_id) - log.debug(f"Job ID: {str(job_state.job_id)} Job Status: {str(job_status_god['status']['primary'])}") + log.debug("Job ID: %s Job Status: %s", str(job_state.job_id), str(job_status_god["status"]["primary"])) if job_status_god["status"]["primary"] == "over" or job_persisted_state == model.Job.states.STOPPED: job_state.running = False @@ -238,7 +238,7 @@ def stop_job(self, job_wrapper): # This function is called by fail_job() # No Return data expected job_id = job_wrapper.job_id - log.debug(f"STOP JOB EXECUTION OF JOB ID: {str(job_id)}") + log.debug("STOP JOB EXECUTION OF JOB ID: %s", str(job_id)) # Get task status from GoDocker. job_status_god = self.get_task_status(job_id) if job_status_god["status"]["primary"] != "over": @@ -261,7 +261,10 @@ def recover(self, job, job_wrapper): job_wrapper.command_line = job.command_line if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in {job.state} state, adding to the god queue" + "(%s/%s) is still in %s state, adding to the god queue", + job.id, + job.get_job_runner_external_id(), + job.state, ) ajs.old_state = "R" ajs.running = True @@ -269,7 +272,9 @@ def recover(self, job, job_wrapper): elif job.state == model.Job.states.QUEUED: log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in god queued state, adding to the god queue" + "(%s/%s) is still in god queued state, adding to the god queue", + job.id, + job.get_job_runner_external_id(), ) ajs.old_state = "Q" ajs.running = False @@ -309,9 +314,9 @@ def create_log_file(self, job_state, job_status_god): log_file.write(out_log) log_file.close() f.close() - log.debug(f"CREATE OUTPUT FILE: {job_state.output_file}") - log.debug(f"CREATE ERROR FILE: {job_state.error_file}") - log.debug(f"CREATE EXIT CODE FILE: {job_state.exit_code_file}") + log.debug("CREATE OUTPUT FILE: %s", job_state.output_file) + log.debug("CREATE ERROR FILE: %s", job_state.error_file) + log.debug("CREATE EXIT CODE FILE: %s", job_state.exit_code_file) except OSError as e: log.error("Could not access task log file: %s", unicodify(e)) log.debug("IO Error occurred when accessing the files.") @@ -325,7 +330,7 @@ def login(self, apikey, login, server, noCert=False): Create Login model schema of GoDocker and call the http_post_request method. 
""" log.debug("LOGIN TASK TO BE EXECUTED \n") - log.debug(f"GODOCKER LOGIN: {str(login)}") + log.debug("GODOCKER LOGIN: %s", str(login)) data = json.dumps({"user": login, "apikey": apikey}) # Create object of Godocker class g_auth = Godocker(server, login, apikey, noCert) @@ -360,7 +365,7 @@ def post_task(self, job_wrapper): docker_image = self._find_container(job_wrapper).container_id log.debug("GoDocker runner using container %s.", docker_image) except Exception: - log.error(f"Unable to find docker_image for job {job_wrapper.job_id}, failing.") + log.error("Unable to find docker_image for job %s, failing.", job_wrapper.job_id) return False volumes = [] diff --git a/lib/galaxy/jobs/runners/kubernetes.py b/lib/galaxy/jobs/runners/kubernetes.py index 1f9e984605c3..319fd6c08afa 100644 --- a/lib/galaxy/jobs/runners/kubernetes.py +++ b/lib/galaxy/jobs/runners/kubernetes.py @@ -144,7 +144,7 @@ def queue_job(self, job_wrapper): # prepare the job # We currently don't need to include_metadata or include_work_dir_outputs, as working directory is the same # where galaxy will expect results. - log.debug(f"Starting queue_job for job {job_wrapper.get_id_tag()}") + log.debug("Starting queue_job for job %s", job_wrapper.get_id_tag()) ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, @@ -226,7 +226,7 @@ def __configure_port_routing(self, ajs): # Configure additional k8s service and ingress for tools with guest ports k8s_job_prefix = self.__produce_k8s_job_prefix() k8s_job_name = self.__get_k8s_job_name(k8s_job_prefix, ajs.job_wrapper) - log.debug(f"Configuring entry points and deploying service/ingress for job with ID {ajs.job_id}") + log.debug("Configuring entry points and deploying service/ingress for job with ID %s", ajs.job_id) k8s_service_obj = service_object_dict(self.runner_params, k8s_job_name, self.__get_k8s_service_spec(ajs)) k8s_ingress_obj = ingress_object_dict(self.runner_params, k8s_job_name, self.__get_k8s_ingress_spec(ajs)) @@ -879,7 +879,7 @@ def __job_pending_due_to_unschedulable_pod(self, job_state): def __cleanup_k8s_guest_ports(self, job_wrapper, k8s_job): k8s_job_prefix = self.__produce_k8s_job_prefix() k8s_job_name = f"{k8s_job_prefix}-{self.__force_label_conformity(job_wrapper.get_id_tag())}" - log.debug(f"Deleting service/ingress for job with ID {job_wrapper.get_id_tag()}") + log.debug("Deleting service/ingress for job with ID %s", job_wrapper.get_id_tag()) job_failed = k8s_job.obj["status"]["failed"] > 0 if "failed" in k8s_job.obj["status"] else False ingress_to_delete = find_ingress_object_by_name( self._pykube_api, k8s_job_name, self.runner_params["k8s_namespace"] @@ -908,7 +908,7 @@ def stop_job(self, job_wrapper): self.__cleanup_k8s_job(k8s_job) # TODO assert whether job parallelism == 0 # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists" - log.debug(f"({job.id}/{job.job_runner_external_id}) Terminated at user's request") + log.debug("(%s/%s) Terminated at user's request", job.id, job.job_runner_external_id) except Exception as e: log.exception( @@ -920,7 +920,7 @@ def stop_job(self, job_wrapper): def recover(self, job, job_wrapper): """Recovers jobs stuck in the queued/running state when Galaxy started""" job_id = job.get_job_runner_external_id() - log.debug(f"k8s trying to recover job: {job_id}") + log.debug("k8s trying to recover job: %s", job_id) if job_id is None: self.put(job_wrapper) return @@ -933,18 +933,19 @@ def recover(self, job, job_wrapper): ajs.command_line = job.command_line if job.state 
in (model.Job.states.RUNNING, model.Job.states.STOPPED): log.debug( - "({}/{}) is still in {} state, adding to the runner monitor queue".format( - job.id, job.job_runner_external_id, job.state - ) + "(%s/%s) is still in %s state, adding to the runner monitor queue", + job.id, + job.job_runner_external_id, + job.state, ) ajs.old_state = model.Job.states.RUNNING ajs.running = True self.monitor_queue.put(ajs) elif job.state == model.Job.states.QUEUED: log.debug( - "({}/{}) is still in queued state, adding to the runner monitor queue".format( - job.id, job.job_runner_external_id - ) + "(%s/%s) is still in queued state, adding to the runner monitor queue", + job.id, + job.job_runner_external_id, ) ajs.old_state = model.Job.states.QUEUED ajs.running = False diff --git a/lib/galaxy/jobs/runners/local.py b/lib/galaxy/jobs/runners/local.py index d70768f8b856..27bdbb33190f 100644 --- a/lib/galaxy/jobs/runners/local.py +++ b/lib/galaxy/jobs/runners/local.py @@ -95,7 +95,7 @@ def queue_job(self, job_wrapper): try: stdout_file = tempfile.NamedTemporaryFile(mode="wb+", suffix="_stdout", dir=job_wrapper.working_directory) stderr_file = tempfile.NamedTemporaryFile(mode="wb+", suffix="_stderr", dir=job_wrapper.working_directory) - log.debug(f"({job_id}) executing job script: {job_file}") + log.debug("(%s) executing job script: %s", job_id, job_file) # The preexec_fn argument of Popen() is used to call os.setpgrp() in # the child process just before the child is executed. This will set # the PGID of the child process to its PID (i.e. ensures that it is @@ -140,7 +140,7 @@ def queue_job(self, job_wrapper): stderr = self._job_io_for_db(stderr_file) stdout_file.close() stderr_file.close() - log.debug(f"execution finished: {job_file}") + log.debug("execution finished: %s", job_file) except Exception: log.exception("failure running job %d", job_wrapper.job_id) self._fail_job_local(job_wrapper, "failure running job") @@ -166,11 +166,11 @@ def stop_job(self, job_wrapper): # metadata internal or job not complete yet pid = job.get_job_runner_external_id() if pid in [None, ""]: - log.warning(f"stop_job(): {job.id}: no PID in database for job, unable to stop") + log.warning("stop_job(): %s: no PID in database for job, unable to stop", job.id) return pid = int(pid) if not check_pg(pid): - log.warning("stop_job(): %s: Process group %d was already dead or can't be signaled" % (job.id, pid)) + log.warning("stop_job(): %s: Process group %d was already dead or can't be signaled", job.id, pid) return log.debug("stop_job(): %s: Terminating process group %d", job.id, pid) kill_pg(pid) diff --git a/lib/galaxy/jobs/runners/pbs.py b/lib/galaxy/jobs/runners/pbs.py index fa22c11b1a70..7c43b6f489ef 100644 --- a/lib/galaxy/jobs/runners/pbs.py +++ b/lib/galaxy/jobs/runners/pbs.py @@ -113,7 +113,7 @@ def __init__(self, app, nworkers): def default_pbs_server(self): if self.__default_pbs_server is None: self.__default_pbs_server = pbs.pbs_default() - log.debug(f"Set default PBS server to {self.default_pbs_server}") + log.debug("Set default PBS server to %s", self.default_pbs_server) return self.__default_pbs_server def url_to_destination(self, url): @@ -151,7 +151,7 @@ def url_to_destination(self, url): param, value = opt.split(None, 1) params[param] = value - log.debug(f"Converted URL '{url}' to destination runner=pbs, params={params}") + log.debug("Converted URL '%s' to destination runner=pbs, params=%s", url, params) # Create a dynamic JobDestination return JobDestination(runner="pbs", params=params) @@ -171,7 +171,7 @@ def 
parse_destination_params(self, params): arg = arg.lstrip("-") args[arg] = value except Exception: - log.warning(f"Unrecognized long argument in destination params: {arg}") + log.warning("Unrecognized long argument in destination params: %s", arg) return self.__args_to_attrs(args) # Internal stuff @@ -234,7 +234,7 @@ def queue_job(self, job_wrapper): if c <= 0: errno, text = pbs.error() job_wrapper.fail("Unable to queue job for execution. Resubmitting the job may succeed.") - log.error(f"Connection to PBS server for submit failed: {errno}: {text}") + log.error("Connection to PBS server for submit failed: %s: %s", errno, text) return # define job attributes @@ -297,7 +297,7 @@ def queue_job(self, job_wrapper): self.write_executable_script(job_file, script, job_io=job_wrapper.job_io) # job was deleted while we were preparing it if job_wrapper.get_state() in (model.Job.states.DELETED, model.Job.states.STOPPED): - log.debug(f"Job {job_wrapper.job_id} deleted/stopped by user before it entered the PBS queue") + log.debug("Job %s deleted/stopped by user before it entered the PBS queue", job_wrapper.job_id) pbs.pbs_disconnect(c) if job_wrapper.cleanup_job in ("always", "onsuccess"): self.cleanup((ofile, efile, ecfile, job_file)) @@ -308,7 +308,7 @@ def queue_job(self, job_wrapper): # The job tag includes the job and the task identifier # (if a TaskWrapper was passed in): galaxy_job_id = job_wrapper.get_id_tag() - log.debug(f"({galaxy_job_id}) submitting file {job_file}") + log.debug("(%s) submitting file %s", galaxy_job_id, job_file) tries = 0 while tries < 5: @@ -318,17 +318,17 @@ def queue_job(self, job_wrapper): pbs.pbs_disconnect(c) break errno, text = pbs.error() - log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text)) + log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s", galaxy_job_id, tries, errno, text) time.sleep(2) else: - log.error(f"({galaxy_job_id}) All attempts to submit job failed") + log.error("(%s) All attempts to submit job failed", galaxy_job_id) job_wrapper.fail("Unable to run this job due to a cluster error, please retry it later") return if pbs_queue_name is None: - log.debug(f"({galaxy_job_id}) queued in default queue as {job_id}") + log.debug("(%s) queued in default queue as %s", galaxy_job_id, job_id) else: - log.debug(f"({galaxy_job_id}) queued in {pbs_queue_name} queue as {job_id}") + log.debug("(%s) queued in %s queue as %s", galaxy_job_id, pbs_queue_name, job_id) # persist destination job_wrapper.set_job_destination(job_destination, job_id) @@ -363,7 +363,7 @@ def check_watched_items(self): old_state = pbs_job_state.old_state pbs_server_name = self.__get_pbs_server(pbs_job_state.job_destination.params) if pbs_server_name in failures: - log.debug(f"({galaxy_job_id}/{job_id}) Skipping state check because PBS server connection failed") + log.debug("(%s/%s) Skipping state check because PBS server connection failed", galaxy_job_id, job_id) new_watched.append(pbs_job_state) continue try: @@ -375,24 +375,28 @@ def check_watched_items(self): # Recheck to make sure it wasn't a communication problem self.check_single_job(pbs_server_name, job_id) log.warning( - f"({galaxy_job_id}/{job_id}) PBS job was not in state check list, but was found with individual state check" + "(%s/%s) PBS job was not in state check list, but was found with individual state check", + galaxy_job_id, + job_id, ) new_watched.append(pbs_job_state) except Exception: errno, text = pbs.error() if errno == 15001: # 15001 == job not in queue - 
log.debug(f"({galaxy_job_id}/{job_id}) PBS job has left queue") + log.debug("(%s/%s) PBS job has left queue", galaxy_job_id, job_id) self.work_queue.put((self.finish_job, pbs_job_state)) else: # Unhandled error, continue to monitor log.info( - "(%s/%s) PBS state check resulted in error (%d): %s" % (galaxy_job_id, job_id, errno, text) + "(%s/%s) PBS state check resulted in error (%d): %s", galaxy_job_id, job_id, errno, text ) new_watched.append(pbs_job_state) continue if status.job_state != old_state: - log.debug(f"({galaxy_job_id}/{job_id}) PBS job state changed from {old_state} to {status.job_state}") + log.debug( + "(%s/%s) PBS job state changed from %s to %s", galaxy_job_id, job_id, old_state, status.job_state + ) if status.job_state == "R" and not pbs_job_state.running: pbs_job_state.running = True pbs_job_state.job_wrapper.change_state(model.Job.states.RUNNING) @@ -410,18 +414,18 @@ def check_watched_items(self): int(status.exit_status) == 0 or pbs_job_state.job_wrapper.get_state() == model.Job.states.STOPPED ) - log.debug(f"({galaxy_job_id}/{job_id}) PBS job has completed successfully") + log.debug("(%s/%s) PBS job has completed successfully", galaxy_job_id, job_id) except AssertionError: exit_status = int(status.exit_status) error_message = JOB_EXIT_STATUS.get(exit_status, f"Unknown error: {status.exit_status}") pbs_job_state.fail_message = CLUSTER_ERROR_MESSAGE % error_message - log.error(f"({galaxy_job_id}/{job_id}) PBS job failed: {error_message}") + log.error("(%s/%s) PBS job failed: %s", galaxy_job_id, job_id, error_message) pbs_job_state.stop_job = False self.work_queue.put((self.fail_job, pbs_job_state)) continue except AttributeError: # No exit_status, can't verify proper completion so we just have to assume success. - log.debug(f"({galaxy_job_id}/{job_id}) PBS job has completed") + log.debug("(%s/%s) PBS job has completed", galaxy_job_id, job_id) self.work_queue.put((self.finish_job, pbs_job_state)) continue pbs_job_state.old_state = status.job_state @@ -446,7 +450,7 @@ def check_all_jobs(self): for pbs_server_name in servers: c = pbs.pbs_connect(util.smart_str(pbs_server_name)) if c <= 0: - log.debug(f"connection to PBS server {pbs_server_name} for state check failed") + log.debug("connection to PBS server %s for state check failed", pbs_server_name) failures.append(pbs_server_name) continue stat_attrl = pbs.new_attrl(3) @@ -479,7 +483,7 @@ def check_single_job(self, pbs_server_name, job_id): """ c = pbs.pbs_connect(util.smart_str(pbs_server_name)) if c <= 0: - log.debug(f"connection to PBS server {pbs_server_name} for state check failed") + log.debug("connection to PBS server %s for state check failed", pbs_server_name) return None stat_attrl = pbs.new_attrl(1) stat_attrl[0].name = pbs.ATTR_state @@ -507,7 +511,7 @@ def stop_job(self, job_wrapper): job = job_wrapper.get_job() job_id = job.get_job_runner_external_id().encode("utf-8") job_tag = f"({job.get_id_tag()}/{job_id})" - log.debug(f"{job_tag} Stopping PBS job") + log.debug("%s Stopping PBS job", job_tag) # Declare the connection handle c so that it can be cleaned up: c = None @@ -515,17 +519,17 @@ def stop_job(self, job_wrapper): try: pbs_server_name = self.__get_pbs_server(job.destination_params) if pbs_server_name is None: - log.debug("(%s) Job queued but no destination stored in job params, cannot delete" % job_tag) + log.debug("(%s) Job queued but no destination stored in job params, cannot delete", job_tag) return c = pbs.pbs_connect(util.smart_str(pbs_server_name)) if c <= 0: - log.debug(f"({job_tag}) 
Connection to PBS server for job delete failed") + log.debug("(%s) Connection to PBS server for job delete failed", job_tag) return pbs.pbs_deljob(c, job_id, "") - log.debug(f"{job_tag} Removed from PBS queue before job completion") + log.debug("%s Removed from PBS queue before job completion", job_tag) except Exception: e = traceback.format_exc() - log.debug(f"{job_tag} Unable to stop job: {e}") + log.debug("%s Unable to stop job: %s", job_tag, e) finally: # Cleanup: disconnect from the server. if None is not c: @@ -547,14 +551,19 @@ def recover(self, job, job_wrapper): job_wrapper.command_line = job.command_line if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in {job.state} state, adding to the PBS queue" + "(%s/%s) is still in %s state, adding to the PBS queue", + job.id, + job.get_job_runner_external_id(), + job.state, ) pbs_job_state.old_state = "R" pbs_job_state.running = True self.monitor_queue.put(pbs_job_state) elif job.state == model.Job.states.QUEUED: log.debug( - f"({job.id}/{job.get_job_runner_external_id()}) is still in PBS queued state, adding to the PBS queue" + "(%s/%s) is still in PBS queued state, adding to the PBS queue", + job.id, + job.get_job_runner_external_id(), ) pbs_job_state.old_state = "Q" pbs_job_state.running = False diff --git a/lib/galaxy/jobs/runners/pulsar.py b/lib/galaxy/jobs/runners/pulsar.py index 2bba3504024a..545e11e6b872 100644 --- a/lib/galaxy/jobs/runners/pulsar.py +++ b/lib/galaxy/jobs/runners/pulsar.py @@ -255,7 +255,7 @@ def __init_pulsar_app(self, conf, pulsar_conf_path): if pulsar_conf_path is None: log.info("Creating a Pulsar app with default configuration (no pulsar_conf specified).") else: - log.info(f"Loading Pulsar app configuration from {pulsar_conf_path}") + log.info("Loading Pulsar app configuration from %s", pulsar_conf_path) with open(pulsar_conf_path) as f: conf.update(yaml.safe_load(f) or {}) if "job_metrics_config_file" not in conf: @@ -403,7 +403,7 @@ def queue_job(self, job_wrapper): config_files = job_wrapper.extra_filenames tool_script = os.path.join(job_wrapper.working_directory, "tool_script.sh") if os.path.exists(tool_script): - log.debug(f"Registering tool_script for Pulsar transfer [{tool_script}]") + log.debug("Registering tool_script for Pulsar transfer [%s]", tool_script) job_directory_files.append(tool_script) config_files.append(tool_script) # Following is job destination environment variables @@ -437,7 +437,7 @@ def queue_job(self, job_wrapper): tool_directory_required_files=tool_directory_required_files, ) external_job_id = pulsar_submit_job(client, client_job_description, remote_job_config) - log.info(f"Pulsar job submitted with job_id {external_job_id}") + log.info("Pulsar job submitted with job_id %s", external_job_id) job = job_wrapper.get_job() # Set the job destination here (unlike other runners) because there are likely additional job destination # params from the Pulsar client. 
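Every hunk in this patch applies the same mechanical rewrite: values are passed to the logging call as arguments instead of being interpolated into an f-string, so the logging module only builds the final message when a handler actually emits the record. The minimal sketch below illustrates that difference; it is editorial, not one of the patched files, and the logger name, message, and values are illustrative.

    import logging

    logging.basicConfig(level=logging.INFO)      # DEBUG records are discarded
    log = logging.getLogger("galaxy.example")    # illustrative logger name

    job_id, state = 42, "queued"                 # illustrative values

    # Eager: the f-string is formatted before log.debug() is even called,
    # whether or not the record is emitted.
    log.debug(f"({job_id}) job entered state {state}")

    # Deferred: the template and arguments are stored on the LogRecord, and
    # "%"-formatting only happens if the record passes the level check and
    # reaches a handler.
    log.debug("(%s) job entered state %s", job_id, state)

Besides skipping needless formatting at suppressed levels, the deferred form keeps one stable message template per call site, which tends to group more cleanly in log aggregation tools.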
@@ -567,7 +567,7 @@ def _populate_parameter_defaults(self, job_destination): for key, value in self.destination_defaults.items(): if key in params: if value is PARAMETER_SPECIFICATION_IGNORED: - log.warning(f"Pulsar runner in selected configuration ignores parameter {key}") + log.warning("Pulsar runner in selected configuration ignores parameter %s", key) continue # if self.runner_params.get( key, None ): # # Let plugin define defaults for some parameters - @@ -711,11 +711,13 @@ def check_pid(self, pid): return True except OSError as e: if e.errno == errno.ESRCH: - log.debug("check_pid(): PID %d is dead" % pid) + log.debug("check_pid(): PID %d is dead", pid) else: log.warning( - "check_pid(): Got errno %s when attempting to check PID %d: %s" - % (errno.errorcode[e.errno], pid, e.strerror) + "check_pid(): Got errno %s when attempting to check PID %d: %s", + errno.errorcode[e.errno], + pid, + e.strerror, ) return False @@ -731,32 +733,36 @@ def stop_job(self, job_wrapper): 0 ].job_runner_external_pid # every JobExternalOutputMetadata has a pid set, we just need to take from one of them if pid in [None, ""]: - log.warning(f"stop_job(): {job.id}: no PID in database for job, unable to stop") + log.warning("stop_job(): %s: no PID in database for job, unable to stop", job.id) return pid = int(pid) if not self.check_pid(pid): - log.warning("stop_job(): %s: PID %d was already dead or can't be signaled" % (job.id, pid)) + log.warning("stop_job(): %s: PID %d was already dead or can't be signaled", job.id, pid) return for sig in [15, 9]: try: os.killpg(pid, sig) except OSError as e: log.warning( - "stop_job(): %s: Got errno %s when attempting to signal %d to PID %d: %s" - % (job.id, errno.errorcode[e.errno], sig, pid, e.strerror) + "stop_job(): %s: Got errno %s when attempting to signal %d to PID %d: %s", + job.id, + errno.errorcode[e.errno], + sig, + pid, + e.strerror, ) return # give up sleep(2) if not self.check_pid(pid): - log.debug("stop_job(): %s: PID %d successfully killed with signal %d" % (job.id, pid, sig)) + log.debug("stop_job(): %s: PID %d successfully killed with signal %d", job.id, pid, sig) return else: - log.warning("stop_job(): %s: PID %d refuses to die after signaling TERM/KILL" % (job.id, pid)) + log.warning("stop_job(): %s: PID %d refuses to die after signaling TERM/KILL", job.id, pid) else: # Remote kill pulsar_url = job.job_runner_name job_id = job.job_runner_external_id - log.debug(f"Attempt remote Pulsar kill of job with url {pulsar_url} and id {job_id}") + log.debug("Attempt remote Pulsar kill of job with url %s and id %s", pulsar_url, job_id) client = self.get_client(job.destination_params, job_id) client.kill() @@ -766,7 +772,7 @@ def recover(self, job, job_wrapper): job_wrapper.command_line = job.get_command_line() state = job.get_state() if state in [model.Job.states.RUNNING, model.Job.states.QUEUED, model.Job.states.STOPPED]: - log.debug(f"(Pulsar/{job.id}) is still in {state} state, adding to the Pulsar queue") + log.debug("(Pulsar/%s) is still in %s state, adding to the Pulsar queue", job.id, state) job_state.old_state = True job_state.running = state == model.Job.states.RUNNING self.monitor_queue.put(job_state) @@ -838,7 +844,7 @@ def check_job_config(remote_job_config, check_features=None): # 0.6.0 was newest Pulsar version that did not report it's version. 
pulsar_version = PulsarJobRunner.pulsar_version(remote_job_config) needed_version = Version("0.0.0") - log.info(f"pulsar_version is {pulsar_version}") + log.info("pulsar_version is %s", pulsar_version) for feature, needed in list(check_features.items()) + [("_default_", True)]: if not needed: continue @@ -1153,7 +1159,7 @@ def input_metadata_rewrite(self, dataset, metadata_val): metadata_val, client_input_path_type=CLIENT_INPUT_PATH_TYPES.INPUT_METADATA_PATH ) if remote_input_path: - log.info(f"input_metadata_rewrite is {remote_input_path} from {metadata_val}") + log.info("input_metadata_rewrite is %s from %s", remote_input_path, metadata_val) self.path_rewrites_input_metadata[metadata_val] = remote_input_path return remote_input_path diff --git a/lib/galaxy/jobs/runners/state_handlers/resubmit.py b/lib/galaxy/jobs/runners/state_handlers/resubmit.py index b6bea8e3f467..cd763a833c95 100644 --- a/lib/galaxy/jobs/runners/state_handlers/resubmit.py +++ b/lib/galaxy/jobs/runners/state_handlers/resubmit.py @@ -98,7 +98,7 @@ def _handle_resubmit_definitions(resubmit_definitions, app, job_runner, job_stat float(delay) new_destination.params["__resubmit_delay_seconds"] = str(delay) except ValueError: - log.warning(f"Cannot delay job with delay [{delay}], does not appear to be a number.") + log.warning("Cannot delay job with delay [%s], does not appear to be a number.", delay) job_state.job_wrapper.set_job_destination(new_destination) # Clear external ID (state change below flushes the change) job.job_runner_external_id = None diff --git a/lib/galaxy/jobs/runners/tasks.py b/lib/galaxy/jobs/runners/tasks.py index 70088bb08e8d..73ca1cf158dc 100644 --- a/lib/galaxy/jobs/runners/tasks.py +++ b/lib/galaxy/jobs/runners/tasks.py @@ -100,7 +100,7 @@ def queue_job(self, job_wrapper): task_state = tw.get_state() if model.Task.states.ERROR == task_state: job_exit_code = tw.get_exit_code() - log.debug("Canceling job %d: Task %s returned an error" % (tw.job_id, tw.task_id)) + log.debug("Canceling job %d: Task %s returned an error", tw.job_id, tw.task_id) self._cancel_job(job_wrapper, task_wrappers) tasks_complete = True break @@ -114,7 +114,7 @@ def queue_job(self, job_wrapper): if sleep_time < 8: sleep_time *= 2 job_wrapper.reclaim_ownership() # if running as the actual user, change ownership before merging. - log.debug(f"execution finished - beginning merge: {command_line}") + log.debug("execution finished - beginning merge: %s", command_line) stdout, stderr = splitter.do_merge(job_wrapper, task_wrappers) except Exception: job_wrapper.fail("failure running job", exception=True) @@ -141,7 +141,7 @@ def stop_job(self, job_wrapper): tasks = job.get_tasks() if len(tasks) > 0: for task in tasks: - log.debug(f"Killing task's job {task.id}") + log.debug("Killing task's job %s", task.id) self.app.job_manager.job_handler.dispatcher.stop(task) # There were no subtasks, so just kill the job. 
We'll touch @@ -157,7 +157,7 @@ def stop_job(self, job_wrapper): else: pid = job.job_runner_external_id if pid in [None, ""]: - log.warning(f"stop_job(): {job.id}: no PID in database for job, unable to stop") + log.warning("stop_job(): %s: no PID in database for job, unable to stop", job.id) return self._stop_pid(pid, job.id) @@ -196,9 +196,7 @@ def _cancel_job(self, job_wrapper, task_wrappers): task = task_wrapper.get_task() task_state = task.get_state() if model.Task.states.QUEUED == task_state: - log.debug( - "_cancel_job for job %d: Task %d is not running; setting state to DELETED" % (job.id, task.id) - ) + log.debug("_cancel_job for job %d: Task %d is not running; setting state to DELETED", job.id, task.id) task_wrapper.change_state(task.states.DELETED) # If a task failed, then the caller will have waited a few seconds # before recognizing the failure. In that time, a queued task could @@ -209,7 +207,7 @@ def _cancel_job(self, job_wrapper, task_wrappers): for task_wrapper in task_wrappers: if model.Task.states.RUNNING == task_wrapper.get_state(): task = task_wrapper.get_task() - log.debug("_cancel_job for job %d: Stopping running task %d" % (job.id, task.id)) + log.debug("_cancel_job for job %d: Stopping running task %d", job.id, task.id) job_wrapper.app.job_manager.job_handler.dispatcher.stop(task) def _check_pid(self, pid): @@ -224,9 +222,9 @@ def _stop_pid(self, pid, job_id): job's id (which is used for logging messages only right now). """ pid = int(pid) - log.debug(f"Stopping pid {pid}") + log.debug("Stopping pid %s", pid) if not self._check_pid(pid): - log.warning("_stop_pid(): %s: PID %d was already dead or can't be signaled" % (job_id, pid)) + log.warning("_stop_pid(): %s: PID %d was already dead or can't be signaled", job_id, pid) return for sig in [15, 9]: try: @@ -235,15 +233,19 @@ def _stop_pid(self, pid, job_id): # This warning could be bogus; many tasks are stopped with # SIGTERM (signal 15), but ymmv depending on the platform. log.warning( - "_stop_pid(): %s: Got errno %s when attempting to signal %d to PID %d: %s" - % (job_id, errno.errorcode[e.errno], sig, pid, e.strerror) + "_stop_pid(): %s: Got errno %s when attempting to signal %d to PID %d: %s", + job_id, + errno.errorcode[e.errno], + sig, + pid, + e.strerror, ) return # TODO: If we're stopping lots of tasks, then we will want to put this # avoid a two-second overhead using some other asynchronous method. sleep(2) if not self._check_pid(pid): - log.debug("_stop_pid(): %s: PID %d successfully killed with signal %d" % (job_id, pid, sig)) + log.debug("_stop_pid(): %s: PID %d successfully killed with signal %d", job_id, pid, sig) return else: - log.warning("_stop_pid(): %s: PID %d refuses to die after signaling TERM/KILL" % (job_id, pid)) + log.warning("_stop_pid(): %s: PID %d refuses to die after signaling TERM/KILL", job_id, pid) diff --git a/lib/galaxy/jobs/runners/univa.py b/lib/galaxy/jobs/runners/univa.py index c0d2ecb730c3..d2845ce5b97c 100644 --- a/lib/galaxy/jobs/runners/univa.py +++ b/lib/galaxy/jobs/runners/univa.py @@ -96,7 +96,7 @@ def _complete_terminal_job(self, ajs, drmaa_state, **kwargs): ajs.fail_message = "This job failed because it was cancelled." 
drmaa_state = self.drmaa.JobState.FAILED elif ("signal" in extinfo and extinfo["signal"] == "SIGKILL") and time_wasted > time_granted: - log.error(f"({ajs.job_wrapper.get_id_tag()}/{ajs.job_id}) Job hit walltime") + log.error("(%s/%s) Job hit walltime", ajs.job_wrapper.get_id_tag(), ajs.job_id) ajs.fail_message = ( "This job was terminated because it ran longer than the maximum allowed job run time." ) @@ -105,7 +105,11 @@ def _complete_terminal_job(self, ajs, drmaa_state, **kwargs): # test wasted>granted memory only if failed != 0 and exit_status != 0, ie if marked as failed elif state == self.drmaa.JobState.FAILED and mem_wasted > mem_granted * slots: log.error( - f"({ajs.job_wrapper.get_id_tag()}/{ajs.job_id}) Job hit memory limit ({mem_wasted}>{mem_granted})" + "(%s/%s) Job hit memory limit (%s>%s)", + ajs.job_wrapper.get_id_tag(), + ajs.job_id, + mem_wasted, + mem_granted, ) ajs.fail_message = "This job was terminated because it used more than the maximum allowed memory." ajs.runner_state = ajs.runner_states.MEMORY_LIMIT_REACHED @@ -120,15 +124,18 @@ def _complete_terminal_job(self, ajs, drmaa_state, **kwargs): self.drmaa.JobState.USER_SUSPENDED, ]: log.warning( - f"({ajs.job_wrapper.get_id_tag()}/{ajs.job_id}) Job is {self.drmaa_job_state_strings[state]}, returning to monitor queue" + "(%s/%s) Job is %s, returning to monitor queue", + ajs.job_wrapper.get_id_tag(), + ajs.job_id, + self.drmaa_job_state_strings[state], ) # TODO return True? return True # job was not actually terminal elif state == self.drmaa.JobState.UNDETERMINED: - log.warning(f"({ajs.job_wrapper.get_id_tag()}/{ajs.job_id}) Job state could not be determined") + log.warning("(%s/%s) Job state could not be determined", ajs.job_wrapper.get_id_tag(), ajs.job_id) drmaa_state = self.drmaa_job_states.FAILED else: - log.error(f"DRMAAUniva: job {ajs.job_id} determined unknown state {state}") + log.error("DRMAAUniva: job %s determined unknown state %s", ajs.job_id, state) drmaa_state = self.drmaa_job_states.FAILED # by default, finish the job with the state from drmaa return super()._complete_terminal_job(ajs, drmaa_state=drmaa_state) @@ -275,7 +282,7 @@ def _get_drmaa_state_qacct(self, job_id, extinfo): # "NONE". If qdel was called multiple times, every invocation is recorded in a comma # separated list. 
if "deleted_by" in qacct and qacct["deleted_by"] != "NONE": - log.info(f"DRMAAUniva: job {job_id} was aborted by {qacct['deleted_by']}") + log.info("DRMAAUniva: job %s was aborted by %s", job_id, qacct["deleted_by"]) extinfo["deleted"] = True return self.drmaa.JobState.FAILED @@ -291,14 +298,14 @@ def _get_drmaa_state_qacct(self, job_id, extinfo): if "exit_status" in qacct: qacct["exit_status"] = int(qacct["exit_status"]) if qacct["exit_status"] < 1: - log.error(f"DRMAAUniva: job {job_id} has exit status {qacct['exit_status']}") + log.error("DRMAAUniva: job %s has exit status %s", job_id, qacct["exit_status"]) state = self.drmaa.JobState.DONE elif 0 < qacct["exit_status"] < 129: - log.error(f"DRMAAUniva: job {job_id} has exit status {qacct['exit_status']}") + log.error("DRMAAUniva: job %s has exit status %s", job_id, qacct["exit_status"]) extinfo["exit_status"] = qacct["exit_status"] state = self.drmaa.JobState.FAILED else: - log.error(f"DRMAAUniva: job {job_id} was killed by signal {qacct['exit_status'] - 128}") + log.error("DRMAAUniva: job %s was killed by signal %s", job_id, qacct["exit_status"] - 128) state = self.drmaa.JobState.FAILED extinfo["signal"] = signals[qacct["exit_status"] - 128] @@ -318,7 +325,7 @@ def _get_drmaa_state_qacct(self, job_id, extinfo): elif code in [24, 25]: state = self.drmaa.JobState.RUNNING else: - log.error(f"DRMAAUniva: job {job_id} failed with failure {qacct['failed']}") + log.error("DRMAAUniva: job %s failed with failure %s", job_id, qacct["failed"]) state = self.drmaa.JobState.FAILED # log.debug("UnivaJobRunner._get_drmaa_state_qacct ({jobid}) -> {state}".format(jobid=job_id, state=self.drmaa_job_state_strings[state])) return state @@ -419,7 +426,7 @@ def _get_drmaa_state_wait(self, job_id, ds, extinfo): extinfo["slots"] = float(rv.resourceUsage["slots"]) # log.debug("wait -> \texitStatus {0}\thasCoreDump {1}\thasExited {2}\thasSignal {3}\tjobId {4}\t\tterminatedSignal {5}\twasAborted {6}\tresourceUsage {7}".format(rv.exitStatus, rv.hasCoreDump, rv.hasExited, rv.hasSignal, rv.jobId, rv.terminatedSignal, rv.wasAborted, rv.resourceUsage)) if rv.wasAborted: - log.error(f"DRMAAUniva: job {job_id} was aborted according to wait()") + log.error("DRMAAUniva: job %s was aborted according to wait()", job_id) extinfo["deleted"] = True return self.drmaa.JobState.FAILED @@ -427,19 +434,19 @@ def _get_drmaa_state_wait(self, job_id, ds, extinfo): # but also violation of scheduler constraints state = self.drmaa.JobState.DONE if rv.exitStatus != 0: - log.error(f"DRMAAUniva: job {job_id} has exit status {rv.exitStatus}") + log.error("DRMAAUniva: job %s has exit status %s", job_id, rv.exitStatus) extinfo["state"] = self.drmaa.JobState.FAILED if not rv.hasExited or rv.hasSignal: if rv.hasCoreDump != 0: - log.error(f"DRMAAUniva: job {job_id} has core dump") + log.error("DRMAAUniva: job %s has core dump", job_id) extinfo["state"] = self.drmaa.JobState.FAILED elif len(rv.terminatedSignal) > 0: - log.error(f"DRMAAUniva: job {job_id} was kill by signal {rv.terminatedSignal}") + log.error("DRMAAUniva: job %s was kill by signal %s", job_id, rv.terminatedSignal) state = self.drmaa.JobState.FAILED extinfo["signal"] = rv.terminatedSignal elif rv.wasAborted == 0: - log.error(f"DRMAAUniva: job {job_id} has finished in unclear condition") + log.error("DRMAAUniva: job %s has finished in unclear condition", job_id) state = self.drmaa.JobState.FAILED # log.debug("UnivaJobRunner._get_drmaa_state_wait ({jobid}) -> {state}".format(jobid=job_id, state=self.drmaa_job_state_strings[state])) 
return state @@ -561,7 +568,7 @@ def _map_qstat_drmaa_states(self, job_id, state, extinfo): elif "w" in state: return self.drmaa.JobState.QUEUED_ACTIVE else: - log.error(f"DRMAAUniva: job {job_id} unknown state from qstat: {state}") + log.error("DRMAAUniva: job %s unknown state from qstat: %s", job_id, state) return self.drmaa.JobState.UNDETERMINED @@ -595,12 +602,12 @@ def _parse_native_specs(job_id, native_spec): if m is not None: tme = _parse_time(m.group(1)) if tme is None: - log.error(f"DRMAAUniva: job {job_id} has unparsable time native spec {native_spec}") + log.error("DRMAAUniva: job %s has unparsable time native spec %s", job_id, native_spec) # parse memory m = re.search(r"mem=([\d.]+[KGMT]?)[\s,]*", native_spec) if m is not None: mem = size_to_bytes(m.group(1)) # mem = _parse_mem(m.group(1)) if mem is None: - log.error(f"DRMAAUniva: job {job_id} has unparsable memory native spec {native_spec}") + log.error("DRMAAUniva: job %s has unparsable memory native spec %s", job_id, native_spec) return tme, mem diff --git a/lib/galaxy/jobs/runners/util/cli/job/lsf.py b/lib/galaxy/jobs/runners/util/cli/job/lsf.py index cdaedf447a42..272e198c0451 100644 --- a/lib/galaxy/jobs/runners/util/cli/job/lsf.py +++ b/lib/galaxy/jobs/runners/util/cli/job/lsf.py @@ -36,7 +36,7 @@ def job_script_kwargs(self, ofile, efile, job_name): k = argmap[k] scriptargs[k] = v except Exception: - log.warning(f"Unrecognized long argument passed to LSF CLI plugin: {k}") + log.warning("Unrecognized long argument passed to LSF CLI plugin: %s", k) # Generated template. template_scriptargs = "" @@ -81,7 +81,7 @@ def parse_single_status(self, status, job_id): # which would be badly handled here. So this only works well when Galaxy # is constantly monitoring the jobs. The logic here is that DONE jobs get forgotten # faster than failed jobs. - log.warning(f"Job id '{job_id}' not found LSF status check") + log.warning("Job id '%s' not found LSF status check", job_id) return job_states.OK return self._get_job_state(status) diff --git a/lib/galaxy/jobs/runners/util/cli/job/pbs.py b/lib/galaxy/jobs/runners/util/cli/job/pbs.py index 5e2cc06d8a8a..e4851db28130 100644 --- a/lib/galaxy/jobs/runners/util/cli/job/pbs.py +++ b/lib/galaxy/jobs/runners/util/cli/job/pbs.py @@ -19,7 +19,7 @@ def parse_status(self, status, job_ids): try: data = json.loads(status) except Exception: - log.warning(f"No valid qstat JSON return from `qstat -f -F json`, got the following: {status}") + log.warning("No valid qstat JSON return from `qstat -f -F json`, got the following: %s", status) rval = {} for job_id, job in data.get("Jobs", {}).items(): if job_id in job_ids: diff --git a/lib/galaxy/jobs/runners/util/cli/job/slurm.py b/lib/galaxy/jobs/runners/util/cli/job/slurm.py index 5c1626d7315f..996a680d5183 100644 --- a/lib/galaxy/jobs/runners/util/cli/job/slurm.py +++ b/lib/galaxy/jobs/runners/util/cli/job/slurm.py @@ -25,7 +25,7 @@ def job_script_kwargs(self, ofile, efile, job_name): k = argmap[k] scriptargs[k] = v except Exception: - log.warning(f"Unrecognized long argument passed to Slurm CLI plugin: {k}") + log.warning("Unrecognized long argument passed to Slurm CLI plugin: %s", k) # Generated template. 
template_scriptargs = "" diff --git a/lib/galaxy/jobs/runners/util/cli/job/torque.py b/lib/galaxy/jobs/runners/util/cli/job/torque.py index 56754b142b58..d48fefd8f0b3 100644 --- a/lib/galaxy/jobs/runners/util/cli/job/torque.py +++ b/lib/galaxy/jobs/runners/util/cli/job/torque.py @@ -76,7 +76,7 @@ def parse_status(self, status, job_ids): except Exception: tree = None if tree is None: - log.warning(f"No valid qstat XML return from `qstat -x`, got the following: {status}") + log.warning("No valid qstat XML return from `qstat -x`, got the following: %s", status) return None else: for job in tree.findall("Job"): diff --git a/lib/galaxy/jobs/runners/util/pykube_util.py b/lib/galaxy/jobs/runners/util/pykube_util.py index 7c3f32d87b09..2baa009b3dce 100644 --- a/lib/galaxy/jobs/runners/util/pykube_util.py +++ b/lib/galaxy/jobs/runners/util/pykube_util.py @@ -295,7 +295,7 @@ def galaxy_instance_id(params): if re.match(r"(?!-)[a-z\d-]{1,20}(?
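The remaining hunks in this series follow the same pattern. As a rough spot-check that no eager f-string logging calls are left under lib/galaxy/jobs/runners, a reviewer could run something like the sketch below from the root of a Galaxy checkout. It is editorial, not part of the patch, and the regex is only a heuristic: it misses multi-line calls, "%"-interpolated strings, and prefixed f-strings such as rf"...".

    import pathlib
    import re

    # Heuristic: an f-string passed directly to a logging call.
    EAGER = re.compile(r"""\blog\.(debug|info|warning|error|exception)\(\s*f["']""")

    for path in sorted(pathlib.Path("lib/galaxy/jobs/runners").rglob("*.py")):
        for lineno, line in enumerate(path.read_text().splitlines(), start=1):
            if EAGER.search(line):
                print(f"{path}:{lineno}: {line.strip()}")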