Skip to content

Commit

Permalink
Drop job integrity checks
Browse files Browse the repository at this point in the history
It's time to rip off the band-aid, I think
409aef5
should have fixed this.
It liekly was only ever a problem if the job handler node is also
doubling as a compute node, as in CI or docker-galaxy-stable.
  • Loading branch information
mvdbeek committed Aug 10, 2024
1 parent 03a7fda commit 7c0c3d7
Show file tree
Hide file tree
Showing 15 changed files with 16 additions and 182 deletions.
33 changes: 0 additions & 33 deletions doc/source/admin/galaxy_options.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1167,39 +1167,6 @@
:Type: str


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``check_job_script_integrity``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

:Description:
Set to false to disable various checks Galaxy will do to ensure it
can run job scripts before attempting to execute or submit them.
:Default: ``true``
:Type: bool


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``check_job_script_integrity_count``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

:Description:
Number of checks to execute if check_job_script_integrity is
enabled.
:Default: ``35``
:Type: int


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``check_job_script_integrity_sleep``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

:Description:
Time to sleep between checks if check_job_script_integrity is
enabled (in seconds).
:Default: ``0.25``
:Type: float


~~~~~~~~~~~~~~~~~~~~~
``default_job_shell``
~~~~~~~~~~~~~~~~~~~~~
Expand Down
3 changes: 0 additions & 3 deletions lib/galaxy/app_unittest_utils/galaxy_mock.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,9 +261,6 @@ def __init__(self, **kwargs):
self.enable_tool_document_cache = False
self.tool_cache_data_dir = os.path.join(self.root, "tool_cache")
self.external_chown_script = None
self.check_job_script_integrity = False
self.check_job_script_integrity_count = 0
self.check_job_script_integrity_sleep = 0

self.default_panel_view = "default"
self.panel_views_dir = ""
Expand Down
12 changes: 0 additions & 12 deletions lib/galaxy/config/sample/galaxy.yml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -913,18 +913,6 @@ galaxy:
# <cache_dir>.
#template_cache_path: compiled_templates

# Set to false to disable various checks Galaxy will do to ensure it
# can run job scripts before attempting to execute or submit them.
#check_job_script_integrity: true

# Number of checks to execute if check_job_script_integrity is
# enabled.
#check_job_script_integrity_count: 35

# Time to sleep between checks if check_job_script_integrity is
# enabled (in seconds).
#check_job_script_integrity_sleep: 0.25

# Set the default shell used by non-containerized jobs Galaxy-wide.
# This defaults to bash for all jobs and can be overridden at the
# destination level for heterogeneous clusters. conda job resolution
Expand Down
3 changes: 3 additions & 0 deletions lib/galaxy/config/sample/tool_shed.yml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ tool_shed:
# options.
#whoosh_index_dir: database/toolshed_whoosh_indexes

# Cache directory for Pydantic model objects.
#model_cache_dir: database/model_cache

# For searching repositories at /api/repositories:
#repo_name_boost: 0.9

Expand Down
22 changes: 0 additions & 22 deletions lib/galaxy/config/schemas/config_schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -848,28 +848,6 @@ mapping:
Mako templates are compiled as needed and cached for reuse, this directory is
used for the cache
check_job_script_integrity:
type: bool
default: true
required: false
desc: |
Set to false to disable various checks Galaxy will do to ensure it
can run job scripts before attempting to execute or submit them.
check_job_script_integrity_count:
type: int
default: 35
required: false
desc: |
Number of checks to execute if check_job_script_integrity is enabled.
check_job_script_integrity_sleep:
type: float
default: .25
required: false
desc: |
Time to sleep between checks if check_job_script_integrity is enabled (in seconds).
default_job_shell:
type: str
default: /bin/bash
Expand Down
9 changes: 0 additions & 9 deletions lib/galaxy/job_execution/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,6 @@ class JobIO(UsesDictVisibleKeys):
"len_file_path",
"builds_file_path",
"file_sources_dict",
"check_job_script_integrity",
"check_job_script_integrity_count",
"check_job_script_integrity_sleep",
"tool_source",
"tool_source_class",
"tool_dir",
Expand All @@ -105,9 +102,6 @@ def __init__(
new_file_path: str,
len_file_path: str,
builds_file_path: str,
check_job_script_integrity: bool,
check_job_script_integrity_count: int,
check_job_script_integrity_sleep: float,
file_sources_dict: Dict[str, Any],
user_context: Union[FileSourcesUserContext, Dict[str, Any]],
tool_source: Optional[str] = None,
Expand Down Expand Up @@ -137,9 +131,6 @@ def __init__(
self.new_file_path = new_file_path
self.len_file_path = len_file_path
self.builds_file_path = builds_file_path
self.check_job_script_integrity = check_job_script_integrity
self.check_job_script_integrity_count = check_job_script_integrity_count
self.check_job_script_integrity_sleep = check_job_script_integrity_sleep
self.tool_dir = tool_dir
self.is_task = is_task
self.tool_source = tool_source
Expand Down
3 changes: 0 additions & 3 deletions lib/galaxy/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1083,9 +1083,6 @@ def job_io(self):
len_file_path=self.app.config.len_file_path,
file_sources_dict=self.app.file_sources.to_dict(for_serialization=True, user_context=user_context),
user_context=user_context,
check_job_script_integrity=self.app.config.check_job_script_integrity,
check_job_script_integrity_count=self.app.config.check_job_script_integrity_count,
check_job_script_integrity_sleep=self.app.config.check_job_script_integrity_sleep,
tool_source=tool_source,
tool_source_class=type(self.tool.tool_source).__name__ if self.tool else None,
tool_dir=self.tool and self.tool.tool_dir,
Expand Down
11 changes: 2 additions & 9 deletions lib/galaxy/jobs/command_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,7 @@

from galaxy import util
from galaxy.job_execution.output_collect import default_exit_code_file
from galaxy.jobs.runners.util.job_script import (
INTEGRITY_INJECTION,
ScriptIntegrityChecks,
write_script,
)
from galaxy.jobs.runners.util.job_script import write_script
from galaxy.tool_util.deps.container_classes import (
Container,
TRAP_KILL_CONTAINER,
Expand Down Expand Up @@ -154,7 +150,7 @@ def build_command(
relocate_contents = (
"from galaxy_ext.cwl.handle_outputs import relocate_dynamic_outputs; relocate_dynamic_outputs()"
)
write_script(relocate_script_file, relocate_contents, ScriptIntegrityChecks(check_job_script_integrity=False))
write_script(relocate_script_file, relocate_contents)
commands_builder.append_command(SETUP_GALAXY_FOR_METADATA)
commands_builder.append_command(f"python '{relocate_script_file}'")

Expand Down Expand Up @@ -184,8 +180,6 @@ def __externalize_commands(
# for instance.
if shell and shell.lower() == "none":
return tool_commands
if job_wrapper.job_io.check_job_script_integrity:
integrity_injection = INTEGRITY_INJECTION
set_e = ""
if job_wrapper.strict_shell:
set_e = "set -e\n"
Expand All @@ -196,7 +190,6 @@ def __externalize_commands(
write_script(
local_container_script,
script_contents,
job_io=job_wrapper.job_io,
)
commands = f"{shell} {local_container_script}"
# TODO: Cleanup for_pulsar hack.
Expand Down
5 changes: 2 additions & 3 deletions lib/galaxy/jobs/runners/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
from galaxy.jobs.runners.util import runner_states
from galaxy.jobs.runners.util.env import env_to_statement
from galaxy.jobs.runners.util.job_script import (
DescribesScriptIntegrityChecks,
job_script,
write_script,
)
Expand Down Expand Up @@ -520,8 +519,8 @@ def get_job_file(self, job_wrapper: "MinimalJobWrapper", **kwds) -> str:
options.update(**kwds)
return job_script(**options)

def write_executable_script(self, path: str, contents: str, job_io: DescribesScriptIntegrityChecks) -> None:
write_script(path, contents, job_io)
def write_executable_script(self, path: str, contents: str) -> None:
write_script(path, contents)

def _find_container(
self,
Expand Down
2 changes: 1 addition & 1 deletion lib/galaxy/jobs/runners/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,5 +605,5 @@ def write_command(self, job_wrapper: "MinimalJobWrapper") -> str:
"galaxy_virtual_env": None,
}
job_file_contents = self.get_job_file(job_wrapper, **job_script_props)
self.write_executable_script(job_file, job_file_contents, job_io=job_wrapper.job_io)
self.write_executable_script(job_file, job_file_contents)
return job_file
2 changes: 1 addition & 1 deletion lib/galaxy/jobs/runners/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def __command_line(self, job_wrapper: "MinimalJobWrapper") -> Tuple[str, str]:
"shell": job_wrapper.shell,
}
job_file_contents = self.get_job_file(job_wrapper, **job_script_props)
self.write_executable_script(job_file, job_file_contents, job_io=job_wrapper.job_io)
self.write_executable_script(job_file, job_file_contents)
return job_file, exit_code_path

def queue_job(self, job_wrapper):
Expand Down
80 changes: 6 additions & 74 deletions lib/galaxy/jobs/runners/util/job_script/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,11 @@
import logging
import os
import subprocess
import time
from dataclasses import dataclass
from string import Template
from typing import (
Any,
Dict,
Optional,
)

from typing_extensions import Protocol

from galaxy.util import (
RWXR_XR_X,
unicodify,
Expand All @@ -32,19 +26,6 @@
GALAXY_SLOTS="1"
"""

INTEGRITY_INJECTION = """
# The following block can be used by the job system
# to ensure this script is runnable before actually attempting
# to run it.
if [ -n "$ABC_TEST_JOB_SCRIPT_INTEGRITY_XYZ" ]; then
exit 42
fi
"""

INTEGRITY_SYNC_COMMAND = "/bin/sync"
DEFAULT_INTEGRITY_CHECK = True
DEFAULT_INTEGRITY_COUNT = 35
DEFAULT_INTEGRITY_SLEEP = 0.25
REQUIRED_TEMPLATE_PARAMS = ["working_directory", "command"]
OPTIONAL_TEMPLATE_PARAMS: Dict[str, Any] = {
"galaxy_lib": None,
Expand All @@ -54,7 +35,6 @@
"slots_statement": SLOTS_STATEMENT_CLUSTER_DEFAULT,
"instrument_pre_commands": "",
"instrument_post_commands": "",
"integrity_injection": INTEGRITY_INJECTION,
"shell": DEFAULT_SHELL,
"preserve_python_environment": True,
"tmp_dir_creation_statement": '""',
Expand Down Expand Up @@ -113,67 +93,19 @@ def job_script(template=DEFAULT_JOB_FILE_TEMPLATE, **kwds):
return template.safe_substitute(template_params)


class DescribesScriptIntegrityChecks(Protocol):
check_job_script_integrity: bool
check_job_script_integrity_count: Optional[int]
check_job_script_integrity_sleep: Optional[float]


@dataclass
class ScriptIntegrityChecks:
"""Minimal class implementing the DescribesScriptIntegrityChecks protocol"""

check_job_script_integrity: bool
check_job_script_integrity_count: Optional[int] = None
check_job_script_integrity_sleep: Optional[float] = None


def write_script(path: str, contents, job_io: DescribesScriptIntegrityChecks, mode: int = RWXR_XR_X) -> None:
def write_script(path: str, contents, mode: int = RWXR_XR_X, use_fork_safe_write=False) -> None:
dir = os.path.dirname(path)
if not os.path.exists(dir):
os.makedirs(dir)
fork_safe_write(path, contents)
if use_fork_safe_write:
fork_safe_write(path, contents)
else:
with open(path, "w", encoding="utf-8") as f:
f.write(unicodify(contents))
os.chmod(path, mode)
if job_io.check_job_script_integrity:
assert job_io.check_job_script_integrity_count is not None
assert job_io.check_job_script_integrity_sleep is not None
_handle_script_integrity(path, job_io.check_job_script_integrity_count, job_io.check_job_script_integrity_sleep)


def _handle_script_integrity(
path: str, check_job_script_integrity_count: int, check_job_script_integrity_sleep: float
) -> None:
script_integrity_verified = False
for _ in range(check_job_script_integrity_count):
try:
returncode = subprocess.call([path], env={"ABC_TEST_JOB_SCRIPT_INTEGRITY_XYZ": "1"})
if returncode == 42:
script_integrity_verified = True
break

log.debug("Script integrity error for file '%s': returncode was %d", path, returncode)

# Else we will sync and wait to see if the script becomes
# executable.
try:
# sync file system to avoid "Text file busy" problems.
# These have occurred both in Docker containers and on EC2 clusters
# under high load.
subprocess.check_call(INTEGRITY_SYNC_COMMAND)
except Exception as e:
log.debug("Error syncing the filesystem: %s", unicodify(e))

except Exception as exc:
log.debug("Script not available yet: %s", unicodify(exc))

time.sleep(check_job_script_integrity_sleep)

if not script_integrity_verified:
raise Exception(f"Failed to write job script '{path}', could not verify job script integrity.")


__all__ = (
"job_script",
"write_script",
"INTEGRITY_INJECTION",
)
6 changes: 0 additions & 6 deletions test/unit/app/jobs/test_command_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,11 +220,6 @@ def __init__(self, job_dir):
self.working_directory = job_dir
self.prepare_input_files_cmds = None
self.commands_in_new_shell = False
self.app = Bunch(
config=Bunch(
check_job_script_integrity=False,
)
)
self.shell = "/bin/sh"
self.use_metadata_binary = False
self.job_id = 1
Expand All @@ -248,7 +243,6 @@ def setup_external_metadata(self, *args, **kwds):
def job_io(self):
return Bunch(
get_output_fnames=lambda: ["output1"],
check_job_script_integrity=False,
version_path=None,
)

Expand Down
4 changes: 1 addition & 3 deletions test/unit/app/jobs/test_runner_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,9 +189,7 @@ def change_state(self, state, job=None):

@property
def job_io(self):
return bunch.Bunch(
get_output_fnames=lambda: [], check_job_script_integrity=False, version_path="/tmp/version_path"
)
return bunch.Bunch(get_output_fnames=lambda: [], version_path="/tmp/version_path")

def get_job(self):
return self.job
Expand Down
3 changes: 0 additions & 3 deletions test/unit/job_execution/test_job_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,6 @@ def job_io(app: FileSourcesMockApp, job: Job) -> JobIO:
builds_file_path=WORKING_DIRECTORY,
user_context=USER_CONTEXT,
file_sources_dict=app.file_sources.to_dict(for_serialization=True, user_context=user_context),
check_job_script_integrity=False,
check_job_script_integrity_count=1,
check_job_script_integrity_sleep=1,
tool_dir=WORKING_DIRECTORY,
is_task=False,
)
Expand Down

0 comments on commit 7c0c3d7

Please sign in to comment.