Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Limit reading/mounting from shared filesystem JobStore #5047

Merged
merged 6 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/running/cliOptions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,13 @@ Allows configuring Toil's data storage.
to use a batch system that does not support
cleanup. Set to "true" if caching
is desired.
--symlinkJobStoreReads BOOL
Allow reads and container mounts from a JobStore's
shared filesystem directly via symlink. Can be turned
off if the shared filesystem can't support the IO load
of all the jobs reading from it at once, and you want
to use ``--caching=True`` to make jobs on each node
read from node-local cache storage. (Default=True)

**Autoscaling Options**
Allows the specification of the minimum and maximum number of nodes in an
Expand Down
2 changes: 2 additions & 0 deletions src/toil/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ class Config:
caching: Optional[bool]
symlinkImports: bool
moveOutputs: bool
symlink_job_store_reads: bool

# Autoscaling options
provisioner: Optional[str]
Expand Down Expand Up @@ -338,6 +339,7 @@ def set_option(option_name: str,
set_option("symlinkImports", old_names=["linkImports"])
set_option("moveOutputs", old_names=["moveExports"])
set_option("caching", old_names=["enableCaching"])
set_option("symlink_job_store_reads")

# Autoscaling options
set_option("provisioner")
Expand Down
11 changes: 7 additions & 4 deletions src/toil/jobStores/fileJobStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def __init__(self, path: str, fanOut: int = 1000) -> None:

self.linkImports = None
self.moveExports = None
self.symlink_job_store_reads = None

def __repr__(self):
return f'FileJobStore({self.jobStoreDir})'
Expand All @@ -123,6 +124,7 @@ def initialize(self, config):
os.makedirs(self.sharedFilesDir, exist_ok=True)
self.linkImports = config.symlinkImports
self.moveExports = config.moveOutputs
self.symlink_job_store_reads = config.symlink_job_store_reads
super().initialize(config)

def resume(self):
Expand Down Expand Up @@ -314,8 +316,8 @@ def _copy_or_link(self, src_path, dst_path, symlink=False):
atomic_copy(srcPath, dst_path)

def _import_file(self, otherCls, uri, shared_file_name=None, hardlink=False, symlink=True):
# symlink argument says whether the caller can take symlinks or not
# ex: if false, it implies the workflow cannot work with symlinks and thus will hardlink imports
# symlink argument says whether the caller can take symlinks or not.
# ex: if false, it means the workflow cannot work with symlinks and we need to hardlink or copy.
# default is true since symlinking everything is ideal
uri_path = unquote(uri.path)
if issubclass(otherCls, FileJobStore):
Expand Down Expand Up @@ -515,8 +517,9 @@ def read_file(self, file_id: str, local_path: str, symlink: bool = False) -> Non
# one over the other will fail.
return

if symlink:
# If the reader will accept a symlink, so always give them one.
if symlink and self.symlink_job_store_reads:
# If the reader will accept a symlink, and we are willing to
# symlink into the jobstore, always give them one.
# There's less that can go wrong.
try:
os.symlink(jobStoreFilePath, local_path)
Expand Down
9 changes: 7 additions & 2 deletions src/toil/options/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,15 +349,15 @@ def is_within(x: Union[int, float]) -> bool:
link_imports_help = ("When using a filesystem based job store, CWL input files are by default symlinked in. "
"Setting this option to True instead copies the files into the job store, which may protect "
"them from being modified externally. When set to False, as long as caching is enabled, "
"Toil will protect the file automatically by changing the permissions to read-only."
"Toil will protect the file automatically by changing the permissions to read-only. "
"default=%(default)s")
link_imports.add_argument("--symlinkImports", dest="symlinkImports", type=strtobool, default=True,
metavar="BOOL", help=link_imports_help)
move_exports = file_store_options.add_mutually_exclusive_group()
move_exports_help = ('When using a filesystem based job store, output files are by default moved to the '
'output directory, and a symlink to the moved exported file is created at the initial '
'location. Setting this option to True instead copies the files into the output directory. '
'Applies to filesystem-based job stores only.'
'Applies to filesystem-based job stores only. '
'default=%(default)s')
move_exports.add_argument("--moveOutputs", dest="moveOutputs", type=strtobool, default=False, metavar="BOOL",
help=move_exports_help)
Expand All @@ -368,6 +368,11 @@ def is_within(x: Union[int, float]) -> bool:
help=caching_help)
# default is None according to PR 4299, seems to be generated at runtime

file_store_options.add_argument("--symlinkJobStoreReads", dest="symlink_job_store_reads", type=strtobool, default=True,
metavar="BOOL",
help="Allow reads and container mounts from a JobStore's shared filesystem directly "
"via symlink. default=%(default)s")

# Auto scaling options
autoscaling_options = parser.add_argument_group(
title="Toil options for autoscaling the cluster of worker nodes.",
Expand Down
26 changes: 26 additions & 0 deletions src/toil/test/jobStores/jobStoreTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1226,6 +1226,32 @@ def test_file_link_imports(self):
os.remove(filename)
os.remove(srcUrl[7:])

def test_symlink_read_control(self):
"""
Test that files are read by symlink when expected
"""

for should_link in (False, True):
# Configure a jobstore to symlink out reads or not, as appropriate
config = self._createConfig()
config.symlink_job_store_reads = should_link
store = FileJobStore(self.namePrefix + ("-link" if should_link else "-nolink"))
store.initialize(config)

# Put something in the job store
src_url, _ = self._prepareTestFile(self._externalStore(), 1)
file_id = store.import_file(src_url, symlink=False)

# Read it out, accepting a symlink
dest_dir = self._createTempDir()
dest_path = os.path.join(dest_dir, "file.dat")
store.read_file(file_id, dest_path, symlink = True)

# Make sure we get a symlink exactly when configured to
assert os.path.exists(dest_path)
assert os.path.islink(dest_path) == should_link



@needs_google_project
@needs_google_storage
Expand Down