Skip to content

Commit

Permalink
Allow a posix file source to prefer linking.
Browse files Browse the repository at this point in the history
The linking upload parameters will still be respected, but if none of them are set data fetch will default to just linking files during upload. This uses Dataset.external_filename instead of symlinks in the objectstore so that Galaxy has better tracking of the links and so this works closer to the way data libraries have always worked.
  • Loading branch information
jmchilton committed Nov 12, 2024
1 parent 784b5f2 commit e0fc1d0
Show file tree
Hide file tree
Showing 6 changed files with 137 additions and 27 deletions.
7 changes: 7 additions & 0 deletions lib/galaxy/files/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@ def to_dict(self, for_serialization=False, user_context: "OptionalUserContext" =
context doesn't need to be present after the plugin is re-hydrated.
"""

@abc.abstractmethod
def prefer_links(self) -> bool:
"""Prefer linking to files."""


class SupportsBrowsing(metaclass=abc.ABCMeta):
"""An interface indicating that this filesource is browsable.
Expand Down Expand Up @@ -351,6 +355,9 @@ def user_has_access(self, user_context: "OptionalUserContext") -> bool:
or (self._user_has_required_roles(user_context) and self._user_has_required_groups(user_context))
)

def prefer_links(self) -> bool:
return False

@property
def user_context_required(self) -> bool:
return self.requires_roles is not None or self.requires_groups is not None
Expand Down
7 changes: 7 additions & 0 deletions lib/galaxy/files/sources/posix.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,15 @@
DEFAULT_ENFORCE_SYMLINK_SECURITY = True
DEFAULT_DELETE_ON_REALIZE = False
DEFAULT_ALLOW_SUBDIR_CREATION = True
DEFAULT_PREFER_LINKS = False


class PosixFilesSourceProperties(FilesSourceProperties, total=False):
root: str
enforce_symlink_security: bool
delete_on_realize: bool
allow_subdir_creation: bool
prefer_links: bool


class PosixFilesSource(BaseFilesSource):
Expand All @@ -53,6 +55,10 @@ def __init__(self, **kwd: Unpack[PosixFilesSourceProperties]):
self.enforce_symlink_security = props.get("enforce_symlink_security", DEFAULT_ENFORCE_SYMLINK_SECURITY)
self.delete_on_realize = props.get("delete_on_realize", DEFAULT_DELETE_ON_REALIZE)
self.allow_subdir_creation = props.get("allow_subdir_creation", DEFAULT_ALLOW_SUBDIR_CREATION)
self._prefer_links = props.get("prefer_links", DEFAULT_PREFER_LINKS)

def prefer_links(self) -> bool:
return self._prefer_links

def _list(
self,
Expand Down Expand Up @@ -182,6 +188,7 @@ def _serialization_props(self, user_context: OptionalUserContext = None) -> Posi
"enforce_symlink_security": self.enforce_symlink_security,
"delete_on_realize": self.delete_on_realize,
"allow_subdir_creation": self.allow_subdir_creation,
"prefer_links": self._prefer_links,
}

@property
Expand Down
9 changes: 7 additions & 2 deletions lib/galaxy/files/uris.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ def stream_url_to_file(
target_path: Optional[str] = None,
file_source_opts: Optional[FilesSourceOptions] = None,
) -> str:
if file_sources is None:
file_sources = ConfiguredFileSources.from_dict(None, load_stock_plugins=True)
file_sources = ensure_file_sources(file_sources)
file_source, rel_path = file_sources.get_file_source_path(url)
if file_source:
if not target_path:
Expand All @@ -62,6 +61,12 @@ def stream_url_to_file(
raise NoMatchingFileSource(f"Could not find a matching handler for: {url}")


def ensure_file_sources(file_sources: Optional["ConfiguredFileSources"]) -> "ConfiguredFileSources":
if file_sources is None:
file_sources = ConfiguredFileSources.from_dict(None, load_stock_plugins=True)
return file_sources


def stream_to_file(stream, suffix="", prefix="", dir=None, text=False, **kwd):
"""Writes a stream to a temporary file, returns the temporary file's name"""
fd, temp_name = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=dir, text=text)
Expand Down
75 changes: 56 additions & 19 deletions lib/galaxy/tools/data_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
UploadProblemException,
)
from galaxy.files.uris import (
ensure_file_sources,
stream_to_file,
stream_url_to_file,
)
Expand Down Expand Up @@ -97,13 +98,13 @@ def expand_elements_from(target_or_item):
decompressed_directory = _decompress_target(upload_config, target_or_item)
items = _directory_to_items(decompressed_directory)
elif elements_from == "bagit":
_, elements_from_path = _has_src_to_path(upload_config, target_or_item, is_dataset=False)
_, elements_from_path, _ = _has_src_to_path(upload_config, target_or_item, is_dataset=False)
items = _bagit_to_items(elements_from_path)
elif elements_from == "bagit_archive":
decompressed_directory = _decompress_target(upload_config, target_or_item)
items = _bagit_to_items(decompressed_directory)
elif elements_from == "directory":
_, elements_from_path = _has_src_to_path(upload_config, target_or_item, is_dataset=False)
_, elements_from_path, _ = _has_src_to_path(upload_config, target_or_item, is_dataset=False)
items = _directory_to_items(elements_from_path)
else:
raise Exception(f"Unknown elements from type encountered [{elements_from}]")
Expand Down Expand Up @@ -205,7 +206,7 @@ def _resolve_item(item):
pass
key = keys[composite_item_idx]
writable_file = writable_files[key]
_, src_target = _has_src_to_path(upload_config, composite_item)
_, src_target, _ = _has_src_to_path(upload_config, composite_item)
# do the writing
sniff.handle_composite_file(
datatype,
Expand Down Expand Up @@ -238,10 +239,23 @@ def _resolve_item_with_primary(item):
converted_path = None

deferred = upload_config.get_option(item, "deferred")

link_data_only = upload_config.link_data_only
link_data_only_explicit = upload_config.link_data_only_explicit
if "link_data_only" in item:
# Allow overriding this on a per file basis.
link_data_only, link_data_only_explicit = _link_data_only(item)

name: str
path: Optional[str]
default_in_place = False
if not deferred:
name, path = _has_src_to_path(upload_config, item, is_dataset=True)
name, path, is_link = _has_src_to_path(
upload_config, item, is_dataset=True, link_data_only_explicitly_set=link_data_only_explicit
)
if is_link:
link_data_only = True
default_in_place = True
else:
name, path = _has_src_to_name(item) or "Deferred Dataset", None
sources = []
Expand All @@ -266,10 +280,6 @@ def _resolve_item_with_primary(item):
item["error_message"] = error_message

dbkey = item.get("dbkey", "?")
link_data_only = upload_config.link_data_only
if "link_data_only" in item:
# Allow overriding this on a per file basis.
link_data_only = _link_data_only(item)

ext = "data"
staged_extra_files = None
Expand All @@ -281,7 +291,7 @@ def _resolve_item_with_primary(item):

effective_state = "ok"
if not deferred and not error_message:
in_place = item.get("in_place", False)
in_place = item.get("in_place", default_in_place)
purge_source = item.get("purge_source", True)

registry = upload_config.registry
Expand Down Expand Up @@ -425,7 +435,7 @@ def _bagit_to_items(directory):


def _decompress_target(upload_config: "UploadConfig", target: Dict[str, Any]):
elements_from_name, elements_from_path = _has_src_to_path(upload_config, target, is_dataset=False)
elements_from_name, elements_from_path, _ = _has_src_to_path(upload_config, target, is_dataset=False)
# by default Galaxy will check for a directory with a single file and interpret that
# as the new root for expansion, this is a good user experience for uploading single
# files in a archive but not great from an API perspective. Allow disabling by setting
Expand Down Expand Up @@ -483,13 +493,33 @@ def _has_src_to_name(item) -> Optional[str]:
return name


def _has_src_to_path(upload_config: "UploadConfig", item: Dict[str, Any], is_dataset: bool = False) -> Tuple[str, str]:
def _has_src_to_path(
upload_config: "UploadConfig",
item: Dict[str, Any],
is_dataset: bool = False,
link_data_only: bool = False,
link_data_only_explicitly_set: bool = False,
) -> Tuple[str, str, bool]:
assert "src" in item, item
src = item.get("src")
name = item.get("name")
is_link = False
if src == "url":
url = item.get("url")
file_sources = ensure_file_sources(upload_config.file_sources)
assert url, "url cannot be empty"
if not link_data_only_explicitly_set:
file_source, rel_path = file_sources.get_file_source_path(url)
prefer_links = file_source.prefer_links()
if file_source.prefer_links():
if rel_path.startswith("/"):
rel_path = rel_path[1:]
path = os.path.abspath(os.path.join(file_source.root, rel_path))
if name is None:
name = url.split("/")[-1]
is_link = True
return name, path, is_link

try:
path = stream_url_to_file(url, file_sources=upload_config.file_sources, dir=upload_config.working_directory)
except Exception as e:
Expand All @@ -513,7 +543,7 @@ def _has_src_to_path(upload_config: "UploadConfig", item: Dict[str, Any], is_dat
path = item["path"]
if name is None:
name = os.path.basename(path)
return name, path
return name, path, is_link


def _handle_hash_validation(hash_function: HashFunctionNameEnum, hash_value: str, path: str):
Expand Down Expand Up @@ -564,7 +594,7 @@ def __init__(
self.space_to_tab = request.get("space_to_tab", False)
self.auto_decompress = request.get("auto_decompress", False)
self.deferred = request.get("deferred", False)
self.link_data_only = _link_data_only(request)
self.link_data_only, self.link_data_only_explicit = _link_data_only(request)
self.file_sources_dict = file_sources_dict
self._file_sources = None

Expand Down Expand Up @@ -616,12 +646,19 @@ def ensure_in_working_directory(self, path, purge_source, in_place):
return new_path


def _link_data_only(has_config_dict):
link_data_only = has_config_dict.get("link_data_only", False)
if not isinstance(link_data_only, bool):
# Allow the older string values of 'copy_files' and 'link_to_files'
link_data_only = link_data_only == "copy_files"
return link_data_only
def _link_data_only(has_config_dict) -> Tuple[bool, bool]:
if "link_data_only" in has_config_dict:
link_data_only_raw = has_config_dict["link_data_only"]
if not isinstance(link_data_only_raw, bool):
# Allow the older string values of 'copy_files' and 'link_to_files'
link_data_only = link_data_only_raw == "copy_files"
else:
link_data_only = link_data_only_raw
link_data_only_explicit = True
else:
link_data_only = False
link_data_only_explicit = False
return link_data_only, link_data_only_explicit


def _for_each_src(f, obj):
Expand Down
20 changes: 18 additions & 2 deletions lib/galaxy_test/driver/integration_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
REQUIRED_GROUP_EXPRESSION = f"{GROUP_A} or '{GROUP_B}'"


def get_posix_file_source_config(root_dir: str, roles: str, groups: str, include_test_data_dir: bool) -> str:
def get_posix_file_source_config(
root_dir: str, roles: str, groups: str, include_test_data_dir: bool, prefer_links: bool = False
) -> str:
rval = f"""
- type: posix
id: posix_test
Expand All @@ -26,6 +28,17 @@ def get_posix_file_source_config(root_dir: str, roles: str, groups: str, include
requires_roles: {roles}
requires_groups: {groups}
"""
if prefer_links:
rval += f"""
- type: posix
id: linking_source
label: Posix
doc: Files from local path to links
root: {root_dir}
writable: true
prefer_links: true
"""

if include_test_data_dir:
rval += """
- type: posix
Expand All @@ -44,9 +57,10 @@ def create_file_source_config_file_on(
include_test_data_dir,
required_role_expression,
required_group_expression,
prefer_links: bool = False,
):
file_contents = get_posix_file_source_config(
root_dir, required_role_expression, required_group_expression, include_test_data_dir
root_dir, required_role_expression, required_group_expression, include_test_data_dir, prefer_links=prefer_links
)
file_path = os.path.join(temp_dir, "file_sources_conf_posix.yml")
with open(file_path, "w") as f:
Expand All @@ -67,6 +81,7 @@ def handle_galaxy_config_kwds(
# Require role for access but do not require groups by default on every test to simplify them
required_role_expression=REQUIRED_ROLE_EXPRESSION,
required_group_expression="",
prefer_links: bool = False,
):
temp_dir = os.path.realpath(mkdtemp())
clazz_ = clazz_ or cls
Expand All @@ -79,6 +94,7 @@ def handle_galaxy_config_kwds(
clazz_.include_test_data_dir,
required_role_expression,
required_group_expression,
prefer_links=prefer_links,
)
config["file_sources_config_file"] = file_sources_config_file

Expand Down
46 changes: 42 additions & 4 deletions test/integration/test_remote_files_posix.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Before running this test, start nginx+webdav in Docker using following command:
# docker run -v `pwd`/test/integration/webdav/data:/media -e WEBDAV_USERNAME=alice -e WEBDAV_PASSWORD=secret1234 -p 7083:7083 jmchilton/webdavdev
# Apache Docker host (shown next) doesn't work because displayname not set in response.
# docker run -v `pwd`/test/integration/webdav:/var/lib/dav -e AUTH_TYPE=Basic -e USERNAME=alice -e PASSWORD=secret1234 -e LOCATION=/ -p 7083:80 bytemark/webdav
import os

from sqlalchemy import select

from galaxy.model import Dataset
from galaxy_test.base import api_asserts
from galaxy_test.base.populators import DatasetPopulator
from galaxy_test.driver import integration_util
Expand Down Expand Up @@ -108,3 +108,41 @@ def _assert_list_response_matches_fixtures(self, list_response):

def _assert_access_forbidden_response(self, response):
api_asserts.assert_status_code_is(response, 403)


class TestPreferLinksPosixFileSourceIntegration(PosixFileSourceSetup, integration_util.IntegrationTestCase):
dataset_populator: DatasetPopulator

@classmethod
def handle_galaxy_config_kwds(cls, config):
PosixFileSourceSetup.handle_galaxy_config_kwds(
config,
cls,
prefer_links=True,
)

def setUp(self):
super().setUp()
self._write_file_fixtures()
self.dataset_populator = DatasetPopulator(self.galaxy_interactor)

def test_links_by_default(self):
with self.dataset_populator.test_history() as history_id:
element = dict(src="url", url="gxfiles://linking_source/a")
target = {
"destination": {"type": "hdas"},
"elements": [element],
}
targets = [target]
payload = {
"history_id": history_id,
"targets": targets,
}
new_dataset = self.dataset_populator.fetch(payload, assert_ok=True).json()["outputs"][0]
content = self.dataset_populator.get_history_dataset_content(history_id, dataset=new_dataset)
assert content == "a\n", content
stmt = select(Dataset).order_by(Dataset.create_time.desc()).limit(1)
dataset = self._app.model.session.execute(stmt).unique().scalar_one()
assert dataset.external_filename.endswith("/root/a")
assert os.path.exists(dataset.external_filename)
assert open(dataset.external_filename).read() == "a\n"

0 comments on commit e0fc1d0

Please sign in to comment.